Module elpis.models
from elpis.models.annotation import Annotation
from elpis.models.elan_options import ElanOptions, ElanTierSelector
from elpis.models.job import DataArguments, Job, ModelArguments
from elpis.models.vocab import VOCAB_FILE, Vocab
__all__ = [
"Annotation",
"ElanOptions",
"ElanTierSelector",
"Job",
"Vocab",
"VOCAB_FILE",
"DataArguments",
"ModelArguments",
]
Sub-modules
elpis.models.annotation
elpis.models.elan_options
elpis.models.job
elpis.models.vocab
Classes
class Annotation (audio_file: Path, transcript: str, start_ms: Optional[int] = None, stop_ms: Optional[int] = None)
-
A class which represents a section of speech for a given audio file and sample rate. If start_ms and stop_ms aren't specified, it is assumed that the Annotation spans the entire audio file.
@dataclass
class Annotation:
    """A class which represents a section of speech for a given audio file
    and sample rate. If start_ms and stop_ms aren't specified, it is assumed
    that the Annotation spans the entire audio file.
    """

    audio_file: Path
    transcript: str
    start_ms: Optional[int] = None
    stop_ms: Optional[int] = None

    def is_timed(self) -> bool:
        """Returns true iff the annotation exists between a start and stop time
        for the given recording.
        """
        return self.start_ms is not None and self.stop_ms is not None

    def to_dict(self) -> Dict[str, Any]:
        """Converts an annotation to a serializable dictionary"""
        result = dict(self.__dict__)
        result["audio_file"] = str(self.audio_file)
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Annotation:
        """Builds an annotation from a serializable dictionary

        Throws an error if the required keys are not found.
        """
        return cls(
            audio_file=Path(data["audio_file"]),
            transcript=data["transcript"],
            start_ms=data.get("start_ms"),
            stop_ms=data.get("stop_ms"),
        )
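A minimal usage sketch (the audio paths and transcripts below are placeholder values, not files shipped with elpis):

from pathlib import Path

from elpis.models import Annotation

# A timed annotation covering 0-1500 ms of a (hypothetical) recording.
annotation = Annotation(
    audio_file=Path("recordings/utt1.wav"),
    transcript="hello world",
    start_ms=0,
    stop_ms=1500,
)
assert annotation.is_timed()

# Omitting start_ms/stop_ms means the annotation spans the whole file.
untimed = Annotation(audio_file=Path("recordings/utt2.wav"), transcript="hello")
assert not untimed.is_timed()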
Class variables
var audio_file : pathlib.Path
var start_ms : Optional[int]
var stop_ms : Optional[int]
var transcript : str
Static methods
def from_dict(data: Dict[str, Any]) ‑> Annotation
-
Builds an annotation from a serializable dictionary
Throws an error if the required keys are not found.
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> Annotation:
    """Builds an annotation from a serializable dictionary

    Throws an error if the required keys are not found.
    """
    return cls(
        audio_file=Path(data["audio_file"]),
        transcript=data["transcript"],
        start_ms=data.get("start_ms"),
        stop_ms=data.get("stop_ms"),
    )
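For example, a round trip through to_dict and from_dict (illustrative values only):

from pathlib import Path

from elpis.models import Annotation

original = Annotation(
    audio_file=Path("utt1.wav"), transcript="hello", start_ms=0, stop_ms=1500
)
data = original.to_dict()  # audio_file is serialized as the string "utt1.wav"
restored = Annotation.from_dict(data)
assert restored == original  # dataclass equality compares field values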
Methods
def is_timed(self) ‑> bool
-
Returns true iff the annotation exists between a start and stop time for the given recording.
def is_timed(self) -> bool:
    """Returns true iff the annotation exists between a start and stop time
    for the given recording.
    """
    return self.start_ms is not None and self.stop_ms is not None
def to_dict(self) ‑> Dict[str, Any]
-
Converts an annotation to a serializable dictionary
def to_dict(self) -> Dict[str, Any]:
    """Converts an annotation to a serializable dictionary"""
    result = dict(self.__dict__)
    result["audio_file"] = str(self.audio_file)
    return result
class DataArguments (dataset_name_or_path: str, dataset_config_name: Optional[str] = None, stream_dataset: bool = False, train_split_name: str = 'train+validation', eval_split_name: str = 'test', audio_column_name: str = 'audio', text_column_name: str = 'text', overwrite_cache: bool = False, preprocessing_num_workers: Optional[int] = None, max_train_samples: Optional[int] = None, max_eval_samples: Optional[int] = None, do_clean: bool = True, words_to_remove: Optional[List[str]] = <factory>, chars_to_remove: Optional[List[str]] = <factory>, chars_to_explode: Optional[List[str]] = <factory>, do_lower_case: Optional[bool] = None, eval_metrics: List[str] = <factory>, max_duration_in_seconds: float = 20.0, min_duration_in_seconds: float = 0.0, preprocessing_only: bool = False, token: Optional[str] = None, use_auth_token: Optional[bool] = None, trust_remote_code: bool = False, unk_token: str = '[UNK]', pad_token: str = '[PAD]', word_delimiter_token: str = '|', phoneme_language: Optional[str] = None)
-
Arguments pertaining to what data we are going to input our model for training and eval.
Using HfArgumentParser we can turn this class into argparse arguments to be able to specify them on the command line.
@dataclass
class DataArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class into argparse arguments to be able to
    specify them on the command line.
    """

    dataset_name_or_path: str = field(
        metadata={
            "help": "If a path, the path to a directory containing the dataset files. "
            "Otherwise- the name of the dataset to use (via the datasets library)."
        }
    )
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the dataset to use (via the datasets library)."
        },
    )
    stream_dataset: bool = field(
        default=False,
        metadata={
            "help": "Whether to stream the dataset as opposed to downloading it all at once."
        },
    )
    train_split_name: str = field(
        default="train+validation",
        metadata={
            "help": (
                "The name of the training data set split to use (via the datasets library). Defaults to "
                "'train+validation'"
            )
        },
    )
    eval_split_name: str = field(
        default="test",
        metadata={
            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
        },
    )
    audio_column_name: str = field(
        default="audio",
        metadata={
            "help": "The name of the dataset column containing the audio data. Defaults to 'audio'"
        },
    )
    text_column_name: str = field(
        default="text",
        metadata={
            "help": "The name of the dataset column containing the text data. Defaults to 'text'"
        },
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached preprocessed datasets or not."},
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of validation examples to this "
                "value if set."
            )
        },
    )
    do_clean: bool = field(
        default=True,
        metadata={"help": "True if the dataset should be cleaned before use."},
    )
    words_to_remove: Optional[List[str]] = list_field(
        default=[],
        metadata={
            "help": "A list of words to remove from the transcripts during dataset cleaning."
        },
    )
    chars_to_remove: Optional[List[str]] = list_field(
        default=[],
        metadata={
            "help": "A list of characters to remove from the transcripts during dataset cleaning."
        },
    )
    chars_to_explode: Optional[List[str]] = list_field(
        default=[],
        metadata={
            "help": "A list of characters to replace with spaces in the transcripts during dataset cleaning."
        },
    )
    do_lower_case: Optional[bool] = field(
        default=None,
        metadata={"help": "Whether the target text should be lower cased."},
    )
    eval_metrics: List[str] = list_field(  # type: ignore
        default=["wer", "cer"],
        metadata={
            "help": "A list of metrics the model should be evaluated on. E.g. `('wer', 'cer')`"
        },
    )
    max_duration_in_seconds: float = field(
        default=20.0,
        metadata={
            "help": (
                "Filter audio files that are longer than `max_duration_in_seconds` seconds to"
                " 'max_duration_in_seconds`"
            )
        },
    )
    min_duration_in_seconds: float = field(
        default=0.0,
        metadata={
            "help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"
        },
    )
    preprocessing_only: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to only do data preprocessing and skip training. This is especially useful when data"
                " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
                " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
                " can consequently be loaded in distributed training"
            )
        },
    )
    token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    use_auth_token: Optional[bool] = field(
        default=None,
        metadata={
            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will"
                "execute code present on the Hub on your local machine."
            )
        },
    )
    unk_token: str = field(
        default="[UNK]",
        metadata={"help": "The unk token for the tokenizer"},
    )
    pad_token: str = field(
        default="[PAD]",
        metadata={"help": "The padding token for the tokenizer"},
    )
    word_delimiter_token: str = field(
        default="|",
        metadata={"help": "The word delimiter token for the tokenizer"},
    )
    phoneme_language: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The target language that should be used be"
                " passed to the tokenizer for tokenization. Note that"
                " this is only relevant if the model classifies the"
                " input audio to a sequence of phoneme sequences."
            )
        },
    )

    def to_dict(self) -> Dict[str, Any]:
        result = dict(self.__dict__)
        return result
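For example, a direct construction sketch; the dataset path and the overridden values are placeholders, and every other field keeps the defaults shown above:

from elpis.models import DataArguments

data_args = DataArguments(
    dataset_name_or_path="datasets/my_language",  # placeholder local dataset directory
    text_column_name="transcription",
    max_duration_in_seconds=15.0,
)
print(data_args.to_dict()["text_column_name"])  # "transcription"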
Class variables
var audio_column_name : str
var chars_to_explode : Optional[List[str]]
var chars_to_remove : Optional[List[str]]
var dataset_config_name : Optional[str]
var dataset_name_or_path : str
var do_clean : bool
var do_lower_case : Optional[bool]
var eval_metrics : List[str]
var eval_split_name : str
var max_duration_in_seconds : float
var max_eval_samples : Optional[int]
var max_train_samples : Optional[int]
var min_duration_in_seconds : float
var overwrite_cache : bool
var pad_token : str
var phoneme_language : Optional[str]
var preprocessing_num_workers : Optional[int]
var preprocessing_only : bool
var stream_dataset : bool
var text_column_name : str
var token : Optional[str]
var train_split_name : str
var trust_remote_code : bool
var unk_token : str
var use_auth_token : Optional[bool]
var word_delimiter_token : str
var words_to_remove : Optional[List[str]]
Methods
def to_dict(self) ‑> Dict[str, Any]
-
def to_dict(self) -> Dict[str, Any]:
    result = dict(self.__dict__)
    return result
class ElanOptions (selection_mechanism: ElanTierSelector, selection_value: str)
-
A class representing options for how to extract utterance information from an elan file.
@dataclass
class ElanOptions:
    """A class representing options for how to extract utterance information
    from an elan file."""

    selection_mechanism: ElanTierSelector
    selection_value: str

    @classmethod
    def from_dict(cls, data: Dict[str, str]) -> "ElanOptions":
        return cls(
            selection_mechanism=ElanTierSelector(data["selection_mechanism"]),
            selection_value=data["selection_value"],
        )

    def to_dict(self) -> Dict[str, str]:
        result = dict(self.__dict__)
        result["selection_mechanism"] = self.selection_mechanism.value
        return result
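For example, selecting utterances from a tier by its name (the tier name "Phrase" is only an illustration):

from elpis.models import ElanOptions, ElanTierSelector

options = ElanOptions(
    selection_mechanism=ElanTierSelector.NAME,
    selection_value="Phrase",
)
print(options.to_dict())
# {'selection_mechanism': 'tier_name', 'selection_value': 'Phrase'}
assert ElanOptions.from_dict(options.to_dict()) == options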
Class variables
var selection_mechanism : ElanTierSelector
var selection_value : str
Static methods
def from_dict(data: Dict[str, str]) ‑> ElanOptions
-
@classmethod
def from_dict(cls, data: Dict[str, str]) -> "ElanOptions":
    return cls(
        selection_mechanism=ElanTierSelector(data["selection_mechanism"]),
        selection_value=data["selection_value"],
    )
Methods
def to_dict(self) ‑> Dict[str, str]
-
def to_dict(self) -> Dict[str, str]:
    result = dict(self.__dict__)
    result["selection_mechanism"] = self.selection_mechanism.value
    return result
class ElanTierSelector (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
A class representing a method of selecting elan tiers
class ElanTierSelector(Enum):
    """A class representing a method of selecting elan tiers"""

    ORDER = "tier_order"
    TYPE = "tier_type"
    NAME = "tier_name"
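The members wrap the string values used in serialized ElanOptions, so they can be looked up by value:

from elpis.models import ElanTierSelector

assert ElanTierSelector("tier_name") is ElanTierSelector.NAME
assert ElanTierSelector.TYPE.value == "tier_type"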
Ancestors
- enum.Enum
Class variables
var NAME
var ORDER
var TYPE
class Job (model_args: ModelArguments, data_args: DataArguments, training_args: TrainingArguments)
-
Generic class which encapsulates elpis training functionality
@dataclass
class Job:
    """Generic class which encapsulates elpis training functionality"""

    model_args: ModelArguments
    data_args: DataArguments
    training_args: TrainingArguments

    @staticmethod
    def parser():
        return HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))  # type: ignore

    @classmethod
    def from_args(cls, args=None) -> Job:
        (
            model_args,
            data_args,
            training_args,
        ) = Job.parser().parse_args_into_dataclasses(args)
        return cls(
            model_args=model_args, data_args=data_args, training_args=training_args
        )

    @classmethod
    def from_json(cls, file: Path) -> Job:
        (
            model_args,
            data_args,
            training_args,
        ) = Job.parser().parse_json_file(str(file))
        return cls(
            model_args=model_args, data_args=data_args, training_args=training_args
        )

    def save(self, path: Path, overwrite=True) -> None:
        if not overwrite and path.is_file():
            return
        with open(path, "w") as out_file:
            json.dump(self.to_dict(), out_file)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Job:
        (
            model_args,
            data_args,
            training_args,
        ) = Job.parser().parse_dict(data)
        return cls(
            model_args=model_args, data_args=data_args, training_args=training_args
        )

    def to_dict(self) -> Dict[str, Any]:
        return (
            self.training_args.to_dict()
            | self.data_args.to_dict()
            | self.model_args.to_dict()
        )

    def __eq__(self, __value: object) -> bool:
        if not isinstance(__value, Job):
            return False

        job = __value
        return (
            self.training_args.to_dict() == job.training_args.to_dict()
            and self.model_args == job.model_args
            and self.data_args == job.data_args
        )
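A construction sketch via from_dict. The model name, dataset path and output directory are placeholders, and output_dir is assumed to be the only TrainingArguments field the installed transformers version requires:

from elpis.models import Job

# Keys are routed to ModelArguments, DataArguments and
# transformers.TrainingArguments by the underlying HfArgumentParser.
job = Job.from_dict(
    {
        "model_name_or_path": "facebook/wav2vec2-base",  # placeholder model id
        "dataset_name_or_path": "datasets/my_language",  # placeholder dataset path
        "output_dir": "output",                          # consumed by TrainingArguments
    }
)
print(job.model_args.model_name_or_path)  # "facebook/wav2vec2-base"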
Class variables
var data_args : DataArguments
var model_args : ModelArguments
var training_args : transformers.training_args.TrainingArguments
Static methods
def from_args(args=None) ‑> Job
-
Builds a Job by parsing command-line arguments (or the supplied argument list) into the three argument dataclasses.

@classmethod
def from_args(cls, args=None) -> Job:
    (
        model_args,
        data_args,
        training_args,
    ) = Job.parser().parse_args_into_dataclasses(args)
    return cls(
        model_args=model_args, data_args=data_args, training_args=training_args
    )
def from_dict(data: Dict[str, Any]) ‑> Job
-
Builds a Job from a dictionary of argument values.

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> Job:
    (
        model_args,
        data_args,
        training_args,
    ) = Job.parser().parse_dict(data)
    return cls(
        model_args=model_args, data_args=data_args, training_args=training_args
    )
def from_json(file: Path) ‑> Job
-
Builds a Job from a JSON file of argument values.

@classmethod
def from_json(cls, file: Path) -> Job:
    (
        model_args,
        data_args,
        training_args,
    ) = Job.parser().parse_json_file(str(file))
    return cls(
        model_args=model_args, data_args=data_args, training_args=training_args
    )
def parser()
-
Returns an HfArgumentParser over ModelArguments, DataArguments and TrainingArguments.

@staticmethod
def parser():
    return HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))  # type: ignore
Methods
def save(self, path: Path, overwrite=True) ‑> None
-
Serializes the job to the given path as JSON. If overwrite is False and the file already exists, nothing is written.

def save(self, path: Path, overwrite=True) -> None:
    if not overwrite and path.is_file():
        return
    with open(path, "w") as out_file:
        json.dump(self.to_dict(), out_file)
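A save sketch, assuming a job built as in the earlier example; a saved file can later be reloaded with Job.from_json, provided every serialized key is accepted back by the parser:

from pathlib import Path

from elpis.models import Job

job = Job.from_dict(
    {
        "model_name_or_path": "facebook/wav2vec2-base",  # placeholder
        "dataset_name_or_path": "datasets/my_language",  # placeholder
        "output_dir": "output",
    }
)
path = Path("training_job.json")
job.save(path)                   # writes the merged to_dict() output as JSON
job.save(path, overwrite=False)  # returns early: the file already exists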
def to_dict(self) ‑> Dict[str, Any]
-
Merges the training, data and model arguments into a single serializable dictionary.

def to_dict(self) -> Dict[str, Any]:
    return (
        self.training_args.to_dict()
        | self.data_args.to_dict()
        | self.model_args.to_dict()
    )
class ModelArguments (model_name_or_path: str, tokenizer_name_or_path: Optional[str] = None, cache_dir: Optional[str] = None, freeze_feature_encoder: bool = True, attention_dropout: float = 0.0, activation_dropout: float = 0.0, feat_proj_dropout: float = 0.0, hidden_dropout: float = 0.0, final_dropout: float = 0.0, mask_time_prob: float = 0.05, mask_time_length: int = 10, mask_feature_prob: float = 0.0, mask_feature_length: int = 10, layerdrop: float = 0.0, ctc_loss_reduction: Optional[str] = 'mean', ctc_zero_infinity: bool = False)
-
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models"
        }
    )
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"
        },
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": "Where do you want to store the pretrained models downloaded from huggingface.co"
        },
    )
    freeze_feature_encoder: bool = field(
        default=True,
        metadata={"help": "Whether to freeze the feature encoder layers of the model."},
    )
    attention_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout ratio for the attention probabilities."},
    )
    activation_dropout: float = field(
        default=0.0,
        metadata={
            "help": "The dropout ratio for activations inside the fully connected layer."
        },
    )
    feat_proj_dropout: float = field(
        default=0.0, metadata={"help": "The dropout ratio for the projected features."}
    )
    hidden_dropout: float = field(
        default=0.0,
        metadata={
            "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
        },
    )
    final_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout probability for the final projection layer."},
    )
    mask_time_prob: float = field(
        default=0.05,
        metadata={
            "help": (
                "Probability of each feature vector along the time axis to be chosen as the start of the vector"
                "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
                "vectors will be masked along the time axis."
            )
        },
    )
    mask_time_length: int = field(
        default=10,
        metadata={"help": "Length of vector span to mask along the time axis."},
    )
    mask_feature_prob: float = field(
        default=0.0,
        metadata={
            "help": (
                "Probability of each feature vector along the feature axis to be chosen as the start of the vectorspan"
                " to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature"
                " bins will be masked along the time axis."
            )
        },
    )
    mask_feature_length: int = field(
        default=10,
        metadata={"help": "Length of vector span to mask along the feature axis."},
    )
    layerdrop: float = field(
        default=0.0, metadata={"help": "The LayerDrop probability."}
    )
    ctc_loss_reduction: Optional[str] = field(
        default="mean",
        metadata={
            "help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."
        },
    )
    ctc_zero_infinity: bool = field(
        default=False,
        metadata={
            "help": "Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. "
            "Infinite losses mainly occur when the inputs are too short to be aligned to the targets. "
            "Only relevant when training an instance of Wav2Vec2ForCTC."
        },
    )

    def to_dict(self) -> Dict[str, Any]:
        result = dict(self.__dict__)
        return result
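A construction sketch (the checkpoint name is a placeholder; unset fields keep the defaults shown above):

from elpis.models import ModelArguments

model_args = ModelArguments(
    model_name_or_path="facebook/wav2vec2-base",  # placeholder checkpoint
    attention_dropout=0.1,
    ctc_loss_reduction="mean",
)
print(model_args.to_dict()["freeze_feature_encoder"])  # True (the default)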
Class variables
var activation_dropout : float
var attention_dropout : float
var cache_dir : Optional[str]
var ctc_loss_reduction : Optional[str]
var ctc_zero_infinity : bool
var feat_proj_dropout : float
var final_dropout : float
var freeze_feature_encoder : bool
var layerdrop : float
var mask_feature_length : int
var mask_feature_prob : float
var mask_time_length : int
var mask_time_prob : float
var model_name_or_path : str
var tokenizer_name_or_path : Optional[str]
Methods
def to_dict(self) ‑> Dict[str, Any]
-
def to_dict(self) -> Dict[str, Any]:
    result = dict(self.__dict__)
    return result
class Vocab (vocab: Dict[str, int])
-
A class which represents a dictionary of encountered tokens in a dataset.
@dataclass
class Vocab:
    """A class which represents a dictionary of encountered tokens in a dataset."""

    vocab: Dict[str, int]

    @property
    def symbols(self) -> Set[str]:
        return set(self.vocab.keys())

    def merge(self, other: "Vocab") -> "Vocab":
        """Creates a new Vocab which includes all symbols in the merged two."""
        vocab = self.symbols | other.symbols
        return Vocab.from_set(vocab)

    def save(self, path: Path) -> None:
        """Saves the vocab to the supplied path.

        If the path is a folder, saves as vocab.json, within it.
        """
        if path.is_dir():
            path /= VOCAB_FILE

        with open(path, "w") as out:
            json.dump(self.vocab, out)

    def add(self, char: str) -> None:
        """Adds a new character into the vocab."""
        if char in self.vocab:
            return

        self.vocab[char] = len(self.vocab)

    def replace(self, original: str, replacement: str) -> None:
        """Replaces the supplied character mapping in the vocab."""
        if original not in self.vocab or original == replacement:
            return

        self.vocab[replacement] = self.vocab[original]
        self.vocab.pop(original)

    @classmethod
    def from_set(cls, symbols: Set[str]) -> "Vocab":
        """Builds a vocab from a set of symbols."""
        vocab = {symbol: index for index, symbol in enumerate(sorted(symbols))}
        return cls(vocab=vocab)

    @classmethod
    def from_strings(cls, texts: Iterable[str]) -> "Vocab":
        """Builds a vocab from an iterable text collection."""

        def reducer(result: Set[str], text: str) -> Set[str]:
            return result | set(text)

        symbols = reduce(reducer, texts, set())
        return cls.from_set(symbols)
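A minimal sketch of building and extending a vocab:

from elpis.models import Vocab

vocab = Vocab.from_set({"a", "b", "c"})
print(vocab.vocab)       # {'a': 0, 'b': 1, 'c': 2} - sorted symbols, sequential indices
print(vocab.symbols)     # {'a', 'b', 'c'}

vocab.add("d")           # appended with the next free index
vocab.add("a")           # already present: no change
print(vocab.vocab["d"])  # 3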
Class variables
var vocab : Dict[str, int]
Static methods
def from_set(symbols: Set[str]) ‑> Vocab
-
Builds a vocab from a set of symbols.
@classmethod
def from_set(cls, symbols: Set[str]) -> "Vocab":
    """Builds a vocab from a set of symbols."""
    vocab = {symbol: index for index, symbol in enumerate(sorted(symbols))}
    return cls(vocab=vocab)
def from_strings(texts: Iterable[str]) ‑> Vocab
-
Builds a vocab from an iterable text collection.
@classmethod
def from_strings(cls, texts: Iterable[str]) -> "Vocab":
    """Builds a vocab from an iterable text collection."""

    def reducer(result: Set[str], text: str) -> Set[str]:
        return result | set(text)

    symbols = reduce(reducer, texts, set())
    return cls.from_set(symbols)
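For example, building a character vocab directly from transcripts:

from elpis.models import Vocab

vocab = Vocab.from_strings(["aba", "cab"])
print(vocab.vocab)  # {'a': 0, 'b': 1, 'c': 2} - the unique characters, sorted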
Instance variables
var symbols : Set[str]
-
The set of symbols currently in the vocab.

@property
def symbols(self) -> Set[str]:
    return set(self.vocab.keys())
Methods
def add(self, char: str) ‑> None
-
Adds a new character into the vocab.
def add(self, char: str) -> None:
    """Adds a new character into the vocab."""
    if char in self.vocab:
        return

    self.vocab[char] = len(self.vocab)
def merge(self, other: Vocab) ‑> Vocab
-
Creates a new Vocab which includes all symbols in the merged two.
def merge(self, other: "Vocab") -> "Vocab":
    """Creates a new Vocab which includes all symbols in the merged two."""
    vocab = self.symbols | other.symbols
    return Vocab.from_set(vocab)
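For example (note that indices are reassigned from the sorted union of symbols):

from elpis.models import Vocab

merged = Vocab.from_set({"a", "b"}).merge(Vocab.from_set({"b", "c"}))
print(merged.vocab)  # {'a': 0, 'b': 1, 'c': 2}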
def replace(self, original: str, replacement: str) ‑> None
-
Replaces the supplied character mapping in the vocab.
def replace(self, original: str, replacement: str) -> None:
    """Replaces the supplied character mapping in the vocab."""
    if original not in self.vocab or original == replacement:
        return

    self.vocab[replacement] = self.vocab[original]
    self.vocab.pop(original)
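For example, remapping the space character to a word delimiter token while keeping its index:

from elpis.models import Vocab

vocab = Vocab.from_set({" ", "a", "b"})
vocab.replace(" ", "|")  # "|" takes over the index previously held by " "
print(vocab.vocab)       # {'a': 1, 'b': 2, '|': 0}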
def save(self, path: pathlib.Path) ‑> None
-
Saves the vocab to the supplied path.
If the path is a folder, saves as vocab.json, within it.
def save(self, path: Path) -> None:
    """Saves the vocab to the supplied path.

    If the path is a folder, saves as vocab.json, within it.
    """
    if path.is_dir():
        path /= VOCAB_FILE

    with open(path, "w") as out:
        json.dump(self.vocab, out)
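For example, saving into a directory (a temporary one here), where the file is written under the VOCAB_FILE name (vocab.json, as documented above):

import tempfile
from pathlib import Path

from elpis.models import VOCAB_FILE, Vocab

vocab = Vocab.from_strings(["hello world"])
with tempfile.TemporaryDirectory() as folder:
    vocab.save(Path(folder))                     # directory: saved as vocab.json inside it
    print((Path(folder) / VOCAB_FILE).exists())  # True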