Module elpis.models.job
Source code
from __future__ import annotations

import json
from copy import copy
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional

from transformers import HfArgumentParser, TrainingArguments


def list_field(default=None, metadata=None):
    return field(default_factory=lambda: default, metadata=metadata)

@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models"
        }
    )
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"
        },
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": "Where to store the pretrained models downloaded from huggingface.co"
        },
    )
    freeze_feature_encoder: bool = field(
        default=True,
        metadata={"help": "Whether to freeze the feature encoder layers of the model."},
    )
    attention_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout ratio for the attention probabilities."},
    )
    activation_dropout: float = field(
        default=0.0,
        metadata={
            "help": "The dropout ratio for activations inside the fully connected layer."
        },
    )
    feat_proj_dropout: float = field(
        default=0.0, metadata={"help": "The dropout ratio for the projected features."}
    )
    hidden_dropout: float = field(
        default=0.0,
        metadata={
            "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
        },
    )
    final_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout probability for the final projection layer."},
    )
    mask_time_prob: float = field(
        default=0.05,
        metadata={
            "help": (
                "Probability of each feature vector along the time axis to be chosen as the start of the vector "
                "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature "
                "vectors will be masked along the time axis."
            )
        },
    )
    mask_time_length: int = field(
        default=10,
        metadata={"help": "Length of vector span to mask along the time axis."},
    )
    mask_feature_prob: float = field(
        default=0.0,
        metadata={
            "help": (
                "Probability of each feature vector along the feature axis to be chosen as the start of the vector "
                "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` "
                "feature bins will be masked along the feature axis."
            )
        },
    )
    mask_feature_length: int = field(
        default=10,
        metadata={"help": "Length of vector span to mask along the feature axis."},
    )
    layerdrop: float = field(
        default=0.0, metadata={"help": "The LayerDrop probability."}
    )
    ctc_loss_reduction: Optional[str] = field(
        default="mean",
        metadata={
            "help": "The way the CTC loss should be reduced. Should be one of 'mean' or 'sum'."
        },
    )
    ctc_zero_infinity: bool = field(
        default=False,
        metadata={
            "help": "Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. "
            "Infinite losses mainly occur when the inputs are too short to be aligned to the targets. "
            "Only relevant when training an instance of Wav2Vec2ForCTC."
        },
    )

    def to_dict(self) -> Dict[str, Any]:
        result = dict(self.__dict__)
        return result

@dataclass
class DataArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    dataset_name_or_path: str = field(
        metadata={
            "help": "If a path, the path to a directory containing the dataset files. "
            "Otherwise, the name of the dataset to use (via the datasets library)."
        }
    )
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the dataset to use (via the datasets library)."
        },
    )
    stream_dataset: bool = field(
        default=False,
        metadata={
            "help": "Whether to stream the dataset as opposed to downloading it all at once."
        },
    )
    train_split_name: str = field(
        default="train+validation",
        metadata={
            "help": (
                "The name of the training data set split to use (via the datasets library). Defaults to "
                "'train+validation'."
            )
        },
    )
    eval_split_name: str = field(
        default="test",
        metadata={
            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'."
        },
    )
    audio_column_name: str = field(
        default="audio",
        metadata={
            "help": "The name of the dataset column containing the audio data. Defaults to 'audio'."
        },
    )
    text_column_name: str = field(
        default="text",
        metadata={
            "help": "The name of the dataset column containing the text data. Defaults to 'text'."
        },
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached preprocessed datasets or not."},
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of validation examples to this "
                "value if set."
            )
        },
    )
    do_clean: bool = field(
        default=True,
        metadata={"help": "True if the dataset should be cleaned before use."},
    )
    words_to_remove: Optional[List[str]] = list_field(
        default=[],
        metadata={
            "help": "A list of words to remove from the transcripts during dataset cleaning."
        },
    )
    chars_to_remove: Optional[List[str]] = list_field(
        default=[],
        metadata={
            "help": "A list of characters to remove from the transcripts during dataset cleaning."
        },
    )
    chars_to_explode: Optional[List[str]] = list_field(
        default=[],
        metadata={
            "help": "A list of characters to replace with spaces in the transcripts during dataset cleaning."
        },
    )
    do_lower_case: Optional[bool] = field(
        default=None,
        metadata={"help": "Whether the target text should be lower cased."},
    )
    eval_metrics: List[str] = list_field(  # type: ignore
        default=["wer", "cer"],
        metadata={
            "help": "A list of metrics the model should be evaluated on. E.g. `('wer', 'cer')`"
        },
    )
    max_duration_in_seconds: float = field(
        default=20.0,
        metadata={
            "help": "Filter out audio files that are longer than `max_duration_in_seconds` seconds."
        },
    )
    min_duration_in_seconds: float = field(
        default=0.0,
        metadata={
            "help": "Filter out audio files that are shorter than `min_duration_in_seconds` seconds."
        },
    )
    preprocessing_only: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to only do data preprocessing and skip training. This is especially useful when data"
                " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
                " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
                " can consequently be loaded in distributed training."
            )
        },
    )
    token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    use_auth_token: Optional[bool] = field(
        default=None,
        metadata={
            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
                "execute code present on the Hub on your local machine."
            )
        },
    )
    unk_token: str = field(
        default="[UNK]",
        metadata={"help": "The unk token for the tokenizer."},
    )
    pad_token: str = field(
        default="[PAD]",
        metadata={"help": "The padding token for the tokenizer."},
    )
    word_delimiter_token: str = field(
        default="|",
        metadata={"help": "The word delimiter token for the tokenizer."},
    )
    phoneme_language: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The target language that should be passed to the tokenizer for tokenization. Note that"
                " this is only relevant if the model classifies the input audio to a sequence of phoneme sequences."
            )
        },
    )

    def to_dict(self) -> Dict[str, Any]:
        result = dict(self.__dict__)
        return result

@dataclass
class Job:
    """Generic class which encapsulates elpis training functionality"""

    model_args: ModelArguments
    data_args: DataArguments
    training_args: TrainingArguments

    @staticmethod
    def parser():
        return HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))  # type: ignore

    @classmethod
    def from_args(cls, args=None) -> Job:
        (
            model_args,
            data_args,
            training_args,
        ) = Job.parser().parse_args_into_dataclasses(args)
        return cls(
            model_args=model_args, data_args=data_args, training_args=training_args
        )

    @classmethod
    def from_json(cls, file: Path) -> Job:
        (
            model_args,
            data_args,
            training_args,
        ) = Job.parser().parse_json_file(str(file))
        return cls(
            model_args=model_args, data_args=data_args, training_args=training_args
        )

    def save(self, path: Path, overwrite=True) -> None:
        if not overwrite and path.is_file():
            return
        with open(path, "w") as out_file:
            json.dump(self.to_dict(), out_file)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Job:
        (
            model_args,
            data_args,
            training_args,
        ) = Job.parser().parse_dict(data)
        return cls(
            model_args=model_args, data_args=data_args, training_args=training_args
        )

    def to_dict(self) -> Dict[str, Any]:
        return (
            self.training_args.to_dict()
            | self.data_args.to_dict()
            | self.model_args.to_dict()
        )

    def __eq__(self, __value: object) -> bool:
        if not isinstance(__value, Job):
            return False
        job = __value
        return (
            self.training_args.to_dict() == job.training_args.to_dict()
            and self.model_args == job.model_args
            and self.data_args == job.data_args
        )
Functions
def list_field(default=None, metadata=None)
-
Helper that wraps `dataclasses.field` with a `default_factory`, so that mutable list defaults can be used on the dataclass fields below.
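Dataclasses reject mutable defaults such as `[]`, so `list_field` hides the value behind a `default_factory`. A minimal sketch of why that matters; the `CleaningArgs` dataclass below is hypothetical and not part of this module:

from dataclasses import dataclass, field
from typing import List

def list_field(default=None, metadata=None):
    return field(default_factory=lambda: default, metadata=metadata)

@dataclass
class CleaningArgs:
    # words: List[str] = field(default=[]) would raise
    # ValueError: mutable default <class 'list'> for field words is not allowed
    words: List[str] = list_field(default=[], metadata={"help": "Words to strip."})

print(CleaningArgs())          # CleaningArgs(words=[])
print(CleaningArgs(["um"]))    # CleaningArgs(words=['um'])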
Classes
class DataArguments (dataset_name_or_path: str, dataset_config_name: Optional[str] = None, stream_dataset: bool = False, train_split_name: str = 'train+validation', eval_split_name: str = 'test', audio_column_name: str = 'audio', text_column_name: str = 'text', overwrite_cache: bool = False, preprocessing_num_workers: Optional[int] = None, max_train_samples: Optional[int] = None, max_eval_samples: Optional[int] = None, do_clean: bool = True, words_to_remove: Optional[List[str]] = <factory>, chars_to_remove: Optional[List[str]] = <factory>, chars_to_explode: Optional[List[str]] = <factory>, do_lower_case: Optional[bool] = None, eval_metrics: List[str] = <factory>, max_duration_in_seconds: float = 20.0, min_duration_in_seconds: float = 0.0, preprocessing_only: bool = False, token: Optional[str] = None, use_auth_token: Optional[bool] = None, trust_remote_code: bool = False, unk_token: str = '[UNK]', pad_token: str = '[PAD]', word_delimiter_token: str = '|', phoneme_language: Optional[str] = None)
-
Arguments pertaining to what data we are going to input our model for training and eval. Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command line.
Class variables
var audio_column_name : str
var chars_to_explode : Optional[List[str]]
var chars_to_remove : Optional[List[str]]
var dataset_config_name : Optional[str]
var dataset_name_or_path : str
var do_clean : bool
var do_lower_case : Optional[bool]
var eval_metrics : List[str]
var eval_split_name : str
var max_duration_in_seconds : float
var max_eval_samples : Optional[int]
var max_train_samples : Optional[int]
var min_duration_in_seconds : float
var overwrite_cache : bool
var pad_token : str
var phoneme_language : Optional[str]
var preprocessing_num_workers : Optional[int]
var preprocessing_only : bool
var stream_dataset : bool
var text_column_name : str
var token : Optional[str]
var train_split_name : str
var trust_remote_code : bool
var unk_token : str
var use_auth_token : Optional[bool]
var word_delimiter_token : str
var words_to_remove : Optional[List[str]]
Methods
def to_dict(self) ‑> Dict[str, Any]
-
Return a shallow copy of the dataclass fields as a dictionary.
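A short, hedged sketch of constructing `DataArguments` directly in Python; the dataset path and column name are placeholders:

data_args = DataArguments(
    dataset_name_or_path="path/to/dataset",   # placeholder: local directory or Hub dataset name
    text_column_name="transcription",         # placeholder column name
    chars_to_remove=[",", "?", "!"],
    max_duration_in_seconds=15.0,
)
print(data_args.to_dict()["text_column_name"])  # 'transcription'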
class Job (model_args: ModelArguments, data_args: DataArguments, training_args: TrainingArguments)
-
Generic class which encapsulates elpis training functionality
Class variables
var data_args : DataArguments
var model_args : ModelArguments
var training_args : transformers.training_args.TrainingArguments
Static methods
def from_args(args=None) ‑> Job
-
Parse command-line-style arguments (or `sys.argv` when `args` is `None`) into the three argument dataclasses and return a `Job`. See the sketch below.
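A sketch of parsing CLI-style arguments into a `Job`; the checkpoint and paths are placeholders, and `--output_dir` is included because `TrainingArguments` normally requires it:

job = Job.from_args([
    "--model_name_or_path", "facebook/wav2vec2-base",  # placeholder checkpoint
    "--dataset_name_or_path", "path/to/dataset",       # placeholder dataset path
    "--output_dir", "output",                          # required by TrainingArguments
    "--num_train_epochs", "3",
])
print(job.model_args.model_name_or_path)
print(job.training_args.num_train_epochs)  # 3.0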
def from_dict(data: Dict[str, Any]) ‑> Job
-
Build a `Job` from a dictionary of argument values via `HfArgumentParser.parse_dict`.
def from_json(file: Path) ‑> Job
-
Build a `Job` from a JSON file of argument values via `HfArgumentParser.parse_json_file`.
def parser()
-
Return an `HfArgumentParser` over `(ModelArguments, DataArguments, TrainingArguments)`.
Methods
def save(self, path: Path, overwrite=True) ‑> None
-
Serialise the job to JSON at `path`. If `overwrite` is `False` and the file already exists, nothing is written.
def to_dict(self) ‑> Dict[str, Any]
-
Merge the training, data and model argument dictionaries into a single flat dictionary. See the round-trip sketch below.
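A sketch of round-tripping a `Job` through a flat dictionary and a JSON file (values are placeholders). Because `to_dict` merges all three argument groups, the saved file can be fed straight back to `from_json` or `from_dict`:

from pathlib import Path

config = {
    "model_name_or_path": "facebook/wav2vec2-base",  # placeholder checkpoint
    "dataset_name_or_path": "path/to/dataset",       # placeholder dataset path
    "output_dir": "output",                          # required by TrainingArguments
}
job = Job.from_dict(config)

job_file = Path("job.json")
job.save(job_file)                  # writes the merged to_dict() as JSON
restored = Job.from_json(job_file)
print(restored == job)              # True when all argument groups round-trip cleanly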
class ModelArguments (model_name_or_path: str, tokenizer_name_or_path: Optional[str] = None, cache_dir: Optional[str] = None, freeze_feature_encoder: bool = True, attention_dropout: float = 0.0, activation_dropout: float = 0.0, feat_proj_dropout: float = 0.0, hidden_dropout: float = 0.0, final_dropout: float = 0.0, mask_time_prob: float = 0.05, mask_time_length: int = 10, mask_feature_prob: float = 0.0, mask_feature_length: int = 10, layerdrop: float = 0.0, ctc_loss_reduction: Optional[str] = 'mean', ctc_zero_infinity: bool = False)
-
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
Class variables
var activation_dropout : float
var attention_dropout : float
var cache_dir : Optional[str]
var ctc_loss_reduction : Optional[str]
var ctc_zero_infinity : bool
var feat_proj_dropout : float
var final_dropout : float
var freeze_feature_encoder : bool
var layerdrop : float
var mask_feature_length : int
var mask_feature_prob : float
var mask_time_length : int
var mask_time_prob : float
var model_name_or_path : str
var tokenizer_name_or_path : Optional[str]
Methods
def to_dict(self) ‑> Dict[str, Any]
-
Return a shallow copy of the dataclass fields as a dictionary.
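A short sketch of overriding a few regularisation settings when constructing `ModelArguments`; the checkpoint name is a placeholder:

model_args = ModelArguments(
    model_name_or_path="facebook/wav2vec2-base",  # placeholder checkpoint
    attention_dropout=0.1,
    hidden_dropout=0.1,
    mask_time_prob=0.075,
    freeze_feature_encoder=True,
)
print(model_args.to_dict()["mask_time_prob"])  # 0.075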