Module elpis.models
from elpis.models.annotation import Annotation
from elpis.models.elan_options import ElanOptions, ElanTierSelector
from elpis.models.job import DataArguments, Job, ModelArguments
from elpis.models.vocab import VOCAB_FILE, Vocab
__all__ = [
"Annotation",
"ElanOptions",
"ElanTierSelector",
"Job",
"Vocab",
"VOCAB_FILE",
"DataArguments",
"ModelArguments",
]
Sub-modules
elpis.models.annotation
elpis.models.elan_options
elpis.models.job
elpis.models.vocab
Classes
class Annotation (audio_file: Path, transcript: str, start_ms: Optional[int] = None, stop_ms: Optional[int] = None)
-
A class which represents a section of speech for a given audio file and sample rate. If start_ms and stop_ms aren't specified, it is assumed that the Annotation spans the entire audio file.
@dataclass
class Annotation:
    """A class which represents a section of speech for a given audio file
    and sample rate. If start_ms and stop_ms aren't specified, it is assumed
    that the Annotation spans the entire audio file.
    """

    audio_file: Path
    transcript: str
    start_ms: Optional[int] = None
    stop_ms: Optional[int] = None

    def is_timed(self) -> bool:
        """Returns true iff the annotation exists between a start and stop time
        for the given recording.
        """
        return self.start_ms is not None and self.stop_ms is not None

    def to_dict(self) -> Dict[str, Any]:
        """Converts an annotation to a serializable dictionary"""
        result = dict(self.__dict__)
        result["audio_file"] = str(self.audio_file)
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Annotation:
        """Builds an annotation from a serializable dictionary

        Throws an error if the required keys are not found.
        """
        return cls(
            audio_file=Path(data["audio_file"]),
            transcript=data["transcript"],
            start_ms=data.get("start_ms"),
            stop_ms=data.get("stop_ms"),
        )
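A minimal usage sketch (the audio paths and transcripts below are placeholder values, not files shipped with elpis):

from pathlib import Path

from elpis.models import Annotation

# A timed annotation covering 0-1500 ms of a (hypothetical) recording.
annotation = Annotation(
    audio_file=Path("recordings/utt1.wav"),
    transcript="hello world",
    start_ms=0,
    stop_ms=1500,
)
assert annotation.is_timed()

# Omitting start_ms/stop_ms means the annotation spans the whole file.
untimed = Annotation(audio_file=Path("recordings/utt2.wav"), transcript="hello")
assert not untimed.is_timed()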
Class variables
var audio_file : pathlib.Path
var start_ms : Optional[int]
var stop_ms : Optional[int]
var transcript : str
Static methods
def from_dict(data: Dict[str, Any]) ‑> Annotation
-
Builds an annotation from a serializable dictionary
Throws an error if the required keys are not found.
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> Annotation:
    """Builds an annotation from a serializable dictionary

    Throws an error if the required keys are not found.
    """
    return cls(
        audio_file=Path(data["audio_file"]),
        transcript=data["transcript"],
        start_ms=data.get("start_ms"),
        stop_ms=data.get("stop_ms"),
    )
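For example, a round trip through to_dict and from_dict (illustrative values only):

from pathlib import Path

from elpis.models import Annotation

original = Annotation(
    audio_file=Path("utt1.wav"), transcript="hello", start_ms=0, stop_ms=1500
)
data = original.to_dict()  # audio_file is serialized as the string "utt1.wav"
restored = Annotation.from_dict(data)
assert restored == original  # dataclass equality compares field values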
Methods
def is_timed(self) ‑> bool
-
Returns true iff the annotation exists between a start and stop time for the given recording.
def is_timed(self) -> bool:
    """Returns true iff the annotation exists between a start and stop time
    for the given recording.
    """
    return self.start_ms is not None and self.stop_ms is not None
def to_dict(self) ‑> Dict[str, Any]
-
Converts an annotation to a serializable dictionary
def to_dict(self) -> Dict[str, Any]:
    """Converts an annotation to a serializable dictionary"""
    result = dict(self.__dict__)
    result["audio_file"] = str(self.audio_file)
    return result
class DataArguments (dataset_name_or_path: str, dataset_config_name: Optional[str] = None, stream_dataset: bool = False, train_split_name: str = 'train+validation', eval_split_name: str = 'test', audio_column_name: str = 'audio', text_column_name: str = 'text', overwrite_cache: bool = False, preprocessing_num_workers: Optional[int] = None, max_train_samples: Optional[int] = None, max_eval_samples: Optional[int] = None, do_clean: bool = True, words_to_remove: Optional[List[str]] = <factory>, chars_to_remove: Optional[List[str]] = <factory>, chars_to_explode: Optional[List[str]] = <factory>, do_lower_case: Optional[bool] = None, eval_metrics: List[str] = <factory>, max_duration_in_seconds: float = 20.0, min_duration_in_seconds: float = 0.0, preprocessing_only: bool = False, token: Optional[str] = None, use_auth_token: Optional[bool] = None, trust_remote_code: bool = False, unk_token: str = '[UNK]', pad_token: str = '[PAD]', word_delimiter_token: str = '|', phoneme_language: Optional[str] = None)
-
Arguments pertaining to what data we are going to input our model for training and eval.
Using HfArgumentParser we can turn this class into argparse arguments to be able to specify them on the command line.
@dataclass
class DataArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class into argparse arguments to be able to
    specify them on the command line.
    """

    dataset_name_or_path: str = field(
        metadata={
            "help": "If a path, the path to a directory containing the dataset files. "
            "Otherwise- the name of the dataset to use (via the datasets library)."
        }
    )
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the dataset to use (via the datasets library)."
        },
    )
    stream_dataset: bool = field(
        default=False,
        metadata={
            "help": "Whether to stream the dataset as opposed to downloading it all at once."
        },
    )
    train_split_name: str = field(
        default="train+validation",
        metadata={
            "help": (
                "The name of the training data set split to use (via the datasets library). Defaults to "
                "'train+validation'"
            )
        },
    )
    eval_split_name: str = field(
        default="test",
        metadata={
            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
        },
    )
    audio_column_name: str = field(
        default="audio",
        metadata={
            "help": "The name of the dataset column containing the audio data. Defaults to 'audio'"
        },
    )
    text_column_name: str = field(
        default="text",
        metadata={
            "help": "The name of the dataset column containing the text data. Defaults to 'text'"
        },
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached preprocessed datasets or not."},
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of validation examples to this "
                "value if set."
            )
        },
    )
    do_clean: bool = field(
        default=True,
        metadata={"help": "True if the dataset should be cleaned before use."},
    )
    words_to_remove: Optional[List[str]] = list_field(
        default=[],
        metadata={
            "help": "A list of words to remove from the transcripts during dataset cleaning."
        },
    )
    chars_to_remove: Optional[List[str]] = list_field(
        default=[],
        metadata={
            "help": "A list of characters to remove from the transcripts during dataset cleaning."
        },
    )
    chars_to_explode: Optional[List[str]] = list_field(
        default=[],
        metadata={
            "help": "A list of characters to replace with spaces in the transcripts during dataset cleaning."
        },
    )
    do_lower_case: Optional[bool] = field(
        default=None,
        metadata={"help": "Whether the target text should be lower cased."},
    )
    eval_metrics: List[str] = list_field(  # type: ignore
        default=["wer", "cer"],
        metadata={
            "help": "A list of metrics the model should be evaluated on. E.g. `('wer', 'cer')`"
        },
    )
    max_duration_in_seconds: float = field(
        default=20.0,
        metadata={
            "help": (
                "Filter audio files that are longer than `max_duration_in_seconds` seconds to"
                " 'max_duration_in_seconds`"
            )
        },
    )
    min_duration_in_seconds: float = field(
        default=0.0,
        metadata={
            "help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"
        },
    )
    preprocessing_only: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to only do data preprocessing and skip training. This is especially useful when data"
                " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
                " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
                " can consequently be loaded in distributed training"
            )
        },
    )
    token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    use_auth_token: Optional[bool] = field(
        default=None,
        metadata={
            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will"
                "execute code present on the Hub on your local machine."
            )
        },
    )
    unk_token: str = field(
        default="[UNK]",
        metadata={"help": "The unk token for the tokenizer"},
    )
    pad_token: str = field(
        default="[PAD]",
        metadata={"help": "The padding token for the tokenizer"},
    )
    word_delimiter_token: str = field(
        default="|",
        metadata={"help": "The word delimiter token for the tokenizer"},
    )
    phoneme_language: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The target language that should be used be"
                " passed to the tokenizer for tokenization. Note that"
                " this is only relevant if the model classifies the"
                " input audio to a sequence of phoneme sequences."
            )
        },
    )

    def to_dict(self) -> Dict[str, Any]:
        result = dict(self.__dict__)
        return result
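For example, a direct construction sketch; the dataset path and the overridden values are placeholders, and every other field keeps the defaults shown above:

from elpis.models import DataArguments

data_args = DataArguments(
    dataset_name_or_path="datasets/my_language",  # placeholder local dataset directory
    text_column_name="transcription",
    max_duration_in_seconds=15.0,
)
print(data_args.to_dict()["text_column_name"])  # "transcription"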
Class variables
var audio_column_name : str
var chars_to_explode : Optional[List[str]]
var chars_to_remove : Optional[List[str]]
var dataset_config_name : Optional[str]
var dataset_name_or_path : str
var do_clean : bool
var do_lower_case : Optional[bool]
var eval_metrics : List[str]
var eval_split_name : str
var max_duration_in_seconds : float
var max_eval_samples : Optional[int]
var max_train_samples : Optional[int]
var min_duration_in_seconds : float
var overwrite_cache : bool
var pad_token : str
var phoneme_language : Optional[str]
var preprocessing_num_workers : Optional[int]
var preprocessing_only : bool
var stream_dataset : bool
var text_column_name : str
var token : Optional[str]
var train_split_name : str
var trust_remote_code : bool
var unk_token : str
var use_auth_token : Optional[bool]
var word_delimiter_token : str
var words_to_remove : Optional[List[str]]
Methods
def to_dict(self) ‑> Dict[str, Any]
-
def to_dict(self) -> Dict[str, Any]:
    result = dict(self.__dict__)
    return result
class ElanOptions (selection_mechanism: ElanTierSelector, selection_value: str)
-
A class representing options for how to extract utterance information from an elan file.
@dataclass
class ElanOptions:
    """A class representing options for how to extract utterance information
    from an elan file."""

    selection_mechanism: ElanTierSelector
    selection_value: str

    @classmethod
    def from_dict(cls, data: Dict[str, str]) -> "ElanOptions":
        return cls(
            selection_mechanism=ElanTierSelector(data["selection_mechanism"]),
            selection_value=data["selection_value"],
        )

    def to_dict(self) -> Dict[str, str]:
        result = dict(self.__dict__)
        result["selection_mechanism"] = self.selection_mechanism.value
        return result
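For example, selecting utterances from a tier by its name (the tier name "Phrase" is only an illustration):

from elpis.models import ElanOptions, ElanTierSelector

options = ElanOptions(
    selection_mechanism=ElanTierSelector.NAME,
    selection_value="Phrase",
)
print(options.to_dict())
# {'selection_mechanism': 'tier_name', 'selection_value': 'Phrase'}
assert ElanOptions.from_dict(options.to_dict()) == options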
Class variables
var selection_mechanism : ElanTierSelector
var selection_value : str
Static methods
def from_dict(data: Dict[str, str]) ‑> ElanOptions
-
@classmethod
def from_dict(cls, data: Dict[str, str]) -> "ElanOptions":
    return cls(
        selection_mechanism=ElanTierSelector(data["selection_mechanism"]),
        selection_value=data["selection_value"],
    )
Methods
def to_dict(self) ‑> Dict[str, str]
-
def to_dict(self) -> Dict[str, str]:
    result = dict(self.__dict__)
    result["selection_mechanism"] = self.selection_mechanism.value
    return result
class ElanTierSelector (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
A class representing a method of selecting elan tiers
class ElanTierSelector(Enum):
    """A class representing a method of selecting elan tiers"""

    ORDER = "tier_order"
    TYPE = "tier_type"
    NAME = "tier_name"
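The members wrap the string values used in serialized ElanOptions, so they can be looked up by value:

from elpis.models import ElanTierSelector

assert ElanTierSelector("tier_name") is ElanTierSelector.NAME
assert ElanTierSelector.TYPE.value == "tier_type"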
Ancestors
- enum.Enum
Class variables
var NAME
var ORDER
var TYPE
class Job (model_args: ModelArguments, data_args: DataArguments, training_args: TrainingArguments)
-
Generic class which encapsulates elpis training functionality
@dataclass
class Job:
    """Generic class which encapsulates elpis training functionality"""

    model_args: ModelArguments
    data_args: DataArguments
    training_args: TrainingArguments

    @staticmethod
    def parser():
        return HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))  # type: ignore

    @classmethod
    def from_args(cls, args=None) -> Job:
        (
            model_args,
            data_args,
            training_args,
        ) = Job.parser().parse_args_into_dataclasses(args)
        return cls(
            model_args=model_args, data_args=data_args, training_args=training_args
        )

    @classmethod
    def from_json(cls, file: Path) -> Job:
        (
            model_args,
            data_args,
            training_args,
        ) = Job.parser().parse_json_file(str(file))
        return cls(
            model_args=model_args, data_args=data_args, training_args=training_args
        )

    def save(self, path: Path, overwrite=True) -> None:
        if not overwrite and path.is_file():
            return
        with open(path, "w") as out_file:
            json.dump(self.to_dict(), out_file)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Job:
        (
            model_args,
            data_args,
            training_args,
        ) = Job.parser().parse_dict(data)
        return cls(
            model_args=model_args, data_args=data_args, training_args=training_args
        )

    def to_dict(self) -> Dict[str, Any]:
        return (
            self.training_args.to_dict()
            | self.data_args.to_dict()
            | self.model_args.to_dict()
        )

    def __eq__(self, __value: object) -> bool:
        if not isinstance(__value, Job):
            return False

        job = __value
        return (
            self.training_args.to_dict() == job.training_args.to_dict()
            and self.model_args == job.model_args
            and self.data_args == job.data_args
        )
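A construction sketch via from_dict. The model name, dataset path and output directory are placeholders, and output_dir is assumed to be the only TrainingArguments field the installed transformers version requires:

from elpis.models import Job

# Keys are routed to ModelArguments, DataArguments and
# transformers.TrainingArguments by the underlying HfArgumentParser.
job = Job.from_dict(
    {
        "model_name_or_path": "facebook/wav2vec2-base",  # placeholder model id
        "dataset_name_or_path": "datasets/my_language",  # placeholder dataset path
        "output_dir": "output",                          # consumed by TrainingArguments
    }
)
print(job.model_args.model_name_or_path)  # "facebook/wav2vec2-base"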
Class variables
var data_args : DataArguments
var model_args : ModelArguments
var training_args : transformers.training_args.TrainingArguments
Static methods
def from_args(args=None) ‑> Job
-
Builds a Job by parsing command-line arguments (or the supplied argument list) into the three argument dataclasses.

@classmethod
def from_args(cls, args=None) -> Job:
    (
        model_args,
        data_args,
        training_args,
    ) = Job.parser().parse_args_into_dataclasses(args)
    return cls(
        model_args=model_args, data_args=data_args, training_args=training_args
    )
def from_dict(data: Dict[str, Any]) ‑> Job
-
Builds a Job from a dictionary of argument values.

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> Job:
    (
        model_args,
        data_args,
        training_args,
    ) = Job.parser().parse_dict(data)
    return cls(
        model_args=model_args, data_args=data_args, training_args=training_args
    )
def from_json(file: Path) ‑> Job
-
Builds a Job from a JSON file of argument values.

@classmethod
def from_json(cls, file: Path) -> Job:
    (
        model_args,
        data_args,
        training_args,
    ) = Job.parser().parse_json_file(str(file))
    return cls(
        model_args=model_args, data_args=data_args, training_args=training_args
    )
def parser()
-
Returns an HfArgumentParser over ModelArguments, DataArguments and TrainingArguments.

@staticmethod
def parser():
    return HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))  # type: ignore
Methods
def save(self, path: Path, overwrite=True) ‑> None
-
Serializes the job to the given path as JSON. If overwrite is False and the file already exists, nothing is written.

def save(self, path: Path, overwrite=True) -> None:
    if not overwrite and path.is_file():
        return
    with open(path, "w") as out_file:
        json.dump(self.to_dict(), out_file)
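A save sketch, assuming a job built as in the earlier example; a saved file can later be reloaded with Job.from_json, provided every serialized key is accepted back by the parser:

from pathlib import Path

from elpis.models import Job

job = Job.from_dict(
    {
        "model_name_or_path": "facebook/wav2vec2-base",  # placeholder
        "dataset_name_or_path": "datasets/my_language",  # placeholder
        "output_dir": "output",
    }
)
path = Path("training_job.json")
job.save(path)                   # writes the merged to_dict() output as JSON
job.save(path, overwrite=False)  # returns early: the file already exists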
def to_dict(self) ‑> Dict[str, Any]
-
Merges the training, data and model arguments into a single serializable dictionary.

def to_dict(self) -> Dict[str, Any]:
    return (
        self.training_args.to_dict()
        | self.data_args.to_dict()
        | self.model_args.to_dict()
    )
class ModelArguments (model_name_or_path: str, tokenizer_name_or_path: Optional[str] = None, cache_dir: Optional[str] = None, freeze_feature_encoder: bool = True, attention_dropout: float = 0.0, activation_dropout: float = 0.0, feat_proj_dropout: float = 0.0, hidden_dropout: float = 0.0, final_dropout: float = 0.0, mask_time_prob: float = 0.05, mask_time_length: int = 10, mask_feature_prob: float = 0.0, mask_feature_length: int = 10, layerdrop: float = 0.0, ctc_loss_reduction: Optional[str] = 'mean', ctc_zero_infinity: bool = False)
-
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models"
        }
    )
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"
        },
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": "Where do you want to store the pretrained models downloaded from huggingface.co"
        },
    )
    freeze_feature_encoder: bool = field(
        default=True,
        metadata={"help": "Whether to freeze the feature encoder layers of the model."},
    )
    attention_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout ratio for the attention probabilities."},
    )
    activation_dropout: float = field(
        default=0.0,
        metadata={
            "help": "The dropout ratio for activations inside the fully connected layer."
        },
    )
    feat_proj_dropout: float = field(
        default=0.0, metadata={"help": "The dropout ratio for the projected features."}
    )
    hidden_dropout: float = field(
        default=0.0,
        metadata={
            "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
        },
    )
    final_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout probability for the final projection layer."},
    )
    mask_time_prob: float = field(
        default=0.05,
        metadata={
            "help": (
                "Probability of each feature vector along the time axis to be chosen as the start of the vector"
                "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
                "vectors will be masked along the time axis."
            )
        },
    )
    mask_time_length: int = field(
        default=10,
        metadata={"help": "Length of vector span to mask along the time axis."},
    )
    mask_feature_prob: float = field(
        default=0.0,
        metadata={
            "help": (
                "Probability of each feature vector along the feature axis to be chosen as the start of the vectorspan"
                " to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature"
                " bins will be masked along the time axis."
            )
        },
    )
    mask_feature_length: int = field(
        default=10,
        metadata={"help": "Length of vector span to mask along the feature axis."},
    )
    layerdrop: float = field(
        default=0.0, metadata={"help": "The LayerDrop probability."}
    )
    ctc_loss_reduction: Optional[str] = field(
        default="mean",
        metadata={
            "help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."
        },
    )
    ctc_zero_infinity: bool = field(
        default=False,
        metadata={
            "help": "Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. "
            "Infinite losses mainly occur when the inputs are too short to be aligned to the targets. "
            "Only relevant when training an instance of Wav2Vec2ForCTC."
        },
    )

    def to_dict(self) -> Dict[str, Any]:
        result = dict(self.__dict__)
        return result
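A construction sketch (the checkpoint name is a placeholder; unset fields keep the defaults shown above):

from elpis.models import ModelArguments

model_args = ModelArguments(
    model_name_or_path="facebook/wav2vec2-base",  # placeholder checkpoint
    attention_dropout=0.1,
    ctc_loss_reduction="mean",
)
print(model_args.to_dict()["freeze_feature_encoder"])  # True (the default)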
Class variables
var activation_dropout : float
var attention_dropout : float
var cache_dir : Optional[str]
var ctc_loss_reduction : Optional[str]
var ctc_zero_infinity : bool
var feat_proj_dropout : float
var final_dropout : float
var freeze_feature_encoder : bool
var layerdrop : float
var mask_feature_length : int
var mask_feature_prob : float
var mask_time_length : int
var mask_time_prob : float
var model_name_or_path : str
var tokenizer_name_or_path : Optional[str]
Methods
def to_dict(self) ‑> Dict[str, Any]
-
def to_dict(self) -> Dict[str, Any]:
    result = dict(self.__dict__)
    return result
class Vocab (vocab: Dict[str, int])
-
A class which represents a dictionary of encountered tokens in a dataset.
@dataclass
class Vocab:
    """A class which represents a dictionary of encountered tokens in a dataset."""

    vocab: Dict[str, int]

    @property
    def symbols(self) -> Set[str]:
        return set(self.vocab.keys())

    def merge(self, other: "Vocab") -> "Vocab":
        """Creates a new Vocab which includes all symbols in the merged two."""
        vocab = self.symbols | other.symbols
        return Vocab.from_set(vocab)

    def save(self, path: Path) -> None:
        """Saves the vocab to the supplied path.

        If the path is a folder, saves as vocab.json, within it.
        """
        if path.is_dir():
            path /= VOCAB_FILE

        with open(path, "w") as out:
            json.dump(self.vocab, out)

    def add(self, char: str) -> None:
        """Adds a new character into the vocab."""
        if char in self.vocab:
            return

        self.vocab[char] = len(self.vocab)

    def replace(self, original: str, replacement: str) -> None:
        """Replaces the supplied character mapping in the vocab."""
        if original not in self.vocab or original == replacement:
            return

        self.vocab[replacement] = self.vocab[original]
        self.vocab.pop(original)

    @classmethod
    def from_set(cls, symbols: Set[str]) -> "Vocab":
        """Builds a vocab from a set of symbols."""
        vocab = {symbol: index for index, symbol in enumerate(sorted(symbols))}
        return cls(vocab=vocab)

    @classmethod
    def from_strings(cls, texts: Iterable[str]) -> "Vocab":
        """Builds a vocab from an iterable text collection."""

        def reducer(result: Set[str], text: str) -> Set[str]:
            return result | set(text)

        symbols = reduce(reducer, texts, set())
        return cls.from_set(symbols)
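A minimal sketch of building and extending a vocab:

from elpis.models import Vocab

vocab = Vocab.from_set({"a", "b", "c"})
print(vocab.vocab)       # {'a': 0, 'b': 1, 'c': 2} - sorted symbols, sequential indices
print(vocab.symbols)     # {'a', 'b', 'c'}

vocab.add("d")           # appended with the next free index
vocab.add("a")           # already present: no change
print(vocab.vocab["d"])  # 3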
Class variables
var vocab : Dict[str, int]
Static methods
def from_set(symbols: Set[str]) ‑> Vocab
-
Builds a vocab from a set of symbols.
@classmethod
def from_set(cls, symbols: Set[str]) -> "Vocab":
    """Builds a vocab from a set of symbols."""
    vocab = {symbol: index for index, symbol in enumerate(sorted(symbols))}
    return cls(vocab=vocab)
def from_strings(texts: Iterable[str]) ‑> Vocab
-
Builds a vocab from an iterable text collection.
@classmethod
def from_strings(cls, texts: Iterable[str]) -> "Vocab":
    """Builds a vocab from an iterable text collection."""

    def reducer(result: Set[str], text: str) -> Set[str]:
        return result | set(text)

    symbols = reduce(reducer, texts, set())
    return cls.from_set(symbols)
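For example, building a character vocab directly from transcripts:

from elpis.models import Vocab

vocab = Vocab.from_strings(["aba", "cab"])
print(vocab.vocab)  # {'a': 0, 'b': 1, 'c': 2} - the unique characters, sorted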
Instance variables
var symbols : Set[str]
-
The set of symbols currently in the vocab.

@property
def symbols(self) -> Set[str]:
    return set(self.vocab.keys())
Methods
def add(self, char: str) ‑> None
-
Adds a new character into the vocab.
def add(self, char: str) -> None:
    """Adds a new character into the vocab."""
    if char in self.vocab:
        return

    self.vocab[char] = len(self.vocab)
def merge(self, other: Vocab) ‑> Vocab
-
Creates a new Vocab which includes all symbols in the merged two.
def merge(self, other: "Vocab") -> "Vocab":
    """Creates a new Vocab which includes all symbols in the merged two."""
    vocab = self.symbols | other.symbols
    return Vocab.from_set(vocab)
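For example (note that indices are reassigned from the sorted union of symbols):

from elpis.models import Vocab

merged = Vocab.from_set({"a", "b"}).merge(Vocab.from_set({"b", "c"}))
print(merged.vocab)  # {'a': 0, 'b': 1, 'c': 2}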
def replace(self, original: str, replacement: str) ‑> None
-
Replaces the supplied character mapping in the vocab.
def replace(self, original: str, replacement: str) -> None:
    """Replaces the supplied character mapping in the vocab."""
    if original not in self.vocab or original == replacement:
        return

    self.vocab[replacement] = self.vocab[original]
    self.vocab.pop(original)
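For example, remapping the space character to a word delimiter token while keeping its index:

from elpis.models import Vocab

vocab = Vocab.from_set({" ", "a", "b"})
vocab.replace(" ", "|")  # "|" takes over the index previously held by " "
print(vocab.vocab)       # {'a': 1, 'b': 2, '|': 0}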
def save(self, path: pathlib.Path) ‑> None
-
Saves the vocab to the supplied path.
If the path is a folder, saves as vocab.json, within it.
def save(self, path: Path) -> None:
    """Saves the vocab to the supplied path.

    If the path is a folder, saves as vocab.json, within it.
    """
    if path.is_dir():
        path /= VOCAB_FILE

    with open(path, "w") as out:
        json.dump(self.vocab, out)
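For example, saving into a directory (a temporary one here), where the file is written under the VOCAB_FILE name (vocab.json, as documented above):

import tempfile
from pathlib import Path

from elpis.models import VOCAB_FILE, Vocab

vocab = Vocab.from_strings(["hello world"])
with tempfile.TemporaryDirectory() as folder:
    vocab.save(Path(folder))                     # directory: saved as vocab.json inside it
    print((Path(folder) / VOCAB_FILE).exists())  # True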