Module `elpis.datasets.dataset`

Expand source code

from __future__ import annotations

from dataclasses import dataclass, field, fields
from functools import cached_property, reduce
from itertools import chain, groupby
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple

from elpis.models import ElanOptions

TRANSCRIPTION_EXTENSIONS = {".eaf", ".txt"}


@dataclass
class CleaningOptions:
    """Text-cleaning settings that accompany a dataset."""

    punctuation_to_remove: str = ""
    punctuation_to_explode: str = ""
    words_to_remove: List[str] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> CleaningOptions:
        """Builds an instance from a mapping keyed by field name.

        Every declared field is read from ``data``, so a missing key raises
        ``KeyError``. Keys that are not declared fields are ignored.
        """
        values = {}
        for spec in fields(CleaningOptions):
            values[spec.name] = data[spec.name]
        return cls(**values)

    def to_dict(self) -> Dict[str, Any]:
        """Returns a shallow copy of the instance attributes as a dict."""
        return self.__dict__.copy()


@dataclass
class ProcessingBatch:
    """A class encapsulating the data needed for an individual processing job."""

    audio_file: Path
    transcription_file: Path
    cleaning_options: CleaningOptions
    # May be None; in that case the key is left out of ``to_dict`` output.
    elan_options: Optional[ElanOptions]

    def to_dict(self) -> Dict[str, Any]:
        """Serializes the batch to a plain dict.

        Paths are stored as strings. ``"elan_options"`` is only present when
        ``elan_options`` is not None.
        """
        result: Dict[str, Any] = {
            "audio_file": str(self.audio_file),
            "transcription_file": str(self.transcription_file),
            "cleaning_options": self.cleaning_options.to_dict(),
        }
        if self.elan_options is not None:
            result["elan_options"] = self.elan_options.to_dict()

        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> ProcessingBatch:
        """Rebuilds a batch from the dict layout produced by ``to_dict``.

        Raises:
            KeyError: if ``audio_file``, ``transcription_file`` or
                ``cleaning_options`` is missing from ``data``.
        """
        audio_file = Path(data["audio_file"])
        transcription_file = Path(data["transcription_file"])
        cleaning_options = CleaningOptions.from_dict(data["cleaning_options"])

        # ``to_dict`` omits "elan_options" when it is None, so the key must be
        # optional here for a round trip to work (same as Dataset.from_dict).
        elan_options = None
        if "elan_options" in data:
            elan_options = ElanOptions.from_dict(data["elan_options"])

        return cls(
            audio_file=audio_file,
            transcription_file=transcription_file,
            cleaning_options=cleaning_options,
            elan_options=elan_options,
        )


@dataclass
class Dataset:
    """A class representing an unprocessed dataset."""

    name: str
    # Replaced by a sorted list in ``__post_init__``.
    files: List[Path]
    cleaning_options: CleaningOptions
    # May be None; in that case the key is left out of ``to_dict`` output.
    elan_options: Optional[ElanOptions]

    def __post_init__(self):
        self.files = sorted(self.files)

    @staticmethod
    def _group_by_stem(files: Iterable[Path]) -> Dict[str, List[Path]]:
        """Groups ``files`` by their stem, independent of input order.

        A dict is used instead of ``itertools.groupby`` because ``groupby``
        only merges *adjacent* items, and sorting by full path does not keep
        equal stems adjacent (e.g. ``a.txt``, ``a.u.wav``, ``a.wav``).
        """
        groups: Dict[str, List[Path]] = {}
        for file in files:
            groups.setdefault(file.stem, []).append(file)
        return groups

    def is_empty(self) -> bool:
        """Returns true iff the dataset contains no files."""
        return not self.files

    def has_elan(self) -> bool:
        """Returns true iff any of the files in the dataset is an elan file."""
        return any(file.suffix == ".eaf" for file in self.files)

    def is_valid(self) -> bool:
        """Returns true iff this dataset is valid for processing.

        That is: it is non-empty, holds an even number of files, and has
        neither mismatched files nor colliding transcripts.
        """
        return (
            not self.is_empty()
            and len(self.files) % 2 == 0
            and not self.mismatched_files
            and not self.colliding_files
        )

    @staticmethod
    def is_audio(file: Path) -> bool:
        """Returns true iff the file has the ``.wav`` suffix."""
        return file.suffix == ".wav"

    @staticmethod
    def is_transcript(file: Path) -> bool:
        """Returns true iff the file suffix is in TRANSCRIPTION_EXTENSIONS."""
        return file.suffix in TRANSCRIPTION_EXTENSIONS

    @staticmethod
    def corresponding_audio_name(transcript_file: Path) -> Path:
        """Gets the corresponding audio file name for a given transcript file."""
        return Path(transcript_file).parent / (transcript_file.stem + ".wav")

    @property
    def transcript_files(self) -> Iterable[Path]:
        """Returns an iterable of all transcription files within the dataset."""
        return filter(Dataset.is_transcript, self.files)

    @cached_property
    def mismatched_files(self) -> Set[Path]:
        """Returns the set of files whose stem has a transcript but no audio
        file, or an audio file but no transcript.

        Corresponding in this case means that for every transcript file with
        name x.some_extension, there is a corresponding file x.wav in the
        dataset. Files are matched on stem only; directories are not compared.

        Returns:
            A set of the mismatched file paths.
        """
        mismatched: Set[Path] = set()
        for group in self._group_by_stem(self.files).values():
            has_audio = any(Dataset.is_audio(file) for file in group)
            has_transcript = any(Dataset.is_transcript(file) for file in group)
            # A stem with neither kind of file is not reported as mismatched.
            if has_audio != has_transcript:
                mismatched.update(group)
        return mismatched

    @cached_property
    def colliding_files(self) -> Set[Path]:
        """Returns the set of transcript files that collide.

        Collide means that two transcript files would be for the same .wav
        file, i.e. they share a stem.

        Returns:
            A set of the colliding transcript file paths.
        """
        colliding: Set[Path] = set()
        for group in self._group_by_stem(self.transcript_files).values():
            if len(group) >= 2:
                colliding.update(group)
        return colliding

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Dataset:
        """Builds a dataset from a dict with ``name``, ``files`` and
        ``cleaning_options`` keys, plus an optional ``elan_options`` key.
        """
        name = data["name"]
        files = [Path(file) for file in data["files"]]
        cleaning_options = CleaningOptions.from_dict(data["cleaning_options"])

        elan_options = None
        if "elan_options" in data:
            elan_options = ElanOptions.from_dict(data["elan_options"])

        return cls(
            name=name,
            files=files,
            cleaning_options=cleaning_options,
            elan_options=elan_options,
        )

    @property
    def valid_transcriptions(self) -> Iterable[Path]:
        """Returns the transcript files that are neither mismatched nor
        colliding.
        """
        # Build the exclusion set once rather than once per tested file.
        excluded = self.mismatched_files | self.colliding_files
        return (path for path in self.transcript_files if path not in excluded)

    def to_batches(self) -> Iterable[ProcessingBatch]:
        """Converts a valid dataset to an iterable of processing jobs, matching
        transcript and audio files.
        """
        return (
            ProcessingBatch(
                transcription_file=transcription_file,
                audio_file=self.corresponding_audio_name(transcription_file),
                cleaning_options=self.cleaning_options,
                elan_options=self.elan_options,
            )
            for transcription_file in self.valid_transcriptions
        )

    def to_dict(self) -> Dict[str, Any]:
        """Serializes the dataset to a plain dict.

        ``files`` holds file names only (``Path.name``), without directories.
        ``"elan_options"`` is only present when ``elan_options`` is not None.
        """
        result = {
            "name": self.name,
            "files": [file.name for file in self.files],
            "cleaning_options": self.cleaning_options.to_dict(),
        }

        if self.elan_options is not None:
            result["elan_options"] = self.elan_options.to_dict()

        return result

Classes

class CleaningOptions (punctuation_to_remove: str = '', punctuation_to_explode: str = '', words_to_remove: List[str] = <factory>)

A class representing cleaning options for a dataset.

Expand source code

@dataclass
class CleaningOptions:
    """Text-cleaning settings that accompany a dataset."""

    punctuation_to_remove: str = ""
    punctuation_to_explode: str = ""
    words_to_remove: List[str] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> CleaningOptions:
        """Builds an instance from a mapping keyed by field name.

        Every declared field is read from ``data``, so a missing key raises
        ``KeyError``. Keys that are not declared fields are ignored.
        """
        values = {}
        for spec in fields(CleaningOptions):
            values[spec.name] = data[spec.name]
        return cls(**values)

    def to_dict(self) -> Dict[str, Any]:
        """Returns a shallow copy of the instance attributes as a dict."""
        return self.__dict__.copy()

Class variables

var punctuation_to_explode : str
var punctuation_to_remove : str
var words_to_remove : List[str]

Static methods

def from_dict(data: Dict[str, Any]) ‑> CleaningOptions

Expand source code

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> CleaningOptions:
    """Builds an instance from a mapping keyed by field name.

    Every field declared on ``CleaningOptions`` is read from ``data``, so a
    missing key raises ``KeyError``. Other keys are ignored.
    """
    values = {}
    for spec in fields(CleaningOptions):
        values[spec.name] = data[spec.name]
    return cls(**values)

Methods

def to_dict(self) ‑> Dict[str, Any]

Expand source code

def to_dict(self) -> Dict[str, Any]:
    """Returns a shallow copy of the instance attributes as a dict."""
    return self.__dict__.copy()

class Dataset (name: str, files: List[Path], cleaning_options: CleaningOptions, elan_options: Optional[ElanOptions])

A class representing an unprocessed dataset.

Expand source code

@dataclass
class Dataset:
    """A class representing an unprocessed dataset."""

    name: str
    # Replaced by a sorted list in ``__post_init__``.
    files: List[Path]
    cleaning_options: CleaningOptions
    # May be None; in that case the key is left out of ``to_dict`` output.
    elan_options: Optional[ElanOptions]

    def __post_init__(self):
        self.files = sorted(self.files)

    @staticmethod
    def _group_by_stem(files: Iterable[Path]) -> Dict[str, List[Path]]:
        """Groups ``files`` by their stem, independent of input order.

        A dict is used instead of ``itertools.groupby`` because ``groupby``
        only merges *adjacent* items, and sorting by full path does not keep
        equal stems adjacent (e.g. ``a.txt``, ``a.u.wav``, ``a.wav``).
        """
        groups: Dict[str, List[Path]] = {}
        for file in files:
            groups.setdefault(file.stem, []).append(file)
        return groups

    def is_empty(self) -> bool:
        """Returns true iff the dataset contains no files."""
        return not self.files

    def has_elan(self) -> bool:
        """Returns true iff any of the files in the dataset is an elan file."""
        return any(file.suffix == ".eaf" for file in self.files)

    def is_valid(self) -> bool:
        """Returns true iff this dataset is valid for processing.

        That is: it is non-empty, holds an even number of files, and has
        neither mismatched files nor colliding transcripts.
        """
        return (
            not self.is_empty()
            and len(self.files) % 2 == 0
            and not self.mismatched_files
            and not self.colliding_files
        )

    @staticmethod
    def is_audio(file: Path) -> bool:
        """Returns true iff the file has the ``.wav`` suffix."""
        return file.suffix == ".wav"

    @staticmethod
    def is_transcript(file: Path) -> bool:
        """Returns true iff the file suffix is in TRANSCRIPTION_EXTENSIONS."""
        return file.suffix in TRANSCRIPTION_EXTENSIONS

    @staticmethod
    def corresponding_audio_name(transcript_file: Path) -> Path:
        """Gets the corresponding audio file name for a given transcript file."""
        return Path(transcript_file).parent / (transcript_file.stem + ".wav")

    @property
    def transcript_files(self) -> Iterable[Path]:
        """Returns an iterable of all transcription files within the dataset."""
        return filter(Dataset.is_transcript, self.files)

    @cached_property
    def mismatched_files(self) -> Set[Path]:
        """Returns the set of files whose stem has a transcript but no audio
        file, or an audio file but no transcript.

        Corresponding in this case means that for every transcript file with
        name x.some_extension, there is a corresponding file x.wav in the
        dataset. Files are matched on stem only; directories are not compared.

        Returns:
            A set of the mismatched file paths.
        """
        mismatched: Set[Path] = set()
        for group in self._group_by_stem(self.files).values():
            has_audio = any(Dataset.is_audio(file) for file in group)
            has_transcript = any(Dataset.is_transcript(file) for file in group)
            # A stem with neither kind of file is not reported as mismatched.
            if has_audio != has_transcript:
                mismatched.update(group)
        return mismatched

    @cached_property
    def colliding_files(self) -> Set[Path]:
        """Returns the set of transcript files that collide.

        Collide means that two transcript files would be for the same .wav
        file, i.e. they share a stem.

        Returns:
            A set of the colliding transcript file paths.
        """
        colliding: Set[Path] = set()
        for group in self._group_by_stem(self.transcript_files).values():
            if len(group) >= 2:
                colliding.update(group)
        return colliding

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Dataset:
        """Builds a dataset from a dict with ``name``, ``files`` and
        ``cleaning_options`` keys, plus an optional ``elan_options`` key.
        """
        name = data["name"]
        files = [Path(file) for file in data["files"]]
        cleaning_options = CleaningOptions.from_dict(data["cleaning_options"])

        elan_options = None
        if "elan_options" in data:
            elan_options = ElanOptions.from_dict(data["elan_options"])

        return cls(
            name=name,
            files=files,
            cleaning_options=cleaning_options,
            elan_options=elan_options,
        )

    @property
    def valid_transcriptions(self) -> Iterable[Path]:
        """Returns the transcript files that are neither mismatched nor
        colliding.
        """
        # Build the exclusion set once rather than once per tested file.
        excluded = self.mismatched_files | self.colliding_files
        return (path for path in self.transcript_files if path not in excluded)

    def to_batches(self) -> Iterable[ProcessingBatch]:
        """Converts a valid dataset to an iterable of processing jobs, matching
        transcript and audio files.
        """
        return (
            ProcessingBatch(
                transcription_file=transcription_file,
                audio_file=self.corresponding_audio_name(transcription_file),
                cleaning_options=self.cleaning_options,
                elan_options=self.elan_options,
            )
            for transcription_file in self.valid_transcriptions
        )

    def to_dict(self) -> Dict[str, Any]:
        """Serializes the dataset to a plain dict.

        ``files`` holds file names only (``Path.name``), without directories.
        ``"elan_options"`` is only present when ``elan_options`` is not None.
        """
        result = {
            "name": self.name,
            "files": [file.name for file in self.files],
            "cleaning_options": self.cleaning_options.to_dict(),
        }

        if self.elan_options is not None:
            result["elan_options"] = self.elan_options.to_dict()

        return result

Class variables

var cleaning_options : CleaningOptions
var elan_options : Optional[ElanOptions]
var files : List[pathlib.Path]
var name : str

Static methods

def corresponding_audio_name(transcript_file: Path) ‑> pathlib.Path

Gets the corresponding audio file name for a given transcript file.

Expand source code

@staticmethod
def corresponding_audio_name(transcript_file: Path) -> Path:
    """Returns the ``.wav`` path that sits next to the given transcript file.

    The result keeps the transcript's parent directory and stem and uses the
    ``.wav`` extension.
    """
    directory = Path(transcript_file).parent
    audio_name = f"{transcript_file.stem}.wav"
    return directory / audio_name

def from_dict(data: Dict[str, Any]) ‑> Dataset

Expand source code

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> Dataset:
    """Builds a dataset from a dict.

    Reads the ``name``, ``files`` and ``cleaning_options`` keys; the
    ``elan_options`` key is optional and yields ``None`` when absent.
    """
    return cls(
        name=data["name"],
        files=[Path(entry) for entry in data["files"]],
        cleaning_options=CleaningOptions.from_dict(data["cleaning_options"]),
        elan_options=(
            ElanOptions.from_dict(data["elan_options"])
            if "elan_options" in data
            else None
        ),
    )

def is_audio(file: Path) ‑> bool

Expand source code

@staticmethod
def is_audio(file: Path) -> bool:
    """Tells whether ``file`` has the ``.wav`` suffix (case-sensitive)."""
    extension = file.suffix
    return extension == ".wav"

def is_transcript(file: Path) ‑> bool

Expand source code

@staticmethod
def is_transcript(file: Path) -> bool:
    """Tells whether the suffix of ``file`` is in TRANSCRIPTION_EXTENSIONS."""
    extension = file.suffix
    return extension in TRANSCRIPTION_EXTENSIONS

Instance variables

var colliding_files

Returns the set of transcript files that collide.

Collide means that two transcript files would be for the same .wav file.

Returns

A set of the colliding transcript file paths.

Expand source code

def __get__(self, instance, owner=None):
    """Return the cached value for ``instance``, computing it on first access.

    NOTE(review): this appears to be the descriptor ``__get__`` of the
    standard library's ``functools.cached_property``, shown by the doc
    generator in place of the decorated property's own source.

    Returns:
        The descriptor itself when accessed without an instance; otherwise
        the value stored under ``self.attrname`` in ``instance.__dict__``,
        calling ``self.func(instance)`` to produce it if it is not there yet.

    Raises:
        TypeError: if ``self.attrname`` is None, if ``instance`` has no
            ``__dict__``, or if that ``__dict__`` rejects item assignment.
    """
    if instance is None:
        # No instance supplied: hand back the descriptor object.
        return self
    if self.attrname is None:
        raise TypeError(
            "Cannot use cached_property instance without calling __set_name__ on it.")
    try:
        cache = instance.__dict__
    except AttributeError:  # not all objects have __dict__ (e.g. class defines slots)
        msg = (
            f"No '__dict__' attribute on {type(instance).__name__!r} "
            f"instance to cache {self.attrname!r} property."
        )
        raise TypeError(msg) from None
    # _NOT_FOUND is a sentinel defined outside this block; it marks "no cached value".
    val = cache.get(self.attrname, _NOT_FOUND)
    if val is _NOT_FOUND:
        with self.lock:
            # check if another thread filled cache while we awaited lock
            val = cache.get(self.attrname, _NOT_FOUND)
            if val is _NOT_FOUND:
                # Still missing: compute the value and store it for later lookups.
                val = self.func(instance)
                try:
                    cache[self.attrname] = val
                except TypeError:
                    msg = (
                        f"The '__dict__' attribute on {type(instance).__name__!r} instance "
                        f"does not support item assignment for caching {self.attrname!r} property."
                    )
                    raise TypeError(msg) from None
    return val

var mismatched_files

Returns the set of files whose stem has a transcript but no corresponding audio file, or an audio file but no corresponding transcript.

Corresponding in this case means that for every transcript file with name x.some_extension, there is a corresponding file x.wav in the dataset.

Returns

A set of the mismatched file paths.

Expand source code

def __get__(self, instance, owner=None):
    """Return the cached value for ``instance``, computing it on first access.

    NOTE(review): this appears to be the descriptor ``__get__`` of the
    standard library's ``functools.cached_property``, shown by the doc
    generator in place of the decorated property's own source.

    Returns:
        The descriptor itself when accessed without an instance; otherwise
        the value stored under ``self.attrname`` in ``instance.__dict__``,
        calling ``self.func(instance)`` to produce it if it is not there yet.

    Raises:
        TypeError: if ``self.attrname`` is None, if ``instance`` has no
            ``__dict__``, or if that ``__dict__`` rejects item assignment.
    """
    if instance is None:
        # No instance supplied: hand back the descriptor object.
        return self
    if self.attrname is None:
        raise TypeError(
            "Cannot use cached_property instance without calling __set_name__ on it.")
    try:
        cache = instance.__dict__
    except AttributeError:  # not all objects have __dict__ (e.g. class defines slots)
        msg = (
            f"No '__dict__' attribute on {type(instance).__name__!r} "
            f"instance to cache {self.attrname!r} property."
        )
        raise TypeError(msg) from None
    # _NOT_FOUND is a sentinel defined outside this block; it marks "no cached value".
    val = cache.get(self.attrname, _NOT_FOUND)
    if val is _NOT_FOUND:
        with self.lock:
            # check if another thread filled cache while we awaited lock
            val = cache.get(self.attrname, _NOT_FOUND)
            if val is _NOT_FOUND:
                # Still missing: compute the value and store it for later lookups.
                val = self.func(instance)
                try:
                    cache[self.attrname] = val
                except TypeError:
                    msg = (
                        f"The '__dict__' attribute on {type(instance).__name__!r} instance "
                        f"does not support item assignment for caching {self.attrname!r} property."
                    )
                    raise TypeError(msg) from None
    return val

var transcript_files : Iterable[pathlib.Path]

Returns an iterable of all transcription files within the dataset.

Expand source code

@property
def transcript_files(self) -> Iterable[Path]:
    """Lazily selects the dataset files that ``Dataset.is_transcript`` accepts."""
    every_file = self.files
    keep = Dataset.is_transcript
    return filter(keep, every_file)

var valid_transcriptions

Expand source code

@property
def valid_transcriptions(self) -> Iterable[Path]:
    """Returns the transcript files that are neither mismatched nor colliding.

    Returns:
        A lazy iterable over ``self.transcript_files``, skipping every path
        found in ``self.mismatched_files`` or ``self.colliding_files``.
    """
    # Build the exclusion set once rather than once per tested file.
    excluded = self.mismatched_files | self.colliding_files
    return (path for path in self.transcript_files if path not in excluded)

Methods

def has_elan(self) ‑> bool

Returns true iff any of the files in the dataset is an elan file.

Expand source code

def has_elan(self) -> bool:
    """Reports whether at least one dataset file has the ``.eaf`` suffix."""
    for candidate in self.files:
        if candidate.suffix == ".eaf":
            return True
    return False

def is_empty(self) ‑> bool

Returns true iff the dataset contains no files.

Expand source code

def is_empty(self) -> bool:
    """Reports whether the dataset holds zero files."""
    file_count = len(self.files)
    return file_count == 0

def is_valid(self) ‑> bool

Returns true iff this dataset is valid for processing.

Expand source code

def is_valid(self) -> bool:
    """Reports whether the dataset can be processed.

    The checks run in order and stop at the first failure: the dataset must
    be non-empty, contain an even number of files, have no mismatched files
    and have no colliding files.
    """
    if self.is_empty():
        return False
    if len(self.files) % 2 != 0:
        return False
    if len(self.mismatched_files) != 0:
        return False
    return len(self.colliding_files) == 0

def to_batches(self) ‑> Iterable[ProcessingBatch]

Converts a valid dataset to an iterable of processing jobs, matching transcript and audio files.

Expand source code

def to_batches(self) -> Iterable[ProcessingBatch]:
    """Produces one processing job per valid transcription file.

    Each job pairs the transcript with the audio path derived by
    ``corresponding_audio_name`` and carries the dataset's cleaning and
    elan options.
    """
    transcripts = self.valid_transcriptions
    return (
        ProcessingBatch(
            audio_file=self.corresponding_audio_name(transcript),
            transcription_file=transcript,
            cleaning_options=self.cleaning_options,
            elan_options=self.elan_options,
        )
        for transcript in transcripts
    )

def to_dict(self) ‑> Dict[str, Any]

Expand source code

def to_dict(self) -> Dict[str, Any]:
    """Serializes the dataset to a plain dict.

    ``files`` holds file names only (``Path.name``). The ``elan_options``
    key is added only when ``self.elan_options`` is not None.
    """
    file_names = []
    for path in self.files:
        file_names.append(path.name)

    serialized = {}
    serialized["name"] = self.name
    serialized["files"] = file_names
    serialized["cleaning_options"] = self.cleaning_options.to_dict()

    elan = self.elan_options
    if elan is None:
        return serialized

    serialized["elan_options"] = elan.to_dict()
    return serialized

class ProcessingBatch (audio_file: Path, transcription_file: Path, cleaning_options: CleaningOptions, elan_options: Optional[ElanOptions])

A class encapsulating the data needed for an individual processing job

Expand source code

@dataclass
class ProcessingBatch:
    """A class encapsulating the data needed for an individual processing job."""

    audio_file: Path
    transcription_file: Path
    cleaning_options: CleaningOptions
    # May be None; in that case the key is left out of ``to_dict`` output.
    elan_options: Optional[ElanOptions]

    def to_dict(self) -> Dict[str, Any]:
        """Serializes the batch to a plain dict.

        Paths are stored as strings. ``"elan_options"`` is only present when
        ``elan_options`` is not None.
        """
        result: Dict[str, Any] = {
            "audio_file": str(self.audio_file),
            "transcription_file": str(self.transcription_file),
            "cleaning_options": self.cleaning_options.to_dict(),
        }
        if self.elan_options is not None:
            result["elan_options"] = self.elan_options.to_dict()

        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> ProcessingBatch:
        """Rebuilds a batch from the dict layout produced by ``to_dict``.

        Raises:
            KeyError: if ``audio_file``, ``transcription_file`` or
                ``cleaning_options`` is missing from ``data``.
        """
        audio_file = Path(data["audio_file"])
        transcription_file = Path(data["transcription_file"])
        cleaning_options = CleaningOptions.from_dict(data["cleaning_options"])

        # ``to_dict`` omits "elan_options" when it is None, so the key must be
        # optional here for a round trip to work (same as Dataset.from_dict).
        elan_options = None
        if "elan_options" in data:
            elan_options = ElanOptions.from_dict(data["elan_options"])

        return cls(
            audio_file=audio_file,
            transcription_file=transcription_file,
            cleaning_options=cleaning_options,
            elan_options=elan_options,
        )

Class variables

var audio_file : pathlib.Path
var cleaning_options : CleaningOptions
var elan_options : Optional[ElanOptions]
var transcription_file : pathlib.Path

Static methods

def from_dict(data: Dict[str, Any]) ‑> ProcessingBatch

Expand source code

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> ProcessingBatch:
    """Rebuilds a batch from the dict layout produced by ``to_dict``.

    Raises:
        KeyError: if ``audio_file``, ``transcription_file`` or
            ``cleaning_options`` is missing from ``data``.
    """
    audio_file = Path(data["audio_file"])
    transcription_file = Path(data["transcription_file"])
    cleaning_options = CleaningOptions.from_dict(data["cleaning_options"])

    # ``to_dict`` omits "elan_options" when it is None, so the key must be
    # optional here for a round trip to work (same as Dataset.from_dict).
    elan_options = None
    if "elan_options" in data:
        elan_options = ElanOptions.from_dict(data["elan_options"])

    return cls(
        audio_file=audio_file,
        transcription_file=transcription_file,
        cleaning_options=cleaning_options,
        elan_options=elan_options,
    )

Methods

def to_dict(self) ‑> Dict[str, Any]

Expand source code

def to_dict(self) -> Dict[str, Any]:
    """Serializes the batch to a plain dict.

    Both paths are converted with ``str``. The ``elan_options`` key is added
    only when ``self.elan_options`` is not None.
    """
    serialized = {
        "audio_file": str(self.audio_file),
        "transcription_file": str(self.transcription_file),
        "cleaning_options": self.cleaning_options.to_dict(),
    }

    elan = self.elan_options
    if elan is None:
        return serialized

    serialized["elan_options"] = elan.to_dict()
    return serialized