Module elpis.datasets.extract_annotations
Expand source code
from itertools import chain
from pathlib import Path
from typing import List, Optional, Tuple
from loguru import logger
from pympi.Elan import Eaf
from elpis.models import Annotation, ElanOptions, ElanTierSelector
def extract_annotations(
transcription_file: Path, elan_options: Optional[ElanOptions] = None
) -> List[Annotation]:
"""Extracts annotations from the supplied transcription file.
If the transcription file is an elan file, elan_options is required.
Parameters:
transcription_file: The file from which to extract annotations
elan_options: Options to include for determining how to extract annotations
from elan data.
Returns:
A list of found annotations.
Returns an empty list if there was a problem.
"""
if transcription_file.suffix == ".txt":
return extract_text_annotations(transcription_file)
if transcription_file.suffix != ".eaf":
logger.error(f"Unrecognised file format: {transcription_file}")
return []
if elan_options is None:
logger.error(f"Missing elan options for extraction job.")
return []
return extract_elan_annotations(
transcription_file,
selection_type=elan_options.selection_mechanism,
selection_data=elan_options.selection_value,
)
def extract_text_annotations(file: Path) -> List[Annotation]:
"""Extract transcription information from a text file.
Parameters:
file_name: The name of the downloaded file.
Returns:
A list of utterance information for the given file.
"""
with open(file) as transcription_file:
transcription = transcription_file.read()
return [
Annotation(
audio_file=file.parent / (file.stem + ".wav"),
transcript=transcription,
)
]
def extract_elan_annotations(
elan_file_path: Path, selection_type: ElanTierSelector, selection_data: str
) -> List[Annotation]:
"""Extracts annotations from a particular tier in an eaf file (ELAN
Annotation Format).
Tiers are nodes from the tree structure in the .eaf file.
The tier to read from is determined by tier order (eg top tier would be order 1),
tier type (eg default-lt) or tier name (eg Phrase).
Parameters:
elan_file_path: The path to the eaf file.
selection_type: The method of determining which tier data to extract.
selection_data: The data corresponding to the selection_type.
Returns:
A list of the annotations contained for the supplied data. Returns an
empty list if the given selection isn't found.
"""
match selection_type:
case ElanTierSelector.NAME:
return get_annotations_by_tier_name(elan_file_path, selection_data)
case ElanTierSelector.TYPE:
return get_annotations_by_tier_type(elan_file_path, selection_data)
case ElanTierSelector.ORDER:
try:
order = int(selection_data)
except:
order = 1
return get_annotations_by_tier_order(elan_file_path, order)
def get_annotations_by_tier_order(
elan_file_path: Path, tier_order: int
) -> List[Annotation]:
"""Retrieves all annotations for a given tier order within an eaf file.
Parameters:
elan_file_path: The path to the eaf file.
tier_order: The tier order to extract from (starts at 1)
Returns:
A list of the annotations contained for the supplied tier order.
Returns an empty list if the given tier order exceeds the nesting of
the file.
"""
elan = Eaf(elan_file_path)
tier_names: List[str] = list(elan.get_tier_names())
if tier_order > len(tier_names):
logger.error(
f"tier_order: {tier_order} exceeds tier length for {elan_file_path}"
)
return []
tier_name = tier_names[tier_order - 1]
return get_annotations_by_tier_name(
elan_file_path=elan_file_path, tier_name=tier_name
)
def get_annotations_by_tier_type(
elan_file_path: Path, tier_type: str
) -> List[Annotation]:
"""Retrieves all annotations for a given linguistic tier type in an eaf file.
Parameters:
elan_file_path: The path to the eaf file.
tier_type: The linguistic type from which to extract Annotation data.
Returns:
A list of the annotations contained for the supplied linguistic type.
Returns an empty list if the type is not found.
"""
elan = Eaf(elan_file_path)
if tier_type not in list(elan.get_linguistic_type_names()):
logger.error(f"tier_type: {tier_type} not found in file: {elan_file_path}")
return []
tier_names = elan.get_tier_ids_for_linguistic_type(tier_type)
annotations = (
get_annotations_by_tier_name(elan_file_path, name) for name in tier_names
)
# Flatten list of annotations
return list(chain(*annotations))
def get_annotations_by_tier_name(
elan_file_path: Path, tier_name: str
) -> List[Annotation]:
"""Retrieves all annotations for a given tier name in an eaf file.
Parameters:
elan_file_path: The path to the eaf file.
tier_name: The tier name from which to extract Annotation data.
Returns:
A list of the annotations contained for the supplied tier name.
Returns an empty list if the name is not found.
"""
elan = Eaf(elan_file_path)
if tier_name not in list(elan.get_tier_names()):
logger.error(f"tier_name: {tier_name} not found in file {elan_file_path}")
return []
def create_annotation(elan_annotation: Tuple[str, str, str]) -> Annotation:
start, end, transcript = elan_annotation
return Annotation(
audio_file=elan_file_path.parent / (elan_file_path.stem + ".wav"),
transcript=transcript,
start_ms=int(start),
stop_ms=int(end),
)
return list(map(create_annotation, elan.get_annotation_data_for_tier(tier_name)))
Functions
def extract_annotations(transcription_file: pathlib.Path, elan_options: Optional[ElanOptions] = None) ‑> List[Annotation]
-
Extracts annotations from the supplied transcription file.
If the transcription file is an elan file, elan_options is required.
Parameters
transcription_file: The file from which to extract annotations elan_options: Options to include for determining how to extract annotations from elan data.
Returns
A list of found annotations. Returns an empty list if there was a problem.
Expand source code
def extract_annotations( transcription_file: Path, elan_options: Optional[ElanOptions] = None ) -> List[Annotation]: """Extracts annotations from the supplied transcription file. If the transcription file is an elan file, elan_options is required. Parameters: transcription_file: The file from which to extract annotations elan_options: Options to include for determining how to extract annotations from elan data. Returns: A list of found annotations. Returns an empty list if there was a problem. """ if transcription_file.suffix == ".txt": return extract_text_annotations(transcription_file) if transcription_file.suffix != ".eaf": logger.error(f"Unrecognised file format: {transcription_file}") return [] if elan_options is None: logger.error(f"Missing elan options for extraction job.") return [] return extract_elan_annotations( transcription_file, selection_type=elan_options.selection_mechanism, selection_data=elan_options.selection_value, )
def extract_elan_annotations(elan_file_path: pathlib.Path, selection_type: ElanTierSelector, selection_data: str) ‑> List[Annotation]
-
Extracts annotations from a particular tier in an eaf file (ELAN Annotation Format).
Tiers are nodes from the tree structure in the .eaf file. The tier to read from is determined by tier order (eg top tier would be order 1), tier type (eg default-lt) or tier name (eg Phrase).
Parameters
elan_file_path: The path to the eaf file. selection_type: The method of determining which tier data to extract. selection_data: The data corresponding to the selection_type.
Returns
A list of the annotations contained for the supplied data. Returns an empty list if the given selection isn't found.
Expand source code
def extract_elan_annotations( elan_file_path: Path, selection_type: ElanTierSelector, selection_data: str ) -> List[Annotation]: """Extracts annotations from a particular tier in an eaf file (ELAN Annotation Format). Tiers are nodes from the tree structure in the .eaf file. The tier to read from is determined by tier order (eg top tier would be order 1), tier type (eg default-lt) or tier name (eg Phrase). Parameters: elan_file_path: The path to the eaf file. selection_type: The method of determining which tier data to extract. selection_data: The data corresponding to the selection_type. Returns: A list of the annotations contained for the supplied data. Returns an empty list if the given selection isn't found. """ match selection_type: case ElanTierSelector.NAME: return get_annotations_by_tier_name(elan_file_path, selection_data) case ElanTierSelector.TYPE: return get_annotations_by_tier_type(elan_file_path, selection_data) case ElanTierSelector.ORDER: try: order = int(selection_data) except: order = 1 return get_annotations_by_tier_order(elan_file_path, order)
def extract_text_annotations(file: pathlib.Path) ‑> List[Annotation]
-
Extract transcription information from a text file.
Parameters
file_name: The name of the downloaded file.
Returns
A list of utterance information for the given file.
Expand source code
def extract_text_annotations(file: Path) -> List[Annotation]: """Extract transcription information from a text file. Parameters: file_name: The name of the downloaded file. Returns: A list of utterance information for the given file. """ with open(file) as transcription_file: transcription = transcription_file.read() return [ Annotation( audio_file=file.parent / (file.stem + ".wav"), transcript=transcription, ) ]
def get_annotations_by_tier_name(elan_file_path: pathlib.Path, tier_name: str) ‑> List[Annotation]
-
Retrieves all annotations for a given tier name in an eaf file.
Parameters
elan_file_path: The path to the eaf file. tier_name: The tier name from which to extract Annotation data.
Returns
A list of the annotations contained for the supplied tier name. Returns an empty list if the name is not found.
Expand source code
def get_annotations_by_tier_name( elan_file_path: Path, tier_name: str ) -> List[Annotation]: """Retrieves all annotations for a given tier name in an eaf file. Parameters: elan_file_path: The path to the eaf file. tier_name: The tier name from which to extract Annotation data. Returns: A list of the annotations contained for the supplied tier name. Returns an empty list if the name is not found. """ elan = Eaf(elan_file_path) if tier_name not in list(elan.get_tier_names()): logger.error(f"tier_name: {tier_name} not found in file {elan_file_path}") return [] def create_annotation(elan_annotation: Tuple[str, str, str]) -> Annotation: start, end, transcript = elan_annotation return Annotation( audio_file=elan_file_path.parent / (elan_file_path.stem + ".wav"), transcript=transcript, start_ms=int(start), stop_ms=int(end), ) return list(map(create_annotation, elan.get_annotation_data_for_tier(tier_name)))
def get_annotations_by_tier_order(elan_file_path: pathlib.Path, tier_order: int) ‑> List[Annotation]
-
Retrieves all annotations for a given tier order within an eaf file.
Parameters
elan_file_path: The path to the eaf file. tier_order: The tier order to extract from (starts at 1)
Returns
A list of the annotations contained for the supplied tier order. Returns an empty list if the given tier order exceeds the nesting of the file.
Expand source code
def get_annotations_by_tier_order( elan_file_path: Path, tier_order: int ) -> List[Annotation]: """Retrieves all annotations for a given tier order within an eaf file. Parameters: elan_file_path: The path to the eaf file. tier_order: The tier order to extract from (starts at 1) Returns: A list of the annotations contained for the supplied tier order. Returns an empty list if the given tier order exceeds the nesting of the file. """ elan = Eaf(elan_file_path) tier_names: List[str] = list(elan.get_tier_names()) if tier_order > len(tier_names): logger.error( f"tier_order: {tier_order} exceeds tier length for {elan_file_path}" ) return [] tier_name = tier_names[tier_order - 1] return get_annotations_by_tier_name( elan_file_path=elan_file_path, tier_name=tier_name )
def get_annotations_by_tier_type(elan_file_path: pathlib.Path, tier_type: str) ‑> List[Annotation]
-
Retrieves all annotations for a given linguistic tier type in an eaf file.
Parameters
elan_file_path: The path to the eaf file. tier_type: The linguistic type from which to extract Annotation data.
Returns
A list of the annotations contained for the supplied linguistic type. Returns an empty list if the type is not found.
Expand source code
def get_annotations_by_tier_type( elan_file_path: Path, tier_type: str ) -> List[Annotation]: """Retrieves all annotations for a given linguistic tier type in an eaf file. Parameters: elan_file_path: The path to the eaf file. tier_type: The linguistic type from which to extract Annotation data. Returns: A list of the annotations contained for the supplied linguistic type. Returns an empty list if the type is not found. """ elan = Eaf(elan_file_path) if tier_type not in list(elan.get_linguistic_type_names()): logger.error(f"tier_type: {tier_type} not found in file: {elan_file_path}") return [] tier_names = elan.get_tier_ids_for_linguistic_type(tier_type) annotations = ( get_annotations_by_tier_name(elan_file_path, name) for name in tier_names ) # Flatten list of annotations return list(chain(*annotations))