Module elpis.models.vocab
Expand source code
import json
from dataclasses import dataclass
from functools import reduce
from pathlib import Path
from typing import Any, Dict, Iterable, Optional, Set
from datasets import DatasetDict
# File name used by Vocab.save when the supplied path is a directory.
VOCAB_FILE = "vocab.json"
@dataclass
class Vocab:
    """A dictionary of the tokens encountered in a dataset.

    Attributes:
        vocab: Maps each symbol to its integer index.
    """

    vocab: Dict[str, int]

    @property
    def symbols(self) -> Set[str]:
        """The set of symbols currently present in the vocab."""
        return set(self.vocab)

    def merge(self, other: "Vocab") -> "Vocab":
        """Creates a new Vocab containing every symbol from both vocabs.

        Indices are reassigned from scratch by ``from_set``, so they are not
        guaranteed to match the indices of either input. Neither input is
        modified.
        """
        return Vocab.from_set(self.symbols | other.symbols)

    def save(self, path: Path) -> None:
        """Saves the vocab as JSON to the supplied path.

        If the path is an existing folder, the vocab is written to a file
        named ``VOCAB_FILE`` inside it.
        """
        target = path / VOCAB_FILE if path.is_dir() else path
        # Explicit encoding so the output does not depend on the locale.
        with open(target, "w", encoding="utf-8") as out:
            json.dump(self.vocab, out)

    def add(self, char: str) -> None:
        """Adds a new character into the vocab; a no-op if already present.

        The new index is ``len(self.vocab)`` unless that index is already
        taken (possible when the indices are not contiguous), in which case
        one past the current maximum index is used so indices stay unique.
        """
        if char in self.vocab:
            return

        index = len(self.vocab)
        taken = set(self.vocab.values())
        if index in taken:
            index = max(taken) + 1
        self.vocab[char] = index

    def replace(self, original: str, replacement: str) -> None:
        """Renames ``original`` to ``replacement``, keeping its index.

        Does nothing when ``original`` is absent or equal to ``replacement``.
        If ``replacement`` is already in the vocab, its previous index is
        overwritten with the index of ``original``.
        """
        if original not in self.vocab or original == replacement:
            return
        self.vocab[replacement] = self.vocab.pop(original)

    @classmethod
    def from_set(cls, symbols: Set[str]) -> "Vocab":
        """Builds a vocab from a set of symbols.

        Symbols are numbered from 0 in sorted order.
        """
        vocab = {symbol: index for index, symbol in enumerate(sorted(symbols))}
        return cls(vocab=vocab)

    @classmethod
    def from_strings(cls, texts: Iterable[str]) -> "Vocab":
        """Builds a vocab from an iterable collection of texts.

        Every distinct character found in any of the texts becomes a symbol.
        """
        symbols: Set[str] = set()
        # Update in place rather than rebuilding the set for every text.
        for text in texts:
            symbols.update(text)
        return cls.from_set(symbols)
Classes
class Vocab (vocab: Dict[str, int])
-
A class which represents a dictionary of encountered tokens in a dataset.
Expand source code
@dataclass
class Vocab:
    """A dictionary of the tokens encountered in a dataset.

    Attributes:
        vocab: Maps each symbol to its integer index.
    """

    vocab: Dict[str, int]

    @property
    def symbols(self) -> Set[str]:
        """The set of symbols currently present in the vocab."""
        return set(self.vocab)

    def merge(self, other: "Vocab") -> "Vocab":
        """Creates a new Vocab containing every symbol from both vocabs.

        Indices are reassigned from scratch by ``from_set``, so they are not
        guaranteed to match the indices of either input. Neither input is
        modified.
        """
        return Vocab.from_set(self.symbols | other.symbols)

    def save(self, path: Path) -> None:
        """Saves the vocab as JSON to the supplied path.

        If the path is an existing folder, the vocab is written to a file
        named ``VOCAB_FILE`` inside it.
        """
        target = path / VOCAB_FILE if path.is_dir() else path
        # Explicit encoding so the output does not depend on the locale.
        with open(target, "w", encoding="utf-8") as out:
            json.dump(self.vocab, out)

    def add(self, char: str) -> None:
        """Adds a new character into the vocab; a no-op if already present.

        The new index is ``len(self.vocab)`` unless that index is already
        taken (possible when the indices are not contiguous), in which case
        one past the current maximum index is used so indices stay unique.
        """
        if char in self.vocab:
            return

        index = len(self.vocab)
        taken = set(self.vocab.values())
        if index in taken:
            index = max(taken) + 1
        self.vocab[char] = index

    def replace(self, original: str, replacement: str) -> None:
        """Renames ``original`` to ``replacement``, keeping its index.

        Does nothing when ``original`` is absent or equal to ``replacement``.
        If ``replacement`` is already in the vocab, its previous index is
        overwritten with the index of ``original``.
        """
        if original not in self.vocab or original == replacement:
            return
        self.vocab[replacement] = self.vocab.pop(original)

    @classmethod
    def from_set(cls, symbols: Set[str]) -> "Vocab":
        """Builds a vocab from a set of symbols.

        Symbols are numbered from 0 in sorted order.
        """
        vocab = {symbol: index for index, symbol in enumerate(sorted(symbols))}
        return cls(vocab=vocab)

    @classmethod
    def from_strings(cls, texts: Iterable[str]) -> "Vocab":
        """Builds a vocab from an iterable collection of texts.

        Every distinct character found in any of the texts becomes a symbol.
        """
        symbols: Set[str] = set()
        # Update in place rather than rebuilding the set for every text.
        for text in texts:
            symbols.update(text)
        return cls.from_set(symbols)
Class variables
var vocab : Dict[str, int]
Static methods
def from_set(symbols: Set[str]) ‑> Vocab
-
Builds a vocab from a set of symbols.
Expand source code
@classmethod def from_set(cls, symbols: Set[str]) -> "Vocab": """Builds a vocab from a set of symbols.""" vocab = {symbol: index for index, symbol in enumerate(sorted(symbols))} return cls(vocab=vocab)
def from_strings(texts: Iterable[str]) ‑> Vocab
-
Builds a vocab from an iterable collection of texts.
Expand source code
@classmethod def from_strings(cls, texts: Iterable[str]) -> "Vocab": """Builds an vocab from a iterable text collection.""" def reducer(result: Set[str], text: str) -> Set[str]: return result | set(text) symbols = reduce(reducer, texts, set()) return cls.from_set(symbols)
Instance variables
var symbols : Set[str]
-
Expand source code
@property def symbols(self) -> Set[str]: return set(self.vocab.keys())
Methods
def add(self, char: str) ‑> None
-
Adds a new character into the vocab.
Expand source code
def add(self, char: str) -> None: """Adds a new character into the vocab.""" if char in self.vocab: return self.vocab[char] = len(self.vocab)
def merge(self, other: Vocab) ‑> Vocab
-
Creates a new Vocab containing every symbol from both vocabs.
Expand source code
def merge(self, other: "Vocab") -> "Vocab": """Creates a new Vocab which includes all symbols in the merged two.""" vocab = self.symbols | other.symbols return Vocab.from_set(vocab)
def replace(self, original: str, replacement: str) ‑> None
-
Replaces the supplied character mapping in the vocab.
Expand source code
def replace(self, original: str, replacement: str) -> None: """Replaces the supplied character mapping in the vocab.""" if original not in self.vocab or original == replacement: return self.vocab[replacement] = self.vocab[original] self.vocab.pop(original)
def save(self, path: pathlib.Path) ‑> None
-
Saves the vocab to the supplied path.
If the path is a folder, saves as vocab.json within it.
Expand source code
def save(self, path: Path) -> None: """Saves the vocab to the supplied path. If the path is a folder, saves as vocab.json, within it. """ if path.is_dir(): path /= VOCAB_FILE with open(path, "w") as out: json.dump(self.vocab, out)