Module elpis.datasets.clean_text

Expand source code
import re
from typing import List, Optional


def clean_text(
    text: str,
    words_to_remove: Optional[List[str]] = None,
    characters_to_explode: str = "",
    characters_to_remove: str = "",
    to_lower=True,
) -> str:
    """Cleans the text based on the supplied options.

    Parameters:
        text: The text to clean.
        words_to_remove: Words to remove from the text.
        characters_to_remove: A string of chars to remove from the text.
        characters_to_explode: A string of chars to replace with spaces in the text.
        to_lower: True iff the resulting text should be converted to lower case.
            Converts to uppercase if False.

    Returns:
        The cleaned text
    """
    words = text.split()

    if words_to_remove is not None:
        words = filter(lambda word: word not in words_to_remove, words)

    if characters_to_explode != "":
        words = map(lambda word: explode(word, characters_to_explode), words)

    if characters_to_remove != "":
        words = map(lambda word: collapse(word, characters_to_remove), words)

    result = " ".join(words).strip()
    result = remove_consecutive_spaces(result)
    return result.lower() if to_lower else result.upper()


def explode(text: str, pattern: str) -> str:
    """Replace occurences of the pattern with spaces within the given text.

    Parameters:
        text: The text to modify.
        pattern: The pattern of characters to replace with spaces.

    Returns:
        The text with instances of the pattern exploded.
    """
    pattern = re.escape(pattern)
    return re.sub(rf"[{pattern}]", " ", text)


def collapse(text: str, pattern: str) -> str:
    """Remove occurences of the pattern within the given text.

    Parameters:
        text: The text to modify.
        pattern: The pattern of characters to remove.

    Returns:
        The text with instances of the pattern removed.
    """
    pattern = re.escape(pattern)
    return re.sub(rf"[{pattern}]", "", text)


def remove_consecutive_spaces(text: str) -> str:
    """Replace consecutive spaces with a single one in some given text.

    Parameters:
        text: The text to modify.

    Returns
        The supplied text with conseucutive spaces reduced to one.
    """
    return re.sub("[ ]+", " ", text)

Functions

def clean_text(text: str, words_to_remove: Optional[List[str]] = None, characters_to_explode: str = '', characters_to_remove: str = '', to_lower=True) ‑> str

Cleans the text based on the supplied options.

Parameters

text: The text to clean. words_to_remove: Words to remove from the text. characters_to_remove: A string of chars to remove from the text. characters_to_explode: A string of chars to replace with spaces in the text. to_lower: True iff the resulting text should be converted to lower case. Converts to uppercase if False.

Returns

The cleaned text

Expand source code
def clean_text(
    text: str,
    words_to_remove: Optional[List[str]] = None,
    characters_to_explode: str = "",
    characters_to_remove: str = "",
    to_lower=True,
) -> str:
    """Cleans the text based on the supplied options.

    Parameters:
        text: The text to clean.
        words_to_remove: Words to remove from the text.
        characters_to_remove: A string of chars to remove from the text.
        characters_to_explode: A string of chars to replace with spaces in the text.
        to_lower: True iff the resulting text should be converted to lower case.
            Converts to uppercase if False.

    Returns:
        The cleaned text
    """
    words = text.split()

    if words_to_remove is not None:
        words = filter(lambda word: word not in words_to_remove, words)

    if characters_to_explode != "":
        words = map(lambda word: explode(word, characters_to_explode), words)

    if characters_to_remove != "":
        words = map(lambda word: collapse(word, characters_to_remove), words)

    result = " ".join(words).strip()
    result = remove_consecutive_spaces(result)
    return result.lower() if to_lower else result.upper()
def collapse(text: str, pattern: str) ‑> str

Remove occurences of the pattern within the given text.

Parameters

text: The text to modify. pattern: The pattern of characters to remove.

Returns

The text with instances of the pattern removed.

Expand source code
def collapse(text: str, pattern: str) -> str:
    """Remove occurences of the pattern within the given text.

    Parameters:
        text: The text to modify.
        pattern: The pattern of characters to remove.

    Returns:
        The text with instances of the pattern removed.
    """
    pattern = re.escape(pattern)
    return re.sub(rf"[{pattern}]", "", text)
def explode(text: str, pattern: str) ‑> str

Replace occurences of the pattern with spaces within the given text.

Parameters

text: The text to modify. pattern: The pattern of characters to replace with spaces.

Returns

The text with instances of the pattern exploded.

Expand source code
def explode(text: str, pattern: str) -> str:
    """Replace occurences of the pattern with spaces within the given text.

    Parameters:
        text: The text to modify.
        pattern: The pattern of characters to replace with spaces.

    Returns:
        The text with instances of the pattern exploded.
    """
    pattern = re.escape(pattern)
    return re.sub(rf"[{pattern}]", " ", text)
def remove_consecutive_spaces(text: str) ‑> str

Replace consecutive spaces with a single one in some given text.

Parameters

text: The text to modify.

Returns The supplied text with conseucutive spaces reduced to one.

Expand source code
def remove_consecutive_spaces(text: str) -> str:
    """Replace consecutive spaces with a single one in some given text.

    Parameters:
        text: The text to modify.

    Returns
        The supplied text with conseucutive spaces reduced to one.
    """
    return re.sub("[ ]+", " ", text)