# Source code for benchmarkstt.normalization

"""
Responsible for normalization of text.
"""

import os
from abc import ABC, abstractmethod
from benchmarkstt.normalization.logger import log
from benchmarkstt.factory import CoreFactory
from benchmarkstt import settings
from benchmarkstt import csv


class _NormalizerNoLogs(ABC):
    """
    Common interface for all normalizers; this base adds no logging itself.

    Concrete subclasses have to provide :meth:`normalize`.
    """

    def __repr__(self):
        # A normalizer is represented by the name of its concrete class.
        return self.__class__.__name__

    @abstractmethod
    def normalize(self, text: str) -> str:
        """
        Apply the rules of the concrete class to ``text`` and return the result.

        :param text: The text to normalize
        :return: The normalized text
        :raises NotImplementedError: when this base implementation is
            invoked directly (e.g. through ``super()``)
        """
        raise NotImplementedError


class Normalizer(_NormalizerNoLogs):
    """
    Abstract base class for normalization.

    The public :meth:`normalize` is wrapped with the imported ``log``
    decorator and delegates the actual work to :meth:`_normalize`, which
    concrete subclasses must implement.
    """

    @log
    def normalize(self, text: str) -> str:
        """
        Returns normalized text with rules supplied by the called class.

        :param text: The text to normalize
        :return: Whatever :meth:`_normalize` returns for ``text``
        """
        return self._normalize(text)

    @abstractmethod
    def _normalize(self, text: str) -> str:
        """
        Perform the actual normalization; to be implemented by subclasses.

        :param text: The text to normalize
        :return: The normalized text
        :raises NotImplementedError: when this base implementation is called

        :meta public:
        """
        raise NotImplementedError()


class NormalizerWithFileSupport(Normalizer):
    """
    This kind of normalization class supports loading the values from a file, i.e.
    being wrapped in a core.File wrapper.

    It adds no behavior of its own: it only marks a normalizer as usable
    that way, and still leaves :meth:`_normalize` abstract.
    """

    @abstractmethod
    def _normalize(self, text: str) -> str:
        """
        Perform the actual normalization; to be implemented by subclasses.

        :param text: The text to normalize
        :return: The normalized text
        :raises NotImplementedError: when this base implementation is called

        :meta public:
        """
        raise NotImplementedError()


class NormalizationAggregate(Normalizer):
    """
    Combining normalizers

    Normalizers are applied in the order in which they were added; the
    output of each one is the input of the next.
    """

    def __init__(self, title=None):
        """
        :param title: Name used by ``repr()``; defaults to the class name

        :meta public:
        """
        self._normalizers = []
        self._title = type(self).__name__ if title is None else title

    def add(self, normalizer):
        """Adds a normalizer to the composite "stack"

        :param normalizer: Object providing a ``normalize(text)`` method
        """
        self._normalizers.append(normalizer)

    def _normalize(self, text: str) -> str:
        """
        Run ``text`` through every added normalizer, in insertion order.

        :param text: The text to normalize
        :return: The text after all normalizers have been applied

        :meta public:
        """
        # With no normalizers (e.g. built from an empty file) the loop body
        # never runs and the text is returned unchanged.
        for normalizer in self._normalizers:
            text = normalizer.normalize(text)
        return text

    def __repr__(self):
        return self._title


class File(Normalizer):
    """
    Read one rule per line and pass it to the given normalizer

    :param str|class normalizer: Normalizer name (or class)
    :param file: The file to read rules from
    :param encoding: The file encoding

    :example text: "This is an Ex-Parakeet"
    :example normalizer: "regex"
    :example file: "./resources/test/normalizers/regex/en_US"
    :example encoding: "UTF-8"
    :example return: "This is an Ex Parrot"
    """

    def __init__(self, normalizer, file, encoding=None, path=None):
        """
        Build one normalizer per row of ``file`` and aggregate them.

        ``encoding`` falls back to ``settings.default_encoding`` when it is
        ``None``. When ``path`` is given, ``file`` is joined onto it before
        opening; the aggregate's title stays the ``file`` value as passed in.

        :raises ValueError: when a row's values are rejected by
            ``normalizer`` with a ``TypeError``

        :meta public:
        """
        if encoding is None:
            encoding = settings.default_encoding

        # Remember the name as given, before it is joined onto ``path``.
        title = file
        if path is not None:
            file = os.path.join(path, file)

        with open(file, encoding=encoding) as f:
            self._normalizer = NormalizationAggregate(title=title)
            for line in csv.reader(f):
                try:
                    # Each row's values become the positional arguments.
                    self._normalizer.add(normalizer(*line))
                except TypeError as e:
                    # Relies on the rows yielded by the project's csv reader
                    # carrying a ``lineno`` attribute. The original error is
                    # chained so its traceback is kept.
                    raise ValueError("%s:%d %r(%r) %r" % (file, line.lineno, normalizer, line, e)) from e

    def _normalize(self, text: str) -> str:
        """
        Apply every normalizer loaded from the file to ``text``.

        :param text: The text to normalize
        :return: The text after all loaded normalizers have been applied
        """
        return self._normalizer.normalize(text)


class FileFactory(CoreFactory):
    """
    Factory whose :meth:`create` wraps the requested class in :class:`File`.

    Item access (``factory[name]``) is deliberately disabled.
    """

    def create(self, name, file=None, encoding=None, path=None):
        """
        Look up the class registered under ``name`` and wrap it in ``File``.

        :param name: Name of the normalizer to look up
        :param file: The file to read rules from
        :param encoding: The file encoding
        :param path: Optional directory that ``file`` is joined onto
        :return: A :class:`File` instance
        """
        # Go through the parent's lookup, as this class disables __getitem__.
        cls = super().__getitem__(name)
        return File(cls, file, encoding, path=path)

    def __getitem__(self, item):
        """
        Not supported on this factory; use :meth:`create` instead.

        :raises NotImplementedError: always

        :meta public:
        """
        raise NotImplementedError("Not supported")


# Factory created with the logging-free base class ``_NormalizerNoLogs``.
factory = CoreFactory(_NormalizerNoLogs)
# Factory created with ``NormalizerWithFileSupport`` as its base class; its
# ``create`` wraps the looked-up class in ``File``.
# NOTE(review): the meaning of the positional ``False`` is defined by
# CoreFactory, which is not visible here — confirm.
file_factory = FileFactory(NormalizerWithFileSupport, False)