Source code for benchmarkstt.normalization

from benchmarkstt.normalization.logger import log
import logging
from benchmarkstt.factory import Factory
from benchmarkstt import settings
from benchmarkstt import csv
import os

# Namespaces handed to both factories defined at the bottom of this module
# (`factory` and `file_factory`) as their second argument.
# NOTE(review): the empty-string entry presumably lets a fully qualified
# name be used without a namespace prefix — confirm against
# benchmarkstt.factory.Factory.
_normalizer_namespaces = (
    "benchmarkstt.normalization.core",
    ""
)


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class Base:
    """
    Common parent of all normalizers.

    Callers go through :meth:`normalize`; subclasses provide the actual
    rules by overriding :meth:`_normalize`.
    """

    def __repr__(self):
        # The bare class name is used as the representation.
        return type(self).__name__

    def _normalize(self, text: str) -> str:
        """Apply the normalization rules to *text*; subclasses must override."""
        raise NotImplementedError()

    @log
    def normalize(self, text: str) -> str:
        """
        Returns normalized text with rules supplied by the called class.
        """
        normalized = self._normalize(text)
        return normalized
class BaseWithFileSupport(Base):
    """
    Base for normalization classes that support loading their values from
    a file, i.e. that can be wrapped in a core.File wrapper.
    """

    def _normalize(self, text: str) -> str:
        # Still abstract at this level: concrete normalizers supply the rules.
        raise NotImplementedError()
class NormalizationComposite(Base):
    """
    Combines normalizers: they are applied in the order they were added,
    each one receiving the output of the previous one.
    """

    def __init__(self, title=None):
        # Without an explicit title the class name is used for repr().
        if title is None:
            title = type(self).__name__
        self._title = title
        self._normalizers = []

    def __repr__(self):
        return self._title

    def add(self, normalizer):
        """Adds a normalizer to the composite "stack" """
        self._normalizers.append(normalizer)

    def _normalize(self, text: str) -> str:
        result = text
        # With no normalizers (e.g. an empty file) the loop body never runs
        # and the text is returned unchanged.
        for step in self._normalizers:
            result = step.normalize(result)
        return result
class File(Base):
    """
    Read one per line and pass it to the given normalizer

    :param str|class normalizer: Normalizer name (or class)
    :param file: The file to read rules from
    :param encoding: The file encoding

    :example text: "This is an Ex-Parakeet"
    :example normalizer: "regex"
    :example file: "./resources/test/normalizers/regex/en_US"
    :example encoding: "UTF-8"
    :example return: "This is an Ex Parrot"
    """

    def __init__(self, normalizer, file, encoding=None, path=None):
        # Fall back to the configured default when no encoding is given.
        if encoding is None:
            encoding = settings.default_encoding

        # The composite is titled with the file name exactly as passed in,
        # i.e. before the optional path prefix is joined on.
        title = file
        location = file if path is None else os.path.join(path, file)

        with open(location, encoding=encoding) as handle:
            composite = self._normalizer = NormalizationComposite(title=title)

            # Every row of the file becomes the positional arguments of one
            # normalizer instance.
            for row in csv.reader(handle):
                try:
                    composite.add(normalizer(*row))
                except TypeError as err:
                    # Report the offending file location along with the
                    # normalizer and arguments that could not be combined.
                    details = (location, row.lineno, normalizer, row, err)
                    raise ValueError("%s:%d %r(%r) %r" % details)

    def _normalize(self, text: str) -> str:
        # Delegate to the composite built from the file's rows.
        return self._normalizer.normalize(text)
# Module-level factory built for Base with the namespaces listed in
# _normalizer_namespaces.
# NOTE(review): presumably used to look up normalizer classes by name —
# confirm against benchmarkstt.factory.Factory.
factory = Factory(Base, _normalizer_namespaces)
class FileFactory(Factory):
    """
    Factory variant that hands out file-backed normalizers through
    :meth:`create`; plain item lookup is disabled.
    """

    def __getitem__(self, item):
        # Direct lookup is intentionally unavailable on this factory;
        # create() reaches the parent's lookup through super().
        raise NotImplementedError("Not supported")

    def create(self, name, file=None, encoding=None, path=None):
        """
        Look up *name* through the parent factory and wrap the result in a
        :class:`File` that reads its arguments from *file*.

        :param name: Name passed to the parent factory's lookup
        :param file: The file to read rules from
        :param encoding: The file encoding
        :param path: Optional directory that *file* is joined onto
        """
        normalizer_cls = super().__getitem__(name)
        return File(normalizer_cls, file, encoding, path=path)
# Module-level FileFactory built for BaseWithFileSupport with the same
# namespaces as `factory`; its create() returns File-wrapped normalizers.
file_factory = FileFactory(BaseWithFileSupport, _normalizer_namespaces)