Source code for benchmarkstt.input.core

"""
Default input formats

"""

import os

import benchmarkstt.segmentation.core as segmenters
from benchmarkstt import input, settings


class PlainText(input.Input):
    """
    Plain text.
    """
    # NOTE: keep the class docstring above short and unchanged; File.available_types()
    # builds user-facing descriptions from the docstrings of registered input classes.

    def __init__(self, text, normalizer=None, segmenter=None):
        """
        Store the raw text together with the segmenter and normalizer to apply.

        :param text: the text to be segmented on iteration
        :param normalizer: passed through to the segmenter as ``normalizer=``
        :param segmenter: callable invoked as ``segmenter(text, normalizer=...)``;
                          falls back to ``segmenters.Simple`` when ``None``
        """
        self._text = text
        self._normalizer = normalizer
        self._segmenter = segmenters.Simple if segmenter is None else segmenter

    def __iter__(self):
        """
        Run the segmenter over the stored text and iterate over its result.
        """
        segmented = self._segmenter(self._text, normalizer=self._normalizer)
        return iter(segmented)


class File(input.Input):
    """
    Load from a given filename.
    """
    # NOTE: the class docstring above is emitted at runtime by available_types();
    # keep it a short, single sentence.

    # Maps a lower-cased file extension (without the dot) to the input class
    # used when the input type has to be inferred from the filename.
    _extension_to_class = {
        "txt": PlainText,
    }

    @classmethod
    def available_types(cls):
        """
        Describe every input type registered in ``input.factory`` except 'file'.

        :return: dict mapping the registered name to a description composed of
                 this class' docstring, 'Treat file as' and the docstring of the
                 registered class
        """
        return {cls_config.name: ' '.join([cls.__doc__.strip(),
                                           'Treat file as',
                                           cls_config.cls.__doc__.strip()])
                for cls_config in input.factory
                if cls_config.name != 'file'}

    def __init__(self, file, input_type=None, normalizer=None):
        """
        :param file: path of the file to load
        :param input_type: an input class, the name of one registered in
                           ``input.factory``, or ``None``/'infer' to pick the
                           class based on the file extension
        :param normalizer: passed through to the input class as ``normalizer=``
        :raises ValueError: when the input type must be inferred but the file
                            has no extension or an unknown one
        :raises OSError: when the file cannot be opened for reading
        """
        self._normalizer = normalizer
        if input_type is None or input_type == 'infer':
            # Only look at the final path component, so that a dot in a
            # directory name is never mistaken for an extension separator.
            filename = os.path.basename(file)
            if '.' not in filename:
                raise ValueError('Cannot infer input file type of files without an extension')

            extension = filename.rsplit('.', 1)[1].lower()
            if extension not in self._extension_to_class:
                raise ValueError('Cannot infer input file type for files of extension %s' % (extension,))

            input_type = self._extension_to_class[extension]

        # Fail early: open (and immediately close) the file to check it is
        # readable; the content itself is only read on iteration.
        with open(file, encoding=settings.default_encoding):
            pass

        self._file = file

        # A name (including str subclasses) refers to a class registered in the factory.
        if isinstance(input_type, str):
            input_type = input.factory[input_type]

        self._input_class = input_type

    def __iter__(self):
        """
        Read the whole file and iterate over the input class built from its text.
        """
        encoding = settings.default_encoding
        with open(self._file, encoding=encoding) as f:
            text = f.read()

        return iter(self._input_class(text, normalizer=self._normalizer))