"""
Apply normalization to given input
"""
import sys
import argparse
import logging
from benchmarkstt import settings
[docs]def args_logs(parser: argparse.ArgumentParser):
parser.add_argument('--log', action='store_true',
help='show normalization logs (warning: for large files with many normalization rules this will'
' cause a significant performance penalty and a lot of output data)')
[docs]def args_normalizers(parser: argparse.ArgumentParser):
from benchmarkstt.normalization import factory
from benchmarkstt.cli import args_from_factory
normalizers_desc = """
A list of normalizers to execute on the input, can be one or more normalizers
which are applied sequentially.
The program will automatically find the normalizer in benchmarkstt.normalization.core,
then benchmarkstt.normalization and finally in the global namespace.
"""
normalizers = parser.add_argument_group('available normalizers', description=normalizers_desc)
args_from_factory('normalizers', factory, normalizers)
[docs]def argparser(parser: argparse.ArgumentParser):
"""
Adds the help and arguments specific to this module
"""
args_logs(parser)
files_desc = """
You can provide multiple input and output files, each preceded by -i and -o
respectively.
If no input file is given, only one output file can be used.
If using both multiple input and output files there should be an equal amount
of each. Each processed input file will then be written to the corresponding
output file."""
files = parser.add_argument_group('input and output files', description=files_desc)
args_inputfile(files)
files.add_argument('-o', '--outputfile', action='append', nargs=1,
help='write output to this file, defaults to STDOUT',
metavar='file')
args_normalizers(parser)
return parser
[docs]def get_normalizer_from_args(args):
from benchmarkstt.normalization import factory, NormalizationAggregate
from benchmarkstt.normalization.logger import DiffLoggingFormatter, normalization_logger
if args.log:
handler = logging.StreamHandler()
formatter = DiffLoggingFormatter('ansi', show_color_key=False)
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)
normalization_logger.logger.addHandler(handler)
composite = NormalizationAggregate()
if 'normalizers' in args:
for item in args.normalizers:
normalizer_name = item.pop(0).replace('-', '.')
normalizer = factory.create(normalizer_name, *item)
composite.add(normalizer)
return composite
[docs]def run(parser, args):
input_files = [f[0] for f in args.inputfile] if args.inputfile else None
output_files = [f[0] for f in args.outputfile] if args.outputfile else None
if 'normalizers' not in args or not len(args.normalizers):
parser.error("need at least one normalizer")
if input_files is None and output_files is not None:
parser.error("can only write output to stdout when reading from stdin")
composite = get_normalizer_from_args(args)
encoding = settings.default_encoding
if output_files is not None:
# pre-open the output files before doing the grunt work
output_files = [open(output_file, 'xt', encoding=encoding) for output_file in output_files]
if input_files is not None:
for idx, file in enumerate(input_files):
with open(file, encoding=encoding) as input_file:
text = input_file.read()
text = composite.normalize(text)
if output_files is None:
sys.stdout.write(text)
else:
output_file = output_files[idx]
output_file.write(text)
output_file.close()
else:
text = sys.stdin.read()
text = composite.normalize(text)
sys.stdout.write(text)