Source code for benchmarkstt.diff.formatter

import logging
from benchmarkstt.helpers import make_printable
import difflib
from markupsafe import escape
from benchmarkstt.schema import Schema
from io import StringIO
from collections import OrderedDict

logger = logging.getLogger(__name__)


[docs]class Dialect: preprocessor = None delete_format = '%s' insert_format = '%s' equal_format = '%s' replace_format = None def __init__(self): self._stream = StringIO() @property def stream(self): return self._stream def __enter__(self): self._stream = StringIO() return self._stream def __exit__(self, exc_type, exc_val, exc_tb): pass
[docs] def output(self): return self._stream.getvalue()
[docs]class ANSIDiffDialect(Dialect): def __init__(self, show_color_key=None): self.show_color_key = bool(show_color_key) if show_color_key is not None else True self.color_key = 'Color key: Unchanged %s %s\n\n' % ( self.delete_format % 'Reference', self.insert_format % 'Hypothesis') super().__init__()
[docs] @staticmethod def preprocessor(txt): return make_printable(txt)
def __enter__(self): super().__enter__() self._stream = StringIO() if self.show_color_key: self._stream.write(self.color_key) return self delete_format = '\033[31m%s\033[0m' insert_format = '\033[32m%s\033[0m'
[docs]class UTF8Dialect(Dialect):
[docs] @staticmethod def preprocessor(txt): return make_printable(txt)
[docs] def delete_format(self, txt): self._stream.writelines(c + '\u0338' for c in txt)
[docs] def insert_format(self, txt): self._stream.writelines(c + '\u0359' for c in txt)
[docs]class HTMLDiffDialect(Dialect):
[docs] @staticmethod def preprocessor(txt): return escape(txt)
delete_format = '<span class="delete">%s</span>' insert_format = '<span class="insert">%s</span>'
[docs]class RestructuredTextDialect(ANSIDiffDialect):
[docs] @staticmethod def preprocessor(txt): return ANSIDiffDialect.preprocessor(txt).replace('·', '\u200B·\u200B').replace('`', r'\`')
delete_format = '\\ :diffdelete:`%s`\\ ' insert_format = '\\ :diffinsert:`%s`\\ '
[docs]class ListDialect(Dialect):
[docs] @staticmethod def preprocessor(txt): return txt
[docs] def delete_format(self, txt): return self._format('delete', txt)
[docs] def insert_format(self, txt): return self._format('insert', txt)
[docs] def equal_format(self, txt): return self._format('equal', txt)
[docs] def replace_format(self, a, b): return self._format('replace', a, b)
def _format(self, kind, txt, txt2=None): txt = txt.split() if txt2 is None: txt2 = txt else: txt2 = txt2.split() oor = len(txt2) for idx, word in enumerate(txt): ref = word if kind != 'insert' else None if idx >= oor: kind = 'delete' hyp = txt2[idx] if kind != 'delete' else None result = OrderedDict((('type', kind), ('reference', ref), ('hypothesis', hyp))) self._output.append(result) if idx < oor: for word_ in txt2[idx+1:]: result = OrderedDict((('type', 'insert'), ('reference', None), ('hypothesis', word_))) self._output.append(result) def __enter__(self): self._output = [] return self def __exit__(self, exc_type, exc_val, exc_tb): pass
[docs] def output(self): return self._output
[docs]class JSONDiffDialect(ListDialect): def __init__(self): self._line = None def __enter__(self): super().__enter__() self._stream = StringIO() self._line = 0 return self def __exit__(self, exc_type, exc_val, exc_tb): super().__exit__(exc_type, exc_val, exc_tb) self._line = None self._stream.write(Schema.dumps(super().output()))
[docs] def output(self): return self._stream.getvalue()
[docs]class DiffFormatter: diff_dialects = { "ansi": ANSIDiffDialect, "html": HTMLDiffDialect, "text": UTF8Dialect, "json": JSONDiffDialect, "list": ListDialect, "rst": RestructuredTextDialect, } def __init__(self, dialect=None, *args, **kwargs): if dialect is None: dialect = 'text' if not self.has_dialect(dialect): raise ValueError("Unknown diff dialect", dialect) self._dialect = self.diff_dialects[dialect](*args, **kwargs)
[docs] def diff(self, a, b, opcodes=None, preprocessor=None): formats = dict(insert=None, delete=None, equal=None, replace=None) dialect = self._dialect with dialect as stream: def format_string(formatting): def _(*args): dialect.stream.write(formatting % args) return _ # dialect = self._dialect for f in formats.keys(): formatter = getattr(dialect, f + '_format') if type(formatter) is str: formats[f] = format_string(formatter) else: formats[f] = formatter if formats['replace'] is None: def _(deleted, inserted): formats['delete'](deleted) formats['insert'](inserted) formats['replace'] = _ if preprocessor is not None: def _pre(txt): return dialect.preprocessor(preprocessor(txt)) else: _pre = dialect.preprocessor if opcodes is None: opcodes = difflib.SequenceMatcher(None, a, b).get_opcodes() for tag, alo, ahi, blo, bhi in opcodes: a_ = _pre(a[alo:ahi]) if tag in ('equal', 'delete'): formats[tag](_pre(a[alo:ahi])) else: b_ = _pre(b[blo:bhi]) if tag == 'insert': formats[tag](b_) else: formats[tag](a_, b_) return dialect.output()
[docs] @classmethod def has_dialect(cls, dialect): return dialect in cls.diff_dialects
[docs]def format_diff(a, b, opcodes=None, dialect=None, preprocessor=None): formatter = DiffFormatter(dialect) return formatter.diff(a, b, opcodes, preprocessor)