textaleidretting / testdiff.py
svanhvit's picture
highlighting added, first version
5a692ce
import gradio as gr
from transformers import pipeline
import tokenizer
from difflib import Differ, SequenceMatcher
# Sample input pair for the diff helpers defined in this file; text1 is passed
# as the first sequence and text2 as the second in the module-level demo calls.
# NOTE(review): presumably a misspelled sentence and its corrected form - confirm.
text1 = "Kver á á þenan bússtað"
text2 = "Hver á þennan bústað?"
def diff_texts(text1, text2):
    """Diff two sequences with ``difflib.Differ`` and return tagged items.

    Each line produced by ``Differ.compare`` starts with a two-character
    prefix (a marker followed by a space). The prefix is stripped from the
    item and the marker is kept as its tag.

    When plain strings are passed, the compared elements are the individual
    characters of each string.

    Args:
        text1: The first ("before") sequence.
        text2: The second ("after") sequence.

    Returns:
        A list of ``(item, marker)`` tuples, where ``marker`` is the first
        character of the Differ line (e.g. ``"+"`` or ``"-"``), or ``None``
        when that character is a space (unchanged item).
    """
    tagged = []
    for line in Differ().compare(text1, text2):
        marker, item = line[0], line[2:]
        # A blank marker means "present in both inputs": report it as untagged.
        tagged.append((item, None if marker == " " else marker))
    return tagged
def split_text(text):
    """Split ``text`` into sentences and return them as a list.

    Delegates to ``tokenizer.split_into_sentences`` with ``original=True``
    and materializes everything it yields.

    Args:
        text: The text to split.

    Returns:
        A list of the items yielded by ``tokenizer.split_into_sentences``.
        NOTE(review): presumably one string per sentence with the original
        spacing preserved - confirm against the tokenizer package docs.
    """
    return list(tokenizer.split_into_sentences(text, original=True))
def mark_text(text, tag):
    """Pair a single piece of text with its tag.

    Args:
        text: The item to label.
        tag: The label to attach.

    Returns:
        The two-element tuple ``(text, tag)``.
    """
    labelled = (text, tag)
    return labelled
def mark_span(text, tag):
    """Label every item of a span with the same tag.

    Args:
        text: An iterable of items (iterating a plain string yields its
            characters).
        tag: The label applied to each item.

    Returns:
        A list with one ``mark_text(item, tag)`` result per item, in order.
    """
    labelled = []
    for item in text:
        labelled.append(mark_text(item, tag))
    return labelled
def markup_diff(a, b,
                mark=mark_span,
                default_mark=lambda x: x,
                isjunk=None):
    """Tag every item of ``a`` and ``b`` with its SequenceMatcher opcode.

    The two sequences are aligned with ``difflib.SequenceMatcher``
    (``autojunk`` disabled). For each opcode, the matching slice of ``a`` and
    of ``b`` is passed to ``mark`` together with the opcode tag, and the
    results are concatenated. Every span goes through ``mark``, including
    spans whose tag is ``'equal'``.

    Args:
        a: First sequence (must support slicing and ``len``).
        b: Second sequence (must support slicing and ``len``).
        mark: Callable ``mark(span, tag)`` returning an iterable with one
            entry per item of ``span``.
        default_mark: Accepted for interface compatibility; not used by the
            current implementation.
        isjunk: Forwarded to ``SequenceMatcher`` as its junk predicate.

    Returns:
        A tuple ``(out_a, out_b)`` of lists holding the marked items of ``a``
        and ``b`` respectively.

    Raises:
        AssertionError: If the marked output does not have the same number of
            entries as the corresponding input.
    """
    matcher = SequenceMatcher(isjunk=isjunk, a=a, b=b, autojunk=False)
    marked_a = []
    marked_b = []
    for opcode in matcher.get_opcodes():
        op, a_start, a_end, b_start, b_end = opcode
        # One side's slice may be empty (e.g. on insert/delete opcodes);
        # it is still handed to ``mark`` with the same tag.
        marked_a.extend(mark(a[a_start:a_end], op))
        marked_b.extend(mark(b[b_start:b_end], op))
    # Sanity check: marking must yield exactly one entry per input item.
    assert len(marked_a) == len(a)
    assert len(marked_b) == len(b)
    return marked_a, marked_b
# Demo output, executed whenever this module is run or imported.
# First: diff of the two sample strings, compared element by element
# (for plain strings, that is character by character).
print(diff_texts(text1, text2))
# Second: the same samples split on single spaces, aligned token by token,
# with every token tagged by its SequenceMatcher opcode.
print(markup_diff(text1.split(" "), text2.split(" ")))