import gradio as gr
from transformers import pipeline
import tokenizer
from difflib import Differ, SequenceMatcher

# Sample inputs used by the demo calls at the bottom of the file.
text1 = "Kver á á þenan bússtað"
text2 = "Hver á þennan bústað?"


def diff_texts(text1, text2):
    """Return an element-wise diff of two sequences as (text, label) pairs.

    Each item produced by ``Differ.compare`` starts with a two-character
    code. The code is stripped from the text, and its first character is
    used as the label, except that a space is mapped to ``None``.
    When both arguments are plain strings, the comparison runs character
    by character.
    """
    d = Differ()
    return [
        (token[2:], token[0] if token[0] != " " else None)
        for token in d.compare(text1, text2)
    ]


def split_text(text):
    """Return the sentences of ``text`` as a list.

    Delegates to ``tokenizer.split_into_sentences(text, original=True)``.
    NOTE(review): ``original=True`` presumably keeps the original spelling
    and spacing of each sentence; confirm against the tokenizer docs.
    """
    return list(tokenizer.split_into_sentences(text, original=True))


def mark_text(text, tag):
    """Pair a single piece of text with its tag as a ``(text, tag)`` tuple."""
    return (text, tag)


def mark_span(text, tag):
    """Tag every element of ``text`` with ``tag``.

    Returns a list with one ``(element, tag)`` tuple per element of the
    iterable ``text``.
    """
    return [mark_text(token, tag) for token in text]


def markup_diff(a, b, mark=mark_span, default_mark=lambda x: x, isjunk=None):
    """Return ``a`` and ``b`` with every opcode span processed by ``mark``.

    The two sequences are aligned with ``SequenceMatcher`` (``autojunk``
    disabled). For each opcode, the matching slice of ``a`` and of ``b`` is
    passed to ``mark`` together with the opcode tag, and the results are
    concatenated in order.

    Args:
        a: First sequence to compare.
        b: Second sequence to compare.
        mark: Callable ``mark(span, tag)`` returning an iterable with one
            output item per element of ``span``. It is applied to every
            span, including ``'equal'`` ones.
        default_mark: Currently unused; kept so existing callers that pass
            it keep working.
        isjunk: Passed through to ``SequenceMatcher`` as its junk predicate.

    Returns:
        A tuple ``(out_a, out_b)`` of lists with the marked-up elements of
        ``a`` and ``b`` respectively.

    Raises:
        AssertionError: If ``mark`` does not yield exactly one item per
            input element (check is skipped when Python runs with ``-O``).
    """
    seqmatcher = SequenceMatcher(isjunk=isjunk, a=a, b=b, autojunk=False)
    out_a, out_b = [], []
    for tag, a0, a1, b0, b1 in seqmatcher.get_opcodes():
        out_a += mark(a[a0:a1], tag)
        out_b += mark(b[b0:b1], tag)
    # The opcodes cover both sequences completely, so a length mismatch can
    # only come from a ``mark`` callable that adds or drops items.
    assert len(out_a) == len(a), "mark must return one item per element of a"
    assert len(out_b) == len(b), "mark must return one item per element of b"
    return out_a, out_b


# Demo output; runs whenever the module is executed or imported.
print(diff_texts(text1, text2))
print(markup_diff(text1.split(" "), text2.split(" ")))