svanhvit committed on
Commit
5a692ce
1 Parent(s): b02cc92

highlighting added, first version

Browse files
Files changed (1) hide show
  1. testdiff.py +49 -0
testdiff.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import gradio as gr
4
+ from transformers import pipeline
5
+ import tokenizer
6
+ from difflib import Differ, SequenceMatcher
7
+
8
# Sample sentence pair fed to diff_texts and markup_diff by the demo calls
# at the end of this script.
# NOTE(review): text1 looks like deliberately misspelled input and text2 its
# corrected form -- confirm before "fixing" either string.
text1 = "Kver á á þenan bússtað"
text2 = "Hver á þennan bústað?"
10
+
11
def diff_texts(text1, text2):
    """Compare two sequences with ``difflib.Differ`` and tag every element.

    ``Differ.compare`` iterates over its arguments, so two plain strings are
    compared character by character, while two lists of strings are compared
    element by element.

    Args:
        text1: The "before" sequence (a string or a sequence of strings).
        text2: The "after" sequence, of the same kind as ``text1``.

    Returns:
        A list of ``(element, tag)`` tuples in diff order. ``tag`` is ``"-"``
        for elements only in ``text1``, ``"+"`` for elements only in
        ``text2`` and ``None`` for elements present in both.
    """
    return [
        # Each entry is a two-character code followed by the element itself.
        (entry[2:], None if entry[0] == " " else entry[0])
        for entry in Differ().compare(text1, text2)
        # "? " entries are Differ's intraline hint rows, emitted for similar
        # multi-character elements; they belong to neither input.
        if not entry.startswith("? ")
    ]
17
+
18
def split_text(text):
    """Split ``text`` into sentences and return them as a list.

    Delegates to ``tokenizer.split_into_sentences`` with ``original=True``
    and collects everything it yields.

    NOTE(review): ``original=True`` presumably keeps each sentence's original
    spacing instead of a normalised form -- confirm against the Tokenizer
    package documentation.

    Args:
        text: The text to split.

    Returns:
        A list holding the items produced by the sentence splitter, in order.
    """
    return list(tokenizer.split_into_sentences(text, original=True))
21
+
22
def mark_text(text, tag):
    """Pair a single item with its tag.

    Args:
        text: The item to label.
        tag: The label to attach to it.

    Returns:
        The two-element tuple ``(text, tag)``.
    """
    return text, tag
24
+
25
def mark_span(text, tag):
    """Label every element of ``text`` with the same ``tag``.

    Args:
        text: An iterable of items; a plain string is walked character by
            character.
        tag: The label shared by all items.

    Returns:
        A list with one ``mark_text(item, tag)`` result per item, in order.
    """
    return [
        mark_text(piece, tag)
        for piece in text
    ]
27
+
28
def markup_diff(a, b,
                mark=mark_span,
                default_mark=lambda x: x,
                isjunk=None):
    """Align ``a`` and ``b`` and run every aligned span through ``mark``.

    The sequences are aligned with ``difflib.SequenceMatcher`` (``autojunk``
    disabled). Each opcode yields one slice of ``a`` and one slice of ``b``;
    both slices are passed to ``mark`` together with the opcode tag
    (``'equal'``, ``'replace'``, ``'delete'`` or ``'insert'``) and the
    results are concatenated. Unchanged spans are marked too, with the tag
    ``'equal'``.

    Args:
        a: First sequence (sliceable, with hashable elements).
        b: Second sequence, of the same kind as ``a``.
        mark: Callable ``mark(span, tag)`` returning an iterable with one
            item per element of ``span``.
        default_mark: Accepted for interface compatibility but currently
            unused; equal spans also go through ``mark``.
        isjunk: Passed to ``SequenceMatcher`` as its junk predicate.

    Returns:
        A pair ``(out_a, out_b)`` of lists holding the marked elements of
        ``a`` and ``b`` respectively, in their original order.

    Raises:
        AssertionError: If ``mark`` did not return exactly one item per
            input element.
    """
    matcher = SequenceMatcher(isjunk=isjunk, a=a, b=b, autojunk=False)

    out_a, out_b = [], []
    for tag, a0, a1, b0, b1 in matcher.get_opcodes():
        # One side's slice is empty for 'insert' / 'delete' opcodes, so that
        # side contributes nothing for the opcode.
        out_a += mark(a[a0:a1], tag)
        out_b += mark(b[b0:b1], tag)
    # The opcodes cover both sequences completely, so a length mismatch here
    # means ``mark`` dropped or added items.
    assert len(out_a) == len(a), "mark must return one item per element of a"
    assert len(out_b) == len(b), "mark must return one item per element of b"
    return out_a, out_b
47
+
48
# Demo output 1: diff of the two sample strings as produced by diff_texts.
print(diff_texts(text1, text2))
# Demo output 2: diff of the same samples after splitting each on single
# spaces, as produced by markup_diff.
print(
    markup_diff(
        text1.split(" "),
        text2.split(" "),
    )
)