Initial commit
- app.py +117 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,117 @@
import streamlit as st
import spacy
from paraphrase_metrics import metrics as pm
import time
import difflib

st.set_page_config(page_title="TextDiff Visualizer")

def render_single_para(paragraph, segment_info, prefix="a", other="b", gap=" "):
    # span template for differing text: LightCoral, switching to cyan on hover
    span_diff_1 = """<span style="background-color:LightCoral;color:black;border-radius:2px;" onmouseover="chbg_"""
    span_diff_2 = """('cyan')" onmouseout="chbg_"""
    span_diff_3 = """('LightCoral')" id='"""
    # span template for matching text: LightGreen, switching to cyan on hover
    span_same_1 = """<span style="background-color:LightGreen;color:black;border-radius:2px;" onmouseover="chbg_"""
    span_same_2 = """('cyan')" onmouseout="chbg_"""
    span_same_3 = """('LightGreen')" id='"""
    segments = ["<p>",]
    for i, m in enumerate(segment_info):
        span1_id = prefix+"_"+str(i)+"_1"
        span1_id_other = other+"_"+str(i)+"_1"
        if i > 0:
            m_prev = segment_info[i-1]
            segment1 = span_diff_1 + span1_id + span_diff_2 + span1_id + span_diff_3 + span1_id + "'>" + paragraph[m_prev[0]+m_prev[1]:m[0]] + "</span>"
        else:
            segment1 = span_diff_1 + span1_id + span_diff_2 + span1_id + span_diff_3 + span1_id + "'>" + paragraph[:m[0]] + "</span>"
        span2_id = prefix+"_"+str(i)+"_2"
        span2_id_other = other+"_"+str(i)+"_2"
        segment2 = span_same_1 + span2_id + span_same_2 + span2_id + span_same_3 + span2_id + "'>" + paragraph[m[0]:m[0]+m[1]] + "</span>"
        highlighting_code = """<script>
        function chbg_"""+span1_id+"""(colour){
            document.getElementById('"""+span1_id+"""').style.backgroundColor=colour;
            document.getElementById('"""+span1_id_other+"""').style.backgroundColor=colour;
        }
        function chbg_"""+span2_id+"""(colour){
            document.getElementById('"""+span2_id+"""').style.backgroundColor=colour;
            document.getElementById('"""+span2_id_other+"""').style.backgroundColor=colour;
        }
        </script>"""
        segments += [highlighting_code, segment1, segment2]
    segments.append("</p>")
    return gap.join(segments)

def render_diff(a_paragraph, b_paragraph, gap=" ", prefix=None):
    if prefix is None:
        prefix = str(int(time.time()))
    s = difflib.SequenceMatcher(None, a_paragraph.lower(), b_paragraph.lower(), autojunk=False)
    matching_blocks = s.get_matching_blocks()
    # column a: (start, size) of each matching block in a_paragraph
    a_segment_info = [[b.a,b.size] for b in matching_blocks]
    a_html_paragraph = render_single_para(a_paragraph, a_segment_info, gap=gap, prefix=prefix+"_a", other=prefix+"_b")
    # column b: (start, size) of each matching block in b_paragraph
    b_segment_info = [[b.b,b.size] for b in matching_blocks]
    b_html_paragraph = render_single_para(b_paragraph, b_segment_info, gap=gap, prefix=prefix+"_b", other=prefix+"_a")
    # two-column table holding both rendered paragraphs side by side
    table = """<table style="width:100%;font-family:sans-serif;font-size:large;"><tr style="background-color:white;padding:1px;">
    <td style="border: 1px solid silver;padding:0.4em;border-radius:4px;">"""+a_html_paragraph+"""</td>
    <td style="border: 1px solid silver;padding:0.4em;border-radius:4px;">"""+b_html_paragraph+"""</td>
    </tr></table>"""
    return table

@st.cache(allow_output_mutation=True)
def load_model():
    nlp = spacy.load("en_core_web_sm")
    return nlp

nlp = load_model()

st.markdown("### TextDiff Visualizer")

mode = st.selectbox("Input", ["Custom", "Examples"])

if mode == "Custom":
    col1, col2 = st.columns(2)
    with col1:
        text_A = st.text_area("Text 1", value="The findings are being published July 1st in the Annals of Internal Medicine.")
    with col2:
        text_B = st.text_area("Text 2", value="The findings are published in the July 1st issue of the Annals of Internal Medicine.")
else:
    examples = st.radio("Examples", [
        "The top rate will go to 4.45 percent for all residents with taxable incomes above $500,000. ; For residents with incomes above $500,000, the income-tax rate will increase to 4.45 percent.",
        "However, prosecutors have declined to take criminal action against guards, though Fine said his inquiry is not finished. ; Prosecutors have declined to take criminal action against corrections officers, although Fine said his inquiry was not finished.",
        "In trading on the New York Stock Exchange, Kraft shares fell 25 cents to close at $32.30. ; Kraft's shares fell 25 cents to close at $32.30 yesterday on the New York Stock Exchange.",
        "An attempt last month in the Senate to keep the fund open for another year fell flat. ; An attempt to keep the fund open for another year fell flat in the Senate last month.",
        "Prisoners were tortured and executed -- their ears and scalps severed for souvenirs. ; They frequently tortured and shot prisoners, severing ears and scalps for souvenirs.",
        "American has laid off 6,500 of its flight attendants since Dec. 31. ; Since October 2001, American has laid off 6,149 flight attendants.",
    ])
    text_A, text_B = examples.split(" ; ")

st.markdown("Visualization")

html_viz = render_diff(text_A, text_B)

st.components.v1.html(html_viz)

dist = round(pm.edit_distance(text_A, text_B), 2)
bleu = round(pm.self_bleu(text_A, text_B), 2)
text_A, text_B = nlp(text_A), nlp(text_B)
wpd = round(pm.wpd(text_A, text_B), 2)
ld = round(pm.ld(text_A, text_B), 2)

metriccol1, metriccol2, metriccol3, metriccol4 = st.columns(4)
metriccol1.metric("WPD", wpd)
metriccol2.metric("LD", ld)
metriccol3.metric("Edit Dist.", dist)
metriccol4.metric("BLEU", bleu)

with st.expander("More info"):
    st.markdown("""**Explanation of Metrics**

* **WPD**: Word Position Deviation measures structural changes between two paraphrases
* **LD**: Lexical Deviation measures the degree of vocabulary change between two paraphrases
* **Edit Dist.**: Levenshtein edit distance
* **BLEU**: self-BLEU score

For more information, see https://github.com/tlkh/paraphrase-metrics
""")
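The two-column rendering above is driven by difflib.SequenceMatcher.get_matching_blocks(): each (a, b, size) triple marks a span shared (case-insensitively) by both texts, and the gaps between consecutive triples become the LightCoral "diff" spans. A minimal sketch of that step outside Streamlit, using the app's default "Custom" texts:

import difflib

a = "The findings are being published July 1st in the Annals of Internal Medicine."
b = "The findings are published in the July 1st issue of the Annals of Internal Medicine."

# same call render_diff makes: case-insensitive, character-level, no junk heuristic
s = difflib.SequenceMatcher(None, a.lower(), b.lower(), autojunk=False)

prev_end = 0
for m in s.get_matching_blocks():      # the final block is always (len(a), len(b), 0)
    diff_part = a[prev_end:m.a]        # text not matched in b -> LightCoral span
    same_part = a[m.a:m.a + m.size]    # text shared with b -> LightGreen span
    prev_end = m.a + m.size
    print(repr(diff_part), "|", repr(same_part))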
requirements.txt
ADDED
@@ -0,0 +1,4 @@
streamlit
spacy
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm
paraphrase_metrics
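The four numbers in the metric row can be reproduced outside Streamlit with the same paraphrase_metrics calls app.py makes; a minimal sketch, assuming en_core_web_sm is installed as pinned above and using one of the app's bundled example pairs:

import spacy
from paraphrase_metrics import metrics as pm

nlp = spacy.load("en_core_web_sm")

text_A = "An attempt last month in the Senate to keep the fund open for another year fell flat."
text_B = "An attempt to keep the fund open for another year fell flat in the Senate last month."

# string-level metrics, computed on the raw text
dist = round(pm.edit_distance(text_A, text_B), 2)  # Levenshtein edit distance
bleu = round(pm.self_bleu(text_A, text_B), 2)      # self-BLEU

# WPD and LD operate on spaCy Doc objects
doc_A, doc_B = nlp(text_A), nlp(text_B)
wpd = round(pm.wpd(doc_A, doc_B), 2)               # Word Position Deviation
ld = round(pm.ld(doc_A, doc_B), 2)                 # Lexical Deviation

print("WPD:", wpd, "LD:", ld, "Edit Dist.:", dist, "BLEU:", bleu)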