Spaces:
Runtime error
Runtime error
Better output formatting (removing spaces around punctuation)
Browse files
app.py
CHANGED
@@ -87,9 +87,11 @@ def get_similar_paper(
|
|
87 |
# get scores for each word in the format for Gradio Interpretation component
|
88 |
word_scores = dict()
|
89 |
for i in range(num_sents):
|
|
|
|
|
90 |
word_scores[str(i)] = {
|
91 |
"original": ab,
|
92 |
-
"interpretation": list(zip(
|
93 |
}
|
94 |
|
95 |
results[display_title[aa]] = {
|
|
|
87 |
# get scores for each word in the format for Gradio Interpretation component
|
88 |
word_scores = dict()
|
89 |
for i in range(num_sents):
|
90 |
+
|
91 |
+
ww, ss = remove_spaces(info['all_words'], info[i]['scores'])
|
92 |
word_scores[str(i)] = {
|
93 |
"original": ab,
|
94 |
+
"interpretation": list(zip(ww, ss))
|
95 |
}
|
96 |
|
97 |
results[display_title[aa]] = {
|
score.py
CHANGED
@@ -78,6 +78,77 @@ def get_match_phrase(w1, w2, method='pos'):
|
|
78 |
mask1[j] = 1
|
79 |
return mask1, mask2
|
80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
|
82 |
"""
|
83 |
Mark the words that are highlighted, both by in terms of sentence and phrase
|
@@ -158,6 +229,11 @@ def get_highlight_info(model, text1, text2, K=None):
|
|
158 |
mask1 *= -sc # mark matching phrases as blue (-1: darkest)
|
159 |
mask2 *= -sc # mark matching phrases as blue
|
160 |
assert(len(mask1) == len(q_words) and len(mask2) == len(c_words))
|
|
|
|
|
|
|
|
|
|
|
161 |
top_pairs_info[count] = {
|
162 |
'query': {
|
163 |
'original': q_sent,
|
|
|
78 |
mask1[j] = 1
|
79 |
return mask1, mask2
|
80 |
|
81 |
+
def remove_spaces(words, attrs):
    """Merge tokenizer-separated punctuation back onto neighbouring words.

    Makes the output more readable by removing the spacing the tokenizer
    introduced around:

    1. parentheses,
    2. single/double quotation marks,
    3. commas, periods and similar trailing punctuation,
    4. possessive apostrophes (``'s`` and a bare ``'`` after a word ending
       in ``s``).

    A merged token keeps the attribute of the *word* it was attached to; the
    attribute of the punctuation token itself is dropped.

    Args:
        words: sequence of string tokens.
        attrs: sequence of per-token attributes, same length as ``words``.

    Returns:
        A ``(word_out, attr_out)`` tuple of two equal-length lists.

    Raises:
        AssertionError: if ``words`` and ``attrs`` differ in length.
    """
    assert len(words) == len(attrs), "words and attrs must have equal length"

    # Tokens glued to the word that appears right before them.
    attach_before = (',', '.', '%', ')', ':', '?', ';', "'s")
    # Tokens glued to the word that appears right after them.
    attach_after = ('(',)

    word_out, attr_out = [], []
    total = len(words)

    def glue_to_previous(pos):
        # Append the token to the last emitted word, keeping that word's
        # attribute.  If nothing has been emitted yet (punctuation is the
        # very first token) keep the token on its own instead of crashing.
        if word_out:
            word_out[-1] = word_out[-1] + words[pos]
        else:
            word_out.append(words[pos])
            attr_out.append(attrs[pos])
        return pos + 1

    def glue_to_next(pos):
        # Prefix the following word with this token, using the following
        # word's attribute.  If there is no following word (token is the
        # last one) keep the token on its own instead of crashing.
        if pos + 1 < total:
            word_out.append(words[pos] + words[pos + 1])
            attr_out.append(attrs[pos + 1])
            return pos + 2
        word_out.append(words[pos])
        attr_out.append(attrs[pos])
        return pos + 1

    idx = 0
    # Counters tracking whether we are currently inside an open quote.
    single_q, double_q = 0, 0
    while idx < total:
        token = words[idx]
        if token in attach_before:
            idx = glue_to_previous(idx)
        elif token in attach_after:
            idx = glue_to_next(idx)
        elif token == '"':
            double_q += 1
            if double_q == 2:
                # Closing quote: stick to the word before.
                idx = glue_to_previous(idx)
                double_q = 0
            else:
                # Opening quote: stick to the word after.
                idx = glue_to_next(idx)
        elif token == "'":
            single_q += 1
            if single_q == 2:
                # Closing quote: stick to the word before.
                idx = glue_to_previous(idx)
                single_q = 0
            elif idx > 0 and words[idx - 1].endswith('s'):
                # Possessive apostrophe (e.g. "dogs '"): stick to the word
                # before and reset the counter.  The idx > 0 guard avoids
                # wrapping around to the last token via words[-1].
                idx = glue_to_previous(idx)
                single_q = 0
            else:
                # Opening quote: stick to the word after.
                idx = glue_to_next(idx)
        else:
            word_out.append(token)
            attr_out.append(attrs[idx])
            idx += 1

    assert len(word_out) == len(attr_out)
    return word_out, attr_out
|
151 |
+
|
152 |
def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
|
153 |
"""
|
154 |
Mark the words that are highlighted, both by in terms of sentence and phrase
|
|
|
229 |
mask1 *= -sc # mark matching phrases as blue (-1: darkest)
|
230 |
mask2 *= -sc # mark matching phrases as blue
|
231 |
assert(len(mask1) == len(q_words) and len(mask2) == len(c_words))
|
232 |
+
|
233 |
+
# spacing
|
234 |
+
q_words, mask1 = remove_spaces(q_words, mask1)
|
235 |
+
c_words, mask2 = remove_spaces(c_words, mask2)
|
236 |
+
|
237 |
top_pairs_info[count] = {
|
238 |
'query': {
|
239 |
'original': q_sent,
|