jskim committed on
Commit
5b4e16a
1 Parent(s): a3a8d41

better output formatting (removing spaces around punctuation)

Browse files
Files changed (2) hide show
  1. app.py +3 -1
  2. score.py +76 -0
app.py CHANGED
@@ -87,9 +87,11 @@ def get_similar_paper(
87
  # get scores for each word in the format for Gradio Interpretation component
88
  word_scores = dict()
89
  for i in range(num_sents):
 
 
90
  word_scores[str(i)] = {
91
  "original": ab,
92
- "interpretation": list(zip(info['all_words'], info[i]['scores']))
93
  }
94
 
95
  results[display_title[aa]] = {
 
87
  # get scores for each word in the format for Gradio Interpretation component
88
  word_scores = dict()
89
  for i in range(num_sents):
90
+
91
+ ww, ss = remove_spaces(info['all_words'], info[i]['scores'])
92
  word_scores[str(i)] = {
93
  "original": ab,
94
+ "interpretation": list(zip(ww, ss))
95
  }
96
 
97
  results[display_title[aa]] = {
score.py CHANGED
@@ -78,6 +78,77 @@ def get_match_phrase(w1, w2, method='pos'):
78
  mask1[j] = 1
79
  return mask1, mask2
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
82
  """
83
  Mark the words that are highlighted, both by in terms of sentence and phrase
@@ -158,6 +229,11 @@ def get_highlight_info(model, text1, text2, K=None):
158
  mask1 *= -sc # mark matching phrases as blue (-1: darkest)
159
  mask2 *= -sc # mark matching phrases as blue
160
  assert(len(mask1) == len(q_words) and len(mask2) == len(c_words))
 
 
 
 
 
161
  top_pairs_info[count] = {
162
  'query': {
163
  'original': q_sent,
 
78
  mask1[j] = 1
79
  return mask1, mask2
80
 
81
def remove_spaces(words, attrs):
    """Merge tokenizer-separated punctuation back onto its neighbouring word.

    Makes the output more readable by removing the spacing a tokenizer
    introduces around:
      1. parentheses
      2. single/double quotation marks
      3. commas, periods and similar trailing punctuation
      4. possessive quotes ("'s" and a bare "'" after a word ending in "s")

    Args:
        words: sequence of string tokens.
        attrs: sequence of per-token attributes, same length as ``words``.

    Returns:
        A ``(word_out, attr_out)`` tuple of two equal-length lists. When a
        token is glued onto the previous word, the previous word's attribute
        is kept; when it is glued onto the following word, the following
        word's attribute is kept. The glued token's own attribute is dropped.

    Raises:
        ValueError: if ``words`` and ``attrs`` differ in length.
    """
    if len(words) != len(attrs):
        raise ValueError(
            "words and attrs must have the same length "
            "(got %d and %d)" % (len(words), len(attrs))
        )

    # tokens that stick to the word that appears right before them
    trailing_tokens = (',', '.', '%', ')', ':', '?', ';', "'s")

    word_out, attr_out = [], []
    total = len(words)

    def glue_to_previous(pos):
        # Append the token to the last emitted word, keeping that word's
        # attribute. With nothing emitted yet, keep the token on its own
        # instead of popping from an empty list.
        if word_out:
            word_out[-1] = word_out[-1] + words[pos]
        else:
            word_out.append(words[pos])
            attr_out.append(attrs[pos])
        return pos + 1

    def glue_to_next(pos):
        # Prepend the token to the following word, keeping that word's
        # attribute. The following token is consumed as-is (it is not
        # re-examined for punctuation). With no following token, keep the
        # token on its own instead of indexing past the end.
        if pos + 1 < total:
            word_out.append(words[pos] + words[pos + 1])
            attr_out.append(attrs[pos + 1])
            return pos + 2
        word_out.append(words[pos])
        attr_out.append(attrs[pos])
        return pos + 1

    idx, single_q, double_q = 0, 0, 0
    while idx < total:
        token = words[idx]
        if token in trailing_tokens:
            idx = glue_to_previous(idx)
        elif token == "(":
            # stick to the word that appears right after
            idx = glue_to_next(idx)
        elif token == '"':
            double_q += 1
            if double_q == 2:
                # closing quote: stick to the word before
                idx = glue_to_previous(idx)
                double_q = 0
            else:
                # opening quote: stick to the word after
                idx = glue_to_next(idx)
        elif token == "'":
            single_q += 1
            if single_q == 2:
                # closing quote: stick to the word before
                idx = glue_to_previous(idx)
                single_q = 0
            elif idx > 0 and words[idx - 1].endswith('s'):
                # possessive quote (e.g. "dogs '"): stick to the word
                # before and reset the counter. The idx > 0 guard stops
                # words[idx - 1] from wrapping around to the last token.
                idx = glue_to_previous(idx)
                single_q = 0
            else:
                # opening quote: stick to the word after
                idx = glue_to_next(idx)
        else:
            word_out.append(token)
            attr_out.append(attrs[idx])
            idx += 1

    return word_out, attr_out
151
+
152
  def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
153
  """
154
  Mark the words that are highlighted, both by in terms of sentence and phrase
 
229
  mask1 *= -sc # mark matching phrases as blue (-1: darkest)
230
  mask2 *= -sc # mark matching phrases as blue
231
  assert(len(mask1) == len(q_words) and len(mask2) == len(c_words))
232
+
233
+ # spacing
234
+ q_words, mask1 = remove_spaces(q_words, mask1)
235
+ c_words, mask2 = remove_spaces(c_words, mask2)
236
+
237
  top_pairs_info[count] = {
238
  'query': {
239
  'original': q_sent,