Spaces:

jskim
/

paper-matching

Runtime error

App Files Files Community

jskim committed on Mar 13, 2023

Commit

5b4e16a

1 Parent(s): a3a8d41

better output formatting (removing spaces around punctuation)

Browse files

Files changed (2) hide show

app.py +3 -1
score.py +76 -0

app.py CHANGED Viewed

@@ -87,9 +87,11 @@ def get_similar_paper(
         # get scores for each word in the format for Gradio Interpretation component
         word_scores = dict()
         for i in range(num_sents):
             word_scores[str(i)] = {
                 "original": ab,
-                "interpretation": list(zip(info['all_words'], info[i]['scores']))
             }
         results[display_title[aa]] = {

         # get scores for each word in the format for Gradio Interpretation component
         word_scores = dict()
         for i in range(num_sents):
+            ww, ss = remove_spaces(info['all_words'], info[i]['scores'])
             word_scores[str(i)] = {
                 "original": ab,
+                "interpretation": list(zip(ww, ss))
             }
         results[display_title[aa]] = {

score.py CHANGED Viewed

@@ -78,6 +78,77 @@ def get_match_phrase(w1, w2, method='pos'):
                 mask1[j] = 1
     return mask1, mask2
 def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
     """
     Mark the words that are highlighted, both by in terms of sentence and phrase
@@ -158,6 +229,11 @@ def get_highlight_info(model, text1, text2, K=None):
         mask1 *= -sc # mark matching phrases as blue (-1: darkest)
         mask2 *= -sc # mark matching phrases as blue
         assert(len(mask1) == len(q_words) and len(mask2) == len(c_words))
         top_pairs_info[count] = {
             'query': {
                 'original': q_sent,

                 mask1[j] = 1
     return mask1, mask2
def remove_spaces(words, attrs):
    """Merge tokenizer-separated punctuation back onto its neighbouring word.

    Makes the highlighted output more readable by removing the spacing that
    the tokenizer puts around:

    1. parentheses
    2. single/double quotation marks
    3. commas, periods and similar trailing punctuation
    4. possessive quotes (``'s`` and the bare ``'`` after a word ending in s)

    Args:
        words: sequence of string tokens.
        attrs: sequence of per-token attributes, same length as ``words``.

    Returns:
        A ``(word_out, attr_out)`` tuple of two equally long lists. A merged
        entry keeps the attribute of the *word* it was glued to; the
        attribute of the punctuation token itself is dropped.

    Raises:
        AssertionError: if ``words`` and ``attrs`` differ in length.

    Edge cases that have no neighbour to merge with (punctuation as the very
    first token, an opening bracket/quote as the very last token) are kept as
    stand-alone entries with their own attribute instead of raising
    ``IndexError``.
    """
    assert(len(words) == len(attrs))

    # tokens that always stick to the word that appears right before
    attach_to_previous = (',', '.', '%', ')', ':', '?', ';', "'s")

    word_out, attr_out = [], []
    num_words = len(words)
    idx, single_q, double_q = 0, 0, 0

    def glue_to_previous(i):
        # Append token i to the last emitted word, keeping that word's attr.
        if word_out:
            word_out[-1] = word_out[-1] + words[i]
        else:
            # nothing emitted yet: keep the token on its own
            word_out.append(words[i])
            attr_out.append(attrs[i])
        return i + 1

    def glue_to_next(i):
        # Prepend token i to the following word, keeping that word's attr.
        # The following token is consumed here, so it is not examined by the
        # main loop on its own.
        if i + 1 < num_words:
            word_out.append(words[i] + words[i + 1])
            attr_out.append(attrs[i + 1])
            return i + 2
        # token is the last one: keep it on its own
        word_out.append(words[i])
        attr_out.append(attrs[i])
        return i + 1

    while idx < num_words:
        token = words[idx]
        if token in attach_to_previous:
            idx = glue_to_previous(idx)
        elif token == "(":
            idx = glue_to_next(idx)
        elif token == '"':
            double_q += 1
            if double_q == 2:
                # closing quote: stick to the word before, reset counter
                idx = glue_to_previous(idx)
                double_q = 0
            else:
                # opening quote: stick to the word after
                idx = glue_to_next(idx)
        elif token == "'":
            single_q += 1
            # A possessive quote needs a real preceding token; guarding on
            # idx > 0 avoids words[-1] wrapping around to the last word.
            if single_q == 2 or (idx > 0 and words[idx - 1].endswith('s')):
                # closing or possessive quote: stick to the word before
                idx = glue_to_previous(idx)
                single_q = 0
            else:
                # opening quote: stick to the word after
                idx = glue_to_next(idx)
        else:
            word_out.append(token)
            attr_out.append(attrs[idx])
            idx += 1

    assert(len(word_out) == len(attr_out))
    return word_out, attr_out
 def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
     """
     Mark the words that are highlighted, both by in terms of sentence and phrase
         mask1 *= -sc # mark matching phrases as blue (-1: darkest)
         mask2 *= -sc # mark matching phrases as blue
         assert(len(mask1) == len(q_words) and len(mask2) == len(c_words))
+        # spacing
+        q_words, mask1 = remove_spaces(q_words, mask1)
+        c_words, mask2 = remove_spaces(c_words, mask2)
         top_pairs_info[count] = {
             'query': {
                 'original': q_sent,