Liyan06 committed
Commit 3fe4664
1 Parent(s): 3fbb656

add span highlight (rouge) for neg chunk

Files changed (1): handler.py (+47 -29)
handler.py CHANGED
@@ -5,6 +5,7 @@ import evaluate
 
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+from heapq import heappush, heappop
 
 
 def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
@@ -51,7 +52,9 @@ class EndpointHandler():
     def __init__(self, path="./"):
         self.scorer = MiniCheck(path=path)
         self.rouge = evaluate.load('rouge')
+
         self.tfidf_order = True
+        self.num_highlights = 1
 
 
     def __call__(self, data):
@@ -64,20 +67,17 @@ class EndpointHandler():
             _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=data)
             ranked_docs, scores = sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk)
 
-            span_to_highlight = []
-            for doc_chunk, score in zip(ranked_docs, scores):
-                # If the chunk can support the claim, find the sentence with the highest rouge score
-                if score > 0.5:
-                    highest_score_sent, _ = self.chunk_and_highest_rouge_score(doc_chunk, claim)
-                    span_to_highlight.append(highest_score_sent)
-                else:
-                    span_to_highlight.append("")
+            span_to_highlight, rouge_score = [], []
+            for doc_chunk in ranked_docs:
+                highest_score_sent, rouge_score = self.chunk_and_highest_rouge_score(doc_chunk, claim, k=self.num_highlights)
+                span_to_highlight.append(highest_score_sent)
 
             outputs = {
                 'ranked_docs': ranked_docs,
                 'scores': scores,
                 'span_to_highlight': span_to_highlight,
-                'entities': ents
+                'entities': ents,
+                'rouge_score': rouge_score
             }
 
         else:
@@ -85,21 +85,18 @@ class EndpointHandler():
 
             ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim, tfidf_order=self.tfidf_order)
 
-            span_to_highlight = []
-            for doc_chunk, score in zip(ranked_docs, scores):
-                # If the chunk can support the claim, find the sentence with the highest rouge score
-                if score > 0.5:
-                    highest_score_sent, _ = self.chunk_and_highest_rouge_score(doc_chunk, claim)
-                    span_to_highlight.append(highest_score_sent)
-                else:
-                    span_to_highlight.append("")
-
+            span_to_highlight, rouge_score = [], []
+            for doc_chunk in ranked_docs:
+                highest_score_sent, rouge_score = self.chunk_and_highest_rouge_score(doc_chunk, claim, k=self.num_highlights)
+                span_to_highlight.append(highest_score_sent)
+
             outputs = {
                 'ranked_docs': ranked_docs,
                 'scores': scores,
                 'ranked_urls': ranked_urls,
                 'span_to_highlight': span_to_highlight,
-                'entities': ents
+                'entities': ents,
+                'rouge_score': rouge_score
             }
 
         return outputs
@@ -159,10 +156,9 @@ class EndpointHandler():
         return ranked_docs, scores, ranked_urls
 
 
-    def chunk_and_highest_rouge_score(self, doc, claim):
-
+    def chunk_and_highest_rouge_score(self, doc, claim, k=1):
         '''
-        Given a document and a claim, return the sentence with the highest rouge score and the score
+        Given a document and a claim, return the top k sentences with the highest rouge scores and their scores
         '''
 
         doc_sentences = sent_tokenize(doc)
@@ -173,11 +169,33 @@ class EndpointHandler():
                                      references=claims,
                                      use_aggregator=False)
 
-        highest_score = 0
-        highest_score_sent = ""
+        # Initialize a min heap to store the top k sentences and their scores
+        top_k_heap = []
+
         for i in range(len(doc_sentences)):
-            if results['rouge1'][i] > highest_score:
-                highest_score = results['rouge1'][i]
-                highest_score_sent = doc_sentences[i]
-
-        return highest_score_sent, highest_score
+            score = results['rouge1'][i]
+            sentence = doc_sentences[i]
+
+            # If the heap has less than k elements, push the current sentence and score
+            if len(top_k_heap) < k:
+                heappush(top_k_heap, (score, sentence))
+            else:
+                # If the current score is higher than the minimum score in the heap,
+                # remove the minimum and push the current sentence and score
+                if score > top_k_heap[0][0]:
+                    heappop(top_k_heap)
+                    heappush(top_k_heap, (score, sentence))
+
+        # Extract the top k sentences and scores from the heap
+        top_k_sentences = []
+        top_k_scores = []
+        while top_k_heap:
+            score, sentence = heappop(top_k_heap)
+            top_k_sentences.append(sentence)
+            top_k_scores.append(score)
+
+        # Reverse the order of sentences and scores to get them in descending order
+        top_k_sentences = top_k_sentences[::-1]
+        top_k_scores = top_k_scores[::-1]
+
+        return top_k_sentences, top_k_scores
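The loop added in the final hunk is a standard top-k selection: the min-heap never holds more than k (score, sentence) pairs, the smallest of which sits at top_k_heap[0], so memory stays O(k) rather than O(len(doc_sentences)). For reference, the same selection can be written more compactly with heapq.nlargest; a minimal sketch (function and variable names are hypothetical, and it assumes rouge1_scores is the per-sentence list of floats that evaluate's rouge metric yields with use_aggregator=False, as in the diff):

from heapq import nlargest

def top_k_sentences(doc_sentences, rouge1_scores, k=1):
    # Pair each sentence with its ROUGE-1 score and keep the k highest,
    # returned in descending score order, as in the committed version.
    top = nlargest(k, zip(rouge1_scores, doc_sentences), key=lambda pair: pair[0])
    return [sent for _, sent in top], [score for score, _ in top]

The explicit key skips comparing sentence strings; without it, pairs that tie on score fall back to lexicographic comparison of the sentences, which is also how the committed heappush(top_k_heap, (score, sentence)) calls break ties.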
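A usage sketch for the revised method follows (a hypothetical example, not from the repository; the document and claim are invented, and the sentence actually selected depends on NLTK's sent_tokenize and the rouge implementation):

handler = EndpointHandler(path="./")

doc = "The Eiffel Tower is in Paris. It was completed in 1889. It is made of wrought iron."
claim = "The Eiffel Tower was finished in 1889."

# k mirrors self.num_highlights (1 by default, per __init__ above).
# Both return values are parallel lists of length <= k, in descending
# ROUGE-1 order: the best-matching sentences and their scores.
sentences, scores = handler.chunk_and_highest_rouge_score(doc, claim, k=1)

One caveat visible in the __call__ changes: rouge_score is rebound on every iteration of the for doc_chunk in ranked_docs loop, so the 'rouge_score' field in outputs carries only the scores for the last chunk; appending each per-chunk list instead would preserve all of them.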