Liyan06 committed
Commit 3fe4664
1 Parent(s): 3fbb656

add span highlight (rouge) for neg chunk

Files changed (1): handler.py (+47 -29)
handler.py CHANGED
@@ -5,6 +5,7 @@ import evaluate
 
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+from heapq import heappush, heappop
 
 
 def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
@@ -51,7 +52,9 @@ class EndpointHandler():
     def __init__(self, path="./"):
         self.scorer = MiniCheck(path=path)
         self.rouge = evaluate.load('rouge')
+
         self.tfidf_order = True
+        self.num_highlights = 1
 
 
     def __call__(self, data):
@@ -64,20 +67,17 @@ class EndpointHandler():
             _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=data)
             ranked_docs, scores = sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk)
 
-            span_to_highlight = []
-            for doc_chunk, score in zip(ranked_docs, scores):
-                # If the chunk can support the claim, find the sentence with the highest rouge score
-                if score > 0.5:
-                    highest_score_sent, _ = self.chunk_and_highest_rouge_score(doc_chunk, claim)
-                    span_to_highlight.append(highest_score_sent)
-                else:
-                    span_to_highlight.append("")
+            span_to_highlight, rouge_score = [], []
+            for doc_chunk in ranked_docs:
+                highest_score_sent, rouge_score = self.chunk_and_highest_rouge_score(doc_chunk, claim, k=self.num_highlights)
+                span_to_highlight.append(highest_score_sent)
 
             outputs = {
                 'ranked_docs': ranked_docs,
                 'scores': scores,
                 'span_to_highlight': span_to_highlight,
-                'entities': ents
+                'entities': ents,
+                'rouge_score': rouge_score
             }
 
         else:
@@ -85,21 +85,18 @@ class EndpointHandler():
 
             ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim, tfidf_order=self.tfidf_order)
 
-            span_to_highlight = []
-            for doc_chunk, score in zip(ranked_docs, scores):
-                # If the chunk can support the claim, find the sentence with the highest rouge score
-                if score > 0.5:
-                    highest_score_sent, _ = self.chunk_and_highest_rouge_score(doc_chunk, claim)
-                    span_to_highlight.append(highest_score_sent)
-                else:
-                    span_to_highlight.append("")
-
+            span_to_highlight, rouge_score = [], []
+            for doc_chunk in ranked_docs:
+                highest_score_sent, rouge_score = self.chunk_and_highest_rouge_score(doc_chunk, claim, k=self.num_highlights)
+                span_to_highlight.append(highest_score_sent)
+
             outputs = {
                 'ranked_docs': ranked_docs,
                 'scores': scores,
                 'ranked_urls': ranked_urls,
                 'span_to_highlight': span_to_highlight,
-                'entities': ents
+                'entities': ents,
+                'rouge_score': rouge_score
             }
 
         return outputs
@@ -159,10 +156,9 @@ class EndpointHandler():
         return ranked_docs, scores, ranked_urls
 
 
-    def chunk_and_highest_rouge_score(self, doc, claim):
-
+    def chunk_and_highest_rouge_score(self, doc, claim, k=1):
         '''
-        Given a document and a claim, return the sentence with the highest rouge score and the score
+        Given a document and a claim, return the top k sentences with the highest rouge scores and their scores
         '''
 
         doc_sentences = sent_tokenize(doc)
@@ -173,11 +169,33 @@ class EndpointHandler():
                                      references=claims,
                                      use_aggregator=False)
 
-        highest_score = 0
-        highest_score_sent = ""
+        # Initialize a min heap to store the top k sentences and their scores
+        top_k_heap = []
+
         for i in range(len(doc_sentences)):
-            if results['rouge1'][i] > highest_score:
-                highest_score = results['rouge1'][i]
-                highest_score_sent = doc_sentences[i]
-
-        return highest_score_sent, highest_score
+            score = results['rouge1'][i]
+            sentence = doc_sentences[i]
+
+            # If the heap has less than k elements, push the current sentence and score
+            if len(top_k_heap) < k:
+                heappush(top_k_heap, (score, sentence))
+            else:
+                # If the current score is higher than the minimum score in the heap,
+                # remove the minimum and push the current sentence and score
+                if score > top_k_heap[0][0]:
+                    heappop(top_k_heap)
+                    heappush(top_k_heap, (score, sentence))
+
+        # Extract the top k sentences and scores from the heap
+        top_k_sentences = []
+        top_k_scores = []
+        while top_k_heap:
+            score, sentence = heappop(top_k_heap)
+            top_k_sentences.append(sentence)
+            top_k_scores.append(score)
+
+        # Reverse the order of sentences and scores to get them in descending order
+        top_k_sentences = top_k_sentences[::-1]
+        top_k_scores = top_k_scores[::-1]
+
+        return top_k_sentences, top_k_scores
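The loop added in the final hunk is a standard top-k selection: the min-heap never holds more than k (score, sentence) pairs, the smallest of which sits at top_k_heap[0], so memory stays O(k) rather than O(len(doc_sentences)). For reference, the same selection can be written more compactly with heapq.nlargest; a minimal sketch (function and variable names are hypothetical, and it assumes rouge1_scores is the per-sentence list of floats that evaluate's rouge metric yields with use_aggregator=False, as in the diff):

from heapq import nlargest

def top_k_sentences(doc_sentences, rouge1_scores, k=1):
    # Pair each sentence with its ROUGE-1 score and keep the k highest,
    # returned in descending score order, as in the committed version.
    top = nlargest(k, zip(rouge1_scores, doc_sentences), key=lambda pair: pair[0])
    return [sent for _, sent in top], [score for score, _ in top]

The explicit key skips comparing sentence strings; without it, pairs that tie on score fall back to lexicographic comparison of the sentences, which is also how the committed heappush(top_k_heap, (score, sentence)) calls break ties.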
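A usage sketch for the revised method follows (a hypothetical example, not from the repository; the document and claim are invented, and the sentence actually selected depends on NLTK's sent_tokenize and the rouge implementation):

handler = EndpointHandler(path="./")

doc = "The Eiffel Tower is in Paris. It was completed in 1889. It is made of wrought iron."
claim = "The Eiffel Tower was finished in 1889."

# k mirrors self.num_highlights (1 by default, per __init__ above).
# Both return values are parallel lists of length <= k, in descending
# ROUGE-1 order: the best-matching sentences and their scores.
sentences, scores = handler.chunk_and_highest_rouge_score(doc, claim, k=1)

One caveat visible in the __call__ changes: rouge_score is rebound on every iteration of the for doc_chunk in ranked_docs loop, so the 'rouge_score' field in outputs carries only the scores for the last chunk; appending each per-chunk list instead would preserve all of them.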