File size: 8,377 Bytes
19d69d5 1104bf8 0677600 3201a95 113a57e 3fe4664 113a57e a496016 2d158d3 113a57e 3201a95 0677600 3fe4664 3fbb656 3fe4664 09efa05 0677600 3201a95 09efa05 0677600 2d158d3 0677600 a496016 1104bf8 a496016 1104bf8 3fe4664 0677600 1104bf8 98d958b 0677600 2d158d3 3fe4664 0677600 1104bf8 3fbb656 9695d05 3fe4664 98d958b 93e9112 2d158d3 3fe4664 98d958b 1104bf8 3fbb656 1104bf8 98d958b 1104bf8 c191acc 1104bf8 4ec6f2d 98d958b c191acc 1104bf8 113a57e 1104bf8 98d958b 1104bf8 09efa05 1104bf8 a496016 09efa05 1104bf8 98d958b 1104bf8 3fbb656 a496016 3fbb656 0677600 3fe4664 0677600 3fe4664 0677600 3fe4664 0677600 3fe4664 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 |
from minicheck_web.minicheck import MiniCheck
from web_retrieval import *
from nltk.tokenize import sent_tokenize
import evaluate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from heapq import heappush, heappop
def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
    '''
    Sort the chunks in a single document based on the probability of "supported"
    in descending order. This function is used when a user document is provided.

    Args:
        used_chunk: nested list of chunk texts, one inner list per document.
        support_prob_per_chunk: nested list of "supported" probabilities,
            parallel to used_chunk.

    Returns:
        (ranked_docs, scores): two tuples — chunk texts and their probabilities,
        highest probability first. Empty tuples when no chunks were given.
    '''
    flattened_docs = [doc for chunk in used_chunk for doc in chunk]
    flattened_scores = [score for chunk in support_prob_per_chunk for score in chunk]
    ranked_doc_score = sorted(zip(flattened_docs, flattened_scores),
                              key=lambda x: x[1], reverse=True)
    if not ranked_doc_score:
        # zip(*[]) would raise "not enough values to unpack"; return empty
        # results of the same (tuple, tuple) shape instead.
        return (), ()
    ranked_docs, scores = zip(*ranked_doc_score)
    return ranked_docs, scores
def rank_documents_TFIDF(claim, scraped_results):
    """
    Rank scraped documents by TF-IDF cosine similarity with the claim.

    Args:
        claim: claim text to compare the documents against.
        scraped_results: list of (document, URL) tuples.

    Returns:
        list of (document, URL) tuples sorted by descending similarity.
        An empty input yields an empty list.
    """
    if not scraped_results:
        # TfidfVectorizer raises on an effectively empty corpus and the
        # matrix slicing below assumes at least one document; nothing to rank.
        return []
    documents = [result[0] for result in scraped_results]
    # Claim goes first so row 0 of the TF-IDF matrix is the claim vector.
    corpus = [claim] + documents
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    claim_vector = tfidf_matrix[0]
    similarity_scores = cosine_similarity(claim_vector, tfidf_matrix[1:])
    ranked_results = [(scraped_results[i][0], scraped_results[i][1], score)
                      for i, score in enumerate(similarity_scores[0])]
    ranked_results.sort(key=lambda x: x[2], reverse=True)
    # Drop the score column; callers only need (document, URL).
    return [(doc, url) for doc, url, _ in ranked_results]
class EndpointHandler():
    """Inference-endpoint wrapper around MiniCheck fact-checking.

    Scores a claim against either a user-provided document or documents
    retrieved from the web, and returns ranked evidence together with
    ROUGE-selected highlight sentences and extracted entities.
    """
    def __init__(self, path="./"):
        # Entailment scorer (project-local MiniCheck model loaded from `path`).
        self.scorer = MiniCheck(path=path)
        # HuggingFace `evaluate` ROUGE metric; used to pick highlight sentences.
        self.rouge = evaluate.load('rouge')
        # When True, web results are returned in TF-IDF similarity order
        # instead of entailment-score order (see search_relevant_docs).
        self.tfidf_order = True
        # Number of sentences to highlight per document chunk.
        self.num_highlights = 1
        # Chunk size used when splitting retrieved docs for scoring.
        # NOTE(review): presumably interpreted by MiniCheck.score — the unit
        # (words vs. tokens) is not visible here; confirm against MiniCheck.
        self.default_chunk_size = 500
        self.chunk_size = 500
    def __call__(self, data):
        """Handle one request.

        Args:
            data: dict with data['inputs'] containing 'claims' (list of str),
                'docs' (list of str; exactly one non-empty doc selects the
                user-document path) and, optionally, 'chunk_size'.

        Returns:
            dict with 'ranked_docs', 'scores', 'span_to_highlight',
            'entities' and 'rouge_score'; the web-retrieval path adds
            'ranked_urls'.
        """
        # this is necessary for setting the chunk size for
        # retrived docs
        # NOTE(review): stored on self, so concurrent calls using different
        # chunk sizes would race — confirm the endpoint handles one request
        # at a time.
        if 'chunk_size' in data['inputs']:
            self.chunk_size = int(data['inputs']['chunk_size'])
        else:
            self.chunk_size = self.default_chunk_size
        # Only the first claim is fact-checked.
        claim = data['inputs']['claims'][0]
        ents = extract_entities(claim)
        # Using user-provided document to do fact-checking
        if len(data['inputs']['docs']) == 1 and data['inputs']['docs'][0] != '':
            _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=data)
            ranked_docs, scores = sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk)
            span_to_highlight, rouge_score = [], []
            for doc_chunk in ranked_docs:
                # NOTE(review): `rouge_score` is overwritten on every
                # iteration, so the returned 'rouge_score' reflects only the
                # last chunk — confirm whether per-chunk collection was
                # intended before changing it.
                highest_score_sent, rouge_score = self.chunk_and_highest_rouge_score(doc_chunk, claim, k=self.num_highlights)
                span_to_highlight.append(highest_score_sent)
            outputs = {
                'ranked_docs': ranked_docs,
                'scores': scores,
                'span_to_highlight': span_to_highlight,
                'entities': ents,
                'rouge_score': rouge_score
            }
        else:
            assert len(data['inputs']['claims']) == 1, "Only one claim is allowed for web retrieval for the current version."
            ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim, tfidf_order=self.tfidf_order)
            span_to_highlight, rouge_score = [], []
            for doc_chunk in ranked_docs:
                # NOTE(review): same last-iteration-only `rouge_score`
                # behavior as the user-document path above.
                highest_score_sent, rouge_score = self.chunk_and_highest_rouge_score(doc_chunk, claim, k=self.num_highlights)
                span_to_highlight.append(highest_score_sent)
            outputs = {
                'ranked_docs': ranked_docs,
                'scores': scores,
                'ranked_urls': ranked_urls,
                'span_to_highlight': span_to_highlight,
                'entities': ents,
                'rouge_score': rouge_score
            }
        return outputs
    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False, tfidf_order=False):
        """
        if tfidf_order == True, then display the docs in the order of TF-IDF similarity with the claim, regardless of the entailment score
        otherwise, display the docs in the order of the entailment score

        Args:
            claim: claim text used both as the search query and for scoring.
            timeout: per-request timeout (seconds) for search and scraping.
            max_search_results_per_query: max number of scraped docs to score.
            allow_duplicated_urls: forwarded to order_doc_score_url on the
                entailment-ordered path.

        Returns:
            (docs, scores, urls) tuples/lists ready for the response payload.

        NOTE(review): search_google, scrape_url, order_doc_score_url, time,
        np, concurrent and itertools are presumably provided by
        `from web_retrieval import *` — confirm they exist there.
        """
        search_results = search_google(claim, timeout=timeout)
        print('Searching webpages...')
        start = time()
        # Scrape all result URLs concurrently; each call gets the same timeout.
        with concurrent.futures.ThreadPoolExecutor() as e:
            scraped_results = e.map(scrape_url, search_results, itertools.repeat(timeout))
        end = time()
        print(f"Finished searching in {round((end - start), 1)} seconds.\n")
        # Keep non-empty pages without U+FFFD replacement chars (mojibake),
        # truncated to 20k chars each.
        scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0]] # those can be ranked based on TF-IDF to be more efficient
        scraped_results = rank_documents_TFIDF(claim, scraped_results)
        retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])
        print('Scoring webpages...')
        start = time()
        # Score every retrieved doc against the same claim in one batch.
        retrieved_data = {
            'inputs': {
                'docs': list(retrieved_docs),
                'claims': [claim]*len(retrieved_docs),
                'chunk_size': self.chunk_size
            }
        }
        _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=retrieved_data)
        end = time()
        num_chunks = len([item for items in used_chunk for item in items])
        print(f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds ({round(num_chunks / (end - start) * 60)} Doc./min).')
        if tfidf_order:
            # Keep TF-IDF document order; pick one representative chunk per doc.
            tfidf_docs, scores = [], []
            for used_c, support_prob_per_c in zip(used_chunk, support_prob_per_chunk):
                # If the doc can support the claim, find the chunk with the
                # highest entailment score; otherwise, use the first chunk
                if max(support_prob_per_c) > 0.5:
                    tfidf_docs.append(used_c[np.argmax(support_prob_per_c)])
                    scores.append(max(support_prob_per_c))
                else:
                    tfidf_docs.append(used_c[0])
                    scores.append(support_prob_per_c[0])
            return tfidf_docs, scores, urls
        else:
            # Reorder docs/urls by entailment score (helper from web_retrieval).
            ranked_docs, scores, ranked_urls = order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=allow_duplicated_urls)
            return ranked_docs, scores, ranked_urls
    def chunk_and_highest_rouge_score(self, doc, claim, k=1):
        '''
        Given a document and a claim, return the top k sentences with the highest rouge scores and their scores

        Args:
            doc: document text; split into sentences with NLTK sent_tokenize.
            claim: claim text used as the ROUGE reference for every sentence.
            k: number of top sentences to return.

        Returns:
            (top_k_sentences, top_k_scores), both sorted by descending
            ROUGE-1 score.
        '''
        doc_sentences = sent_tokenize(doc)
        # One reference per sentence so rouge.compute scores them pairwise.
        claims = [claim] * len(doc_sentences)
        results = self.rouge.compute(
            predictions=doc_sentences,
            references=claims,
            use_aggregator=False)
        # Initialize a min heap to store the top k sentences and their scores
        top_k_heap = []
        for i in range(len(doc_sentences)):
            score = results['rouge1'][i]
            sentence = doc_sentences[i]
            # If the heap has less than k elements, push the current sentence and score
            if len(top_k_heap) < k:
                heappush(top_k_heap, (score, sentence))
            else:
                # If the current score is higher than the minimum score in the heap,
                # remove the minimum and push the current sentence and score
                # (heap root is the smallest (score, sentence) tuple;
                # score ties fall back to comparing the sentence strings)
                if score > top_k_heap[0][0]:
                    heappop(top_k_heap)
                    heappush(top_k_heap, (score, sentence))
        # Extract the top k sentences and scores from the heap
        top_k_sentences = []
        top_k_scores = []
        while top_k_heap:
            score, sentence = heappop(top_k_heap)
            top_k_sentences.append(sentence)
            top_k_scores.append(score)
        # Reverse the order of sentences and scores to get them in descending order
        top_k_sentences = top_k_sentences[::-1]
        top_k_scores = top_k_scores[::-1]
        return top_k_sentences, top_k_scores