import concurrent.futures
import itertools
from time import time

import evaluate
from nltk.tokenize import sent_tokenize

from minicheck_web.minicheck import MiniCheck
from web_retrieval import *

def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
    '''
    Sort the chunks of a single document by the probability of "supported",
    in descending order. This function is used when a user document is provided.
    '''
    flattened_docs = [doc for chunk in used_chunk for doc in chunk]
    flattened_scores = [score for chunk in support_prob_per_chunk for score in chunk]

    # Rank (chunk, score) pairs jointly so chunks stay aligned with their scores
    doc_score = list(zip(flattened_docs, flattened_scores))
    ranked_doc_score = sorted(doc_score, key=lambda x: x[1], reverse=True)

    ranked_docs, scores = zip(*ranked_doc_score)

    return ranked_docs, scores
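
# A minimal shape sketch (hypothetical values): the scorer returns one list of
# chunks and one list of probabilities per input document, and the helper above
# flattens and ranks them jointly:
#
#   used_chunk             = [['chunk A', 'chunk B']]
#   support_prob_per_chunk = [[0.12, 0.87]]
#   sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk)
#   # -> (('chunk B', 'chunk A'), (0.87, 0.12))
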
class EndpointHandler():

    def __init__(self, path="./"):
        self.scorer = MiniCheck(path=path)
        self.rouge = evaluate.load('rouge')

    def __call__(self, data):

        claim = data['inputs']['claims'][0]
        ents = extract_entities(claim)

        if len(data['inputs']['docs']) == 1 and data['inputs']['docs'][0] != '':
            # Fact-check the claim against the user-provided document
            _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=data)
            ranked_docs, scores = sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk)
            ranked_urls = None
        else:
            # No document provided: retrieve and rank documents from the web
            assert len(data['inputs']['claims']) == 1, \
                "Only one claim is allowed for web retrieval in the current version."
            ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim)

        # For each chunk that supports the claim (probability > 0.5), highlight
        # the sentence with the highest ROUGE score against the claim
        span_to_highlight = []
        for doc_chunk, score in zip(ranked_docs, scores):
            if score > 0.5:
                highest_score_sent, _ = self.chunk_and_highest_rouge_score(doc_chunk, claim)
                span_to_highlight.append(highest_score_sent)
            else:
                span_to_highlight.append("")

        outputs = {
            'ranked_docs': ranked_docs,
            'scores': scores,
            'span_to_highlight': span_to_highlight,
            'entities': ents
        }
        if ranked_urls is not None:
            outputs['ranked_urls'] = ranked_urls

        return outputs
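
    # A hedged sketch of the request payload `__call__` expects; the field
    # names come from the code above, the values are made up:
    #
    #   {'inputs': {'docs': ['Full text of a source document ...'],
    #               'claims': ['Claim to verify.']}}
    #
    # Passing docs=[''] switches to the web-retrieval path.
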
    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=10, allow_duplicated_urls=False):

        search_results = search_google(claim, timeout=timeout)

        print('Searching webpages...')
        start = time()
        with concurrent.futures.ThreadPoolExecutor() as e:
            scraped_results = e.map(scrape_url, search_results, itertools.repeat(timeout))
        end = time()
        print(f"Finished searching in {round((end - start), 1)} seconds.\n")

        # Drop pages that failed to scrape or contain the Unicode replacement
        # character, and truncate the rest; these could additionally be ranked
        # by TF-IDF to be more efficient
        scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '\ufffd' not in r[0]]
        retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])

        print('Scoring webpages...')
        start = time()
        retrieved_data = {
            'inputs': {
                'docs': list(retrieved_docs),
                'claims': [claim] * len(retrieved_docs)
            }
        }
        _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=retrieved_data)
        end = time()

        num_chunks = len([item for items in used_chunk for item in items])
        print(f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds '
              f'({round(num_chunks / (end - start) * 60)} chunks/min).')

        ranked_docs, scores, ranked_urls = order_doc_score_url(
            used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=allow_duplicated_urls)

        return ranked_docs, scores, ranked_urls
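
    # Note on the ROUGE call below: with use_aggregator=False, the Hugging Face
    # `evaluate` ROUGE metric returns one score per (prediction, reference)
    # pair, e.g. {'rouge1': [0.42, 0.10, ...], 'rouge2': [...], ...}, which is
    # what allows picking the single best-matching sentence.
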
    def chunk_and_highest_rouge_score(self, doc, claim):
        '''
        Given a document and a claim, return the sentence with the highest
        ROUGE-1 score against the claim, together with that score.
        '''
        doc_sentences = sent_tokenize(doc)
        claims = [claim] * len(doc_sentences)

        results = self.rouge.compute(
            predictions=doc_sentences,
            references=claims,
            use_aggregator=False)

        # Track the best-scoring sentence across the document
        highest_score = 0
        highest_score_sent = ""
        for sent, score in zip(doc_sentences, results['rouge1']):
            if score > highest_score:
                highest_score = score
                highest_score_sent = sent

        return highest_score_sent, highest_score
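
# Hypothetical local usage sketch (not part of the deployed endpoint; the claim
# is made up, and running this requires the web-retrieval dependencies):
if __name__ == '__main__':
    handler = EndpointHandler(path='./')
    payload = {
        'inputs': {
            'docs': [''],  # empty doc -> fall back to web retrieval
            'claims': ['The Eiffel Tower is located in Berlin.']
        }
    }
    outputs = handler(payload)
    for url, score, span in zip(outputs['ranked_urls'], outputs['scores'], outputs['span_to_highlight']):
        print(f'{score:.2f}  {url}  {span[:80]}')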