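"""Verify Wikidata claims against their references.

Loads claims and reference HTML for a target entity from a SQLite database,
verbalises each (subject, predicate, object) triple into a sentence, asks an
LLM to extract the most relevant passages from each reference, and asks it to
judge whether that evidence supports the claim.
"""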
import sqlite3

import html2text
import pandas as pd
from tqdm.auto import tqdm

import llm_load
from utils.verbalisation_module import VerbModule

def verbalisation(claim_df):
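    """Verbalise each claim triple into a natural-language sentence.

    Adds a 'verbalisation' column plus two cleaned variants (unknown tokens
    replaced, then dropped) to claim_df and returns it.
    """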
    verb_module = VerbModule()
    triples = []
    for _, row in claim_df.iterrows():
        triples.append({
            'subject': row['entity_label'],
            'predicate': row['property_label'],
            'object': row['object_label'],
        })
    claim_df['verbalisation'] = verb_module.verbalise_triples(triples)

    # Clean up unknown-token artefacts left by the verbaliser: one variant
    # with 'unk' tokens replaced, one with unresolvable ones dropped.
    claim_df['verbalisation_unks_replaced'] = claim_df['verbalisation'].apply(
        verb_module.replace_unks_on_sentence
    )
    claim_df['verbalisation_unks_replaced_then_dropped'] = claim_df['verbalisation'].apply(
        lambda x: verb_module.replace_unks_on_sentence(x, empty_after=True)
    )
    return claim_df

def RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress):
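    """For each verbalised claim, extract the most relevant sentences from the
    claim's reference HTML with an LLM, then ask the LLM whether that evidence
    supports the claim ('supportive' / 'No supports' / 'Not enough information').
    """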
    join_df = pd.merge(
        verbalised_claims_df_final,
        reference_text_df[['reference_id', 'url', 'html']],
        on='reference_id', how='left',
    )
    tokenizer, model = llm_load.llmLoad(4096)

    # Convert each reference's HTML to plain text before prompting the LLM.
    h = html2text.HTML2Text()
    h.ignore_links = True

    filtered_htmls = []
    answers = []
    verifications = []
    for idx, (html, verb) in enumerate(zip(join_df['html'], join_df['verbalisation'])):
        try:
            filtered_html = h.handle(html)
            filtered_htmls.append(filtered_html)
            instruct = "Find the most relevant sentences from the filtered HTML document based on the given target sentence. If there are no directly related sentences, try to find sentences that provide context or background information related to the target sentence. Only answer 'nothing' if there is absolutely no relevant information in the document. Do not include any HTML tags or markup in your answer."
            question = f"target sentence:'{verb}', filtered HTML document:{filtered_html}"
            answer = llm_load.llmQuestion(tokenizer, model, instruct, question, output_size=128)
            answers.append(answer)
        except Exception:
            # Keep all three lists the same length so the DataFrame below lines up.
            filtered_htmls.append('Malformed html')
            answers.append('Malformed html')

        instruct = "Determine whether the target sentence is supported by the given evidence or not. If so, answer 'supportive'. If not, answer 'No supports'. If you cannot determine with the given evidence, answer 'Not enough information'."
        question = f"target sentence:'{verb}', evidence:{answers[-1]}"
        verification = llm_load.llmQuestion(tokenizer, model, instruct, question, output_size=64)
        verifications.append(verification)
        update_progress(idx, len(join_df))  # report progress to the caller

    return pd.DataFrame({
        'verbalisation': join_df['verbalisation'],
        'verification': verifications,
        'evidence_set': answers,
        'url': join_df['url'],
        'filtered_html': filtered_htmls,
    })

if __name__ == '__main__':
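    # Example end-to-end run for a single entity (Q42, Douglas Adams).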
    target_QID = 'Q42'
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')

    # Parameterised queries avoid interpolating the entity id into the SQL.
    claim_df = pd.read_sql_query(
        'SELECT * FROM claim_text WHERE entity_id = ?', conn, params=(target_QID,)
    )
    reference_text_df = pd.read_sql_query(
        'SELECT * FROM html_text WHERE entity_id = ?', conn, params=(target_QID,)
    )

    verbalised_claims_df_final = verbalisation(claim_df)

    # tqdm progress bar: a gr.Progress object only reports when called inside
    # a running Gradio event handler, so it cannot drive standalone runs.
    pbar = tqdm(total=len(verbalised_claims_df_final), desc='Verifying claims')

    def update_progress(curr_step, total_steps):
        pbar.total = total_steps  # the merge with references can change the row count
        pbar.update(1)

    result = RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress)
    pbar.close()
    conn.close()
    print(result)