# Prove_KCL / SimpleUI_llm.py
# Jongmo's picture
# Upload 10 files
# 49664ed verified
# raw
# history blame
# 5.27 kB
import gradio as gr
import Wikidata_Text_Parser as wtr
import sqlite3
import CodeArchive.Prove_llm as prv
import pandas as pd
def _run_parsing_pipeline(conn, target_qid):
    """Run the Wikidata parsing steps for one entity and store the text tables.

    Calls the ``Wikidata_Text_Parser`` steps in the same order as the original
    inline code, then writes the resulting frames to the ``claim_text`` and
    ``html_text`` tables through ``conn`` (replacing any existing tables).

    Args:
        conn: Open sqlite3 connection used for the ``to_sql`` writes.
        target_qid: Wikidata entity id to parse (e.g. ``"Q42"``).
    """
    # NOTE(review): kept as in the original; Gradio documents Progress as
    # something to declare as a default argument of the event handler, so
    # these updates may not reach the UI -- confirm.
    progress = gr.Progress(0)

    progress(0.00, desc="Wikidata claims parsing...")
    wtr.claimParser(target_qid)  # saves results in the .db (per original comment)
    # Called for its database update only; the returned dataframe was unused.
    wtr.propertyFiltering(target_qid)

    progress(0.25, desc="URL and HTML parsing...")
    url_set = wtr.urlParser()  # reads the ref table in the .db (per original comment)
    # NOTE(review): the original passed the QID here in one branch and omitted
    # it in the other (fresh database). Unified on the two-argument form --
    # confirm against Wikidata_Text_Parser.htmlParser's signature.
    html_set = wtr.htmlParser(url_set, target_qid)

    progress(0.50, desc="claim2Text...")
    claim_text = wtr.claim2text(html_set)

    progress(0.74, desc="html2Text...")
    html_text = wtr.html2text(html_set)

    # Everything is stored as text so SQLite receives uniform string columns.
    claim_text = claim_text.astype(str)
    html_text = html_text.astype(str)
    claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
    html_text.to_sql('html_text', conn, if_exists='replace', index=False)
    progress(1, desc="completed...")


def wtr_process(qid):
    """Parse the claims/references of a Wikidata entity and return a summary.

    If the ``claims`` table already contains ``qid`` the parsing pipeline is
    skipped; otherwise (missing table or missing entity) it is run first.
    The claim/reference rows for ``qid`` are then read back from the database.

    Args:
        qid: Wikidata entity id typed by the user (e.g. ``"Q42"``).

    Returns:
        A DataFrame with columns ``entity_label``, ``property_label``,
        ``object_label`` and ``url``. On any failure, a one-row DataFrame
        with a single ``Error`` column holding the exception message.
    """
    conn = None
    try:
        conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
        cursor = conn.cursor()

        already_parsed = False
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='claims'")
        if cursor.fetchone():
            cursor.execute("SELECT entity_id FROM claims WHERE entity_id=?", (qid,))
            row = cursor.fetchone()
            if row is not None and row[0] == qid:
                already_parsed = True
                print(row)
                print(f"{qid} already exists in the 'claims' table. Skipping execution.")

        if not already_parsed:
            _run_parsing_pipeline(conn, qid)

        # The QID comes straight from a text box, so it is bound as a
        # parameter instead of being interpolated into the SQL text.
        query = """
            SELECT
                claim_text.entity_label,
                claim_text.property_label,
                claim_text.object_label,
                html_text.url
            FROM claim_text
            INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
            WHERE claim_text.entity_id = ?
        """
        result_df = pd.read_sql_query(query, conn, params=(qid,))
        conn.commit()
        return result_df
    except Exception as e:
        # UI boundary: surface the failure in the output table instead of raising.
        return pd.DataFrame({'Error': [str(e)]})
    finally:
        # Release the connection on both the success and the error path.
        if conn is not None:
            conn.close()
def prv_process(qid):
    """Run the LLM-based verification for the already-parsed entity ``qid``.

    Loads the entity's rows from the ``claim_text`` and ``html_text`` tables,
    verbalises the claims with ``prv.verbalisation`` and passes them, together
    with the reference texts and a progress callback, to
    ``prv.RelevantSentenceSelection``.

    Args:
        qid: Wikidata entity id typed by the user (e.g. ``"Q42"``).

    Returns:
        Whatever ``prv.RelevantSentenceSelection`` returns (shown by the UI
        in a dataframe component).
    """
    reference_query = """
        SELECT html_text.*
        FROM html_text
        INNER JOIN claim_text ON html_text.reference_id = claim_text.reference_id
        WHERE claim_text.entity_id = ?
    """
    claim_query = "SELECT * FROM claim_text WHERE entity_id = ?"

    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
    try:
        # The QID is user input: bind it as a parameter rather than
        # formatting it into the SQL string.
        reference_text_df = pd.read_sql_query(reference_query, conn, params=(qid,))
        claim_df = pd.read_sql_query(claim_query, conn, params=(qid,))
    finally:
        # Both frames are fully loaded, so the connection can be released
        # before the processing below, and is released even if a read fails.
        conn.close()

    verbalised_claims_df_final = prv.verbalisation(claim_df)

    # NOTE(review): the row count is passed to gr.Progress exactly as in the
    # original; Gradio documents the constructor argument as `track_tqdm`,
    # so this may not be what was intended -- confirm.
    progress = gr.Progress(len(verbalised_claims_df_final))

    def update_progress(curr_step, total_steps):
        # Steps are reported 0-based, hence the +1 to reach 100% on the last one.
        progress((curr_step + 1) / total_steps)

    return prv.RelevantSentenceSelection(
        verbalised_claims_df_final, reference_text_df, update_progress
    )
# Gradio UI: one QID text box feeding two independent steps --
# (1) parsing via wtr_process and (2) LLM-based verification via prv_process.
with gr.Blocks() as demo:
    print("gradio started!")
    gr.Markdown(
        "\n"
        "# Reference Quality Verification Tool\n"
        "This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.\n"
        "Parsing could take 3~5 mins depending on the number of references.\n"
    )
    # Q42 is only an example value, hence "e.g." rather than "i.e.".
    inp = gr.Textbox(label="Input QID", placeholder="Input QID (e.g. Q42)")
    out = gr.Dataframe(
        label="Parsing result (not presenting parsed HTMLs)",
        headers=["entity_label", "property_label", "object_label", "url"],
    )
    run_button_1 = gr.Button("Start parsing")
    run_button_1.click(wtr_process, inp, out)

    gr.Markdown(
        "\n"
        "LLM-based HTML parsing and verification !\n"
    )
    # Same component spelling as the parsing table above (gr.Dataframe).
    out_2 = gr.Dataframe(label="LLM-based verification result")
    run_button_2 = gr.Button("Start processing")
    run_button_2.click(prv_process, inp, out_2)

if __name__ == "__main__":
    # share=True asks Gradio for a public share link in addition to the local server.
    demo.launch(share=True)