|
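"""Gradio app for the Reference Quality Verification Tool.

Step 1 parses a Wikidata entity's claims and references into a local
SQLite database (wikidata_claims_refs_parsed.db); step 2 runs LLM-based
verification over the parsed reference text.
"""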
import sqlite3

import gradio as gr
import pandas as pd

import CodeArchive.Prove_llm as prv
import Wikidata_Text_Parser as wtr
|
|
|
def wtr_process(qid, progress=gr.Progress()):
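    """Parse the claims and references of `qid` into the local SQLite
    database and return a DataFrame of (entity_label, property_label,
    object_label, url) rows; skips parsing when the QID is already present.
    """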
|
    try:
        conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
        target_QID = qid
        cursor = conn.cursor()

        # Skip the expensive parsing pass if this QID is already in 'claims'.
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='claims'")
        table_exists = cursor.fetchone()

        result = None
        if table_exists:
            cursor.execute("SELECT entity_id FROM claims WHERE entity_id=?", (target_QID,))
            result = cursor.fetchone()

        if result is not None:
            print(f"{target_QID} already exists in the 'claims' table. Skipping execution.")
        else:
            progress(0.00, desc="Wikidata claims parsing...")
            wtr.claimParser(target_QID)
            filtered_df = wtr.propertyFiltering(target_QID)
            progress(0.25, desc="URL and HTML parsing...")
            url_set = wtr.urlParser()
            html_set = wtr.htmlParser(url_set, target_QID)
            progress(0.50, desc="claim2Text...")
            claim_text = wtr.claim2text(html_set)
            progress(0.75, desc="html2Text...")
            html_text = wtr.html2text(html_set)
            # Persist the extracted text so prv_process can join on reference_id.
            claim_text = claim_text.astype(str)
            html_text = html_text.astype(str)
            claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
            html_text.to_sql('html_text', conn, if_exists='replace', index=False)
            progress(1, desc="completed...")

        # Join claim text with its source URLs for display; the parsed HTML
        # bodies are not presented. The query is parameterised rather than
        # string-interpolated to avoid SQL injection.
        query = """
        SELECT
            claim_text.entity_label,
            claim_text.property_label,
            claim_text.object_label,
            html_text.url
        FROM claim_text
        INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
        WHERE claim_text.entity_id = ?
        """
        result_df = pd.read_sql_query(query, conn, params=(target_QID,))

        conn.commit()
        conn.close()
        return result_df

    except Exception as e:
        # Surface errors in the output DataFrame instead of crashing the UI.
        error_df = pd.DataFrame({'Error': [str(e)]})
        return error_df
|
|
def prv_process(qid, progress=gr.Progress()):
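    """Verbalise the stored claims for `qid` and run the LLM pipeline to
    select the reference sentences relevant to each claim.
    """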
|
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')

    # Fetch the reference page text and the claims stored by wtr_process.
    query = """
    SELECT html_text.*
    FROM html_text
    INNER JOIN claim_text ON html_text.reference_id = claim_text.reference_id
    WHERE claim_text.entity_id = ?
    """
    reference_text_df = pd.read_sql_query(query, conn, params=(qid,))
    claim_df = pd.read_sql_query("SELECT * FROM claim_text WHERE entity_id = ?", conn, params=(qid,))

    # Turn (entity, property, object) triples into natural-language sentences.
    verbalised_claims_df_final = prv.verbalisation(claim_df)

    def update_progress(curr_step, total_steps):
        # Report fractional progress back to Gradio's progress bar.
        progress((curr_step + 1) / total_steps)

    result = prv.RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress)

    conn.close()
    return result
|
|
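# UI wiring: one QID textbox drives both steps. "Start parsing" fills the
# database via wtr_process; "Start processing" runs prv_process on it.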
with gr.Blocks() as demo:
    print("gradio started!")
    gr.Markdown(
        """
        # Reference Quality Verification Tool
        This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.
        Parsing could take 3-5 minutes depending on the number of references.
        """
    )
    inp = gr.Textbox(label="Input QID", placeholder="Input QID (e.g. Q42)")
    out = gr.Dataframe(label="Parsing result (not presenting parsed HTMLs)", headers=["entity_label", "property_label", "object_label", "url"])
    run_button_1 = gr.Button("Start parsing")
    run_button_1.click(wtr_process, inp, out)

    gr.Markdown(
        """
        LLM-based HTML parsing and verification!
        """
    )
    out_2 = gr.DataFrame(label="LLM-based verification result")
    run_button_2 = gr.Button("Start processing")
    run_button_2.click(prv_process, inp, out_2)


if __name__ == "__main__":
    # share=True exposes a temporary public link in addition to localhost.
    demo.launch(share=True)