import gradio as gr
import Wikidata_Text_Parser as wtr
import sqlite3
import CodeArchive.Prove_llm as prv
import pandas as pd
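
# Stage 1 (wtr_process): parse the target entity's Wikidata claims and references into a
# local SQLite database, fetch the referenced URLs/HTML, and store the extracted claim and
# reference text in the 'claim_text' and 'html_text' tables.
# Stage 2 (prv_process): run LLM-based relevant-sentence selection over those tables to
# check whether each reference supports its claim.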
def wtr_process(qid):
    try:
        conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
        target_QID = qid
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='claims'")
        table_exists = cursor.fetchone()
        if table_exists:
            cursor.execute("SELECT entity_id FROM claims WHERE entity_id=?", (target_QID,))
            result = cursor.fetchone()
            if result is not None and result[0] == target_QID:
                print(result)
                print(f"{target_QID} already exists in the 'claims' table. Skipping execution.")
            else:
                progress = gr.Progress(0)
                progress(0.00, desc="Wikidata claims parsing...")
                wtr.claimParser(target_QID)  # save results in .db
                filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
                progress(0.25, desc="URL and HTML parsing...")
                url_set = wtr.urlParser()  # from ref table in .db
                html_set = wtr.htmlParser(url_set, qid)  # original html docs collection
                progress(0.50, desc="claim2Text...")
                claim_text = wtr.claim2text(html_set)  # claims generation
                progress(0.74, desc="html2Text...")
                html_text = wtr.html2text(html_set)
                claim_text = claim_text.astype(str)
                html_text = html_text.astype(str)
                claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
                html_text.to_sql('html_text', conn, if_exists='replace', index=False)
                progress(1, desc="completed...")
        else:
            progress = gr.Progress(0)
            progress(0.00, desc="Wikidata claims parsing...")
            wtr.claimParser(target_QID)  # save results in .db
            filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
            progress(0.25, desc="URL and HTML parsing...")
            url_set = wtr.urlParser()  # from ref table in .db
            html_set = wtr.htmlParser(url_set)  # original html docs collection
            progress(0.50, desc="claim2Text...")
            claim_text = wtr.claim2text(html_set)  # claims generation
            progress(0.74, desc="html2Text...")
            html_text = wtr.html2text(html_set)
            claim_text = claim_text.astype(str)
            html_text = html_text.astype(str)
            claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
            html_text.to_sql('html_text', conn, if_exists='replace', index=False)
            progress(1, desc="completed...")

        query = f"""
            SELECT
                claim_text.entity_label,
                claim_text.property_label,
                claim_text.object_label,
                html_text.url
            FROM claim_text
            INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
            WHERE claim_text.entity_id = '{target_QID}'
        """
        result_df = pd.read_sql_query(query, conn)

        conn.commit()
        conn.close()

        return result_df
    except Exception as e:
        error_df = pd.DataFrame({'Error': [str(e)]})
        return error_df
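
# LLM-based verification step: join the stored reference text with the parsed claims for
# the given QID, verbalise the claims, and select the reference sentences relevant to each
# claim via CodeArchive.Prove_llm.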
def prv_process(qid):
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
    query = f"""
        SELECT html_text.*
        FROM html_text
        INNER JOIN claim_text ON html_text.reference_id = claim_text.reference_id
        WHERE claim_text.entity_id = '{qid}'
    """
    reference_text_df = pd.read_sql_query(query, conn)

    query = f"SELECT * FROM claim_text WHERE entity_id = '{qid}'"
    claim_df = pd.read_sql_query(query, conn)

    verbalised_claims_df_final = prv.verbalisation(claim_df)

    progress = gr.Progress(len(verbalised_claims_df_final))  # create progress bar

    def update_progress(curr_step, total_steps):
        progress((curr_step + 1) / total_steps)

    result = prv.RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress)

    conn.close()
    return result
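
# Gradio UI: a textbox for the target QID and two buttons, one per pipeline stage.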
with gr.Blocks() as demo:
print("gradio started!")
gr.Markdown(
"""
# Reference Quality Verification Tool
This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.
Parsing could take 3~5 mins depending on the number of references.
"""
)
inp = gr.Textbox(label="Input QID", placeholder="Input QID (i.e. Q42)")
out = gr.Dataframe(label="Parsing result (not presenting parsed HTMLs)", headers=["entity_label", "property_label", "object_label", "url"])
run_button_1 = gr.Button("Start parsing")
run_button_1.click(wtr_process, inp, out)
gr.Markdown(
"""
LLM-based HTML parsing and verification !
"""
)
out_2 = gr.DataFrame(label="LLM-based verificaiton result")
run_button_2 = gr.Button("Start processing")
run_button_2.click(prv_process, inp, out_2)
if __name__ == "__main__":
    demo.launch(share=True)