sivan22's picture
Upload folder using huggingface_hub
c1fdc9d verified
raw
history blame
5.31 kB
import streamlit as st
from streamlit.logger import get_logger
from timeit import default_timer as timer
import sqlite3
import pandas as pd
# Module-level logger obtained from Streamlit's logging helper, keyed by this module's name.
LOGGER = get_logger(__name__)
def preprocess(s: str) -> str:
    """Return *s* with quote marks and a fixed list of fragments removed.

    The fragments are deleted one after another, in the order listed, so an
    earlier deletion can influence what a later one matches. ``find_ref``
    passes this function as the ``processor`` of its fuzzy-matching calls.
    """
    # NOTE(review): the non-ASCII fragments are reproduced verbatim; they look
    # like mis-decoded text in this view -- confirm the file's encoding.
    fragments = (
        '"',
        'ืขืœ',
        'ืคืจืง',
        'ืคืกื•ืง',
        'ื“ืฃ',
        'ืขืžื•ื“',
        'ืกื™ืžืŸ',
        'ืกืขื™ืฃ',
        'ื—ื™ื“ื•ืฉื™',
        "'",
    )
    cleaned = s
    for fragment in fragments:
        cleaned = cleaned.replace(fragment, '')
    return cleaned
def _read_table(conn, table):
    """Load every row of *table* from the open SQLite connection into a DataFrame.

    Column names are taken from the cursor description. *table* is
    interpolated into the SQL text, so it must be a trusted, hard-coded name.
    """
    cursor = conn.execute(f"SELECT * FROM {table}")
    columns = [column[0] for column in cursor.description]
    # Passing the names to the constructor (instead of assigning ``.columns``
    # afterwards) keeps the header intact even when the table has no rows.
    return pd.DataFrame(cursor.fetchall(), columns=columns)


@st.cache_resource
def get_dfs() -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Load the lookup tables from ``test42.db`` and return them as DataFrames.

    Reads the ``titles``, ``texts``, ``refs`` and ``books`` tables, then joins
    ``texts`` with ``books`` (``bid`` == ``_id``) and the result with ``refs``
    (``_id_x`` == ``tid``).

    Returns:
        A ``(titles_df, texts_df)`` pair: the raw ``titles`` table and the
        merged texts/books/refs table.
    """
    print('hello from get_dfs..')
    conn = sqlite3.connect('test42.db')
    try:
        titles = _read_table(conn, 'titles')
        texts = _read_table(conn, 'texts')
        refs = _read_table(conn, 'refs')
        books = _read_table(conn, 'books')
    finally:
        # Everything is already materialised in memory, so the connection can
        # be released regardless of whether a query failed.
        conn.close()

    # Attach book metadata to every text row, then the reference rows.
    merged = pd.merge(texts, books, how='inner', left_on='bid', right_on='_id')
    texts_df = pd.merge(merged, refs, left_on='_id_x', right_on='tid')
    titles_df = titles
    return titles_df, texts_df
# Scorer names accepted by ``find_ref``; each one is an attribute of ``rapidfuzz.fuzz``.
_SCORER_NAMES = frozenset({
    'token_ratio',
    'ratio',
    'partial_ratio',
    'token_set_ratio',
    'partial_token_set_ratio',
    'token_sort_ratio',
    'WRatio',
})


def find_ref(titles_df, texts_df, input_text, top_k, num_of_results, algorithm):
    """Fuzzy-match *input_text* against the known references.

    Args:
        titles_df: DataFrame with at least ``he_titles`` and ``title`` columns.
        texts_df: DataFrame with at least ``title`` and ``ref_text_long`` columns.
        input_text: The free-text reference typed by the user.
        top_k: Number of best-matching books to search within. ``0`` skips the
            book stage and searches every reference directly.
        num_of_results: Maximum number of results to return.
        algorithm: Name of a ``rapidfuzz.fuzz`` scorer (see ``_SCORER_NAMES``).

    Returns:
        A list of dicts sorted by ``ref_score`` (descending). Every dict has
        ``ref`` and ``ref_score``; when ``top_k`` is non-zero it also has
        ``book`` and ``book_score``. Empty input yields an empty list.

    Raises:
        ValueError: If *algorithm* is not one of the supported scorer names.
    """
    print('hello from find_ref..')
    if not input_text:
        return []

    # The scorer is resolved by name from a fixed allow-list rather than with
    # eval(), so the incoming string is never executed as code.
    if algorithm not in _SCORER_NAMES:
        raise ValueError(f"unsupported scoring algorithm: {algorithm!r}")

    # Imported lazily so the cheap checks above do not need rapidfuzz.
    from rapidfuzz import fuzz, process as rapidfuzz_process

    scorer = getattr(fuzz, algorithm)
    print(scorer)

    results = []
    books = titles_df['he_titles']
    input_text = input_text.replace(':','ืขืžื•ื“ ื‘').replace('.','ืขืžื•ื“ ื')

    if top_k == 0:
        # Search only the references database when the user sets top_k to 0.
        refs = texts_df['ref_text_long'].unique()
        for ref, ref_score, _ in rapidfuzz_process.extract(
            input_text, refs, scorer=scorer, limit=num_of_results, processor=preprocess
        ):
            results.append({'ref': ref, 'ref_score': ref_score})
    else:
        # First narrow the search down to the top_k best-matching books.
        for book, book_score, _ in rapidfuzz_process.extract(
            input_text, books, scorer=scorer, limit=top_k, processor=preprocess
        ):
            # Collect every distinct reference belonging to that book.
            book_title = titles_df.loc[titles_df['he_titles'] == book, 'title'].iloc[0]
            refs = texts_df.loc[texts_df['title'] == book_title, 'ref_text_long'].unique()
            # Take as many candidates per book as the caller may ultimately
            # want; a fixed cap of 10 would starve requests for more results.
            for ref, ref_score, _ in rapidfuzz_process.extract(
                input_text, refs, limit=num_of_results, scorer=scorer, processor=preprocess
            ):
                results.append({
                    'ref': ref,
                    'ref_score': ref_score,
                    'book': book,
                    'book_score': book_score,
                })

    # Finally, rank all references by their own score (not the book score).
    results.sort(key=lambda item: item['ref_score'], reverse=True)
    return results[:num_of_results]
def run():
    """Render the Streamlit page: search box, sidebar options and results.

    Loads the cached DataFrames, reads the query and the sidebar settings,
    and, when a query is present, shows the elapsed search time followed by
    each result with a button that reveals the matching text rows.
    """
    st.set_page_config(
        page_title=" ื—ื™ืคื•ืฉ ืžืงื•ืจื•ืช",
        page_icon="๐Ÿ“š",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    st.write("# ื—ื™ืคื•ืฉ ืžืงื•ืจื•ืช ื‘ืืžืฆืขื•ืช ืžืจื—ืง ืœื•ื™ื ืฉื˜ื™ื™ืŸ")

    # A single call is enough: get_dfs is wrapped in st.cache_resource.
    titles_df, texts_df = get_dfs()

    user_input = st.text_input('ื›ืชื•ื‘ ืืช ื”ืžืงื•ืจ ื”ืžื‘ื•ืงืฉ', placeholder='ื‘ื‘ื ืงืžื ื“ืฃ ื‘ ืขืžื•ื“ ื‘')
    top_k = st.sidebar.slider('ื›ืžื” ืกืคืจื™ื ืœืกืจื•ืง top_k:',0,20,10)
    num_of_results = st.sidebar.slider('ืžืกืคืจ ื”ืชื•ืฆืื•ืช ืฉื‘ืจืฆื•ื ืš ืœื”ืฆื™ื’:',1,25,5)
    algorithm = st.sidebar.selectbox('ื”ืืœื’ื•ืจื™ืชื ืœื“ื™ืจื•ื’ ื”ืชื•ืฆืื•ืช',['token_ratio','ratio','WRatio','partial_ratio','token_set_ratio','partial_token_set_ratio','token_sort_ratio'])

    if not user_input:
        return

    started = timer()
    results = find_ref(titles_df, texts_df, user_input, top_k, num_of_results, algorithm)
    elapsed_message = f"finished in {1e3*(timer()-started):.1f} ms"
    st.write(elapsed_message)

    # Guard against a missing result so the loop below never iterates None.
    for i, result in enumerate(results or []):
        st.write(result)
        # The loop index is the widget key, keeping buttons with equal labels distinct.
        if st.button("ืคืชื— " +result['ref'], key=i):
            st.write(texts_df.loc[texts_df['ref_text_long']==result['ref']][['heText','ref_text_long']])
# Script entry point: render the page only when this file is executed directly.
if __name__ == "__main__":
    run()