Spaces:
Sleeping
Sleeping
File size: 5,305 Bytes
5a3e088 e710286 5a3e088 e710286 5a3e088 c1fdc9d e710286 5a3e088 c1fdc9d 5a3e088 c1fdc9d 5a3e088 c1fdc9d 5a3e088 c1fdc9d 5a3e088 c1fdc9d 5a3e088 c1fdc9d 5a3e088 c1fdc9d 5a3e088 bcadd46 e710286 bcadd46 5a3e088 bcadd46 5a3e088 bcadd46 5a3e088 bcadd46 5a3e088 e710286 5a3e088 e710286 5a3e088 e710286 5a3e088 2a6c96b 5a3e088 bcadd46 5a3e088 c1fdc9d 5a3e088 c1fdc9d 5a3e088 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import streamlit as st
from streamlit.logger import get_logger
from timeit import default_timer as timer
import sqlite3
import pandas as pd
# Module-level logger obtained from Streamlit's logging helper.
LOGGER = get_logger(__name__)
def preprocess(s: str) -> str:
    """Return *s* with quote characters and a fixed set of marker substrings removed.

    The substrings are removed one after another, in the order listed below,
    so a later removal operates on the output of the earlier ones. This
    function is passed as the ``processor`` to the rapidfuzz calls in
    ``find_ref``.
    """
    # NOTE(review): the non-ASCII literals below look mis-encoded in this copy
    # of the file (presumably Hebrew words) - confirm against the original.
    removable = (
        '"',
        'ืขื',
        'ืคืจืง',
        'ืคืกืืง',
        'ืืฃ',
        'ืขืืื',
        'ืกืืื',
        'ืกืขืืฃ',
        'ืืืืืฉื',
        "'",
    )
    cleaned = s
    for fragment in removable:
        cleaned = cleaned.replace(fragment, '')
    return cleaned
def _query_to_frame(conn, query):
    """Run *query* on *conn* and return every row as a DataFrame named after the cursor's columns."""
    cursor = conn.execute(query)
    column_names = [column[0] for column in cursor.description]
    # Supplying the names at construction time keeps this working when the
    # query yields no rows (assigning ``.columns`` afterwards would raise).
    return pd.DataFrame(cursor.fetchall(), columns=column_names)


@st.cache_resource
def get_dfs() -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Load the lookup tables from ``test42.db`` and return them as DataFrames.

    Reads the ``titles``, ``texts``, ``refs`` and ``books`` tables in full.
    ``texts`` is inner-joined with ``books`` (``texts.bid == books._id``) and
    the result is joined with ``refs`` (``_id_x == refs.tid``).

    Returns:
        A ``(titles_df, texts_df)`` tuple: the raw ``titles`` table and the
        merged texts/books/refs table.
    """
    print('hello from get_dfs..')
    conn = sqlite3.connect('test42.db')
    try:
        titles = _query_to_frame(conn, "SELECT * FROM titles")
        texts = _query_to_frame(conn, "SELECT * FROM texts")
        refs = _query_to_frame(conn, "SELECT * FROM refs")
        books = _query_to_frame(conn, "SELECT * FROM books")
    finally:
        # Everything is materialised in memory above, so the connection can
        # be released instead of staying open for the life of the cache.
        conn.close()

    # Attach book metadata and reference strings to every text row.
    merged = pd.merge(texts, books, how='inner', left_on='bid', right_on='_id')
    texts_df = pd.merge(merged, refs, left_on='_id_x', right_on='tid')
    titles_df = titles
    return titles_df, texts_df
def find_ref(titles_df, texts_df, input_text, top_k, num_of_results, algorithm):
    """Fuzzy-match a free-text citation against the known reference strings.

    Args:
        titles_df: DataFrame providing the ``he_titles`` and ``title`` columns.
        texts_df: DataFrame providing the ``title`` and ``ref_text_long`` columns.
        input_text: The citation typed by the user.
        top_k: Number of best-matching books to search inside. ``0`` skips
            the book stage and searches all reference strings directly.
        num_of_results: Maximum number of results to return.
        algorithm: Name of a ``rapidfuzz.fuzz`` scorer (one of the names in
            ``allowed_scorers`` below).

    Returns:
        A list of dicts sorted by ``ref_score`` (highest first), at most
        ``num_of_results`` long. Each dict has ``ref`` and ``ref_score``;
        when ``top_k`` is non-zero it also has ``book`` and ``book_score``.
        An empty list is returned for empty input.

    Raises:
        ValueError: If ``algorithm`` is not one of the supported scorer names.
    """
    print('hello from find_ref..')
    if not input_text:
        # Return an empty list (not None) so callers can always iterate.
        return []

    from rapidfuzz import fuzz, process as rapidfuzz_process

    # Resolve the scorer by name from an explicit allow-list rather than
    # eval()-ing the string.
    allowed_scorers = (
        'token_ratio',
        'ratio',
        'WRatio',
        'partial_ratio',
        'token_set_ratio',
        'partial_token_set_ratio',
        'token_sort_ratio',
    )
    if algorithm not in allowed_scorers:
        raise ValueError(f"unknown scoring algorithm: {algorithm!r}")
    scorer = getattr(fuzz, algorithm)
    print(scorer)

    books = titles_df['he_titles']
    # NOTE(review): the replacement literals look mis-encoded in this copy of
    # the file - confirm the intended text for ':' and '.' against the original.
    input_text = input_text.replace(':','ืขืืื ื').replace('.','ืขืืื ื')

    if top_k == 0:
        # Search only the reference strings when the user set top_k to 0.
        refs = texts_df['ref_text_long'].unique()
        matches = rapidfuzz_process.extract(
            input_text, refs, scorer=scorer, limit=num_of_results, processor=preprocess
        )
        results = [{'ref': ref, 'ref_score': ref_score} for ref, ref_score, _ in matches]
    else:
        results = []
        # First narrow the search down to the top_k best-matching books.
        book_matches = rapidfuzz_process.extract(
            input_text, books, scorer=scorer, limit=top_k, processor=preprocess
        )
        for book, book_score, _ in book_matches:
            # Collect all the reference strings that belong to that book.
            book_title = titles_df.loc[titles_df['he_titles'] == book, 'title'].iloc[0]
            refs = texts_df.loc[texts_df['title'] == book_title]['ref_text_long'].unique()
            # A single book can contribute at most num_of_results rows to the
            # final answer, so that is the per-book limit (a fixed limit of 10
            # could leave the result short when more than 10 were requested).
            ref_matches = rapidfuzz_process.extract(
                input_text, refs, limit=num_of_results, scorer=scorer, processor=preprocess
            )
            results.extend(
                {'ref': ref, 'ref_score': ref_score, 'book': book, 'book_score': book_score}
                for ref, ref_score, _ in ref_matches
            )

    # Finally, rank by the reference's own score (not the book score).
    results.sort(key=lambda item: item['ref_score'], reverse=True)
    return results[:num_of_results]
def run():
    """Render the Streamlit page: query box, sidebar settings and the result list.

    Reads the query from a text input and the search settings (``top_k``,
    number of results, scorer name) from the sidebar, calls ``find_ref`` and
    writes each result followed by a button. Pressing a result's button
    writes the ``heText`` / ``ref_text_long`` rows of ``texts_df`` whose
    ``ref_text_long`` equals that result's reference.
    """
    st.set_page_config(
        page_title=" ืืืคืืฉ ืืงืืจืืช",
        page_icon="๐",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    # Load the tables once; the original called get_dfs() twice and discarded
    # the first result.
    titles_df, texts_df = get_dfs()
    st.write("# ืืืคืืฉ ืืงืืจืืช ืืืืฆืขืืช ืืจืืง ืืืื ืฉืืืื")

    user_input = st.text_input('ืืชืื ืืช ืืืงืืจ ืืืืืงืฉ', placeholder='ืืื ืงืื ืืฃ ื ืขืืื ื')
    top_k = st.sidebar.slider('ืืื ืกืคืจืื ืืกืจืืง top_k:', min_value=0, max_value=20, value=10)
    num_of_results = st.sidebar.slider('ืืกืคืจ ืืชืืฆืืืช ืฉืืจืฆืื ื ืืืฆืื:', min_value=1, max_value=25, value=5)
    algorithm = st.sidebar.selectbox(
        'ืืืืืืจืืชื ืืืืจืื ืืชืืฆืืืช',
        ['token_ratio', 'ratio', 'WRatio', 'partial_ratio', 'token_set_ratio', 'partial_token_set_ratio', 'token_sort_ratio'],
    )

    if user_input:
        time0 = timer()
        results = find_ref(titles_df, texts_df, user_input, top_k, num_of_results, algorithm)
        elapsed_msg = f"finished in {1e3*(timer()-time0):.1f} ms"
        st.write(elapsed_msg)
        # `or []` keeps the loop safe if find_ref hands back None.
        for i, result in enumerate(results or []):
            st.write(result)
            # The loop index is the widget key, so each button is unique even
            # when two results share the same label.
            if st.button("ืคืชื " + result['ref'], key=i):
                st.write(texts_df.loc[texts_df['ref_text_long'] == result['ref']][['heText', 'ref_text_long']])
# Script entry point: build the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run()
|