Browse files
@@ -6,7 +6,7 @@ from timeit import default_timer as timer
6 |
import sqlite3
7 |
import pandas as pd
8 |
import ast
9 |
10 |
11 |
12 |
@@ -82,29 +82,32 @@ def get_dfs()->object:
82 |
return titles_df, texts_df
83 |
84 |
85 |
def find_ref(titles_df,texts_df,input_text,top_k,num_of_results):
86 |
from rapidfuzz import process as rapidfuzz_process
87 |
print('hello from find_ref..')
88 |
if not input_text: return
89 |
90 |
results = []
91 |
books = titles_df['he_titles']
92 |
input_text = input_text.replace(':','注诪讜讚 讘').replace('.','注诪讜讚 讗')
93 |
94 |
# search only the references database in case the user set the top_k to 0
95 |
if top_k == 0:
96 |
refs = texts_df['ref_text_long'].unique()
97 |
for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=num_of_results):
98 |
results += [{'ref':ref,'ref_score':ref_score}]
99 |
100 |
101 |
# search first only in the books database (for top_k books)
102 |
for book, book_score, _ in rapidfuzz_process.extract(input_text, books, limit=top_k):
103 |
# get all the references of that book
104 |
book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
105 |
refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
106 |
# then search these references and add them all to the results
107 |
for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10):
108 |
results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
109 |
# finally, sort all the references by their own score (and not the book score)
110 |
results.sort(key=lambda x: x['ref_score'],reverse=True)
@@ -127,10 +130,11 @@ def run():
127 |
user_input = st.text_input('讻转讜讘 讗转 讛诪拽讜专 讛诪讘讜拽砖', placeholder='讘讘讗 拽诪讗 讚祝 讘 注诪讜讚 讘')
128 |
top_k = st.sidebar.slider('讻诪讛 住驻专讬诐 诇住专讜拽 top_k:',0,20,10)
129 |
num_of_results = st.sidebar.slider('诪住驻专 讛转讜爪讗讜转 砖讘专爪讜谞讱 诇讛爪讬讙:',1,25,5)
130 |
131 |
if user_input!="":
132 |
time0 = timer()
133 |
results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results)
134 |
time = f"finished in {1e3*(timer()-time0):.1f} ms"
135 |
136 |
for result in results:
6 |
import sqlite3
7 |
import pandas as pd
8 |
import ast
9 |
import pymongo
10 |
11 |
12 |
82 |
return titles_df, texts_df
83 |
84 |
85 |
def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
86 |
from rapidfuzz import fuzz, process as rapidfuzz_process
87 |
from rapidfuzz.fuzz import token_ratio,ratio,partial_ratio,token_set_ratio,partial_token_set_ratio,token_sort_ratio
88 |
89 |
print('hello from find_ref..')
90 |
91 |
if not input_text: return
92 |
93 |
results = []
94 |
books = titles_df['he_titles']
95 |
input_text = input_text.replace(':','注诪讜讚 讘').replace('.','注诪讜讚 讗')
96 |
scorer = eval(algorithm)
97 |
# search only the references database in case the user set the top_k to 0
98 |
if top_k == 0:
99 |
refs = texts_df['ref_text_long'].unique()
100 |
for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs,scorer=scorer, limit=num_of_results):
101 |
results += [{'ref':ref,'ref_score':ref_score}]
102 |
103 |
104 |
# search first only in the books database (for top_k books)
105 |
for book, book_score, _ in rapidfuzz_process.extract(input_text, books, scorer=scorer, limit=top_k):
106 |
# get all the references of that book
107 |
book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
108 |
refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
109 |
# then search these references and add them all to the results
110 |
for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10,scorer=scorer):
111 |
results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
112 |
# finally, sort all the references by their own score (and not the book score)
113 |
results.sort(key=lambda x: x['ref_score'],reverse=True)
130 |
user_input = st.text_input('讻转讜讘 讗转 讛诪拽讜专 讛诪讘讜拽砖', placeholder='讘讘讗 拽诪讗 讚祝 讘 注诪讜讚 讘')
131 |
top_k = st.sidebar.slider('讻诪讛 住驻专讬诐 诇住专讜拽 top_k:',0,20,10)
132 |
num_of_results = st.sidebar.slider('诪住驻专 讛转讜爪讗讜转 砖讘专爪讜谞讱 诇讛爪讬讙:',1,25,5)
133 |
algorithm = st.sidebar.selectbox('讛讗诇讙讜专讬转诐 诇讚讬专讜讙 讛转讜爪讗讜转',['token_ratio','ratio','partial_ratio','token_set_ratio','partial_token_set_ratio','token_sort_ratio'])
134 |
135 |
if user_input!="":
136 |
time0 = timer()
137 |
results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results,algorithm)
138 |
time = f"finished in {1e3*(timer()-time0):.1f} ms"
139 |
140 |
for result in results: