Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,7 +6,7 @@ from timeit import default_timer as timer
|
|
6 |
import sqlite3
|
7 |
import pandas as pd
|
8 |
import ast
|
9 |
-
|
10 |
|
11 |
|
12 |
|
@@ -82,29 +82,32 @@ def get_dfs()->object:
|
|
82 |
return titles_df, texts_df
|
83 |
|
84 |
|
85 |
-
def find_ref(titles_df,texts_df,input_text,top_k,num_of_results):
|
86 |
-
from rapidfuzz import process as rapidfuzz_process
|
|
|
|
|
87 |
print('hello from find_ref..')
|
|
|
88 |
if not input_text: return
|
89 |
-
|
90 |
results = []
|
91 |
books = titles_df['he_titles']
|
92 |
input_text = input_text.replace(':','ืขืืื ื').replace('.','ืขืืื ื')
|
93 |
-
|
94 |
# search only the references database in case the user set the top_k to 0
|
95 |
if top_k == 0:
|
96 |
refs = texts_df['ref_text_long'].unique()
|
97 |
-
for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=num_of_results):
|
98 |
results += [{'ref':ref,'ref_score':ref_score}]
|
99 |
|
100 |
else:
|
101 |
# search first only in the books database (for top_k books)
|
102 |
-
for book, book_score, _ in rapidfuzz_process.extract(input_text, books, limit=top_k):
|
103 |
# get all the references of that book
|
104 |
book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
|
105 |
refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
|
106 |
# then search these references and add them all to the results
|
107 |
-
for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10):
|
108 |
results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
|
109 |
# finally, sort all the references by their own score (and not the book score)
|
110 |
results.sort(key=lambda x: x['ref_score'],reverse=True)
|
@@ -127,10 +130,11 @@ def run():
|
|
127 |
user_input = st.text_input('ืืชืื ืืช ืืืงืืจ ืืืืืงืฉ', placeholder='ืืื ืงืื ืืฃ ื ืขืืื ื')
|
128 |
top_k = st.sidebar.slider('ืืื ืกืคืจืื ืืกืจืืง top_k:',0,20,10)
|
129 |
num_of_results = st.sidebar.slider('ืืกืคืจ ืืชืืฆืืืช ืฉืืจืฆืื ื ืืืฆืื:',1,25,5)
|
|
|
130 |
|
131 |
if user_input!="":
|
132 |
time0 = timer()
|
133 |
-
results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results)
|
134 |
time = f"finished in {1e3*(timer()-time0):.1f} ms"
|
135 |
st.write(time)
|
136 |
for result in results:
|
|
|
6 |
import sqlite3
|
7 |
import pandas as pd
|
8 |
import ast
|
9 |
+
import pymongo
|
10 |
|
11 |
|
12 |
|
|
|
82 |
return titles_df, texts_df
|
83 |
|
84 |
|
85 |
+
def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
|
86 |
+
from rapidfuzz import fuzz, process as rapidfuzz_process
|
87 |
+
from rapidfuzz.fuzz import token_ratio,ratio,partial_ratio,token_set_ratio,partial_token_set_ratio,token_sort_ratio
|
88 |
+
|
89 |
print('hello from find_ref..')
|
90 |
+
|
91 |
if not input_text: return
|
92 |
+
print(eval(algorithm))
|
93 |
results = []
|
94 |
books = titles_df['he_titles']
|
95 |
input_text = input_text.replace(':','ืขืืื ื').replace('.','ืขืืื ื')
|
96 |
+
scorer = eval(algorithm)
|
97 |
# search only the references database in case the user set the top_k to 0
|
98 |
if top_k == 0:
|
99 |
refs = texts_df['ref_text_long'].unique()
|
100 |
+
for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs,scorer=scorer, limit=num_of_results):
|
101 |
results += [{'ref':ref,'ref_score':ref_score}]
|
102 |
|
103 |
else:
|
104 |
# search first only in the books database (for top_k books)
|
105 |
+
for book, book_score, _ in rapidfuzz_process.extract(input_text, books, scorer=scorer, limit=top_k):
|
106 |
# get all the references of that book
|
107 |
book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
|
108 |
refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
|
109 |
# then search these references and add them all to the results
|
110 |
+
for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10,scorer=scorer):
|
111 |
results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
|
112 |
# finally, sort all the references by their own score (and not the book score)
|
113 |
results.sort(key=lambda x: x['ref_score'],reverse=True)
|
|
|
130 |
user_input = st.text_input('ืืชืื ืืช ืืืงืืจ ืืืืืงืฉ', placeholder='ืืื ืงืื ืืฃ ื ืขืืื ื')
|
131 |
top_k = st.sidebar.slider('ืืื ืกืคืจืื ืืกืจืืง top_k:',0,20,10)
|
132 |
num_of_results = st.sidebar.slider('ืืกืคืจ ืืชืืฆืืืช ืฉืืจืฆืื ื ืืืฆืื:',1,25,5)
|
133 |
+
algorithm = st.sidebar.selectbox('ืืืืืืจืืชื ืืืืจืื ืืชืืฆืืืช',['token_ratio','ratio','partial_ratio','token_set_ratio','partial_token_set_ratio','token_sort_ratio'])
|
134 |
|
135 |
if user_input!="":
|
136 |
time0 = timer()
|
137 |
+
results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results,algorithm)
|
138 |
time = f"finished in {1e3*(timer()-time0):.1f} ms"
|
139 |
st.write(time)
|
140 |
for result in results:
|