Spaces:

sivan22
/

sefaria-ref-finder

Sleeping

App Files Files Community

sivan22 committed on Jan 11, 2024

Commit

bcadd46

verified ·

1 Parent(s): d291f25

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -9

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ from timeit import default_timer as timer
 import sqlite3
 import pandas as pd
 import ast
@@ -82,29 +82,32 @@ def get_dfs()->object:
     return titles_df, texts_df
-def find_ref(titles_df,texts_df,input_text,top_k,num_of_results):
-    from rapidfuzz import process as rapidfuzz_process
     print('hello from find_ref..')
     if not input_text: return
     results = []
     books = titles_df['he_titles']
     input_text = input_text.replace(':','עמוד ב').replace('.','עמוד א')
     # search only the references database in case the user set the top_k to 0
     if top_k == 0:
         refs = texts_df['ref_text_long'].unique()
-        for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=num_of_results):
            results += [{'ref':ref,'ref_score':ref_score}]
     else:
         # search first only in the books database (for top_k books)
-        for book, book_score, _ in rapidfuzz_process.extract(input_text, books, limit=top_k):
             # get all the references of that book
             book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
             refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
             # then search these references and add them all to the results
-            for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10):
                 results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
         # finally, sort all the references by their own score (and not the book score)
         results.sort(key=lambda x: x['ref_score'],reverse=True)
@@ -127,10 +130,11 @@ def run():
     user_input = st.text_input('כתוב את המקור המבוקש', placeholder='בבא קמא דף ב עמוד ב')
     top_k =  st.sidebar.slider('כמה ספרים לסרוק top_k:',0,20,10)
     num_of_results = st.sidebar.slider('מספר התוצאות שברצונך להציג:',1,25,5)
     if user_input!="":
         time0 = timer()
-        results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results)
         time = f"finished in {1e3*(timer()-time0):.1f} ms"
         st.write(time)
         for result in results:

 import sqlite3
 import pandas as pd
 import ast
+import pymongo
     return titles_df, texts_df
+def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
+    from rapidfuzz import fuzz, process as rapidfuzz_process
+    from rapidfuzz.fuzz import token_ratio,ratio,partial_ratio,token_set_ratio,partial_token_set_ratio,token_sort_ratio
     print('hello from find_ref..')
     if not input_text: return
+    print(eval(algorithm))
     results = []
     books = titles_df['he_titles']
     input_text = input_text.replace(':','עמוד ב').replace('.','עמוד א')
+    scorer = eval(algorithm)
     # search only the references database in case the user set the top_k to 0
     if top_k == 0:
         refs = texts_df['ref_text_long'].unique()
+        for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs,scorer=scorer, limit=num_of_results):
            results += [{'ref':ref,'ref_score':ref_score}]
     else:
         # search first only in the books database (for top_k books)
+        for book, book_score, _ in rapidfuzz_process.extract(input_text, books, scorer=scorer, limit=top_k):
             # get all the references of that book
             book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
             refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
             # then search these references and add them all to the results
+            for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10,scorer=scorer):
                 results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
         # finally, sort all the references by their own score (and not the book score)
         results.sort(key=lambda x: x['ref_score'],reverse=True)
     user_input = st.text_input('כתוב את המקור המבוקש', placeholder='בבא קמא דף ב עמוד ב')
     top_k =  st.sidebar.slider('כמה ספרים לסרוק top_k:',0,20,10)
     num_of_results = st.sidebar.slider('מספר התוצאות שברצונך להציג:',1,25,5)
+    algorithm = st.sidebar.selectbox('האלגוריתם לדירוג התוצאות',['token_ratio','ratio','partial_ratio','token_set_ratio','partial_token_set_ratio','token_sort_ratio'])
     if user_input!="":
         time0 = timer()
+        results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results,algorithm)
         time = f"finished in {1e3*(timer()-time0):.1f} ms"
         st.write(time)
         for result in results: