sivan22 committed on
Commit
bcadd46
•
1 Parent(s): d291f25

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -9
app.py CHANGED
@@ -6,7 +6,7 @@ from timeit import default_timer as timer
6
  import sqlite3
7
  import pandas as pd
8
  import ast
9
-
10
 
11
 
12
 
@@ -82,29 +82,32 @@ def get_dfs()->object:
82
  return titles_df, texts_df
83
 
84
 
85
- def find_ref(titles_df,texts_df,input_text,top_k,num_of_results):
86
- from rapidfuzz import process as rapidfuzz_process
 
 
87
  print('hello from find_ref..')
 
88
  if not input_text: return
89
-
90
  results = []
91
  books = titles_df['he_titles']
92
  input_text = input_text.replace(':','ืขืžื•ื“ ื‘').replace('.','ืขืžื•ื“ ื')
93
-
94
  # search only the references database in case the user set the top_k to 0
95
  if top_k == 0:
96
  refs = texts_df['ref_text_long'].unique()
97
- for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=num_of_results):
98
  results += [{'ref':ref,'ref_score':ref_score}]
99
 
100
  else:
101
  # search first only in the books database (for top_k books)
102
- for book, book_score, _ in rapidfuzz_process.extract(input_text, books, limit=top_k):
103
  # get all the references of that book
104
  book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
105
  refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
106
  # then search these references and add them all to the results
107
- for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10):
108
  results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
109
  # finally, sort all the references by their own score (and not the book score)
110
  results.sort(key=lambda x: x['ref_score'],reverse=True)
@@ -127,10 +130,11 @@ def run():
127
  user_input = st.text_input('ื›ืชื•ื‘ ืืช ื”ืžืงื•ืจ ื”ืžื‘ื•ืงืฉ', placeholder='ื‘ื‘ื ืงืžื ื“ืฃ ื‘ ืขืžื•ื“ ื‘')
128
  top_k = st.sidebar.slider('ื›ืžื” ืกืคืจื™ื ืœืกืจื•ืง top_k:',0,20,10)
129
  num_of_results = st.sidebar.slider('ืžืกืคืจ ื”ืชื•ืฆืื•ืช ืฉื‘ืจืฆื•ื ืš ืœื”ืฆื™ื’:',1,25,5)
 
130
 
131
  if user_input!="":
132
  time0 = timer()
133
- results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results)
134
  time = f"finished in {1e3*(timer()-time0):.1f} ms"
135
  st.write(time)
136
  for result in results:
 
6
  import sqlite3
7
  import pandas as pd
8
  import ast
9
+ import pymongo
10
 
11
 
12
 
 
82
  return titles_df, texts_df
83
 
84
 
85
+ def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
86
+ from rapidfuzz import fuzz, process as rapidfuzz_process
87
+ from rapidfuzz.fuzz import token_ratio,ratio,partial_ratio,token_set_ratio,partial_token_set_ratio,token_sort_ratio
88
+
89
  print('hello from find_ref..')
90
+
91
  if not input_text: return
92
+ print(eval(algorithm))
93
  results = []
94
  books = titles_df['he_titles']
95
  input_text = input_text.replace(':','ืขืžื•ื“ ื‘').replace('.','ืขืžื•ื“ ื')
96
+ scorer = eval(algorithm)
97
  # search only the references database in case the user set the top_k to 0
98
  if top_k == 0:
99
  refs = texts_df['ref_text_long'].unique()
100
+ for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs,scorer=scorer, limit=num_of_results):
101
  results += [{'ref':ref,'ref_score':ref_score}]
102
 
103
  else:
104
  # search first only in the books database (for top_k books)
105
+ for book, book_score, _ in rapidfuzz_process.extract(input_text, books, scorer=scorer, limit=top_k):
106
  # get all the references of that book
107
  book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
108
  refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
109
  # then search these references and add them all to the results
110
+ for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10,scorer=scorer):
111
  results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
112
  # finally, sort all the references by their own score (and not the book score)
113
  results.sort(key=lambda x: x['ref_score'],reverse=True)
 
130
  user_input = st.text_input('ื›ืชื•ื‘ ืืช ื”ืžืงื•ืจ ื”ืžื‘ื•ืงืฉ', placeholder='ื‘ื‘ื ืงืžื ื“ืฃ ื‘ ืขืžื•ื“ ื‘')
131
  top_k = st.sidebar.slider('ื›ืžื” ืกืคืจื™ื ืœืกืจื•ืง top_k:',0,20,10)
132
  num_of_results = st.sidebar.slider('ืžืกืคืจ ื”ืชื•ืฆืื•ืช ืฉื‘ืจืฆื•ื ืš ืœื”ืฆื™ื’:',1,25,5)
133
+ algorithm = st.sidebar.selectbox('ื”ืืœื’ื•ืจื™ืชื ืœื“ื™ืจื•ื’ ื”ืชื•ืฆืื•ืช',['token_ratio','ratio','partial_ratio','token_set_ratio','partial_token_set_ratio','token_sort_ratio'])
134
 
135
  if user_input!="":
136
  time0 = timer()
137
+ results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results,algorithm)
138
  time = f"finished in {1e3*(timer()-time0):.1f} ms"
139
  st.write(time)
140
  for result in results: