Spaces:

sivan22
/

sefaria-ref-finder

Sleeping

File size: 5,305 Bytes

5a3e088
 
 
 
e710286
5a3e088
 
 
e710286
 
 
5a3e088
 
 
c1fdc9d
e710286
5a3e088
 
 
 
c1fdc9d
5a3e088
 
 
c1fdc9d
 
5a3e088
 
 
 
 
 
 
 
 
c1fdc9d
 
 
 
 
5a3e088
c1fdc9d
 
 
 
 
 
5a3e088
 
 
c1fdc9d
 
5a3e088
c1fdc9d
 
 
5a3e088
 
c1fdc9d
5a3e088
 
 
bcadd46
 
e710286
bcadd46
5a3e088
bcadd46
5a3e088
bcadd46
5a3e088
 
 
bcadd46
5a3e088
 
 
e710286
5a3e088
 
 
 
e710286
5a3e088
 
 
 
e710286
5a3e088
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a6c96b
5a3e088
 
 
bcadd46
5a3e088
 
c1fdc9d
 
5a3e088
c1fdc9d
 
 
 
5a3e088

import streamlit as st
from streamlit.logger import get_logger
from timeit import default_timer as timer
import sqlite3
import pandas as pd

# Module-level logger obtained from Streamlit's logging helper.
# NOTE(review): nothing in the code visible here references LOGGER; the
# functions below report progress with print() instead.
LOGGER = get_logger(__name__)
    
def preprocess(s:str)->str:
    """Return ``s`` with quote marks and a fixed list of Hebrew words removed.

    Every fragment is deleted wherever it appears as a substring (not only
    as a whole word), and the fragments are applied one after another in the
    order listed, so the output of one removal feeds the next.
    """
    fragments = (
        '"',
        'על',
        'פרק',
        'פסוק',
        'דף',
        'עמוד',
        'סימן',
        'סעיף',
        'חידושי',
        "'",
    )
    cleaned = s
    for fragment in fragments:
        cleaned = cleaned.replace(fragment, '')
    return cleaned
        
@st.cache_resource
def get_dfs()->object:
    """Load the SQLite tables and build the two DataFrames used for searching.

    Reads the ``titles``, ``texts``, ``refs`` and ``books`` tables from
    ``test42.db`` (resolved relative to the working directory). The function
    is decorated with ``st.cache_resource`` so Streamlit can reuse the result
    instead of re-reading the database every time it is called.

    Returns:
        A ``(titles_df, texts_df)`` tuple. ``titles_df`` is the ``titles``
        table as-is. ``texts_df`` is ``texts`` inner-joined to ``books``
        (``texts.bid == books._id``) and then joined to ``refs`` on the
        text row id (``_id_x == refs.tid``).

    Raises:
        sqlite3.Error: if the database cannot be opened or a table is missing.
    """
    print('hello from get_dfs..')

    def load_table(conn, table):
        # ``table`` is always one of the literal names used below, never
        # user input, so interpolating it into the SQL text is safe.
        cursor = conn.execute(f"SELECT * FROM {table}")
        column_names = [column[0] for column in cursor.description]
        # Passing the names to the constructor (rather than assigning
        # ``.columns`` afterwards) also works when the table has no rows.
        return pd.DataFrame(cursor.fetchall(), columns=column_names)

    conn = sqlite3.connect('test42.db')
    try:
        titles = load_table(conn, 'titles')
        texts = load_table(conn, 'texts')
        refs = load_table(conn, 'refs')
        books = load_table(conn, 'books')
    finally:
        # Everything is materialised in memory by now; release the file handle.
        conn.close()

    # Attach the book row to every text row, then attach its references.
    # Both ``texts`` and ``books`` have an ``_id`` column, so after the first
    # merge the text id is exposed as ``_id_x``.
    merged = pd.merge(texts,books,how='inner',left_on='bid',right_on='_id')
    texts_df = pd.merge(merged,refs,left_on='_id_x',right_on='tid')

    titles_df = titles

    return titles_df, texts_df
    

def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
    """Fuzzy-match ``input_text`` against the known references.

    Args:
        titles_df: DataFrame with at least ``he_titles`` and ``title`` columns.
        texts_df: DataFrame with at least ``title`` and ``ref_text_long``
            columns.
        input_text: The free-text citation typed by the user.
        top_k: How many best-matching books to search inside. ``0`` skips
            the book stage and searches every reference directly.
        num_of_results: Maximum number of results to return.
        algorithm: Name of a ``rapidfuzz.fuzz`` scorer; must be one of the
            names listed in ``scorer_names`` below.

    Returns:
        A list of dicts sorted by ``ref_score`` descending, each holding
        ``ref`` and ``ref_score`` (plus ``book`` and ``book_score`` when
        ``top_k`` is non-zero). An empty list when ``input_text`` is empty.

    Raises:
        ValueError: if ``algorithm`` is not a supported scorer name.
    """
    print('hello from find_ref..')

    # Nothing to search for: return an empty list so callers can always iterate.
    if not input_text:
        return []

    # Resolve the scorer by name from a fixed whitelist instead of eval(),
    # which would execute any expression passed in as ``algorithm``.
    scorer_names = (
        'token_ratio',
        'ratio',
        'WRatio',
        'partial_ratio',
        'token_set_ratio',
        'partial_token_set_ratio',
        'token_sort_ratio',
    )
    if algorithm not in scorer_names:
        raise ValueError(
            f"unknown algorithm {algorithm!r}; expected one of {scorer_names}"
        )

    from rapidfuzz import fuzz, process as rapidfuzz_process

    scorer = getattr(fuzz, algorithm)
    print(scorer)

    books = titles_df['he_titles']
    # ':' and '.' are expanded to the Hebrew wording before matching.
    input_text = input_text.replace(':','עמוד ב').replace('.','עמוד א')

    # Search only the references when the user set top_k to 0.
    if top_k == 0:
        refs = texts_df['ref_text_long'].unique()
        matches = rapidfuzz_process.extract(
            input_text, refs, scorer=scorer, limit=num_of_results, processor=preprocess
        )
        results = [{'ref': ref, 'ref_score': ref_score} for ref, ref_score, _ in matches]
        return results[:num_of_results]

    # Keep at least as many candidates per book as the caller asked for, so a
    # single strongly matching book can still fill the whole result list.
    per_book_limit = max(10, num_of_results)
    results = []
    # First narrow down to the top_k best-matching books...
    book_matches = rapidfuzz_process.extract(
        input_text, books, scorer=scorer, limit=top_k, processor=preprocess
    )
    for book, book_score, _ in book_matches:
        # ...then collect every distinct reference belonging to that book...
        book_title = titles_df.loc[titles_df['he_titles']==book, 'title'].iloc[0]
        refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
        # ...and score those references against the same input.
        ref_matches = rapidfuzz_process.extract(
            input_text, refs, limit=per_book_limit, scorer=scorer, processor=preprocess
        )
        for ref, ref_score, _ in ref_matches:
            results.append(
                {'ref': ref, 'ref_score': ref_score, 'book': book, 'book_score': book_score}
            )

    # Finally, rank by the reference score itself rather than the book score.
    results.sort(key=lambda item: item['ref_score'], reverse=True)
    return results[:num_of_results]


def run():
    """Render the Streamlit page: a search box, sidebar options and results.

    Loads the DataFrames through ``get_dfs()``, reads the query and the
    sidebar settings, and, when a query is present, shows the elapsed search
    time followed by each result with a button that displays the matching
    rows of ``texts_df``.
    """
    st.set_page_config(
        page_title=" חיפוש מקורות",
        page_icon="📚",
        layout="wide",
        initial_sidebar_state="expanded"    
    )
    # Load the data once, before drawing the page title.
    titles_df,texts_df = get_dfs()
    st.write("# חיפוש מקורות באמצעות מרחק לוינשטיין")

    user_input = st.text_input('כתוב את המקור המבוקש', placeholder='בבא קמא דף ב עמוד ב') 
    top_k =  st.sidebar.slider('כמה ספרים לסרוק top_k:',0,20,10)
    num_of_results = st.sidebar.slider('מספר התוצאות שברצונך להציג:',1,25,5)
    algorithm = st.sidebar.selectbox('האלגוריתם לדירוג התוצאות',['token_ratio','ratio','WRatio','partial_ratio','token_set_ratio','partial_token_set_ratio','token_sort_ratio'])

    # Nothing typed yet: leave the page with just the inputs.
    if not user_input:
        return

    started = timer()
    results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results,algorithm)
    elapsed_message = f"finished in {1e3*(timer()-started):.1f} ms"
    st.write(elapsed_message)

    # ``or []`` keeps the loop safe should find_ref hand back None.
    for i, result in enumerate(results or []):
        st.write(result)
        # The loop index is the widget key, so two results with the same
        # label still get distinct buttons.
        if st.button("פתח " +result['ref'], key=i):
            matching_rows = texts_df.loc[texts_df['ref_text_long']==result['ref']]
            st.write(matching_rows[['heText','ref_text_long']])

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()