File size: 5,305 Bytes
5a3e088
 
 
 
e710286
5a3e088
 
 
e710286
 
 
5a3e088
 
 
c1fdc9d
e710286
5a3e088
 
 
 
c1fdc9d
5a3e088
 
 
c1fdc9d
 
5a3e088
 
 
 
 
 
 
 
 
c1fdc9d
 
 
 
 
5a3e088
c1fdc9d
 
 
 
 
 
5a3e088
 
 
c1fdc9d
 
5a3e088
c1fdc9d
 
 
5a3e088
 
c1fdc9d
5a3e088
 
 
bcadd46
 
e710286
bcadd46
5a3e088
bcadd46
5a3e088
bcadd46
5a3e088
 
 
bcadd46
5a3e088
 
 
e710286
5a3e088
 
 
 
e710286
5a3e088
 
 
 
e710286
5a3e088
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a6c96b
5a3e088
 
 
bcadd46
5a3e088
 
c1fdc9d
 
5a3e088
c1fdc9d
 
 
 
5a3e088
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import streamlit as st
from streamlit.logger import get_logger
from timeit import default_timer as timer
import sqlite3
import pandas as pd

# Module-level logger obtained from Streamlit's logging helper, named after this module.
LOGGER = get_logger(__name__)
    
def preprocess(s: str) -> str:
    """Return ``s`` with quote characters and a fixed set of filler tokens removed.

    This is the ``processor`` handed to rapidfuzz in ``find_ref``, so the
    listed substrings do not take part in fuzzy scoring.

    NOTE(review): the non-ASCII token literals look mis-encoded in this copy
    of the file (presumably Hebrew citation words) - verify the file encoding.
    """
    # Removal order is kept exactly as in the chained-replace form.
    removable_tokens = (
        '"',
        'ืขืœ',
        'ืคืจืง',
        'ืคืกื•ืง',
        'ื“ืฃ',
        'ืขืžื•ื“',
        'ืกื™ืžืŸ',
        'ืกืขื™ืฃ',
        'ื—ื™ื“ื•ืฉื™',
        "'",
    )
    cleaned = s
    for token in removable_tokens:
        cleaned = cleaned.replace(token, '')
    return cleaned
        
def _read_table(conn, table):
    """Read every row of ``table`` through ``conn`` into a DataFrame with the table's column names."""
    # ``table`` is only ever one of the fixed literals used in get_dfs, never
    # user input, so interpolating it into the statement is safe here.
    cursor = conn.execute(f"SELECT * FROM {table}")
    column_names = [column[0] for column in cursor.description]
    # Passing columns= at construction keeps the header even when the table has
    # no rows; assigning .columns afterwards raises on an empty frame.
    return pd.DataFrame(cursor.fetchall(), columns=column_names)


@st.cache_resource
def get_dfs()->object:
    """Load the lookup tables from ``test42.db`` and build the search frames.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        A 2-tuple ``(titles_df, texts_df)``. ``titles_df`` is the ``titles``
        table unchanged. ``texts_df`` is ``texts`` inner-joined with ``books``
        on ``texts.bid == books._id`` and then joined with ``refs`` on the
        texts-side ``_id`` (named ``_id_x`` after the first merge) equal to
        ``refs.tid``.
    """
    print('hello from get_dfs..')

    conn = sqlite3.connect('test42.db')
    try:
        titles = _read_table(conn, 'titles')
        texts = _read_table(conn, 'texts')
        refs = _read_table(conn, 'refs')
        books = _read_table(conn, 'books')
    finally:
        # Everything is materialised in DataFrames, so release the handle
        # instead of leaking it for the lifetime of the process.
        conn.close()

    # Merge the books and refs with the texts.
    merged = pd.merge(texts,books,how='inner',left_on='bid',right_on='_id')
    texts_df = pd.merge(merged,refs,left_on='_id_x',right_on='tid')

    titles_df = titles

    return titles_df, texts_df
    

# Names of the rapidfuzz.fuzz scorers that find_ref is willing to resolve.
_SCORER_NAMES = (
    'token_ratio',
    'ratio',
    'WRatio',
    'partial_ratio',
    'token_set_ratio',
    'partial_token_set_ratio',
    'token_sort_ratio',
)


def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
    """Fuzzy-match ``input_text`` against the known references.

    Args:
        titles_df: frame with at least ``he_titles`` and ``title`` columns.
        texts_df: frame with at least ``title`` and ``ref_text_long`` columns.
        input_text: the query string; an empty/None query yields ``[]``.
        top_k: number of best-matching books to scan; ``0`` means skip the book
            stage and match directly against every unique reference.
        num_of_results: maximum number of entries in the returned list.
        algorithm: name of a ``rapidfuzz.fuzz`` scorer, one of ``_SCORER_NAMES``.

    Returns:
        A list of dicts sorted by descending ``ref_score``. Each dict has
        ``ref`` and ``ref_score``; when ``top_k`` is non-zero it also has
        ``book`` and ``book_score``.

    Raises:
        ValueError: if ``algorithm`` is not one of ``_SCORER_NAMES``.
    """
    print('hello from find_ref..')

    # Always hand back a list so callers can iterate without a None check.
    if not input_text:
        return []

    # Resolve the scorer by name from an allow-list instead of eval(): the
    # value comes from a UI widget and must never be executed as code.
    if algorithm not in _SCORER_NAMES:
        raise ValueError(f"unknown scoring algorithm: {algorithm!r}")

    # Imported lazily, after the cheap guards above.
    from rapidfuzz import fuzz, process as rapidfuzz_process

    scorer = getattr(fuzz, algorithm)
    print(scorer)

    results = []
    books = titles_df['he_titles']
    # ':' and '.' in the query are expanded to fixed phrases before matching.
    # NOTE(review): these literals look mis-encoded in this copy of the file
    # (presumably Hebrew page-side markers) - verify the file encoding.
    input_text = input_text.replace(':','ืขืžื•ื“ ื‘').replace('.','ืขืžื•ื“ ื')

    # Search only the references when the user set top_k to 0.
    if top_k == 0:
        refs = texts_df['ref_text_long'].unique()
        for ref, ref_score, _ in rapidfuzz_process.extract(
            input_text, refs, scorer=scorer, limit=num_of_results, processor=preprocess
        ):
            results.append({'ref': ref, 'ref_score': ref_score})
    else:
        # First pick the top_k closest book titles...
        for book, book_score, _ in rapidfuzz_process.extract(
            input_text, books, scorer=scorer, limit=top_k, processor=preprocess
        ):
            # ...then collect every reference belonging to that book...
            book_title = titles_df.loc[titles_df['he_titles'] == book, 'title'].iloc[0]
            refs = texts_df.loc[texts_df['title'] == book_title, 'ref_text_long'].unique()
            # ...and score those references against the query.
            for ref, ref_score, _ in rapidfuzz_process.extract(
                input_text, refs, limit=10, scorer=scorer, processor=preprocess
            ):
                results.append(
                    {'ref': ref, 'ref_score': ref_score, 'book': book, 'book_score': book_score}
                )
        # Rank by the reference's own score, not by the book score.
        results.sort(key=lambda item: item['ref_score'], reverse=True)

    return results[:num_of_results]


def run():
    """Render the Streamlit page: query box, sidebar controls and result list.

    Reads the query from a text input and the ``top_k``, result-count and
    scorer settings from sidebar widgets. For a non-empty query it calls
    ``find_ref``, writes the elapsed time and each result, and adds one button
    per result that, when pressed, writes the matching ``heText`` /
    ``ref_text_long`` rows of ``texts_df``.
    """
    st.set_page_config(
        page_title=" ื—ื™ืคื•ืฉ ืžืงื•ืจื•ืช",
        page_icon="๐Ÿ“š",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    # Load the frames once, before the title is written; a second call would
    # only return the same cached objects.
    titles_df, texts_df = get_dfs()
    st.write("# ื—ื™ืคื•ืฉ ืžืงื•ืจื•ืช ื‘ืืžืฆืขื•ืช ืžืจื—ืง ืœื•ื™ื ืฉื˜ื™ื™ืŸ")

    user_input = st.text_input('ื›ืชื•ื‘ ืืช ื”ืžืงื•ืจ ื”ืžื‘ื•ืงืฉ', placeholder='ื‘ื‘ื ืงืžื ื“ืฃ ื‘ ืขืžื•ื“ ื‘')
    top_k = st.sidebar.slider('ื›ืžื” ืกืคืจื™ื ืœืกืจื•ืง top_k:',0,20,10)
    num_of_results = st.sidebar.slider('ืžืกืคืจ ื”ืชื•ืฆืื•ืช ืฉื‘ืจืฆื•ื ืš ืœื”ืฆื™ื’:',1,25,5)
    algorithm = st.sidebar.selectbox('ื”ืืœื’ื•ืจื™ืชื ืœื“ื™ืจื•ื’ ื”ืชื•ืฆืื•ืช',['token_ratio','ratio','WRatio','partial_ratio','token_set_ratio','partial_token_set_ratio','token_sort_ratio'])

    # Nothing to search for until the user types a query.
    if not user_input:
        return

    time0 = timer()
    results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results,algorithm)
    elapsed = f"finished in {1e3*(timer()-time0):.1f} ms"
    st.write(elapsed)

    # Guard against a falsy (None/empty) result so the loop never iterates over None.
    for i, result in enumerate(results or []):
        st.write(result)
        # The loop index is the widget key, so two results with the same label
        # still get distinct buttons.
        if st.button("ืคืชื— " +result['ref'], key=i):
            st.write(texts_df.loc[texts_df['ref_text_long']==result['ref']][['heText','ref_text_long']])
            

# Start the app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    run()