import gradio as gr
from sentence_transformers import SentenceTransformer
import pandas as pd
import pickle

def make_clickable_both(val):
    # Each cell holds "<verse text>#<url>"; split on the last '#' and render an HTML link.
    name, url = val.rsplit('#', 1)
    return f'<a href="{url}">{name}</a>'

def find(query):
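    """Semantic search over the Quran: embed the query and return the top-20 most similar verses."""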
    def get_detailed_instruct(task_description: str, query: str) -> str:
        return f'Instruct: {task_description}\nQuery: {query}'
    
    # Each query must come with a one-sentence instruction that describes the task
    task = 'Given a web search query, retrieve relevant passages that answer the query'
    queries = [
        get_detailed_instruct(task, query)
    ]
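    # Per the E5-instruct usage notes, only the query gets this instruction prefix;
    # the passages/documents are embedded without any prefix.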
    print("cekpoin0\n")
    
    # Full verse table (sura|aya|text) used to map matches back to complete ayat
    quran = pd.read_csv('quran-simple-clean.txt', delimiter="|")

    # Pre-split verse chunks (DataFrame with 'sura', 'aya', 'text' columns)
    with open('quran-splitted.sav', 'rb') as file:
        quran_splitted = pickle.load(file)

    # Note: reloading the model (and the data above) on every query is slow;
    # both could be hoisted to module level if startup cost is acceptable.
    model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')
    
    documents = quran_splitted['text'].tolist()
    # The document embeddings were pre-computed once with the lines below and pickled:
    # document_embeddings = model.encode(documents, convert_to_tensor=True, normalize_embeddings=True)
    # filename = 'encoded_quran_text_split_multilingual-e5-large-instruct.sav'
    # pickle.dump(document_embeddings, open(filename, 'wb'))
    with open('encoded_quran_text_split_multilingual-e5-large-instructs.sav', 'rb') as file:
        document_embeddings = pickle.load(file)
    print("checkpoint 1\n")
    
    # Embeddings are L2-normalized, so the dot product is cosine similarity; *100 only rescales it
    query_embeddings = model.encode(queries, convert_to_tensor=True, normalize_embeddings=True)
    scores = (query_embeddings @ document_embeddings.T) * 100
    print("checkpoint 2\n")

    # Attach each split verse's similarity score and sort by relevance
    # (quran_splitted is already loaded above; no need to re-read the pickle)
    quran_splitted['similarity'] = scores.tolist()[0]
    sorted_quran = quran_splitted.sort_values(by='similarity', ascending=False)
    print("checkpoint 3\n")
    
    #results = ""
    results = pd.DataFrame()
    i = 0
    while i<20:
        result = sorted_quran.iloc[i]
        result_quran = quran.loc[(quran['sura']==result['sura']) & (quran['aya']==result['aya'])]
        results = pd.concat([results, result_quran])
        #results = results + result_quran['text'].item()+" (Q.S "+str(result['sura']).rstrip('.0')+":"+str(result['aya']).rstrip('.0')+")\n"
        i=i+1

    # Append a quran.com tafsir link; '#' separates the display text from the URL for make_clickable_both
    results['text'] = (results['text'] + '#' + 'https://quran.com/' + results['sura'].astype(str)
                       + ':' + results['aya'].astype(str) + '/tafsirs/en-tafisr-ibn-kathir')
    results = results.style.format({'text': make_clickable_both})
    
    #return sorted_quran
    return results
    
# Gradio UI: a text query in, a table of matching verses (with tafsir links) out
demo = gr.Interface(fn=find, inputs="textbox", outputs=gr.Dataframe(headers=['sura', 'aya', 'text'], wrap=True))
#demo = gr.Interface(fn=find, inputs="textbox", outputs="textbox")
    
if __name__ == "__main__":
    demo.launch(share=True)