File size: 2,164 Bytes
ffbadc4
e85ca86
c4baae7
69e5f39
ffbadc4
fc50127
e8569d3
 
 
 
 
 
 
 
5f6349c
b5392ae
b64bcd9
5f6349c
 
 
 
50a6b52
 
5f6349c
 
 
 
 
 
45fde58
50a6b52
ab70a4d
 
45fde58
e8569d3
 
69e5f39
 
 
 
45fde58
b64bcd9
b5392ae
 
 
b64bcd9
7b6cfde
6d63854
b5392ae
 
 
ffbadc4
fc50127
ffbadc4
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import gradio as gr
from sentence_transformers import SentenceTransformer
import pandas as pd
import pickle

# Number of best-matching passages returned per search.
_TOP_K = 6

# Data/model locations; file paths are relative to the working directory.
_QURAN_TEXT_PATH = 'quran-simple-clean.txt'
_QURAN_SPLIT_PATH = 'quran-splitted.sav'
_EMBEDDINGS_PATH = 'encoded_quran_text_split_multilingual-e5-large-instruct.sav'
_MODEL_NAME = 'intfloat/multilingual-e5-large-instruct'

# Filled on the first search so the model, the data frames and the document
# embeddings are loaded once per process instead of once per query.
_search_state = {}


def _verse_number(value):
    """Render a sura/aya number without a float ``.0`` suffix (10.0 -> "10")."""
    # str.rstrip('.0') strips a *set of characters*, which turns "10.0" into
    # "1"; converting through int keeps the significant zeros.
    try:
        as_float = float(value)
    except (TypeError, ValueError):
        return str(value)
    if as_float.is_integer():
        return str(int(as_float))
    return str(value)


def _load_document_embeddings(model, documents):
    """Return embeddings for ``documents``, reusing the on-disk cache when usable."""
    try:
        # NOTE(security): pickle can execute arbitrary code while loading, so
        # this cache file must only ever be produced by this application.
        with open(_EMBEDDINGS_PATH, 'rb') as cache_file:
            cached = pickle.load(cache_file)
        # A cache built from a different split of the text cannot be aligned
        # with the current rows; fall through and rebuild it.
        if len(cached) == len(documents):
            return cached
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # Missing or truncated cache: rebuild it below.
        pass

    embeddings = model.encode(documents, convert_to_tensor=True, normalize_embeddings=True)
    with open(_EMBEDDINGS_PATH, 'wb') as cache_file:
        pickle.dump(embeddings, cache_file)
    return embeddings


def _load_search_state():
    """Load the verse table, the split passages, the model and the embeddings once."""
    if not _search_state:
        quran = pd.read_csv(_QURAN_TEXT_PATH, delimiter="|")
        # NOTE(security): see _load_document_embeddings -- trusted local file only.
        with open(_QURAN_SPLIT_PATH, 'rb') as split_file:
            quran_splitted = pickle.load(split_file)
        model = SentenceTransformer(_MODEL_NAME)
        document_embeddings = _load_document_embeddings(
            model, quran_splitted['text'].tolist()
        )
        # Publish everything in one step so a failure above never leaves a
        # half-initialised state behind.
        _search_state.update(
            quran=quran,
            quran_splitted=quran_splitted,
            model=model,
            document_embeddings=document_embeddings,
        )
    return _search_state


def find(query: str) -> str:
    """Return the verses that best match ``query``, one per line.

    The query is wrapped in the instruction prompt, embedded with the
    sentence-transformer model and compared against the embeddings of the
    split passages (dot product of normalised embeddings, scaled by 100).
    The top ``_TOP_K`` passages are mapped back to their full verse in
    ``quran-simple-clean.txt`` through the ``sura``/``aya`` columns.

    Args:
        query: Free-text search query.

    Returns:
        One line per hit formatted as ``"<verse text> (Q.S <sura>:<aya>)\\n"``,
        concatenated in descending similarity order. An empty string is
        returned for a blank query.
    """
    # A blank query has nothing to search for; skip the expensive model load.
    if query is None or not str(query).strip():
        return ""

    # Each query must come with a one-sentence instruction that describes the task
    task = 'Given a web search query, retrieve relevant passages that answer the query'
    queries = [f'Instruct: {task}\nQuery: {query}']
    print("cekpoin0\n")

    state = _load_search_state()
    print("cekpoin1\n")

    query_embeddings = state['model'].encode(
        queries, convert_to_tensor=True, normalize_embeddings=True
    )
    scores = (query_embeddings @ state['document_embeddings'].T) * 100
    print("cekpoin2\n")

    # assign() works on a copy, so the cached frame is never mutated between
    # queries.
    ranked = (
        state['quran_splitted']
        .assign(similarity=scores.tolist()[0])
        .sort_values(by='similarity', ascending=False)
    )
    print("cekpoin3\n")

    quran = state['quran']
    lines = []
    # head() tolerates a corpus with fewer than _TOP_K passages.
    for _, passage in ranked.head(_TOP_K).iterrows():
        is_same_verse = (quran['sura'] == passage['sura']) & (quran['aya'] == passage['aya'])
        verse_text = quran.loc[is_same_verse, 'text']
        # Fall back to the matched passage itself when the verse table has no
        # entry for this sura/aya instead of raising.
        text = verse_text.iloc[0] if not verse_text.empty else passage['text']
        sura = _verse_number(passage['sura'])
        aya = _verse_number(passage['aya'])
        lines.append(f"{text} (Q.S {sura}:{aya})\n")

    return "".join(lines)

# Gradio UI: a single text box for the query and a single text box for the
# formatted search results produced by find().
demo = gr.Interface(
    fn=find,
    inputs="textbox",
    outputs="textbox",
)

if __name__ == "__main__":
    # Start the web app only when this file is executed as a script.
    demo.launch()