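"""Semantic search over the Mishnah Berurah, as a Streamlit app.

Overview (comments added for clarity): the app loads pre-computed
multilingual-e5-large embeddings of the Orach Chaim dataset from the
Hugging Face Hub, embeds the user's question, retrieves the nearest
passages by semantic similarity, and optionally re-ranks the top hits
with an OpenAI chat model.

Run with: streamlit run <this file>
"""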
import streamlit as st
from streamlit.logger import get_logger
import datasets
import pandas as pd
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from sentence_transformers import util
from torch import tensor
from io import StringIO  # wraps the LLM's JSON reply for pandas



LOGGER = get_logger(__name__)


@st.cache_data
def get_df() -> pd.DataFrame:
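    """Load pre-computed passage embeddings of Orach Chaim from the Hugging
    Face Hub and keep only the Mishnah Berurah commentary rows."""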
    ds = datasets.load_dataset('sivan22/orach-chaim-embeddings-e5')
    df = pd.DataFrame.from_dict(ds['train'])
    df = df[df['bookname'] == ' משנה ברורה']  # leading space matches the dataset value
    return df

@st.cache_resource
def get_model() -> HuggingFaceEmbeddings:
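    """Build the multilingual-e5-large embedding model (cached across reruns)."""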
    model_name = "intfloat/multilingual-e5-large"
    model_kwargs = {'device': 'cpu'}  # 'cpu' or 'cuda'
    encode_kwargs = {'normalize_embeddings': False}
    embeddings_model = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
    return embeddings_model

@st.cache_resource
def get_chat_api(api_key: str) -> ChatOpenAI:
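    """Create the OpenAI chat client used for optional re-ranking of results."""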
    chat = ChatOpenAI(model="gpt-3.5-turbo-16k", api_key=api_key)
    return chat


def get_results(embeddings_model, query, df, num_of_results) -> pd.DataFrame:
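    """Embed the query and return the num_of_results nearest passages."""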
    # e5 models expect a "query: " prefix on search queries, so prepend it
    embeddings = embeddings_model.embed_query('query: ' + query)
    hits = util.semantic_search(tensor(embeddings), tensor(df['embeddings'].tolist()), top_k=num_of_results)
    hit_list = [hit['corpus_id'] for hit in hits[0]]
    return df.iloc[hit_list]

def get_llm_results(query, chat, results) -> pd.DataFrame:
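    """Ask the chat model to score each candidate passage's relevance to the
    query, returning a DataFrame of scores sorted best-first."""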

    prompt_template = PromptTemplate.from_template(
        """
    Your mission is to rank the given answers by their relevance to the given question.
    Provide a relevancy score between 0 (not relevant) and 1 (highly relevant) for each possible answer.
    Return the results in the following JSON format: {{"answer": score, "answer": score}},
    where each key is a possible answer's text and each value is its relevancy score.

    The question is: {query}

    The possible answers are:
    {answers}
    """)

    messages = [
        SystemMessage(content="""
                      You're a helpful assistant.
                      Return a JSON formatted string.
                      """),
        HumanMessage(content=prompt_template.format(query=query, answers='\n'.join(results['text'].head(10).tolist()))),
    ]

    response = chat.invoke(messages)
    # StringIO wrapper: passing a raw JSON string to read_json is deprecated
    llm_results_df = pd.read_json(StringIO(response.content), orient='index')
    llm_results_df.rename(columns={0: 'score'}, inplace=True)
    llm_results_df.sort_values(by='score', ascending=False, inplace=True)
    return llm_results_df



def run():
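    """Render the Streamlit UI and drive the search flow."""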
    
    st.set_page_config(
        page_title="חיפוש סמנטי במשנה ברורה",  # "Semantic search in the Mishnah Berurah"
        page_icon="📚",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    
    st.write("# חיפוש חכם בספר משנה ברורה")  # "Smart search in the Mishnah Berurah"
    
    embeddings_model = get_model()    
    df = get_df()
    
    user_input = st.text_input('כתוב כאן את שאלתך',  # "Type your question here"
                               placeholder='כמה נרות מדליקים בכל לילה מלילות החנוכה')  # "How many candles are lit on each night of Hanukkah"
    num_of_results = st.sidebar.slider('מספר התוצאות שברצונך להציג:', 1, 25, 5)  # "Number of results to display:"
    use_llm = st.sidebar.checkbox("השתמש במודל שפה כדי לשפר תוצאות", False)  # "Use a language model to improve results"
    openAikey = st.sidebar.text_input("OpenAI API key", type="password")
    
    if (st.button('חפש') or user_input) and user_input != "":  # "Search" button

        results = get_results(embeddings_model, user_input, df, num_of_results)

        if use_llm:
            if not openAikey:
                st.write("לא הוכנס מפתח של OpenAI")  # "No OpenAI API key was entered"
            else:
                chat = get_chat_api(openAikey)
                llm_results = get_llm_results(user_input, chat, results)
                st.write(llm_results)
        else:
            st.write(results[['siman', 'sek', 'text']].head(10))

if __name__ == "__main__":
    run()