File size: 4,022 Bytes
13791ef 64dd69a 13791ef ae13785 13791ef b628b23 13791ef 64dd69a 13791ef 956d65c 13791ef 956d65c 13791ef 956d65c 13791ef 956d65c 13791ef ae13785 13791ef ae13785 13791ef 956d65c 13791ef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import streamlit as st
from streamlit.logger import get_logger
import datasets
import pandas as pd
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from sentence_transformers import util
from torch import tensor
# Module-level logger obtained from Streamlit's logging helper, named after
# this module.
LOGGER = get_logger(__name__)
@st.cache_data
def get_df() -> object:
    """Load the embeddings dataset and keep only the rows of one book.

    Loads the ``train`` split of ``sivan22/orach-chaim-embeddings-e5``,
    converts it to a pandas DataFrame and retains the rows whose ``bookname``
    column equals the title literal used below. The function is wrapped in
    ``st.cache_data``.

    NOTE(review): the non-ASCII title literal looks mis-encoded in this copy
    of the file; it is reproduced verbatim -- confirm it against the values
    actually stored in the dataset.

    Returns:
        The filtered DataFrame.
    """
    dataset = datasets.load_dataset('sivan22/orach-chaim-embeddings-e5')
    all_books = pd.DataFrame.from_dict(dataset['train'])
    is_target_book = all_books['bookname'] == ' 诪砖谞讛 讘专讜专讛'
    return all_books[is_target_book]
@st.cache_resource
def get_model() -> object:
    """Build the embedding model used to encode search queries.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for
        ``intfloat/multilingual-e5-large``, configured to run on the CPU with
        ``normalize_embeddings`` disabled.
    """
    return HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-large",
        model_kwargs={'device': 'cpu'},  # 'cpu' or 'cuda'
        encode_kwargs={'normalize_embeddings': False},
    )
@st.cache_resource
def get_chat_api(api_key:str):
    """Create the ``ChatOpenAI`` client for the ``gpt-3.5-turbo-16k`` model.

    The function is wrapped in ``st.cache_resource``.

    Args:
        api_key: OpenAI API key forwarded to the client.

    Returns:
        The configured ``ChatOpenAI`` instance.
    """
    return ChatOpenAI(api_key=api_key, model="gpt-3.5-turbo-16k")
def get_results(embeddings_model,input,df,num_of_results) -> pd.DataFrame:
    """Return the rows of ``df`` that best match a free-text query.

    The query is prefixed with ``'query: '``, embedded with
    ``embeddings_model.embed_query`` and compared against the vectors in the
    ``embeddings`` column of ``df`` via ``util.semantic_search``.

    Args:
        embeddings_model: Object exposing ``embed_query(text)``.
        input: The user's query text.
        df: DataFrame with an ``embeddings`` column.
        num_of_results: Passed to ``semantic_search`` as ``top_k``.

    Returns:
        The matching rows of ``df``, in the order ``semantic_search``
        reported them.
    """
    query_vector = tensor(embeddings_model.embed_query('query: ' + input))
    corpus_vectors = tensor(df['embeddings'].tolist())
    # Only one query is submitted, so only the first result list is relevant.
    best_matches = util.semantic_search(
        query_vector, corpus_vectors, top_k=num_of_results
    )[0]
    # ``corpus_id`` is a position within ``corpus_vectors``, hence ``iloc``.
    row_positions = [match['corpus_id'] for match in best_matches]
    return df.iloc[row_positions]
# Prompt asking the chat model to score every candidate answer against the
# question; ``{query}`` and ``{answers}`` are filled in per request.
_RANKING_PROMPT = """
your misssion is to rank the given answers based on their relevance to the given question.
Provide a relevancy score between 0 (not relevant) and 1 (highly relevant) for each possible answer.
the results should be in the following JSON format: "answer": "score", "answer": "score" while answer is the possible answer's text and score is the relevancy score.
the question is: {query}
the possible answers are:
{answers}
"""

# System message sent ahead of the ranking prompt.
_SYSTEM_PROMPT = """
You're a helpful assistant.
Return a JSON formatted string.
"""


def get_llm_results(query,chat,results):
    """Ask the chat model to score candidate answers for relevance to a query.

    At most the first 10 entries of ``results['text']`` are sent to the model,
    joined by newlines. The reply is parsed as a JSON object and returned as a
    DataFrame sorted by descending score.

    Args:
        query: The user's question.
        chat: Object exposing ``invoke(messages)`` whose return value has a
            ``content`` attribute holding the reply text.
        results: DataFrame with a ``text`` column of candidate answers.

    Returns:
        A DataFrame indexed by the keys of the returned JSON object, with a
        single ``score`` column sorted from highest to lowest.

    Raises:
        Whatever ``pd.read_json`` raises when the reply is not parseable JSON
        (presumably ``ValueError`` -- confirm against the pandas version in
        use).
    """
    from io import StringIO  # local import keeps the block self-contained

    candidate_answers = '\n'.join(results['text'].head(10).tolist())
    prompt_template = PromptTemplate.from_template(_RANKING_PROMPT)
    messages = [
        SystemMessage(content=_SYSTEM_PROMPT),
        HumanMessage(
            content=prompt_template.format(query=query, answers=candidate_answers)
        ),
    ]
    response = chat.invoke(messages)

    # Passing a literal JSON string to ``read_json`` is deprecated in recent
    # pandas releases; wrapping it in a file-like object is the supported form.
    llm_results_df = pd.read_json(StringIO(response.content), orient='index')
    # A flat ``{answer: score}`` object yields one unnamed column labelled 0.
    return llm_results_df.rename(columns={0: 'score'}).sort_values(
        by='score', ascending=False
    )
def run():
    """Render the Streamlit search page and handle one script run.

    Sets up the page, loads the embedding model and the dataset, draws the
    query box and sidebar controls, and, whenever the query box is non-empty,
    runs a semantic search. The hits are either displayed directly or, when
    the sidebar checkbox is ticked and an OpenAI key was supplied, re-ranked
    by the chat model and displayed with their scores.
    """
    st.set_page_config(
        page_title=" 讞讬驻讜砖 住诪谞讟讬 讘诪砖谞讛 讘专讜专讛",
        page_icon="馃摎",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    st.write("# 讞讬驻讜砖 讞讻诐 讘住驻专 诪砖谞讛 讘专讜专讛")

    embeddings_model = get_model()
    df = get_df()

    user_input = st.text_input('讻转讜讘 讻讗谉 讗转 砖讗诇转讱', placeholder='讻诪讛 谞专讜转 诪讚诇讬拽讬诐 讘讻诇 诇讬诇讛 诪诇讬诇讜转 讛讞谞讜讻讛')
    num_of_results = st.sidebar.slider('诪住驻专 讛转讜爪讗讜转 砖讘专爪讜谞讱 诇讛爪讬讙:',1,25,5)
    use_llm = st.sidebar.checkbox("讛砖转诪砖 讘诪讜讚诇 砖驻讛 讻讚讬 诇砖驻专 转讜爪讗讜转", False)
    openai_key = st.sidebar.text_input("OpenAI API key", type="password")

    # The button is always rendered, but the original trigger
    # ``(button or user_input) and user_input != ""`` reduces to "the query is
    # non-empty", so its return value does not affect the outcome.
    st.button('讞驻砖')
    if not user_input:
        return

    results = get_results(embeddings_model, user_input, df, num_of_results)

    if not use_llm:
        # Show every hit that was requested: the slider allows up to 25, so
        # truncating the table to 10 rows here would silently ignore it.
        st.write(results[['siman', 'sek', 'text']])
        return

    if not openai_key:
        st.write("诇讗 讛讜讻谞住 诪驻转讞 砖诇 OpenAI")
        return

    chat = get_chat_api(openai_key)
    llm_results = get_llm_results(user_input, chat, results)
    st.write(llm_results)
# Entry point: render the page when this file is executed as the main script.
if __name__ == "__main__":
    run()
|