sivan22 committed
Commit 3d33fb5
1 Parent(s): b067875

init from PC

Files changed (5)
  1. __init__.py +13 -0
  2. app.py +138 -0
  3. requirements.txt +9 -0
  4. run.bat +2 -0
  5. utils.py +28 -0
__init__.py ADDED
@@ -0,0 +1,13 @@
+ # Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
app.py ADDED
@@ -0,0 +1,138 @@
+ import streamlit as st
+ from streamlit.logger import get_logger
+ import datasets
+ import pandas as pd
+ from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+ from langchain_openai import ChatOpenAI
+ from langchain_core.prompts import PromptTemplate
+ from langchain_core.messages import HumanMessage, SystemMessage
+ from sentence_transformers import util
+ from torch import tensor
+ from io import StringIO
+
+
+ LOGGER = get_logger(__name__)
+
+
+ @st.cache_data
+ def get_df(uploaded_file) -> object:
+     # Read the uploaded text file into a DataFrame with one row per line.
+     if uploaded_file is None:
+         return None
+     stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
+     string_data = stringio.read()
+     df = pd.DataFrame(string_data.split('\n'), columns=['text'])
+     return df
+
+ @st.cache_data
+ def get_embeddings(df, _embeddings_model) -> object:
+     # E5 models expect a "passage: " prefix on indexed documents.
+     df['embeddings'] = df['text'].apply(lambda x: _embeddings_model.embed_query('passage: ' + x))
+     return df
+
+ @st.cache_resource
+ def get_model() -> object:
+     model_name = "intfloat/multilingual-e5-large"
+     model_kwargs = {'device': 'cuda'}  # 'cpu' or 'cuda'
+     encode_kwargs = {'normalize_embeddings': True}
+     embeddings_model = HuggingFaceEmbeddings(
+         model_name=model_name,
+         model_kwargs=model_kwargs,
+         encode_kwargs=encode_kwargs
+     )
+     return embeddings_model
+
+ @st.cache_resource
+ def get_chat_api(api_key: str):
+     chat = ChatOpenAI(model="gpt-3.5-turbo-16k", api_key=api_key)
+     return chat
+
+
+ def get_results(embeddings_model, input, df, num_of_results) -> pd.DataFrame:
+     # E5 models expect a "query: " prefix on search queries.
+     embeddings = embeddings_model.embed_query('query: ' + input)
+     hits = util.semantic_search(tensor(embeddings), tensor(df['embeddings'].tolist()), top_k=num_of_results)
+     hit_list = [hit['corpus_id'] for hit in hits[0]]
+     return df.iloc[hit_list]
+
+ def get_llm_results(query, chat, results):
+
+     prompt_template = PromptTemplate.from_template(
+         """
+         Your mission is to rank the given answers based on their relevance to the given question.
+         Provide a relevancy score between 0 (not relevant) and 1 (highly relevant) for each possible answer.
+         The results should be in the following JSON format: "answer": "score", "answer": "score", where answer is the possible answer's text and score is the relevancy score.
+
+         The question is: {query}
+
+         The possible answers are:
+         {answers}
+         """)
+
+     messages = [
+         SystemMessage(content="""
+             You're a helpful assistant.
+             Return a JSON formatted string.
+             """),
+         HumanMessage(content=prompt_template.format(query=query, answers=str.join('\n', results['text'].head(10).tolist()))),
+     ]
+
+     response = chat.invoke(messages)
+     # Wrap the raw JSON string in StringIO; newer pandas deprecates passing literal JSON to read_json.
+     llm_results_df = pd.read_json(StringIO(response.content), orient='index')
+     llm_results_df.rename(columns={0: 'score'}, inplace=True)
+     llm_results_df.sort_values(by='score', ascending=False, inplace=True)
+     return llm_results_df
+
+
+ def run():
+     st.set_page_config(
+         page_title=" חיפוש סמנטי",  # "Semantic search"
+         page_icon="",
+         layout="wide",
+         initial_sidebar_state="expanded"
+     )
+
+     st.write("# חיפוש חכם ")  # "Smart search"
+     # "You can upload any text file, wait for the index to be built, and then search in free language."
+     st.write('ניתן להעלות כל קובץ טקסט, להמתין ליצירת האינדקס ולאחר מכן לחפש בשפה חופשית')
+     # "Building the index may take a few minutes, depending on the file size."
+     st.write('יצירת האינדקס עשויה לקחת מספר דקות, ותלויה בגודל הקובץ')
+
+     # Streamlit reruns the script automatically on upload, so no on_change callback is needed here.
+     uploaded_file = st.file_uploader('העלה קובץ', type=['txt'])  # "Upload a file"
+
+     embeddings_model = get_model()
+     df = get_df(uploaded_file)
+     if df is None:
+         st.write("לא הועלה קובץ")  # "No file was uploaded"
+     else:
+         df = get_embeddings(df, embeddings_model)
+
+     user_input = st.text_input('כתוב כאן את שאלתך', placeholder='')  # "Write your question here"
+     num_of_results = st.sidebar.slider('מספר התוצאות שברצונך להציג:', 1, 25, 5)  # "Number of results to display:"
+     use_llm = st.sidebar.checkbox("השתמש במודל שפה כדי לשפר תוצאות", False)  # "Use a language model to improve results"
+     openAikey = st.sidebar.text_input("OpenAI API key", type="password")
+
+     if (st.button('חפש') or user_input) and user_input != "" and df is not None:  # "Search"
+
+         results = get_results(embeddings_model, user_input, df, num_of_results)
+
+         if use_llm:
+             if openAikey is None or openAikey == "":
+                 st.write("לא הוכנס מפתח של OpenAI")  # "No OpenAI key was entered"
+             else:
+                 chat = get_chat_api(openAikey)
+                 llm_results = get_llm_results(user_input, chat, results)
+                 st.write(llm_results)
+         else:
+             st.write(results.head(10))
+
+
+ if __name__ == "__main__":
+     run()
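The search flow in app.py (embed each line with a "passage: " prefix, embed the query with a "query: " prefix, then rank with util.semantic_search) can be exercised outside Streamlit. Below is a minimal sketch, assuming the same intfloat/multilingual-e5-large model; the toy corpus and query are illustrative placeholders, not part of the commit.

# Minimal sketch of the embed-and-rank flow used by get_results(); corpus and query are placeholders.
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from sentence_transformers import util
from torch import tensor

embeddings_model = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large",
    model_kwargs={"device": "cpu"},               # app.py uses "cuda"; "cpu" also works
    encode_kwargs={"normalize_embeddings": True},
)

corpus = ["first line of the uploaded file", "second line", "third line"]
corpus_embeddings = [embeddings_model.embed_query("passage: " + t) for t in corpus]

query_embedding = embeddings_model.embed_query("query: which line mentions a file?")
hits = util.semantic_search(tensor(query_embedding), tensor(corpus_embeddings), top_k=2)

# semantic_search returns one hit list per query, each hit a dict with 'corpus_id' and 'score'.
for hit in hits[0]:
    print(corpus[hit["corpus_id"]], hit["score"])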
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ pandas
+ streamlit
+ torch
+ transformers
+ datasets
+ langchain_huggingface
+ langchain_openai
+ langchain
+ sentence_transformers
run.bat ADDED
@@ -0,0 +1,2 @@
+ pip install -r requirements.txt
+ streamlit run app.py
utils.py ADDED
@@ -0,0 +1,28 @@
+ # Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import inspect
+ import textwrap
+
+ import streamlit as st
+
+
+ def show_code(demo):
+     """Showing the code of the demo."""
+     show_code = st.sidebar.checkbox("Show code", True)
+     if show_code:
+         # Showing the code of the demo.
+         st.markdown("## Code")
+         sourcelines, _ = inspect.getsourcelines(demo)
+         st.code(textwrap.dedent("".join(sourcelines[1:])))
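utils.py carries over the stock Streamlit demo helper: show_code() adds a "Show code" checkbox to the sidebar and, when ticked, renders the body of the function it is given. A minimal usage sketch follows; demo_page is an illustrative name, and app.py itself does not import utils.

# Hypothetical usage of utils.show_code(); demo_page is a placeholder page function.
import streamlit as st

from utils import show_code


def demo_page():
    st.write("Hello from the demo")


demo_page()
show_code(demo_page)  # sidebar "Show code" checkbox; prints demo_page's body under a "## Code" heading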