Rajat.bans committed on
Commit
439db48
1 Parent(s): 4f7de21

Updated chat history and corrected requirements file

Browse files
Files changed (2) hide show
  1. rag.py +63 -32
  2. requirements.txt +4 -3
rag.py CHANGED
@@ -1,6 +1,8 @@
1
  from dotenv import load_dotenv
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
  from langchain_community.vectorstores import FAISS
 
 
4
  # from langchain_openai import OpenAIEmbeddings
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  import os
@@ -27,11 +29,12 @@ bestReformulationPrompt = "Given a chat history and the latest user question, wh
27
  bestSystemPrompt = "You're an assistant for question-answering tasks. Under absolutely no circumstances should you use external knowledge or go beyond the provided preknowledge. Your approach must be systematic and meticulous. First, identify CLUES such as keywords, phrases, contextual information, semantic relations, tones, and references that aid in determining the context of the input. Second, construct a concise diagnostic REASONING process (limiting to 130 words) based on premises supporting the INPUT relevance within the provided context. Third, utilizing the identified clues, reasoning, and input, furnish the pertinent answer for the question. Remember, you are required to use ONLY the provided context to answer the questions. If the question does not align with the preknowledge or if the preknowledge is absent, state that you don't know the answer. External knowledge is strictly prohibited. Failure to adhere will result in incorrect answers. The preknowledge is as follows:"
28
 
29
  # embeddings_oa = OpenAIEmbeddings(model=embedding_model_oa)
30
- embeddings_hf = HuggingFaceEmbeddings(model_name = embedding_model_hf, show_progress = True)
 
31
 
32
  def setupDb(data_path):
33
  df = pd.read_csv(data_path, sep="\t")
34
- relevant_content = df["url"].values
35
  text_splitter = RecursiveCharacterTextSplitter(
36
  chunk_size=CHUNK_SIZE,
37
  chunk_overlap=CHUNK_OVERLAP,
@@ -57,11 +60,9 @@ def setupDb(data_path):
57
  )
58
  return db, relevant_content
59
 
 
60
  def reformulate_question(chat_history, latest_question, reformulationPrompt):
61
- system_message = {
62
- "role": "system",
63
- "content": reformulationPrompt
64
- }
65
 
66
  formatted_history = []
67
  for i, chat in enumerate(chat_history):
@@ -73,62 +74,92 @@ def reformulate_question(chat_history, latest_question, reformulationPrompt):
73
  response = client.chat.completions.create(
74
  model="gpt-3.5-turbo",
75
  messages=[system_message] + formatted_history,
76
- temperature=0
77
  )
78
 
79
  reformulated_question = response.choices[0].message.content
80
  return reformulated_question
81
 
 
82
  def getQuestionAnswerOnTheBasisOfContext(question, context, systemPrompt):
83
- system_message = {
84
- "role": "system",
85
- "content": systemPrompt + context
86
- }
87
 
88
  response = client.chat.completions.create(
89
  model=qa_model_name,
90
  messages=[system_message] + [{"role": "user", "content": question}],
91
- temperature=0
92
  )
93
  answer = response.choices[0].message.content
94
  return answer
95
 
96
 
97
- def chatWithRag(reformulationPrompt, QAPrompt, question):
98
- global curr_question_no, chat_history
99
  curr_question_prompt = bestSystemPrompt
100
  if QAPrompt != None or len(QAPrompt):
101
  curr_question_prompt = QAPrompt
102
 
103
  # reformulated_query = reformulate_question(chat_history, question, reformulationPrompt)
104
  reformulated_query = question
105
- retreived_documents = [doc for doc in db.similarity_search_with_score(reformulated_query) if doc[1] < 1.3]
106
- answer = getQuestionAnswerOnTheBasisOfContext(reformulated_query, '. '.join([doc[0].page_content for doc in retreived_documents]), curr_question_prompt)
 
 
 
 
 
 
 
 
107
  chat_history.append((question, answer))
108
- curr_question_no += 1
109
- docs_info = "\n\n".join([
110
- f"Title: {doc[0].metadata['title']}\nUrl: {doc[0].metadata['url']}\nContent: {doc[0].page_content}\nValue: {doc[1]}" for doc in retreived_documents
111
- ])
112
- full_response = f"Answer: {answer}\n\nReformulated question: {reformulated_query}\nRetrieved Documents:\n{docs_info}"
 
 
 
113
  # print(question, full_response)
114
- return full_response
115
 
116
- db, relevant_content = setupDb(data_file_path)
117
- chat_history = []
118
- curr_question_no = 1
119
 
 
120
  with gr.Blocks() as demo:
121
  gr.Markdown("# RAG on webmd")
122
  with gr.Row():
123
- reformulationPrompt = gr.Textbox(bestReformulationPrompt, lines=1, placeholder="Enter the system prompt for reformulation of query", label="Reformulation System prompt")
124
- QAPrompt = gr.Textbox(bestSystemPrompt, lines=1, placeholder="Enter the system prompt for QA.", label="QA System prompt")
125
- question = gr.Textbox(lines=1, placeholder="Enter the question asked", label="Question")
 
 
 
 
 
 
 
 
 
 
 
 
126
  output = gr.Textbox(label="Output")
127
  submit_btn = gr.Button("Submit")
128
- submit_btn.click(chatWithRag, inputs=[reformulationPrompt, QAPrompt, question], outputs=output)
129
- question.submit(chatWithRag, [reformulationPrompt, QAPrompt, question], [output])
 
 
 
 
 
 
 
 
 
 
 
130
  with gr.Accordion("Urls", open=False):
131
- gr.Markdown(', '.join(relevant_content))
132
 
133
  gr.close_all()
134
- demo.launch()
 
1
  from dotenv import load_dotenv
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
  from langchain_community.vectorstores import FAISS
4
+ import random
5
+
6
  # from langchain_openai import OpenAIEmbeddings
7
  from langchain_community.embeddings import HuggingFaceEmbeddings
8
  import os
 
29
  bestSystemPrompt = "You're an assistant for question-answering tasks. Under absolutely no circumstances should you use external knowledge or go beyond the provided preknowledge. Your approach must be systematic and meticulous. First, identify CLUES such as keywords, phrases, contextual information, semantic relations, tones, and references that aid in determining the context of the input. Second, construct a concise diagnostic REASONING process (limiting to 130 words) based on premises supporting the INPUT relevance within the provided context. Third, utilizing the identified clues, reasoning, and input, furnish the pertinent answer for the question. Remember, you are required to use ONLY the provided context to answer the questions. If the question does not align with the preknowledge or if the preknowledge is absent, state that you don't know the answer. External knowledge is strictly prohibited. Failure to adhere will result in incorrect answers. The preknowledge is as follows:"
30
 
31
  # embeddings_oa = OpenAIEmbeddings(model=embedding_model_oa)
32
+ embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf, show_progress=True)
33
+
34
 
35
  def setupDb(data_path):
36
  df = pd.read_csv(data_path, sep="\t")
37
+ relevant_content = list(df["url"].values)
38
  text_splitter = RecursiveCharacterTextSplitter(
39
  chunk_size=CHUNK_SIZE,
40
  chunk_overlap=CHUNK_OVERLAP,
 
60
  )
61
  return db, relevant_content
62
 
63
+
64
  def reformulate_question(chat_history, latest_question, reformulationPrompt):
65
+ system_message = {"role": "system", "content": reformulationPrompt}
 
 
 
66
 
67
  formatted_history = []
68
  for i, chat in enumerate(chat_history):
 
74
  response = client.chat.completions.create(
75
  model="gpt-3.5-turbo",
76
  messages=[system_message] + formatted_history,
77
+ temperature=0,
78
  )
79
 
80
  reformulated_question = response.choices[0].message.content
81
  return reformulated_question
82
 
83
+
84
  def getQuestionAnswerOnTheBasisOfContext(question, context, systemPrompt):
85
+ system_message = {"role": "system", "content": systemPrompt + context}
 
 
 
86
 
87
  response = client.chat.completions.create(
88
  model=qa_model_name,
89
  messages=[system_message] + [{"role": "user", "content": question}],
90
+ temperature=0,
91
  )
92
  answer = response.choices[0].message.content
93
  return answer
94
 
95
 
96
+ def chatWithRag(reformulationPrompt, QAPrompt, question, chat_history):
97
+ global curr_question_no
98
  curr_question_prompt = bestSystemPrompt
99
  if QAPrompt != None or len(QAPrompt):
100
  curr_question_prompt = QAPrompt
101
 
102
  # reformulated_query = reformulate_question(chat_history, question, reformulationPrompt)
103
  reformulated_query = question
104
+ retreived_documents = [
105
+ doc
106
+ for doc in db.similarity_search_with_score(reformulated_query)
107
+ if doc[1] < 1.3
108
+ ]
109
+ answer = getQuestionAnswerOnTheBasisOfContext(
110
+ reformulated_query,
111
+ ". ".join([doc[0].page_content for doc in retreived_documents]),
112
+ curr_question_prompt,
113
+ )
114
  chat_history.append((question, answer))
115
+ docs_info = "\n\n".join(
116
+ [
117
+ f"Title: {doc[0].metadata['title']}\nUrl: {doc[0].metadata['url']}\nContent: {doc[0].page_content}\nValue: {doc[1]}"
118
+ for doc in retreived_documents
119
+ ]
120
+ )
121
+ history_info = "\n\n".join([f"Q: {q}\nA: {a}" for q, a in chat_history])
122
+ full_response = f"Answer: {answer}\n\nReformulated question: {reformulated_query}\nRetrieved Documents:\n{docs_info}\n\nChat History:\n{history_info}"
123
  # print(question, full_response)
124
+ return full_response, chat_history
125
 
 
 
 
126
 
127
+ db, relevant_content = setupDb(data_file_path)
128
  with gr.Blocks() as demo:
129
  gr.Markdown("# RAG on webmd")
130
  with gr.Row():
131
+ reformulationPrompt = gr.Textbox(
132
+ bestReformulationPrompt,
133
+ lines=1,
134
+ placeholder="Enter the system prompt for reformulation of query",
135
+ label="Reformulation System prompt",
136
+ )
137
+ QAPrompt = gr.Textbox(
138
+ bestSystemPrompt,
139
+ lines=1,
140
+ placeholder="Enter the system prompt for QA.",
141
+ label="QA System prompt",
142
+ )
143
+ question = gr.Textbox(
144
+ lines=1, placeholder="Enter the question asked", label="Question"
145
+ )
146
  output = gr.Textbox(label="Output")
147
  submit_btn = gr.Button("Submit")
148
+ selected_urls = random.sample(relevant_content, 100)
149
+
150
+ chat_history = gr.State([])
151
+ submit_btn.click(
152
+ chatWithRag,
153
+ inputs=[reformulationPrompt, QAPrompt, question, chat_history],
154
+ outputs=[output, chat_history],
155
+ )
156
+ question.submit(
157
+ chatWithRag,
158
+ inputs=[reformulationPrompt, QAPrompt, question, chat_history],
159
+ outputs=[output, chat_history],
160
+ )
161
  with gr.Accordion("Urls", open=False):
162
+ gr.Markdown(", ".join(selected_urls))
163
 
164
  gr.close_all()
165
+ demo.launch()
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  gradio
2
  python-dotenv
3
  langchain
4
- langchain_community
5
- langchain_openai
6
- faiss-cpu
 
 
1
  gradio
2
  python-dotenv
3
  langchain
4
+ langchain-community
5
+ langchain-openai
6
+ faiss-cpu
7
+ sentence-transformers