lazyghost committed on
Commit
4f7de21
0 Parent(s):

initial commit

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY=<YOUR OPENAI KEY>
.gitattributes ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.tsv filter=lfs diff=lfs merge=lfs -text
37
+ data/ filter=lfs diff=lfs merge=lfs -text
38
+ vectorstore/ filter=lfs diff=lfs merge=lfs -text
39
+ documents/ filter=lfs diff=lfs merge=lfs -text
40
+ *.faiss filter=lfs diff=lfs merge=lfs -text
41
+ document/* filter=lfs diff=lfs merge=lfs -text
42
+ *.pdf filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Rajat Bansal
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Publisher Chatbot 50k
3
+ emoji: 📈
4
+ colorFrom: purple
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 4.31.5
8
+ app_file: rag.py
9
+ pinned: false
10
+ ---
11
+
12
+ ## Steps for running the RAG app
13
+ 1. Create a `.env` file in the root folder and add the following environment variable:
14
+ ```
15
+ OPENAI_API_KEY=<YOUR OPENAI KEY>
16
+ ```
17
+ 2. Run the following commands:
18
+ ```
19
+ pip3 install -r requirements.txt
20
+ python3 rag.py
21
+ ```
22
+
23
+
24
+ Neo4j RAG course - https://www.deeplearning.ai/short-courses/knowledge-graphs-rag/
25
+ 1. LangChain documentation, vector store retriever - https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/
26
+ 2. https://medium.com/@shaktikanungo2019/conversational-ai-unveiling-the-first-rag-chatbot-with-langchain-8b9b04ee4b63
27
+ 3. https://medium.com/@vikrambhat2/building-a-rag-system-and-conversational-chatbot-with-custom-data-793e9617a865
28
+ 4. https://abvijaykumar.medium.com/retrieval-augmented-generation-rag-with-llamaindex-1828ef80314c
29
+ 5. https://medium.com/the-ai-forum/implementing-agentic-rag-using-langchain-b22af7f6a3b5
backupcodes/llamaIndexSingleQuery.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index.core import (
2
+ VectorStoreIndex,
3
+ SimpleDirectoryReader,
4
+ StorageContext,
5
+ load_index_from_storage,
6
+ )
7
+ from llama_index.llms.openai import OpenAI
8
+ from llama_index.core.settings import Settings
9
+ import os
10
+ from dotenv import load_dotenv
11
+
12
+ load_dotenv(override=True)
13
+ Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
14
+
15
+ storage_path = "../vectorstore"
16
+ documents_path = "../../documents"
17
+
18
+
19
def initialize():
    """Return the vector index, building and persisting it on first use.

    When ``storage_path`` already exists, the index is reloaded from it.
    Otherwise the files under ``documents_path`` are read, indexed, and the
    resulting index is persisted to ``storage_path``.
    """
    if os.path.exists(storage_path):
        # A persisted index is present: reload it instead of re-indexing.
        saved_context = StorageContext.from_defaults(persist_dir=storage_path)
        return load_index_from_storage(saved_context)

    loaded_docs = SimpleDirectoryReader(documents_path).load_data()
    fresh_index = VectorStoreIndex.from_documents(loaded_docs)
    fresh_index.storage_context.persist(persist_dir=storage_path)
    return fresh_index
28
+ index = initialize()
29
+
30
+ chat_engine = index.as_chat_engine(chat_mode="condense_question", verbose=True)
31
+ response = chat_engine.chat("hi tell me what i can ask you")
32
+ print(response.response)
backupcodes/pdfreader.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ docs = []
2
+ metadata = []
3
+
4
+ # Read PDF documents from the given path
5
+ pdf_docs = [os.path.join(data_path, f) for f in os.listdir(data_path) if f.endswith('.pdf')]
6
+ for pdf_path in pdf_docs:
7
+ with open(pdf_path, "rb") as pdf_file:
8
+ pdf_reader = PyPDF2.PdfReader(pdf_file)
9
+ for index, page in enumerate(pdf_reader.pages):
10
+ doc_page = {
11
+ "title": os.path.basename(pdf_path) + " page " + str(index + 1),
12
+ "content": page.extract_text(),
13
+ }
14
+ docs.append(doc_page)
15
+
16
+ content = [doc["content"] for doc in docs]
17
+ metadata = [{"title": doc["title"]} for doc in docs]
18
+ print("Content and metadata are extracted from the documents")
backupcodes/streamlist_LLAMA.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import streamlit as st
3
+ from dotenv import load_dotenv
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
6
+ from langchain_community.vectorstores import FAISS
7
+ from langchain.memory import ConversationBufferMemory
8
+ from langchain.prompts import PromptTemplate
9
+
10
+ # from langchain_community.llms import llamacpp
11
+ # from langchain_community.embeddings import HuggingFaceEmbeddings
12
+ from langchain_openai import OpenAIEmbeddings
13
+ from langchain_openai import ChatOpenAI
14
+ import os
15
+
16
+ llmtemplate = """[INST]
17
+ As an AI, provide accurate and relevant information based on the provided document. Your responses should adhere to the following guidelines:
18
+ - Answer the question based on the provided documents.
19
+ - Be direct and factual, limited to 50 words and 2-3 sentences. Begin your response without using introductory phrases like yes, no etc.
20
+ - Maintain an ethical and unbiased tone, avoiding harmful or offensive content.
21
+ - Avoid using confirmatory phrases like "Yes, you are correct" or any similar validation in your responses.
22
+ - Do not fabricate information or include questions in your responses.
23
+ - do not prompt to select answers. do not ask me questions
24
+ {question}
25
+
26
+ [/INST]
27
+ """
28
+ prompt_template = """Use the following pieces of context and previous questions and answers to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
29
+
30
+ {context}
31
+
32
+ Previous Q&A: {previous_qa}
33
+
34
+ Question: {question}
35
+ Helpful Answer:"""
36
+
37
+ # PDF_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
38
+ # LLAMA_MODEL_PATH = "llama-2-7b-chat.Q4_K_M.gguf"
39
+ DB_FAISS_PATH = "../vectorstore/db_faiss"
40
+ CHUNK_SIZE = 512
41
+ CHUNK_OVERLAP = 256
42
+ SIMILARITY_THRESHOLD = 0.5
43
+
44
+
45
def prepare_db(pdf_docs):
    """Build the FAISS index from uploaded PDFs, or load it if already saved.

    Args:
        pdf_docs: Iterable of uploaded PDF file objects. Each one is passed to
            ``PyPDF2.PdfReader`` and must expose a ``name`` attribute, which
            is used to build the per-page title metadata.

    Returns:
        The FAISS vector store: loaded from ``DB_FAISS_PATH`` when that path
        exists, otherwise built from ``pdf_docs`` and saved there.
    """
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    if os.path.exists(DB_FAISS_PATH):
        print("Db already exists")
        # Load from DB_FAISS_PATH, the same location save_local() writes to,
        # so a freshly built index is actually found on the next run.
        # NOTE(review): load_local unpickles data from disk; only point
        # DB_FAISS_PATH at index files this application wrote itself.
        return FAISS.load_local(
            DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True
        )

    content = []
    metadata = []
    for pdf in pdf_docs:
        pdf_reader = PyPDF2.PdfReader(pdf)
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            # Guard against a falsy extraction result so the splitter always
            # receives a string and content/metadata stay aligned.
            content.append(page.extract_text() or "")
            metadata.append({"title": pdf.name + " page " + str(page_number)})
    print("Content and metadata are extracted from the documents")

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    split_docs = text_splitter.create_documents(content, metadatas=metadata)
    print(f"Documents are split into {len(split_docs)} passages")

    db = FAISS.from_documents(split_docs, embeddings)
    db.save_local(DB_FAISS_PATH)
    # Report success only after the index has actually been written.
    print("Document saved in db")
    return db
81
+
82
+
83
def get_conversation_chain(vectordb):
    """Create a conversational retrieval chain backed by ``vectordb``.

    Args:
        vectordb: Vector store whose ``as_retriever()`` supplies the documents
            for each question.

    Returns:
        A ``ConversationalRetrievalChain`` that uses the ``gpt-3.5-turbo``
        chat model, ``llmtemplate`` as the condense-question prompt, a
        buffer memory stored under ``chat_history``, and returns source
        documents alongside the answer.
    """
    chat_model = ChatOpenAI(model="gpt-3.5-turbo")
    doc_retriever = vectordb.as_retriever()
    condense_prompt = PromptTemplate.from_template(llmtemplate)
    # output_key tells the memory which chain output to record, since the
    # chain also returns the source documents.
    history = ConversationBufferMemory(
        memory_key="chat_history", return_messages=True, output_key="answer"
    )

    chain = ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=doc_retriever,
        condense_question_prompt=condense_prompt,
        memory=history,
        return_source_documents=True,
    )
    print("Conversation chain created")
    return chain
104
+
105
+
106
+ # def validate_answer_against_sources(response_answer, source_documents):
107
+ # model = SentenceTransformer(PDF_MODEL_NAME)
108
+ # source_texts = [doc.page_content for doc in source_documents]
109
+ # answer_embedding = model.encode(response_answer, convert_to_tensor=True)
110
+ # source_embeddings = model.encode(source_texts, convert_to_tensor=True)
111
+ # cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)
112
+ # if any(score.item() > SIMILARITY_THRESHOLD for score in cosine_scores[0]):
113
+ # return True
114
+ # return False
115
+
116
+
117
def handle_userinput(user_question):
    """Send a question to the conversation chain and render the chat so far.

    Args:
        user_question: Text the user typed into the question box.

    If no conversation chain has been created yet (no documents processed),
    a warning is shown and nothing is sent. Otherwise the chain's returned
    ``chat_history`` is stored in the session state and every message is
    written out, even-indexed messages in blue and odd-indexed ones in green.
    """
    import html  # local import: only needed here to escape rendered text

    chain = st.session_state.conversation
    if chain is None:
        # main() initialises the chain to None until "Process" is clicked;
        # calling None would raise a TypeError.
        st.warning("Please upload your PDFs and click 'Process' first.")
        return

    response = chain({"question": user_question})
    st.session_state.chat_history = response["chat_history"]

    for i, message in enumerate(st.session_state.chat_history):
        colour = "green" if i % 2 != 0 else "blue"
        # The text is rendered with unsafe_allow_html, so escape it to stop
        # user- or model-supplied markup from being interpreted as HTML.
        text = html.escape(str(i) + ': ' + message.content)
        st.write(
            f"<div style='color: {colour};'>{text}</div>",
            unsafe_allow_html=True,
        )
129
+
130
+
131
def main():
    """Run the Streamlit page: question box plus a PDF upload sidebar.

    Loads environment variables, seeds the session state, forwards any
    entered question to ``handle_userinput``, and rebuilds the conversation
    chain from the uploaded PDFs when "Process" is clicked.
    """
    load_dotenv(override=True)
    st.set_page_config(page_title="Chat with your PDFs", page_icon=":books:")

    # Seed the session keys once; later reruns keep the existing values.
    for key, initial in (("conversation", None), ("chat_history", [])):
        if key not in st.session_state:
            st.session_state[key] = initial

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
        )
        process_clicked = st.button("Process")
        if process_clicked:
            with st.spinner("Processing"):
                vectorstore = prepare_db(pdf_docs)
                st.session_state.conversation = get_conversation_chain(vectorstore)
153
+ if __name__ == "__main__":
154
+ main()
data/131_webmd_vogon_sample1000_urlsContent_cleaned.tsv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ece612a60d11e19ca95d1ae6af58e6c968d98b014459c8afbd298b57afae4cf4
3
+ size 157522
data/132_webmd_vogon_urlsContent_cleaned.tsv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb0b5985d1bc8610d28603f556c91194c2f755939dc235bdf5d3b77524f10b05
3
+ size 197658055
documents/ADD and ADHD (Attention Deficit Hyperactivity Disorder) Health Center.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:872fa6d12d05e40655b03de827150868f2233828843e5363e89fcfdb2dd66c57
3
+ size 2362135
documents/Media.net — WebMD - Better information. Better health..pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bec7051b94443c051b75f1aade0904b3d413c7ca11420787b93d152512dd7487
3
+ size 5966263
documents/WebMD Allergies Health Center - Find allergy information and latest health news.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d1591c0ddd6b184c3cfc4514d5613017a7a9054dbc7021ee4777a8a9eca0608
3
+ size 1367936
documents/WebMD Arthritis and Joint Pain Center: Symptoms, Causes, Tests, and Treatments.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a9c402c5f7c8d8ae0c02dc6c36426d1cf2ed538c5d8ae3e41203e1fed4d095c
3
+ size 672808
documents/WebMD Health News Center - The latest breaking health news and alerts.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:119e2f7dd9de99275ac13d7ba06f7984dc43c7b0e44006360ef784e3b44076e7
3
+ size 1921177
rag.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain_community.vectorstores import FAISS
4
+ # from langchain_openai import OpenAIEmbeddings
5
+ from langchain_community.embeddings import HuggingFaceEmbeddings
6
+ import os
7
+ import pandas as pd
8
+
9
+ import gradio as gr
10
+ from openai import OpenAI
11
+
12
+ load_dotenv(override=True)
13
+ client = OpenAI()
14
+ DB_FAISS_PATH = "./vectorstore/db_faiss_50k"
15
+ data_file_path = "./data/132_webmd_vogon_urlsContent_cleaned.tsv"
16
+
17
+ # DB_FAISS_PATH = "./vectorstore/db_faiss_10"
18
+ # data_file_path = "./data/131_webmd_vogon_sample1000_urlsContent_cleaned.tsv"
19
+
20
+ CHUNK_SIZE = 512
21
+ CHUNK_OVERLAP = 128
22
+ # embedding_model_oa = "text-embedding-3-small"
23
+ embedding_model_hf = "BAAI/bge-m3"
24
+ # embedding_model_hf = "sentence-transformers/all-mpnet-base-v2"
25
+ qa_model_name = "gpt-3.5-turbo"
26
+ bestReformulationPrompt = "Given a chat history and the latest user question, which may reference context from the chat history, you must formulate a standalone question that can be understood without the chat history. You are strictly forbidden from using any outside knowledge. Do not, under any circumstances, answer the question. Reformulate it if necessary; otherwise, return it as is."
27
+ bestSystemPrompt = "You're an assistant for question-answering tasks. Under absolutely no circumstances should you use external knowledge or go beyond the provided preknowledge. Your approach must be systematic and meticulous. First, identify CLUES such as keywords, phrases, contextual information, semantic relations, tones, and references that aid in determining the context of the input. Second, construct a concise diagnostic REASONING process (limiting to 130 words) based on premises supporting the INPUT relevance within the provided context. Third, utilizing the identified clues, reasoning, and input, furnish the pertinent answer for the question. Remember, you are required to use ONLY the provided context to answer the questions. If the question does not align with the preknowledge or if the preknowledge is absent, state that you don't know the answer. External knowledge is strictly prohibited. Failure to adhere will result in incorrect answers. The preknowledge is as follows:"
28
+
29
+ # embeddings_oa = OpenAIEmbeddings(model=embedding_model_oa)
30
+ embeddings_hf = HuggingFaceEmbeddings(model_name = embedding_model_hf, show_progress = True)
31
+
32
def setupDb(data_path):
    """Load the FAISS index for the dataset, building it on first use.

    Args:
        data_path: Path to a tab-separated file with at least the columns
            ``url``, ``url_title`` and ``url_content``.

    Returns:
        A ``(db, relevant_content)`` tuple: the FAISS vector store and the
        array of values from the ``url`` column.
    """
    df = pd.read_csv(data_path, sep="\t")
    relevant_content = df["url"].values

    if os.path.exists(DB_FAISS_PATH):
        print("Db already exists")
        # NOTE(review): load_local unpickles data from disk; only point
        # DB_FAISS_PATH at index files this application wrote itself.
        db = FAISS.load_local(
            DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True
        )
        return db, relevant_content

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    # zip over the two columns instead of iterrows(): same pairs, without
    # building a Series object per row.
    metadatas = [
        {"title": title, "url": url}
        for title, url in zip(df["url_title"], df["url"])
    ]
    split_docs = text_splitter.create_documents(
        df["url_content"].tolist(),
        metadatas=metadatas,
    )
    print(f"Documents are split into {len(split_docs)} passages")

    db = FAISS.from_documents(split_docs, embeddings_hf)
    # Save to the exact directory that is checked and loaded above; saving
    # to a sub-directory would make the next run find the folder but fail
    # to load an index from it.
    db.save_local(DB_FAISS_PATH)
    print("Document saved in db")
    return db, relevant_content
59
+
60
def reformulate_question(chat_history, latest_question, reformulationPrompt):
    """Ask the chat model to rewrite the latest question as a standalone one.

    Args:
        chat_history: Sequence of past turns; element 0 of each turn is sent
            as the user message and element 1 as the assistant message.
        latest_question: The newest user question, appended after the history.
        reformulationPrompt: System prompt that instructs the model.

    Returns:
        The message content of the model's first choice.
    """
    messages = [{"role": "system", "content": reformulationPrompt}]
    for turn in chat_history:
        messages.extend(
            (
                {"role": "user", "content": turn[0]},
                {"role": "assistant", "content": turn[1]},
            )
        )
    messages.append({"role": "user", "content": latest_question})

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0
    )
    return completion.choices[0].message.content
81
+
82
def getQuestionAnswerOnTheBasisOfContext(question, context, systemPrompt):
    """Answer ``question`` with the chat model, given retrieved ``context``.

    Args:
        question: The user question, sent as the single user message.
        context: Retrieved text, appended directly to ``systemPrompt``.
        systemPrompt: Instruction text that precedes the context.

    Returns:
        The message content of the model's first choice.
    """
    messages = [
        {"role": "system", "content": systemPrompt + context},
        {"role": "user", "content": question},
    ]
    completion = client.chat.completions.create(
        model=qa_model_name,
        messages=messages,
        temperature=0
    )
    return completion.choices[0].message.content
95
+
96
+
97
def chatWithRag(reformulationPrompt, QAPrompt, question):
    """Answer a question from retrieved documents and format a full report.

    Args:
        reformulationPrompt: System prompt for query reformulation. Currently
            unused because reformulation is disabled below.
        QAPrompt: System prompt for answering; when ``None`` or empty the
            default ``bestSystemPrompt`` is used instead.
        question: The user's question.

    Returns:
        A string containing the answer, the query that was searched, and the
        title, URL, content and score of each retrieved document.

    Side effects:
        Appends ``(question, answer)`` to the module-level ``chat_history``
        and increments ``curr_question_no``.
    """
    global curr_question_no, chat_history

    # Fall back to the default prompt for both None and "". The previous
    # check (`!= None or len(...)`) raised on None and accepted "".
    curr_question_prompt = QAPrompt if QAPrompt else bestSystemPrompt

    # Reformulation is disabled: the raw question is searched as-is.
    # reformulated_query = reformulate_question(chat_history, question, reformulationPrompt)
    reformulated_query = question

    # Keep only hits scoring below 1.3.
    # NOTE(review): presumably the score is a distance (lower = closer), as
    # with FAISS's default L2 metric; confirm before tuning the cut-off.
    retreived_documents = [
        (doc, score)
        for doc, score in db.similarity_search_with_score(reformulated_query)
        if score < 1.3
    ]

    context = '. '.join(doc.page_content for doc, _ in retreived_documents)
    answer = getQuestionAnswerOnTheBasisOfContext(
        reformulated_query, context, curr_question_prompt
    )
    chat_history.append((question, answer))
    curr_question_no += 1

    docs_info = "\n\n".join(
        f"Title: {doc.metadata['title']}\nUrl: {doc.metadata['url']}\nContent: {doc.page_content}\nValue: {score}"
        for doc, score in retreived_documents
    )
    return f"Answer: {answer}\n\nReformulated question: {reformulated_query}\nRetrieved Documents:\n{docs_info}"
115
+
116
+ db, relevant_content = setupDb(data_file_path)
117
+ chat_history = []
118
+ curr_question_no = 1
119
+
120
+ with gr.Blocks() as demo:
121
+ gr.Markdown("# RAG on webmd")
122
+ with gr.Row():
123
+ reformulationPrompt = gr.Textbox(bestReformulationPrompt, lines=1, placeholder="Enter the system prompt for reformulation of query", label="Reformulation System prompt")
124
+ QAPrompt = gr.Textbox(bestSystemPrompt, lines=1, placeholder="Enter the system prompt for QA.", label="QA System prompt")
125
+ question = gr.Textbox(lines=1, placeholder="Enter the question asked", label="Question")
126
+ output = gr.Textbox(label="Output")
127
+ submit_btn = gr.Button("Submit")
128
+ submit_btn.click(chatWithRag, inputs=[reformulationPrompt, QAPrompt, question], outputs=output)
129
+ question.submit(chatWithRag, [reformulationPrompt, QAPrompt, question], [output])
130
+ with gr.Accordion("Urls", open=False):
131
+ gr.Markdown(', '.join(relevant_content))
132
+
133
+ gr.close_all()
134
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ python-dotenv
3
+ langchain
4
+ langchain_community
5
+ langchain_openai
6
+ faiss-cpu
vectorstore/.DS_Store ADDED
Binary file (6.15 kB). View file
 
vectorstore/db_faiss_10/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e15f7b6feed6be9100fe75f074fa861e6d80fec0b10ac60f902c6b0980aa280
3
+ size 3704877
vectorstore/db_faiss_10/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f4c9e2ecd619a27680b0ddede61f3745ee764284fab71ad0f95a83487f699f6
3
+ size 372279
vectorstore/db_faiss_50k/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7454ff939aed934f1a9741bda8d9d4ad0962c1d6c22737c5bd9d98f3a91b25e
3
+ size 2059517997
vectorstore/db_faiss_50k/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62959ed49b05c3081c612a3b951c427238098bafc7b7ad74da67e61da6af1167
3
+ size 321147051