Update app.py
Browse files
app.py
CHANGED
@@ -67,18 +67,18 @@ def initialize_pinecone(index_name: str):
|
|
67 |
|
68 |
# Initialize Pinecone index and BM25 encoder
|
69 |
pinecone_index = initialize_pinecone("updated-uae-gov")
|
70 |
-
bm25 = BM25Encoder().load("./
|
71 |
|
72 |
##################################################
|
73 |
##################################################
|
74 |
|
75 |
# Initialize models and retriever
|
76 |
-
embed_model = HuggingFaceEmbeddings(model_name="
|
77 |
retriever = PineconeHybridSearchRetriever(
|
78 |
embeddings=embed_model,
|
79 |
sparse_encoder=bm25,
|
80 |
index=pinecone_index,
|
81 |
-
top_k=
|
82 |
alpha=0.5,
|
83 |
)
|
84 |
|
@@ -110,22 +110,29 @@ history_aware_retriever = create_history_aware_retriever(llm, retriever, context
|
|
110 |
|
111 |
# QA system prompt and chain
|
112 |
qa_system_prompt = """ You are a highly skilled information retrieval assistant. Use the following context to answer questions effectively.
|
113 |
-
If you don't know the answer, simply state that you don't know.
|
114 |
-
|
|
|
|
|
115 |
When responding to queries, follow these guidelines:
|
116 |
1. Provide Clear Answers:
|
117 |
-
-
|
118 |
- Ensure the response directly addresses the query with accurate and relevant information.
|
119 |
- Do not give long answers. Provide detailed but concise responses.
|
|
|
120 |
2. Formatting for Readability:
|
121 |
- Provide the entire response in proper markdown format.
|
122 |
-
- Use structured
|
123 |
-
- Use
|
|
|
124 |
3. Proper Citations:
|
125 |
-
-
|
126 |
- The inline citations should be in the format [1], [2], etc.
|
127 |
-
- DO
|
128 |
-
|
|
|
|
|
|
|
129 |
{context}
|
130 |
"""
|
131 |
qa_prompt = ChatPromptTemplate.from_messages(
|
@@ -198,10 +205,19 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
198 |
citations = re.findall(r'\[(\d+)\]', complete_response)
|
199 |
citation_numbers = list(map(int, citations))
|
200 |
sources = dict()
|
|
|
|
|
201 |
for index, doc in enumerate(context):
|
202 |
if (index+1) in citation_numbers:
|
203 |
sources[f"[{index+1}]"] = doc.metadata["source"]
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
|
206 |
await stream_response()
|
207 |
except Exception as e:
|
|
|
67 |
|
68 |
# Initialize Pinecone index and BM25 encoder
|
69 |
pinecone_index = initialize_pinecone("updated-uae-gov")
|
70 |
+
bm25 = BM25Encoder().load("./updated-uae-gov.json")
|
71 |
|
72 |
##################################################
|
73 |
##################################################
|
74 |
|
75 |
# Initialize models and retriever
|
76 |
+
embed_model = HuggingFaceEmbeddings(model_name="jinaai/jina-embeddings-v3", model_kwargs={"trust_remote_code":True})
|
77 |
retriever = PineconeHybridSearchRetriever(
|
78 |
embeddings=embed_model,
|
79 |
sparse_encoder=bm25,
|
80 |
index=pinecone_index,
|
81 |
+
top_k=10,
|
82 |
alpha=0.5,
|
83 |
)
|
84 |
|
|
|
110 |
|
111 |
# QA system prompt and chain
|
112 |
qa_system_prompt = """ You are a highly skilled information retrieval assistant. Use the following context to answer questions effectively.
|
113 |
+
If you don't know the answer, simply state that you don't know.
|
114 |
+
|
115 |
+
YOUR ANSWER SHOULD BE IN '{language}' LANGUAGE.
|
116 |
+
|
117 |
When responding to queries, follow these guidelines:
|
118 |
1. Provide Clear Answers:
|
119 |
+
- You have to answer in that language based on the given language of the answer. If it is English, answer it in English; if it is Arabic, you should answer it in Arabic.
|
120 |
- Ensure the response directly addresses the query with accurate and relevant information.
|
121 |
- Do not give long answers. Provide detailed but concise responses.
|
122 |
+
|
123 |
2. Formatting for Readability:
|
124 |
- Provide the entire response in proper markdown format.
|
125 |
+
- Use structured Markdown elements such as headings, subheadings, lists, tables, and links.
|
126 |
+
- Use emphasis on headings, important texts, and phrases.
|
127 |
+
|
128 |
3. Proper Citations:
|
129 |
+
- Always use inline citations with embedded source URLs.
|
130 |
- The inline citations should be in the format [1], [2], etc.
|
131 |
+
- DO NOT INCLUDE THE 'References' SECTION IN THE RESPONSE.
|
132 |
+
|
133 |
+
FOLLOW ALL THE GIVEN INSTRUCTIONS, FAILURE TO DO SO WILL RESULT IN THE TERMINATION OF THE CHAT.
|
134 |
+
|
135 |
+
== CONTEXT ==
|
136 |
{context}
|
137 |
"""
|
138 |
qa_prompt = ChatPromptTemplate.from_messages(
|
|
|
205 |
citations = re.findall(r'\[(\d+)\]', complete_response)
|
206 |
citation_numbers = list(map(int, citations))
|
207 |
sources = dict()
|
208 |
+
backup = dict()
|
209 |
+
i=1
|
210 |
for index, doc in enumerate(context):
|
211 |
if (index+1) in citation_numbers:
|
212 |
sources[f"[{index+1}]"] = doc.metadata["source"]
|
213 |
+
else:
|
214 |
+
if doc.metadata["source"] not in backup.values():
|
215 |
+
backup[f"[{i}]"] = doc.metadata["source"]
|
216 |
+
i += 1
|
217 |
+
if sources:
|
218 |
+
await websocket.send_json({'sources': sources})
|
219 |
+
else:
|
220 |
+
await websocket.send_json({'sources': backup})
|
221 |
|
222 |
await stream_response()
|
223 |
except Exception as e:
|