Ritesh-hf committed on
Commit
1a3b495
1 Parent(s): 34070e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -12
app.py CHANGED
@@ -67,18 +67,18 @@ def initialize_pinecone(index_name: str):
67
 
68
  # Initialize Pinecone index and BM25 encoder
69
  pinecone_index = initialize_pinecone("updated-uae-gov")
70
- bm25 = BM25Encoder().load("./bm25_u.ae.json")
71
 
72
  ##################################################
73
  ##################################################
74
 
75
  # Initialize models and retriever
76
- embed_model = HuggingFaceEmbeddings(model_name="Alibaba-NLP/gte-multilingual-base", model_kwargs={"trust_remote_code":True})
77
  retriever = PineconeHybridSearchRetriever(
78
  embeddings=embed_model,
79
  sparse_encoder=bm25,
80
  index=pinecone_index,
81
- top_k=20,
82
  alpha=0.5,
83
  )
84
 
@@ -110,22 +110,29 @@ history_aware_retriever = create_history_aware_retriever(llm, retriever, context
110
 
111
  # QA system prompt and chain
112
  qa_system_prompt = """ You are a highly skilled information retrieval assistant. Use the following context to answer questions effectively.
113
- If you don't know the answer, simply state that you don't know.
114
- Your answer should be in {language} language.
 
 
115
  When responding to queries, follow these guidelines:
116
  1. Provide Clear Answers:
117
- - Based on the language of the question, you have to answer in that language. E.g., if the question is in English, then answer in English; if the question is in Arabic, you should answer in Arabic.
118
  - Ensure the response directly addresses the query with accurate and relevant information.
119
  - Do not give long answers. Provide detailed but concise responses.
 
120
  2. Formatting for Readability:
121
  - Provide the entire response in proper markdown format.
122
- - Use structured Maekdown elements such as headings, subheading, lists, tables, and links.
123
- - Use emaphsis on headings, important texts and phrases.
 
124
  3. Proper Citations:
125
- - ALWAYS USE INLINE CITATIONS with embed source URLs where users can verify information or explore further.
126
  - The inline citations should be in the format [1], [2], etc.
127
- - DO not inlcude references at the end of response.
128
- FOLLOW ALL THE GIVEN INSTRUCTIONS, FAILURE TO DO SO WILL RESULT IN TERMINATION OF THE CHAT.
 
 
 
129
  {context}
130
  """
131
  qa_prompt = ChatPromptTemplate.from_messages(
@@ -198,10 +205,19 @@ async def websocket_endpoint(websocket: WebSocket):
198
  citations = re.findall(r'\[(\d+)\]', complete_response)
199
  citation_numbers = list(map(int, citations))
200
  sources = dict()
 
 
201
  for index, doc in enumerate(context):
202
  if (index+1) in citation_numbers:
203
  sources[f"[{index+1}]"] = doc.metadata["source"]
204
- await websocket.send_json({'sources': sources})
 
 
 
 
 
 
 
205
 
206
  await stream_response()
207
  except Exception as e:
 
67
 
68
  # Initialize Pinecone index and BM25 encoder
69
  pinecone_index = initialize_pinecone("updated-uae-gov")
70
+ bm25 = BM25Encoder().load("./updated-uae-gov.json")
71
 
72
  ##################################################
73
  ##################################################
74
 
75
  # Initialize models and retriever
76
+ embed_model = HuggingFaceEmbeddings(model_name="jinaai/jina-embeddings-v3", model_kwargs={"trust_remote_code":True})
77
  retriever = PineconeHybridSearchRetriever(
78
  embeddings=embed_model,
79
  sparse_encoder=bm25,
80
  index=pinecone_index,
81
+ top_k=10,
82
  alpha=0.5,
83
  )
84
 
 
110
 
111
  # QA system prompt and chain
112
  qa_system_prompt = """ You are a highly skilled information retrieval assistant. Use the following context to answer questions effectively.
113
+ If you don't know the answer, simply state that you don't know.
114
+
115
+ YOUR ANSWER SHOULD BE IN '{language}' LANGUAGE.
116
+
117
  When responding to queries, follow these guidelines:
118
  1. Provide Clear Answers:
119
+ - You have to answer in that language based on the given language of the answer. If it is English, answer it in English; if it is Arabic, you should answer it in Arabic.
120
  - Ensure the response directly addresses the query with accurate and relevant information.
121
  - Do not give long answers. Provide detailed but concise responses.
122
+
123
  2. Formatting for Readability:
124
  - Provide the entire response in proper markdown format.
125
+ - Use structured Markdown elements such as headings, subheadings, lists, tables, and links.
126
+ - Use emphasis on headings, important texts, and phrases.
127
+
128
  3. Proper Citations:
129
+ - Always use inline citations with embedded source URLs.
130
  - The inline citations should be in the format [1], [2], etc.
131
+ - DO NOT INCLUDE THE 'References' SECTION IN THE RESPONSE.
132
+
133
+ FOLLOW ALL THE GIVEN INSTRUCTIONS, FAILURE TO DO SO WILL RESULT IN THE TERMINATION OF THE CHAT.
134
+
135
+ == CONTEXT ==
136
  {context}
137
  """
138
  qa_prompt = ChatPromptTemplate.from_messages(
 
205
  citations = re.findall(r'\[(\d+)\]', complete_response)
206
  citation_numbers = list(map(int, citations))
207
  sources = dict()
208
+ backup = dict()
209
+ i=1
210
  for index, doc in enumerate(context):
211
  if (index+1) in citation_numbers:
212
  sources[f"[{index+1}]"] = doc.metadata["source"]
213
+ else:
214
+ if doc.metadata["source"] not in backup.values():
215
+ backup[f"[{i}]"] = doc.metadata["source"]
216
+ i += 1
217
+ if sources:
218
+ await websocket.send_json({'sources': sources})
219
+ else:
220
+ await websocket.send_json({'sources': backup})
221
 
222
  await stream_response()
223
  except Exception as e: