U.AE-SITE

Running

App Files Community

Ritesh-hf committed on Nov 24, 2024

Commit

1a3b495

•

1 Parent(s): 34070e9

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -12

app.py CHANGED Viewed

@@ -67,18 +67,18 @@ def initialize_pinecone(index_name: str):
 # Initialize Pinecone index and BM25 encoder
 pinecone_index = initialize_pinecone("updated-uae-gov")
-bm25 = BM25Encoder().load("./bm25_u.ae.json")
 ##################################################
 ##################################################
 # Initialize models and retriever
-embed_model = HuggingFaceEmbeddings(model_name="Alibaba-NLP/gte-multilingual-base", model_kwargs={"trust_remote_code":True})
 retriever = PineconeHybridSearchRetriever(
     embeddings=embed_model,
     sparse_encoder=bm25,
     index=pinecone_index,
-    top_k=20,
     alpha=0.5,
 )
@@ -110,22 +110,29 @@ history_aware_retriever = create_history_aware_retriever(llm, retriever, context
 # QA system prompt and chain
 qa_system_prompt = """ You are a highly skilled information retrieval assistant. Use the following context to answer questions effectively.
-If you don't know the answer, simply state that you don't know.
-Your answer should be in {language} language.
 When responding to queries, follow these guidelines:
 1. Provide Clear Answers:
-   - Based on the language of the question, you have to answer in that language. E.g., if the question is in English, then answer in English; if the question is in Arabic, you should answer in Arabic.
    - Ensure the response directly addresses the query with accurate and relevant information.
    - Do not give long answers. Provide detailed but concise responses.
 2. Formatting for Readability:
    - Provide the entire response in proper markdown format.
-   - Use structured Maekdown elements such as headings, subheading, lists, tables, and links.
-   - Use emaphsis on headings, important texts and phrases.
 3. Proper Citations:
-   - ALWAYS USE INLINE CITATIONS with embed source URLs where users can verify information or explore further.
    - The inline citations should be in the format [1], [2], etc.
-   - DO not inlcude references at the end of response.
-FOLLOW ALL THE GIVEN INSTRUCTIONS, FAILURE TO DO SO WILL RESULT IN TERMINATION OF THE CHAT.
 {context}
 """
 qa_prompt = ChatPromptTemplate.from_messages(
@@ -198,10 +205,19 @@ async def websocket_endpoint(websocket: WebSocket):
                         citations = re.findall(r'\[(\d+)\]', complete_response)
                         citation_numbers = list(map(int, citations))
                         sources = dict()
                         for index, doc in enumerate(context):
                             if (index+1) in citation_numbers:
                                 sources[f"[{index+1}]"] = doc.metadata["source"]
-                        await websocket.send_json({'sources': sources})
                 await stream_response()
             except Exception as e:

 # Initialize Pinecone index and BM25 encoder
 pinecone_index = initialize_pinecone("updated-uae-gov")
+bm25 = BM25Encoder().load("./updated-uae-gov.json")
 ##################################################
 ##################################################
 # Initialize models and retriever
+embed_model = HuggingFaceEmbeddings(model_name="jinaai/jina-embeddings-v3", model_kwargs={"trust_remote_code":True})
 retriever = PineconeHybridSearchRetriever(
     embeddings=embed_model,
     sparse_encoder=bm25,
     index=pinecone_index,
+    top_k=10,
     alpha=0.5,
 )
 # QA system prompt and chain
 qa_system_prompt = """ You are a highly skilled information retrieval assistant. Use the following context to answer questions effectively.
+If you don't know the answer, simply state that you don't know.
+YOUR ANSWER SHOULD BE IN '{language}' LANGUAGE.
 When responding to queries, follow these guidelines:
 1. Provide Clear Answers:
+   - You have to answer in that language based on the given language of the answer. If it is English, answer it in English; if it is Arabic, you should answer it in Arabic.
    - Ensure the response directly addresses the query with accurate and relevant information.
    - Do not give long answers. Provide detailed but concise responses.
 2. Formatting for Readability:
    - Provide the entire response in proper markdown format.
+   - Use structured Markdown elements such as headings, subheadings, lists, tables, and links.
+   - Use emphasis on headings, important texts, and phrases.
 3. Proper Citations:
+   - Always use inline citations with embedded source URLs.
    - The inline citations should be in the format [1], [2], etc.
+   - DO NOT INCLUDE THE 'References' SECTION IN THE RESPONSE.
+FOLLOW ALL THE GIVEN INSTRUCTIONS, FAILURE TO DO SO WILL RESULT IN THE TERMINATION OF THE CHAT.
+== CONTEXT ==
 {context}
 """
 qa_prompt = ChatPromptTemplate.from_messages(
                         citations = re.findall(r'\[(\d+)\]', complete_response)
                         citation_numbers = list(map(int, citations))
                         sources = dict()
+                        backup = dict()
+                        i=1
                         for index, doc in enumerate(context):
                             if (index+1) in citation_numbers:
                                 sources[f"[{index+1}]"] = doc.metadata["source"]
+                            else:
+                                if doc.metadata["source"] not in backup.values():
+                                    backup[f"[{i}]"] = doc.metadata["source"]
+                                    i += 1
+                        if sources:
+                            await websocket.send_json({'sources': sources})
+                        else:
+                            await websocket.send_json({'sources': backup})
                 await stream_response()
             except Exception as e: