LOUIS SANNA committed
Commit 6e28a81
Parent: 35c9187
climateqa/chains.py CHANGED
@@ -3,20 +3,20 @@
 import json
 
 from langchain import PromptTemplate, LLMChain
-from langchain.chains import RetrievalQAWithSourcesChain,QAWithSourcesChain
+from langchain.chains import RetrievalQAWithSourcesChain, QAWithSourcesChain
 from langchain.chains import TransformChain, SequentialChain
 from langchain.chains.qa_with_sources import load_qa_with_sources_chain
 
-from climateqa.prompts import answer_prompt, reformulation_prompt,audience_prompts
+from climateqa.prompts import answer_prompt, reformulation_prompt, audience_prompts
 from climateqa.custom_retrieval_chain import CustomRetrievalQAWithSourcesChain
 
-def load_reformulation_chain(llm):
-
+
+def load_reformulation_chain(llm):
     prompt = PromptTemplate(
-        template = reformulation_prompt,
+        template=reformulation_prompt,
         input_variables=["query"],
     )
-    reformulation_chain = LLMChain(llm = llm,prompt = prompt,output_key="json")
+    reformulation_chain = LLMChain(llm=llm, prompt=prompt, output_key="json")
 
     # Parse the output
     def parse_output(output):
@@ -28,20 +28,30 @@ def load_reformulation_chain(llm):
             "question": question,
             "language": language,
         }
 
     transform_chain = TransformChain(
-        input_variables=["json"], output_variables=["question","language"], transform=parse_output
+        input_variables=["json"],
+        output_variables=["question", "language"],
+        transform=parse_output,
     )
 
-    reformulation_chain = SequentialChain(chains = [reformulation_chain,transform_chain],input_variables=["query"],output_variables=["question","language"])
+    reformulation_chain = SequentialChain(
+        chains=[reformulation_chain, transform_chain],
+        input_variables=["query"],
+        output_variables=["question", "language"],
+    )
     return reformulation_chain
 
 
 def load_combine_documents_chain(llm):
-    prompt = PromptTemplate(template=answer_prompt, input_variables=["summaries", "question","audience","language"])
-    qa_chain = load_qa_with_sources_chain(llm, chain_type="stuff",prompt = prompt)
+    prompt = PromptTemplate(
+        template=answer_prompt,
+        input_variables=["summaries", "question", "audience", "language"],
+    )
+    qa_chain = load_qa_with_sources_chain(llm, chain_type="stuff", prompt=prompt)
     return qa_chain
 
+
 def load_qa_chain_with_docs(llm):
     """Load a QA chain with documents.
     Useful when you already have retrieved docs
@@ -60,50 +70,47 @@ def load_qa_chain_with_docs(llm):
 
     qa_chain = load_combine_documents_chain(llm)
     chain = QAWithSourcesChain(
-        input_docs_key = "docs",
-        combine_documents_chain = qa_chain,
-        return_source_documents = True,
+        input_docs_key="docs",
+        combine_documents_chain=qa_chain,
+        return_source_documents=True,
     )
     return chain
 
 
 def load_qa_chain_with_text(llm):
-
     prompt = PromptTemplate(
-        template = answer_prompt,
-        input_variables=["question","audience","language","summaries"],
+        template=answer_prompt,
+        input_variables=["question", "audience", "language", "summaries"],
     )
-    qa_chain = LLMChain(llm = llm,prompt = prompt)
+    qa_chain = LLMChain(llm=llm, prompt=prompt)
     return qa_chain
 
 
-def load_qa_chain_with_retriever(retriever,llm):
+def load_qa_chain_with_retriever(retriever, llm):
     qa_chain = load_combine_documents_chain(llm)
 
     # This could be improved by providing a document prompt to avoid modifying page_content in the docs
     # See here https://github.com/langchain-ai/langchain/issues/3523
 
     answer_chain = CustomRetrievalQAWithSourcesChain(
-        combine_documents_chain = qa_chain,
+        combine_documents_chain=qa_chain,
         retriever=retriever,
-        return_source_documents = True,
-        verbose = True,
+        return_source_documents=True,
+        verbose=True,
         fallback_answer="**⚠️ No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate issues).**",
     )
     return answer_chain
 
 
-def load_climateqa_chain(retriever,llm_reformulation,llm_answer):
-
+def load_climateqa_chain(retriever, llm_reformulation, llm_answer):
     reformulation_chain = load_reformulation_chain(llm_reformulation)
-    answer_chain = load_qa_chain_with_retriever(retriever,llm_answer)
+    answer_chain = load_qa_chain_with_retriever(retriever, llm_answer)
 
     climateqa_chain = SequentialChain(
-        chains = [reformulation_chain,answer_chain],
-        input_variables=["query","audience"],
-        output_variables=["answer","question","language","source_documents"],
-        return_all = True,
-        verbose = True,
+        chains=[reformulation_chain, answer_chain],
+        input_variables=["query", "audience"],
+        output_variables=["answer", "question", "language", "source_documents"],
+        return_all=True,
+        verbose=True,
    )
    return climateqa_chain
-
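For orientation: load_climateqa_chain composes the two stages end to end, the reformulation chain turning a raw user query into a standalone question plus detected language, and the retrieval QA chain answering it from retrieved passages. A minimal usage sketch, assuming the helpers from the other modules touched in this commit (get_llm, get_pinecone_vectorstore, ClimateQARetriever) and a populated index:

    from langchain.embeddings import HuggingFaceEmbeddings
    from climateqa.llm import get_llm
    from climateqa.vectorstore import get_pinecone_vectorstore
    from climateqa.retriever import ClimateQARetriever
    from climateqa.chains import load_climateqa_chain

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1"
    )
    vectorstore = get_pinecone_vectorstore(embeddings)
    retriever = ClimateQARetriever(vectorstore=vectorstore)
    llm = get_llm()

    chain = load_climateqa_chain(retriever, llm_reformulation=llm, llm_answer=llm)
    result = chain({"query": "Is sea level rise accelerating?", "audience": "general"})
    # With return_all=True, result exposes answer, question, language
    # and source_documents alongside the inputs.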
 
climateqa/chat.py CHANGED
@@ -12,28 +12,31 @@ from climateqa.chains import load_climateqa_chain
 
 
 class ClimateQA:
-    def __init__(self,hf_embedding_model = "sentence-transformers/multi-qa-mpnet-base-dot-v1",
-        show_progress_bar = False,batch_size = 1,max_tokens = 1024,**kwargs):
-
-        self.llm = self.get_llm(max_tokens = max_tokens,**kwargs)
+    def __init__(
+        self,
+        hf_embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
+        show_progress_bar=False,
+        batch_size=1,
+        max_tokens=1024,
+        **kwargs
+    ):
+        self.llm = self.get_llm(max_tokens=max_tokens, **kwargs)
         self.embeddings_function = HuggingFaceEmbeddings(
             model_name=hf_embedding_model,
-            encode_kwargs={"show_progress_bar":show_progress_bar,"batch_size":batch_size}
+            encode_kwargs={
+                "show_progress_bar": show_progress_bar,
+                "batch_size": batch_size,
+            },
         )
 
-
-
     def get_vectorstore(self):
         pass
 
-
     def reformulate(self):
         pass
 
-
     def retrieve(self):
         pass
 
-
     def ask(self):
-        pass
+        pass
climateqa/custom_retrieval_chain.py CHANGED
@@ -29,11 +29,11 @@ from langchain.chains import RetrievalQAWithSourcesChain
 
 from langchain.chains.router.llm_router import LLMRouterChain
 
-class CustomRetrievalQAWithSourcesChain(RetrievalQAWithSourcesChain):
-
-    fallback_answer:str = "No sources available to answer this question."
+
+class CustomRetrievalQAWithSourcesChain(RetrievalQAWithSourcesChain):
+    fallback_answer: str = "No sources available to answer this question."
 
-    def _call(self,inputs,run_manager=None):
+    def _call(self, inputs, run_manager=None):
         _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
         accepts_run_manager = (
             "run_manager" in inspect.signature(self._get_docs).parameters
@@ -43,12 +43,10 @@ class CustomRetrievalQAWithSourcesChain(RetrievalQAWithSourcesChain):
         else:
             docs = self._get_docs(inputs)  # type: ignore[call-arg]
 
-
         if len(docs) == 0:
             answer = self.fallback_answer
             sources = []
         else:
-
             answer = self.combine_documents_chain.run(
                 input_documents=docs, callbacks=_run_manager.get_child(), **inputs
             )
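The only behavioral change relative to the stock RetrievalQAWithSourcesChain is the empty-retrieval branch: when the retriever returns no documents, the chain short-circuits with fallback_answer and an empty sources list instead of running the combine-documents chain. A sketch of how that surfaces, where qa_chain and empty_retriever are hypothetical stand-ins:

    from climateqa.custom_retrieval_chain import CustomRetrievalQAWithSourcesChain

    chain = CustomRetrievalQAWithSourcesChain(
        combine_documents_chain=qa_chain,  # e.g. load_combine_documents_chain(llm)
        retriever=empty_retriever,         # hypothetical: retrieves zero documents
        fallback_answer="No sources available to answer this question.",
    )
    result = chain({"question": "..."})
    # result["answer"] == "No sources available to answer this question."
    # result["sources"] == []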
climateqa/llm.py CHANGED
@@ -1,25 +1,26 @@
 from langchain.chat_models import AzureChatOpenAI
 import os
+
 # LOAD ENVIRONMENT VARIABLES
 try:
     from dotenv import load_dotenv
+
     load_dotenv()
 except:
     pass
 
 
-def get_llm(max_tokens = 1024,temperature = 0.0,verbose = True,streaming = False, **kwargs):
-
+def get_llm(max_tokens=1024, temperature=0.0, verbose=True, streaming=False, **kwargs):
     llm = AzureChatOpenAI(
         openai_api_base=os.environ["AZURE_OPENAI_API_BASE_URL"],
         openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
         deployment_name=os.environ["AZURE_OPENAI_API_DEPLOYMENT_NAME"],
         openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
-        openai_api_type = "azure",
-        max_tokens = max_tokens,
-        temperature = temperature,
-        verbose = verbose,
-        streaming = streaming,
+        openai_api_type="azure",
+        max_tokens=max_tokens,
+        temperature=temperature,
+        verbose=verbose,
+        streaming=streaming,
         **kwargs,
     )
     return llm
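get_llm pulls all of its Azure OpenAI configuration from the environment, optionally loaded from a .env file by python-dotenv. A sketch of the expected setup, with placeholder values to be replaced by your own resource's details:

    import os

    # Placeholders; find these in your Azure OpenAI resource.
    os.environ["AZURE_OPENAI_API_BASE_URL"] = "https://<resource>.openai.azure.com/"
    os.environ["AZURE_OPENAI_API_VERSION"] = "<api-version>"
    os.environ["AZURE_OPENAI_API_DEPLOYMENT_NAME"] = "<deployment-name>"
    os.environ["AZURE_OPENAI_API_KEY"] = "<key>"

    from climateqa.llm import get_llm

    llm = get_llm(max_tokens=512, temperature=0.0, streaming=False)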
climateqa/logging.py CHANGED
@@ -53,6 +53,7 @@ def get_azure_blob_client():
     share_client = service.get_share_client(file_share_name)
     return share_client
 
+
 if has_blob_config():
     share_client = get_azure_blob_client()
 
climateqa/prompts.py CHANGED
@@ -1,4 +1,3 @@
-
 # If the message is not relevant to climate change (like "How are you", "I am 18 years old" or "When was built the eiffel tower"), return N/A
 
 reformulation_prompt = """
@@ -54,4 +53,4 @@ audience_prompts = {
     "children": "6 year old children that don't know anything about science and climate change and need metaphors to learn",
     "general": "the general public who know the basics in science and climate change and want to learn more about it without technical terms. Still use references to passages.",
     "experts": "expert and climate scientists that are not afraid of technical terms",
-}
+}
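The body of reformulation_prompt is elided in this diff, but parse_output in chains.py reads the chain's "json" output and returns question and language keys, so the prompt presumably asks the model to reply with a JSON object of that shape. A hypothetical input/output pair for illustration:

    # Hypothetical contract of the reformulation step (exact wording lives in
    # reformulation_prompt, which this hunk does not show):
    #   user query : "c'est quoi la montée des eaux ?"
    #   LLM output : {"question": "What is sea level rise?", "language": "French"}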
climateqa/retriever.py CHANGED
@@ -9,41 +9,48 @@ from langchain.callbacks.manager import CallbackManagerForRetrieverRun
 from typing import List
 from pydantic import Field
 
+
 class ClimateQARetriever(BaseRetriever):
-    vectorstore:VectorStore
-    sources:list = ["IPCC","IPBES"]
-    threshold:float = 22
-    k_summary:int = 3
-    k_total:int = 10
-    namespace:str = "vectors"
+    vectorstore: VectorStore
+    sources: list = ["IPCC", "IPBES"]
+    threshold: float = 22
+    k_summary: int = 3
+    k_total: int = 10
+    namespace: str = "vectors"
 
     def get_relevant_documents(self, query: str) -> List[Document]:
-
         # Check if all elements in the list are either IPCC or IPBES
-        assert isinstance(self.sources,list)
-        assert all([x in ["IPCC","IPBES"] for x in self.sources])
+        assert isinstance(self.sources, list)
+        assert all([x in ["IPCC", "IPBES"] for x in self.sources])
         assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
 
         # Prepare base search kwargs
         filters = {
-            "source": { "$in":self.sources},
+            "source": {"$in": self.sources},
         }
 
         # Search for k_summary documents in the summaries dataset
         filters_summaries = {
             **filters,
-            "report_type": { "$in":["SPM","TS"]},
+            "report_type": {"$in": ["SPM", "TS"]},
         }
-        docs_summaries = self.vectorstore.similarity_search_with_score(query=query,namespace = self.namespace,filter = filters_summaries,k = self.k_summary)
+        docs_summaries = self.vectorstore.similarity_search_with_score(
+            query=query,
+            namespace=self.namespace,
+            filter=filters_summaries,
+            k=self.k_summary,
+        )
         docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
 
         # Search for k_total - k_summary documents in the full reports dataset
         filters_full = {
             **filters,
-            "report_type": { "$nin":["SPM","TS"]},
+            "report_type": {"$nin": ["SPM", "TS"]},
         }
         k_full = self.k_total - len(docs_summaries)
-        docs_full = self.vectorstore.similarity_search_with_score(query=query,namespace = self.namespace,filter = filters_full,k = k_full)
+        docs_full = self.vectorstore.similarity_search_with_score(
+            query=query, namespace=self.namespace, filter=filters_full, k=k_full
+        )
 
         # Concatenate documents
         docs = docs_summaries + docs_full
@@ -53,19 +60,18 @@ class ClimateQARetriever(BaseRetriever):
 
         # Add score to metadata
         results = []
-        for i,(doc,score) in enumerate(docs):
+        for i, (doc, score) in enumerate(docs):
             doc.metadata["similarity_score"] = score
             doc.metadata["content"] = doc.page_content
             doc.metadata["page_number"] = int(doc.metadata["page_number"])
-            doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
+            doc.page_content = (
+                f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
+            )
             results.append(doc)
 
         return results
 
 
-
-
-
 # def filter_summaries(df,k_summary = 3,k_total = 10):
 # # assert source in ["IPCC","IPBES","ALL"], "source arg should be in (IPCC,IPBES,ALL)"
 
@@ -92,8 +98,6 @@ class ClimateQARetriever(BaseRetriever):
 # return passages
 
 
-
-
 # def retrieve_with_summaries(query,retriever,k_summary = 3,k_total = 10,sources = ["IPCC","IPBES"],max_k = 100,threshold = 0.555,as_dict = True,min_length = 300):
 #     assert max_k > k_total
 
@@ -125,7 +129,6 @@ class ClimateQARetriever(BaseRetriever):
 # return passages_df
 
 
-
 # def retrieve(query,sources = ["IPCC"],threshold = 0.555,k = 10):
 
 
@@ -146,4 +149,3 @@ class ClimateQARetriever(BaseRetriever):
 # "prompts":{"init_prompt":init_prompt,"sources_prompt":sources_prompt},
 # }
 # return response
-
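In short, retrieval is two-staged: up to k_summary passages come from the summary reports (report_type SPM or TS) and are kept only above the similarity threshold, then the remaining slots up to k_total are filled from the full reports. A usage sketch, assuming a populated Pinecone index from vectorstore.py below:

    from climateqa.retriever import ClimateQARetriever

    retriever = ClimateQARetriever(
        vectorstore=vectorstore,  # e.g. get_pinecone_vectorstore(embeddings)
        sources=["IPCC"],         # only IPCC and/or IPBES are accepted
        k_summary=3,              # summaries (SPM/TS) first...
        k_total=10,               # ...then full reports for the remainder
    )
    docs = retriever.get_relevant_documents("How much has the ocean warmed?")
    # Each doc carries similarity_score, content and page_number in its metadata,
    # and its page_content is prefixed "Doc N - <short_name>: ...".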
 
climateqa/vectorstore.py CHANGED
@@ -8,13 +8,13 @@ from langchain.vectorstores import Pinecone
 # LOAD ENVIRONMENT VARIABLES
 try:
     from dotenv import load_dotenv
+
     load_dotenv()
 except:
     pass
 
 
-def get_pinecone_vectorstore(embeddings,text_key = "content"):
-
+def get_pinecone_vectorstore(embeddings, text_key="content"):
     # initialize pinecone
     pinecone.init(
         api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
@@ -22,5 +22,7 @@ def get_pinecone_vectorstore(embeddings,text_key = "content"):
     )
 
     index_name = os.getenv("PINECONE_API_INDEX")
-    vectorstore = Pinecone.from_existing_index(index_name, embeddings,text_key = text_key)
+    vectorstore = Pinecone.from_existing_index(
+        index_name, embeddings, text_key=text_key
+    )
     return vectorstore
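get_pinecone_vectorstore reads PINECONE_API_KEY and PINECONE_API_INDEX from the environment (the pinecone.init environment/region argument sits in the elided hunk context) and wraps the existing index in a LangChain Pinecone vectorstore. A minimal sketch, assuming the same embedding model used in chat.py:

    import os
    from langchain.embeddings import HuggingFaceEmbeddings
    from climateqa.vectorstore import get_pinecone_vectorstore

    os.environ["PINECONE_API_KEY"] = "<your-pinecone-key>"    # placeholder
    os.environ["PINECONE_API_INDEX"] = "<your-index-name>"    # placeholder

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1"
    )
    vectorstore = get_pinecone_vectorstore(embeddings)  # text_key defaults to "content"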
utils.py CHANGED
@@ -6,7 +6,7 @@ import uuid
 
 def create_user_id():
     """Create user_id
-    str: String to id user
+    str: String to id user
     """
     user_id = str(uuid.uuid4())
-    return user_id
+    return user_id