TheoLvs committed on
Commit 481f3b1 • 1 Parent(s): caf1faa

First commit CQA with Agents

.gitignore CHANGED
@@ -5,3 +5,6 @@ __pycache__/utils.cpython-38.pyc
 
 notebooks/
 *.pyc
+
+**/.ipynb_checkpoints/
+**/.flashrank_cache/
app.py CHANGED
@@ -29,16 +29,16 @@ from utils import create_user_id
 
 # ClimateQ&A imports
 from climateqa.engine.llm import get_llm
-from climateqa.engine.rag import make_rag_chain
+from climateqa.engine.chains.answer_rag import make_rag_chain
 from climateqa.engine.vectorstore import get_pinecone_vectorstore
 from climateqa.engine.retriever import ClimateQARetriever
 from climateqa.engine.embeddings import get_embeddings_function
-from climateqa.engine.prompts import audience_prompts
+from climateqa.engine.chains.prompts import audience_prompts
 from climateqa.sample_questions import QUESTIONS
 from climateqa.constants import POSSIBLE_REPORTS
 from climateqa.utils import get_image_from_azure_blob_storage
 from climateqa.engine.keywords import make_keywords_chain
-from climateqa.engine.rag import make_rag_papers_chain
+from climateqa.engine.chains.answer_rag import make_rag_papers_chain
 
 # Load environment variables in local mode
 try:
climateqa/engine/chains/__init__.py ADDED
File without changes
climateqa/engine/chains/answer_ai_impact.py ADDED
@@ -0,0 +1,46 @@
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+
+
+prompt_template = """
+You are ClimateQ&A, a helpful AI assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
+Always stay true to climate and nature science and do not make up information.
+If you do not know the answer, just say you do not know.
+
+## Guidelines
+- Explain that the environmental impact of AI is not covered by the IPCC or IPBES reports, but you can recommend info based on the sources below
+- Answer the question in the original language of the question
+
+## Sources
+- You can propose to visit this page https://climateqa.com/docs/carbon-footprint/ to learn more about ClimateQ&A's own carbon footprint
+- You can recommend looking at the work of the AI & climate expert scientist Sasha Luccioni, in particular these papers:
+    - Power Hungry Processing: Watts Driving the Cost of AI Deployment? - https://arxiv.org/abs/2311.16863 - about the carbon footprint at the inference stage of AI models
+    - Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning - https://arxiv.org/abs/2302.08476
+    - Estimating the Carbon Footprint of BLOOM, a 176B Parameter Language Model - https://arxiv.org/abs/2211.02001 - about the carbon footprint of training a large language model
+- You can also recommend the following tools to calculate the carbon footprint of AI models:
+    - CodeCarbon - https://github.com/mlco2/codecarbon - to measure the carbon footprint of your code
+    - Ecologits - https://ecologits.ai/ - to measure the carbon footprint of using LLM APIs
+"""
+
+
+def make_ai_impact_chain(llm):
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", prompt_template),
+        ("user", "{question}")
+    ])
+
+    chain = prompt | llm | StrOutputParser()
+    chain = chain.with_config({"run_name":"ai_impact_chain"})
+
+    return chain
+
+def make_ai_impact_node(llm):
+
+    ai_impact_chain = make_ai_impact_chain(llm)
+
+    async def answer_ai_impact(state,config):
+        answer = await ai_impact_chain.ainvoke({"question":state["user_input"]},config)
+        return {"answer":answer}
+
+    return answer_ai_impact
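
A minimal usage sketch of the new node (not part of this commit), assuming langchain_openai's ChatOpenAI stands in for the model returned by get_llm:

    import asyncio
    from langchain_openai import ChatOpenAI  # assumed stand-in for get_llm()
    from climateqa.engine.chains.answer_ai_impact import make_ai_impact_node

    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    node = make_ai_impact_node(llm)

    # The node is an async callable over the graph state defined in graph.py
    state = {"user_input": "What is the carbon footprint of ChatGPT?"}
    result = asyncio.run(node(state, config={}))
    print(result["answer"])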
climateqa/engine/chains/answer_chitchat.py ADDED
@@ -0,0 +1,52 @@
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+
+
+chitchat_prompt_template = """
+You are ClimateQ&A, a helpful AI assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
+Always stay true to climate and nature science and do not make up information.
+If you do not know the answer, just say you do not know.
+
+## Guidelines
+- If it's a conversational question, you can chat normally with the user
+- If the question is not related to any environmental topic, refuse to answer and politely ask the user to ask another question about the environment
+- If the user asks whether you speak a given language, you can say you speak all languages :)
+- If the user asks about the bot itself, "ClimateQ&A", explain that you are an AI assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports, and propose visiting the website https://climateqa.com/docs/intro/ for more information
+- If the question is about ESG regulations, standards, or frameworks like the CSRD, TCFD, SASB, GRI, CDP, etc., explain that this topic is not covered by the IPCC or IPBES reports
+- Make clear that you are specialized in finding trustworthy information in the scientific reports of the IPCC and IPBES and other scientific literature
+- If relevant, you can propose up to 3 examples of questions they could ask about the IPCC or IPBES reports from the examples below
+- Always answer in the original language of the question
+
+## Examples of questions you can suggest (in the original language of the question)
+"What evidence do we have of climate change?",
+"Are human activities causing global warming?",
+"What are the impacts of climate change?",
+"Can climate change be reversed?",
+"What is the difference between climate change and global warming?",
+"""
+
+
+def make_chitchat_chain(llm):
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", chitchat_prompt_template),
+        ("user", "{question}")
+    ])
+
+    chain = prompt | llm | StrOutputParser()
+    chain = chain.with_config({"run_name":"chitchat_chain"})
+
+    return chain
+
+
+
+def make_chitchat_node(llm):
+
+    chitchat_chain = make_chitchat_chain(llm)
+
+    async def answer_chitchat(state,config):
+        answer = await chitchat_chain.ainvoke({"question":state["user_input"]},config)
+        return {"answer":answer}
+
+    return answer_chitchat
+
climateqa/engine/{rag.py → chains/answer_rag.py} RENAMED
@@ -6,7 +6,7 @@ from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
 from langchain_core.prompts.prompt import PromptTemplate
 from langchain_core.prompts.base import format_document
 
-from climateqa.engine.reformulation import make_reformulation_chain
+from climateqa.engine.chains.reformulation import make_reformulation_chain
 from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
 from climateqa.engine.prompts import papers_prompt_template
 from climateqa.engine.utils import pass_values, flatten_dict,prepare_chain,rename_chain
@@ -131,4 +131,14 @@ def make_illustration_chain(llm):
     }
 
     illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
-    return illustration_chain
+    return illustration_chain
+
+
+def make_answer_rag_node(llm):
+
+    def answer_rag(state):
+        answer = "\n".join([x["question"] for x in state["questions"]])
+        return {"answer":answer}
+
+    return answer_rag
climateqa/engine/chains/intent_routing.py ADDED
@@ -0,0 +1,86 @@
+
+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+
+class IntentRouter(BaseModel):
+    """Analyzing the user message input"""
+
+    language: str = Field(
+        description="Find the language of the message input in full words (ex: French, English, Spanish, ...), defaults to English",
+        default="English",
+    )
+    intent: str = Field(
+        enum=[
+            "ai_impact",
+            "geo_info",
+            "esg",
+            "search",
+            "chitchat",
+        ],
+        description="""
+        Categorize the user input into one of the following categories.
+
+        Examples:
+        - ai_impact = Environmental impacts of AI: "What are the environmental impacts of AI", "How does AI affect the environment"
+        - geo_info = Geolocated info about climate change: any question where the user wants to know localized impacts of climate change, e.g. "What will be the temperature in Marseille in 2050"
+        - esg = Any question about ESG regulations, frameworks and standards like the CSRD, TCFD, SASB, GRI, CDP, etc.
+        - search = Any question about climate change, energy, biodiversity, nature, and everything we can find in the IPCC or IPBES reports or scientific papers
+        - chitchat = Any general question that is not related to the environment or climate change, or just conversational, or if you don't think searching the IPCC or IPBES reports would be relevant
+        """,
+    )
+
+
+
+def make_intent_router_chain(llm):
+
+    openai_functions = [convert_to_openai_function(IntentRouter)]
+    llm_with_router = llm.bind(functions = openai_functions,function_call={"name":"IntentRouter"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_router | JsonOutputFunctionsParser()
+    return chain
+
+
+def make_intent_router_node(llm):
+
+    router_chain = make_intent_router_chain(llm)
+
+    def route_input_message(state):
+        output = router_chain.invoke({"input":state["user_input"]})
+        if "language" not in output: output["language"] = "English"
+        output["query"] = state["user_input"]
+        return output
+
+    return route_input_message
+
+
+
+
+# SAMPLE_QUESTIONS = [
+#     "Est-ce que l'IA a un impact sur l'environnement ?",
+#     "Que dit le GIEC sur l'impact de l'IA",
+#     "Qui sont les membres du GIEC",
+#     "What is the impact of El Nino ?",
+#     "Yo",
+#     "Hello ça va bien ?",
+#     "Par qui as tu été créé ?",
+#     "What role do cloud formations play in modulating the Earth's radiative balance, and how are they represented in current climate models?",
+#     "Which industries have the highest GHG emissions?",
+#     "What are invasive alien species and how do they threaten biodiversity and ecosystems?",
+#     "Are human activities causing global warming?",
+#     "What is the motivation behind mining the deep seabed?",
+#     "Tu peux m'écrire un poème sur le changement climatique ?",
+#     "Tu peux m'écrire un poème sur les bonbons ?",
+#     "What will be the temperature in 2100 in Strasbourg?",
+#     "C'est quoi le lien entre biodiversity and changement climatique ?",
+# ]
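
A hedged usage sketch of the router node (not part of this commit); the exact output depends on the model, and ChatOpenAI is an assumed stand-in for get_llm:

    from langchain_openai import ChatOpenAI  # assumed stand-in for get_llm()
    from climateqa.engine.chains.intent_routing import make_intent_router_node

    route = make_intent_router_node(ChatOpenAI(model="gpt-3.5-turbo", temperature=0))
    output = route({"user_input": "Quelle température fera-t-il à Marseille en 2050 ?"})
    # Expected shape (model-dependent):
    # {"language": "French", "intent": "geo_info", "query": "Quelle température ..."}
    print(output)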
climateqa/engine/{prompts.py → chains/prompts.py} RENAMED
File without changes
climateqa/engine/chains/query_transform.py ADDED
@@ -0,0 +1,152 @@
+
+
+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+
+# Prompt from the original paper https://arxiv.org/pdf/2305.14283
+# Query Rewriting for Retrieval-Augmented Large Language Models
+class QueryDecomposition(BaseModel):
+    """
+    Decompose the user query into smaller parts to think step by step to answer this question
+    Act as a simple planning agent
+    """
+
+    questions: List[str] = Field(
+        description="""
+        Think step by step to answer this question, and provide one or several search engine questions in English for knowledge that you need.
+        Suppose that the user is looking for information about climate change, energy, biodiversity, nature, and everything we can find in the IPCC reports and scientific literature
+        - If it's already a standalone question, you don't need to provide more questions, just reformulate it if relevant as a better question for a search engine
+        - If you need to decompose the question, output a list of maximum 3 questions
+        """
+    )
+
+
+class Location(BaseModel):
+    country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, addresses), ex: France, USA, ...")
+    location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
+
+class QueryAnalysis(BaseModel):
+    """
+    Analyzing the user query to extract topics, sources and date
+    Also do query expansion to get alternative search queries
+    Also provide simple keywords to feed a search engine
+    """
+
+    # keywords: List[str] = Field(
+    #     description="""
+    #     Extract the keywords from the user query to feed a search engine as a list
+    #     Maximum 3 keywords
+
+    #     Examples:
+    #     - "What is the impact of deep sea mining ?" -> deep sea mining
+    #     - "How will El Nino be impacted by climate change" -> el nino;climate change
+    #     - "Is climate change a hoax" -> climate change;hoax
+    #     """
+    # )
+
+    # alternative_queries: List[str] = Field(
+    #     description="""
+    #     Generate alternative search questions from the user query to feed a search engine
+    #     """
+    # )
+
+    # step_back_question: str = Field(
+    #     description="""
+    #     You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer.
+    #     These questions should help you get more context and information about the user query
+    #     """
+    # )
+
+    sources: List[Literal["IPCC", "IPBES", "IPOS","OpenAlex"]] = Field(
+        ...,
+        description="""
+        Given a user question choose which documents would be most relevant for answering their question,
+        - IPCC is for questions about climate change, energy, impacts, and everything we can find in the IPCC reports
+        - IPBES is for questions about biodiversity and nature
+        - IPOS is for questions about the ocean and deep sea mining
+        - OpenAlex is for any other questions that are not in the previous categories but could be found in the scientific literature
+        """,
+    )
+    # topics: List[Literal[
+    #     "Climate change",
+    #     "Biodiversity",
+    #     "Energy",
+    #     "Decarbonization",
+    #     "Climate science",
+    #     "Nature",
+    #     "Climate policy and justice",
+    #     "Oceans",
+    #     "Deep sea mining",
+    #     "ESG and regulations",
+    #     "CSRD",
+    # ]] = Field(
+    #     ...,
+    #     description = """
+    #     Choose the topics that are most relevant to the user query, ex: Climate change, Energy, Biodiversity, ...
+    #     """,
+    # )
+    # date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
+    # location:Location
+
+
+def make_query_decomposition_chain(llm):
+
+    openai_functions = [convert_to_openai_function(QueryDecomposition)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryDecomposition"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
+
+
+def make_query_rewriter_chain(llm):
+
+    openai_functions = [convert_to_openai_function(QueryAnalysis)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
+
+
+def make_query_transform_node(llm):
+
+    decomposition_chain = make_query_decomposition_chain(llm)
+    rewriter_chain = make_query_rewriter_chain(llm)
+
+    def transform_query(state):
+
+        new_state = {}
+
+        # Decomposition
+        decomposition_output = decomposition_chain.invoke({"input":state["query"]})
+        new_state.update(decomposition_output)
+
+        # Query Analysis
+        questions = []
+        for question in new_state["questions"]:
+            question_state = {"question":question}
+            analysis_output = rewriter_chain.invoke({"input":question})
+            question_state.update(analysis_output)
+            questions.append(question_state)
+        new_state["questions"] = questions
+
+        return new_state
+
+    return transform_query
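
A usage sketch (not part of this commit) showing the state shape the node produces; ChatOpenAI is an assumed stand-in for get_llm and the exact decomposition is model-dependent:

    from langchain_openai import ChatOpenAI  # assumed stand-in for get_llm()
    from climateqa.engine.chains.query_transform import make_query_transform_node

    transform = make_query_transform_node(ChatOpenAI(model="gpt-3.5-turbo", temperature=0))
    new_state = transform({"query": "How will climate change affect El Nino and coral reefs?"})
    # Each entry pairs a rewritten question with the sources chosen for it, e.g.
    # {"questions": [{"question": "...El Nino...", "sources": ["IPCC"]},
    #                {"question": "...coral reefs...", "sources": ["IPCC", "IPBES"]}]}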
climateqa/engine/{reformulation.py → chains/reformulation.py} RENAMED
@@ -3,7 +3,7 @@ from langchain.output_parsers.structured import StructuredOutputParser, Response
 from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
 
-from climateqa.engine.prompts import reformulation_prompt_template
+from climateqa.engine.chains.prompts import reformulation_prompt_template
 from climateqa.engine.utils import pass_values, flatten_dict
 
 
climateqa/engine/chains/retriever.py ADDED
@@ -0,0 +1,122 @@
+import sys
+import os
+from contextlib import contextmanager
+
+from ..reranker import rerank_docs
+from ..retriever import ClimateQARetriever
+
+
+
+def divide_into_parts(target, parts):
+    # Base value for each part
+    base = target // parts
+    # Remainder to distribute
+    remainder = target % parts
+    # List to hold the result
+    result = []
+
+    for i in range(parts):
+        if i < remainder:
+            # These parts get base value + 1
+            result.append(base + 1)
+        else:
+            # The rest get the base value
+            result.append(base)
+
+    return result
+
+
+@contextmanager
+def suppress_output():
+    # Open a null device
+    with open(os.devnull, 'w') as devnull:
+        # Store the original stdout and stderr
+        old_stdout = sys.stdout
+        old_stderr = sys.stderr
+        # Redirect stdout and stderr to the null device
+        sys.stdout = devnull
+        sys.stderr = devnull
+        try:
+            yield
+        finally:
+            # Restore stdout and stderr
+            sys.stdout = old_stdout
+            sys.stderr = old_stderr
+
+
+
+def make_retriever_node(vectorstore,reranker):
+
+    def retrieve_documents(state):
+
+        POSSIBLE_SOURCES = ["IPCC","IPBES","IPOS","OpenAlex"]
+        questions = state["questions"]
+
+        # Use sources from the user input or from the LLM detection
+        sources_input = state["sources_input"] if "sources_input" in state else ["auto"]
+        auto_mode = "auto" in sources_input
+
+        # Constants
+        k_final = 15
+        k_before_reranking = 100
+        k_summary = 5
+        rerank_by_question = True
+
+        # There are several options to get the final top k
+        # Option 1 - Get 100 documents by question and rerank by question
+        # Option 2 - Get 100/n documents by question and rerank the total
+        if rerank_by_question:
+            k_by_question = divide_into_parts(k_final,len(questions))
+
+        docs = []
+
+        for i,q in enumerate(questions):
+
+            sources = q["sources"]
+            question = q["question"]
+
+            # If auto mode, we use the sources detected by the LLM
+            if auto_mode:
+                sources = [x for x in sources if x in POSSIBLE_SOURCES]
+
+            # Otherwise, we use the config
+            else:
+                sources = sources_input
+
+            # Search the document store using the retriever
+            # Configure high top k for further reranking step
+            retriever = ClimateQARetriever(
+                vectorstore=vectorstore,
+                sources = sources,
+                # reports = ias_reports,
+                min_size = 200,
+                k_summary = k_summary,k_total = k_before_reranking,
+                threshold = 0.5,
+            )
+            docs_question = retriever.get_relevant_documents(question)
+
+            # Rerank
+            with suppress_output():
+                docs_question = rerank_docs(reranker,docs_question,question)
+
+            # If rerank by question we select the top documents for each question
+            if rerank_by_question:
+                docs_question = docs_question[:k_by_question[i]]
+
+            # Add sources used in the metadata
+            for doc in docs_question:
+                doc.metadata["sources_used"] = sources
+
+            # Sort the list in descending order by rerank_score
+            # Then select the top k
+            docs.extend(docs_question)
+
+        docs = sorted(docs, key=lambda x: x.metadata["rerank_score"], reverse=True)
+        docs = docs[:k_final]
+
+        new_state = {"documents":docs}
+        return new_state
+
+    return retrieve_documents
+
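
For reference (not part of this commit), divide_into_parts splits k_final as evenly as possible across questions, handing the remainder to the first ones:

    from climateqa.engine.chains.retriever import divide_into_parts

    assert divide_into_parts(15, 4) == [4, 4, 4, 3]  # 15 docs over 4 questions
    assert divide_into_parts(15, 2) == [8, 7]        # 15 docs over 2 questions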
climateqa/engine/chains/sample_router.py ADDED
@@ -0,0 +1,66 @@
+
+# from typing import List
+# from typing import Literal
+# from langchain.prompts import ChatPromptTemplate
+# from langchain_core.utils.function_calling import convert_to_openai_function
+# from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+# # https://livingdatalab.com/posts/2023-11-05-openai-function-calling-with-langchain.html
+
+# class Location(BaseModel):
+#     country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, addresses), ex: France, USA, ...")
+#     location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
+
+# class QueryAnalysis(BaseModel):
+#     """Analyzing the user query"""
+
+#     language: str = Field(
+#         description="Find the language of the query in full words (ex: French, English, Spanish, ...), defaults to English"
+#     )
+#     intent: str = Field(
+#         enum=[
+#             "Environmental impacts of AI",
+#             "Geolocated info about climate change",
+#             "Climate change",
+#             "Biodiversity",
+#             "Deep sea mining",
+#             "Chitchat",
+#         ],
+#         description="""
+#         Categorize the user query in one of the following categories,
+
+#         Examples:
+#         - Geolocated info about climate change: "What will be the temperature in Marseille in 2050"
+#         - Climate change: "What is radiative forcing", "How much will
+#         """,
+#     )
+#     sources: List[Literal["IPCC", "IPBES", "IPOS"]] = Field(
+#         ...,
+#         description="""
+#         Given a user question choose which documents would be most relevant for answering their question,
+#         - IPCC is for questions about climate change, energy, impacts, and everything we can find in the IPCC reports
+#         - IPBES is for questions about biodiversity and nature
+#         - IPOS is for questions about the ocean and deep sea mining
+
+#         """,
+#     )
+#     date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
+#     location:Location
+#     # query: str = Field(
+#     #     description = """
+#     #     Translate to English and reformulate the following user message to be a short standalone question, in the context of an educational discussion about climate change.
+#     #     The reformulated question will be used in a search engine
+#     #     By default, assume that the user is asking information about the last century,
+#     #     Use the following examples
+
+#     #     ### Examples:
+#     #     La technologie nous sauvera-t-elle ? -> Can technology help humanity mitigate the effects of climate change?
+#     #     what are our reserves in fossil fuel? -> What are the current reserves of fossil fuels and how long will they last?
+#     #     what are the main causes of climate change? -> What are the main causes of climate change in the last century?
+
+#     #     Question in English:
+#     #     """
+#     # )
+
+# openai_functions = [convert_to_openai_function(QueryAnalysis)]
+# llm2 = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})
climateqa/engine/chains/translation.py ADDED
@@ -0,0 +1,41 @@
+
+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+
+class Translation(BaseModel):
+    """Analyzing the user message input"""
+
+    translation: str = Field(
+        description="Translate the message input to English",
+    )
+
+
+def make_translation_chain(llm):
+
+    openai_functions = [convert_to_openai_function(Translation)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"Translation"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, you will translate the user input message to English using the function provided"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
+
+
+def make_translation_node(llm):
+
+    translation_chain = make_translation_chain(llm)
+
+    def translate_query(state):
+        user_input = state["user_input"]
+        translation = translation_chain.invoke({"input":user_input})
+        return {"query":translation["translation"]}
+
+    return translate_query
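
A short usage sketch (not part of this commit), with ChatOpenAI as an assumed stand-in for get_llm:

    from langchain_openai import ChatOpenAI  # assumed stand-in for get_llm()
    from climateqa.engine.chains.translation import make_translation_node

    translate = make_translation_node(ChatOpenAI(model="gpt-3.5-turbo", temperature=0))
    print(translate({"user_input": "Quels sont les impacts du changement climatique ?"}))
    # -> {"query": "What are the impacts of climate change?"}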
climateqa/engine/embeddings.py CHANGED
@@ -2,7 +2,7 @@
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_community.embeddings import HuggingFaceEmbeddings
 
-def get_embeddings_function(version = "v1.2"):
+def get_embeddings_function(version = "v1.2",query_instruction = "Represent this sentence for searching relevant passages: "):
 
     if version == "v1.2":
 
@@ -10,12 +10,12 @@ def get_embeddings_function(version = "v1.2"):
         # Best embedding model at a reasonable size at the moment (2023-11-22)
 
         model_name = "BAAI/bge-base-en-v1.5"
-        encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
+        encode_kwargs = {'normalize_embeddings': True,"show_progress_bar":False} # set True to compute cosine similarity
         print("Loading embeddings model: ", model_name)
         embeddings_function = HuggingFaceBgeEmbeddings(
             model_name=model_name,
             encode_kwargs=encode_kwargs,
-            query_instruction="Represent this sentence for searching relevant passages: "
+            query_instruction=query_instruction,
         )
 
     else:
@@ -23,3 +23,6 @@ def get_embeddings_function(version = "v1.2"):
         embeddings_function = HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
 
     return embeddings_function
+
+
+
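
A usage sketch of the new query_instruction parameter (not part of this commit); passing an empty string disables the BGE retrieval prefix, e.g. for symmetric sentence-to-sentence similarity:

    from climateqa.engine.embeddings import get_embeddings_function

    embeddings = get_embeddings_function(version="v1.2", query_instruction="")
    vector = embeddings.embed_query("deep sea mining impacts")
    print(len(vector))  # 768 dimensions for BAAI/bge-base-en-v1.5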
climateqa/engine/graph.py ADDED
@@ -0,0 +1,32 @@
+import sys
+import os
+from contextlib import contextmanager
+
+from langchain.schema import Document
+from langgraph.graph import END, StateGraph
+from typing_extensions import TypedDict
+from typing import List
+
+from .chains.answer_chitchat import make_chitchat_node
+from .chains.answer_ai_impact import make_ai_impact_node
+from .chains.query_transform import make_query_transform_node
+from .chains.translation import make_translation_node
+from .chains.intent_routing import make_intent_router_node
+
+
+class GraphState(TypedDict):
+    """
+    Represents the state of our graph.
+    """
+    user_input : str
+    language : str
+    intent : str
+    query: str
+    questions : List[dict]
+    answer: str
+    audience: str
+    sources_input: str
+    documents: List[Document]
+
+def search(state):
+    return {}
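
graph.py stops at the state definition and a stub search node. A purely hypothetical sketch (not the author's wiring) of how the new nodes could be assembled with langgraph; the routing map and edge layout are assumptions:

    from langgraph.graph import END, StateGraph
    from climateqa.engine.graph import GraphState, search
    from climateqa.engine.chains.intent_routing import make_intent_router_node
    from climateqa.engine.chains.translation import make_translation_node
    from climateqa.engine.chains.query_transform import make_query_transform_node
    from climateqa.engine.chains.answer_chitchat import make_chitchat_node
    from climateqa.engine.chains.answer_ai_impact import make_ai_impact_node

    def make_graph(llm):
        workflow = StateGraph(GraphState)
        workflow.add_node("router", make_intent_router_node(llm))
        workflow.add_node("translate", make_translation_node(llm))
        workflow.add_node("transform", make_query_transform_node(llm))
        workflow.add_node("chitchat", make_chitchat_node(llm))
        workflow.add_node("ai_impact", make_ai_impact_node(llm))
        workflow.add_node("search", search)

        workflow.set_entry_point("router")
        # Route on the intent detected by the router node (mapping is an assumption;
        # the chitchat prompt already handles ESG questions, hence esg -> chitchat)
        workflow.add_conditional_edges(
            "router",
            lambda state: state["intent"],
            {"chitchat": "chitchat", "esg": "chitchat", "ai_impact": "ai_impact",
             "geo_info": "translate", "search": "translate"},
        )
        workflow.add_edge("translate", "transform")
        workflow.add_edge("transform", "search")
        workflow.add_edge("search", END)
        workflow.add_edge("chitchat", END)
        workflow.add_edge("ai_impact", END)
        return workflow.compile()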
climateqa/engine/reranker.py ADDED
@@ -0,0 +1,40 @@
+import os
+from scipy.special import expit, logit
+from rerankers import Reranker
+
+
+def get_reranker(model = "nano",cohere_api_key = None):
+
+    assert model in ["nano","tiny","small","large"]
+
+    if model == "nano":
+        reranker = Reranker('ms-marco-TinyBERT-L-2-v2', model_type='flashrank')
+    elif model == "tiny":
+        reranker = Reranker('ms-marco-MiniLM-L-12-v2', model_type='flashrank')
+    elif model == "small":
+        reranker = Reranker("mixedbread-ai/mxbai-rerank-xsmall-v1", model_type='cross-encoder')
+    elif model == "large":
+        if cohere_api_key is None:
+            cohere_api_key = os.environ["COHERE_API_KEY"]
+        reranker = Reranker("cohere", lang='en', api_key = cohere_api_key)
+    return reranker
+
+
+
+def rerank_docs(reranker,docs,query):
+
+    # Get a list of texts from langchain docs
+    input_docs = [x.page_content for x in docs]
+
+    # Rerank using rerankers library
+    results = reranker.rank(query=query, docs=input_docs)
+
+    # Prepare langchain list of docs
+    docs_reranked = []
+    for result in results.results:
+        doc_id = result.document.doc_id
+        doc = docs[doc_id]
+        doc.metadata["rerank_score"] = result.score
+        doc.metadata["query_used_for_retrieval"] = query
+        docs_reranked.append(doc)
+    return docs_reranked
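
A usage sketch (not part of this commit); the "nano" model runs locally via flashrank, so no API key is needed:

    from langchain.schema import Document
    from climateqa.engine.reranker import get_reranker, rerank_docs

    reranker = get_reranker("nano")  # flashrank TinyBERT, local inference
    docs = [
        Document(page_content="The IPCC assesses the science related to climate change."),
        Document(page_content="Bananas are rich in potassium."),
    ]
    ranked = rerank_docs(reranker, docs, "What does the IPCC do?")
    for doc in ranked:
        print(doc.metadata["rerank_score"], doc.page_content[:40])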
requirements.txt CHANGED
@@ -10,4 +10,5 @@ huggingface-hub
 msal
 pyalex==0.13
 networkx==3.2.1
-pyvis==0.3.2
+pyvis==0.3.2
+flashrank==0.2.5
sandbox/20240310 - CQA - Semantic Routing 1.ipynb ADDED
The diff for this file is too large to render. See raw diff