First commit CQA with Agents
- .gitignore +3 -0
- app.py +3 -3
- climateqa/engine/chains/__init__.py +0 -0
- climateqa/engine/chains/answer_ai_impact.py +46 -0
- climateqa/engine/chains/answer_chitchat.py +52 -0
- climateqa/engine/{rag.py → chains/answer_rag.py} +12 -2
- climateqa/engine/chains/intent_routing.py +86 -0
- climateqa/engine/{prompts.py → chains/prompts.py} +0 -0
- climateqa/engine/chains/query_transform.py +152 -0
- climateqa/engine/{reformulation.py → chains/reformulation.py} +1 -1
- climateqa/engine/chains/retriever.py +122 -0
- climateqa/engine/chains/sample_router.py +66 -0
- climateqa/engine/chains/translation.py +41 -0
- climateqa/engine/embeddings.py +6 -3
- climateqa/engine/graph.py +32 -0
- climateqa/engine/reranker.py +40 -0
- requirements.txt +2 -1
- sandbox/20240310 - CQA - Semantic Routing 1.ipynb +0 -0
.gitignore
CHANGED
@@ -5,3 +5,6 @@ __pycache__/utils.cpython-38.pyc
 
 notebooks/
 *.pyc
+
+**/.ipynb_checkpoints/
+**/.flashrank_cache/
app.py
CHANGED
@@ -29,16 +29,16 @@ from utils import create_user_id
 
 # ClimateQ&A imports
 from climateqa.engine.llm import get_llm
-from climateqa.engine.rag import make_rag_chain
+from climateqa.engine.chains.answer_rag import make_rag_chain
 from climateqa.engine.vectorstore import get_pinecone_vectorstore
 from climateqa.engine.retriever import ClimateQARetriever
 from climateqa.engine.embeddings import get_embeddings_function
-from climateqa.engine.prompts import audience_prompts
+from climateqa.engine.chains.prompts import audience_prompts
 from climateqa.sample_questions import QUESTIONS
 from climateqa.constants import POSSIBLE_REPORTS
 from climateqa.utils import get_image_from_azure_blob_storage
 from climateqa.engine.keywords import make_keywords_chain
-from climateqa.engine.rag import make_rag_papers_chain
+from climateqa.engine.chains.answer_rag import make_rag_papers_chain
 
 # Load environment variables in local mode
 try:
climateqa/engine/chains/__init__.py
ADDED
File without changes
climateqa/engine/chains/answer_ai_impact.py
ADDED
@@ -0,0 +1,46 @@
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+
+
+prompt_template = """
+You are ClimateQ&A, a helpful AI assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
+Always stay true to climate and nature science and do not make up information.
+If you do not know the answer, just say you do not know.
+
+## Guidelines
+- Explain that the environmental impact of AI is not covered by the IPCC or IPBES reports, but you can recommend info based on the sources below
+- Answer the question in the original language of the question
+
+## Sources
+- You can propose to visit this page https://climateqa.com/docs/carbon-footprint/ to learn more about ClimateQ&A's own carbon footprint
+- You can recommend looking at the work of the AI & climate expert scientist Sasha Luccioni, in particular these papers:
+    - Power Hungry Processing: Watts Driving the Cost of AI Deployment? - https://arxiv.org/abs/2311.16863 - about the carbon footprint at the inference stage of AI models
+    - Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning - https://arxiv.org/abs/2302.08476
+    - Estimating the Carbon Footprint of BLOOM, a 176B Parameter Language Model - https://arxiv.org/abs/2211.02001 - about the carbon footprint of training a large language model
+- You can also recommend the following tools to calculate the carbon footprint of AI models:
+    - CodeCarbon - https://github.com/mlco2/codecarbon to measure the carbon footprint of your code
+    - Ecologits - https://ecologits.ai/ to measure the carbon footprint of using LLM APIs
+"""
+
+
+def make_ai_impact_chain(llm):
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", prompt_template),
+        ("user", "{question}")
+    ])
+
+    chain = prompt | llm | StrOutputParser()
+    chain = chain.with_config({"run_name":"ai_impact_chain"})
+
+    return chain
+
+def make_ai_impact_node(llm):
+
+    ai_impact_chain = make_ai_impact_chain(llm)
+
+    async def answer_ai_impact(state,config):
+        answer = await ai_impact_chain.ainvoke({"question":state["user_input"]},config)
+        return {"answer":answer}
+
+    return answer_ai_impact
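A usage sketch for the new node factory (not part of the commit): assuming get_llm() returns a LangChain chat model with its defaults, the async node maps a state dict to an answer.

import asyncio
from climateqa.engine.llm import get_llm
from climateqa.engine.chains.answer_ai_impact import make_ai_impact_node

llm = get_llm()  # assumption: the project's default LLM factory works without arguments
node = make_ai_impact_node(llm)

# The node reads the "user_input" key of the graph state and returns {"answer": ...}
state = {"user_input": "What is the carbon footprint of ChatGPT?"}
result = asyncio.run(node(state, config={}))
print(result["answer"])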
climateqa/engine/chains/answer_chitchat.py
ADDED
@@ -0,0 +1,52 @@
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+
+
+chitchat_prompt_template = """
+You are ClimateQ&A, a helpful AI assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
+Always stay true to climate and nature science and do not make up information.
+If you do not know the answer, just say you do not know.
+
+## Guidelines
+- If it's a conversational question, you can normally chat with the user
+- If the question is not related to any topic about the environment, refuse to answer and politely ask the user to ask another question about the environment
+- If the user asks if you speak any language, you can say you speak all languages :)
+- If the user asks about the bot itself "ClimateQ&A", you can explain that you are an AI assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports and propose to visit the website here https://climateqa.com/docs/intro/ for more information
+- If the question is about ESG regulations, standards, or frameworks like the CSRD, TCFD, SASB, GRI, CDP, etc., you can explain that this is not a topic covered by the IPCC or IPBES reports.
+- Make clear that you are specialized in finding trustworthy information from the scientific reports of the IPCC and IPBES and other scientific literature
+- If relevant, you can propose up to 3 examples of questions they could ask about the IPCC or IPBES reports from the examples below
+- Always answer in the original language of the question
+
+## Examples of questions you can suggest (in the original language of the question)
+"What evidence do we have of climate change?",
+"Are human activities causing global warming?",
+"What are the impacts of climate change?",
+"Can climate change be reversed?",
+"What is the difference between climate change and global warming?",
+"""
+
+
+def make_chitchat_chain(llm):
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", chitchat_prompt_template),
+        ("user", "{question}")
+    ])
+
+    chain = prompt | llm | StrOutputParser()
+    chain = chain.with_config({"run_name":"chitchat_chain"})
+
+    return chain
+
+
+def make_chitchat_node(llm):
+
+    chitchat_chain = make_chitchat_chain(llm)
+
+    async def answer_chitchat(state,config):
+        answer = await chitchat_chain.ainvoke({"question":state["user_input"]},config)
+        return {"answer":answer}
+
+    return answer_chitchat
+
climateqa/engine/{rag.py → chains/answer_rag.py}
RENAMED
@@ -6,7 +6,7 @@ from langchain_core.runnables import RunnablePassthrough, RunnableLambda, ...
 from langchain_core.prompts.prompt import PromptTemplate
 from langchain_core.prompts.base import format_document
 
-from climateqa.engine.reformulation import make_reformulation_chain
+from climateqa.engine.chains.reformulation import make_reformulation_chain
 from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
 from climateqa.engine.prompts import papers_prompt_template
 from climateqa.engine.utils import pass_values, flatten_dict,prepare_chain,rename_chain
@@ -131,4 +131,14 @@ def make_illustration_chain(llm):
     }
 
     illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
-    return illustration_chain
+    return illustration_chain
+
+
+def make_answer_rag_node(llm):
+
+    def answer_rag(state):
+        answer = "\n".join([x["question"] for x in state["questions"]])
+        return {"answer":answer}
+
+    return answer_rag
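Note that make_answer_rag_node is a stub at this stage: it never invokes make_rag_chain, it simply joins the decomposed questions. A quick check of that behaviour (deterministic, no LLM needed):

from climateqa.engine.chains.answer_rag import make_answer_rag_node

answer_rag = make_answer_rag_node(llm=None)  # llm is unused by the stub
state = {"questions": [{"question": "What is radiative forcing?"},
                       {"question": "How do oceans absorb heat?"}]}
print(answer_rag(state))
# {'answer': 'What is radiative forcing?\nHow do oceans absorb heat?'}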
climateqa/engine/chains/intent_routing.py
ADDED
@@ -0,0 +1,86 @@
+
+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+
+class IntentRouter(BaseModel):
+    """Analyzing the user message input"""
+
+    language: str = Field(
+        description="Find the language of the message input in full words (ex: French, English, Spanish, ...), defaults to English",
+        default="English",
+    )
+    intent: str = Field(
+        enum=[
+            "ai_impact",
+            "geo_info",
+            "esg",
+            "search",
+            "chitchat",
+        ],
+        description="""
+        Categorize the user input into one of the following categories
+
+        Examples:
+        - ai_impact = Environmental impacts of AI: "What are the environmental impacts of AI", "How does AI affect the environment"
+        - geo_info = Geolocated info about climate change: any question where the user wants to know localized impacts of climate change, eg: "What will be the temperature in Marseille in 2050"
+        - esg = Any question about the ESG regulation, frameworks and standards like the CSRD, TCFD, SASB, GRI, CDP, etc.
+        - search = Any question about climate change, energy, biodiversity, nature, and everything we can find in the IPCC or IPBES reports or scientific papers
+        - chitchat = Any general question that is not related to the environment or climate change or just conversational, or if you don't think searching the IPCC or IPBES reports would be relevant
+        """,
+    )
+
+
+def make_intent_router_chain(llm):
+
+    openai_functions = [convert_to_openai_function(IntentRouter)]
+    llm_with_router = llm.bind(functions = openai_functions,function_call={"name":"IntentRouter"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_router | JsonOutputFunctionsParser()
+    return chain
+
+
+def make_intent_router_node(llm):
+
+    router_chain = make_intent_router_chain(llm)
+
+    def route_input_message(state):
+        output = router_chain.invoke({"input":state["user_input"]})
+        if "language" not in output: output["language"] = "English"
+        output["query"] = state["user_input"]
+        return output
+
+    return route_input_message
+
+
+
+# SAMPLE_QUESTIONS = [
+#     "Est-ce que l'IA a un impact sur l'environnement ?",
+#     "Que dit le GIEC sur l'impact de l'IA",
+#     "Qui sont les membres du GIEC",
+#     "What is the impact of El Nino ?",
+#     "Yo",
+#     "Hello ça va bien ?",
+#     "Par qui as-tu été créé ?",
+#     "What role do cloud formations play in modulating the Earth's radiative balance, and how are they represented in current climate models?",
+#     "Which industries have the highest GHG emissions?",
+#     "What are invasive alien species and how do they threaten biodiversity and ecosystems?",
+#     "Are human activities causing global warming?",
+#     "What is the motivation behind mining the deep seabed?",
+#     "Tu peux m'écrire un poème sur le changement climatique ?",
+#     "Tu peux m'écrire un poème sur les bonbons ?",
+#     "What will be the temperature in 2100 in Strasbourg?",
+#     "C'est quoi le lien entre biodiversity and changement climatique ?",
+# ]
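A usage sketch for the router node (assuming an OpenAI function-calling chat model, since the chain relies on llm.bind(functions=...)); the returned keys are merged into the graph state:

from climateqa.engine.llm import get_llm
from climateqa.engine.chains.intent_routing import make_intent_router_node

route = make_intent_router_node(get_llm())
output = route({"user_input": "Quel sera le climat à Marseille en 2050 ?"})
# Illustrative output, actual values depend on the model:
# {"language": "French", "intent": "geo_info",
#  "query": "Quel sera le climat à Marseille en 2050 ?"}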
climateqa/engine/{prompts.py → chains/prompts.py}
RENAMED
File without changes
climateqa/engine/chains/query_transform.py
ADDED
@@ -0,0 +1,152 @@
+
+
+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+
+# Prompt from the original paper https://arxiv.org/pdf/2305.14283
+# Query Rewriting for Retrieval-Augmented Large Language Models
+class QueryDecomposition(BaseModel):
+    """
+    Decompose the user query into smaller parts to think step by step to answer this question
+    Act as a simple planning agent
+    """
+
+    questions: List[str] = Field(
+        description="""
+        Think step by step to answer this question, and provide one or several search engine questions in English for knowledge that you need.
+        Suppose that the user is looking for information about climate change, energy, biodiversity, nature, and everything we can find in the IPCC reports and scientific literature
+        - If it's already a standalone question, you don't need to provide more questions, just reformulate it if relevant as a better question for a search engine
+        - If you need to decompose the question, output a list of maximum 3 questions
+        """
+    )
+
+
+class Location(BaseModel):
+    country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, addresses), ex: France, USA, ...")
+    location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
+
+class QueryAnalysis(BaseModel):
+    """
+    Analyzing the user query to extract topics, sources and date
+    Also do query expansion to get alternative search queries
+    Also provide simple keywords to feed a search engine
+    """
+
+    # keywords: List[str] = Field(
+    #     description="""
+    #     Extract the keywords from the user query to feed a search engine as a list
+    #     Maximum 3 keywords
+
+    #     Examples:
+    #     - "What is the impact of deep sea mining ?" -> deep sea mining
+    #     - "How will El Nino be impacted by climate change" -> el nino;climate change
+    #     - "Is climate change a hoax" -> climate change;hoax
+    #     """
+    # )
+
+    # alternative_queries: List[str] = Field(
+    #     description="""
+    #     Generate alternative search questions from the user query to feed a search engine
+    #     """
+    # )
+
+    # step_back_question: str = Field(
+    #     description="""
+    #     You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer.
+    #     These questions should help you get more context and information about the user query
+    #     """
+    # )
+
+    sources: List[Literal["IPCC", "IPBES", "IPOS", "OpenAlex"]] = Field(
+        ...,
+        description="""
+        Given a user question choose which documents would be most relevant for answering their question,
+        - IPCC is for questions about climate change, energy, impacts, and everything we can find in the IPCC reports
+        - IPBES is for questions about biodiversity and nature
+        - IPOS is for questions about the ocean and deep sea mining
+        - OpenAlex is for any other questions that are not in the previous categories but could be found in the scientific literature
+        """,
+    )
+    # topics: List[Literal[
+    #     "Climate change",
+    #     "Biodiversity",
+    #     "Energy",
+    #     "Decarbonization",
+    #     "Climate science",
+    #     "Nature",
+    #     "Climate policy and justice",
+    #     "Oceans",
+    #     "Deep sea mining",
+    #     "ESG and regulations",
+    #     "CSRD",
+    # ]] = Field(
+    #     ...,
+    #     description = """
+    #     Choose the topics that are most relevant to the user query, ex: Climate change, Energy, Biodiversity, ...
+    #     """,
+    # )
+    # date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
+    # location:Location
+
+
+def make_query_decomposition_chain(llm):
+
+    openai_functions = [convert_to_openai_function(QueryDecomposition)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryDecomposition"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
+
+
+def make_query_rewriter_chain(llm):
+
+    openai_functions = [convert_to_openai_function(QueryAnalysis)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
+
+
+def make_query_transform_node(llm):
+
+    decomposition_chain = make_query_decomposition_chain(llm)
+    rewriter_chain = make_query_rewriter_chain(llm)
+
+    def transform_query(state):
+
+        new_state = {}
+
+        # Decomposition
+        decomposition_output = decomposition_chain.invoke({"input":state["query"]})
+        new_state.update(decomposition_output)
+
+        # Query Analysis
+        questions = []
+        for question in new_state["questions"]:
+            question_state = {"question":question}
+            analysis_output = rewriter_chain.invoke({"input":question})
+            question_state.update(analysis_output)
+            questions.append(question_state)
+        new_state["questions"] = questions
+
+        return new_state
+
+    return transform_query
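A usage sketch for the transform node (same OpenAI function-calling assumption); what matters is the nested shape it writes back into the state, with one sources list per decomposed question:

from climateqa.engine.llm import get_llm
from climateqa.engine.chains.query_transform import make_query_transform_node

transform = make_query_transform_node(get_llm())
new_state = transform({"query": "Is deep sea mining worse for biodiversity than climate change?"})
# Illustrative result, actual values depend on the model:
# new_state["questions"] == [
#     {"question": "What are the impacts of deep sea mining on biodiversity?", "sources": ["IPOS", "IPBES"]},
#     {"question": "How does climate change affect biodiversity?", "sources": ["IPCC", "IPBES"]},
# ]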
climateqa/engine/{reformulation.py → chains/reformulation.py}
RENAMED
@@ -3,7 +3,7 @@ from langchain.output_parsers.structured import StructuredOutputParser, ResponseSchema
 from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
 
-from climateqa.engine.prompts import reformulation_prompt_template
+from climateqa.engine.chains.prompts import reformulation_prompt_template
 from climateqa.engine.utils import pass_values, flatten_dict
 
 
climateqa/engine/chains/retriever.py
ADDED
@@ -0,0 +1,122 @@
+import sys
+import os
+from contextlib import contextmanager
+
+from ..reranker import rerank_docs
+from ..retriever import ClimateQARetriever
+
+
+
+def divide_into_parts(target, parts):
+    # Base value for each part
+    base = target // parts
+    # Remainder to distribute
+    remainder = target % parts
+    # List to hold the result
+    result = []
+
+    for i in range(parts):
+        if i < remainder:
+            # These parts get base value + 1
+            result.append(base + 1)
+        else:
+            # The rest get the base value
+            result.append(base)
+
+    return result
+
+
+@contextmanager
+def suppress_output():
+    # Open a null device
+    with open(os.devnull, 'w') as devnull:
+        # Store the original stdout and stderr
+        old_stdout = sys.stdout
+        old_stderr = sys.stderr
+        # Redirect stdout and stderr to the null device
+        sys.stdout = devnull
+        sys.stderr = devnull
+        try:
+            yield
+        finally:
+            # Restore stdout and stderr
+            sys.stdout = old_stdout
+            sys.stderr = old_stderr
+
+
+
+def make_retriever_node(vectorstore,reranker):
+
+    def retrieve_documents(state):
+
+        POSSIBLE_SOURCES = ["IPCC","IPBES","IPOS","OpenAlex"]
+        questions = state["questions"]
+
+        # Use sources from the user input or from the LLM detection
+        sources_input = state["sources_input"] if "sources_input" in state else ["auto"]
+        auto_mode = "auto" in sources_input
+
+        # Constants
+        k_final = 15
+        k_before_reranking = 100
+        k_summary = 5
+        rerank_by_question = True
+
+        # There are several options to get the final top k
+        # Option 1 - Get 100 documents by question and rerank by question
+        # Option 2 - Get 100/n documents by question and rerank the total
+        if rerank_by_question:
+            k_by_question = divide_into_parts(k_final,len(questions))
+
+        docs = []
+
+        for i,q in enumerate(questions):
+
+            sources = q["sources"]
+            question = q["question"]
+
+            # If auto mode, we use the sources detected by the LLM
+            if auto_mode:
+                sources = [x for x in sources if x in POSSIBLE_SOURCES]
+
+            # Otherwise, we use the config
+            else:
+                sources = sources_input
+
+            # Search the document store using the retriever
+            # Configure high top k for further reranking step
+            retriever = ClimateQARetriever(
+                vectorstore=vectorstore,
+                sources = sources,
+                # reports = ias_reports,
+                min_size = 200,
+                k_summary = k_summary,k_total = k_before_reranking,
+                threshold = 0.5,
+            )
+            docs_question = retriever.get_relevant_documents(question)
+
+            # Rerank
+            with suppress_output():
+                docs_question = rerank_docs(reranker,docs_question,question)
+
+            # If rerank by question we select the top documents for each question
+            if rerank_by_question:
+                docs_question = docs_question[:k_by_question[i]]
+
+            # Add sources used in the metadata
+            for doc in docs_question:
+                doc.metadata["sources_used"] = sources
+
+            # Add to the list of docs
+            docs.extend(docs_question)
+
+        # Sorting the list in descending order by rerank_score
+        # Then select the top k
+        docs = sorted(docs, key=lambda x: x.metadata["rerank_score"], reverse=True)
+        docs = docs[:k_final]
+
+        new_state = {"documents":docs}
+        return new_state
+
+    return retrieve_documents
+
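The divide_into_parts helper implements the per-question budget split described in the comments: it spreads the final top-k as evenly as possible across questions, giving the first questions the extra documents when the division is not exact. Its behaviour is deterministic:

from climateqa.engine.chains.retriever import divide_into_parts

print(divide_into_parts(15, 4))  # [4, 4, 4, 3] -> 15 documents over 4 questions
print(divide_into_parts(15, 2))  # [8, 7]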
climateqa/engine/chains/sample_router.py
ADDED
@@ -0,0 +1,66 @@
+
+# from typing import List
+# from typing import Literal
+# from langchain.prompts import ChatPromptTemplate
+# from langchain_core.utils.function_calling import convert_to_openai_function
+# from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+# # https://livingdatalab.com/posts/2023-11-05-openai-function-calling-with-langchain.html
+
+# class Location(BaseModel):
+#     country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, addresses), ex: France, USA, ...")
+#     location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
+
+# class QueryAnalysis(BaseModel):
+#     """Analyzing the user query"""
+
+#     language: str = Field(
+#         description="Find the language of the query in full words (ex: French, English, Spanish, ...), defaults to English"
+#     )
+#     intent: str = Field(
+#         enum=[
+#             "Environmental impacts of AI",
+#             "Geolocated info about climate change",
+#             "Climate change",
+#             "Biodiversity",
+#             "Deep sea mining",
+#             "Chitchat",
+#         ],
+#         description="""
+#         Categorize the user query into one of the following categories,
+
+#         Examples:
+#         - Geolocated info about climate change: "What will be the temperature in Marseille in 2050"
+#         - Climate change: "What is radiative forcing", "How much will
+#         """,
+#     )
+#     sources: List[Literal["IPCC", "IPBES", "IPOS"]] = Field(
+#         ...,
+#         description="""
+#         Given a user question choose which documents would be most relevant for answering their question,
+#         - IPCC is for questions about climate change, energy, impacts, and everything we can find in the IPCC reports
+#         - IPBES is for questions about biodiversity and nature
+#         - IPOS is for questions about the ocean and deep sea mining
+
+#         """,
+#     )
+#     date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
+#     location:Location
+#     # query: str = Field(
+#     #     description = """
+#     #     Translate to English and reformulate the following user message to be a short standalone question, in the context of an educational discussion about climate change.
+#     #     The reformulated question will be used in a search engine
+#     #     By default, assume that the user is asking information about the last century,
+#     #     Use the following examples
+
+#     #     ### Examples:
+#     #     La technologie nous sauvera-t-elle ? -> Can technology help humanity mitigate the effects of climate change?
+#     #     what are our reserves in fossil fuel? -> What are the current reserves of fossil fuels and how long will they last?
+#     #     what are the main causes of climate change? -> What are the main causes of climate change in the last century?
+
+#     #     Question in English:
+#     #     """
+#     # )
+
+# openai_functions = [convert_to_openai_function(QueryAnalysis)]
+# llm2 = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})
climateqa/engine/chains/translation.py
ADDED
@@ -0,0 +1,41 @@
+
+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+
+class Translation(BaseModel):
+    """Analyzing the user message input"""
+
+    translation: str = Field(
+        description="Translate the message input to English",
+    )
+
+
+def make_translation_chain(llm):
+
+    openai_functions = [convert_to_openai_function(Translation)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"Translation"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, you will translate the user input message to English using the function provided"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
+
+
+def make_translation_node(llm):
+
+    translation_chain = make_translation_chain(llm)
+
+    def translate_query(state):
+        user_input = state["user_input"]
+        translation = translation_chain.invoke({"input":user_input})
+        return {"query":translation["translation"]}
+
+    return translate_query
climateqa/engine/embeddings.py
CHANGED
@@ -2,7 +2,7 @@
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_community.embeddings import HuggingFaceEmbeddings
 
-def get_embeddings_function(version = "v1.2"):
+def get_embeddings_function(version = "v1.2",query_instruction = "Represent this sentence for searching relevant passages: "):
 
     if version == "v1.2":
 
@@ -10,12 +10,12 @@ def get_embeddings_function(version = "v1.2"):
         # Best embedding model at a reasonable size at the moment (2023-11-22)
 
         model_name = "BAAI/bge-base-en-v1.5"
-        encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
+        encode_kwargs = {'normalize_embeddings': True,"show_progress_bar":False} # set True to compute cosine similarity
         print("Loading embeddings model: ", model_name)
         embeddings_function = HuggingFaceBgeEmbeddings(
             model_name=model_name,
             encode_kwargs=encode_kwargs,
-            query_instruction="Represent this sentence for searching relevant passages: ",
+            query_instruction=query_instruction,
         )
 
     else:
@@ -23,3 +23,6 @@ def get_embeddings_function(version = "v1.2"):
        embeddings_function = HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
 
    return embeddings_function
+
+
+
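A usage sketch for the new query_instruction parameter; the default argument preserves the BGE prefix that was previously hard-coded:

from climateqa.engine.embeddings import get_embeddings_function

embeddings = get_embeddings_function(version="v1.2")  # default query instruction
vector = embeddings.embed_query("What are the impacts of climate change?")
print(len(vector))  # 768 dimensions for BAAI/bge-base-en-v1.5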
climateqa/engine/graph.py
ADDED
@@ -0,0 +1,32 @@
+import sys
+import os
+from contextlib import contextmanager
+
+from langchain.schema import Document
+from langgraph.graph import END, StateGraph
+from typing_extensions import TypedDict
+from typing import List
+
+from .chains.answer_chitchat import make_chitchat_node
+from .chains.answer_ai_impact import make_ai_impact_node
+from .chains.query_transform import make_query_transform_node
+from .chains.translation import make_translation_node
+from .chains.intent_routing import make_intent_router_node
+
+
+class GraphState(TypedDict):
+    """
+    Represents the state of our graph.
+    """
+    user_input : str
+    language : str
+    intent : str
+    query: str
+    questions : List[dict]
+    answer: str
+    audience: str
+    sources_input: str
+    documents: List[Document]
+
+def search(state):
+    return {}
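graph.py so far only declares the shared state and a stub search node. As a hedged sketch (assumed wiring, not in this commit), the node factories imported at the top of the file could be assembled into a langgraph workflow routed on the detected intent:

def make_graph(llm):
    # Sketch only: node names and routing are assumptions, not the committed design
    workflow = StateGraph(GraphState)

    workflow.add_node("router", make_intent_router_node(llm))
    workflow.add_node("translate", make_translation_node(llm))
    workflow.add_node("transform", make_query_transform_node(llm))
    workflow.add_node("chitchat", make_chitchat_node(llm))
    workflow.add_node("ai_impact", make_ai_impact_node(llm))

    # Branch on the intent written into the state by the router node
    workflow.set_entry_point("router")
    workflow.add_conditional_edges(
        "router",
        lambda state: state["intent"],
        {"chitchat": "chitchat", "ai_impact": "ai_impact",
         "search": "translate", "geo_info": "translate", "esg": "translate"},
    )
    workflow.add_edge("translate", "transform")
    workflow.add_edge("transform", END)
    workflow.add_edge("chitchat", END)
    workflow.add_edge("ai_impact", END)

    return workflow.compile()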
climateqa/engine/reranker.py
ADDED
@@ -0,0 +1,40 @@
+import os
+from scipy.special import expit, logit
+from rerankers import Reranker
+
+
+def get_reranker(model = "nano",cohere_api_key = None):
+
+    assert model in ["nano","tiny","small","large"]
+
+    if model == "nano":
+        reranker = Reranker('ms-marco-TinyBERT-L-2-v2', model_type='flashrank')
+    elif model == "tiny":
+        reranker = Reranker('ms-marco-MiniLM-L-12-v2', model_type='flashrank')
+    elif model == "small":
+        reranker = Reranker("mixedbread-ai/mxbai-rerank-xsmall-v1", model_type='cross-encoder')
+    elif model == "large":
+        if cohere_api_key is None:
+            cohere_api_key = os.environ["COHERE_API_KEY"]
+        reranker = Reranker("cohere", lang='en', api_key = cohere_api_key)
+    return reranker
+
+
+
+def rerank_docs(reranker,docs,query):
+
+    # Get a list of texts from langchain docs
+    input_docs = [x.page_content for x in docs]
+
+    # Rerank using rerankers library
+    results = reranker.rank(query=query, docs=input_docs)
+
+    # Prepare langchain list of docs
+    docs_reranked = []
+    for result in results.results:
+        doc_id = result.document.doc_id
+        doc = docs[doc_id]
+        doc.metadata["rerank_score"] = result.score
+        doc.metadata["query_used_for_retrieval"] = query
+        docs_reranked.append(doc)
+    return docs_reranked
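A usage sketch for the reranker pair (assuming the flashrank "nano" model can be downloaded locally; note the matching **/.flashrank_cache/ entry added to .gitignore):

from langchain.schema import Document
from climateqa.engine.reranker import get_reranker, rerank_docs

reranker = get_reranker("nano")
docs = [
    Document(page_content="Sea levels are rising due to thermal expansion and ice melt."),
    Document(page_content="Pandas mostly eat bamboo."),
]
reranked = rerank_docs(reranker, docs, query="Why are sea levels rising?")
for doc in reranked:
    print(doc.metadata["rerank_score"], doc.page_content[:40])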
requirements.txt
CHANGED
@@ -10,4 +10,5 @@ huggingface-hub
 msal
 pyalex==0.13
 networkx==3.2.1
-pyvis==0.3.2
+pyvis==0.3.2
+flashrank==0.2.5
sandbox/20240310 - CQA - Semantic Routing 1.ipynb
ADDED
The diff for this file is too large to render.