Théo ALVES DA COSTA committed on
Commit
139fefe
β€’
1 Parent(s): 6d2199d

Switched to LCEL

Browse files
climateqa/engine/__init__.py ADDED
File without changes
climateqa/engine/embeddings.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from langchain.embeddings import HuggingFaceBgeEmbeddings
3
+ from langchain.embeddings import HuggingFaceEmbeddings
4
+
5
def get_embeddings_function(version = "v1.2"):
    """Build the embeddings object matching the requested version tag.

    Parameters:
        version (str): "v1.2" selects the BGE model; every other value
            selects the multi-qa-mpnet sentence-transformers model.

    Returns:
        A ``HuggingFaceBgeEmbeddings`` instance for "v1.2", otherwise a
        ``HuggingFaceEmbeddings`` instance.
    """
    # Anything that is not the current version uses the older model.
    if version != "v1.2":
        return HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")

    # https://huggingface.co/BAAI/bge-base-en-v1.5
    # Best embedding model at a reasonable size at the moment (2023-11-22)
    return HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-base-en-v1.5",
        # set True to compute cosine similarity
        encode_kwargs={'normalize_embeddings': True},
        query_instruction="Represent this sentence for searching relevant passages: ",
    )
climateqa/{llm.py → engine/llm.py} RENAMED
File without changes
climateqa/{chains.py → engine/old/chains.py} RENAMED
@@ -10,32 +10,6 @@ from langchain.chains.qa_with_sources import load_qa_with_sources_chain
10
  from climateqa.prompts import answer_prompt, reformulation_prompt,audience_prompts
11
  from climateqa.custom_retrieval_chain import CustomRetrievalQAWithSourcesChain
12
 
13
- def load_reformulation_chain(llm):
14
-
15
- prompt = PromptTemplate(
16
- template = reformulation_prompt,
17
- input_variables=["query"],
18
- )
19
- reformulation_chain = LLMChain(llm = llm,prompt = prompt,output_key="json")
20
-
21
- # Parse the output
22
- def parse_output(output):
23
- query = output["query"]
24
- json_output = json.loads(output["json"])
25
- question = json_output.get("question", query)
26
- language = json_output.get("language", "English")
27
- return {
28
- "question": question,
29
- "language": language,
30
- }
31
-
32
- transform_chain = TransformChain(
33
- input_variables=["json"], output_variables=["question","language"], transform=parse_output
34
- )
35
-
36
- reformulation_chain = SequentialChain(chains = [reformulation_chain,transform_chain],input_variables=["query"],output_variables=["question","language"])
37
- return reformulation_chain
38
-
39
 
40
  def load_combine_documents_chain(llm):
41
  prompt = PromptTemplate(template=answer_prompt, input_variables=["summaries", "question","audience","language"])
 
10
  from climateqa.prompts import answer_prompt, reformulation_prompt,audience_prompts
11
  from climateqa.custom_retrieval_chain import CustomRetrievalQAWithSourcesChain
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def load_combine_documents_chain(llm):
15
  prompt = PromptTemplate(template=answer_prompt, input_variables=["summaries", "question","audience","language"])
climateqa/{chat.py → engine/old/chat.py} RENAMED
File without changes
climateqa/{custom_retrieval_chain.py → engine/old/custom_retrieval_chain.py} RENAMED
File without changes
climateqa/{prompts.py → engine/prompts.py} RENAMED
@@ -1,33 +1,42 @@
1
 
2
  # If the message is not relevant to climate change (like "How are you", "I am 18 years old" or "When was built the eiffel tower"), return N/A
3
 
4
- reformulation_prompt = """
5
  Reformulate the following user message to be a short standalone question in English, in the context of an educational discussion about climate change.
6
  ---
7
  query: La technologie nous sauvera-t-elle ?
8
- question: Can technology help humanity mitigate the effects of climate change?
9
- language: French
 
10
  ---
11
  query: what are our reserves in fossil fuel?
12
- question: What are the current reserves of fossil fuels and how long will they last?
13
- language: English
 
14
  ---
15
  query: what are the main causes of climate change?
16
- question: What are the main causes of climate change in the last century?
17
- language: English
 
18
  ---
19
 
 
 
 
20
  Output the result as json with two keys "question" and "language"
21
  query: {query}
22
- answer:"""
 
 
 
23
 
24
- system_prompt = """
25
  You are ClimateQ&A, an AI Assistant created by Ekimetrics, you will act as a climate scientist and answer questions about climate change and biodiversity.
26
  You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
27
  """
28
 
29
 
30
- answer_prompt = """
31
  You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
32
 
33
  Guidelines:
@@ -42,7 +51,7 @@ Guidelines:
42
 
43
  -----------------------
44
  Passages:
45
- {summaries}
46
 
47
  -----------------------
48
  Question: {question} - Explained to {audience}
 
1
 
2
  # If the message is not relevant to climate change (like "How are you", "I am 18 years old" or "When was built the eiffel tower"), return N/A
3
 
4
+ reformulation_prompt_template = """
5
  Reformulate the following user message to be a short standalone question in English, in the context of an educational discussion about climate change.
6
  ---
7
  query: La technologie nous sauvera-t-elle ?
8
+ ->
9
+ 'question': 'Can technology help humanity mitigate the effects of climate change?',
10
+ 'language': 'French',
11
  ---
12
  query: what are our reserves in fossil fuel?
13
+ ->
14
+ 'question': 'What are the current reserves of fossil fuels and how long will they last?',
15
+ 'language': 'English',
16
  ---
17
  query: what are the main causes of climate change?
18
+ ->
19
+ 'question': 'What are the main causes of climate change in the last century?',
20
+ 'language': 'English'
21
  ---
22
 
23
+ {format_instructions}
24
+
25
+ Reformulate the question in English and detect the language of the original message
26
  Output the result as json with two keys "question" and "language"
27
  query: {query}
28
+ ->
29
+ ```json
30
+ """
31
+
32
 
33
+ system_prompt_template = """
34
  You are ClimateQ&A, an AI Assistant created by Ekimetrics, you will act as a climate scientist and answer questions about climate change and biodiversity.
35
  You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
36
  """
37
 
38
 
39
+ answer_prompt_template = """
40
  You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
41
 
42
  Guidelines:
 
51
 
52
  -----------------------
53
  Passages:
54
+ {context}
55
 
56
  -----------------------
57
  Question: {question} - Explained to {audience}
climateqa/engine/rag.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from operator import itemgetter
2
+
3
+ from langchain.prompts import ChatPromptTemplate
4
+ from langchain.schema.output_parser import StrOutputParser
5
+ from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
6
+ from langchain.prompts.prompt import PromptTemplate
7
+ from langchain.schema import format_document
8
+
9
+ from climateqa.engine.reformulation import make_reformulation_chain
10
+ from climateqa.engine.prompts import answer_prompt_template
11
+ from climateqa.engine.utils import pass_values, flatten_dict
12
+
13
+ DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
14
+
15
def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
):
    """Render the documents as one string, each prefixed with "Doc <n>: ".

    Parameters:
        docs: Iterable of documents accepted by ``format_document``.
        document_prompt: Prompt used to render each document.
        sep (str): Separator placed between rendered documents.

    Returns:
        (str): The rendered documents, numbered from 1, joined with ``sep``.
    """
    numbered = []
    for position, doc in enumerate(docs, start=1):
        numbered.append(f"Doc {position}: " + format_document(doc, document_prompt))
    return sep.join(numbered)
20
+
21
+
22
def make_rag_chain(retriever,llm):
    """Assemble the LCEL pipeline: reformulation, then retrieval, then answer.

    The chain input is a mapping with at least the keys "query" and
    "audience". The final stage is a mapping with the keys "answer",
    "question", "audience", "language", "query" and "docs".

    Parameters:
        retriever: Runnable fed the reformulated "question"; its result is
            stored under "docs".
        llm: Language model used for both reformulation and answering.

    Returns:
        The composed chain ``reformulation | find_documents | answer``.
    """
    # Keys forwarded unchanged from one stage to the next.
    llm_input_keys = ["question","audience","language"]
    retrieval_keys = ["question","audience","language","query"]
    output_keys = ["question","audience","language","query","docs"]

    # Prompt that receives the combined passages and the question
    answer_prompt = ChatPromptTemplate.from_template(answer_prompt_template)

    # ------- CHAIN 0 - Reformulation
    # The parser output sits under "reformulation"; flatten_dict then lifts
    # its keys to the top level next to "audience" and "query".
    reformulation_stage = (
        {"reformulation":make_reformulation_chain(llm),**pass_values(["audience","query"])}
        | RunnablePassthrough()
        | flatten_dict
    )

    # ------- CHAIN 1 - Retrieval
    retrieval_stage = {
        "docs": itemgetter("question") | retriever,
        **pass_values(retrieval_keys)
    } | RunnablePassthrough()

    # ------- CHAIN 2 - Answer
    # Inputs for the prompt: retrieved docs are merged into "context"
    llm_inputs = {
        "context":lambda x : _combine_documents(x["docs"]),
        **pass_values(llm_input_keys)
    }

    answer_stage = {
        "answer": llm_inputs | answer_prompt | llm | StrOutputParser(),
        **pass_values(output_keys)
    }

    # ------- FINAL CHAIN
    return reformulation_stage | retrieval_stage | answer_stage
64
+
climateqa/engine/reformulation.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from langchain.output_parsers import StructuredOutputParser, ResponseSchema
3
+ from langchain.prompts import PromptTemplate
4
+ from langchain.llms import OpenAI
5
+ from langchain.chat_models import ChatOpenAI
6
+
7
+ from climateqa.engine.prompts import reformulation_prompt_template
8
+
9
+
10
+
11
+ response_schemas = [
12
+ ResponseSchema(name="language", description="The detected language of the input message"),
13
+ ResponseSchema(name="question", description="The reformulated question always in English")
14
+ ]
15
+ output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
16
+ format_instructions = output_parser.get_format_instructions()
17
+
18
+
19
def make_reformulation_chain(llm):
    """Build the chain that reformulates a user query and parses the result.

    Parameters:
        llm: Model exposing ``bind``; it is bound with a stop sequence.

    Returns:
        The chain ``prompt | llm | output_parser``, taking a "query" input.
    """
    reformulation_prompt = PromptTemplate(
        template=reformulation_prompt_template,
        input_variables=["query"],
        partial_variables={"format_instructions": format_instructions},
    )

    # Generation stops at the next code fence.
    fenced_llm = llm.bind(stop=["```"])

    return reformulation_prompt | fenced_llm | output_parser
climateqa/{retriever.py → engine/retriever.py} RENAMED
@@ -12,12 +12,16 @@ from pydantic import Field
12
  class ClimateQARetriever(BaseRetriever):
13
  vectorstore:VectorStore
14
  sources:list = ["IPCC","IPBES"]
15
- threshold:float = 22
 
16
  k_summary:int = 3
17
  k_total:int = 10
18
  namespace:str = "vectors"
19
 
20
- def get_relevant_documents(self, query: str) -> List[Document]:
 
 
 
21
 
22
  # Check if all elements in the list are either IPCC or IPBES
23
  assert isinstance(self.sources,list)
@@ -25,16 +29,20 @@ class ClimateQARetriever(BaseRetriever):
25
  assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
26
 
27
  # Prepare base search kwargs
28
- filters = {
29
- "source": { "$in":self.sources},
30
- }
 
 
 
31
 
32
  # Search for k_summary documents in the summaries dataset
33
  filters_summaries = {
34
  **filters,
35
  "report_type": { "$in":["SPM","TS"]},
36
  }
37
- docs_summaries = self.vectorstore.similarity_search_with_score(query=query,namespace = self.namespace,filter = filters_summaries,k = self.k_summary)
 
38
  docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
39
 
40
  # Search for k_total - k_summary documents in the full reports dataset
@@ -43,7 +51,7 @@ class ClimateQARetriever(BaseRetriever):
43
  "report_type": { "$nin":["SPM","TS"]},
44
  }
45
  k_full = self.k_total - len(docs_summaries)
46
- docs_full = self.vectorstore.similarity_search_with_score(query=query,namespace = self.namespace,filter = filters_full,k = k_full)
47
 
48
  # Concatenate documents
49
  docs = docs_summaries + docs_full
@@ -57,11 +65,13 @@ class ClimateQARetriever(BaseRetriever):
57
  doc.metadata["similarity_score"] = score
58
  doc.metadata["content"] = doc.page_content
59
  doc.metadata["page_number"] = int(doc.metadata["page_number"])
60
- doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
61
  results.append(doc)
62
 
63
- return results
 
64
 
 
65
 
66
 
67
 
 
12
  class ClimateQARetriever(BaseRetriever):
13
  vectorstore:VectorStore
14
  sources:list = ["IPCC","IPBES"]
15
+ reports:list = []
16
+ threshold:float = 0.4
17
  k_summary:int = 3
18
  k_total:int = 10
19
  namespace:str = "vectors"
20
 
21
+
22
+ def _get_relevant_documents(
23
+ self, query: str, *, run_manager: CallbackManagerForRetrieverRun
24
+ ) -> List[Document]:
25
 
26
  # Check if all elements in the list are either IPCC or IPBES
27
  assert isinstance(self.sources,list)
 
29
  assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
30
 
31
  # Prepare base search kwargs
32
+
33
+ filters = {}
34
+ if len(self.reports) > 0:
35
+ filters["short_name"] = {"$in":self.reports}
36
+ else:
37
+ filters["source"] = { "$in":self.sources}
38
 
39
  # Search for k_summary documents in the summaries dataset
40
  filters_summaries = {
41
  **filters,
42
  "report_type": { "$in":["SPM","TS"]},
43
  }
44
+
45
+ docs_summaries = self.vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = self.k_summary)
46
  docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
47
 
48
  # Search for k_total - k_summary documents in the full reports dataset
 
51
  "report_type": { "$nin":["SPM","TS"]},
52
  }
53
  k_full = self.k_total - len(docs_summaries)
54
+ docs_full = self.vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
55
 
56
  # Concatenate documents
57
  docs = docs_summaries + docs_full
 
65
  doc.metadata["similarity_score"] = score
66
  doc.metadata["content"] = doc.page_content
67
  doc.metadata["page_number"] = int(doc.metadata["page_number"])
68
+ # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
69
  results.append(doc)
70
 
71
+ # Sort by score
72
+ # results = sorted(results,key = lambda x : x.metadata["similarity_score"],reverse = True)
73
 
74
+ return results
75
 
76
 
77
 
climateqa/engine/utils.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import Any, Dict, Iterable, Tuple, Union
3
+ from operator import itemgetter
4
+
5
def pass_values(x):
    """Map each key to an ``itemgetter`` that extracts it from a mapping.

    The result is meant to be spread into a dict of runnables so the named
    keys are forwarded unchanged to the next stage.

    Parameters:
        x: A single key, or a list/tuple of keys. A tuple is treated as a
            collection of keys rather than as one tuple-valued key.

    Returns:
        (dict): ``{key: itemgetter(key)}`` for every requested key.
    """
    keys = list(x) if isinstance(x, (list, tuple)) else [x]
    return {key: itemgetter(key) for key in keys}
8
+
9
+
10
+ # Drawn from langchain utils and modified to remove the parent key
11
+ def _flatten_dict(
12
+ nested_dict: Dict[str, Any], parent_key: str = "", sep: str = "_"
13
+ ) -> Iterable[Tuple[str, Any]]:
14
+ """
15
+ Generator that yields flattened items from a nested dictionary for a flat dict.
16
+
17
+ Parameters:
18
+ nested_dict (dict): The nested dictionary to flatten.
19
+ parent_key (str): The prefix to prepend to the keys of the flattened dict.
20
+ sep (str): The separator to use between the parent key and the key of the
21
+ flattened dictionary.
22
+
23
+ Yields:
24
+ (str, any): A key-value pair from the flattened dictionary.
25
+ """
26
+ for key, value in nested_dict.items():
27
+ new_key = key
28
+ if isinstance(value, dict):
29
+ yield from _flatten_dict(value, new_key, sep)
30
+ else:
31
+ yield new_key, value
32
+
33
+
34
def flatten_dict(
    nested_dict: Dict[str, Any], parent_key: str = "", sep: str = "_"
) -> Dict[str, Any]:
    """Flattens a nested dictionary into a flat dictionary of its leaves.

    Keys are not prefixed with their parent key. When the same leaf key
    occurs more than once, the value yielded last wins.

    Parameters:
        nested_dict (dict): The nested dictionary to flatten.
        parent_key (str): Forwarded to ``_flatten_dict``, which does not use
            it to build keys; kept for signature compatibility.
        sep (str): Forwarded to ``_flatten_dict``, which does not use it to
            build keys; kept for signature compatibility.

    Returns:
        (dict): A flat dictionary.

    """
    return dict(_flatten_dict(nested_dict, parent_key, sep))
climateqa/engine/vectorstore.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pinecone
2
+ # More info at https://docs.pinecone.io/docs/langchain
3
+ # And https://python.langchain.com/docs/integrations/vectorstores/pinecone
4
+ import os
5
+ import pinecone
6
+ from langchain.vectorstores import Pinecone
7
+
8
+ # LOAD ENVIRONMENT VARIABLES
9
# Load variables from a local .env file when python-dotenv is installed.
# The package is optional: only its absence is ignored, so real errors are
# no longer swallowed by a bare ``except``.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass
14
+
15
+
16
def get_pinecone_vectorstore(embeddings,text_key = "text"):
    """Connect to Pinecone and wrap an existing index as a vector store.

    Reads PINECONE_API_KEY, PINECONE_API_ENVIRONMENT and PINECONE_API_INDEX
    from the environment.

    Parameters:
        embeddings: Embeddings object passed to ``Pinecone.from_existing_index``.
        text_key (str): Passed through as ``text_key``.

    Returns:
        The ``Pinecone`` vector store built from the existing index.
    """
    api_key = os.getenv("PINECONE_API_KEY")  # find at app.pinecone.io
    environment = os.getenv("PINECONE_API_ENVIRONMENT")  # next to api key in console

    # The client must be initialised before the index is opened.
    pinecone.init(api_key=api_key, environment=environment)

    return Pinecone.from_existing_index(
        os.getenv("PINECONE_API_INDEX"),
        embeddings,
        text_key = text_key,
    )
27
+
28
+
29
+ # def get_pinecone_retriever(vectorstore,k = 10,namespace = "vectors",sources = ["IPBES","IPCC"]):
30
+
31
+ # assert isinstance(sources,list)
32
+
33
+ # # Check if all elements in the list are either IPCC or IPBES
34
+ # filter = {
35
+ # "source": { "$in":sources},
36
+ # }
37
+
38
+ # retriever = vectorstore.as_retriever(search_kwargs={
39
+ # "k": k,
40
+ # "namespace":"vectors",
41
+ # "filter":filter
42
+ # })
43
+
44
+ # return retriever
climateqa/parser/__init__.py ADDED
File without changes