timeki committed
Commit aa904c1 · Parents: 6b43c86 25e32e6

Merge branch 'bugfix/add_dummy_searchs' into feature/graph_recommandation

app.py CHANGED
@@ -1,7 +1,7 @@
 from climateqa.engine.embeddings import get_embeddings_function
 embeddings_function = get_embeddings_function()
 
-from climateqa.papers.openalex import OpenAlex
+from climateqa.knowledge.openalex import OpenAlex
 from sentence_transformers import CrossEncoder
 
 # reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
@@ -31,7 +31,7 @@ from collections import defaultdict
 # ClimateQ&A imports
 from climateqa.engine.llm import get_llm
 from climateqa.engine.vectorstore import get_pinecone_vectorstore
-from climateqa.engine.retriever import ClimateQARetriever
+from climateqa.knowledge.retriever import ClimateQARetriever
 from climateqa.engine.reranker import get_reranker
 from climateqa.engine.embeddings import get_embeddings_function
 from climateqa.engine.chains.prompts import audience_prompts
climateqa/engine/chains/keywords_extraction.py ADDED
@@ -0,0 +1,40 @@
+
+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+
+class KeywordExtraction(BaseModel):
+    """
+    Analyzing the user query to extract keywords to feed a search engine
+    """
+
+    keywords: List[str] = Field(
+        description="""
+        Extract the keywords from the user query to feed a search engine as a list
+        Avoid adding super specific keywords to prefer general keywords
+        Maximum 3 keywords
+
+        Examples:
+        - "What is the impact of deep sea mining ?" -> ["deep sea mining"]
+        - "How will El Nino be impacted by climate change" -> ["el nino","climate change"]
+        - "Is climate change a hoax" -> ["climate change","hoax"]
+        """
+    )
+
+
+def make_keywords_extraction_chain(llm):
+
+    openai_functions = [convert_to_openai_function(KeywordExtraction)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"KeywordExtraction"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
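
As a usage sketch (not part of this commit), the new chain can be exercised like this, assuming a function-calling model obtained through the repo's get_llm:

from climateqa.engine.llm import get_llm
from climateqa.engine.chains.keywords_extraction import make_keywords_extraction_chain

llm = get_llm(provider="openai")  # any OpenAI-style function-calling model
chain = make_keywords_extraction_chain(llm)

# JsonOutputFunctionsParser returns the parsed function arguments as a dict,
# e.g. {"keywords": ["deep sea mining"]}
result = chain.invoke({"input": "What is the impact of deep sea mining?"})
print(result["keywords"])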
climateqa/engine/chains/query_transformation.py CHANGED
@@ -8,6 +8,13 @@ from langchain_core.utils.function_calling import convert_to_openai_function
 from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
 
 
+ROUTING_INDEX = {
+    "Vector":["IPCC","IPBES","IPOS"],
+    "OpenAlex":["OpenAlex"],
+}
+
+POSSIBLE_SOURCES = [y for values in ROUTING_INDEX.values() for y in values]
+
 # Prompt from the original paper https://arxiv.org/pdf/2305.14283
 # Query Rewriting for Retrieval-Augmented Large Language Models
 class QueryDecomposition(BaseModel):
@@ -20,8 +27,8 @@ class QueryDecomposition(BaseModel):
     description="""
     Think step by step to answer this question, and provide one or several search engine questions in English for knowledge that you need.
     Suppose that the user is looking for information about climate change, energy, biodiversity, nature, and everything we can find the IPCC reports and scientific literature
-    - If it's already a standalone question, you don't need to provide more questions, just reformulate it if relevant as a better question for a search engine
-    - If you need to decompose the question, output a list of maximum 3 questions
+    - If it's already a standalone and explicit question, just return the reformulated question for the search engine
+    - If you need to decompose the question, output a list of maximum 2 to 3 questions
     """
     )
 
@@ -125,12 +132,20 @@ def make_query_rewriter_chain(llm):
     return chain
 
 
-def make_query_transform_node(llm):
+def make_query_transform_node(llm,k_final=15):
 
     decomposition_chain = make_query_decomposition_chain(llm)
     rewriter_chain = make_query_rewriter_chain(llm)
 
     def transform_query(state):
+
+        if "sources_auto" not in state or state["sources_auto"] is None or state["sources_auto"] is False:
+            auto_mode = False
+        else:
+            auto_mode = True
+
+        sources_input = state.get("sources_input")
+        if sources_input is None: sources_input = ROUTING_INDEX["Vector"]
 
         new_state = {}
 
@@ -150,7 +165,33 @@ def make_query_transform_node(llm):
 
             question_state.update(analysis_output)
             questions.append(question_state)
-        new_state["questions"] = questions
+
+        # Explode the questions into multiple questions with different sources
+        new_questions = []
+        for q in questions:
+            question,sources = q["question"],q["sources"]
+
+            # If not auto mode we take the configuration
+            if not auto_mode:
+                sources = sources_input
+
+            for index,index_sources in ROUTING_INDEX.items():
+                selected_sources = list(set(sources).intersection(index_sources))
+                if len(selected_sources) > 0:
+                    new_questions.append({"question":question,"sources":selected_sources,"index":index})
+
+        # # Add the number of questions to search
+        # k_by_question = k_final // len(new_questions)
+        # for q in new_questions:
+        #     q["k"] = k_by_question
+
+        # new_state["questions"] = new_questions
+        # new_state["remaining_questions"] = new_questions
+
+        new_state = {
+            "remaining_questions":new_questions,
+            "n_questions":len(new_questions),
+        }
 
         return new_state
 
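To illustrate the new explode step, here is a standalone sketch of the routing logic with a hypothetical question (same ROUTING_INDEX as above, loop extracted verbatim):

ROUTING_INDEX = {
    "Vector": ["IPCC", "IPBES", "IPOS"],
    "OpenAlex": ["OpenAlex"],
}

questions = [{"question": "How will El Nino be impacted by climate change?",
              "sources": ["IPCC", "OpenAlex"]}]

new_questions = []
for q in questions:
    for index, index_sources in ROUTING_INDEX.items():
        selected = list(set(q["sources"]).intersection(index_sources))
        if len(selected) > 0:
            new_questions.append({"question": q["question"], "sources": selected, "index": index})

# One entry per index that matched, so this question is searched twice:
# [{'question': ..., 'sources': ['IPCC'], 'index': 'Vector'},
#  {'question': ..., 'sources': ['OpenAlex'], 'index': 'OpenAlex'}]
print(new_questions)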
climateqa/engine/chains/retrieve_documents.py ADDED
@@ -0,0 +1,159 @@
+import sys
+import os
+from contextlib import contextmanager
+
+from langchain_core.tools import tool
+from langchain_core.runnables import chain
+from langchain_core.runnables import RunnableParallel, RunnablePassthrough
+from langchain_core.runnables import RunnableLambda
+
+from ..reranker import rerank_docs
+from ...knowledge.retriever import ClimateQARetriever
+from ...knowledge.openalex import OpenAlexRetriever
+from .keywords_extraction import make_keywords_extraction_chain
+from ..utils import log_event
+
+
+
+def divide_into_parts(target, parts):
+    # Base value for each part
+    base = target // parts
+    # Remainder to distribute
+    remainder = target % parts
+    # List to hold the result
+    result = []
+
+    for i in range(parts):
+        if i < remainder:
+            # These parts get base value + 1
+            result.append(base + 1)
+        else:
+            # The rest get the base value
+            result.append(base)
+
+    return result
+
+
+@contextmanager
+def suppress_output():
+    # Open a null device
+    with open(os.devnull, 'w') as devnull:
+        # Store the original stdout and stderr
+        old_stdout = sys.stdout
+        old_stderr = sys.stderr
+        # Redirect stdout and stderr to the null device
+        sys.stdout = devnull
+        sys.stderr = devnull
+        try:
+            yield
+        finally:
+            # Restore stdout and stderr
+            sys.stdout = old_stdout
+            sys.stderr = old_stderr
+
+
+@tool
+def query_retriever(question):
+    """Just a dummy tool to simulate the retriever query"""
+    return question
+
+
+
+def make_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
+
+    # The chain callback is not necessary, but it propagates the langchain callbacks to the astream_events logger to display intermediate results
+    @chain
+    async def retrieve_documents(state,config):
+
+        keywords_extraction = make_keywords_extraction_chain(llm)
+
+        current_question = state["remaining_questions"][0]
+        remaining_questions = state["remaining_questions"][1:]
+
+        # ToolMessage(f"Retrieving documents for question: {current_question['question']}",tool_call_id = "retriever")
+
+        # # There are several options to get the final top k
+        # # Option 1 - Get 100 documents by question and rerank by question
+        # # Option 2 - Get 100/n documents by question and rerank the total
+        # if rerank_by_question:
+        #     k_by_question = divide_into_parts(k_final,len(questions))
+
+        # docs = state["documents"]
+        # if docs is None: docs = []
+
+        docs = []
+        k_by_question = k_final // state["n_questions"]
+
+        sources = current_question["sources"]
+        question = current_question["question"]
+        index = current_question["index"]
+
+        await log_event({"question":question,"sources":sources,"index":index},"log_retriever",config)
+
+        if index == "Vector":
+
+            # Search the document store using the retriever
+            # Configure high top k for further reranking step
+            retriever = ClimateQARetriever(
+                vectorstore=vectorstore,
+                sources = sources,
+                min_size = 200,
+                k_summary = k_summary,
+                k_total = k_before_reranking,
+                threshold = 0.5,
+            )
+            docs_question = await retriever.ainvoke(question,config)
+
+        elif index == "OpenAlex":
+
+            keywords = keywords_extraction.invoke(question)["keywords"]
+            openalex_query = " AND ".join(keywords)
+
+            print(f"... OpenAlex query: {openalex_query}")
+
+            retriever_openalex = OpenAlexRetriever(
+                min_year = state.get("min_year",1960),
+                max_year = state.get("max_year",None),
+                k = k_before_reranking
+            )
+            docs_question = await retriever_openalex.ainvoke(openalex_query,config)
+
+        else:
+            raise Exception(f"Index {index} not found in the routing index")
+
+        # Rerank
+        if reranker is not None:
+            with suppress_output():
+                docs_question = rerank_docs(reranker,docs_question,question)
+        else:
+            # Add a default reranking score
+            for doc in docs_question:
+                doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
+
+        # If rerank by question we select the top documents for each question
+        if rerank_by_question:
+            docs_question = docs_question[:k_by_question]
+
+        # Add sources used in the metadata
+        for doc in docs_question:
+            doc.metadata["sources_used"] = sources
+            doc.metadata["question_used"] = question
+            doc.metadata["index_used"] = index
+
+        # Add to the list of docs
+        docs.extend(docs_question)
+
+        # Sorting the list in descending order by rerank_score
+        docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
+        new_state = {"documents":docs,"remaining_questions":remaining_questions}
+        return new_state
+
+    return retrieve_documents
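
A quick check (not part of the commit) of the helper divide_into_parts, which the commented-out option above would use to split k_final across questions:

from climateqa.engine.chains.retrieve_documents import divide_into_parts

# 15 documents across 4 questions: the remainder goes to the first parts
assert divide_into_parts(15, 4) == [4, 4, 4, 3]

# The active code path uses plain integer division instead:
# k_by_question = k_final // n_questions, i.e. 15 // 4 = 3 per question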
climateqa/engine/graph.py CHANGED
@@ -16,10 +16,9 @@ from .chains.answer_ai_impact import make_ai_impact_node
 from .chains.query_transformation import make_query_transform_node
 from .chains.translation import make_translation_node
 from .chains.intent_categorization import make_intent_categorization_node
-from .chains.retriever import make_retriever_node
+from .chains.retrieve_documents import make_retriever_node
 from .chains.answer_rag import make_rag_node
 
-
 class GraphState(TypedDict):
     """
     Represents the state of our graph.
@@ -29,23 +28,30 @@ class GraphState(TypedDict):
     intent : str
     search_graphs_chitchat : bool
     query: str
-    questions : List[dict]
+    remaining_questions : List[dict]
+    n_questions : int
     answer: str
    audience: str = "experts"
-    sources_input: List[str] = ["auto"]
+    sources_input: List[str] = ["IPCC","IPBES"]
+    sources_auto: bool = True
+    min_year: int = 1960
+    max_year: int = None
     documents: List[Document]
     recommended_content : List[Document]
     # graphs_returned: Dict[str,str]
 
-def search(state):
-    return {}
+def search(state): #TODO
+    return state
+
+def answer_search(state): #TODO
+    return state
 
 def route_intent(state):
     intent = state["intent"]
     if intent in ["chitchat","esg"]:
         return "answer_chitchat"
-    elif intent == "ai_impact":
-        return "answer_ai_impact"
+    # elif intent == "ai_impact":
+    #     return "answer_ai_impact"
     else:
         # Search route
         return "search"
@@ -95,6 +101,7 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, reranker, thresh
     workflow.add_node("set_defaults", set_defaults)
     workflow.add_node("categorize_intent", categorize_intent)
     workflow.add_node("search", search)
+    workflow.add_node("answer_search", answer_search)
     workflow.add_node("transform_query", transform_query)
     workflow.add_node("translate_query", translate_query)
     # workflow.add_node("transform_query_ai", transform_query)
@@ -118,7 +125,7 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, reranker, thresh
     workflow.add_conditional_edges(
         "categorize_intent",
         route_intent,
-        make_id_dict(["answer_chitchat","answer_ai_impact","search"])
+        make_id_dict(["answer_chitchat","search"])
     )
 
     workflow.add_conditional_edges(
@@ -132,9 +139,14 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, reranker, thresh
         route_translation,
         make_id_dict(["translate_query","transform_query"])
     )
-
     workflow.add_conditional_edges(
         "retrieve_documents",
+        lambda state : "retrieve_documents" if len(state["remaining_questions"]) > 0 else "answer_search",
+        make_id_dict(["retrieve_documents","answer_search"])
+    )
+
+    workflow.add_conditional_edges(
+        "answer_search",
         lambda x : route_based_on_relevant_docs(x,threshold_docs=threshold_docs),
         make_id_dict(["answer_rag","answer_rag_no_docs"])
     )
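
The new conditional edge makes retrieve_documents loop over itself until remaining_questions is empty, then hand off to answer_search. A minimal self-contained sketch of that pattern (not the project's graph; synchronous dummy nodes instead of the async retriever):

from typing import List, TypedDict
from langgraph.graph import StateGraph, END

class State(TypedDict):
    remaining_questions: List[str]
    documents: List[str]

def retrieve_documents(state: State) -> dict:
    # Pop one question and pretend to retrieve a document for it
    question, *rest = state["remaining_questions"]
    return {"remaining_questions": rest,
            "documents": state["documents"] + [f"doc for {question}"]}

def answer_search(state: State) -> dict:
    # Placeholder terminal node, like the #TODO stub above
    return {}

workflow = StateGraph(State)
workflow.add_node("retrieve_documents", retrieve_documents)
workflow.add_node("answer_search", answer_search)
workflow.set_entry_point("retrieve_documents")
workflow.add_conditional_edges(
    "retrieve_documents",
    lambda s: "retrieve_documents" if len(s["remaining_questions"]) > 0 else "answer_search",
    {"retrieve_documents": "retrieve_documents", "answer_search": "answer_search"},
)
workflow.add_edge("answer_search", END)
app = workflow.compile()

# documents ends up with one entry per question
print(app.invoke({"remaining_questions": ["q1", "q2"], "documents": []}))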
climateqa/engine/llm/__init__.py CHANGED
@@ -1,5 +1,6 @@
 from climateqa.engine.llm.openai import get_llm as get_openai_llm
 from climateqa.engine.llm.azure import get_llm as get_azure_llm
+from climateqa.engine.llm.ollama import get_llm as get_ollama_llm
 
 
 def get_llm(provider="openai",**kwargs):
@@ -8,6 +9,8 @@ def get_llm(provider="openai",**kwargs):
         return get_openai_llm(**kwargs)
     elif provider == "azure":
         return get_azure_llm(**kwargs)
+    elif provider == "ollama":
+        return get_ollama_llm(**kwargs)
     else:
         raise ValueError(f"Unknown provider: {provider}")
 
climateqa/engine/llm/ollama.py ADDED
@@ -0,0 +1,6 @@
+
+
+from langchain_community.llms import Ollama
+
+def get_llm(model="llama3", **kwargs):
+    return Ollama(model=model, **kwargs)
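
A usage sketch for the new provider (assumes a local Ollama server is running and the llama3 model has been pulled):

from climateqa.engine.llm import get_llm

# Routed through the new elif branch in get_llm
llm = get_llm(provider="ollama", model="llama3")
print(llm.invoke("Say hello in one word."))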
climateqa/engine/utils.py CHANGED
@@ -1,8 +1,15 @@
 from operator import itemgetter
 from typing import Any, Dict, Iterable, Tuple
+import tiktoken
 from langchain_core.runnables import RunnablePassthrough
 
 
+def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
+    encoding = tiktoken.get_encoding(encoding_name)
+    num_tokens = len(encoding.encode(string))
+    return num_tokens
+
+
 def pass_values(x):
     if not isinstance(x, list):
         x = [x]
@@ -67,3 +74,13 @@ def flatten_dict(
     """
     flat_dict = {k: v for k, v in _flatten_dict(nested_dict, parent_key, sep)}
     return flat_dict
+
+
+
+async def log_event(info,name,config):
+    """Helper function that will run a dummy chain with the given info
+    The astream_event function will catch this chain and stream the dict info to the logger
+    """
+
+    chain = RunnablePassthrough().with_config(run_name=name)
+    _ = await chain.ainvoke(info,config)
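
For example, the new token counter can be checked in isolation (requires the tiktoken dependency it imports):

from climateqa.engine.utils import num_tokens_from_string

text = "What is the impact of deep sea mining?"
# cl100k_base is the encoding used by OpenAI's GPT-3.5/GPT-4 family
print(num_tokens_from_string(text))  # number of tokens, not characters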
climateqa/knowledge/__init__.py ADDED
File without changes
climateqa/{papers → knowledge}/openalex.py RENAMED
@@ -3,18 +3,32 @@ import networkx as nx
 import matplotlib.pyplot as plt
 from pyvis.network import Network
 
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.vectorstores import VectorStoreRetriever
+from langchain_core.documents.base import Document
+from langchain_core.vectorstores import VectorStore
+from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
+
+from ..engine.utils import num_tokens_from_string
+
+from typing import List
+from pydantic import Field
+
 from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
 import pyalex
 
 pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
 
+
+def replace_nan_with_empty_dict(x):
+    return x if pd.notna(x) else {}
+
 class OpenAlex():
     def __init__(self):
         pass
 
 
-
-    def search(self,keywords,n_results = 100,after = None,before = None):
+    def search(self,keywords:str,n_results = 100,after = None,before = None):
 
         if isinstance(keywords,str):
             works = Works().search(keywords)
@@ -27,18 +41,21 @@ class OpenAlex():
                 break
 
             df_works = pd.DataFrame(page)
-            df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x))
+            df_works = df_works.dropna(subset = ["title"])
+            df_works["primary_location"] = df_works["primary_location"].map(replace_nan_with_empty_dict)
+            df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x)).fillna("")
             df_works["is_oa"] = df_works["open_access"].map(lambda x : x.get("is_oa",False))
             df_works["pdf_url"] = df_works["primary_location"].map(lambda x : x.get("pdf_url",None))
-            df_works["content"] = df_works["title"] + "\n" + df_works["abstract"]
-
+            df_works["url"] = df_works["id"]
+            df_works["content"] = (df_works["title"] + "\n" + df_works["abstract"]).map(lambda x : x.strip())
+            df_works["num_tokens"] = df_works["content"].map(lambda x : num_tokens_from_string(x))
+
+            df_works = df_works.drop(columns = ["abstract_inverted_index"])
+            # df_works["subtitle"] = df_works["title"] + " - " + df_works["primary_location"]["source"]["display_name"] + " - " + df_works["publication_year"]
+
+            return df_works
         else:
-            df_works = []
-            for keyword in keywords:
-                df_keyword = self.search(keyword,n_results = n_results,after = after,before = before)
-                df_works.append(df_keyword)
-            df_works = pd.concat(df_works,ignore_index=True,axis = 0)
-        return df_works
+            raise Exception("Keywords must be a string")
 
 
     def rerank(self,query,df,reranker):
@@ -139,4 +156,36 @@ class OpenAlex():
                 reconstructed[position] = token
 
         # Join the tokens to form the reconstructed sentence(s)
-        return ' '.join(reconstructed)
+        return ' '.join(reconstructed)
+
+
+
+class OpenAlexRetriever(BaseRetriever):
+    min_year:int = 1960
+    max_year:int = None
+    k:int = 100
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+
+        openalex = OpenAlex()
+
+        # Search for documents
+        df_docs = openalex.search(query,n_results=self.k,after = self.min_year,before = self.max_year)
+
+        docs = []
+        for i,row in df_docs.iterrows():
+            num_tokens = row["num_tokens"]
+
+            if num_tokens < 50 or num_tokens > 1000:
+                continue
+
+            doc = Document(
+                page_content = row["content"],
+                metadata = row.to_dict()
+            )
+            docs.append(doc)
+        return docs
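
A usage sketch for the new retriever (not in the commit; it queries the live OpenAlex API, so results vary):

from climateqa.knowledge.openalex import OpenAlexRetriever

retriever = OpenAlexRetriever(min_year=2000, max_year=2020, k=20)

# BaseRetriever exposes invoke(); papers outside the 50-1000 token
# window are filtered out by _get_relevant_documents
docs = retriever.invoke("deep sea mining AND biodiversity")
for doc in docs[:3]:
    print(doc.metadata["title"], doc.metadata["url"])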
climateqa/{engine → knowledge}/retriever.py RENAMED
@@ -67,6 +67,7 @@ class ClimateQARetriever(BaseRetriever):
         # Add score to metadata
         results = []
         for i,(doc,score) in enumerate(docs):
+            doc.page_content = doc.page_content.replace("\r\n"," ")
             doc.metadata["similarity_score"] = score
             doc.metadata["content"] = doc.page_content
             doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
@@ -79,86 +80,3 @@ class ClimateQARetriever(BaseRetriever):
         return results
 
 
-
-
-# def filter_summaries(df,k_summary = 3,k_total = 10):
-#     # assert source in ["IPCC","IPBES","ALL"], "source arg should be in (IPCC,IPBES,ALL)"
-
-#     # # Filter by source
-#     # if source == "IPCC":
-#     #     df = df.loc[df["source"]=="IPCC"]
-#     # elif source == "IPBES":
-#     #     df = df.loc[df["source"]=="IPBES"]
-#     # else:
-#     #     pass
-
-#     # Separate summaries and full reports
-#     df_summaries = df.loc[df["report_type"].isin(["SPM","TS"])]
-#     df_full = df.loc[~df["report_type"].isin(["SPM","TS"])]
-
-#     # Find passages from summaries dataset
-#     passages_summaries = df_summaries.head(k_summary)
-
-#     # Find passages from full reports dataset
-#     passages_fullreports = df_full.head(k_total - len(passages_summaries))
-
-#     # Concatenate passages
-#     passages = pd.concat([passages_summaries,passages_fullreports],axis = 0,ignore_index = True)
-#     return passages
-
-
-
-
-# def retrieve_with_summaries(query,retriever,k_summary = 3,k_total = 10,sources = ["IPCC","IPBES"],max_k = 100,threshold = 0.555,as_dict = True,min_length = 300):
-#     assert max_k > k_total
-
-#     validated_sources = ["IPCC","IPBES"]
-#     sources = [x for x in sources if x in validated_sources]
-#     filters = {
-#         "source": { "$in": sources },
-#     }
-#     print(filters)
-
-#     # Retrieve documents
-#     docs = retriever.retrieve(query,top_k = max_k,filters = filters)
-
-#     # Filter by score
-#     docs = [{**x.meta,"score":x.score,"content":x.content} for x in docs if x.score > threshold]
-
-#     if len(docs) == 0:
-#         return []
-#     res = pd.DataFrame(docs)
-#     passages_df = filter_summaries(res,k_summary,k_total)
-#     if as_dict:
-#         contents = passages_df["content"].tolist()
-#         meta = passages_df.drop(columns = ["content"]).to_dict(orient = "records")
-#         passages = []
-#         for i in range(len(contents)):
-#             passages.append({"content":contents[i],"meta":meta[i]})
-#         return passages
-#     else:
-#         return passages_df
-
-
-
-# def retrieve(query,sources = ["IPCC"],threshold = 0.555,k = 10):
-
-
-#     print("hellooooo")
-
-#     # Reformulate queries
-#     reformulated_query,language = reformulate(query)
-
-#     print(reformulated_query)
-
-#     # Retrieve documents
-#     passages = retrieve_with_summaries(reformulated_query,retriever,k_total = k,k_summary = 3,as_dict = True,sources = sources,threshold = threshold)
-#     response = {
-#         "query":query,
-#         "reformulated_query":reformulated_query,
-#         "language":language,
-#         "sources":passages,
-#         "prompts":{"init_prompt":init_prompt,"sources_prompt":sources_prompt},
-#     }
-#     return response
-
climateqa/papers/__init__.py DELETED
@@ -1,43 +0,0 @@
-import pandas as pd
-
-from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
-import pyalex
-
-pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
-
-class OpenAlex():
-    def __init__(self):
-        pass
-
-
-
-    def search(self,keywords,n_results = 100,after = None,before = None):
-        works = Works().search(keywords).get()
-
-        for page in works.paginate(per_page=n_results):
-            break
-
-        df_works = pd.DataFrame(page)
-
-        return works
-
-
-    def make_network(self):
-        pass
-
-
-    def get_abstract_from_inverted_index(self,index):
-
-        # Determine the maximum index to know the length of the reconstructed array
-        max_index = max([max(positions) for positions in index.values()])
-
-        # Initialize a list with placeholders for all positions
-        reconstructed = [''] * (max_index + 1)
-
-        # Iterate through the inverted index and place each token at its respective position(s)
-        for token, positions in index.items():
-            for position in positions:
-                reconstructed[position] = token
-
-        # Join the tokens to form the reconstructed sentence(s)
-        return ' '.join(reconstructed)
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-gradio==4.19.1
+gradio==4.44
 azure-storage-file-share==12.11.1
 azure-storage-blob
 python-dotenv==1.0.0
@@ -15,3 +15,5 @@ flashrank==0.2.5
 rerankers==0.3.0
 torch==2.3.0
 nvidia-cudnn-cu12==8.9.2.26
+langchain-community==0.2
+msal==1.31
sandbox/20240310 - CQA - Semantic Routing 1.ipynb CHANGED
The diff for this file is too large to render. See raw diff