Spaces:

Ekimetrics
/

climate-question-answering

Running

App Files Files Community

timeki committed 28 days ago

Commit

40084ba

•

1 Parent(s): 4c4fe76

fix answer latency when having multiple sources

Browse files

Files changed (8) hide show

app.py +68 -52
climateqa/engine/chains/graph_retriever.py +68 -67
climateqa/engine/chains/retrieve_documents.py +115 -30
climateqa/engine/graph.py +12 -3
climateqa/engine/graph_retriever.py +54 -14
climateqa/engine/reranker.py +2 -0
climateqa/knowledge/retriever.py +95 -94
sandbox/20241104 - CQA - StepByStep CQA.ipynb +0 -0

app.py CHANGED Viewed

@@ -120,7 +120,7 @@ reranker = get_reranker("nano")
 agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, reranker=reranker)
-async def chat(query,history,audience,sources,reports,current_graphs):
     """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
     (messages in gradio format, messages in langchain format, source documents)"""
@@ -136,7 +136,7 @@ async def chat(query,history,audience,sources,reports,current_graphs):
     if reports is None or len(reports) == 0:
         reports = []
-    inputs = {"user_input": query,"audience": audience_prompt,"sources_input":sources}
     result = agent.astream_events(inputs,version = "v1")
@@ -167,7 +167,16 @@ async def chat(query,history,audience,sources,reports,current_graphs):
                 if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents" :# when documents are retrieved
                     docs, docs_html, history, used_documents, related_contents = handle_retrieved_documents(event, history, used_documents)
                 elif event["name"] in steps_display.keys() and event["event"] == "on_chain_start": #display steps
                     event_description, display_output = steps_display[node]
                     if not hasattr(history[-1], 'metadata') or history[-1].metadata["title"] != event_description: # if a new step begins
@@ -260,59 +269,59 @@ papers_cols = list(papers_cols_widths.keys())
 papers_cols_widths = list(papers_cols_widths.values())
-async def find_papers(query,after):
-    summary = ""
-    keywords = generate_keywords(query)
-    df_works = oa.search(keywords,after = after)
-    df_works = df_works.dropna(subset=["abstract"])
-    df_works = oa.rerank(query,df_works,reranker)
-    df_works = df_works.sort_values("rerank_score",ascending=False)
-    docs_html = []
-    for i in range(10):
-        docs_html.append(make_html_df(df_works, i))
-    docs_html = "".join(docs_html)
-    print(docs_html)
-    G = oa.make_network(df_works)
-    height = "750px"
-    network = oa.show_network(G,color_by = "rerank_score",notebook=False,height = height)
-    network_html = network.generate_html()
-    network_html = network_html.replace("'", "\"")
-    css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
-    network_html = network_html + css_to_inject
-    network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
-    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
-    allow-scripts allow-same-origin allow-popups
-    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
-    allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
-    docs = df_works["content"].head(10).tolist()
-    df_works = df_works.reset_index(drop = True).reset_index().rename(columns = {"index":"doc"})
-    df_works["doc"] = df_works["doc"] + 1
-    df_works = df_works[papers_cols]
-    yield docs_html, network_html, summary
-    chain = make_rag_papers_chain(llm)
-    result = chain.astream_log({"question": query,"docs": docs,"language":"English"})
-    path_answer = "/logs/StrOutputParser/streamed_output/-"
-    async for op in result:
-        op = op.ops[0]
-        if op['path'] == path_answer: # reforulated question
-            new_token = op['value'] # str
-            summary += new_token
-        else:
-            continue
-        yield docs_html, network_html, summary
@@ -473,7 +482,13 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
             value=["IPCC"],
             interactive=True,
         )
         dropdown_reports = gr.Dropdown(
             POSSIBLE_REPORTS,
             label="Or select specific reports",
@@ -488,9 +503,10 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
             value="Experts",
             interactive=True,
         )
-        output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False)
-        output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False)
@@ -603,14 +619,14 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
     (textbox
         .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
-        .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, current_graphs] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
         # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_recommended_content, tab_papers] )
     )
     (examples_hidden
         .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
-        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, current_graphs] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
         # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_recommended_content, tab_papers] )
     )
@@ -633,8 +649,8 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
     dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
-    textbox.submit(find_papers,[textbox,after], [papers_html,citations_network,papers_summary])
-    examples_hidden.change(find_papers,[examples_hidden,after], [papers_html,citations_network,papers_summary])
     btn_summary.click(toggle_summary_visibility, outputs=summary_popup)
     btn_relevant_papers.click(toggle_relevant_visibility, outputs=relevant_popup)

 agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, reranker=reranker)
+async def chat(query, history, audience, sources, reports, relevant_content_sources):
     """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
     (messages in gradio format, messages in langchain format, source documents)"""
     if reports is None or len(reports) == 0:
         reports = []
+    inputs = {"user_input": query,"audience": audience_prompt,"sources_input":sources, "relevant_content_sources" : relevant_content_sources}
     result = agent.astream_events(inputs,version = "v1")
                 if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents" :# when documents are retrieved
                     docs, docs_html, history, used_documents, related_contents = handle_retrieved_documents(event, history, used_documents)
+                elif event["event"] == "on_chain_end" and node == "categorize_intent" and event["name"] == "_write": # when the query is transformed
+                    intent = event["data"]["output"]["intent"]
+                    if "language" in event["data"]["output"]:
+                        output_language = event["data"]["output"]["language"]
+                    else :
+                        output_language = "English"
+                    history[-1].content = f"Language identified : {output_language} \n Intent identified : {intent}"
                 elif event["name"] in steps_display.keys() and event["event"] == "on_chain_start": #display steps
                     event_description, display_output = steps_display[node]
                     if not hasattr(history[-1], 'metadata') or history[-1].metadata["title"] != event_description: # if a new step begins
 papers_cols_widths = list(papers_cols_widths.values())
async def find_papers(query,after, relevant_content_sources):
    """Stream OpenAlex paper cards, a citation-network iframe and an LLM summary.

    This is an async generator. It yields nothing at all unless "OpenAlex" is
    present in ``relevant_content_sources``. When it is, the first item yielded
    carries the paper cards and the network iframe with an empty summary; every
    following item repeats them with the summary extended by one streamed token.

    Args:
        query: Question text; used for keyword generation, for reranking and
            as the "question" input of the summary chain.
        after: Forwarded to ``oa.search`` as its ``after`` keyword argument.
        relevant_content_sources: Container of the selected external source
            names; only membership of "OpenAlex" is checked here.

    Yields:
        ``(docs_html, network_html, summary)`` tuples.
    """
    if "OpenAlex" in relevant_content_sources:
        summary = ""

        # Search OpenAlex, keep only works that have an abstract, best reranker score first.
        works = oa.search(generate_keywords(query),after = after)
        works = works.dropna(subset=["abstract"])
        works = oa.rerank(query,works,reranker)
        works = works.sort_values("rerank_score",ascending=False)

        # One HTML card for each of the first ten positions.
        docs_html = "".join(make_html_df(works, position) for position in range(10))
        print(docs_html)

        # Citation network rendered to HTML and wrapped in an iframe through `srcdoc`.
        graph = oa.make_network(works)
        height = "750px"
        network = oa.show_network(graph,color_by = "rerank_score",notebook=False,height = height)
        # srcdoc is delimited with single quotes below, so none may remain in the payload.
        network_html = network.generate_html().replace("'", "\"")
        network_html += "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
        network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
        display-capture; encrypted-media;" sandbox="allow-modals allow-forms
        allow-scripts allow-same-origin allow-popups
        allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
        allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""

        # Text of the ten best works, used as context for the summary chain.
        docs = works["content"].head(10).tolist()

        # NOTE(review): the reshaped frame below is not read again in this function.
        works = works.reset_index(drop = True).reset_index().rename(columns = {"index":"doc"})
        works["doc"] = works["doc"] + 1
        works = works[papers_cols]

        # First update: cards and network, summary still empty.
        yield docs_html, network_html, summary

        chain = make_rag_papers_chain(llm)
        result = chain.astream_log({"question": query,"docs": docs,"language":"English"})
        path_answer = "/logs/StrOutputParser/streamed_output/-"
        async for patch in result:
            first_op = patch.ops[0]
            # Only patches that append to the parser's streamed output carry answer tokens.
            if first_op['path'] == path_answer:
                summary += first_op['value']
                yield docs_html, network_html, summary
             value=["IPCC"],
             interactive=True,
         )
+        dropdown_external_sources = gr.CheckboxGroup(
+            ["IPCC figures","OpenAlex", "OurWorldInData"],
+            label="Select database to search for relevant content",
+            value=["IPCC figures"],
+            interactive=True,
+        )
         dropdown_reports = gr.Dropdown(
             POSSIBLE_REPORTS,
             label="Or select specific reports",
             value="Experts",
             interactive=True,
         )
+        output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False, visible= False)
+        output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False, visible= False)
     (textbox
         .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
+        .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
         # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_recommended_content, tab_papers] )
     )
     (examples_hidden
         .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
+        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
         # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_recommended_content, tab_papers] )
     )
     dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
+    textbox.submit(find_papers,[textbox,after, dropdown_external_sources], [papers_html,citations_network,papers_summary])
+    examples_hidden.change(find_papers,[examples_hidden,after,dropdown_external_sources], [papers_html,citations_network,papers_summary])
     btn_summary.click(toggle_summary_visibility, outputs=summary_popup)
     btn_relevant_papers.click(toggle_relevant_visibility, outputs=relevant_popup)

climateqa/engine/chains/graph_retriever.py CHANGED Viewed

@@ -3,7 +3,7 @@ import os
 from contextlib import contextmanager
 from ..reranker import rerank_docs
-from ..graph_retriever import GraphRetriever
 from ...utils import remove_duplicates_keep_highest_score
@@ -46,82 +46,83 @@ def suppress_output():
 def make_graph_retriever_node(vectorstore, reranker, rerank_by_question=True, k_final=15, k_before_reranking=100):
-        def retrieve_graphs(state):
-            print("---- Retrieving graphs ----")
-            POSSIBLE_SOURCES = ["IEA", "OWID"]
-            questions = state["remaining_questions"] if state["remaining_questions"] is not None and state["remaining_questions"]!=[]  else [state["query"]]
-            # sources_input = state["sources_input"]
-            sources_input = ["auto"]
-            auto_mode = "auto" in sources_input
-            # There are several options to get the final top k
-            # Option 1 - Get 100 documents by question and rerank by question
-            # Option 2 - Get 100/n documents by question and rerank the total
-            if rerank_by_question:
-                k_by_question = divide_into_parts(k_final,len(questions))
-            docs = []
-            for i,q in enumerate(questions):
-                question = q["question"] if isinstance(q, dict) else q
-                print(f"Subquestion {i}: {question}")
-                # If auto mode, we use all sources
-                if auto_mode:
-                    sources = POSSIBLE_SOURCES
-                # Otherwise, we use the config
                 else:
-                    sources = sources_input
-                if any([x in POSSIBLE_SOURCES for x in sources]):
-                    sources = [x for x in sources if x in POSSIBLE_SOURCES]
-                    # Search the document store using the retriever
-                    retriever = GraphRetriever(
-                        vectorstore = vectorstore,
-                        sources = sources,
-                        k_total = k_before_reranking,
-                        threshold = 0.5,
-                        )
-                    docs_question = retriever.get_relevant_documents(question)
-                    # Rerank
-                    if reranker is not None and docs_question!=[]:
-                        with suppress_output():
-                            docs_question = rerank_docs(reranker,docs_question,question)
-                    else:
-                        # Add a default reranking score
-                        for doc in docs_question:
-                            doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
-                    # If rerank by question we select the top documents for each question
-                    if rerank_by_question:
-                        docs_question = docs_question[:k_by_question[i]]
-                    # Add sources used in the metadata
                     for doc in docs_question:
-                        doc.metadata["sources_used"] = sources
-                    print(f"{len(docs_question)} graphs retrieved for subquestion {i + 1}: {docs_question}")
-                    docs.extend(docs_question)
-                else:
-                    print(f"There are no graphs which match the sources filtered on. Sources filtered on: {sources}. Sources available: {POSSIBLE_SOURCES}.")
-                # Remove duplicates and keep the duplicate document with the highest reranking score
-                docs = remove_duplicates_keep_highest_score(docs)
-                # Sorting the list in descending order by rerank_score
-                # Then select the top k
-                docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
-                docs = docs[:k_final]
-            return {"recommended_content": docs}
-        return retrieve_graphs

 from contextlib import contextmanager
 from ..reranker import rerank_docs
+from ..graph_retriever import retrieve_graphs # GraphRetriever
 from ...utils import remove_duplicates_keep_highest_score
 def make_graph_retriever_node(vectorstore, reranker, rerank_by_question=True, k_final=15, k_before_reranking=100):
     """Build the async graph node that collects recommended graphs for the pending questions.

     Args:
         vectorstore: Store handed to ``retrieve_graphs`` for every question.
         reranker: Model handed to ``rerank_docs``; when it is None (or nothing was
             retrieved) the similarity score is copied as the reranking score.
         rerank_by_question: When true, each question keeps only its share of
             ``k_final`` as computed by ``divide_into_parts``.
         k_final: Maximum number of documents kept in the result.
         k_before_reranking: Passed to ``retrieve_graphs`` as ``k_total``.

     Returns:
         A coroutine function ``node_retrieve_graphs(state)`` that returns
         ``{"recommended_content": docs}``.
     """
     async def node_retrieve_graphs(state):
         print("---- Retrieving graphs ----")
         POSSIBLE_SOURCES = ["IEA", "OWID"]

         # Use the pending sub-questions when there are any, otherwise the raw query.
         pending = state["remaining_questions"]
         if pending is not None and pending != []:
             questions = pending
         else:
             questions = [state["query"]]

         # The state's "sources_input" is not read here: "auto" is hard-coded.
         sources_input = ["auto"]
         auto_mode = "auto" in sources_input

         # Two ways to reach the final top k:
         #   1 - retrieve per question and cut each question to its own share
         #   2 - retrieve per question and cut only the merged list
         if rerank_by_question:
             k_by_question = divide_into_parts(k_final,len(questions))

         docs = []
         for idx, item in enumerate(questions):
             question = item["question"] if isinstance(item, dict) else item
             print(f"Subquestion {idx}: {question}")

             # Auto mode searches every known source; otherwise use the configured ones.
             sources = POSSIBLE_SOURCES if auto_mode else sources_input

             if any(name in POSSIBLE_SOURCES for name in sources):
                 sources = [name for name in sources if name in POSSIBLE_SOURCES]

                 found = await retrieve_graphs(
                     query = question,
                     vectorstore = vectorstore,
                     sources = sources,
                     k_total = k_before_reranking,
                     threshold = 0.5,
                     )

                 if reranker is None or found == []:
                     # No reranking possible: reuse the similarity score.
                     for doc in found:
                         doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
                 else:
                     with suppress_output():
                         found = rerank_docs(reranker,found,question)

                 # Per-question cut, when enabled.
                 if rerank_by_question:
                     found = found[:k_by_question[idx]]

                 # Record which sources were searched for these documents.
                 for doc in found:
                     doc.metadata["sources_used"] = sources

                 print(f"{len(found)} graphs retrieved for subquestion {idx + 1}: {found}")
                 docs.extend(found)
             else:
                 print(f"There are no graphs which match the sources filtered on. Sources filtered on: {sources}. Sources available: {POSSIBLE_SOURCES}.")

             # After every question: drop duplicates (keeping the best-scored copy),
             # order by reranking score descending and keep the top k_final.
             docs = remove_duplicates_keep_highest_score(docs)
             docs = sorted(docs, key=lambda d: d.metadata["reranking_score"], reverse=True)
             docs = docs[:k_final]

         return {"recommended_content": docs}

     return node_retrieve_graphs

climateqa/engine/chains/retrieve_documents.py CHANGED Viewed

@@ -8,10 +8,13 @@ from langchain_core.runnables import RunnableParallel, RunnablePassthrough
 from langchain_core.runnables import RunnableLambda
 from ..reranker import rerank_docs
-from ...knowledge.retriever import ClimateQARetriever
 from ...knowledge.openalex import OpenAlexRetriever
 from .keywords_extraction import make_keywords_extraction_chain
 from ..utils import log_event
@@ -76,10 +79,110 @@ def _get_k_summary_by_question(n_questions):
     else:
         return 1
 # The chain callback is not necessary, but it propagates the langchain callbacks to the astream_events logger to display intermediate results
 # @chain
-async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
     print("---- Retrieve documents ----")
     # Get the documents from the state
@@ -93,12 +196,15 @@ async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_qu
     else:
         related_content = []
     # Get the current question
     current_question = state["remaining_questions"][0]
     remaining_questions = state["remaining_questions"][1:]
     k_by_question = k_final // state["n_questions"]
     k_summary_by_question = _get_k_summary_by_question(state["n_questions"])
     sources = current_question["sources"]
     question = current_question["question"]
@@ -108,40 +214,19 @@ async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_qu
     await log_event({"question":question,"sources":sources,"index":index},"log_retriever",config)
-    if index == "Vector":
-        # Search the document store using the retriever
-        # Configure high top k for further reranking step
-        retriever = ClimateQARetriever(
             vectorstore=vectorstore,
             sources = sources,
             min_size = 200,
             k_summary = k_summary_by_question,
             k_total = k_before_reranking,
             threshold = 0.5,
         )
-        docs_question_dict = await retriever.ainvoke(question,config)
-    # elif index == "OpenAlex":
-    #     # keyword extraction
-    #     keywords_extraction = make_keywords_extraction_chain(llm)
-    #     keywords = keywords_extraction.invoke(question)["keywords"]
-    #     openalex_query = " AND ".join(keywords)
-    #     print(f"... OpenAlex query: {openalex_query}")
-    #     retriever_openalex = OpenAlexRetriever(
-    #         min_year = state.get("min_year",1960),
-    #         max_year = state.get("max_year",None),
-    #         k = k_before_reranking
-    #     )
-    #     docs_question = await retriever_openalex.ainvoke(openalex_query,config)
-    # else:
-    #     raise Exception(f"Index {index} not found in the routing index")
     # Rerank
     if reranker is not None:
@@ -161,7 +246,7 @@ async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_qu
     docs_question = docs_question_summary_reranked + docs_question_fulltext_reranked
     docs_question = docs_question[:k_by_question]
-    images_question = docs_question_images_reranked[:k_by_question]
     if reranker is not None and rerank_by_question:
         docs_question = sorted(docs_question, key=lambda x: x.metadata["reranking_score"], reverse=True)
@@ -173,7 +258,7 @@ async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_qu
     # Add to the list of docs
     docs.extend(docs_question)
     related_content.extend(images_question)
     new_state = {"documents":docs, "related_contents": related_content,"remaining_questions":remaining_questions}
     return new_state

 from langchain_core.runnables import RunnableLambda
 from ..reranker import rerank_docs
+# from ...knowledge.retriever import ClimateQARetriever
 from ...knowledge.openalex import OpenAlexRetriever
 from .keywords_extraction import make_keywords_extraction_chain
 from ..utils import log_event
+from langchain_core.vectorstores import VectorStore
+from typing import List
+from langchain_core.documents.base import Document
     else:
         return 1
+def _get_k_images_by_question(n_questions):
+    if n_questions == 0:
+        return 0
+    elif n_questions == 1:
+        return 5
+    elif n_questions == 2:
+        return 3
+    elif n_questions == 3:
+        return 2
+    else:
+        return 1
+def _add_metadata_and_score(docs: List) -> Document:
+    # Add score to metadata
+    docs_with_metadata = []
+    for i,(doc,score) in enumerate(docs):
+        doc.page_content = doc.page_content.replace("\r\n"," ")
+        doc.metadata["similarity_score"] = score
+        doc.metadata["content"] = doc.page_content
+        doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
+        # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
+        docs_with_metadata.append(doc)
+    return docs_with_metadata
async def get_IPCC_relevant_documents(
    query: str,
    vectorstore:VectorStore,
    sources: "list | None" = None,
    search_figures:bool = False,
    reports: "list | None" = None,
    threshold:float = 0.6,
    k_summary:int = 3,
    k_total:int = 10,
    k_images: int = 5,
    namespace:str = "vectors",
    min_size:int = 200,
) :
    """Retrieve summary chunks, full-report chunks and (optionally) figures for a query.

    Three filtered similarity searches are run against ``vectorstore``:
    text chunks whose ``report_type`` is "SPM", text chunks of every other
    report type, and, when ``search_figures`` is true, image chunks.

    Args:
        query: Text to search for.
        vectorstore: Store exposing ``similarity_search_with_score``.
        sources: Non-empty list drawn from "IPCC", "IPBES", "IPOS". Defaults to
            all three. Only used as a filter when ``reports`` is empty.
        search_figures: Whether to run the image search.
        reports: Report short names; when non-empty the search is restricted to
            them through ``short_name`` and ``sources`` is not used as a filter.
            Defaults to no restriction.
        threshold: Minimum score a summary hit must exceed. It is applied to the
            summary search only.
        k_summary: Number of summary hits requested.
        k_total: Total text budget; the full-report search requests ``k_total``
            minus the number of summaries kept after thresholding.
        k_images: Number of image hits requested.
        namespace: Not used by this function.
        min_size: Text documents whose content is not longer than this many
            characters are dropped. Images are not filtered by size.

    Returns:
        A dict with keys ``docs_summaries``, ``docs_full`` and ``docs_images``,
        each a list of documents processed by ``_add_metadata_and_score``.

    Raises:
        AssertionError: If ``sources`` is not a non-empty list of known sources,
            or if ``k_total`` is not greater than ``k_summary``.
    """
    # None stands in for the list defaults so that no list object is shared between calls.
    if sources is None:
        sources = ["IPCC","IPBES","IPOS"]
    if reports is None:
        reports = []

    # Every requested source must be one of IPCC, IPBES or IPOS.
    assert isinstance(sources,list)
    assert sources
    assert all(x in ["IPCC","IPBES","IPOS"] for x in sources)
    assert k_total > k_summary, "k_total should be greater than k_summary"

    # Base filter: explicit reports take precedence over sources.
    filters = {}
    if len(reports) > 0:
        filters["short_name"] = {"$in":reports}
    else:
        filters["source"] = { "$in": sources}

    # NOTE(review): the searches below are plain synchronous calls inside a
    # coroutine, so nothing else is awaited while they run.

    # Summaries (SPM): keep only hits scoring above the threshold.
    filters_summaries = {
        **filters,
        "chunk_type":"text",
        "report_type": { "$in":["SPM"]},
    }
    docs_summaries = vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = k_summary)
    docs_summaries = [x for x in docs_summaries if x[1] > threshold]

    # Full reports: fill whatever part of the budget the summaries left.
    filters_full = {
        **filters,
        "chunk_type":"text",
        "report_type": { "$nin":["SPM"]},
    }
    k_full = k_total - len(docs_summaries)
    docs_full = vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)

    # Figures are optional; the list stays empty when they are not requested.
    docs_images = []
    if search_figures:
        filters_image = {
            **filters,
            "chunk_type":"image"
        }
        docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)

    docs_summaries, docs_full, docs_images = _add_metadata_and_score(docs_summaries), _add_metadata_and_score(docs_full), _add_metadata_and_score(docs_images)

    # Drop text chunks that are too short.
    docs_summaries = [x for x in docs_summaries if len(x.page_content) > min_size]
    docs_full = [x for x in docs_full if len(x.page_content) > min_size]

    return {
        "docs_summaries" : docs_summaries,
        "docs_full" : docs_full,
        "docs_images" : docs_images,
    }
 # The chain callback is not necessary, but it propagates the langchain callbacks to the astream_events logger to display intermediate results
 # @chain
+async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5, k_images=5):
     print("---- Retrieve documents ----")
     # Get the documents from the state
     else:
         related_content = []
+    search_figures = "IPCC figures" in state["relevant_content_sources"]
     # Get the current question
     current_question = state["remaining_questions"][0]
     remaining_questions = state["remaining_questions"][1:]
     k_by_question = k_final // state["n_questions"]
     k_summary_by_question = _get_k_summary_by_question(state["n_questions"])
+    k_images_by_question = _get_k_images_by_question(state["n_questions"])
     sources = current_question["sources"]
     question = current_question["question"]
     await log_event({"question":question,"sources":sources,"index":index},"log_retriever",config)
+    if index == "Vector": # always true for now
+        docs_question_dict = await get_IPCC_relevant_documents(
+            query  = question,
             vectorstore=vectorstore,
+            search_figures = search_figures,
             sources = sources,
             min_size = 200,
             k_summary = k_summary_by_question,
             k_total = k_before_reranking,
+            k_images = k_images_by_question,
             threshold = 0.5,
         )
     # Rerank
     if reranker is not None:
     docs_question = docs_question_summary_reranked + docs_question_fulltext_reranked
     docs_question = docs_question[:k_by_question]
+    images_question = docs_question_images_reranked[:k_images]
     if reranker is not None and rerank_by_question:
         docs_question = sorted(docs_question, key=lambda x: x.metadata["reranking_score"], reverse=True)
     # Add to the list of docs
     docs.extend(docs_question)
     related_content.extend(images_question)
+    # related_content=[]
     new_state = {"documents":docs, "related_contents": related_content,"remaining_questions":remaining_questions}
     return new_state

climateqa/engine/graph.py CHANGED Viewed

@@ -36,6 +36,7 @@ class GraphState(TypedDict):
     answer: str
     audience: str = "experts"
     sources_input: List[str] = ["IPCC","IPBES"]
     sources_auto: bool = True
     min_year: int = 1960
     max_year: int = None
@@ -153,20 +154,28 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, reranker, thresh
         lambda x : route_based_on_relevant_docs(x,threshold_docs=threshold_docs),
         make_id_dict(["answer_rag","answer_rag_no_docs"])
     )
     # Define the edges
     # workflow.add_edge("set_defaults", "categorize_intent")
     workflow.add_edge("translate_query", "transform_query")
-    workflow.add_edge("transform_query", "retrieve_graphs")
     # workflow.add_edge("retrieve_graphs", "answer_rag_graph")
-    workflow.add_edge("retrieve_graphs", "retrieve_documents")
     # workflow.add_edge("answer_rag_graph", "retrieve_documents")
     workflow.add_edge("answer_rag", END)
     workflow.add_edge("answer_rag_no_docs", END)
     workflow.add_edge("answer_chitchat", "chitchat_categorize_intent")
     # workflow.add_edge("answer_chitchat", END)
     # workflow.add_edge("answer_ai_impact", END)
-    workflow.add_edge("retrieve_graphs_chitchat", END)
     # workflow.add_edge("answer_ai_impact", "translate_query_ai")
     # workflow.add_edge("translate_query_ai", "transform_query_ai")
     # workflow.add_edge("transform_query_ai", "retrieve_graphs_ai")

     answer: str
     audience: str = "experts"
     sources_input: List[str] = ["IPCC","IPBES"]
+    relevant_content_sources: List[str] = ["IPCC figures"]
     sources_auto: bool = True
     min_year: int = 1960
     max_year: int = None
         lambda x : route_based_on_relevant_docs(x,threshold_docs=threshold_docs),
         make_id_dict(["answer_rag","answer_rag_no_docs"])
     )
+    workflow.add_conditional_edges(
+        "transform_query",
+        lambda state : "retrieve_graphs" if "OurWorldInData" in state["relevant_content_sources"]  else END,
+        make_id_dict(["retrieve_graphs", END])
+    )
     # Define the edges
     # workflow.add_edge("set_defaults", "categorize_intent")
     workflow.add_edge("translate_query", "transform_query")
+    # workflow.add_edge("transform_query", "retrieve_graphs")
+    workflow.add_edge("transform_query", "retrieve_documents")
     # workflow.add_edge("retrieve_graphs", "answer_rag_graph")
+    workflow.add_edge("retrieve_graphs", END)
+    # workflow.add_edge("retrieve_graphs", "retrieve_documents")
     # workflow.add_edge("answer_rag_graph", "retrieve_documents")
     workflow.add_edge("answer_rag", END)
     workflow.add_edge("answer_rag_no_docs", END)
     workflow.add_edge("answer_chitchat", "chitchat_categorize_intent")
     # workflow.add_edge("answer_chitchat", END)
     # workflow.add_edge("answer_ai_impact", END)
+    # workflow.add_edge("retrieve_graphs_chitchat", END)
     # workflow.add_edge("answer_ai_impact", "translate_query_ai")
     # workflow.add_edge("translate_query_ai", "transform_query_ai")
     # workflow.add_edge("transform_query_ai", "retrieve_graphs_ai")

climateqa/engine/graph_retriever.py CHANGED Viewed

@@ -5,30 +5,70 @@ from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
 from typing import List
-class GraphRetriever(BaseRetriever):
-    vectorstore:VectorStore
-    sources:list = ["OWID"] # plus tard ajouter OurWorldInData # faudra integrate avec l'autre retriever
-    threshold:float = 0.5
-    k_total:int = 10
-    def _get_relevant_documents(
-        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
-    ) -> List[Document]:
         # Check that sources is a non-empty list containing at least one "OWID" entry
-        assert isinstance(self.sources,list)
-        assert self.sources
-        assert any([x in ["OWID"] for x in self.sources])
         # Prepare base search kwargs
         filters = {}
-        filters["source"] = {"$in": self.sources}
-        docs = self.vectorstore.similarity_search_with_score(query=query, filter=filters, k=self.k_total)
         # Filter if scores are below threshold
-        docs = [x for x in docs if x[1] > self.threshold]
         # Remove duplicate documents
         unique_docs = []

 from typing import List
+# class GraphRetriever(BaseRetriever):
+#     vectorstore:VectorStore
+#     sources:list = ["OWID"] # plus tard ajouter OurWorldInData # faudra integrate avec l'autre retriever
+#     threshold:float = 0.5
+#     k_total:int = 10
+#     def _get_relevant_documents(
+#         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+#     ) -> List[Document]:
+#         # Check if all elements in the list are IEA or OWID
+#         assert isinstance(self.sources,list)
+#         assert self.sources
+#         assert any([x in ["OWID"] for x in self.sources])
+#         # Prepare base search kwargs
+#         filters = {}
+#         filters["source"] = {"$in": self.sources}
+#         docs = self.vectorstore.similarity_search_with_score(query=query, filter=filters, k=self.k_total)
+#         # Filter if scores are below threshold
+#         docs = [x for x in docs if x[1] > self.threshold]
+#         # Remove duplicate documents
+#         unique_docs = []
+#         seen_docs = []
+#         for i, doc in enumerate(docs):
+#             if doc[0].page_content not in seen_docs:
+#                 unique_docs.append(doc)
+#                 seen_docs.append(doc[0].page_content)
+#         # Add score to metadata
+#         results = []
+#         for i,(doc,score) in enumerate(unique_docs):
+#             doc.metadata["similarity_score"] = score
+#             doc.metadata["content"] = doc.page_content
+#             results.append(doc)
+#         return results
+async def retrieve_graphs(
+    query: str,
+    vectorstore:VectorStore,
+    sources:list = ["OWID"], # plus tard ajouter OurWorldInData # faudra integrate avec l'autre retriever
+    threshold:float = 0.5,
+    k_total:int = 10,
+)-> List[Document]:
         # Check that sources is a non-empty list containing at least one "OWID" entry
+        assert isinstance(sources,list)
+        assert sources
+        assert any([x in ["OWID"] for x in sources])
         # Prepare base search kwargs
         filters = {}
+        filters["source"] = {"$in": sources}
+        docs = vectorstore.similarity_search_with_score(query=query, filter=filters, k=k_total)
         # Filter if scores are below threshold
+        docs = [x for x in docs if x[1] > threshold]
         # Remove duplicate documents
         unique_docs = []

climateqa/engine/reranker.py CHANGED Viewed

@@ -30,6 +30,8 @@ def get_reranker(model = "nano", cohere_api_key = None):
 def rerank_docs(reranker,docs,query):
     # Get a list of texts from langchain docs
     input_docs = [x.page_content for x in docs]

 def rerank_docs(reranker,docs,query):
+    if docs == []:
+        return []
     # Get a list of texts from langchain docs
     input_docs = [x.page_content for x in docs]

climateqa/knowledge/retriever.py CHANGED Viewed

@@ -1,101 +1,102 @@
-# https://github.com/langchain-ai/langchain/issues/8623
-import pandas as pd
-from langchain_core.retrievers import BaseRetriever
-from langchain_core.vectorstores import VectorStoreRetriever
-from langchain_core.documents.base import Document
-from langchain_core.vectorstores import VectorStore
-from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
-from typing import List
-from pydantic import Field
-def _add_metadata_and_score(docs: List) -> Document:
-    # Add score to metadata
-    docs_with_metadata = []
-    for i,(doc,score) in enumerate(docs):
-        doc.page_content = doc.page_content.replace("\r\n"," ")
-        doc.metadata["similarity_score"] = score
-        doc.metadata["content"] = doc.page_content
-        doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
-        # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
-        docs_with_metadata.append(doc)
-    return docs_with_metadata
-class ClimateQARetriever(BaseRetriever):
-    vectorstore:VectorStore
-    sources:list = ["IPCC","IPBES","IPOS"]
-    reports:list = []
-    threshold:float = 0.6
-    k_summary:int = 3
-    k_total:int = 10
-    namespace:str = "vectors",
-    min_size:int = 200,
-    def _get_relevant_documents(
-        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
-    ) -> List[Document]:
-        # Check if all elements in the list are either IPCC or IPBES
-        assert isinstance(self.sources,list)
-        assert self.sources
-        assert all([x in ["IPCC","IPBES","IPOS"] for x in self.sources])
-        assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
-        # Prepare base search kwargs
-        filters = {}
-        if len(self.reports) > 0:
-            filters["short_name"] = {"$in":self.reports}
-        else:
-            filters["source"] = { "$in":self.sources}
-        # Search for k_summary documents in the summaries dataset
-        filters_summaries = {
-            **filters,
-            "chunk_type":"text",
-            "report_type": { "$in":["SPM"]},
-        }
-        docs_summaries = self.vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = self.k_summary)
-        docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
-        # Search for k_total - k_summary documents in the full reports dataset
-        filters_full = {
-            **filters,
-            "chunk_type":"text",
-            "report_type": { "$nin":["SPM"]},
-        }
-        k_full = self.k_total - len(docs_summaries)
-        docs_full = self.vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
-        # Images
-        filters_image = {
-            **filters,
-            "chunk_type":"image"
-        }
-        docs_images = self.vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_full)
-        # Concatenate documents
-        docs = docs_summaries + docs_full + docs_images
-        # Filter if scores are below threshold
-        docs = [x for x in docs if len(x[0].page_content) > self.min_size]
-        # docs = [x for x in docs if x[1] > self.threshold]
-        docs_summaries, docs_full, docs_images = _add_metadata_and_score(docs_summaries), _add_metadata_and_score(docs_full), _add_metadata_and_score(docs_images)
-        # Filter if length are below threshold
-        docs_summaries = [x for x in docs_summaries if len(x.page_content) > self.min_size]
-        docs_full = [x for x in docs_full if len(x.page_content) > self.min_size]
-        return {
-            "docs_summaries" : docs_summaries,
-            "docs_full" : docs_full,
-            "docs_images" : docs_images
-        }

+# # https://github.com/langchain-ai/langchain/issues/8623
+# import pandas as pd
+# from langchain_core.retrievers import BaseRetriever
+# from langchain_core.vectorstores import VectorStoreRetriever
+# from langchain_core.documents.base import Document
+# from langchain_core.vectorstores import VectorStore
+# from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
+# from typing import List
+# from pydantic import Field
+# def _add_metadata_and_score(docs: List) -> Document:
+#     # Add score to metadata
+#     docs_with_metadata = []
+#     for i,(doc,score) in enumerate(docs):
+#         doc.page_content = doc.page_content.replace("\r\n"," ")
+#         doc.metadata["similarity_score"] = score
+#         doc.metadata["content"] = doc.page_content
+#         doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
+#         # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
+#         docs_with_metadata.append(doc)
+#     return docs_with_metadata
+# class ClimateQARetriever(BaseRetriever):
+#     vectorstore:VectorStore
+#     sources:list = ["IPCC","IPBES","IPOS"]
+#     reports:list = []
+#     threshold:float = 0.6
+#     k_summary:int = 3
+#     k_total:int = 10
+#     namespace:str = "vectors",
+#     min_size:int = 200,
+#     def _get_relevant_documents(
+#         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+#     ) -> List[Document]:
+#         # Check if all elements in the list are either IPCC or IPBES
+#         assert isinstance(self.sources,list)
+#         assert self.sources
+#         assert all([x in ["IPCC","IPBES","IPOS"] for x in self.sources])
+#         assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
+#         # Prepare base search kwargs
+#         filters = {}
+#         if len(self.reports) > 0:
+#             filters["short_name"] = {"$in":self.reports}
+#         else:
+#             filters["source"] = { "$in":self.sources}
+#         # Search for k_summary documents in the summaries dataset
+#         filters_summaries = {
+#             **filters,
+#             "chunk_type":"text",
+#             "report_type": { "$in":["SPM"]},
+#         }
+#         docs_summaries = self.vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = self.k_summary)
+#         docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
+#         # docs_summaries = []
+#         # Search for k_total - k_summary documents in the full reports dataset
+#         filters_full = {
+#             **filters,
+#             "chunk_type":"text",
+#             "report_type": { "$nin":["SPM"]},
+#         }
+#         k_full = self.k_total - len(docs_summaries)
+#         docs_full = self.vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
+#         # Images
+#         filters_image = {
+#             **filters,
+#             "chunk_type":"image"
+#         }
+#         docs_images = self.vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_full)
+#         # docs_images = []
+#         # Concatenate documents
+#         # docs = docs_summaries + docs_full + docs_images
+#         # Filter if scores are below threshold
+#         # docs = [x for x in docs if x[1] > self.threshold]
+#         docs_summaries, docs_full, docs_images = _add_metadata_and_score(docs_summaries), _add_metadata_and_score(docs_full), _add_metadata_and_score(docs_images)
+#         # Filter if length are below threshold
+#         docs_summaries = [x for x in docs_summaries if len(x.page_content) > self.min_size]
+#         docs_full = [x for x in docs_full if len(x.page_content) > self.min_size]
+#         return {
+#             "docs_summaries" : docs_summaries,
+#             "docs_full" : docs_full,
+#             "docs_images" : docs_images,
+#         }

sandbox/20241104 - CQA - StepByStep CQA.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff