timeki committed on
Commit
6b43c86
1 Parent(s): fd67e15

add dora graph recommendation

app.py CHANGED
@@ -25,7 +25,8 @@ from azure.storage.fileshare import ShareServiceClient
 
 from utils import create_user_id
 
-
+from langchain_chroma import Chroma
+from collections import defaultdict
 
 # ClimateQ&A imports
 from climateqa.engine.llm import get_llm
@@ -35,13 +36,14 @@ from climateqa.engine.reranker import get_reranker
 from climateqa.engine.embeddings import get_embeddings_function
 from climateqa.engine.chains.prompts import audience_prompts
 from climateqa.sample_questions import QUESTIONS
-from climateqa.constants import POSSIBLE_REPORTS
+from climateqa.constants import POSSIBLE_REPORTS, OWID_CATEGORIES
 from climateqa.utils import get_image_from_azure_blob_storage
 from climateqa.engine.keywords import make_keywords_chain
 # from climateqa.engine.chains.answer_rag import make_rag_papers_chain
 from climateqa.engine.graph import make_graph_agent,display_graph
+from climateqa.engine.embeddings import get_embeddings_function
 
-from front.utils import make_html_source,parse_output_llm_with_sources,serialize_docs,make_toolbox
+from front.utils import make_html_source,parse_output_llm_with_sources,serialize_docs,make_toolbox,generate_html_graphs
 
 # Load environment variables in local mode
 try:
@@ -50,6 +52,7 @@ try:
 except Exception as e:
     pass
 
+
 # Set up Gradio Theme
 theme = gr.themes.Base(
     primary_hue="blue",
@@ -83,17 +86,18 @@ share_client = service.get_share_client(file_share_name)
 user_id = create_user_id()
 
 
+embeddings_function = get_embeddings_function()
+llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
+reranker = get_reranker("nano")
 
 # Create vectorstore and retriever
 vectorstore = get_pinecone_vectorstore(embeddings_function)
-llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
-reranker = get_reranker("large")
-agent = make_graph_agent(llm,vectorstore,reranker)
-
-
+vectorstore_graphs = Chroma(persist_directory="/home/tim/ai4s/climate_qa/dora/climate-question-answering-graphs/climate-question-answering-graphs/vectorstore_owid", embedding_function=embeddings_function)
 
+# agent = make_graph_agent(llm,vectorstore,reranker)
+agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, reranker=reranker)
 
-async def chat(query,history,audience,sources,reports):
+async def chat(query,history,audience,sources,reports,current_graphs):
     """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
     (messages in gradio format, messages in langchain format, source documents)"""
@@ -110,13 +114,14 @@ async def chat(query,history,audience,sources,reports):
         audience_prompt = audience_prompts["experts"]
 
     # Prepare default values
-    if len(sources) == 0:
-        sources = ["IPCC"]
+    if sources is None or len(sources) == 0:
+        sources = ["IPCC", "IPBES", "IPOS"]
 
-    if len(reports) == 0:
+    if reports is None or len(reports) == 0:
         reports = []
 
     inputs = {"user_input": query,"audience": audience_prompt,"sources":sources}
+    print(f"\n\nInputs:\n {inputs}\n\n")
     result = agent.astream_events(inputs,version = "v1") #{"callbacks":[MyCustomAsyncHandler()]})
     # result = rag_chain.stream(inputs)
@@ -126,11 +131,14 @@ async def chat(query,history,audience,sources,reports):
     # path_answer = "/logs/answer/streamed_output_str/-"
 
     docs = []
+    docs_used = True
     docs_html = ""
+    current_graphs = []
     output_query = ""
     output_language = ""
     output_keywords = ""
     gallery = []
+    updates = []
     start_streaming = False
 
     steps_display = {
@@ -142,7 +150,7 @@ async def chat(query,history,audience,sources,reports):
     try:
         async for event in result:
 
-            if event["event"] == "on_chat_model_stream":
+            if event["event"] == "on_chat_model_stream" and event["metadata"]["langgraph_node"] in ["answer_rag", "answer_rag_no_docs", "answer_chitchat", "answer_ai_impact"]:
                 if start_streaming == False:
                     start_streaming = True
                     history[-1] = (query,"")
@@ -155,14 +163,17 @@ async def chat(query,history,audience,sources,reports):
                 answer_yet = parse_output_llm_with_sources(answer_yet)
                 history[-1] = (query,answer_yet)
 
+            if docs_used is True and event["metadata"]["langgraph_node"] in ["answer_rag_no_docs", "answer_chitchat", "answer_ai_impact"]:
+                docs_used = False
 
-            elif event["name"] == "retrieve_documents" and event["event"] == "on_chain_end":
+            elif docs_used is True and event["name"] == "retrieve_documents" and event["event"] == "on_chain_end":
                 try:
                     docs = event["data"]["output"]["documents"]
                     docs_html = []
                     for i, d in enumerate(docs, 1):
                         docs_html.append(make_html_source(d, i))
                     docs_html = "".join(docs_html)
+
                 except Exception as e:
                     print(f"Error getting documents: {e}")
                     print(event)
@@ -174,6 +185,55 @@ async def chat(query,history,audience,sources,reports):
            #     answer_yet = "🔄️ Searching in the knowledge base\n{questions}"
            #     history[-1] = (query,answer_yet)
 
+            elif event["name"] in ["retrieve_graphs", "retrieve_graphs_ai"] and event["event"] == "on_chain_end":
+                try:
+                    recommended_content = event["data"]["output"]["recommended_content"]
+                    # graphs = [
+                    #     {
+                    #         "embedding": x.metadata["returned_content"],
+                    #         "metadata": {
+                    #             "source": x.metadata["source"],
+                    #             "category": x.metadata["category"]
+                    #         }
+                    #     } for x in recommended_content if x.metadata["source"] == "OWID"
+                    # ]
+
+                    unique_graphs = []
+                    seen_embeddings = set()
+
+                    for x in recommended_content:
+                        embedding = x.metadata["returned_content"]
+
+                        # Check if the embedding has already been seen
+                        if embedding not in seen_embeddings:
+                            unique_graphs.append({
+                                "embedding": embedding,
+                                "metadata": {
+                                    "source": x.metadata["source"],
+                                    "category": x.metadata["category"]
+                                }
+                            })
+                            # Add the embedding to the seen set
+                            seen_embeddings.add(embedding)
+
+
+                    categories = {}
+                    for graph in unique_graphs:
+                        category = graph['metadata']['category']
+                        if category not in categories:
+                            categories[category] = []
+                        categories[category].append(graph['embedding'])
+
+                    # graphs_html = ""
+                    for category, embeddings in categories.items():
+                        # graphs_html += f"<h3>{category}</h3>"
+                        # current_graphs.append(f"<h3>{category}</h3>")
+                        for embedding in embeddings:
+                            current_graphs.append([embedding, category])
+                            # graphs_html += f"<div>{embedding}</div>"
+
+                except Exception as e:
+                    print(f"Error getting graphs: {e}")
 
            for event_name,(event_description,display_output) in steps_display.items():
                if event["name"] == event_name:
@@ -181,6 +241,7 @@ async def chat(query,history,audience,sources,reports):
                    # answer_yet = f"<p><span class='loader'></span>{event_description}</p>"
                    # answer_yet = make_toolbox(event_description, "", checked = False)
                    answer_yet = event_description
+
                    history[-1] = (query,answer_yet)
            # elif event["event"] == "on_chain_end":
            #     answer_yet = ""
@@ -205,7 +266,8 @@ async def chat(query,history,audience,sources,reports):
 
 
        history = [tuple(x) for x in history]
-        yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
+        yield history,docs_html,output_query,output_language,gallery,current_graphs #,output_query,output_keywords
+
 
    except Exception as e:
        raise gr.Error(f"{e}")
@@ -268,12 +330,14 @@ async def chat(query,history,audience,sources,reports):
        history[-1] = (history[-1][0],answer_yet)
        history = [tuple(x) for x in history]
 
+        print(f"\n\nImages:\n{gallery}")
+
        # gallery = [x.metadata["image_path"] for x in docs if (len(x.metadata["image_path"]) > 0 and "IAS" in x.metadata["image_path"])]
        # if len(gallery) > 0:
        #     gallery = list(set("|".join(gallery).split("|")))
        #     gallery = [get_image_from_azure_blob_storage(x) for x in gallery]
 
-    yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
+    yield history,docs_html,output_query,output_language,gallery,current_graphs #,output_query,output_keywords
 
 
 
@@ -405,16 +469,27 @@ def vote(data: gr.LikeData):
    else:
        print(data)
 
+def save_graph(saved_graphs_state, embedding, category):
+    print(f"\nCategory:\n{saved_graphs_state}\n")
+    if category not in saved_graphs_state:
+        saved_graphs_state[category] = []
+    if embedding not in saved_graphs_state[category]:
+        saved_graphs_state[category].append(embedding)
+    return saved_graphs_state, gr.Button("Graph Saved")
 
 
 with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main-component") as demo:
-    # user_id_state = gr.State([user_id])
+    user_id_state = gr.State([user_id])
+
+    chat_completed_state = gr.State(0)
+    current_graphs = gr.State([])
+    saved_graphs = gr.State({})
 
    with gr.Tab("ClimateQ&A"):
 
        with gr.Row(elem_id="chatbot-row"):
            with gr.Column(scale=2):
-                # state = gr.State([system_template])
+                state = gr.State([system_template])
                chatbot = gr.Chatbot(
                    value=[(None,init_prompt)],
                    show_copy_button=True,show_label = False,elem_id="chatbot",layout = "panel",
@@ -468,13 +543,13 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
            # with Modal(visible = False) as config_modal:
            with gr.Tab("Configuration",elem_id = "tab-config",id = 2):
 
-                gr.Markdown("Reminder: You can talk in any language, ClimateQ&A is multi-lingual!")
+                gr.Markdown("Reminders: You can talk in any language, ClimateQ&A is multi-lingual!")
 
 
                dropdown_sources = gr.CheckboxGroup(
                    ["IPCC", "IPBES","IPOS"],
                    label="Select source",
-                    value=["IPCC"],
+                    value=["IPCC", "IPBES","IPOS"],
                    interactive=True,
                )
 
@@ -495,13 +570,84 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
 
                output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False)
                output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False)
-
-
+
+
+            # with gr.Tab("Recommended content", elem_id="tab-recommended_content", id=3) as recommended_content_tab:
+
+            #     @gr.render(inputs=[current_graphs])
+            #     def display_default_recommended(current_graphs):
+            #         if len(current_graphs)==0:
+            #             placeholder_message = gr.HTML("<h2>There are no graphs to be displayed at the moment. Try asking another question.</h2>")
+
+            #     @gr.render(inputs=[current_graphs],triggers=[chat_completed_state.change])
+            #     def render_graphs(current_graph_list):
+            #         global saved_graphs
+            #         with gr.Column():
+            #             print(f"\ncurrent_graph_list:\n{current_graph_list}")
+            #             for (embedding, category) in current_graph_list:
+            #                 graphs_placeholder = gr.HTML(embedding, elem_id="graphs-placeholder")
+            #                 save_btn = gr.Button("Save Graph")
+            #                 save_btn.click(
+            #                     save_graph,
+            #                     [saved_graphs, gr.State(embedding), gr.State(category)],
+            #                     [saved_graphs, save_btn]
+            #                 )
 
    #---------------------------------------------------------------------------------------
    # OTHER TABS
    #---------------------------------------------------------------------------------------
 
+    # with gr.Tab("Recommended content", elem_id="tab-recommended_content2") as recommended_content_tab2:
+
+    #     @gr.render(inputs=[current_graphs])
+    #     def display_default_recommended_head(current_graphs_list):
+    #         if len(current_graphs_list)==0:
+    #             gr.HTML("<h2>There are no graphs to be displayed at the moment. Try asking another question.</h2>")
+
+    #     @gr.render(inputs=[current_graphs],triggers=[chat_completed_state.change])
+    #     def render_graphs_head(current_graph_list):
+    #         global saved_graphs
+
+    #         category_dict = defaultdict(list)
+    #         for (embedding, category) in current_graph_list:
+    #             category_dict[category].append(embedding)
+
+    #         for category in category_dict:
+    #             with gr.Tab(category):
+    #                 splits = [category_dict[category][i:i+3] for i in range(0, len(category_dict[category]), 3)]
+    #                 for row in splits:
+    #                     with gr.Row():
+    #                         for embedding in row:
+    #                             with gr.Column():
+    #                                 gr.HTML(embedding, elem_id="graphs-placeholder")
+    #                                 save_btn = gr.Button("Save Graph")
+    #                                 save_btn.click(
+    #                                     save_graph,
+    #                                     [saved_graphs, gr.State(embedding), gr.State(category)],
+    #                                     [saved_graphs, save_btn]
+    #                                 )
+
+
+
+    # with gr.Tab("Saved Graphs", elem_id="tab-saved-graphs") as saved_graphs_tab:
+
+    #     @gr.render(inputs=[saved_graphs])
+    #     def display_default_save(saved):
+    #         if len(saved)==0:
+    #             gr.HTML("<h2>You have not saved any graphs yet</h2>")
+
+    #     @gr.render(inputs=[saved_graphs], triggers=[saved_graphs.change])
+    #     def view_saved_graphs(graphs_list):
+    #         categories = [category for category in graphs_list] # graphs_list.keys()
+    #         for category in categories:
+    #             with gr.Tab(category):
+    #                 splits = [graphs_list[category][i:i+3] for i in range(0, len(graphs_list[category]), 3)]
+    #                 for row in splits:
+    #                     with gr.Row():
+    #                         for graph in row:
+    #                             gr.HTML(graph, elem_id="graphs-placeholder")
+
+
 
    with gr.Tab("Figures",elem_id = "tab-images",elem_classes = "max-height other-tabs"):
        gallery_component = gr.Gallery()
@@ -526,7 +672,11 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
    # with gr.Tab("Citations network",elem_id="papers-network-tab"):
    #     citations_network = gr.HTML(visible=True,elem_id="papers-citations-network")
 
-
+    # with gr.Tab("Saved Graphs", elem_id="tab-saved-graphs", id=4) as saved_graphs_tab:
+    #     @gr.render(inputs=[saved_graphs], triggers=[saved_graphs.change])
+    #     def view_saved_graphs(graphs_list):
+    #         for graph in graphs_list:
+    #             gr.HTML(graph, elem_id="graphs-placeholder")
 
    with gr.Tab("About",elem_classes = "max-height other-tabs"):
        with gr.Row():
@@ -540,18 +690,25 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
        return (gr.update(interactive = False),gr.update(selected=1),history)
 
    def finish_chat():
-        return (gr.update(interactive = True,value = ""))
+        return (gr.update(interactive = True,value = ""),gr.update(selected=3))
 
+    def change_completion_status(current_state):
+        current_state = 1 - current_state
+        return current_state
+
+
    (textbox
        .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
-        .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_textbox")
-        .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
+        .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, current_graphs], [chatbot,sources_textbox,output_query,output_language,gallery_component, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
+        .then(finish_chat, None, [textbox,tabs],api_name = "finish_chat_textbox")
+        .then(change_completion_status, [chat_completed_state], [chat_completed_state])
    )
 
    (examples_hidden
        .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
-        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_examples")
-        .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
+        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports,current_graphs], [chatbot,sources_textbox,output_query,output_language,gallery_component, current_graphs],concurrency_limit = 8,api_name = "chat_examples")
+        .then(finish_chat, None, [textbox,tabs],api_name = "finish_chat_examples")
+        .then(change_completion_status, [chat_completed_state], [chat_completed_state])
    )
 
 
@@ -570,4 +727,4 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
 
 demo.queue()
 
-demo.launch()
+demo.launch(debug=True)
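The deduplication and grouping that `chat` now applies to recommended graphs is self-contained and easy to check outside Gradio. A minimal sketch with made-up documents (the `FakeDoc` class and the iframe strings are stand-ins, not part of this commit), assuming each recommended document carries `returned_content`, `source`, and `category` metadata as above:

```python
from collections import defaultdict

class FakeDoc:
    """Stand-in for a LangChain Document; only .metadata is used here."""
    def __init__(self, returned_content, source, category):
        self.metadata = {"returned_content": returned_content,
                         "source": source, "category": category}

recommended_content = [
    FakeDoc("<iframe src='owid/co2'></iframe>", "OWID", "CO2 & Greenhouse Gas Emissions"),
    FakeDoc("<iframe src='owid/co2'></iframe>", "OWID", "CO2 & Greenhouse Gas Emissions"),  # duplicate
    FakeDoc("<iframe src='owid/energy-mix'></iframe>", "OWID", "Energy"),
]

# Same logic as the diff: dedupe on the embedded HTML, then group by category.
seen_embeddings = set()
categories = defaultdict(list)
for x in recommended_content:
    embedding = x.metadata["returned_content"]
    if embedding not in seen_embeddings:
        seen_embeddings.add(embedding)
        categories[x.metadata["category"]].append(embedding)

current_graphs = [[e, c] for c, embeddings in categories.items() for e in embeddings]
print(current_graphs)  # two unique [embedding, category] pairs
```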
climateqa/constants.py CHANGED
@@ -42,4 +42,25 @@ POSSIBLE_REPORTS = [
    "IPBES IAS A C5",
    "IPBES IAS A C6",
    "IPBES IAS A SPM"
-]
+]
+
+OWID_CATEGORIES = ['Access to Energy', 'Agricultural Production',
+       'Agricultural Regulation & Policy', 'Air Pollution',
+       'Animal Welfare', 'Antibiotics', 'Biodiversity', 'Biofuels',
+       'Biological & Chemical Weapons', 'CO2 & Greenhouse Gas Emissions',
+       'COVID-19', 'Clean Water', 'Clean Water & Sanitation',
+       'Climate Change', 'Crop Yields', 'Diet Compositions',
+       'Electricity', 'Electricity Mix', 'Energy', 'Energy Efficiency',
+       'Energy Prices', 'Environmental Impacts of Food Production',
+       'Environmental Protection & Regulation', 'Famines', 'Farm Size',
+       'Fertilizers', 'Fish & Overfishing', 'Food Supply', 'Food Trade',
+       'Food Waste', 'Food and Agriculture', 'Forests & Deforestation',
+       'Fossil Fuels', 'Future Population Growth',
+       'Hunger & Undernourishment', 'Indoor Air Pollution', 'Land Use',
+       'Land Use & Yields in Agriculture', 'Lead Pollution',
+       'Meat & Dairy Production', 'Metals & Minerals',
+       'Natural Disasters', 'Nuclear Energy', 'Nuclear Weapons',
+       'Oil Spills', 'Outdoor Air Pollution', 'Ozone Layer', 'Pandemics',
+       'Pesticides', 'Plastic Pollution', 'Renewable Energy', 'Soil',
+       'Transport', 'Urbanization', 'Waste Management', 'Water Pollution',
+       'Water Use & Stress', 'Wildfires']
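Since graph metadata categories feed tab labels in the UI, OWID_CATEGORIES can double as a whitelist. A small illustrative sketch (the `safe_category` helper is hypothetical, not part of this commit):

```python
from climateqa.constants import OWID_CATEGORIES

def safe_category(category, fallback="Climate Change"):
    # Fall back to a known category when metadata carries an unexpected value.
    return category if category in OWID_CATEGORIES else fallback

assert safe_category("Wildfires") == "Wildfires"
assert safe_category("Unknown topic") == "Climate Change"
```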
climateqa/engine/chains/answer_ai_impact.py CHANGED
@@ -38,6 +38,7 @@ def make_ai_impact_chain(llm):
 def make_ai_impact_node(llm):
 
    ai_impact_chain = make_ai_impact_chain(llm)
+
 
    async def answer_ai_impact(state,config):
        answer = await ai_impact_chain.ainvoke({"question":state["user_input"]},config)
climateqa/engine/chains/answer_rag.py CHANGED
@@ -61,6 +61,7 @@ def make_rag_node(llm,with_docs = True):
 
    async def answer_rag(state,config):
        answer = await rag_chain.ainvoke(state,config)
+        print(f"\n\nAnswer:\n{answer}")
        return {"answer":answer}
 
    return answer_rag
climateqa/engine/chains/chitchat_categorization.py ADDED
@@ -0,0 +1,43 @@
+
+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+
+class IntentCategorizer(BaseModel):
+    """Analyzing the user message input"""
+
+    environment: bool = Field(
+        description="Return 'True' if the question relates to climate change, the environment, nature, etc. (Example: should I eat fish?). Return 'False' if the question is just chit chat or not related to the environment or climate change.",
+    )
+
+
+def make_chitchat_intent_categorization_chain(llm):
+
+    openai_functions = [convert_to_openai_function(IntentCategorizer)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"IntentCategorizer"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
+
+
+def make_chitchat_intent_categorization_node(llm):
+
+    categorization_chain = make_chitchat_intent_categorization_chain(llm)
+
+    def categorize_message(state):
+        output = categorization_chain.invoke({"input": state["user_input"]})
+        print(f"\n\nChit chat output intent categorization: {output}\n")
+        state["search_graphs_chitchat"] = output["environment"]
+        print(f"\n\nChit chat output intent categorization: {state}\n")
+        return state
+
+    return categorize_message
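A minimal usage sketch for the new chain (illustrative; assumes a function-calling-capable OpenAI chat model, since the chain binds OpenAI functions to the llm):

```python
from langchain_openai import ChatOpenAI
from climateqa.engine.chains.chitchat_categorization import make_chitchat_intent_categorization_chain

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)  # model name is an assumption
chain = make_chitchat_intent_categorization_chain(llm)
output = chain.invoke({"input": "Should I eat fish?"})
print(output)  # expected shape: {"environment": True}
```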
climateqa/engine/chains/graph_retriever.py ADDED
@@ -0,0 +1,126 @@
+import sys
+import os
+from contextlib import contextmanager
+
+from ..reranker import rerank_docs
+from ..graph_retriever import GraphRetriever
+from ...utils import remove_duplicates_keep_highest_score
+
+
+def divide_into_parts(target, parts):
+    # Base value for each part
+    base = target // parts
+    # Remainder to distribute
+    remainder = target % parts
+    # List to hold the result
+    result = []
+
+    for i in range(parts):
+        if i < remainder:
+            # These parts get base value + 1
+            result.append(base + 1)
+        else:
+            # The rest get the base value
+            result.append(base)
+
+    return result
+
+
+@contextmanager
+def suppress_output():
+    # Open a null device
+    with open(os.devnull, 'w') as devnull:
+        # Store the original stdout and stderr
+        old_stdout = sys.stdout
+        old_stderr = sys.stderr
+        # Redirect stdout and stderr to the null device
+        sys.stdout = devnull
+        sys.stderr = devnull
+        try:
+            yield
+        finally:
+            # Restore stdout and stderr
+            sys.stdout = old_stdout
+            sys.stderr = old_stderr
+
+
+def make_graph_retriever_node(vectorstore, reranker, rerank_by_question=True, k_final=15, k_before_reranking=100):
+
+    def retrieve_graphs(state):
+        print("---- Retrieving graphs ----")
+
+        POSSIBLE_SOURCES = ["IEA", "OWID"]
+        questions = state["questions"] if state["questions"] is not None else [state["query"]]
+        sources_input = state["sources_input"]
+
+        auto_mode = "auto" in sources_input
+
+        # There are several options to get the final top k
+        # Option 1 - Get 100 documents by question and rerank by question
+        # Option 2 - Get 100/n documents by question and rerank the total
+        if rerank_by_question:
+            k_by_question = divide_into_parts(k_final,len(questions))
+
+        docs = []
+
+        for i,q in enumerate(questions):
+
+            question = q["question"] if isinstance(q, dict) else q
+
+            print(f"Subquestion {i}: {question}")
+
+            # If auto mode, we use all sources
+            if auto_mode:
+                sources = POSSIBLE_SOURCES
+            # Otherwise, we use the config
+            else:
+                sources = sources_input
+
+            if any([x in POSSIBLE_SOURCES for x in sources]):
+
+                sources = [x for x in sources if x in POSSIBLE_SOURCES]
+
+                # Search the document store using the retriever
+                retriever = GraphRetriever(
+                    vectorstore = vectorstore,
+                    sources = sources,
+                    k_total = k_before_reranking,
+                    threshold = 0.5,
+                )
+                docs_question = retriever.get_relevant_documents(question)
+
+                # Rerank
+                if reranker is not None:
+                    with suppress_output():
+                        docs_question = rerank_docs(reranker,docs_question,question)
+                else:
+                    # Add a default reranking score
+                    for doc in docs_question:
+                        doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
+
+                # If rerank by question we select the top documents for each question
+                if rerank_by_question:
+                    docs_question = docs_question[:k_by_question[i]]
+
+                # Add sources used in the metadata
+                for doc in docs_question:
+                    doc.metadata["sources_used"] = sources
+
+                print(f"{len(docs_question)} graphs retrieved for subquestion {i + 1}: {docs_question}")
+
+                docs.extend(docs_question)
+
+            else:
+                print(f"There are no graphs which match the sources filtered on. Sources filtered on: {sources}. Sources available: {POSSIBLE_SOURCES}.")
+
+        # Remove duplicates and keep the duplicate document with the highest reranking score
+        docs = remove_duplicates_keep_highest_score(docs)
+
+        # Sorting the list in descending order by rerank_score
+        # Then select the top k
+        docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
+        docs = docs[:k_final]
+
+        return {"recommended_content": docs}
+
+    return retrieve_graphs
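The allocation behavior of `divide_into_parts` is easiest to see on concrete values; an equivalent compact form with a couple of checks:

```python
def divide_into_parts(target, parts):
    # Split target across parts as evenly as possible,
    # handing the remainder to the first parts.
    base, remainder = divmod(target, parts)
    return [base + 1 if i < remainder else base for i in range(parts)]

assert divide_into_parts(15, 4) == [4, 4, 4, 3]
assert divide_into_parts(15, 2) == [8, 7]
assert sum(divide_into_parts(15, 4)) == 15
```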
climateqa/engine/chains/intent_categorization.py CHANGED
@@ -7,6 +7,34 @@ from langchain_core.utils.function_calling import convert_to_openai_function
 from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
 
 
+# class IntentCategorizer(BaseModel):
+#     """Analyzing the user message input"""
+
+#     language: str = Field(
+#         description="Find the language of the message input in full words (ex: French, English, Spanish, ...), defaults to English",
+#         default="English",
+#     )
+#     intent: str = Field(
+#         enum=[
+#             "ai",
+#             # "geo_info",
+#             # "esg"
+#             "search",
+#             "chitchat",
+#         ],
+#         description="""
+#             Categorize the user input in one of the following category
+#             Any question
+
+#             Examples:
+#             - ai = any question related to AI: "What are the environmental consequences of AI", "How does AI affect the environment"
+#             - search = Searching for any question about climate change, energy, biodiversity, nature, and everything we can find the IPCC or IPBES reports or scientific papers. Also questions about individual actions or anything loosely related to the environment.
+#             - chitchat = Any chit chat or any question that is not related to the environment or climate change or for which it is not necessary to look for the answer in the IPCC, IPBES, IPOS or scientific reports.
+#         """,
+#         # - geo_info = Geolocated info about climate change: Any question where the user wants to know localized impacts of climate change, eg: "What will be the temperature in Marseille in 2050"
+#         # - esg = Any question about the ESG regulation, frameworks and standards like the CSRD, TCFD, SASB, GRI, CDP, etc.
+#     )
+
 class IntentCategorizer(BaseModel):
    """Analyzing the user message input"""
 
@@ -16,9 +44,9 @@ class IntentCategorizer(BaseModel):
    )
    intent: str = Field(
        enum=[
-            "ai_impact",
-            "geo_info",
-            "esg",
+            "ai",
+            # "geo_info",
+            # "esg"
            "search",
            "chitchat",
        ],
@@ -27,12 +55,12 @@ class IntentCategorizer(BaseModel):
            Any question
 
            Examples:
-            - ai_impact = Environmental impacts of AI: "What are the environmental impacts of AI", "How does AI affect the environment"
-            - geo_info = Geolocated info about climate change: Any question where the user wants to know localized impacts of climate change, eg: "What will be the temperature in Marseille in 2050"
-            - esg = Any question about the ESG regulation, frameworks and standards like the CSRD, TCFD, SASB, GRI, CDP, etc.
+            - ai = Any query related to Artificial Intelligence: "What are the environmental consequences of AI", "How does AI affect the environment"
            - search = Searching for any quesiton about climate change, energy, biodiversity, nature, and everything we can find the IPCC or IPBES reports or scientific papers,
            - chitchat = Any general question that is not related to the environment or climate change or just conversational, or if you don't think searching the IPCC or IPBES reports would be relevant
        """,
+        # - geo_info = Geolocated info about climate change: Any question where the user wants to know localized impacts of climate change, eg: "What will be the temperature in Marseille in 2050"
+        # - esg = Any question about the ESG regulation, frameworks and standards like the CSRD, TCFD, SASB, GRI, CDP, etc.
    )
 
 
@@ -43,7 +71,7 @@ def make_intent_categorization_chain(llm):
    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"IntentCategorizer"})
 
    prompt = ChatPromptTemplate.from_messages([
-        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
+        ("system", "You are a helpful assistant, you will analyze, and categorize the user input message using the function provided. Categorize the user input as ai ONLY if it is related to Artificial Intelligence, search if it is related to the environment, climate change, energy, biodiversity, nature, etc. and chitchat if it is just general conversation."),
        ("user", "input: {input}")
    ])
 
@@ -56,7 +84,8 @@ def make_intent_categorization_node(llm):
    categorization_chain = make_intent_categorization_chain(llm)
 
    def categorize_message(state):
-        output = categorization_chain.invoke({"input":state["user_input"]})
+        output = categorization_chain.invoke({"input": state["user_input"]})
+        print(f"\n\nOutput intent categorization: {output}\n")
        if "language" not in output: output["language"] = "English"
        output["query"] = state["user_input"]
        return output
climateqa/engine/chains/prompts.py CHANGED
@@ -147,4 +147,27 @@ audience_prompts = {
    "children": "6 year old children that don't know anything about science and climate change and need metaphors to learn",
    "general": "the general public who know the basics in science and climate change and want to learn more about it without technical terms. Still use references to passages.",
    "experts": "expert and climate scientists that are not afraid of technical terms",
-}
+}
+
+
+answer_prompt_graph_template = """
+Given the user question and a list of graphs which are related to the question, rank the graphs based on relevance to the user question. ALWAYS follow the guidelines given below.
+
+### Guidelines ###
+- Keep all the graphs that are given to you.
+- NEVER modify the graph HTML embedding, the category or the source leave them exactly as they are given.
+- Return the ranked graphs as a list of dictionaries with keys 'embedding', 'category', and 'source'.
+- Return a valid JSON output.
+
+-----------------------
+User question:
+{query}
+
+Graphs and their HTML embedding:
+{recommended_content}
+
+-----------------------
+{format_instructions}
+
+Output the result as json with a key "graphs" containing a list of dictionaries of the relevant graphs with keys 'embedding', 'category', and 'source'. Do not modify the graph HTML embedding, the category or the source. Do not put any message or text before or after the JSON output.
+"""
climateqa/engine/chains/query_transformation.py CHANGED
@@ -62,15 +62,15 @@ class QueryAnalysis(BaseModel):
    # """
    # )
 
-    sources: List[Literal["IPCC", "IPBES", "IPOS","OpenAlex"]] = Field(
+    sources: List[Literal["IPCC", "IPBES", "IPOS"]] = Field( #,"OpenAlex"]] = Field(
        ...,
        description="""
        Given a user question choose which documents would be most relevant for answering their question,
        - IPCC is for questions about climate change, energy, impacts, and everything we can find the IPCC reports
        - IPBES is for questions about biodiversity and nature
        - IPOS is for questions about the ocean and deep sea mining
-        - OpenAlex is for any other questions that are not in the previous categories but could be found in the scientific litterature
        """,
+        # - OpenAlex is for any other questions that are not in the previous categories but could be found in the scientific litterature
    )
    # topics: List[Literal[
    #     "Climate change",
@@ -143,6 +143,11 @@ def make_query_transform_node(llm):
        for question in new_state["questions"]:
            question_state = {"question":question}
            analysis_output = rewriter_chain.invoke({"input":question})
+
+            # The case when the llm does not return any sources
+            if not analysis_output["sources"] or not all(source in ["IPCC", "IPBES", "IPOS"] for source in analysis_output["sources"]):
+                analysis_output["sources"] = ["IPCC", "IPBES", "IPOS"]
+
            question_state.update(analysis_output)
            questions.append(question_state)
        new_state["questions"] = questions
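The new source fallback can be sanity-checked in isolation; a minimal sketch of the guard's behavior (the `normalize_sources` helper is a hypothetical extraction of the inline check):

```python
VALID_SOURCES = ["IPCC", "IPBES", "IPOS"]

def normalize_sources(sources):
    # Mirrors the guard above: an empty or partially invalid selection
    # falls back to all three report sources.
    if not sources or not all(s in VALID_SOURCES for s in sources):
        return ["IPCC", "IPBES", "IPOS"]
    return sources

assert normalize_sources([]) == ["IPCC", "IPBES", "IPOS"]
assert normalize_sources(["OpenAlex"]) == ["IPCC", "IPBES", "IPOS"]
assert normalize_sources(["IPBES"]) == ["IPBES"]
```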
climateqa/engine/chains/retriever.py CHANGED
@@ -49,7 +49,7 @@ def make_retriever_node(vectorstore,reranker,rerank_by_question=True, k_final=15
 
    def retrieve_documents(state):
 
-        POSSIBLE_SOURCES = ["IPCC","IPBES","IPOS","OpenAlex"]
+        POSSIBLE_SOURCES = ["IPCC","IPBES","IPOS"] # ,"OpenAlex"]
        questions = state["questions"]
 
        # Use sources from the user input or from the LLM detection
climateqa/engine/chains/set_defaults.py ADDED
@@ -0,0 +1,13 @@
+def set_defaults(state):
+    print("---- Setting defaults ----")
+
+    if not state["audience"] or state["audience"] is None:
+        state.update({"audience": "experts"})
+
+    sources_input = state["sources_input"] if "sources_input" in state else ["auto"]
+    state.update({"sources_input": sources_input})
+
+    # if not state["sources_input"] or state["sources_input"] is None:
+    #     state.update({"sources_input": ["auto"]})
+
+    return state
climateqa/engine/graph.py CHANGED
@@ -4,10 +4,10 @@ from contextlib import contextmanager
 
 from langchain.schema import Document
 from langgraph.graph import END, StateGraph
-from langchain_core.runnables.graph import CurveStyle, NodeColors, MermaidDrawMethod
+from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod
 
 from typing_extensions import TypedDict
-from typing import List
+from typing import List, Dict
 
 from IPython.display import display, HTML, Image
 
@@ -27,12 +27,15 @@ class GraphState(TypedDict):
    user_input : str
    language : str
    intent : str
+    search_graphs_chitchat : bool
    query: str
    questions : List[dict]
    answer: str
    audience: str = "experts"
    sources_input: List[str] = ["auto"]
    documents: List[Document]
+    recommended_content : List[Document]
+    # graphs_returned: Dict[str,str]
 
 def search(state):
    return {}
@@ -46,6 +49,13 @@ def route_intent(state):
    else:
        # Search route
        return "search"
+
+def chitchat_route_intent(state):
+    intent = state["search_graphs_chitchat"]
+    if intent is True:
+        return "retrieve_graphs_chitchat"
+    elif intent is False:
+        return END
 
 def route_translation(state):
    if state["language"].lower() == "english":
@@ -64,7 +74,7 @@ def route_based_on_relevant_docs(state,threshold_docs=0.2):
 
 def make_id_dict(values):
    return {k:k for k in values}
 
-def make_graph_agent(llm,vectorstore,reranker,threshold_docs = 0.2):
+def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, reranker, threshold_docs=0.2):
 
    workflow = StateGraph(GraphState)
@@ -74,23 +84,35 @@ def make_graph_agent(llm,vectorstore,reranker,threshold_docs = 0.2):
    translate_query = make_translation_node(llm)
    answer_chitchat = make_chitchat_node(llm)
    answer_ai_impact = make_ai_impact_node(llm)
-    retrieve_documents = make_retriever_node(vectorstore,reranker)
-    answer_rag = make_rag_node(llm,with_docs=True)
-    answer_rag_no_docs = make_rag_node(llm,with_docs=False)
+    retrieve_documents = make_retriever_node(vectorstore_ipcc, reranker)
+    retrieve_graphs = make_graph_retriever_node(vectorstore_graphs, reranker)
+    # answer_rag_graph = make_rag_graph_node(llm)
+    answer_rag = make_rag_node(llm, with_docs=True)
+    answer_rag_no_docs = make_rag_node(llm, with_docs=False)
+    chitchat_categorize_intent = make_chitchat_intent_categorization_node(llm)
 
    # Define the nodes
+    workflow.add_node("set_defaults", set_defaults)
    workflow.add_node("categorize_intent", categorize_intent)
    workflow.add_node("search", search)
    workflow.add_node("transform_query", transform_query)
    workflow.add_node("translate_query", translate_query)
+    # workflow.add_node("transform_query_ai", transform_query)
+    # workflow.add_node("translate_query_ai", translate_query)
    workflow.add_node("answer_chitchat", answer_chitchat)
+    workflow.add_node("chitchat_categorize_intent", chitchat_categorize_intent)
    workflow.add_node("answer_ai_impact", answer_ai_impact)
-    workflow.add_node("retrieve_documents",retrieve_documents)
-    workflow.add_node("answer_rag",answer_rag)
-    workflow.add_node("answer_rag_no_docs",answer_rag_no_docs)
+    workflow.add_node("retrieve_graphs", retrieve_graphs)
+    workflow.add_node("retrieve_graphs_chitchat", retrieve_graphs)
+    # workflow.add_node("retrieve_graphs_ai", retrieve_graphs)
+    # workflow.add_node("answer_rag_graph", answer_rag_graph)
+    # workflow.add_node("answer_rag_graph_ai", answer_rag_graph)
+    workflow.add_node("retrieve_documents", retrieve_documents)
+    workflow.add_node("answer_rag", answer_rag)
+    workflow.add_node("answer_rag_no_docs", answer_rag_no_docs)
 
    # Entry point
-    workflow.set_entry_point("categorize_intent")
+    workflow.set_entry_point("set_defaults")
 
    # CONDITIONAL EDGES
    workflow.add_conditional_edges(
@@ -99,6 +121,12 @@ def make_graph_agent(llm,vectorstore,reranker,threshold_docs = 0.2):
        make_id_dict(["answer_chitchat","answer_ai_impact","search"])
    )
 
+    workflow.add_conditional_edges(
+        "chitchat_categorize_intent",
+        chitchat_route_intent,
+        make_id_dict(["retrieve_graphs_chitchat", END])
+    )
+
    workflow.add_conditional_edges(
        "search",
        route_translation,
@@ -112,13 +140,24 @@ def make_graph_agent(llm,vectorstore,reranker,threshold_docs = 0.2):
    )
 
    # Define the edges
+    workflow.add_edge("set_defaults", "categorize_intent")
    workflow.add_edge("translate_query", "transform_query")
-    workflow.add_edge("transform_query", "retrieve_documents")
-    workflow.add_edge("retrieve_documents", "answer_rag")
+    workflow.add_edge("transform_query", "retrieve_graphs")
+    # workflow.add_edge("retrieve_graphs", "answer_rag_graph")
+    workflow.add_edge("retrieve_graphs", "retrieve_documents")
+    # workflow.add_edge("answer_rag_graph", "retrieve_documents")
    workflow.add_edge("answer_rag", END)
    workflow.add_edge("answer_rag_no_docs", END)
-    workflow.add_edge("answer_chitchat", END)
+    workflow.add_edge("answer_chitchat", "chitchat_categorize_intent")
+    # workflow.add_edge("answer_chitchat", END)
    workflow.add_edge("answer_ai_impact", END)
+    workflow.add_edge("retrieve_graphs_chitchat", END)
+    # workflow.add_edge("answer_ai_impact", "translate_query_ai")
+    # workflow.add_edge("translate_query_ai", "transform_query_ai")
+    # workflow.add_edge("transform_query_ai", "retrieve_graphs_ai")
+    # workflow.add_edge("retrieve_graphs_ai", "answer_rag_graph_ai")
+    # workflow.add_edge("answer_rag_graph_ai", END)
+    # workflow.add_edge("retrieve_graphs_ai", END)
 
    # Compile
    app = workflow.compile()
@@ -135,4 +174,143 @@
            draw_method=MermaidDrawMethod.API,
        )
    )
-)
+    )
+
+# import sys
+# import os
+# from contextlib import contextmanager
+
+# from langchain.schema import Document
+# from langgraph.graph import END, StateGraph
+# from langchain_core.runnables.graph import CurveStyle, NodeColors, MermaidDrawMethod
+
+# from typing_extensions import TypedDict
+# from typing import List
+
+# from IPython.display import display, HTML, Image
+
+# from .chains.answer_chitchat import make_chitchat_node
+# from .chains.answer_ai_impact import make_ai_impact_node
+# from .chains.query_transformation import make_query_transform_node
+# from .chains.translation import make_translation_node
+# from .chains.intent_categorization import make_intent_categorization_node
+# from .chains.retriever import make_retriever_node
+# from .chains.answer_rag import make_rag_node
+
+
+# class GraphState(TypedDict):
+#     """
+#     Represents the state of our graph.
+#     """
+#     user_input : str
+#     language : str
+#     intent : str
+#     query: str
+#     questions : List[dict]
+#     answer: str
+#     audience: str = "experts"
+#     sources_input: List[str] = ["auto"]
+#     documents: List[Document]
+
+# def search(state):
+#     return {}
+
+# def route_intent(state):
+#     intent = state["intent"]
+#     if intent in ["chitchat","esg"]:
+#         return "answer_chitchat"
+#     elif intent == "ai_impact":
+#         return "answer_ai_impact"
+#     else:
+#         # Search route
+#         return "search"
+
+# def route_translation(state):
+#     if state["language"].lower() == "english":
+#         return "transform_query"
+#     else:
+#         return "translate_query"
+
+# def route_based_on_relevant_docs(state,threshold_docs=0.2):
+#     docs = [x for x in state["documents"] if x.metadata["reranking_score"] > threshold_docs]
+#     if len(docs) > 0:
+#         return "answer_rag"
+#     else:
+#         return "answer_rag_no_docs"
+
+
+# def make_id_dict(values):
+#     return {k:k for k in values}
+
+# def make_graph_agent(llm,vectorstore,reranker,threshold_docs = 0.2):
+
+#     workflow = StateGraph(GraphState)
+
+#     # Define the node functions
+#     categorize_intent = make_intent_categorization_node(llm)
+#     transform_query = make_query_transform_node(llm)
+#     translate_query = make_translation_node(llm)
+#     answer_chitchat = make_chitchat_node(llm)
+#     answer_ai_impact = make_ai_impact_node(llm)
+#     retrieve_documents = make_retriever_node(vectorstore,reranker)
+#     answer_rag = make_rag_node(llm,with_docs=True)
+#     answer_rag_no_docs = make_rag_node(llm,with_docs=False)
+
+#     # Define the nodes
+#     workflow.add_node("categorize_intent", categorize_intent)
+#     workflow.add_node("search", search)
+#     workflow.add_node("transform_query", transform_query)
+#     workflow.add_node("translate_query", translate_query)
+#     workflow.add_node("answer_chitchat", answer_chitchat)
+#     workflow.add_node("answer_ai_impact", answer_ai_impact)
+#     workflow.add_node("retrieve_documents",retrieve_documents)
+#     workflow.add_node("answer_rag",answer_rag)
+#     workflow.add_node("answer_rag_no_docs",answer_rag_no_docs)
+
+#     # Entry point
+#     workflow.set_entry_point("categorize_intent")
+
+#     # CONDITIONAL EDGES
+#     workflow.add_conditional_edges(
+#         "categorize_intent",
+#         route_intent,
+#         make_id_dict(["answer_chitchat","answer_ai_impact","search"])
+#     )
+
+#     workflow.add_conditional_edges(
+#         "search",
+#         route_translation,
+#         make_id_dict(["translate_query","transform_query"])
+#     )
+
+#     workflow.add_conditional_edges(
+#         "retrieve_documents",
+#         lambda x : route_based_on_relevant_docs(x,threshold_docs=threshold_docs),
+#         make_id_dict(["answer_rag","answer_rag_no_docs"])
+#     )
+
+#     # Define the edges
+#     workflow.add_edge("translate_query", "transform_query")
+#     workflow.add_edge("transform_query", "retrieve_documents")
+#     workflow.add_edge("retrieve_documents", "answer_rag")
+#     workflow.add_edge("answer_rag", END)
+#     workflow.add_edge("answer_rag_no_docs", END)
+#     workflow.add_edge("answer_chitchat", END)
+#     workflow.add_edge("answer_ai_impact", END)
+
+#     # Compile
+#     app = workflow.compile()
+#     return app
+
+
+
+
+# def display_graph(app):
+
+#     display(
+#         Image(
+#             app.get_graph(xray = True).draw_mermaid_png(
+#                 draw_method=MermaidDrawMethod.API,
+#             )
+#         )
+#     )
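A minimal sketch of driving the compiled agent, mirroring the `astream_events` loop in app.py (the question and event filtering here are illustrative):

```python
import asyncio

async def run_once(agent, question):
    inputs = {"user_input": question, "audience": "experts", "sources": ["IPCC"]}
    async for event in agent.astream_events(inputs, version="v1"):
        # Watch the two retrieval nodes wired up by this commit finish.
        if event["name"] in ("retrieve_graphs", "retrieve_documents") and event["event"] == "on_chain_end":
            print(event["name"], "done")

# asyncio.run(run_once(agent, "How fast is sea level rising?"))
```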
climateqa/engine/graph_retriever.py ADDED
@@ -0,0 +1,48 @@
+ from langchain_core.retrievers import BaseRetriever
+ from langchain_core.documents.base import Document
+ from langchain_core.vectorstores import VectorStore
+ from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
+
+ from typing import List
+
+ class GraphRetriever(BaseRetriever):
+     vectorstore:VectorStore
+     sources:list = ["OWID"]  # TODO: add OurWorldInData later; will need to be integrated with the other retriever
+     threshold:float = 0.5
+     k_total:int = 10
+
+     def _get_relevant_documents(
+         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+     ) -> List[Document]:
+
+         # Check that at least one requested source is supported (currently only OWID)
+         assert isinstance(self.sources,list)
+         assert self.sources
+         assert any([x in ["OWID"] for x in self.sources])
+
+         # Prepare base search kwargs
+         filters = {}
+
+         filters["source"] = {"$in": self.sources}
+
+         docs = self.vectorstore.similarity_search_with_score(query=query, filter=filters, k=self.k_total)
+
+         # Keep only documents whose scores are above the threshold
+         docs = [x for x in docs if x[1] > self.threshold]
+
+         # Remove duplicate documents
+         unique_docs = []
+         seen_docs = []
+         for doc, score in docs:
+             if doc.page_content not in seen_docs:
+                 unique_docs.append((doc, score))
+                 seen_docs.append(doc.page_content)
+
+         # Add score to metadata
+         results = []
+         for doc, score in unique_docs:
+             doc.metadata["similarity_score"] = score
+             doc.metadata["content"] = doc.page_content
+             results.append(doc)
+
+         return results
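A minimal usage sketch for the new GraphRetriever, assuming `vectorstore` is an existing VectorStore holding OWID graph documents; the query is illustrative, and invoke() comes from the runnable interface that BaseRetriever implements:

    from climateqa.engine.graph_retriever import GraphRetriever

    retriever = GraphRetriever(vectorstore=vectorstore, sources=["OWID"], threshold=0.5, k_total=10)
    graphs = retriever.invoke("CO2 emissions per capita")
    for doc in graphs:
        print(doc.metadata["similarity_score"], doc.page_content[:80])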
climateqa/engine/reranker.py CHANGED
@@ -1,11 +1,14 @@
  import os
+ from dotenv import load_dotenv
  from scipy.special import expit, logit
  from rerankers import Reranker
+ from sentence_transformers import CrossEncoder
 
+ load_dotenv()
 
- def get_reranker(model = "nano",cohere_api_key = None):
+ def get_reranker(model = "jina", cohere_api_key = None):
 
-     assert model in ["nano","tiny","small","large"]
+     assert model in ["nano","tiny","small","large", "jina"]
 
      if model == "nano":
          reranker = Reranker('ms-marco-TinyBERT-L-2-v2', model_type='flashrank')
@@ -17,6 +20,11 @@ def get_reranker(model = "nano",cohere_api_key = None):
          if cohere_api_key is None:
              cohere_api_key = os.environ["COHERE_API_KEY"]
          reranker = Reranker("cohere", lang='en', api_key = cohere_api_key)
+     elif model == "jina":
+         # Reached the token quota, so this currently does not work
+         reranker = Reranker("jina-reranker-v2-base-multilingual", api_key = os.getenv("JINA_RERANKER_API_KEY"))
+         # does not work without a GPU? and it returns results in a different structure anyway, so the retriever node code would need changing
+         # reranker = CrossEncoder("jinaai/jina-reranker-v2-base-multilingual", automodel_args={"torch_dtype": "auto"}, trust_remote_code=True,)
      return reranker
 
@@ -26,6 +34,7 @@ def rerank_docs(reranker,docs,query):
      # Get a list of texts from langchain docs
      input_docs = [x.page_content for x in docs]
 
+     print(f"\n\nDOCS:{input_docs}\n\n")
      # Rerank using rerankers library
      results = reranker.rank(query=query, docs=input_docs)
 
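A quick sketch of exercising the reranker end to end, assuming `docs` is a list of LangChain Documents coming out of a retriever; the "nano" model runs locally through flashrank, so no API key is required:

    from climateqa.engine.reranker import get_reranker, rerank_docs

    reranker = get_reranker("nano")
    results = rerank_docs(reranker, docs, query="impact of droughts on crop yields")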
climateqa/engine/retriever.py CHANGED
@@ -28,6 +28,7 @@ class ClimateQARetriever(BaseRetriever):
 
          # Check if all elements in the list are either IPCC or IPBES
          assert isinstance(self.sources,list)
+         assert self.sources
          assert all([x in ["IPCC","IPBES","IPOS"] for x in self.sources])
          assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
 
climateqa/engine/vectorstore.py CHANGED
@@ -4,6 +4,7 @@
  import os
  from pinecone import Pinecone
  from langchain_community.vectorstores import Pinecone as PineconeVectorstore
+ from langchain_chroma import Chroma
 
  # LOAD ENVIRONMENT VARIABLES
  try:
@@ -13,6 +14,11 @@ except:
      pass
 
 
+ def get_chroma_vectorstore(embedding_function, persist_directory="/home/dora/climate-question-answering/data/vectorstore"):
+     vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
+     return vectorstore
+
+
  def get_pinecone_vectorstore(embeddings,text_key = "content"):
 
      # # initialize pinecone
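A minimal sketch of loading the persisted Chroma store, reusing the project's embeddings function; note the default persist_directory above is machine-specific, so an assumed local path is passed here instead:

    from climateqa.engine.embeddings import get_embeddings_function
    from climateqa.engine.vectorstore import get_chroma_vectorstore

    embeddings_function = get_embeddings_function()
    vectorstore = get_chroma_vectorstore(embeddings_function, persist_directory="./data/vectorstore")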
climateqa/utils.py CHANGED
@@ -20,3 +20,16 @@ def get_image_from_azure_blob_storage(path):
      file_object = get_file_from_azure_blob_storage(path)
      image = Image.open(file_object)
      return image
+
+ def remove_duplicates_keep_highest_score(documents):
+     unique_docs = {}
+
+     for doc in documents:
+         doc_id = doc.metadata.get('doc_id')
+         if doc_id in unique_docs:
+             if doc.metadata['reranking_score'] > unique_docs[doc_id].metadata['reranking_score']:
+                 unique_docs[doc_id] = doc
+         else:
+             unique_docs[doc_id] = doc
+
+     return list(unique_docs.values())
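A small sketch of the dedup helper in action, with hand-built Documents whose metadata values are illustrative; only the highest-scoring chunk per doc_id survives:

    from langchain_core.documents import Document
    from climateqa.utils import remove_duplicates_keep_highest_score

    docs = [
        Document(page_content="chunk A", metadata={"doc_id": "owid-1", "reranking_score": 0.42}),
        Document(page_content="chunk B", metadata={"doc_id": "owid-1", "reranking_score": 0.91}),
        Document(page_content="chunk C", metadata={"doc_id": "owid-2", "reranking_score": 0.55}),
    ]

    best = remove_duplicates_keep_highest_score(docs)  # keeps chunk B and chunk C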
front/utils.py CHANGED
@@ -33,6 +33,85 @@ def parse_output_llm_with_sources(output):
      return content_parts
 
 
+ from collections import defaultdict
+
+ def generate_html_graphs(graphs):
+     # Organize graphs by category
+     categories = defaultdict(list)
+     for graph in graphs:
+         category = graph['metadata']['category']
+         categories[category].append(graph['embedding'])
+
+     # Begin constructing the HTML
+     html_code = '''
+     <!DOCTYPE html>
+     <html lang="en">
+     <head>
+         <meta charset="UTF-8">
+         <meta name="viewport" content="width=device-width, initial-scale=1.0">
+         <title>Graphs by Category</title>
+         <style>
+             .tab-content {
+                 display: none;
+             }
+             .tab-content.active {
+                 display: block;
+             }
+             .tabs {
+                 margin-bottom: 20px;
+             }
+             .tab-button {
+                 background-color: #ddd;
+                 border: none;
+                 padding: 10px 20px;
+                 cursor: pointer;
+                 margin-right: 5px;
+             }
+             .tab-button.active {
+                 background-color: #ccc;
+             }
+         </style>
+         <script>
+             function showTab(tabId) {
+                 var contents = document.getElementsByClassName('tab-content');
+                 var buttons = document.getElementsByClassName('tab-button');
+                 for (var i = 0; i < contents.length; i++) {
+                     contents[i].classList.remove('active');
+                     buttons[i].classList.remove('active');
+                 }
+                 document.getElementById(tabId).classList.add('active');
+                 document.querySelector('button[data-tab="'+tabId+'"]').classList.add('active');
+             }
+         </script>
+     </head>
+     <body>
+     <div class="tabs">
+     '''
+
+     # Add buttons for each category
+     for i, category in enumerate(categories.keys()):
+         active_class = 'active' if i == 0 else ''
+         html_code += f'<button class="tab-button {active_class}" onclick="showTab(\'tab-{i}\')" data-tab="tab-{i}">{category}</button>'
+
+     html_code += '</div>'
+
+     # Add content for each category
+     for i, (category, embeds) in enumerate(categories.items()):
+         active_class = 'active' if i == 0 else ''
+         html_code += f'<div id="tab-{i}" class="tab-content {active_class}">'
+         for embed in embeds:
+             html_code += embed
+         html_code += '</div>'
+
+     html_code += '''
+     </body>
+     </html>
+     '''
+
+     return html_code
+
+
+
  def make_html_source(source,i):
      meta = source.metadata
      # content = source.page_content.split(":",1)[1].strip()
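A sketch of feeding generate_html_graphs, assuming each graph dict carries an embeddable iframe string and a category in its metadata, which is the shape the function reads; the URLs are illustrative:

    from front.utils import generate_html_graphs

    graphs = [
        {"embedding": '<iframe src="https://ourworldindata.org/grapher/co2-emissions-per-capita"></iframe>',
         "metadata": {"category": "Emissions"}},
        {"embedding": '<iframe src="https://ourworldindata.org/grapher/sea-level"></iframe>',
         "metadata": {"category": "Oceans"}},
    ]

    html = generate_html_graphs(graphs)  # one tab per category, embeds inside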
style.css CHANGED
@@ -3,7 +3,7 @@
      --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
  } */
 
- .warning-box {
+ .warning-box {
      background-color: #fff3cd;
      border: 1px solid #ffeeba;
      border-radius: 4px;
@@ -464,4 +464,32 @@ span.chatbot > p > img{
 
  .score-orange{
      color:red !important;
- }
+ }
+
+ /* Additional style for scrollable tab content */
+ div#tab-recommended_content {
+     overflow-y: auto; /* Enable vertical scrolling */
+     max-height: 80vh; /* Adjust height as needed */
+ }
+
+ /* Mobile specific adjustments */
+ @media screen and (max-width: 767px) {
+     div#tab-recommended_content {
+         max-height: 50vh; /* Reduce height for smaller screens */
+         overflow-y: auto;
+     }
+ }
+
+ /* Additional style for scrollable tab content */
+ div#tab-saved-graphs {
+     overflow-y: auto; /* Enable vertical scrolling */
+     max-height: 80vh; /* Adjust height as needed */
+ }
+
+ /* Mobile specific adjustments */
+ @media screen and (max-width: 767px) {
+     div#tab-saved-graphs {
+         max-height: 50vh; /* Reduce height for smaller screens */
+         overflow-y: auto;
+     }
+ }