Spaces:

Ekimetrics
/

climate-question-answering

Running

App Files Files Community

feature/add_agents

#14

by timeki - opened Oct 14, 2024

base: refs/heads/main

←

from: refs/pr/14

Discussion Files changed

+3146

-578

Files changed (32) hide show

.gitignore +6 -0
app.py +106 -294
climateqa/engine/chains/__init__.py +0 -0
climateqa/engine/chains/answer_ai_impact.py +46 -0
climateqa/engine/chains/answer_chitchat.py +52 -0
climateqa/engine/chains/answer_rag.py +99 -0
climateqa/engine/chains/intent_categorization.py +86 -0
climateqa/engine/chains/keywords_extraction.py +40 -0
climateqa/engine/{prompts.py → chains/prompts.py} +2 -2
climateqa/engine/chains/query_transformation.py +193 -0
climateqa/engine/{reformulation.py → chains/reformulation.py} +1 -1
climateqa/engine/chains/retrieve_documents.py +159 -0
climateqa/engine/chains/sample_router.py +66 -0
climateqa/engine/chains/translation.py +41 -0
climateqa/engine/embeddings.py +6 -3
climateqa/engine/graph.py +149 -0
climateqa/engine/llm/__init__.py +3 -0
climateqa/engine/llm/ollama.py +6 -0
climateqa/engine/rag.py +0 -134
climateqa/engine/reranker.py +40 -0
climateqa/engine/utils.py +17 -0
climateqa/knowledge/__init__.py +0 -0
climateqa/{papers → knowledge}/openalex.py +61 -12
climateqa/{engine → knowledge}/retriever.py +1 -83
climateqa/papers/__init__.py +0 -43
front/__init__.py +0 -0
front/callbacks.py +0 -0
front/utils.py +142 -0
requirements.txt +12 -6
sandbox/20240310 - CQA - Semantic Routing 1.ipynb +0 -0
style.css +118 -0
test.json +0 -0

.gitignore CHANGED Viewed

@@ -5,3 +5,9 @@ __pycache__/utils.cpython-38.pyc
 notebooks/
 *.pyc

 notebooks/
 *.pyc
+**/.ipynb_checkpoints/
+**/.flashrank_cache/
+data/
+sandbox/

app.py CHANGED Viewed

@@ -1,10 +1,10 @@
 from climateqa.engine.embeddings import get_embeddings_function
 embeddings_function = get_embeddings_function()
-from climateqa.papers.openalex import OpenAlex
 from sentence_transformers import CrossEncoder
-reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
 oa = OpenAlex()
 import gradio as gr
@@ -15,6 +15,8 @@ import time
 import re
 import json
 # from gradio_modal import Modal
 from io import BytesIO
@@ -29,16 +31,19 @@ from utils import create_user_id
 # ClimateQ&A imports
 from climateqa.engine.llm import get_llm
-from climateqa.engine.rag import make_rag_chain
 from climateqa.engine.vectorstore import get_pinecone_vectorstore
-from climateqa.engine.retriever import ClimateQARetriever
 from climateqa.engine.embeddings import get_embeddings_function
-from climateqa.engine.prompts import audience_prompts
 from climateqa.sample_questions import QUESTIONS
 from climateqa.constants import POSSIBLE_REPORTS
 from climateqa.utils import get_image_from_azure_blob_storage
 from climateqa.engine.keywords import make_keywords_chain
-from climateqa.engine.rag import make_rag_papers_chain
 # Load environment variables in local mode
 try:
@@ -81,48 +86,21 @@ user_id = create_user_id()
-def parse_output_llm_with_sources(output):
-    # Split the content into a list of text and "[Doc X]" references
-    content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
-    parts = []
-    for part in content_parts:
-        if part.startswith("Doc"):
-            subparts = part.split(",")
-            subparts = [subpart.lower().replace("doc","").strip() for subpart in subparts]
-            subparts = [f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{subpart}</sup></span></a>""" for subpart in subparts]
-            parts.append("".join(subparts))
-        else:
-            parts.append(part)
-    content_parts = "".join(parts)
-    return content_parts
 # Create vectorstore and retriever
 vectorstore = get_pinecone_vectorstore(embeddings_function)
 llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
-def make_pairs(lst):
-    """from a list of even lenght, make tupple pairs"""
-    return [(lst[i], lst[i + 1]) for i in range(0, len(lst), 2)]
-def serialize_docs(docs):
-    new_docs = []
-    for doc in docs:
-        new_doc = {}
-        new_doc["page_content"] = doc.page_content
-        new_doc["metadata"] = doc.metadata
-        new_docs.append(new_doc)
-    return new_docs
 async def chat(query,history,audience,sources,reports):
     """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
     (messages in gradio format, messages in langchain format, source documents)"""
-    print(f">> NEW QUESTION : {query}")
     if audience == "Children":
         audience_prompt = audience_prompts["children"]
@@ -137,77 +115,79 @@ async def chat(query,history,audience,sources,reports):
     if len(sources) == 0:
         sources = ["IPCC"]
-    if len(reports) == 0:
-        reports = []
-    retriever = ClimateQARetriever(vectorstore=vectorstore,sources = sources,min_size = 200,reports = reports,k_summary = 3,k_total = 15,threshold=0.5)
-    rag_chain = make_rag_chain(retriever,llm)
-    inputs = {"query": query,"audience": audience_prompt}
-    result = rag_chain.astream_log(inputs) #{"callbacks":[MyCustomAsyncHandler()]})
-    # result = rag_chain.stream(inputs)
-    path_reformulation = "/logs/reformulation/final_output"
-    path_keywords = "/logs/keywords/final_output"
-    path_retriever = "/logs/find_documents/final_output"
-    path_answer = "/logs/answer/streamed_output_str/-"
     docs_html = ""
     output_query = ""
     output_language = ""
     output_keywords = ""
     gallery = []
     try:
-        async for op in result:
-            op = op.ops[0]
-            if op['path'] == path_reformulation: # reforulated question
-                try:
-                    output_language = op['value']["language"] # str
-                    output_query = op["value"]["question"]
-                except Exception as e:
-                    raise gr.Error(f"ClimateQ&A Error: {e} - The error has been noted, try another question and if the error remains, you can contact us :)")
-            if op["path"] == path_keywords:
-                try:
-                    output_keywords = op['value']["keywords"] # str
-                    output_keywords = " AND ".join(output_keywords)
-                except Exception as e:
-                    pass
-            elif op['path'] == path_retriever: # documents
-                try:
-                    docs = op['value']['docs'] # List[Document]
-                    docs_html = []
-                    for i, d in enumerate(docs, 1):
-                        docs_html.append(make_html_source(d, i))
-                    docs_html = "".join(docs_html)
-                except TypeError:
-                    print("No documents found")
-                    print("op: ",op)
-                    continue
-            elif op['path'] == path_answer: # final answer
-                new_token = op['value'] # str
-                # time.sleep(0.01)
-                previous_answer = history[-1][1]
-                previous_answer = previous_answer if previous_answer is not None else ""
-                answer_yet = previous_answer + new_token
-                answer_yet = parse_output_llm_with_sources(answer_yet)
-                history[-1] = (query,answer_yet)
-            else:
-                continue
-            history = [tuple(x) for x in history]
-            yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
     except Exception as e:
         raise gr.Error(f"{e}")
@@ -268,83 +248,7 @@ async def chat(query,history,audience,sources,reports):
         history[-1] = (history[-1][0],answer_yet)
         history = [tuple(x) for x in history]
-    # gallery = [x.metadata["image_path"] for x in docs if (len(x.metadata["image_path"]) > 0 and "IAS" in x.metadata["image_path"])]
-    # if len(gallery) > 0:
-    #     gallery = list(set("|".join(gallery).split("|")))
-    #     gallery = [get_image_from_azure_blob_storage(x) for x in gallery]
-    yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
-def make_html_source(source,i):
-    meta = source.metadata
-    # content = source.page_content.split(":",1)[1].strip()
-    content = source.page_content.strip()
-    toc_levels = []
-    for j in range(2):
-        level = meta[f"toc_level{j}"]
-        if level != "N/A":
-            toc_levels.append(level)
-        else:
-            break
-    toc_levels = " > ".join(toc_levels)
-    if len(toc_levels) > 0:
-        name = f"<b>{toc_levels}</b><br/>{meta['name']}"
-    else:
-        name = meta['name']
-    if meta["chunk_type"] == "text":
-        card = f"""
-    <div class="card" id="doc{i}">
-        <div class="card-content">
-            <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
-            <p>{content}</p>
-        </div>
-        <div class="card-footer">
-            <span>{name}</span>
-            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
-                <span role="img" aria-label="Open PDF">🔗</span>
-            </a>
-        </div>
-    </div>
-    """
-    else:
-        if meta["figure_code"] != "N/A":
-            title = f"{meta['figure_code']} - {meta['short_name']}"
-        else:
-            title = f"{meta['short_name']}"
-        card = f"""
-    <div class="card card-image">
-        <div class="card-content">
-            <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
-            <p>{content}</p>
-            <p class='ai-generated'>AI-generated description</p>
-        </div>
-        <div class="card-footer">
-            <span>{name}</span>
-            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
-                <span role="img" aria-label="Open PDF">🔗</span>
-            </a>
-        </div>
-    </div>
-    """
-    return card
-#     else:
-#         docs_string = "No relevant passages found in the climate science reports (IPCC and IPBES)"
-#         complete_response = "**No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate issues).**"
-#         messages.append({"role": "assistant", "content": complete_response})
-#         gradio_format = make_pairs([a["content"] for a in messages[1:]])
-#         yield gradio_format, messages, docs_string
 def save_feedback(feed: str, user_id):
@@ -390,56 +294,6 @@ papers_cols_widths = {
 papers_cols = list(papers_cols_widths.keys())
 papers_cols_widths = list(papers_cols_widths.values())
-async def find_papers(query, keywords,after):
-    summary = ""
-    df_works = oa.search(keywords,after = after)
-    df_works = df_works.dropna(subset=["abstract"])
-    df_works = oa.rerank(query,df_works,reranker)
-    df_works = df_works.sort_values("rerank_score",ascending=False)
-    G = oa.make_network(df_works)
-    height = "750px"
-    network = oa.show_network(G,color_by = "rerank_score",notebook=False,height = height)
-    network_html = network.generate_html()
-    network_html = network_html.replace("'", "\"")
-    css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
-    network_html = network_html + css_to_inject
-    network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
-    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
-    allow-scripts allow-same-origin allow-popups
-    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
-    allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
-    docs = df_works["content"].head(15).tolist()
-    df_works = df_works.reset_index(drop = True).reset_index().rename(columns = {"index":"doc"})
-    df_works["doc"] = df_works["doc"] + 1
-    df_works = df_works[papers_cols]
-    yield df_works,network_html,summary
-    chain = make_rag_papers_chain(llm)
-    result = chain.astream_log({"question": query,"docs": docs,"language":"English"})
-    path_answer = "/logs/StrOutputParser/streamed_output/-"
-    async for op in result:
-        op = op.ops[0]
-        if op['path'] == path_answer: # reforulated question
-            new_token = op['value'] # str
-            summary += new_token
-        else:
-            continue
-        yield df_works,network_html,summary
 # --------------------------------------------------------------------
 # Gradio
@@ -469,19 +323,21 @@ def vote(data: gr.LikeData):
-with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main-component") as demo:
-    # user_id_state = gr.State([user_id])
     with gr.Tab("ClimateQ&A"):
         with gr.Row(elem_id="chatbot-row"):
             with gr.Column(scale=2):
-                # state = gr.State([system_template])
                 chatbot = gr.Chatbot(
-                    value=[(None,init_prompt)],
-                    show_copy_button=True,show_label = False,elem_id="chatbot",layout = "panel",
                     avatar_images = (None,"https://i.ibb.co/YNyd5W2/logo4.png"),
-                )#,avatar_images = ("assets/logo4.png",None))
                 # bot.like(vote,None,None)
@@ -489,8 +345,7 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
                 with gr.Row(elem_id = "input-message"):
                     textbox=gr.Textbox(placeholder="Ask me anything here!",show_label=False,scale=7,lines = 1,interactive = True,elem_id="input-textbox")
-                    # submit = gr.Button("",elem_id = "submit-button",scale = 1,interactive = True,icon = "https://static-00.iconduck.com/assets.00/settings-icon-2048x2046-cw28eevx.png")
             with gr.Column(scale=1, variant="panel",elem_id = "right-panel"):
@@ -560,9 +415,6 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
 #---------------------------------------------------------------------------------------
 # OTHER TABS
 #---------------------------------------------------------------------------------------
@@ -571,25 +423,25 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
     with gr.Tab("Figures",elem_id = "tab-images",elem_classes = "max-height other-tabs"):
         gallery_component = gr.Gallery()
-    with gr.Tab("Papers (beta)",elem_id = "tab-papers",elem_classes = "max-height other-tabs"):
-        with gr.Row():
-            with gr.Column(scale=1):
-                query_papers = gr.Textbox(placeholder="Question",show_label=False,lines = 1,interactive = True,elem_id="query-papers")
-                keywords_papers = gr.Textbox(placeholder="Keywords",show_label=False,lines = 1,interactive = True,elem_id="keywords-papers")
-                after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers")
-                search_papers = gr.Button("Search",elem_id="search-papers",interactive=True)
-            with gr.Column(scale=7):
-                with gr.Tab("Summary",elem_id="papers-summary-tab"):
-                    papers_summary = gr.Markdown(visible=True,elem_id="papers-summary")
-                with gr.Tab("Relevant papers",elem_id="papers-results-tab"):
-                    papers_dataframe = gr.Dataframe(visible=True,elem_id="papers-table",headers = papers_cols)
-                with gr.Tab("Citations network",elem_id="papers-network-tab"):
-                    citations_network = gr.HTML(visible=True,elem_id="papers-citations-network")
@@ -600,8 +452,9 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
     def start_chat(query,history):
-        history = history + [(query,None)]
-        history = [tuple(x) for x in history]
         return (gr.update(interactive = False),gr.update(selected=1),history)
     def finish_chat():
@@ -609,13 +462,13 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
     (textbox
         .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
-        .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component,query_papers,keywords_papers],concurrency_limit = 8,api_name = "chat_textbox")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
     )
     (examples_hidden
         .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
-        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component,query_papers,keywords_papers],concurrency_limit = 8,api_name = "chat_examples")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
     )
@@ -630,47 +483,6 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
     dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
-    query_papers.submit(generate_keywords,[query_papers], [keywords_papers])
-    search_papers.click(find_papers,[query_papers,keywords_papers,after], [papers_dataframe,citations_network,papers_summary])
-    # # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
-    # (textbox
-    #     .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
-    #     .success(change_tab,None,tabs)
-    #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
-    #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue = True)
-    #     .success(lambda x : textbox,[textbox],[textbox])
-    # )
-    # (examples_hidden
-    #     .change(answer_user_example, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
-    #     .success(change_tab,None,tabs)
-    #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
-    #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue=True)
-    #     .success(lambda x : textbox,[textbox],[textbox])
-    # )
-    # submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
-    #         answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
-    #     )
-    # with Modal(visible=True) as first_modal:
-    #     gr.Markdown("# Welcome to ClimateQ&A !")
-    #     gr.Markdown("### Examples")
-    #     examples = gr.Examples(
-    #         ["Yo ça roule","ça boume"],
-    #         [examples_hidden],
-    #         examples_per_page=8,
-    #         run_on_click=False,
-    #         elem_id="examples",
-    #         api_name="examples",
-    #     )
-    # submit.click(lambda: Modal(visible=True), None, config_modal)
     demo.queue()

 from climateqa.engine.embeddings import get_embeddings_function
 embeddings_function = get_embeddings_function()
+from climateqa.knowledge.openalex import OpenAlex
 from sentence_transformers import CrossEncoder
+# reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
 oa = OpenAlex()
 import gradio as gr
 import re
 import json
+from gradio import ChatMessage
 # from gradio_modal import Modal
 from io import BytesIO
 # ClimateQ&A imports
 from climateqa.engine.llm import get_llm
 from climateqa.engine.vectorstore import get_pinecone_vectorstore
+from climateqa.knowledge.retriever import ClimateQARetriever
+from climateqa.engine.reranker import get_reranker
 from climateqa.engine.embeddings import get_embeddings_function
+from climateqa.engine.chains.prompts import audience_prompts
 from climateqa.sample_questions import QUESTIONS
 from climateqa.constants import POSSIBLE_REPORTS
 from climateqa.utils import get_image_from_azure_blob_storage
 from climateqa.engine.keywords import make_keywords_chain
+# from climateqa.engine.chains.answer_rag import make_rag_papers_chain
+from climateqa.engine.graph import make_graph_agent,display_graph
+from front.utils import make_html_source,parse_output_llm_with_sources,serialize_docs,make_toolbox
 # Load environment variables in local mode
 try:
 # Create vectorstore and retriever
 vectorstore = get_pinecone_vectorstore(embeddings_function)
 llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
+reranker = get_reranker("large")
+agent = make_graph_agent(llm,vectorstore,reranker)
 async def chat(query,history,audience,sources,reports):
     """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
     (messages in gradio format, messages in langchain format, source documents)"""
+    date_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print(f">> NEW QUESTION ({date_now}) : {query}")
     if audience == "Children":
         audience_prompt = audience_prompts["children"]
     if len(sources) == 0:
         sources = ["IPCC"]
+    # if len(reports) == 0: # TODO
+    reports = []
+    inputs = {"user_input": query,"audience": audience_prompt,"sources":sources}
+    result = agent.astream_events(inputs,version = "v1")
+    # path_reformulation = "/logs/reformulation/final_output"
+    # path_keywords = "/logs/keywords/final_output"
+    # path_retriever = "/logs/find_documents/final_output"
+    # path_answer = "/logs/answer/streamed_output_str/-"
+    docs = []
     docs_html = ""
     output_query = ""
     output_language = ""
     output_keywords = ""
     gallery = []
+    start_streaming = False
+    steps_display = {
+        "categorize_intent":("🔄️ Analyzing user message",True),
+        "transform_query":("🔄️ Thinking step by step to answer the question",True),
+        "retrieve_documents":("🔄️ Searching in the knowledge base",False),
+    }
+    used_documents = []
+    answer_message_content = ""
     try:
+        async for event in result:
+            if "langgraph_node" in event["metadata"]:
+                node = event["metadata"]["langgraph_node"]
+                if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents" :# when documents are retrieved
+                    try:
+                        docs = event["data"]["output"]["documents"]
+                        docs_html = []
+                        for i, d in enumerate(docs, 1):
+                            docs_html.append(make_html_source(d, i))
+                        used_documents = used_documents + [d.metadata["name"] for d in docs]
+                        history[-1].content = "Adding sources :\n\n - " + "\n - ".join(np.unique(used_documents))
+                        docs_html = "".join(docs_html)
+                    except Exception as e:
+                        print(f"Error getting documents: {e}")
+                        print(event)
+                elif event["name"] in steps_display.keys() and event["event"] == "on_chain_start": #display steps
+                    event_description,display_output = steps_display[node]
+                    if not hasattr(history[-1], 'metadata') or history[-1].metadata["title"] != event_description: # if a new step begins
+                        history.append(ChatMessage(role="assistant", content = "", metadata={'title' :event_description}))
+                elif event["name"] != "transform_query" and event["event"] == "on_chat_model_stream" and node in ["answer_rag", "answer_search"]:# if streaming answer
+                    if start_streaming == False:
+                        start_streaming = True
+                        history.append(ChatMessage(role="assistant", content = ""))
+                    answer_message_content +=  event["data"]["chunk"].content
+                    answer_message_content = parse_output_llm_with_sources(answer_message_content)
+                    history[-1] = ChatMessage(role="assistant", content = answer_message_content)
+                    # history.append(ChatMessage(role="assistant", content = new_message_content))
+                if event["name"] == "transform_query" and event["event"] =="on_chain_end":
+                    if hasattr(history[-1],"content"):
+                        history[-1].content += "Decompose question into sub-questions: \n\n - " + "\n - ".join([q["question"] for q in event["data"]["output"]["remaining_questions"]])
+                if event["name"] == "categorize_intent" and event["event"] == "on_chain_start":
+                    print("X")
+            yield history,docs_html,output_query,output_language,gallery #,output_query,output_keywords
     except Exception as e:
+        print(event, "has failed")
         raise gr.Error(f"{e}")
         history[-1] = (history[-1][0],answer_yet)
         history = [tuple(x) for x in history]
+    yield history,docs_html,output_query,output_language,gallery#,output_query,output_keywords
 def save_feedback(feed: str, user_id):
 papers_cols = list(papers_cols_widths.keys())
 papers_cols_widths = list(papers_cols_widths.values())
 # --------------------------------------------------------------------
 # Gradio
+with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=theme,elem_id = "main-component") as demo:
     with gr.Tab("ClimateQ&A"):
         with gr.Row(elem_id="chatbot-row"):
             with gr.Column(scale=2):
                 chatbot = gr.Chatbot(
+                    value = [ChatMessage(role="assistant", content=init_prompt)],
+                    type = "messages",
+                    show_copy_button=True,
+                    show_label = False,
+                    elem_id="chatbot",
+                    layout = "panel",
                     avatar_images = (None,"https://i.ibb.co/YNyd5W2/logo4.png"),
+                )
                 # bot.like(vote,None,None)
                 with gr.Row(elem_id = "input-message"):
                     textbox=gr.Textbox(placeholder="Ask me anything here!",show_label=False,scale=7,lines = 1,interactive = True,elem_id="input-textbox")
             with gr.Column(scale=1, variant="panel",elem_id = "right-panel"):
 #---------------------------------------------------------------------------------------
 # OTHER TABS
 #---------------------------------------------------------------------------------------
     with gr.Tab("Figures",elem_id = "tab-images",elem_classes = "max-height other-tabs"):
         gallery_component = gr.Gallery()
+    # with gr.Tab("Papers (beta)",elem_id = "tab-papers",elem_classes = "max-height other-tabs"):
+    #     with gr.Row():
+    #         with gr.Column(scale=1):
+    #             query_papers = gr.Textbox(placeholder="Question",show_label=False,lines = 1,interactive = True,elem_id="query-papers")
+    #             keywords_papers = gr.Textbox(placeholder="Keywords",show_label=False,lines = 1,interactive = True,elem_id="keywords-papers")
+    #             after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers")
+    #             search_papers = gr.Button("Search",elem_id="search-papers",interactive=True)
+    #         with gr.Column(scale=7):
+    #             with gr.Tab("Summary",elem_id="papers-summary-tab"):
+    #                 papers_summary = gr.Markdown(visible=True,elem_id="papers-summary")
+    #             with gr.Tab("Relevant papers",elem_id="papers-results-tab"):
+    #                 papers_dataframe = gr.Dataframe(visible=True,elem_id="papers-table",headers = papers_cols)
+    #             with gr.Tab("Citations network",elem_id="papers-network-tab"):
+    #                 citations_network = gr.HTML(visible=True,elem_id="papers-citations-network")
     def start_chat(query,history):
+        """Append the user's query to the chat history before an answer is generated.
+
+        Returns a 3-tuple: an update that makes a component non-interactive, an
+        update that selects index 1, and the extended history. The event wiring
+        in this file sends that tuple to the `[textbox, tabs, chatbot]` outputs.
+        """
+        # Previous tuple-based history format, kept for reference:
+        # history = history + [(query,None)]
+        # history = [tuple(x) for x in history]
+        history = history + [ChatMessage(role="user", content=query)]
         return (gr.update(interactive = False),gr.update(selected=1),history)
     def finish_chat():
     (textbox
         .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
+        .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_textbox")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
     )
     (examples_hidden
         .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
+        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_examples")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
     )
     dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
     demo.queue()

climateqa/engine/chains/__init__.py ADDED Viewed

File without changes

climateqa/engine/chains/answer_ai_impact.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from langchain.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+# System prompt for the AI-impact chain. It tells the model that the
+# environmental impact of AI is not covered by the IPCC/IPBES reports and lists
+# the page, papers and tools it may recommend instead.
+# NOTE(review): the text contains wording slips ("an helpful", "with in in
+# particular") and the last bullet ends mid-sentence ("... LLMs APIs such").
+# Confirm the intended wording before changing it: this string is passed to the
+# model as the system message, so any edit changes runtime behavior.
+prompt_template = """
+You are ClimateQ&A, an helpful AI Assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
+Always stay true to climate and nature science and do not make up information.
+If you do not know the answer, just say you do not know.
+## Guidelines
+- Explain that the environmental impact of AI is not covered by the IPCC or IPBES reports, but you can recommend info based on the sources below
+- Answer the question in the original language of the question
+## Sources
+- You can propose to visit this page https://climateqa.com/docs/carbon-footprint/ to learn more about ClimateQ&A's own carbon footprint
+- You can recommend to look at the work of the AI & climate expert scientist Sasha Luccioni with in in particular those papers
+    - Power Hungry Processing: Watts Driving the Cost of AI Deployment? - https://arxiv.org/abs/2311.16863 - about the carbon footprint at the inference stage of AI models
+    - Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning - https://arxiv.org/abs/2302.08476
+    - Estimating the Carbon Footprint of BLOOM, a 176B Parameter Language Model - https://arxiv.org/abs/2211.02001 - about the carbon footprint of training a large language model
+- You can also recommend the following tools to calculate the carbon footprint of AI models
+    - CodeCarbon - https://github.com/mlco2/codecarbon to measure the carbon footprint of your code
+    - Ecologits - https://ecologits.ai/ to measure the carbon footprint of using LLMs APIs such
+"""
+def make_ai_impact_chain(llm):
+    """Build the chain that answers questions about the environmental impact of AI.
+
+    The chain formats `prompt_template` as the system message and the
+    `{question}` input as the user message, pipes the result through `llm`,
+    and parses the model output with `StrOutputParser`.
+
+    Args:
+        llm: the chat model (or compatible runnable) to pipe the prompt into.
+
+    Returns:
+        The composed runnable, configured with run name "ai_impact_chain".
+    """
+    messages = [
+        ("system", prompt_template),
+        ("user", "{question}"),
+    ]
+    pipeline = ChatPromptTemplate.from_messages(messages) | llm | StrOutputParser()
+    # Attach the run name to the runnable's config before handing it back.
+    return pipeline.with_config({"run_name":"ai_impact_chain"})
+def make_ai_impact_node(llm):
+    """Create the async node function that answers AI-impact questions.
+
+    The chain is built once here and captured by the returned coroutine.
+
+    Args:
+        llm: the model forwarded to `make_ai_impact_chain`.
+
+    Returns:
+        An async callable `(state, config)` that reads `state["user_input"]`
+        and returns `{"answer": <chain output>}`.
+    """
+    chain = make_ai_impact_chain(llm)
+    async def answer_ai_impact(state,config):
+        # Only the raw user input is passed on as the question.
+        payload = {"question":state["user_input"]}
+        result = await chain.ainvoke(payload,config)
+        return {"answer":result}
+    return answer_ai_impact

climateqa/engine/chains/answer_chitchat.py ADDED Viewed

	@@ -0,0 +1,52 @@

+from langchain.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+# System prompt for the chit-chat chain. It covers conversational questions,
+# off-topic questions, questions about the bot itself and ESG-framework
+# questions, and lists example questions the model may suggest.
+# NOTE(review): the text contains wording slips ("an helpful", "If the user
+# ask", "Precise that", "litterature"). Confirm the intended wording before
+# changing it: this string is passed to the model as the system message, so any
+# edit changes runtime behavior.
+chitchat_prompt_template = """
+You are ClimateQ&A, an helpful AI Assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
+Always stay true to climate and nature science and do not make up information.
+If you do not know the answer, just say you do not know.
+## Guidelines
+- If it's a conversational question, you can normally chat with the user
+- If the question is not related to any topic about the environment, refuse to answer and politely ask the user to ask another question about the environment
+- If the user ask if you speak any language, you can say you speak all languages :)
+- If the user ask about the bot itself "ClimateQ&A", you can explain that you are an AI assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports and propose to visit the website here https://climateqa.com/docs/intro/ for more information
+- If the question is about ESG regulations, standards, or frameworks like the CSRD, TCFD, SASB, GRI, CDP, etc., you can explain that this is not a topic covered by the IPCC or IPBES reports.
+- Precise that you are specialized in finding trustworthy information from the scientific reports of the IPCC and IPBES and other scientific litterature
+- If relevant you can propose up to 3 example of questions they could ask from the IPCC or IPBES reports from the examples below
+- Always answer in the original language of the question
+## Examples of questions you can suggest (in the original language of the question)
+    "What evidence do we have of climate change?",
+    "Are human activities causing global warming?",
+    "What are the impacts of climate change?",
+    "Can climate change be reversed?",
+    "What is the difference between climate change and global warming?",
+"""
def make_chitchat_chain(llm):
    """Build the conversational (chitchat) chain.

    ``chitchat_prompt_template`` is the system message and the ``question``
    input is the user message; the model output is parsed to a string. The
    chain is tagged with the run name ``chitchat_chain``.
    """
    messages = [
        ("system", chitchat_prompt_template),
        ("user", "{question}"),
    ]
    pipeline = ChatPromptTemplate.from_messages(messages) | llm | StrOutputParser()
    return pipeline.with_config({"run_name":"chitchat_chain"})
def make_chitchat_node(llm):
    """Wrap the chitchat chain as an async graph node.

    The returned coroutine reads ``state["user_input"]``, forwards it (with
    the run ``config``) to the chain and returns ``{"answer": <text>}``.
    """
    chain = make_chitchat_chain(llm)

    async def answer_chitchat(state,config):
        payload = {"question":state["user_input"]}
        return {"answer": await chain.ainvoke(payload,config)}

    return answer_chitchat

climateqa/engine/chains/answer_rag.py ADDED Viewed

	@@ -0,0 +1,99 @@

+from operator import itemgetter
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts.prompt import PromptTemplate
+from langchain_core.prompts.base import format_document
+from climateqa.engine.chains.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
+from climateqa.engine.chains.prompts import papers_prompt_template
# Default rendering of a document: its raw page content only.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")


def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
):
    """Render ``docs`` as numbered single-line passages joined by ``sep``.

    Each entry may be a plain string (used as-is) or a document object, which
    is rendered through ``format_document`` with ``document_prompt``. Every
    passage is prefixed with ``Doc <n>: `` (1-based) and its newlines are
    replaced by spaces.
    """
    passages = []
    for position, doc in enumerate(docs, start=1):
        body = doc if isinstance(doc,str) else format_document(doc, document_prompt)
        # Flatten to one line so that only ``sep`` separates passages.
        passages.append((f"Doc {position}: " + body).replace("\n"," "))
    return sep.join(passages)
def get_text_docs(x):
    """Return, in order, the documents of ``x`` whose ``chunk_type`` metadata is ``"text"``."""
    def _is_text(doc):
        return doc.metadata["chunk_type"] == "text"

    return list(filter(_is_text, x))
def get_image_docs(x):
    """Return, in order, the documents of ``x`` whose ``chunk_type`` metadata is ``"image"``."""
    def _is_image(doc):
        return doc.metadata["chunk_type"] == "image"

    return list(filter(_is_image, x))
def make_rag_chain(llm):
    """Build the answer chain that grounds the reply in retrieved documents.

    The input mapping must provide ``documents``, ``query``, ``language`` and
    ``audience``. The documents are flattened into the ``context`` prompt
    variable with ``_combine_documents``; the other keys are passed through.
    """
    answer_prompt = ChatPromptTemplate.from_template(answer_prompt_template)
    prompt_inputs = {
        "context":lambda x : _combine_documents(x["documents"]),
        "query":itemgetter("query"),
        "language":itemgetter("language"),
        "audience":itemgetter("audience"),
    }
    return prompt_inputs | answer_prompt | llm | StrOutputParser()
def make_rag_chain_without_docs(llm):
    """Build the answer chain used when no documents are passed to the prompt.

    The chain formats ``answer_prompt_without_docs_template`` directly from
    its input, calls ``llm`` and parses the output to a string.
    """
    answer_prompt = ChatPromptTemplate.from_template(answer_prompt_without_docs_template)
    return answer_prompt | llm | StrOutputParser()
def make_rag_node(llm,with_docs = True):
    """Wrap a RAG answer chain as an async graph node.

    ``with_docs`` selects the document-grounded chain (default) or the chain
    that answers without documents. The returned coroutine passes the whole
    ``state`` to the chain and returns ``{"answer": <text>}``.
    """
    build_chain = make_rag_chain if with_docs else make_rag_chain_without_docs
    chain = build_chain(llm)

    async def answer_rag(state,config):
        return {"answer": await chain.ainvoke(state,config)}

    return answer_rag
+# def make_rag_papers_chain(llm):
+#     prompt = ChatPromptTemplate.from_template(papers_prompt_template)
+#     input_documents = {
+#         "context":lambda x : _combine_documents(x["docs"]),
+#         **pass_values(["question","language"])
+#     }
+#     chain = input_documents | prompt | llm | StrOutputParser()
+#     chain = rename_chain(chain,"answer")
+#     return chain
+# def make_illustration_chain(llm):
+#     prompt_with_images = ChatPromptTemplate.from_template(answer_prompt_images_template)
+#     input_description_images = {
+#         "images":lambda x : _combine_documents(get_image_docs(x["docs"])),
+#         **pass_values(["question","audience","language","answer"]),
+#     }
+#     illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
+#     return illustration_chain

climateqa/engine/chains/intent_categorization.py ADDED Viewed

	@@ -0,0 +1,86 @@

+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
class IntentCategorizer(BaseModel):
    """Analyzing the user message input"""
    # NOTE: this model is converted to an OpenAI function schema, so the class
    # docstring and every ``description`` below are prompt text sent to the
    # model. Edit them as prompts, not as developer documentation.

    # Language of the user message, in full words.
    language: str = Field(
        description="Find the language of the message input in full words (ex: French, English, Spanish, ...), defaults to English",
        default="English",
    )

    # Routing label; the graph is expected to branch on one of the enum values.
    intent: str = Field(
        enum=["ai_impact", "geo_info", "esg", "search", "chitchat"],
        description="""
            Categorize the user input in one of the following category
            Any question
            Examples:
            - ai_impact = Environmental impacts of AI: "What are the environmental impacts of AI", "How does AI affect the environment"
            - geo_info = Geolocated info about climate change: Any question where the user wants to know localized impacts of climate change, eg: "What will be the temperature in Marseille in 2050"
            - esg = Any question about the ESG regulation, frameworks and standards like the CSRD, TCFD, SASB, GRI, CDP, etc.
            - search = Searching for any quesiton about climate change, energy, biodiversity, nature, and everything we can find the IPCC or IPBES reports or scientific papers,
            - chitchat = Any general question that is not related to the environment or climate change or just conversational, or if you don't think searching the IPCC or IPBES reports would be relevant
        """,
    )
def make_intent_categorization_chain(llm):
    """Build the chain that classifies a user message with ``IntentCategorizer``.

    The model is forced to call the ``IntentCategorizer`` function, and the
    function arguments are parsed to a dict. The chain expects an ``input``
    key.
    """
    function_schema = convert_to_openai_function(IntentCategorizer)
    bound_llm = llm.bind(
        functions = [function_schema],
        function_call={"name":"IntentCategorizer"},
    )
    messages = [
        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
        ("user", "input: {input}"),
    ]
    return ChatPromptTemplate.from_messages(messages) | bound_llm | JsonOutputFunctionsParser()
def make_intent_categorization_node(llm):
    """Wrap the intent categorization chain as a (synchronous) graph node.

    The returned function classifies ``state["user_input"]`` and returns the
    chain output, with ``language`` defaulted to ``"English"`` when the model
    omitted it and ``query`` set to the raw user input.
    """
    chain = make_intent_categorization_chain(llm)

    def categorize_message(state):
        result = chain.invoke({"input":state["user_input"]})
        result.setdefault("language", "English")
        result["query"] = state["user_input"]
        return result

    return categorize_message
+# SAMPLE_QUESTIONS = [
+#     "Est-ce que l'IA a un impact sur l'environnement ?",
+#     "Que dit le GIEC sur l'impact de l'IA",
+#     "Qui sont les membres du GIEC",
+#     "What is the impact of El Nino ?",
+#     "Yo",
+#     "Hello ça va bien ?",
+#     "Par qui as tu été créé ?",
+#     "What role do cloud formations play in modulating the Earth's radiative balance, and how are they represented in current climate models?",
+#     "Which industries have the highest GHG emissions?",
+#     "What are invasive alien species and how do they threaten biodiversity and ecosystems?",
+#     "Are human activities causing global warming?",
+#     "What is the motivation behind mining the deep seabed?",
+#     "Tu peux m'écrire un poème sur le changement climatique ?",
+#     "Tu peux m'écrire un poème sur les bonbons ?",
+#     "What will be the temperature in 2100 in Strasbourg?",
+#     "C'est quoi le lien entre biodiversity and changement climatique ?",
+# ]

climateqa/engine/chains/keywords_extraction.py ADDED Viewed

	@@ -0,0 +1,40 @@

+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
class KeywordExtraction(BaseModel):
    """
    Analyzing the user query to extract keywords to feed a search engine
    """
    # NOTE: converted to an OpenAI function schema; the docstring above and
    # the description below are prompt text seen by the model.

    # Search-engine keywords extracted from the query.
    keywords: List[str] = Field(
        description="""
        Extract the keywords from the user query to feed a search engine as a list
        Avoid adding super specific keywords to prefer general keywords
        Maximum 3 keywords
        Examples:
        - "What is the impact of deep sea mining ?" -> ["deep sea mining"]
        - "How will El Nino be impacted by climate change" -> ["el nino","climate change"]
        - "Is climate change a hoax" -> ["climate change","hoax"]
        """
    )
def make_keywords_extraction_chain(llm):
    """Build the chain that extracts search keywords with ``KeywordExtraction``.

    The model is forced to call the ``KeywordExtraction`` function; the
    parsed output is a dict of the function arguments. The prompt has a
    single ``input`` variable.
    """
    function_schema = convert_to_openai_function(KeywordExtraction)
    bound_llm = llm.bind(
        functions = [function_schema],
        function_call={"name":"KeywordExtraction"},
    )
    messages = [
        ("system", "You are a helpful assistant"),
        ("user", "input: {input}"),
    ]
    return ChatPromptTemplate.from_messages(messages) | bound_llm | JsonOutputFunctionsParser()

climateqa/engine/{prompts.py → chains/prompts.py} RENAMED Viewed

@@ -56,7 +56,7 @@ Passages:
 {context}
 -----------------------
-Question: {question} - Explained to {audience}
 Answer in {language} with the passages citations:
 """
@@ -137,7 +137,7 @@ Guidelines:
 - If the question is not related to environmental issues, never never answer it. Say it's not your role.
 - Make paragraphs by starting new lines to make your answers more readable.
-Question: {question}
 Answer in {language}:
 """

 {context}
 -----------------------
+Question: {query} - Explained to {audience}
 Answer in {language} with the passages citations:
 """
 - If the question is not related to environmental issues, never never answer it. Say it's not your role.
 - Make paragraphs by starting new lines to make your answers more readable.
+Question: {query}
 Answer in {language}:
 """

climateqa/engine/chains/query_transformation.py ADDED Viewed

	@@ -0,0 +1,193 @@

+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
# Maps each retrieval backend ("index") to the source names it can serve.
ROUTING_INDEX = {
    "Vector": ["IPCC", "IPBES", "IPOS"],
    "OpenAlex": ["OpenAlex"],
}

# Flat list of every source name, in routing-index order.
POSSIBLE_SOURCES = sum(ROUTING_INDEX.values(), [])
+# Prompt from the original paper https://arxiv.org/pdf/2305.14283
+# Query Rewriting for Retrieval-Augmented Large Language Models
class QueryDecomposition(BaseModel):
    """
    Decompose the user query into smaller parts to think step by step to answer this question
    Act as a simple planning agent
    """
    # NOTE: converted to an OpenAI function schema; the docstring above and
    # the description below are prompt text seen by the model.

    # Standalone search questions derived from the user query.
    questions: List[str] = Field(
        description="""
        Think step by step to answer this question, and provide one or several search engine questions in English for knowledge that you need.
        Suppose that the user is looking for information about climate change, energy, biodiversity, nature, and everything we can find the IPCC reports and scientific literature
        - If it's already a standalone and explicit question, just return the reformulated question for the search engine
        - If you need to decompose the question, output a list of maximum 2 to 3 questions
    """
    )
class Location(BaseModel):
    # Geographic reference of a query. Both fields are required; the
    # descriptions are schema text, kept verbatim.
    country:str = Field(
        ...,
        description="The country if directly mentioned or inferred from the location (cities, regions, adresses), ex: France, USA, ...",
    )
    location:str = Field(
        ...,
        description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...",
    )
class QueryAnalysis(BaseModel):
    """
    Analyzing the user query to extract topics, sources and date
    Also do query expansion to get alternative search queries
    Also provide simple keywords to feed a search engine
    """
    # NOTE: converted to an OpenAI function schema; the docstring above and
    # the description below are prompt text seen by the model.
    # Only ``sources`` is part of the schema. The previously drafted fields
    # (keywords, alternative_queries, step_back_question, topics, date,
    # location) were commented out and are not extracted.

    # Document sources judged relevant for the question.
    sources: List[Literal["IPCC", "IPBES", "IPOS","OpenAlex"]] = Field(
        ...,
        description="""
            Given a user question choose which documents would be most relevant for answering their question,
            - IPCC is for questions about climate change, energy, impacts, and everything we can find the IPCC reports
            - IPBES is for questions about biodiversity and nature
            - IPOS is for questions about the ocean and deep sea mining
            - OpenAlex is for any other questions that are not in the previous categories but could be found in the scientific litterature
        """,
    )
def make_query_decomposition_chain(llm):
    """Build the chain that splits a query into search questions.

    The model is forced to call the ``QueryDecomposition`` function; the
    parsed output is a dict of the function arguments. The chain expects an
    ``input`` key.
    """
    function_schema = convert_to_openai_function(QueryDecomposition)
    bound_llm = llm.bind(
        functions = [function_schema],
        function_call={"name":"QueryDecomposition"},
    )
    messages = [
        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
        ("user", "input: {input}"),
    ]
    return ChatPromptTemplate.from_messages(messages) | bound_llm | JsonOutputFunctionsParser()
def make_query_rewriter_chain(llm):
    """Build the chain that picks relevant sources for one question.

    The model is forced to call the ``QueryAnalysis`` function; the parsed
    output is a dict of the function arguments. The chain expects an
    ``input`` key.
    """
    function_schema = convert_to_openai_function(QueryAnalysis)
    bound_llm = llm.bind(
        functions = [function_schema],
        function_call={"name":"QueryAnalysis"},
    )
    messages = [
        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
        ("user", "input: {input}"),
    ]
    return ChatPromptTemplate.from_messages(messages) | bound_llm | JsonOutputFunctionsParser()
def make_query_transform_node(llm,k_final=15):
    """Create the graph node that decomposes a query and routes it to indexes.

    Args:
        llm: Chat model used by the decomposition and source-analysis chains.
        k_final: Accepted for backward compatibility; not used by this node.

    Returns:
        A ``transform_query(state)`` function. It reads ``state["query"]`` and
        the optional ``state["sources_auto"]`` / ``state["sources_input"]``
        and returns ``{"remaining_questions": [...], "n_questions": int}``.
        Each remaining question is a dict with the keys ``question``,
        ``sources`` (the sources served by one index) and ``index`` (a key of
        ``ROUTING_INDEX``).
    """
    decomposition_chain = make_query_decomposition_chain(llm)
    rewriter_chain = make_query_rewriter_chain(llm)

    def transform_query(state):
        # Auto mode is enabled by any value other than missing / None / False.
        sources_auto = state.get("sources_auto")
        auto_mode = sources_auto is not None and sources_auto is not False

        sources_input = state.get("sources_input")
        if sources_input is None:
            sources_input = ROUTING_INDEX["Vector"]

        # Decomposition: fall back to the original query when the model
        # returned no sub-question, so there is always something to search.
        decomposition_output = decomposition_chain.invoke({"input":state["query"]})
        questions = decomposition_output.get("questions") or [state["query"]]

        new_questions = []
        for question in questions:
            if auto_mode:
                # Ask the model for the sources only when they are actually
                # used; if it omitted them, use the configured sources.
                analysis_output = rewriter_chain.invoke({"input":question})
                sources = analysis_output.get("sources") or sources_input
            else:
                sources = sources_input

            # Explode the question into one entry per index that serves at
            # least one selected source (order follows the routing index).
            for index,index_sources in ROUTING_INDEX.items():
                selected_sources = [s for s in index_sources if s in sources]
                if selected_sources:
                    new_questions.append({"question":question,"sources":selected_sources,"index":index})

        return {
            "remaining_questions":new_questions,
            "n_questions":len(new_questions),
        }

    return transform_query

climateqa/engine/{reformulation.py → chains/reformulation.py} RENAMED Viewed

@@ -3,7 +3,7 @@ from langchain.output_parsers.structured import StructuredOutputParser, Response
 from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
-from climateqa.engine.prompts import reformulation_prompt_template
 from climateqa.engine.utils import pass_values, flatten_dict

 from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
+from climateqa.engine.chains.prompts import reformulation_prompt_template
 from climateqa.engine.utils import pass_values, flatten_dict

climateqa/engine/chains/retrieve_documents.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import sys
+import os
+from contextlib import contextmanager
+from langchain_core.tools import tool
+from langchain_core.runnables import chain
+from langchain_core.runnables import RunnableParallel, RunnablePassthrough
+from langchain_core.runnables import RunnableLambda
+from ..reranker import rerank_docs
+from ...knowledge.retriever import ClimateQARetriever
+from ...knowledge.openalex import OpenAlexRetriever
+from .keywords_extraction import make_keywords_extraction_chain
+from ..utils import log_event
def divide_into_parts(target, parts):
    """Split ``target`` into ``parts`` integers that differ by at most one.

    The first ``target % parts`` entries receive one extra unit, so the
    entries sum to ``target``.
    """
    base, remainder = divmod(target, parts)
    return [base + 1 if position < remainder else base for position in range(parts)]
@contextmanager
def suppress_output():
    """Temporarily send ``sys.stdout`` and ``sys.stderr`` to the null device.

    The previous stream objects are restored on exit, including when the
    body raises.
    """
    with open(os.devnull, 'w') as sink:
        saved_streams = (sys.stdout, sys.stderr)
        sys.stdout = sys.stderr = sink
        try:
            yield
        finally:
            sys.stdout, sys.stderr = saved_streams
# The docstring below is passed to the ``@tool`` decorator (LangChain uses it
# as the tool description), so it is kept verbatim.
@tool
def query_retriever(question):
    """Just a dummy tool to simulate the retriever query"""
    # Identity: the question is echoed back unchanged.
    return question
def make_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
    """Create the graph node that retrieves documents for the next pending question.

    Args:
        vectorstore: Vector store handed to ``ClimateQARetriever``.
        reranker: Reranker passed to ``rerank_docs``; when ``None`` the
            similarity score is reused as the reranking score.
        llm: Chat model used to extract keywords for OpenAlex queries.
        rerank_by_question: When true, keep only the top documents of each
            question (``k_final`` shared between the questions).
        k_final: Total number of documents targeted across all questions.
        k_before_reranking: Number of documents fetched before reranking.
        k_summary: ``k_summary`` setting forwarded to ``ClimateQARetriever``.

    Returns:
        The ``retrieve_documents`` runnable. It consumes the first entry of
        ``state["remaining_questions"]`` and returns
        ``{"documents": [...], "remaining_questions": [...]}`` with the
        documents sorted by descending ``reranking_score``.

    Raises:
        ValueError: (when the node runs) if the question targets an index
            other than ``"Vector"`` or ``"OpenAlex"``.
    """
    # Built once per node instead of on every invocation.
    keywords_extraction = make_keywords_extraction_chain(llm)

    # The chain callback is not necessary, but it propagates the langchain callbacks to the astream_events logger to display intermediate results
    @chain
    async def retrieve_documents(state,config):
        current_question = state["remaining_questions"][0]
        remaining_questions = state["remaining_questions"][1:]

        # Share k_final between the questions, keeping at least one document
        # per question so that a small k_final cannot empty the results.
        n_questions = max(1, state["n_questions"])
        k_by_question = max(1, k_final // n_questions)

        sources = current_question["sources"]
        question = current_question["question"]
        index = current_question["index"]

        await log_event({"question":question,"sources":sources,"index":index},"log_retriever",config)

        if index == "Vector":
            # Search the document store using the retriever
            # Configure high top k for further reranking step
            retriever = ClimateQARetriever(
                vectorstore=vectorstore,
                sources = sources,
                min_size = 200,
                k_summary = k_summary,
                k_total = k_before_reranking,
                threshold = 0.5,
            )
            docs_question = await retriever.ainvoke(question,config)
        elif index == "OpenAlex":
            # Use the async API: a blocking invoke here would stall the event loop.
            extraction = await keywords_extraction.ainvoke(question)
            keywords = extraction["keywords"]
            openalex_query = " AND ".join(keywords)
            print(f"... OpenAlex query: {openalex_query}")
            retriever_openalex = OpenAlexRetriever(
                min_year = state.get("min_year",1960),
                max_year = state.get("max_year",None),
                k = k_before_reranking
            )
            docs_question = await retriever_openalex.ainvoke(openalex_query,config)
        else:
            raise ValueError(f"Index {index} not found in the routing index")

        # Rerank
        if reranker is not None:
            with suppress_output():
                docs_question = rerank_docs(reranker,docs_question,question)
        else:
            # Add a default reranking score.
            # NOTE(review): falls back to 0 when a retriever did not set
            # "similarity_score" — confirm which retrievers provide it.
            for doc in docs_question:
                doc.metadata["reranking_score"] = doc.metadata.get("similarity_score", 0)

        # If rerank by question we select the top documents for each question
        if rerank_by_question:
            docs_question = docs_question[:k_by_question]

        # Record how each document was obtained
        for doc in docs_question:
            doc.metadata["sources_used"] = sources
            doc.metadata["question_used"] = question
            doc.metadata["index_used"] = index

        # Sort in descending order of reranking score
        docs = sorted(docs_question, key=lambda x: x.metadata["reranking_score"], reverse=True)
        return {"documents":docs,"remaining_questions":remaining_questions}

    return retrieve_documents

climateqa/engine/chains/sample_router.py ADDED Viewed

	@@ -0,0 +1,66 @@

+# from typing import List
+# from typing import Literal
+# from langchain.prompts import ChatPromptTemplate
+# from langchain_core.utils.function_calling import convert_to_openai_function
+# from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+# # https://livingdatalab.com/posts/2023-11-05-openai-function-calling-with-langchain.html
+# class Location(BaseModel):
+#     country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, adresses), ex: France, USA, ...")
+#     location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
+# class QueryAnalysis(BaseModel):
+#     """Analyzing the user query"""
+#     language: str = Field(
+#         description="Find the language of the query in full words (ex: French, English, Spanish, ...), defaults to English"
+#     )
+#     intent: str = Field(
+#         enum=[
+#             "Environmental impacts of AI",
+#             "Geolocated info about climate change",
+#             "Climate change",
+#             "Biodiversity",
+#             "Deep sea mining",
+#             "Chitchat",
+#         ],
+#         description="""
+#             Categorize the user query in one of the following category,
+#             Examples:
+#             - Geolocated info about climate change: "What will be the temperature in Marseille in 2050"
+#             - Climate change: "What is radiative forcing", "How much will
+#         """,
+#     )
+#     sources: List[Literal["IPCC", "IPBES", "IPOS"]] = Field(
+#         ...,
+#         description="""
+#             Given a user question choose which documents would be most relevant for answering their question,
+#             - IPCC is for questions about climate change, energy, impacts, and everything we can find the IPCC reports
+#             - IPBES is for questions about biodiversity and nature
+#             - IPOS is for questions about the ocean and deep sea mining
+#         """,
+#     )
+#     date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
+#     location:Location
+#     # query: str = Field(
+#     #     description = """
+#     #         Translate to english and reformulate the following user message to be a short standalone question, in the context of an educational discussion about climate change.
+#     #         The reformulated question will used in a search engine
+#     #         By default, assume that the user is asking information about the last century,
+#     #         Use the following examples
+#     #         ### Examples:
+#     #         La technologie nous sauvera-t-elle ? -> Can technology help humanity mitigate the effects of climate change?
+#     #         what are our reserves in fossil fuel? -> What are the current reserves of fossil fuels and how long will they last?
+#     #         what are the main causes of climate change? -> What are the main causes of climate change in the last century?
+#     #         Question in English:
+#     #     """
+#     # )
+# openai_functions = [convert_to_openai_function(QueryAnalysis)]
+# llm2 = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})

climateqa/engine/chains/translation.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
class Translation(BaseModel):
    """Analyzing the user message input"""
    # NOTE: converted to an OpenAI function schema; the docstring above and
    # the description below are prompt text seen by the model.

    # English rendering of the user message.
    translation: str = Field(description="Translate the message input to English")
def make_translation_chain(llm):
    """Build the chain that translates a user message to English.

    The model is forced to call the ``Translation`` function; the parsed
    output is a dict of the function arguments. The chain expects an
    ``input`` key.
    """
    function_schema = convert_to_openai_function(Translation)
    bound_llm = llm.bind(
        functions = [function_schema],
        function_call={"name":"Translation"},
    )
    messages = [
        ("system", "You are a helpful assistant, you will translate the user input message to English using the function provided"),
        ("user", "input: {input}"),
    ]
    return ChatPromptTemplate.from_messages(messages) | bound_llm | JsonOutputFunctionsParser()
def make_translation_node(llm):
    """Wrap the translation chain as a (synchronous) graph node.

    The returned function translates ``state["user_input"]`` and returns
    ``{"query": <english text>}``.
    """
    chain = make_translation_chain(llm)

    def translate_query(state):
        result = chain.invoke({"input":state["user_input"]})
        return {"query":result["translation"]}

    return translate_query

climateqa/engine/embeddings.py CHANGED Viewed

@@ -2,7 +2,7 @@
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_community.embeddings import HuggingFaceEmbeddings
-def get_embeddings_function(version = "v1.2"):
     if version == "v1.2":
@@ -10,12 +10,12 @@ def get_embeddings_function(version = "v1.2"):
         # Best embedding model at a reasonable size at the moment (2023-11-22)
         model_name = "BAAI/bge-base-en-v1.5"
-        encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
         print("Loading embeddings model: ", model_name)
         embeddings_function = HuggingFaceBgeEmbeddings(
             model_name=model_name,
             encode_kwargs=encode_kwargs,
-            query_instruction="Represent this sentence for searching relevant passages: "
         )
     else:
@@ -23,3 +23,6 @@ def get_embeddings_function(version = "v1.2"):
         embeddings_function = HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
     return embeddings_function

 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_community.embeddings import HuggingFaceEmbeddings
+def get_embeddings_function(version = "v1.2",query_instruction = "Represent this sentence for searching relevant passages: "):
+    """Load the embedding model used for retrieval.
+
+    Args:
+        version: "v1.2" loads ``BAAI/bge-base-en-v1.5`` through ``HuggingFaceBgeEmbeddings``;
+            any other value loads ``sentence-transformers/multi-qa-mpnet-base-dot-v1``.
+        query_instruction: forwarded as ``query_instruction`` to the BGE embeddings
+            (unused for other versions).
+
+    Returns:
+        The instantiated embeddings object.
+    """
     if version == "v1.2":
         # Best embedding model at a reasonable size at the moment (2023-11-22)
         model_name = "BAAI/bge-base-en-v1.5"
+        encode_kwargs = {'normalize_embeddings': True,"show_progress_bar":False} # set True to compute cosine similarity
         print("Loading embeddings model: ", model_name)
         embeddings_function = HuggingFaceBgeEmbeddings(
             model_name=model_name,
             encode_kwargs=encode_kwargs,
+            query_instruction=query_instruction,
         )
     else:
         embeddings_function = HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
     return embeddings_function

climateqa/engine/graph.py ADDED Viewed

	@@ -0,0 +1,149 @@

+import sys
+import os
+from contextlib import contextmanager
+from langchain.schema import Document
+from langgraph.graph import END, StateGraph
+from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod
+from typing_extensions import TypedDict
+from typing import List
+from IPython.display import display, HTML, Image
+from .chains.answer_chitchat import make_chitchat_node
+from .chains.answer_ai_impact import make_ai_impact_node
+from .chains.query_transformation import make_query_transform_node
+from .chains.translation import make_translation_node
+from .chains.intent_categorization import make_intent_categorization_node
+from .chains.retrieve_documents import make_retriever_node
+from .chains.answer_rag import make_rag_node
+class GraphState(TypedDict):
+    """
+    Represents the state of our graph.
+    """
+    user_input : str  # raw user message; read by the translation node
+    language : str  # compared (lower-cased) with "english" in route_translation
+    intent : str  # read by route_intent
+    query: str  # written by the translation node
+    remaining_questions : List[dict]  # retrieve_documents loops on itself while this is non-empty
+    n_questions : int
+    answer: str
+    # NOTE(review): a TypedDict body does not provide default values (PEP 589):
+    # the right-hand sides below are not applied to state dicts at runtime, so
+    # every producer of the state must set these keys explicitly. `max_year` is
+    # also annotated `int` although the value written here is None.
+    audience: str = "experts"
+    sources_input: List[str] = ["IPCC","IPBES"]
+    sources_auto: bool = True
+    min_year: int = 1960
+    max_year: int = None
+    documents: List[Document]  # route_based_on_relevant_docs reads metadata["reranking_score"] on each
+def search(state): #TODO
+    """Placeholder node: returns ``state`` unchanged.
+
+    ``make_graph_agent`` registers it as the "search" node, from which
+    ``route_translation`` selects the next step.
+    """
+    return state
+def answer_search(state):#TODO
+    """Placeholder node: returns ``state`` unchanged.
+
+    ``make_graph_agent`` registers it as the "answer_search" node, from which
+    ``route_based_on_relevant_docs`` selects the answering node.
+    """
+    return state
+def route_intent(state):
+    """Choose the node that follows intent categorization.
+
+    Returns "answer_chitchat" for the "chitchat" and "esg" intents and "search"
+    for every other intent.
+    """
+    # TODO: route intent "ai_impact" to "answer_ai_impact" once that node is enabled.
+    if state["intent"] in ("chitchat","esg"):
+        return "answer_chitchat"
+    return "search"
+def route_translation(state):
+    """Return "transform_query" when ``state["language"]`` is English (case-insensitive), else "translate_query"."""
+    is_english = state["language"].lower() == "english"
+    return "transform_query" if is_english else "translate_query"
+def route_based_on_relevant_docs(state,threshold_docs=0.2):
+    """Pick the answering node depending on whether any document is relevant enough.
+
+    A document counts as relevant when its ``metadata["reranking_score"]`` is
+    strictly greater than ``threshold_docs``. Returns "answer_rag" if at least
+    one such document exists, "answer_rag_no_docs" otherwise.
+    """
+    relevant = list(filter(lambda doc: doc.metadata["reranking_score"] > threshold_docs, state["documents"]))
+    if not relevant:
+        return "answer_rag_no_docs"
+    return "answer_rag"
+def make_id_dict(values):
+    """Map every value to itself, e.g. ``["a", "b"]`` becomes ``{"a": "a", "b": "b"}``."""
+    identity = {}
+    for value in values:
+        identity[value] = value
+    return identity
+def make_graph_agent(llm,vectorstore,reranker,threshold_docs = 0.2):
+    """Assemble and compile the question-answering state graph.
+
+    Flow: ``categorize_intent`` -> (``answer_chitchat`` | ``search``);
+    ``search`` -> (``translate_query`` ->) ``transform_query`` -> ``retrieve_documents``
+    (repeated while ``remaining_questions`` is non-empty) -> ``answer_search``
+    -> (``answer_rag`` | ``answer_rag_no_docs``) -> END.
+
+    Args:
+        llm: passed to every node factory.
+        vectorstore: passed to ``make_retriever_node``.
+        reranker: passed to ``make_retriever_node``.
+        threshold_docs: forwarded to ``route_based_on_relevant_docs`` to decide
+            between ``answer_rag`` and ``answer_rag_no_docs``.
+
+    Returns:
+        The compiled graph returned by ``workflow.compile()``.
+    """
+    workflow = StateGraph(GraphState)
+    # Define the node functions
+    categorize_intent = make_intent_categorization_node(llm)
+    transform_query = make_query_transform_node(llm)
+    translate_query = make_translation_node(llm)
+    answer_chitchat = make_chitchat_node(llm)
+    # NOTE(review): answer_ai_impact is built but never added to the graph (its
+    # add_node/add_edge calls are commented out below).
+    answer_ai_impact = make_ai_impact_node(llm)
+    retrieve_documents = make_retriever_node(vectorstore,reranker,llm)
+    answer_rag = make_rag_node(llm,with_docs=True)
+    answer_rag_no_docs = make_rag_node(llm,with_docs=False)
+    # Define the nodes
+    workflow.add_node("categorize_intent", categorize_intent)
+    workflow.add_node("search", search)
+    workflow.add_node("answer_search", answer_search)
+    workflow.add_node("transform_query", transform_query)
+    workflow.add_node("translate_query", translate_query)
+    workflow.add_node("answer_chitchat", answer_chitchat)
+    # workflow.add_node("answer_ai_impact", answer_ai_impact)
+    workflow.add_node("retrieve_documents",retrieve_documents)
+    workflow.add_node("answer_rag",answer_rag)
+    workflow.add_node("answer_rag_no_docs",answer_rag_no_docs)
+    # Entry point
+    workflow.set_entry_point("categorize_intent")
+    # CONDITIONAL EDGES
+    # Each mapping is an identity dict: the router's return value is the target node name.
+    workflow.add_conditional_edges(
+        "categorize_intent",
+        route_intent,
+        make_id_dict(["answer_chitchat","search"])
+    )
+    workflow.add_conditional_edges(
+        "search",
+        route_translation,
+        make_id_dict(["translate_query","transform_query"])
+    )
+    # Self-loop: retrieve_documents runs again until remaining_questions is empty.
+    workflow.add_conditional_edges(
+        "retrieve_documents",
+        lambda state : "retrieve_documents" if len(state["remaining_questions"]) > 0 else "answer_search",
+        make_id_dict(["retrieve_documents","answer_search"])
+    )
+    workflow.add_conditional_edges(
+        "answer_search",
+        lambda x : route_based_on_relevant_docs(x,threshold_docs=threshold_docs),
+        make_id_dict(["answer_rag","answer_rag_no_docs"])
+    )
+    # Define the edges
+    workflow.add_edge("translate_query", "transform_query")
+    workflow.add_edge("transform_query", "retrieve_documents")
+    workflow.add_edge("answer_rag", END)
+    workflow.add_edge("answer_rag_no_docs", END)
+    workflow.add_edge("answer_chitchat", END)
+    # workflow.add_edge("answer_ai_impact", END)
+    # Compile
+    app = workflow.compile()
+    return app
+def display_graph(app):
+    """Draw the compiled graph as a Mermaid PNG (``MermaidDrawMethod.API``) and display it with IPython."""
+    png_bytes = app.get_graph(xray = True).draw_mermaid_png(
+        draw_method=MermaidDrawMethod.API,
+    )
+    display(Image(png_bytes))

climateqa/engine/llm/__init__.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from climateqa.engine.llm.openai import get_llm as get_openai_llm
 from climateqa.engine.llm.azure import get_llm as get_azure_llm
 def get_llm(provider="openai",**kwargs):
@@ -8,6 +9,8 @@ def get_llm(provider="openai",**kwargs):
         return get_openai_llm(**kwargs)
     elif provider == "azure":
         return get_azure_llm(**kwargs)
     else:
         raise ValueError(f"Unknown provider: {provider}")

 from climateqa.engine.llm.openai import get_llm as get_openai_llm
 from climateqa.engine.llm.azure import get_llm as get_azure_llm
+from climateqa.engine.llm.ollama import get_llm as get_ollama_llm
 def get_llm(provider="openai",**kwargs):
         return get_openai_llm(**kwargs)
     elif provider == "azure":
         return get_azure_llm(**kwargs)
+    elif provider == "ollama":
+        return  get_ollama_llm(**kwargs)
     else:
         raise ValueError(f"Unknown provider: {provider}")

climateqa/engine/llm/ollama.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from langchain_community.llms import Ollama
+def get_llm(model="llama3", **kwargs):
+    """Return an ``Ollama`` LLM for ``model``; extra keyword arguments are forwarded to ``Ollama``."""
+    return Ollama(model=model, **kwargs)

climateqa/engine/rag.py DELETED Viewed

@@ -1,134 +0,0 @@
-from operator import itemgetter
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
-from langchain_core.prompts.prompt import PromptTemplate
-from langchain_core.prompts.base import format_document
-from climateqa.engine.reformulation import make_reformulation_chain
-from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
-from climateqa.engine.prompts import papers_prompt_template
-from climateqa.engine.utils import pass_values, flatten_dict,prepare_chain,rename_chain
-from climateqa.engine.keywords import make_keywords_chain
-DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
-def _combine_documents(
-    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
-):
-    doc_strings =  []
-    for i,doc in enumerate(docs):
-        # chunk_type = "Doc" if doc.metadata["chunk_type"] == "text" else "Image"
-        chunk_type = "Doc"
-        if isinstance(doc,str):
-            doc_formatted = doc
-        else:
-            doc_formatted = format_document(doc, document_prompt)
-        doc_string = f"{chunk_type} {i+1}: " + doc_formatted
-        doc_string = doc_string.replace("\n"," ")
-        doc_strings.append(doc_string)
-    return sep.join(doc_strings)
-def get_text_docs(x):
-    return [doc for doc in x if doc.metadata["chunk_type"] == "text"]
-def get_image_docs(x):
-    return [doc for doc in x if doc.metadata["chunk_type"] == "image"]
-def make_rag_chain(retriever,llm):
-    # Construct the prompt
-    prompt = ChatPromptTemplate.from_template(answer_prompt_template)
-    prompt_without_docs = ChatPromptTemplate.from_template(answer_prompt_without_docs_template)
-    # ------- CHAIN 0 - Reformulation
-    reformulation = make_reformulation_chain(llm)
-    reformulation = prepare_chain(reformulation,"reformulation")
-    # ------- Find all keywords from the reformulated query
-    keywords = make_keywords_chain(llm)
-    keywords = {"keywords":itemgetter("question") | keywords}
-    keywords = prepare_chain(keywords,"keywords")
-    # ------- CHAIN 1
-    # Retrieved documents
-    find_documents = {"docs": itemgetter("question") | retriever} | RunnablePassthrough()
-    find_documents = prepare_chain(find_documents,"find_documents")
-    # ------- CHAIN 2
-    # Construct inputs for the llm
-    input_documents = {
-        "context":lambda x : _combine_documents(x["docs"]),
-        **pass_values(["question","audience","language","keywords"])
-    }
-    # ------- CHAIN 3
-    # Bot answer
-    llm_final = rename_chain(llm,"answer")
-    answer_with_docs = {
-        "answer": input_documents | prompt | llm_final | StrOutputParser(),
-        **pass_values(["question","audience","language","query","docs","keywords"]),
-    }
-    answer_without_docs = {
-        "answer":  prompt_without_docs | llm_final | StrOutputParser(),
-        **pass_values(["question","audience","language","query","docs","keywords"]),
-    }
-    # def has_images(x):
-    #     image_docs = [doc for doc in x["docs"] if doc.metadata["chunk_type"]=="image"]
-    #     return len(image_docs) > 0
-    def has_docs(x):
-        return len(x["docs"]) > 0
-    answer = RunnableBranch(
-        (lambda x: has_docs(x), answer_with_docs),
-        answer_without_docs,
-    )
-    # ------- FINAL CHAIN
-    # Build the final chain
-    rag_chain = reformulation | keywords | find_documents | answer
-    return rag_chain
-def make_rag_papers_chain(llm):
-    prompt = ChatPromptTemplate.from_template(papers_prompt_template)
-    input_documents = {
-        "context":lambda x : _combine_documents(x["docs"]),
-        **pass_values(["question","language"])
-    }
-    chain = input_documents | prompt | llm | StrOutputParser()
-    chain = rename_chain(chain,"answer")
-    return chain
-def make_illustration_chain(llm):
-    prompt_with_images = ChatPromptTemplate.from_template(answer_prompt_images_template)
-    input_description_images = {
-        "images":lambda x : _combine_documents(get_image_docs(x["docs"])),
-        **pass_values(["question","audience","language","answer"]),
-    }
-    illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
-    return illustration_chain

climateqa/engine/reranker.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import os
+from scipy.special import expit, logit
+from rerankers import Reranker
+def get_reranker(model = "nano",cohere_api_key = None):
+    """Create the reranker for the requested size tier.
+
+    Args:
+        model: one of "nano" or "tiny" (flashrank models), "small" (cross-encoder)
+            or "large" (Cohere).
+        cohere_api_key: key used for the "large" tier; read from the
+            ``COHERE_API_KEY`` environment variable when omitted.
+
+    Returns:
+        The object returned by ``rerankers.Reranker(...)`` for that tier.
+
+    Raises:
+        ValueError: if ``model`` is not one of the supported tiers.
+        KeyError: if "large" is requested without a key and ``COHERE_API_KEY`` is unset.
+    """
+    supported = ("nano","tiny","small","large")
+    # Explicit check instead of ``assert``: asserts are stripped under ``python -O``,
+    # which would let an unknown tier fall through to an unbound ``reranker``.
+    if model not in supported:
+        raise ValueError(f"Unknown reranker model {model!r}; expected one of {supported}")
+    if model == "nano":
+        return Reranker('ms-marco-TinyBERT-L-2-v2', model_type='flashrank')
+    if model == "tiny":
+        return Reranker('ms-marco-MiniLM-L-12-v2', model_type='flashrank')
+    if model == "small":
+        return Reranker("mixedbread-ai/mxbai-rerank-xsmall-v1", model_type='cross-encoder')
+    # Only "large" remains: the Cohere API needs a key.
+    if cohere_api_key is None:
+        cohere_api_key = os.environ["COHERE_API_KEY"]
+    return Reranker("cohere", lang='en', api_key = cohere_api_key)
+def rerank_docs(reranker,docs,query):
+    """Rerank LangChain documents against ``query`` and annotate them with the result.
+
+    Args:
+        reranker: object exposing ``rank(query=..., docs=...)`` whose result has a
+            ``results`` list of items carrying ``document.doc_id`` and ``score``.
+        docs: documents exposing ``page_content`` and a ``metadata`` dict.
+        query: query text used for ranking.
+
+    Returns:
+        The same document objects in the order returned by the reranker, each
+        with ``metadata["reranking_score"]`` and
+        ``metadata["query_used_for_retrieval"]`` set (documents are mutated in place).
+        An empty ``docs`` yields ``[]`` without calling the reranker.
+    """
+    # Nothing to rank: avoid handing an empty batch to the reranker backend.
+    if not docs:
+        return []
+    # Get a list of texts from langchain docs
+    input_docs = [x.page_content for x in docs]
+    # Rerank using rerankers library
+    results = reranker.rank(query=query, docs=input_docs)
+    # Prepare langchain list of docs
+    docs_reranked = []
+    for result in results.results:
+        # assumes doc_id is the position of the text in input_docs (no explicit
+        # ids are passed to rank) — TODO confirm against the rerankers docs
+        doc = docs[result.document.doc_id]
+        doc.metadata["reranking_score"] = result.score
+        doc.metadata["query_used_for_retrieval"] = query
+        docs_reranked.append(doc)
+    return docs_reranked

climateqa/engine/utils.py CHANGED Viewed

@@ -1,8 +1,15 @@
 from operator import itemgetter
 from typing import Any, Dict, Iterable, Tuple
 from langchain_core.runnables import RunnablePassthrough
 def pass_values(x):
     if not isinstance(x, list):
         x = [x]
@@ -67,3 +74,13 @@ def flatten_dict(
     """
     flat_dict = {k: v for k, v in _flatten_dict(nested_dict, parent_key, sep)}
     return flat_dict

 from operator import itemgetter
 from typing import Any, Dict, Iterable, Tuple
+import tiktoken
 from langchain_core.runnables import RunnablePassthrough
+def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
+    """Count the tokens that the named tiktoken encoding produces for ``string``."""
+    tokens = tiktoken.get_encoding(encoding_name).encode(string)
+    return len(tokens)
 def pass_values(x):
     if not isinstance(x, list):
         x = [x]
     """
     flat_dict = {k: v for k, v in _flatten_dict(nested_dict, parent_key, sep)}
     return flat_dict
+async def log_event(info,name,config):
+    """Emit ``info`` as an event by running it through a named pass-through chain.
+
+    A ``RunnablePassthrough`` configured with ``run_name=name`` is invoked with
+    ``info`` and ``config``; its result is discarded. The intent is for the
+    astream_event consumer to catch this run and stream ``info`` to the logger.
+    """
+    passthrough = RunnablePassthrough().with_config(run_name=name)
+    await passthrough.ainvoke(info,config)

climateqa/knowledge/__init__.py ADDED Viewed

File without changes

climateqa/{papers → knowledge}/openalex.py RENAMED Viewed

@@ -3,18 +3,32 @@ import networkx as nx
 import matplotlib.pyplot as plt
 from pyvis.network import Network
 from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
 import pyalex
 pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
 class OpenAlex():
     def __init__(self):
         pass
-    def search(self,keywords,n_results = 100,after = None,before = None):
         if isinstance(keywords,str):
             works = Works().search(keywords)
@@ -27,18 +41,21 @@ class OpenAlex():
                 break
             df_works = pd.DataFrame(page)
-            df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x))
             df_works["is_oa"] = df_works["open_access"].map(lambda x : x.get("is_oa",False))
             df_works["pdf_url"] = df_works["primary_location"].map(lambda x : x.get("pdf_url",None))
-            df_works["content"] = df_works["title"] + "\n" + df_works["abstract"]
         else:
-            df_works = []
-            for keyword in keywords:
-                df_keyword = self.search(keyword,n_results = n_results,after = after,before = before)
-                df_works.append(df_keyword)
-            df_works = pd.concat(df_works,ignore_index=True,axis = 0)
-        return df_works
     def rerank(self,query,df,reranker):
@@ -139,4 +156,36 @@ class OpenAlex():
                     reconstructed[position] = token
             # Join the tokens to form the reconstructed sentence(s)
-            return ' '.join(reconstructed)

 import matplotlib.pyplot as plt
 from pyvis.network import Network
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.vectorstores import VectorStoreRetriever
+from langchain_core.documents.base import Document
+from langchain_core.vectorstores import VectorStore
+from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
+from ..engine.utils import num_tokens_from_string
+from typing import List
+from pydantic import Field
 from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
 import pyalex
 pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
+def replace_nan_with_empty_dict(x):
+    """Return ``x`` unchanged unless pandas treats it as missing, in which case return ``{}``."""
+    if pd.notna(x):
+        return x
+    return {}
 class OpenAlex():
     def __init__(self):
         pass
+    def search(self,keywords:str,n_results = 100,after = None,before = None):
         if isinstance(keywords,str):
             works = Works().search(keywords)
                 break
             df_works = pd.DataFrame(page)
+            df_works = df_works.dropna(subset = ["title"])
+            df_works["primary_location"] = df_works["primary_location"].map(replace_nan_with_empty_dict)
+            df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x)).fillna("")
             df_works["is_oa"] = df_works["open_access"].map(lambda x : x.get("is_oa",False))
             df_works["pdf_url"] = df_works["primary_location"].map(lambda x : x.get("pdf_url",None))
+            df_works["url"] = df_works["id"]
+            df_works["content"] = (df_works["title"] + "\n" + df_works["abstract"]).map(lambda x : x.strip())
+            df_works["num_tokens"] = df_works["content"].map(lambda x : num_tokens_from_string(x))
+            df_works = df_works.drop(columns = ["abstract_inverted_index"])
+            # df_works["subtitle"] = df_works["title"] + " - " + df_works["primary_location"]["source"]["display_name"] + " - " + df_works["publication_year"]
+            return df_works
         else:
+           raise Exception("Keywords must be a string")
     def rerank(self,query,df,reranker):
                     reconstructed[position] = token
             # Join the tokens to form the reconstructed sentence(s)
+            return ' '.join(reconstructed)
+# Retriever that searches OpenAlex and returns the works as LangChain documents.
+# Works whose content has fewer than 50 or more than 1000 tokens are skipped.
+class OpenAlexRetriever(BaseRetriever):
+    min_year:int = 1960
+    max_year:int = None
+    k:int = 100
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+        # k is the number of results requested; min_year / max_year are passed
+        # to the search as its `after` / `before` arguments.
+        works = OpenAlex().search(query,n_results=self.k,after = self.min_year,before = self.max_year)
+        return [
+            Document(page_content = work["content"], metadata = work.to_dict())
+            for _, work in works.iterrows()
+            if not (work["num_tokens"] < 50 or work["num_tokens"] > 1000)
+        ]

climateqa/{engine → knowledge}/retriever.py RENAMED Viewed

@@ -66,6 +66,7 @@ class ClimateQARetriever(BaseRetriever):
         # Add score to metadata
         results = []
         for i,(doc,score) in enumerate(docs):
             doc.metadata["similarity_score"] = score
             doc.metadata["content"] = doc.page_content
             doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
@@ -78,86 +79,3 @@ class ClimateQARetriever(BaseRetriever):
         return results
-# def filter_summaries(df,k_summary = 3,k_total = 10):
-#     # assert source in ["IPCC","IPBES","ALL"], "source arg should be in (IPCC,IPBES,ALL)"
-#     # # Filter by source
-#     # if source == "IPCC":
-#     #     df = df.loc[df["source"]=="IPCC"]
-#     # elif source == "IPBES":
-#     #     df = df.loc[df["source"]=="IPBES"]
-#     # else:
-#     #     pass
-#     # Separate summaries and full reports
-#     df_summaries = df.loc[df["report_type"].isin(["SPM","TS"])]
-#     df_full = df.loc[~df["report_type"].isin(["SPM","TS"])]
-#     # Find passages from summaries dataset
-#     passages_summaries = df_summaries.head(k_summary)
-#     # Find passages from full reports dataset
-#     passages_fullreports = df_full.head(k_total - len(passages_summaries))
-#     # Concatenate passages
-#     passages = pd.concat([passages_summaries,passages_fullreports],axis = 0,ignore_index = True)
-#     return passages
-# def retrieve_with_summaries(query,retriever,k_summary = 3,k_total = 10,sources = ["IPCC","IPBES"],max_k = 100,threshold = 0.555,as_dict = True,min_length = 300):
-#     assert max_k > k_total
-#     validated_sources = ["IPCC","IPBES"]
-#     sources = [x for x in sources if x in validated_sources]
-#     filters = {
-#         "source": { "$in": sources },
-#     }
-#     print(filters)
-#     # Retrieve documents
-#     docs = retriever.retrieve(query,top_k = max_k,filters = filters)
-#     # Filter by score
-#     docs = [{**x.meta,"score":x.score,"content":x.content} for x in docs if x.score > threshold]
-#     if len(docs) == 0:
-#         return []
-#     res = pd.DataFrame(docs)
-#     passages_df = filter_summaries(res,k_summary,k_total)
-#     if as_dict:
-#         contents = passages_df["content"].tolist()
-#         meta = passages_df.drop(columns = ["content"]).to_dict(orient = "records")
-#         passages = []
-#         for i in range(len(contents)):
-#             passages.append({"content":contents[i],"meta":meta[i]})
-#         return passages
-#     else:
-#         return passages_df
-# def retrieve(query,sources = ["IPCC"],threshold = 0.555,k = 10):
-#     print("hellooooo")
-#     # Reformulate queries
-#     reformulated_query,language = reformulate(query)
-#     print(reformulated_query)
-#     # Retrieve documents
-#     passages = retrieve_with_summaries(reformulated_query,retriever,k_total = k,k_summary = 3,as_dict = True,sources = sources,threshold = threshold)
-#     response = {
-#       "query":query,
-#       "reformulated_query":reformulated_query,
-#       "language":language,
-#       "sources":passages,
-#       "prompts":{"init_prompt":init_prompt,"sources_prompt":sources_prompt},
-#     }
-#     return response

         # Add score to metadata
         results = []
         for i,(doc,score) in enumerate(docs):
+            doc.page_content = doc.page_content.replace("\r\n"," ")
             doc.metadata["similarity_score"] = score
             doc.metadata["content"] = doc.page_content
             doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
         return results

climateqa/papers/__init__.py DELETED Viewed

@@ -1,43 +0,0 @@
-import pandas as pd
-from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
-import pyalex
-pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
-class OpenAlex():
-    def __init__(self):
-        pass
-    def search(self,keywords,n_results = 100,after = None,before = None):
-        works = Works().search(keywords).get()
-        for page in works.paginate(per_page=n_results):
-            break
-        df_works = pd.DataFrame(page)
-        return works
-    def make_network(self):
-        pass
-    def get_abstract_from_inverted_index(self,index):
-        # Determine the maximum index to know the length of the reconstructed array
-        max_index = max([max(positions) for positions in index.values()])
-        # Initialize a list with placeholders for all positions
-        reconstructed = [''] * (max_index + 1)
-        # Iterate through the inverted index and place each token at its respective position(s)
-        for token, positions in index.items():
-            for position in positions:
-                reconstructed[position] = token
-        # Join the tokens to form the reconstructed sentence(s)
-        return ' '.join(reconstructed)

front/__init__.py ADDED Viewed

File without changes

front/callbacks.py ADDED Viewed

File without changes

front/utils.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import re
+def make_pairs(lst):
+    """From a list of even length, make tuple pairs of consecutive items."""
+    pairs = []
+    for start in range(0, len(lst), 2):
+        pairs.append((lst[start], lst[start + 1]))
+    return pairs
+def serialize_docs(docs):
+    """Convert document objects into plain dicts with "page_content" and "metadata" keys."""
+    return [
+        {"page_content": doc.page_content, "metadata": doc.metadata}
+        for doc in docs
+    ]
+def parse_output_llm_with_sources(output):
+    """Replace "[Doc X]" / "[Doc X, Doc Y]" citations with superscript HTML links.
+
+    Each cited number becomes an anchor pointing to ``#doc<number>``. Only text
+    matched by the citation pattern is rewritten; ordinary text is returned
+    unchanged, including sentences that merely start with "Doc" (these used to
+    be mistaken for citations).
+
+    Args:
+        output: text produced by the LLM.
+
+    Returns:
+        The text with every citation group replaced by its HTML links.
+    """
+    def _render_references(match):
+        # "Doc 1, Doc 2" -> ["1", "2"]
+        numbers = [ref.lower().replace("doc","").strip() for ref in match.group(1).split(",")]
+        return "".join(
+            f"""<a href="#doc{number}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{number}</sup></span></a>"""
+            for number in numbers
+        )
+    return re.sub(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', _render_references, output)
+def make_html_source(source,i):
+    """Render one retrieved source as an HTML card.
+
+    Args:
+        source: object exposing ``page_content`` and a ``metadata`` dict with the
+            keys "toc_level0", "toc_level1", "name", "reranking_score",
+            "chunk_type", "short_name", "page_number", "url" and, for non-text
+            chunks, "figure_code".
+        i: number shown in the card title; text cards also use it for the
+            ``doc{i}`` element id.
+
+    Returns:
+        The HTML string of the card.
+    """
+    # NOTE(review): content and metadata values are interpolated into the HTML
+    # without escaping — confirm the sources are trusted.
+    meta = source.metadata
+    # content = source.page_content.split(":",1)[1].strip()
+    content = source.page_content.strip()
+    # Collect the table-of-contents levels, stopping at the first "N/A".
+    toc_levels = []
+    for j in range(2):
+        level = meta[f"toc_level{j}"]
+        if level != "N/A":
+            toc_levels.append(level)
+        else:
+            break
+    toc_levels = " > ".join(toc_levels)
+    if len(toc_levels) > 0:
+        name = f"<b>{toc_levels}</b><br/>{meta['name']}"
+    else:
+        name = meta['name']
+    # CSS class by reranking score: > 0.8 green, > 0.4 orange, otherwise red.
+    score = meta['reranking_score']
+    if score > 0.8:
+        color = "score-green"
+    elif score > 0.4:
+        color = "score-orange"
+    else:
+        color = "score-red"
+    relevancy_score = f"<p class=relevancy-score>Relevancy score: <span class='{color}'>{score:.1%}</span></p>"
+    if meta["chunk_type"] == "text":
+        card = f"""
+    <div class="card" id="doc{i}">
+        <div class="card-content">
+            <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
+            <p>{content}</p>
+            {relevancy_score}
+        </div>
+        <div class="card-footer">
+            <span>{name}</span>
+            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
+                <span role="img" aria-label="Open PDF">🔗</span>
+            </a>
+        </div>
+    </div>
+    """
+    else:
+        # Any other chunk type is rendered as an image card.
+        # NOTE(review): unlike the text card, this card has no id="doc{i}", so
+        # "#doc{i}" links cannot target it — confirm this is intended.
+        if meta["figure_code"] != "N/A":
+            title = f"{meta['figure_code']} - {meta['short_name']}"
+        else:
+            title = f"{meta['short_name']}"
+        card = f"""
+    <div class="card card-image">
+        <div class="card-content">
+            <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
+            <p>{content}</p>
+            <p class='ai-generated'>AI-generated description</p>
+            {relevancy_score}
+        </div>
+        <div class="card-footer">
+            <span>{name}</span>
+            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
+                <span role="img" aria-label="Open PDF">🔗</span>
+            </a>
+        </div>
+    </div>
+    """
+    return card
+def make_toolbox(tool_name,description = "",checked = False,elem_id = "toggle"):
+    """Build the HTML label displayed for a tool.
+
+    The label shows a check mark when ``checked`` is true and a loader span
+    otherwise, followed by ``tool_name``. ``elem_id`` fills the label's ``for``
+    attribute. ``description`` is accepted but not rendered: no markup
+    currently uses it.
+    """
+    status_icon = "<span class='checkmark'>&#10003;</span>" if checked else "<span class='loader'></span>"
+    return f"""
+<div class="dropdown">
+<label for="{elem_id}" class="dropdown-toggle">
+    {status_icon}
+    {tool_name}
+</label>
+</div>
+"""

requirements.txt CHANGED Viewed

@@ -1,13 +1,19 @@
-gradio==4.19.1
 azure-storage-file-share==12.11.1
 azure-storage-blob
 python-dotenv==1.0.0
-langchain==0.1.4
-langchain_openai==0.0.6
-pinecone-client==3.0.2
 sentence-transformers==2.6.0
 huggingface-hub
-msal
 pyalex==0.13
 networkx==3.2.1
-pyvis==0.3.2

+gradio==4.44
 azure-storage-file-share==12.11.1
 azure-storage-blob
 python-dotenv==1.0.0
+langchain==0.2.1
+langchain_openai==0.1.7
+langgraph==0.0.55
+pinecone-client==4.1.0
 sentence-transformers==2.6.0
 huggingface-hub
 pyalex==0.13
 networkx==3.2.1
+pyvis==0.3.2
+flashrank==0.2.5
+rerankers==0.3.0
+torch==2.3.0
+nvidia-cudnn-cu12==8.9.2.26
+langchain-community==0.2
+msal==1.31

sandbox/20240310 - CQA - Semantic Routing 1.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

style.css CHANGED Viewed

@@ -2,6 +2,14 @@
 /* :root {
     --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
   } */
 .warning-box {
     background-color: #fff3cd;
@@ -57,6 +65,7 @@ body.dark .tip-box * {
 .message{
     font-size:14px !important;
 }
@@ -65,6 +74,10 @@ a {
     color: inherit;
 }
 .card {
     background-color: white;
     border-radius: 10px;
@@ -363,3 +376,108 @@ span.chatbot > p > img{
 .a-doc-ref{
 	text-decoration: none !important;
 }

 /* :root {
     --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
   } */
+.avatar-container.svelte-1x5p6hu:not(.thumbnail-item) img {  /* avatar image fills its container and is clipped to a circle */
+    width: 100%;
+    height: 100%;
+    object-fit: cover;
+    border-radius: 50%;
+    padding: 0px;
+    margin: 0px;
+}  /* NOTE(review): "svelte-1x5p6hu" looks like a build-generated class name — confirm it stays stable across Gradio upgrades */
 .warning-box {
     background-color: #fff3cd;
 .message{
     font-size:14px !important;
 }
     color: inherit;
 }
+.doc-ref sup{  /* superscript markers inside .doc-ref are forced to red */
+    color:#dc2626!important;
+    /* margin-right:1px; */
+}
 .card {
     background-color: white;
     border-radius: 10px;
 .a-doc-ref{
 	text-decoration: none !important;
 }
+.dropdown {  /* wrapper for one toolbox row; position: relative lets absolutely positioned children anchor to it */
+    position: relative;
+    display:inline-block;
+    margin-bottom: 10px;
+  }
+  .dropdown-toggle {  /* label row of a toolbox entry: fixed-width flex row with vertically centred contents */
+    background-color: #f2f2f2;
+    color: black;
+    padding: 10px;
+    font-size: 16px;
+    cursor: pointer;
+    width: 400px; /* Adjust width as needed */
+    position: relative;
+    display: flex;
+    align-items: center; /* Vertically center the contents */
+    justify-content: left;
+  }
+  .dropdown-toggle .caret {  /* small downward triangle drawn with borders, pinned 10px from the right and vertically centred */
+    content: "";  /* NOTE(review): `content` normally only affects ::before/::after — presumably a leftover; confirm */
+    position: absolute;
+    right: 10px;
+    top: 50%;
+    border-left: 5px solid transparent;
+    border-right: 5px solid transparent;
+    border-top: 5px solid black;
+    transform: translateY(-50%);
+  }
+  input[type="checkbox"] {  /* NOTE(review): not scoped to .dropdown, so this matches every checkbox input on the page — confirm that is intended */
+    display: none !important;
+  }
+  input[type="checkbox"]:checked + .dropdown-content {  /* show the panel that immediately follows a checked checkbox */
+    display: block;
+  }
+  .dropdown-content {  /* absolutely positioned panel, hidden by default; the :checked sibling rules switch it to display: block */
+    display: none;
+    position: absolute;
+    background-color: #f9f9f9;
+    min-width: 300px;
+    box-shadow: 0 8px 16px 0 rgba(0,0,0,0.2);
+    z-index: 1;
+    padding: 12px;
+    border: 1px solid #ccc;
+  }
+  input[type="checkbox"]:checked + .dropdown-toggle + .dropdown-content {  /* same reveal, for markup ordered checkbox, then toggle, then content */
+    display: block;
+  }
+  input[type="checkbox"]:checked + .dropdown-toggle .caret {  /* while checked, swap the top border for a bottom border so the caret points up */
+    border-top: 0;
+    border-bottom: 5px solid black;
+  }
+  .loader {  /* 20px circular spinner, rotated continuously by the `spin` keyframes */
+    border: 1px solid #d0d0d0 !important; /* Light grey ring */
+    border-top: 1px solid #db3434 !important; /* Red segment */
+    border-right: 1px solid #3498db !important; /* Blue segment */
+    border-radius: 50%;
+    width: 20px;
+    height: 20px;
+    animation: spin 2s linear infinite;
+    display:inline-block;
+    margin-right:10px !important;
+  }
+  .checkmark{  /* green check mark icon; same right margin as .loader so either icon lines up with the text after it */
+    color:green !important;
+    font-size:18px;
+    margin-right:10px !important;
+  }
+  @keyframes spin {  /* one full 360-degree rotation; referenced by .loader's animation */
+    0% { transform: rotate(0deg); }
+    100% { transform: rotate(360deg); }
+  }
+  .relevancy-score{  /* small italic caption with a 10px gap above it */
+    margin-top:10px !important;
+    font-size:10px !important;
+    font-style:italic;
+  }
+  .score-green{  /* score colour classes: green / orange / red */
+    color:green !important;
+  }
+  .score-orange{
+    color:orange !important;
+  }
+  .score-red{  /* previously a second .score-orange rule, which overrode the orange colour above with red */
+    color:red !important;
+  }
+.message-buttons-left.panel.message-buttons.with-avatar {  /* hide elements carrying all four of these classes */
+    display: none;
+}

test.json ADDED Viewed

File without changes