add latest updates regarding agent mode
- app.py +84 -162
- celsius_csrd_chatbot/__init__.py +0 -0
- celsius_csrd_chatbot/agent.py +27 -3
- celsius_csrd_chatbot/chains/answer_rag.py +11 -13
- celsius_csrd_chatbot/chains/esrs_categorization.py +25 -112
- celsius_csrd_chatbot/chains/esrs_intent.py +7 -18
- celsius_csrd_chatbot/chains/retriever.py +6 -5
- celsius_csrd_chatbot/utils.py +24 -2
- poetry.lock +0 -0
- pyproject.toml +5 -2
app.py
CHANGED
@@ -1,11 +1,9 @@
 import os
+from datetime import datetime
 import gradio as gr
-from operator import itemgetter
 from pinecone import Pinecone
 from huggingface_hub import whoami
 from langchain.prompts import ChatPromptTemplate
-from langchain.schema.output_parser import StrOutputParser
-from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_openai import AzureChatOpenAI
 from langchain.prompts.prompt import PromptTemplate
@@ -18,7 +16,9 @@ from celsius_csrd_chatbot.utils import (
     _combine_documents,
     get_llm,
     init_env,
+    parse_output_llm_with_sources,
 )
+from celsius_csrd_chatbot.agent import make_graph_agent, display_graph

 init_env()
 chat_model_init = get_llm()
@@ -33,165 +33,86 @@ embeddings = HuggingFaceBgeEmbeddings(
 pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
 index = pc.Index(os.getenv("PINECONE_API_INDEX"))
 vectorstore = PineconeVectorstore(index, embeddings, "page_content")
-
-
-esrs_wiki = """
-The Corporate Sustainability Reporting Directive (CSRD) is a mandate that requires all companies to report on their sustainability initiatives. In response to this directive, the European Sustainability Reporting Standards (ESRS) were developed. These standards are a key tool in promoting the transition to a sustainable economy within the EU, providing a structured framework for companies to disclose their sustainability initiatives. The ESRS cover a wide range of environmental, social, and governance (ESG) issues, including climate change, biodiversity, and human rights. Companies that adhere to the ESRS can provide investors with valuable insights into their sustainability impact, thereby informing investment decisions. The ESRS are designed to be highly interoperable with global reporting standards, which helps to avoid unnecessary duplication in reporting by companies. The reporting requirements based on the ESRS will be gradually implemented for different companies over time. In summary, the ESRS play a critical role in fostering sustainable finance and enabling companies to demonstrate their commitment to the green deal agenda while accessing sustainable finance.
-
----
-
-"""
-
-reformulation_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
-
-Chat History:
-{chat_history}
-Follow Up Input: {question}
-Standalone question:"""
-
-CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(reformulation_template)
-
-answering_template = """
-You are an ESG expert, with 20 years experience analyzing corporate sustainability reports.
-You are specialist in the upcoming CSRD regulation and in general with corporate sustainability disclosure requirements.
-{esrs_wiki}
-
-You will answer the question based on the following passages extracted from CSRD specific sustainability guidelines and reports:
-```
-{context}
-```
-
-Guidelines:
-1. Context: You'll receive relevant excerpts from a CSRD-specific sustainability guideline or report to address a given question.
-2. Relevance: Only include passages directly pertaining to the question; omit irrelevant content.
-3. Facts and Figures: Prioritize factual information in your response.
-4. Conciseness: Keep answers sharp and succinct, avoiding unnecessary context.
-5. Focus: Address the specific question without veering into related topics.
-6. Honesty: If unsure, state that you don't know rather than inventing an answer.
-7. Source Attribution: When using information from a passage, mention it as [Doc i] at the end of the sentence (where 'i' represents the document number).
-8. Multiple Sources: If the same content appears in multiple documents, cite them collectively (e.g., [Doc i, Doc j, Doc k]).
-9. Structured Paragraphs: Instead of bullet-point summaries, compile your responses into well-structured paragraphs.
-10. Method Focus: When addressing "how" questions, emphasize methods and procedures over outcomes.
-11. Selective Usage: You're not obligated to use every passage; include only those relevant to the question.
-12. Insufficient Information: If documents lack necessary details, indicate that you don't have enough information.
-
-Question: {question}
-Answer:
-"""
-
-ANSWER_PROMPT = ChatPromptTemplate.from_template(answering_template)
-
-DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
+llm = AzureChatOpenAI()
+agent = make_graph_agent(llm, vectorstore)

 memory = ConversationBufferMemory(
     return_messages=True, output_key="answer", input_key="question"
 )

-
-loaded_memory = RunnablePassthrough.assign(
-    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"),
-)
-# Now we calculate the standalone question
-standalone_question = {
-    "standalone_question": {
-        "question": lambda x: x["question"],
-        "chat_history": lambda x: _format_chat_history(x["chat_history"]),
-    }
-    | CONDENSE_QUESTION_PROMPT
-    | chat_model
-    | StrOutputParser(),
-}
-# Now we retrieve the documents
-retrieved_documents = {
-    "docs": itemgetter("standalone_question") | retriever,
-    "question": lambda x: x["standalone_question"],
-}
-# Now we construct the inputs for the final prompt
-final_inputs = {
-    "context": lambda x: _combine_documents(x["docs"], DEFAULT_DOCUMENT_PROMPT),
-    "question": itemgetter("question"),
-    "esrs_wiki": lambda x: esrs_wiki,
-}
-# And finally, we do the part that returns the answers
-answer = {
-    "answer": final_inputs | ANSWER_PROMPT | chat_model,
-    "docs": itemgetter("docs"),
-}
-# And now we put it all together!
-final_chain = loaded_memory | standalone_question | retrieved_documents | answer
-
-
-async def chat(
-    query: str,
-    history: list = [],
-):
+
+async def chat(query, history):
     """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
     (messages in gradio format, messages in langchain format, source documents)"""
-    source_string = ""
-    gradio_format = make_pairs([a.content for a in history]) + [(query, "")]
-
-    # reset memory
-    memory.clear()
-    for message in history:
-        memory.chat_memory.add_message(message)
-
-    inputs = {"question": query}
-    result = final_chain.astream_log({"question": query})
-    reformulated_question_path_id = "/logs/AzureChatOpenAI/streamed_output_str/-"
-    retriever_path_id = "/logs/Retriever/final_output"
-    final_answer_path_id = "/logs/AzureChatOpenAI:2/streamed_output_str/-"
-
-    async for op in result:
-        op = op.ops[0]
-        if op["path"] == reformulated_question_path_id:  # reformulated question
-            new_token = op["value"]  # str
+    date_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print(f">> NEW QUESTION ({date_now}) : {query}")
+    inputs = {"query": query}
+    result = agent.astream_events(inputs, version="v1")
+
+    docs = []
+    docs_html = ""
+    output_query = ""
+    start_streaming = False
+
+    steps_display = {
+        "categorize_esrs": ("🔄️ Analyzing user query", True),
+        "retrieve_documents": ("🔄️ Searching in the knowledge base", True),
+    }
+
+    try:
+        async for event in result:
+            print(event)
+            if event["event"] == "on_chat_model_stream":
+                print("line 66")
+                if start_streaming == False:
+                    print("line 68")
+                    start_streaming = True
+                    history[-1] = (query, "")
+
+                new_token = event["data"]["chunk"].content
+                previous_answer = history[-1][1]
+                previous_answer = previous_answer if previous_answer is not None else ""
+                answer_yet = previous_answer + new_token
+                answer_yet = parse_output_llm_with_sources(answer_yet)
+                history[-1] = (query, answer_yet)
+
+            elif (
+                event["name"] == "retrieve_documents"
+                and event["event"] == "on_chain_end"
+            ):
+                try:
+                    print("line 84")
+                    docs = event["data"]["output"]["documents"]
+                    docs_html = []
+                    for i, d in enumerate(docs, 1):
+                        docs_html.append(make_html_source(d, i))
+                    docs_html = "".join(docs_html)
+                except Exception as e:
+                    print(f"Error getting documents: {e}")
+                    print(event)
+
+            for event_name, (
+                event_description,
+                display_output,
+            ) in steps_display.items():
+                if event["name"] == event_name:
+                    print("line 99")
+                    if event["event"] == "on_chain_start":
+                        print("line 101")
+                        answer_yet = event_description
+                        history[-1] = (query, answer_yet)
+
+            history = [tuple(x) for x in history]
+            yield history, docs_html
+
+    except Exception as e:
+        raise gr.Error(f"{e}")

 with open("./assets/style.css", "r") as f:
     css = f.read()

-
-def update_visible(oauth_token: gr.OAuthToken | None):
-    if oauth_token is None:
-        return {
-            bloc_1: gr.update(visible=True),
-            bloc_2: gr.update(visible=False),
-            bloc_3: gr.update(visible=False),
-        }
-
-    org_names = [org["name"] for org in whoami(oauth_token.token)["orgs"]]
-    if "ekimetrics-esrsqa" in org_names:  # logged in group
-        return {
-            bloc_1: gr.update(visible=False),
-            bloc_2: gr.update(visible=True),
-            bloc_3: gr.update(visible=False),
-        }
-
-    else:  # logged but not in group
-        return {
-            bloc_1: gr.update(visible=False),
-            bloc_2: gr.update(visible=False),
-            bloc_3: gr.update(visible=True),
-        }
-
-
 # Set up Gradio Theme
 theme = gr.themes.Base(
     primary_hue="blue",
@@ -211,18 +132,6 @@ What do you want to learn ?
 with gr.Blocks(title=f"{demo_name}", css=css, theme=theme) as demo:
-    # with gr.Row():
-    #     with gr.Column(scale=1):
-    #         login = gr.LoginButton()
-
-    # with gr.Column() as bloc_1:
-    #     textbox_1 = gr.Textbox("You are not logged to Hugging Face !", show_label=False)
-
-    # with gr.Column(visible=False) as bloc_3:
-    #     textbox_3 = gr.Textbox(
-    #         "You are not part of the ESRS Q&A Project, if interested ask access here : https://huggingface.co/ekimetrics-esrsqa"
-    #     )
-
     with gr.Column(visible=True) as bloc_2:
         with gr.Tab("ESRS Q&A"):
             with gr.Row():
@@ -262,16 +171,29 @@ with gr.Blocks(title=f"{demo_name}", css=css, theme=theme) as demo:
         with gr.Column(scale=1):
             gr.Markdown("WIP")

+    def start_chat(query, history):
+        history = history + [(query, None)]
+        history = [tuple(x) for x in history]
+        return (gr.update(interactive=False), history)
+
+    def finish_chat():
+        return gr.update(interactive=True, value="")

     ask.submit(
+        start_chat,
+        [ask, chatbot],
+        [ask, chatbot],
+        queue=False,
+        api_name="start_chat_textbox",
+    ).then(
         fn=chat,
         inputs=[
             ask,
+            chatbot,
         ],
-        outputs=[
+        outputs=[chatbot, sources_textbox],
+    ).then(
+        finish_chat, None, [ask], api_name="finish_chat_textbox"
     )
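Note: the new `chat()` handler consumes LangGraph's `astream_events` stream instead of the old `astream_log` path IDs. A minimal sketch of that consumption pattern, assuming a compiled graph like the one built in `celsius_csrd_chatbot/agent.py` (the `stream_answer` helper name is illustrative, not part of the commit):

```python
# Minimal sketch of the event-consumption pattern used by the new chat()
# handler. `agent` is a compiled LangGraph graph; event names and payload
# keys mirror the diff above. The helper name `stream_answer` is hypothetical.
async def stream_answer(agent, query: str):
    history = [(query, "")]  # gradio-style (user, bot) pairs
    docs = []
    async for event in agent.astream_events({"query": query}, version="v1"):
        if event["event"] == "on_chat_model_stream":
            # Each chunk carries one token of the answer being generated.
            history[-1] = (query, history[-1][1] + event["data"]["chunk"].content)
        elif event["name"] == "retrieve_documents" and event["event"] == "on_chain_end":
            # The retriever node finished; its output state holds the documents.
            docs = event["data"]["output"]["documents"]
    return history, docs
```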
celsius_csrd_chatbot/__init__.py
ADDED
File without changes
celsius_csrd_chatbot/agent.py
CHANGED
@@ -6,6 +6,7 @@ from langchain.schema import Document
 from langgraph.graph import END, StateGraph
 from langchain_core.runnables.graph import MermaidDrawMethod

+from tomlkit import document
 from typing_extensions import TypedDict
 from typing import List

@@ -14,6 +15,9 @@ from IPython.display import display, HTML, Image
 from celsius_csrd_chatbot.chains.esrs_categorization import (
     make_esrs_categorization_node,
 )
+from celsius_csrd_chatbot.chains.esrs_intent import (
+    make_esrs_intent_node,
+)
 from celsius_csrd_chatbot.chains.retriever import make_retriever_node
 from celsius_csrd_chatbot.chains.answer_rag import make_rag_node

@@ -26,6 +30,7 @@ class GraphState(TypedDict):
     query: str
     esrs_type: str
     answer: str
+    documents: List[Document]


 def route_intent(state):
@@ -33,30 +38,49 @@ def route_intent(state):
     if esrs == "none":
         return "intent_esrs"

+    elif esrs == "wrong_esrs":
+        return "answer_rag"
+
     else:
         return "retrieve_documents"


+def make_id_dict(values):
+    return {k: k for k in values}
+
+
 def make_graph_agent(llm, vectorstore):
     workflow = StateGraph(GraphState)

     # Define the node functions
-    categorize_esrs = make_esrs_categorization_node(llm)
+    categorize_esrs = make_esrs_categorization_node()
+    intent_esrs = make_esrs_intent_node(llm)
     retrieve_documents = make_retriever_node(vectorstore)
-    answer_rag = make_rag_node(llm)
+    answer_rag = make_rag_node(llm, wrong_esrs=False)
+    answer_rag_wrong = make_rag_node(llm, wrong_esrs=True)

     # Define the nodes
     workflow.add_node("categorize_esrs", categorize_esrs)
+    workflow.add_node("intent_esrs", intent_esrs)
     workflow.add_node("retrieve_documents", retrieve_documents)
     workflow.add_node("answer_rag", answer_rag)
+    workflow.add_node("answer_rag_wrong", answer_rag_wrong)

     # Entry point
     workflow.set_entry_point("categorize_esrs")

+    # CONDITIONAL EDGES
+    workflow.add_conditional_edges(
+        "categorize_esrs",
+        route_intent,
+        make_id_dict(["intent_esrs", "retrieve_documents", "answer_rag_wrong"]),
+    )
+
     # Define the edges
-    workflow.add_edge("
+    workflow.add_edge("intent_esrs", "retrieve_documents")
     workflow.add_edge("retrieve_documents", "answer_rag")
     workflow.add_edge("answer_rag", END)
+    workflow.add_edge("answer_rag_wrong", END)

     # Compile
     app = workflow.compile()
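The rewired graph routes from `categorize_esrs` via `route_intent`: queries naming no ESRS go to `intent_esrs` (LLM classification), queries naming a valid ESRS go straight to `retrieve_documents`, and queries naming an invalid ESRS should end at `answer_rag_wrong`. A hypothetical smoke test (setup of `llm` and `vectorstore` omitted); note in passing that `route_intent` returns "answer_rag" for the "wrong_esrs" case while the conditional-edge map only registers "answer_rag_wrong", so that return value would need to be aligned for the branch to resolve:

```python
# Hypothetical smoke test for the compiled graph; names are from the diff.
agent = make_graph_agent(llm, vectorstore)
state = agent.invoke({"query": "What does ESRS E1 require for Scope 1 emissions?"})
print(state["esrs_type"])  # e.g. ["ESRS E1"] from the regex categorizer
print(state["answer"])     # answer produced by the answer_rag node
```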
celsius_csrd_chatbot/chains/answer_rag.py
CHANGED
@@ -13,15 +13,6 @@ The Corporate Sustainability Reporting Directive (CSRD) is a mandate that requir

 """

-reformulation_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
-
-Chat History:
-{chat_history}
-Follow Up Input: {question}
-Standalone question:"""
-
-CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(reformulation_template)
-
 answering_template = """
 You are an ESG expert, with 20 years experience analyzing corporate sustainability reports.
 You are specialist in the upcoming CSRD regulation and in general with corporate sustainability disclosure requirements.
@@ -46,7 +37,7 @@ answering_template = """
 11. Selective Usage: You're not obligated to use every passage; include only those relevant to the question.
 12. Insufficient Information: If documents lack necessary details, indicate that you don't have enough information.

-Question: {question}
+Question: {query}
 Answer:
 """
@@ -78,6 +69,7 @@ def make_rag_chain(llm):
     {
         "context": lambda x: _combine_documents(x["documents"]),
         "query": itemgetter("query"),
+        "esrs_wiki": lambda x: esrs_wiki,
     }
     | prompt
     | llm
@@ -86,11 +78,17 @@ def make_rag_chain(llm):
     return chain


-def make_rag_node(llm):
+def make_rag_node(llm, wrong_esrs=False):
     rag_chain = make_rag_chain(llm)

     async def answer_rag(state, config):
+        if wrong_esrs:
+            return {
+                "answer": "I'm sorry, I can't answer that question. Please provide a valid ESRS."
+            }
+
+        else:
+            answer = await rag_chain.ainvoke(state, config)
+            return {"answer": answer}

     return answer_rag
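`make_rag_node` now produces two flavors of the same node: the default one awaits the RAG chain, while the `wrong_esrs=True` variant short-circuits with a canned refusal and never calls the LLM. A sketch of the intended behavior (state values are illustrative):

```python
# Illustrative behavior of the two node variants built in the diff above.
answer_rag = make_rag_node(llm, wrong_esrs=False)       # awaits rag_chain.ainvoke(...)
answer_rag_wrong = make_rag_node(llm, wrong_esrs=True)  # canned refusal, no LLM call

# await answer_rag_wrong({"query": "...", "documents": []}, config=None)
# -> {"answer": "I'm sorry, I can't answer that question. Please provide a valid ESRS."}
```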
celsius_csrd_chatbot/chains/esrs_categorization.py
CHANGED
@@ -1,121 +1,34 @@
-from langchain.prompts.prompt import PromptTemplate
-from langchain.output_parsers import PydanticOutputParser
-from typing import Literal
-from operator import itemgetter
-import json
-from langchain_core.exceptions import OutputParserException
+import re


-class ESRSAnalysis(BaseModel):
-    """Analyzing the user query to get ESRS type, sources and intent"""
-
-    esrs_type: Literal[
-        "ESRS 1",
-        "ESRS 2",
-        "ESRS E1",
-        "ESRS E2",
-        "ESRS E3",
-        "ESRS E4",
-        "ESRS E5",
-        "ESRS S1",
-        "ESRS S2",
-        "ESRS S3",
-        "ESRS S4",
-        "ESRS G1",
-        "none",
-    ] = Field(
-        description="""
-        Given a user question choose which documents would be most relevant for answering their question :
-
-        - ESRS 1 is for questions about general principles for preparing and presenting sustainability information in accordance with CSRD
-        - ESRS 2 is for questions about general disclosures related to sustainability reporting, including governance, strategy, impact, risk, opportunity management, and metrics and targets
-        - ESRS E1 is for questions about climate change, global warming, GES and energy
-        - ESRS E2 is for questions about air, water, and soil pollution, and dangerous substances
-        - ESRS E3 is for questions about water and marine resources
-        - ESRS E4 is for questions about biodiversity, nature, wildlife and ecosystems
-        - ESRS E5 is for questions about resource use and circular economy
-        - ESRS S1 is for questions about workforce and labor issues, job security, fair pay, and health and safety
-        - ESRS S2 is for questions about workers in the value chain, workers' treatment
-        - ESRS S3 is for questions about affected communities, impact on local communities
-        - ESRS S4 is for questions about consumers and end users, customer privacy, safety, and inclusion
-        - ESRS G1 is for questions about governance, risk management, internal control, and business conduct
-        - none is for questions that do not fit into any of the above categories
-
-        Follow these guidelines :
-
-        - Do not take into account upper or lower case letter distinction. For example, 'esrs 1', 'Esrs 1' and 'ESRS 1' should be considered as 'ESRS 1'.
-        - Some questions could be related to multiple ESRS. In this case, keep all options and format the output as such : 'ESRS 1', 'ESRS 2'.
-        - Remember, if the question is not related to any ESRS, the output should be 'none'.
-        """,
-    )
-
-
-def make_esrs_categorization_chain(llm):
-    parser = PydanticOutputParser(pydantic_object=ESRSAnalysis)
-    prompt_template = """
-    The following question is about ESRS related topics. Please analyze the question and indicate if it refers to specific ESRS.
-
-    {format_instructions}
-
-    Please answer with the appropriate ESRS to answer the question.
-
-    Question: '{query}'
-    Answer:
-    """
-
-    prompt = PromptTemplate(
-        template=prompt_template,
-        input_variables=["query"],
-        partial_variables={"format_instructions": parser.get_format_instructions()},
-    )
-    chain = {"query": itemgetter("query")} | prompt | llm | parser
-
-    return chain
-
-
-def make_esrs_categorization_node(llm):
+def make_esrs_categorization_node():

     def categorize_message(state):
         query = state["query"]
+        pattern = r"ESRS \d|ESRS [A-Z]\d|ESRS [A-Z] \d"
+        esrs_truth = [
+            "ESRS 1",
+            "ESRS 2",
+            "ESRS E1",
+            "ESRS E2",
+            "ESRS E3",
+            "ESRS E4",
+            "ESRS E5",
+            "ESRS S1",
+            "ESRS S2",
+            "ESRS S3",
+            "ESRS S4",
+            "ESRS G1",
+        ]
+
+        matches = re.findall(pattern, query.upper())
+        if matches:
+            true_matches = [match for match in matches if match in esrs_truth]
+            output = {"esrs_type": true_matches if true_matches else "wrong_esrs"}
+
+        else:
+            output = {"esrs_type": "none"}

         return output

     return categorize_message
-
-# intent: str = Field(
-#     enum=[
-#         "Specific topic",
-#         "Implementation reco",
-#         "KPI extraction",
-#     ],
-#     description="""
-#     Categorize the user query in one of the following categories,
-
-#     Examples:
-#     - Specific topic: "What are the specificities of ESRS E1 ?"
-#     - Implementation reco: "How should I compute my scope 1 reduction target ?"
-#     - KPI extraction: "When will the CSRD be mandatory for my small French company ?"
-#     """,
-# )
-
-# sources: str = Field(
-#     enum=["ESRS", "External"],
-#     description="""
-#     Given a user question choose which documents would be most relevant for answering their question,
-#     - ESRS is for questions about a specific environmental, social or governance topic, as well as CSRD's general principles and disclosures
-#     - External is for questions about how to implement the CSRD, or general questions about CSRD's context
-#     """,
-# )
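The categorizer is now a pure regex pass rather than an LLM call. A quick check of how the pattern behaves (pattern and reference list copied from the diff; matching runs against `query.upper()`, so lowercase mentions are caught too):

```python
import re

pattern = r"ESRS \d|ESRS [A-Z]\d|ESRS [A-Z] \d"
print(re.findall(pattern, "How does esrs e1 treat scope 1?".upper()))  # ['ESRS E1'] -> esrs_type ['ESRS E1']
print(re.findall(pattern, "Tell me about ESRS X9".upper()))            # ['ESRS X9'] -> not in esrs_truth -> "wrong_esrs"
print(re.findall(pattern, "What is double materiality?".upper()))      # [] -> "none"
```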
celsius_csrd_chatbot/chains/esrs_intent.py
CHANGED
@@ -44,17 +44,16 @@ class ESRSAnalysis(BaseModel):

         Follow these guidelines :

-        - Some questions could be related to multiple ESRS. In this case, keep all options and format the output as such : 'ESRS 1', 'ESRS 2'.
+        - Some questions could be related to multiple ESRS. In such case, choose the most appropriate one.
         - Remember, if the question is not related to any ESRS, the output should be 'none'.
         """,
     )


-def make_esrs_categorization_chain(llm):
+def make_esrs_intent_chain(llm):
     parser = PydanticOutputParser(pydantic_object=ESRSAnalysis)
     prompt_template = """
-    The following question is about ESRS related topics. Please analyze the question and indicate if it refers to specific ESRS.
+    The following question is about ESRS related topics. Please analyze the question and indicate if it refers to a specific ESRS.

     {format_instructions}
@@ -74,26 +73,16 @@ def make_esrs_categorization_chain(llm):
     return chain


-def make_esrs_categorization_node(llm):
+def make_esrs_intent_node(llm):

-    def categorize_message(state):
+    def intent_message(state):
         query = state["query"]
-        categorization_chain = make_esrs_categorization_chain(llm)
+        categorization_chain = make_esrs_intent_chain(llm)
         output = categorization_chain.invoke(query)

-        if not output:
-            raise OutputParserException("Output is empty")
-
-        try:
-            # Attempt to parse the output as JSON
-            parsed_output = json.loads(output)
-        except json.JSONDecodeError as e:
-            # Raise a more informative error if the output is not valid JSON
-            raise OutputParserException(f"Invalid JSON output: {output}") from e
-
         return output

-    return categorize_message
+    return intent_message

 # intent: str = Field(
 #     enum=[
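The intent node keeps the Pydantic-parsed LLM classification for queries where the regex categorizer found no ESRS mention. A sketch of the intended standalone use, assuming an `llm` runnable is available:

```python
# Hypothetical standalone use of the intent node; `llm` setup omitted.
intent_node = make_esrs_intent_node(llm)
output = intent_node({"query": "How should a company report its climate transition plan?"})
print(output)  # ESRSAnalysis-shaped result, e.g. esrs_type "ESRS E1"
```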
celsius_csrd_chatbot/chains/retriever.py
CHANGED
@@ -6,14 +6,15 @@ def make_retriever_node(vectorstore, k=10):
     if sources == "none":
         filters_full = {}
     else:
-        filters_full = {"
+        filters_full = {"ESRS_filter": {"$in": sources}}
     docs = []
-    docs = retriever.similarity_search_with_score(
+    docs_retrieved = vectorstore.similarity_search_with_score(
         query=query, filter=filters_full, k=k
     )
-    for doc in
+    for doc in docs_retrieved:
+        doc_append = doc[0]
+        doc_append.metadata["similarity_score"] = doc[1]
+        docs.append(doc_append)

     docs = sorted(docs, key=lambda x: x.metadata["similarity_score"], reverse=True)
     new_state = {"documents": docs}
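Retrieval is now restricted by the detected standards using Pinecone's `$in` metadata operator, and the `(Document, score)` pairs returned by `similarity_search_with_score` have their score stashed into metadata for sorting. The filter shape, with illustrative values:

```python
# Shape of the metadata filter built by the node (values illustrative).
sources = ["ESRS E1", "ESRS E4"]
filters_full = {"ESRS_filter": {"$in": sources}}
# similarity_search_with_score(query=..., filter=filters_full, k=10) returns
# (Document, score) tuples; the loop in the diff unpacks each pair and writes
# the score into doc.metadata["similarity_score"] before sorting descending.
```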
celsius_csrd_chatbot/utils.py
CHANGED
@@ -1,4 +1,5 @@
 import os
+import re
 from typing import Tuple, List
 from dotenv import load_dotenv
 from msal import ConfidentialClientApplication
@@ -58,13 +59,13 @@ def make_pairs(lst):
 def make_html_source(i, doc):
     if doc.metadata["source"] == "ESRS":
         return f"""
-<div class="card">
+<div class="card" id="doc{i}">
     <div class="card-content">
         <h3>Doc {i}</h2>
         <p>{doc.page_content}</p>
     </div>
     <div class="card-footer">
-        <span>{doc.metadata['
+        <span>{doc.metadata['ESRS_filter']} \n</span>
         <span>DR: {doc.metadata['DR']} \n</span>
         <span>Data type: {doc.metadata['Data type']} \n</span>
     </div>
@@ -82,3 +83,24 @@ def make_html_source(i, doc):
     </div>
 </div>
 """
+
+
+def parse_output_llm_with_sources(output):
+    # Split the content into a list of text and "[Doc X]" references
+    content_parts = re.split(r"\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]", output)
+    parts = []
+    for part in content_parts:
+        if part.startswith("Doc"):
+            subparts = part.split(",")
+            subparts = [
+                subpart.lower().replace("doc", "").strip() for subpart in subparts
+            ]
+            subparts = [
+                f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{subpart}</sup></span></a>"""
+                for subpart in subparts
+            ]
+            parts.append("".join(subparts))
+        else:
+            parts.append(part)
+    content_parts = "".join(parts)
+    return content_parts
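`parse_output_llm_with_sources` turns the `[Doc i]` citations that the answering prompt asks for into HTML anchors pointing at the matching source card, whose `id="doc{i}"` is added by `make_html_source` in this same commit. An example of the transformation (output trimmed for readability):

```python
text = "Scope 1 covers direct emissions [Doc 1, Doc 3]."
print(parse_output_llm_with_sources(text))
# -> 'Scope 1 covers direct emissions <a href="#doc1" ...><sup>1</sup></a><a href="#doc3" ...><sup>3</sup></a>.'
```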
poetry.lock
CHANGED
The diff for this file is too large to render. See raw diff
pyproject.toml
CHANGED
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = ""
 authors = ["Miguel Omenaca Muro <miguel.omenacamuro@ekimetrics.com>"]
 readme = "README.md"
-package-mode = false
+package-mode = true

 [tool.poetry.dependencies]
 python = ">=3.10,<3.13"
@@ -17,7 +17,10 @@ loadenv = "^0.1.1"
 openai = "^1.34.0"
 langchain-openai = "^0.1.8"
 pinecone = "^4.0.0"
-pinecone-client = "^
+pinecone-client = "^5.0.1"
+langgraph = "^0.2.0"
+langchain-core = "^0.2.29"
+ipython = "^8.26.0"

 [build-system]