Spaces:

Ekimetrics
/

climate-question-answering

Running

App Files Files Community

timeki committed on Aug 27

Commit

0110684

2 Parent(s): d37790b ecc6c98

Merged in dev (pull request #27)

Browse files

Files changed (24) hide show

app.py +6 -12
climateqa/engine/chains/retrieve_documents.py +25 -11
climateqa/engine/graph_retriever.py +3 -4
climateqa/engine/llm/openai.py +0 -1
climateqa/engine/talk_to_data/input_processing.py +73 -8
climateqa/engine/talk_to_data/ipcc/config.py +16 -4
climateqa/engine/talk_to_data/ipcc/plot_informations.py +23 -0
climateqa/engine/talk_to_data/ipcc/plots.py +81 -3
climateqa/engine/talk_to_data/ipcc/queries.py +65 -5
climateqa/engine/talk_to_data/main.py +2 -2
climateqa/engine/talk_to_data/myVanna.py +0 -13
climateqa/engine/talk_to_data/plot.py +0 -418
climateqa/engine/talk_to_data/sql_query.py +0 -114
climateqa/engine/talk_to_data/talk_to_drias.py +0 -317
climateqa/engine/talk_to_data/utils.py +0 -281
climateqa/engine/talk_to_data/vanna_class.py +0 -325
climateqa/engine/talk_to_data/workflow/drias.py +8 -3
climateqa/engine/talk_to_data/workflow/ipcc.py +20 -7
climateqa/engine/vectorstore.py +137 -45
climateqa/utils.py +1 -1
front/tabs/tab_ipcc.py +2 -0
requirements.txt +3 -0
sandbox/20241104 - CQA - StepByStep CQA.ipynb +0 -0
style.css +0 -1

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ from azure.storage.fileshare import ShareServiceClient
 # Import custom modules
 from climateqa.engine.embeddings import get_embeddings_function
 from climateqa.engine.llm import get_llm
-from climateqa.engine.vectorstore import get_pinecone_vectorstore
 from climateqa.engine.reranker import get_reranker
 from climateqa.engine.graph import make_graph_agent, make_graph_agent_poc
 from climateqa.engine.chains.retrieve_papers import find_papers
@@ -66,17 +66,11 @@ user_id = create_user_id()
 # Create vectorstore and retriever
 embeddings_function = get_embeddings_function()
-vectorstore = get_pinecone_vectorstore(
-    embeddings_function, index_name=os.getenv("PINECONE_API_INDEX")
-)
-vectorstore_graphs = get_pinecone_vectorstore(
-    embeddings_function,
-    index_name=os.getenv("PINECONE_API_INDEX_OWID"),
-    text_key="description",
-)
-vectorstore_region = get_pinecone_vectorstore(
-    embeddings_function, index_name=os.getenv("PINECONE_API_INDEX_LOCAL_V2")
-)
 llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)
 if os.environ["GRADIO_ENV"] == "local":

 # Import custom modules
 from climateqa.engine.embeddings import get_embeddings_function
 from climateqa.engine.llm import get_llm
+from climateqa.engine.vectorstore import get_vectorstore
 from climateqa.engine.reranker import get_reranker
 from climateqa.engine.graph import make_graph_agent, make_graph_agent_poc
 from climateqa.engine.chains.retrieve_papers import find_papers
 # Create vectorstore and retriever
 embeddings_function = get_embeddings_function()
+vectorstore = get_vectorstore(provider="azure_search", embeddings=embeddings_function, index_name="climateqa-ipx")
+vectorstore_graphs = get_vectorstore(provider="azure_search", embeddings=embeddings_function, index_name="climateqa-owid", text_key="description")
+vectorstore_region = get_vectorstore(provider="azure_search", embeddings=embeddings_function, index_name="climateqa-v2")
 llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)
 if os.environ["GRADIO_ENV"] == "local":

climateqa/engine/chains/retrieve_documents.py CHANGED Viewed

@@ -19,7 +19,7 @@ from ..llm import get_llm
 from .prompts import retrieve_chapter_prompt_template
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
-from ..vectorstore import get_pinecone_vectorstore
 from ..embeddings import get_embeddings_function
 import ast
@@ -134,7 +134,7 @@ def get_ToCs(version: str) :
         "version": version
     }
     embeddings_function = get_embeddings_function()
-    vectorstore = get_pinecone_vectorstore(embeddings_function, index_name="climateqa-v2")
     tocs = vectorstore.similarity_search_with_score(query="",filter = filters_text)
     # remove duplicates or almost duplicates
@@ -236,7 +236,7 @@ async def get_POC_documents_by_ToC_relevant_documents(
     filters_text_toc = {
         **filters,
         "chunk_type":"text",
-        "toc_level0": {"$in": toc_filters},
         "version": version
         # "report_type": {}, # TODO  to be completed to choose the right documents / chapters according to the analysis of the question
     }
@@ -273,6 +273,22 @@ async def get_POC_documents_by_ToC_relevant_documents(
         "docs_images" : docs_images
     }
 async def get_IPCC_relevant_documents(
     query: str,
@@ -299,9 +315,9 @@ async def get_IPCC_relevant_documents(
     filters = {}
     if len(reports) > 0:
-        filters["short_name"] = {"$in":reports}
     else:
-        filters["source"] = { "$in": sources}
     # INIT
     docs_summaries = []
@@ -323,18 +339,16 @@ async def get_IPCC_relevant_documents(
         filters_summaries = {
             **filters,
             "chunk_type":"text",
-            "report_type": { "$in":["SPM"]},
         }
         docs_summaries = vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = k_summary)
         docs_summaries = [x for x in docs_summaries if x[1] > threshold]
         # Search for k_total - k_summary documents in the full reports dataset
-        filters_full = {
-            **filters,
-            "chunk_type":"text",
-            "report_type": { "$nin":["SPM"]},
-        }
         docs_full = vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_total)
         if search_figures:

 from .prompts import retrieve_chapter_prompt_template
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
+from ..vectorstore import get_vectorstore
 from ..embeddings import get_embeddings_function
 import ast
         "version": version
     }
     embeddings_function = get_embeddings_function()
+    vectorstore = get_vectorstore(provider="qdrant", embeddings=embeddings_function, index_name="climateqa")
     tocs = vectorstore.similarity_search_with_score(query="",filter = filters_text)
     # remove duplicates or almost duplicates
     filters_text_toc = {
         **filters,
         "chunk_type":"text",
+        "toc_level0": toc_filters,  # Changed from {"$in": toc_filters} to direct list
         "version": version
         # "report_type": {}, # TODO  to be completed to choose the right documents / chapters according to the analysis of the question
     }
         "docs_images" : docs_images
     }
def filter_for_full_report_documents(filters: dict) -> dict:
    """
    Build the search filter used for the full-report documents.

    The incoming ``filters`` mapping is not modified; a new dictionary is
    returned that contains every base filter plus:

    - ``chunk_type`` forced to ``"text"``;
    - ``report_type_exclude`` set to ``["SPM"]`` (the ``_exclude``-suffixed
      key format, used to leave out the "SPM" report type).

    NOTE(review): turning the ``_exclude`` suffix into a provider-specific
    filter expression presumably happens in the vectorstore wrapper, which
    is not visible from this function - confirm there.

    Args:
        filters (dict): Base filters shared by the searches.

    Returns:
        dict: A new filter dictionary for the full-report search.
    """
    return {
        **filters,
        "chunk_type": "text",
        "report_type_exclude": ["SPM"],
    }
 async def get_IPCC_relevant_documents(
     query: str,
     filters = {}
     if len(reports) > 0:
+        filters["short_name"] = reports  # Changed from {"$in":reports} to direct list
     else:
+        filters["source"] = sources  # Changed from {"$in": sources} to direct list
     # INIT
     docs_summaries = []
         filters_summaries = {
             **filters,
             "chunk_type":"text",
+            "report_type": ["SPM"],  # Changed from {"$in":["SPM"]} to direct list
         }
         docs_summaries = vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = k_summary)
         docs_summaries = [x for x in docs_summaries if x[1] > threshold]
         # Search for k_total - k_summary documents in the full reports dataset
+        filters_full = filter_for_full_report_documents(filters)
         docs_full = vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_total)
         if search_figures:

climateqa/engine/graph_retriever.py CHANGED Viewed

@@ -60,10 +60,9 @@ async def retrieve_graphs(
         assert sources
         assert any([x in ["OWID"] for x in sources])
-        # Prepare base search kwargs
-        filters = {}
-        filters["source"] = {"$in": sources}
         docs = vectorstore.similarity_search_with_score(query=query, filter=filters, k=k_total)

         assert sources
         assert any([x in ["OWID"] for x in sources])
+        # Prepare base search kwargs for Azure AI Search
+        # Azure expects a filter string, e.g. "source eq 'OWID' or source eq 'IEA'"
+        filters = {"source":"OWID"}
         docs = vectorstore.similarity_search_with_score(query=query, filter=filters, k=k_total)

climateqa/engine/llm/openai.py CHANGED Viewed

@@ -8,7 +8,6 @@ except Exception:
     pass
 def get_llm(model="gpt-4o-mini",max_tokens=1024, temperature=0.0, streaming=True,timeout=30, **kwargs):
     llm = ChatOpenAI(
         model=model,
         api_key=os.environ.get("THEO_API_KEY", None),

     pass
 def get_llm(model="gpt-4o-mini",max_tokens=1024, temperature=0.0, streaming=True,timeout=30, **kwargs):
     llm = ChatOpenAI(
         model=model,
         api_key=os.environ.get("THEO_API_KEY", None),

climateqa/engine/talk_to_data/input_processing.py CHANGED Viewed

@@ -10,6 +10,7 @@ from climateqa.engine.talk_to_data.objects.llm_outputs import ArrayOutput
 from climateqa.engine.talk_to_data.objects.location import Location
 from climateqa.engine.talk_to_data.objects.plot import Plot
 from climateqa.engine.talk_to_data.objects.states import State
 async def detect_location_with_openai(sentence: str) -> str:
     """
@@ -118,7 +119,7 @@ async def detect_year_with_openai(sentence: str) -> str:
         return years_list[0]
     else:
         return ""
 async def detect_relevant_tables(user_question: str, plot: Plot, llm, table_names_list: list[str]) -> list[str]:
     """Identifies relevant tables for a plot based on user input.
@@ -227,6 +228,55 @@ async def find_year(user_input: str) -> str| None:
         return None
     return year
 async def find_relevant_plots(state: State, llm, plots: list[Plot]) -> list[str]:
     print("---- Find relevant plots ----")
     relevant_plots = await detect_relevant_plots(state['user_input'], llm, plots)
@@ -237,16 +287,28 @@ async def find_relevant_tables_per_plot(state: State, plot: Plot, llm, tables: l
     relevant_tables = await detect_relevant_tables(state['user_input'], plot, llm, tables)
     return relevant_tables
-async def find_param(state: State, param_name:str, mode: Literal['DRIAS', 'IPCC'] = 'DRIAS') -> dict[str, Optional[str]] | Location | None:
-    """Perform the good method to retrieve the desired parameter
     Args:
-        state (State): state of the workflow
-        param_name (str): name of the desired parameter
-        table (str): name of the table
     Returns:
-        dict[str, Any] | None:
     """
     if param_name == 'location':
         location = await find_location(state['user_input'], mode)
@@ -254,4 +316,7 @@ async def find_param(state: State, param_name:str, mode: Literal['DRIAS', 'IPCC'
     if param_name == 'year':
         year = await find_year(state['user_input'])
         return {'year': year}
-    return None

 from climateqa.engine.talk_to_data.objects.location import Location
 from climateqa.engine.talk_to_data.objects.plot import Plot
 from climateqa.engine.talk_to_data.objects.states import State
+import calendar
 async def detect_location_with_openai(sentence: str) -> str:
     """
         return years_list[0]
     else:
         return ""
 async def detect_relevant_tables(user_question: str, plot: Plot, llm, table_names_list: list[str]) -> list[str]:
     """Identifies relevant tables for a plot based on user input.
         return None
     return year
async def find_month(user_input: str) -> dict[str, str|None]:
    """
    Extracts month information from user input using an LLM.

    The LLM is asked for a Python list of month numbers; only the first entry
    is used. The answer is validated before use: anything that is not a
    non-empty list whose first entry is an integer between 1 and 12 is
    treated as "no month found" instead of raising or producing an empty
    month name.

    Args:
        user_input (str): The user's query text.

    Returns:
        dict[str, str|None]: A dictionary with keys:
            - "month_number": the month number as a string (e.g. '7'), or None if not found
            - "month_name": the full English month name (e.g. 'July'), or None if not found

    Example:
        >>> await find_month("Show me the temperature in Paris in July")
        {'month_number': '7', 'month_name': 'July'}
        >>> await find_month("Show me the temperature in Paris")
        {'month_number': None, 'month_name': None}
    """
    llm = get_llm()
    prompt = """
    Extract the month (as a number from 1 to 12) mentioned in the following sentence.
    Return the result as a Python list of integers. If no month is mentioned, return an empty list.
    Sentence: "{sentence}"
    """
    prompt = ChatPromptTemplate.from_template(prompt)
    structured_llm = llm.with_structured_output(ArrayOutput)
    chain = prompt | structured_llm
    response: ArrayOutput = await chain.ainvoke({"sentence": user_input})

    no_month: dict[str, str|None] = {"month_number": None, "month_name": None}

    # The LLM answer is free text: a malformed literal means "no month".
    try:
        months_list = ast.literal_eval(response['array'])
    except (ValueError, SyntaxError):
        return no_month
    if not isinstance(months_list, (list, tuple)) or not months_list:
        return no_month

    try:
        month_number = int(months_list[0])
    except (TypeError, ValueError):
        return no_month

    # calendar.month_name[0] is '' and indexes above 12 raise IndexError,
    # so anything outside 1..12 is rejected explicitly.
    if not 1 <= month_number <= 12:
        return no_month

    return {
        "month_number": str(month_number),
        "month_name": calendar.month_name[month_number],
    }
 async def find_relevant_plots(state: State, llm, plots: list[Plot]) -> list[str]:
     print("---- Find relevant plots ----")
     relevant_plots = await detect_relevant_plots(state['user_input'], llm, plots)
     relevant_tables = await detect_relevant_tables(state['user_input'], plot, llm, tables)
     return relevant_tables
+async def find_param(state: State, param_name: str, mode: Literal['DRIAS', 'IPCC'] = 'DRIAS') -> dict[str, Optional[str]] | Location | None:
+    """
+    Retrieves a specific parameter (location, year, month, etc.) from the user's input using the appropriate extraction method.
     Args:
+        state (State): The current state containing at least the user's input under 'user_input'.
+        param_name (str): The name of the parameter to extract. Supported: 'location', 'year', 'month'.
+        mode (Literal['DRIAS', 'IPCC']): The data mode to use for location extraction.
     Returns:
+        - For 'location': a Location object (dict with keys like 'location', 'latitude', etc.), or None if not found.
+        - For 'year': a dict {'year': year or None}.
+        - For 'month': a dict {'month_number': str or None, 'month_name': str or None}.
+        - None if the parameter is not recognized or not found.
+    Example:
+        >>> await find_param(state, 'location')
+        {'location': 'Paris', 'latitude': ..., ...}
+        >>> await find_param(state, 'year')
+        {'year': '2050'}
+        >>> await find_param(state, 'month')
+        {'month_number': '7', 'month_name': 'July'}
     """
     if param_name == 'location':
         location = await find_location(state['user_input'], mode)
     if param_name == 'year':
         year = await find_year(state['user_input'])
         return {'year': year}
+    if param_name == 'month':
+        month = await find_month(state['user_input'])
+        return month
+    return None

climateqa/engine/talk_to_data/ipcc/config.py CHANGED Viewed

@@ -6,16 +6,22 @@ from climateqa.engine.talk_to_data.config import IPCC_DATASET_URL
 IPCC_TABLES = [
     "mean_temperature",
     "total_precipitation",
 ]
 IPCC_INDICATOR_COLUMNS_PER_TABLE = {
     "mean_temperature": "mean_temperature",
-    "total_precipitation": "total_precipitation"
 }
 IPCC_INDICATOR_TO_UNIT = {
     "mean_temperature": "°C",
-    "total_precipitation": "mm/day"
 }
 IPCC_SCENARIO = [
@@ -30,7 +36,8 @@ IPCC_MODELS = []
 IPCC_PLOT_PARAMETERS = [
     'year',
-    'location'
 ]
 MACRO_COUNTRIES = ['JP',
@@ -63,7 +70,9 @@ HUGE_MACRO_COUNTRIES = ['CL',
 IPCC_INDICATOR_TO_COLORSCALE = {
     "mean_temperature": TEMPERATURE_COLORSCALE,
-    "total_precipitation": PRECIPITATION_COLORSCALE
 }
 IPCC_UI_TEXT = """
@@ -77,9 +86,12 @@ By default, we take the **mediane of each climate model**.
 Current available charts :
 - Yearly evolution of an indicator at a specific location (historical + SSP Projections)
 - Yearly spatial distribution of an indicator in a specific country
 Current available indicators :
 - Mean temperature
 - Total precipitation
 For example, you can ask:

 IPCC_TABLES = [
     "mean_temperature",
     "total_precipitation",
+    "minimum_temperature",
+    "maximum_temperature"
 ]
 IPCC_INDICATOR_COLUMNS_PER_TABLE = {
     "mean_temperature": "mean_temperature",
+    "total_precipitation": "total_precipitation",
+    "minimum_temperature": "minimum_temperature",
+    "maximum_temperature": "maximum_temperature"
 }
 IPCC_INDICATOR_TO_UNIT = {
     "mean_temperature": "°C",
+    "total_precipitation": "mm/day",
+    "minimum_temperature": "°C",
+    "maximum_temperature": "°C"
 }
 IPCC_SCENARIO = [
 IPCC_PLOT_PARAMETERS = [
     'year',
+    'location',
+    'month'
 ]
 MACRO_COUNTRIES = ['JP',
 IPCC_INDICATOR_TO_COLORSCALE = {
     "mean_temperature": TEMPERATURE_COLORSCALE,
+    "total_precipitation": PRECIPITATION_COLORSCALE,
+    "minimum_temperature": TEMPERATURE_COLORSCALE,
+    "maximum_temperature": TEMPERATURE_COLORSCALE,
 }
 IPCC_UI_TEXT = """
 Current available charts :
 - Yearly evolution of an indicator at a specific location (historical + SSP Projections)
 - Yearly spatial distribution of an indicator in a specific country
+- Yearly evolution of an indicator in a specific month at a specific location (historical + SSP Projections)
 Current available indicators :
 - Mean temperature
+- Minimum temperature
+- Maximum temperature
 - Total precipitation
 For example, you can ask:

climateqa/engine/talk_to_data/ipcc/plot_informations.py CHANGED Viewed

@@ -47,4 +47,27 @@ Each grid point is colored according to the value of the indicator ({unit}), all
 - For each grid point of {location} country ({country_name}), the value of {indicator} in {year} and for the selected scenario is extracted and mapped to its geographic coordinates.
 - The grid points correspond to 1-degree squares centered on the grid points of the IPCC dataset. Each grid point has been mapped to a country using [**reverse_geocoder**](https://github.com/thampiman/reverse-geocoder).
 - The coordinates used for each region are those of the closest available grid point in the IPCC database, which uses a regular grid with a spatial resolution of 1 degree.
 """

 - For each grid point of {location} country ({country_name}), the value of {indicator} in {year} and for the selected scenario is extracted and mapped to its geographic coordinates.
 - The grid points correspond to 1-degree squares centered on the grid points of the IPCC dataset. Each grid point has been mapped to a country using [**reverse_geocoder**](https://github.com/thampiman/reverse-geocoder).
 - The coordinates used for each region are those of the closest available grid point in the IPCC database, which uses a regular grid with a spatial resolution of 1 degree.
+"""
def indicator_specific_month_evolution_informations(
        indicator: str,
        params: dict[str, str]
) -> str:
    """
    Build the explanatory text shown with the "specific month evolution" plot.

    Args:
        indicator (str): Indicator name; it is used both in the text and as
            the key to look up the unit in ``IPCC_INDICATOR_TO_UNIT``.
        params (dict[str, str]): Plot parameters; must contain the keys
            ``"location"`` and ``"month_name"``.

    Returns:
        str: The markdown description of the plot and of its data source.

    Raises:
        ValueError: If ``"location"`` or ``"month_name"`` is absent from ``params``.
        KeyError: If ``indicator`` has no entry in ``IPCC_INDICATOR_TO_UNIT``.
    """
    # Checked in this order so the first missing key is the one reported.
    for required_key in ("location", "month_name"):
        if required_key not in params:
            raise ValueError(f'"{required_key}" must be provided in params')

    location = params["location"]
    month = params["month_name"]
    unit = IPCC_INDICATOR_TO_UNIT[indicator]
    return f"""
This plot shows how the climate indicator **{indicator}** evolves over time in **{location}** for the month of **{month}**.
It combines both historical (from 1950 to 2015) observations and future (from 2016 to 2100) projections for the different SSP climate scenarios (SSP126, SSP245, SSP370 and SSP585).
The x-axis represents the years (from 1950 to 2100), and the y-axis shows the value of the {indicator} ({unit}) for the selected month.
Each line corresponds to a different scenario, allowing you to compare how {indicator} for month {month} might change under various future conditions.
**Data source:**
- The data comes from the IPCC climate datasets (Parquet files) for the relevant indicator, location, and month.
- For each year and scenario, the value of {indicator} for month {month} is extracted for the selected location.
- The coordinates used for {location} correspond to the closest available point in the IPCC database, which uses a regular grid with a spatial resolution of 1 degree.
"""

climateqa/engine/talk_to_data/ipcc/plots.py CHANGED Viewed

@@ -5,8 +5,8 @@ import pandas as pd
 import geojson
 from climateqa.engine.talk_to_data.ipcc.config import IPCC_INDICATOR_TO_COLORSCALE, IPCC_INDICATOR_TO_UNIT, IPCC_SCENARIO
-from climateqa.engine.talk_to_data.ipcc.plot_informations import choropleth_map_informations, indicator_evolution_informations
-from climateqa.engine.talk_to_data.ipcc.queries import indicator_for_given_year_query, indicator_per_year_at_location_query
 from climateqa.engine.talk_to_data.objects.plot import Plot
 def generate_geojson_polygons(latitudes: list[float], longitudes: list[float], indicators: list[float]) -> geojson.FeatureCollection:
@@ -102,6 +102,82 @@ indicator_evolution_at_location_historical_and_projections: Plot = {
     "short_name": "Evolution"
 }
 def plot_choropleth_map_of_country_indicator_for_specific_year(
     params: dict,
 ) -> Callable[[pd.DataFrame], Figure]:
@@ -167,6 +243,7 @@ def plot_choropleth_map_of_country_indicator_for_specific_year(
     return plot_data
 choropleth_map_of_country_indicator_for_specific_year: Plot = {
     "name": "Choropleth Map of a Country's Indicator Distribution for a Specific Year",
     "description": (
@@ -185,5 +262,6 @@ choropleth_map_of_country_indicator_for_specific_year: Plot = {
 IPCC_PLOTS = [
     indicator_evolution_at_location_historical_and_projections,
-    choropleth_map_of_country_indicator_for_specific_year
 ]

 import geojson
 from climateqa.engine.talk_to_data.ipcc.config import IPCC_INDICATOR_TO_COLORSCALE, IPCC_INDICATOR_TO_UNIT, IPCC_SCENARIO
+from climateqa.engine.talk_to_data.ipcc.plot_informations import choropleth_map_informations, indicator_evolution_informations, indicator_specific_month_evolution_informations
+from climateqa.engine.talk_to_data.ipcc.queries import indicator_for_given_year_query, indicator_per_year_and_specific_month_at_location_query, indicator_per_year_at_location_query
 from climateqa.engine.talk_to_data.objects.plot import Plot
 def generate_geojson_polygons(latitudes: list[float], longitudes: list[float], indicators: list[float]) -> geojson.FeatureCollection:
     "short_name": "Evolution"
 }
def plot_indicator_monthly_evolution_at_location(
    params: dict,
) -> Callable[[pd.DataFrame], Figure]:
    """
    Create the plotting function for the evolution of an indicator, for one
    given month, over the years at one location (one line per scenario).

    Args:
        params (dict): Dictionary with the keys read here:
            - indicator_column (str): Name of the DataFrame column holding the indicator.
            - location (str): Location name, used in the figure title.
            - month_name (str): Month name, used in the figure title.

    Returns:
        Callable[[pd.DataFrame], Figure]: Function that takes a DataFrame with
        the columns ``year``, ``scenario`` and the indicator column, and
        returns a Plotly Figure.
    """
    indicator = params["indicator_column"]
    location = params["location"]
    month = params["month_name"]
    # "mean_temperature" -> "Mean Temperature"
    indicator_label = " ".join(part.capitalize() for part in indicator.split("_"))
    unit = IPCC_INDICATOR_TO_UNIT.get(indicator, "")

    def plot_data(df: pd.DataFrame) -> Figure:
        ordered = df.sort_values(by='year')
        # One (year, value, scenario) triple per row, in chronological order.
        points = list(zip(
            ordered['year'].astype(int).tolist(),
            ordered[indicator].astype(float).tolist(),
            ordered['scenario'].astype(str).tolist(),
        ))

        # Latest historical point: projections start from it so that the
        # lines are visually continuous with the historical one.
        historical_points = [(yr, val) for yr, val, name in points if name == 'historical']
        if historical_points:
            anchor_year, anchor_value = historical_points[-1]
        else:
            anchor_year, anchor_value = None, None

        fig = go.Figure()
        for scenario in IPCC_SCENARIO:
            xs = [yr for yr, _, name in points if name == scenario]
            ys = [val for _, val, name in points if name == scenario]
            if scenario != 'historical' and anchor_value is not None:
                xs = [anchor_year, *xs]
                ys = [anchor_value, *ys]
            trace = go.Scatter(x=xs, y=ys, mode='lines', name=scenario)
            fig.add_trace(trace)

        fig.update_layout(
            title=f'Evolution of {indicator_label} in {month} in {location} (Historical + SSP Scenarios)',
            xaxis_title='Year',
            yaxis_title=f'{indicator_label} ({unit})',
            legend_title='Scenario',
            height=800,
        )
        return fig

    return plot_data
+indicator_specific_month_evolution_at_location: Plot = {
+    "name": "Indicator specific month Evolution at Location (Historical + Projections)",
+    "description": (
+        "Shows how a climate indicator (e.g., rainfall, temperature) for a specific month changes over time at a specific location, "
+        "including historical data and future projections. "
+        "Useful for questions about the value or trend of an indicator for a given month at a location, "
+        "such as 'How does July temperature evolve in Paris over time?'. "
+        "Parameters: indicator_column (the climate variable), location (e.g., country, city), month (1-12)."
+    ),
+    "params": ["indicator_column", "location", "month"],
+    "plot_function": plot_indicator_monthly_evolution_at_location,
+    "sql_query": indicator_per_year_and_specific_month_at_location_query,
+    "plot_information": indicator_specific_month_evolution_informations,
+    "short_name": "Evolution for a specific month"
+}
 def plot_choropleth_map_of_country_indicator_for_specific_year(
     params: dict,
 ) -> Callable[[pd.DataFrame], Figure]:
     return plot_data
 choropleth_map_of_country_indicator_for_specific_year: Plot = {
     "name": "Choropleth Map of a Country's Indicator Distribution for a Specific Year",
     "description": (
 IPCC_PLOTS = [
     indicator_evolution_at_location_historical_and_projections,
+    choropleth_map_of_country_indicator_for_specific_year,
+    indicator_specific_month_evolution_at_location
 ]

climateqa/engine/talk_to_data/ipcc/queries.py CHANGED Viewed

@@ -43,7 +43,7 @@ def indicator_per_year_at_location_query(
         return ""
     if country_code in MACRO_COUNTRIES:
-        table_path = f"'{IPCC_DATASET_URL}/{table.lower()}/{country_code}_macro.parquet'"
         sql_query = f"""
         SELECT year, scenario, AVG({indicator_column}) as {indicator_column}
         FROM {table_path}
@@ -52,7 +52,7 @@ def indicator_per_year_at_location_query(
         ORDER BY year, scenario
         """
     elif country_code in HUGE_MACRO_COUNTRIES:
-        table_path = f"'{IPCC_DATASET_URL}/{table.lower()}/{country_code}_macro.parquet'"
         sql_query = f"""
         SELECT year, scenario, {indicator_column}
         FROM {table_path}
@@ -75,6 +75,66 @@ def indicator_per_year_at_location_query(
         """
     return sql_query.strip()
 class IndicatorForGivenYearQueryParams(TypedDict, total=False):
     """
     Parameters for querying an indicator's values across locations for a specific year.
@@ -110,7 +170,7 @@ def indicator_for_given_year_query(
         return ""
     if country_code in MACRO_COUNTRIES:
-        table_path = f"'{IPCC_DATASET_URL}/{table.lower()}/{country_code}_macro.parquet'"
         sql_query = f"""
         SELECT latitude, longitude, scenario, AVG({indicator_column}) as {indicator_column}
         FROM {table_path}
@@ -119,7 +179,7 @@ def indicator_for_given_year_query(
         ORDER BY latitude, longitude, scenario
         """
     elif country_code in HUGE_MACRO_COUNTRIES:
-        table_path = f"'{IPCC_DATASET_URL}/{table.lower()}/{country_code}_macro.parquet'"
         sql_query = f"""
         SELECT latitude, longitude, scenario, {indicator_column}
         FROM {table_path}
@@ -141,4 +201,4 @@ def indicator_for_given_year_query(
         ORDER BY latitude, longitude, scenario
         """
-    return sql_query.strip()

         return ""
     if country_code in MACRO_COUNTRIES:
+        table_path = f"'{IPCC_DATASET_URL}/{table.lower()}/{country_code}_monthly_macro.parquet'"
         sql_query = f"""
         SELECT year, scenario, AVG({indicator_column}) as {indicator_column}
         FROM {table_path}
         ORDER BY year, scenario
         """
     elif country_code in HUGE_MACRO_COUNTRIES:
+        table_path = f"'{IPCC_DATASET_URL}/{table.lower()}/{country_code}_annualy_macro.parquet'"
         sql_query = f"""
         SELECT year, scenario, {indicator_column}
         FROM {table_path}
         """
     return sql_query.strip()
class IndicatorPerYearAndSpecificMonthAtLocationQueryParams(TypedDict, total=False):
    """
    Parameters for querying the evolution of an indicator per year for a specific month at a specific location.

    All keys are optional (``total=False``).

    Attributes:
        indicator_column (str): Name of the climate indicator column.
        latitude (str): Latitude of the location.
        longitude (str): Longitude of the location.
        country_code (str): Country code.
        month (str): Month targeted.
        month_number (str): Month number; this is the key looked up by
            ``indicator_per_year_and_specific_month_at_location_query``.
    """
    indicator_column: str
    latitude: str
    longitude: str
    country_code: str
    # Kept for backward compatibility with existing annotations/callers.
    month: str
    # Declared so the key the query builder reads is part of the typed contract.
    month_number: str
def indicator_per_year_and_specific_month_at_location_query(
    table: str, params: IndicatorPerYearAndSpecificMonthAtLocationQueryParams
) -> str:
    """
    Builds an SQL query to get the evolution of an indicator per year for a specific month at a specific location.

    Args:
        table (str): SQL table of the indicator (lower-cased to build the parquet path).
        params (dict): Dictionary with required params:
            - indicator_column (str)
            - latitude (str or float)
            - longitude (str or float)
            - country_code (str)
            - month_number (str or int): month number, from 1 to 12

    Returns:
        str: The SQL query string, or an empty string when a required
        parameter is missing or ``month_number`` is not a valid month.
    """
    indicator_column = params.get("indicator_column")
    latitude = params.get("latitude")
    longitude = params.get("longitude")
    country_code = params.get("country_code")
    month = params.get('month_number')

    # Only None / empty string count as missing: a plain truthiness test
    # would wrongly reject a latitude or longitude equal to 0.
    required = (indicator_column, latitude, longitude, country_code, month)
    if any(value is None or value == "" for value in required):
        return ""

    # The month is interpolated into the SQL text, so make sure it really is
    # a month number before building the query.
    try:
        month = int(month)
    except (TypeError, ValueError):
        return ""
    if not 1 <= month <= 12:
        return ""

    if country_code in (MACRO_COUNTRIES+HUGE_MACRO_COUNTRIES):
        table_path = f"'{IPCC_DATASET_URL}/{table.lower()}/{country_code}_monthly_macro.parquet'"
        sql_query = f"""
        SELECT year, scenario, {indicator_column}
        FROM {table_path}
        WHERE latitude = {latitude} AND longitude = {longitude} AND year >= 1950 AND month={month}
        ORDER BY year, scenario
        """
    else:
        table_path = f"'{IPCC_DATASET_URL}/{table.lower()}/{country_code}.parquet'"
        # ORDER BY added so both branches return rows in the same,
        # deterministic order (GROUP BY alone does not guarantee one).
        sql_query = f"""
        SELECT year, scenario, MEDIAN({indicator_column}) AS {indicator_column}
        FROM {table_path}
        WHERE latitude = {latitude} AND longitude = {longitude} AND year >= 1950 AND month={month}
        GROUP BY scenario, year
        ORDER BY year, scenario
        """
    return sql_query.strip()
 class IndicatorForGivenYearQueryParams(TypedDict, total=False):
     """
     Parameters for querying an indicator's values across locations for a specific year.
         return ""
     if country_code in MACRO_COUNTRIES:
+        table_path = f"'{IPCC_DATASET_URL}/{table.lower()}/{country_code}_monthly_macro.parquet'"
         sql_query = f"""
         SELECT latitude, longitude, scenario, AVG({indicator_column}) as {indicator_column}
         FROM {table_path}
         ORDER BY latitude, longitude, scenario
         """
     elif country_code in HUGE_MACRO_COUNTRIES:
+        table_path = f"'{IPCC_DATASET_URL}/{table.lower()}/{country_code}_annualy_macro.parquet'"
         sql_query = f"""
         SELECT latitude, longitude, scenario, {indicator_column}
         FROM {table_path}
         ORDER BY latitude, longitude, scenario
         """
+    return sql_query.strip()

climateqa/engine/talk_to_data/main.py CHANGED Viewed

@@ -50,7 +50,7 @@ async def ask_drias(query: str, index_state: int = 0, user_id: str | None = None
     if "error" in final_state and final_state["error"] != "":
         # No Sql query, no dataframe, no figure, no plot information, empty sql queries list, empty result dataframes list, empty figures list, empty plot information list, index state = 0, empty table list, error message
-        return None, None, None, None, [], [], [], 0, [], final_state["error"]
     sql_query = sql_queries[index_state]
     dataframe = result_dataframes[index_state]
@@ -112,7 +112,7 @@ async def ask_ipcc(query: str, index_state: int = 0, user_id: str | None = None)
     if "error" in final_state and final_state["error"] != "":
         # No Sql query, no dataframe, no figure, no plot information, empty sql queries list, empty result dataframes list, empty figures list, empty plot information list, index state = 0, empty table list, error message
-        return None, None, None, None, [], [], [], 0, [], final_state["error"]
     sql_query = sql_queries[index_state]
     dataframe = result_dataframes[index_state]

     if "error" in final_state and final_state["error"] != "":
         # No Sql query, no dataframe, no figure, no plot information, empty sql queries list, empty result dataframes list, empty figures list, empty plot information list, index state = 0, empty table list, error message
+        return None, None, None, None, [], [], [], [], 0, [], final_state["error"]
     sql_query = sql_queries[index_state]
     dataframe = result_dataframes[index_state]
     if "error" in final_state and final_state["error"] != "":
         # No Sql query, no dataframe, no figure, no plot information, empty sql queries list, empty result dataframes list, empty figures list, empty plot information list, index state = 0, empty table list, error message
+        return None, None, None, None, [], [], [], [], 0, [], final_state["error"]
     sql_query = sql_queries[index_state]
     dataframe = result_dataframes[index_state]

climateqa/engine/talk_to_data/myVanna.py DELETED Viewed

@@ -1,13 +0,0 @@
-from dotenv import load_dotenv
-from climateqa.engine.talk_to_data.vanna_class import MyCustomVectorDB
-from vanna.openai import OpenAI_Chat
-import os
-load_dotenv()
-OPENAI_API_KEY = os.getenv('THEO_API_KEY')
-class MyVanna(MyCustomVectorDB, OpenAI_Chat):
-    def __init__(self, config=None):
-        MyCustomVectorDB.__init__(self, config=config)
-        OpenAI_Chat.__init__(self, config=config)

climateqa/engine/talk_to_data/plot.py DELETED Viewed

@@ -1,418 +0,0 @@
-from typing import Callable, TypedDict
-from matplotlib.figure import figaspect
-import pandas as pd
-from plotly.graph_objects import Figure
-import plotly.graph_objects as go
-import plotly.express as px
-from climateqa.engine.talk_to_data.sql_query import (
-    indicator_for_given_year_query,
-    indicator_per_year_at_location_query,
-)
-from climateqa.engine.talk_to_data.config import INDICATOR_TO_UNIT
-class Plot(TypedDict):
-    """Represents a plot configuration in the DRIAS system.
-    This class defines the structure for configuring different types of plots
-    that can be generated from climate data.
-    Attributes:
-        name (str): The name of the plot type
-        description (str): A description of what the plot shows
-        params (list[str]): List of required parameters for the plot
-        plot_function (Callable[..., Callable[..., Figure]]): Function to generate the plot
-        sql_query (Callable[..., str]): Function to generate the SQL query for the plot
-    """
-    name: str
-    description: str
-    params: list[str]
-    plot_function: Callable[..., Callable[..., Figure]]
-    sql_query: Callable[..., str]
-def plot_indicator_evolution_at_location(params: dict) -> Callable[..., Figure]:
-    """Generates a function to plot indicator evolution over time at a location.
-    This function creates a line plot showing how a climate indicator changes
-    over time at a specific location. It handles temperature, precipitation,
-    and other climate indicators.
-    Args:
-        params (dict): Dictionary containing:
-            - indicator_column (str): The column name for the indicator
-            - location (str): The location to plot
-            - model (str): The climate model to use
-    Returns:
-        Callable[..., Figure]: A function that takes a DataFrame and returns a plotly Figure
-    Example:
-        >>> plot_func = plot_indicator_evolution_at_location({
-        ...     'indicator_column': 'mean_temperature',
-        ...     'location': 'Paris',
-        ...     'model': 'ALL'
-        ... })
-        >>> fig = plot_func(df)
-    """
-    indicator = params["indicator_column"]
-    location = params["location"]
-    indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
-    unit = INDICATOR_TO_UNIT.get(indicator, "")
-    def plot_data(df: pd.DataFrame) -> Figure:
-        """Generates the actual plot from the data.
-        Args:
-            df (pd.DataFrame): DataFrame containing the data to plot
-        Returns:
-            Figure: A plotly Figure object showing the indicator evolution
-        """
-        fig = go.Figure()
-        if df['model'].nunique() != 1:
-            df_avg = df.groupby("year", as_index=False)[indicator].mean()
-            # Transform to list to avoid pandas encoding
-            indicators = df_avg[indicator].astype(float).tolist()
-            years = df_avg["year"].astype(int).tolist()
-            # Compute the 10-year rolling average
-            rolling_window = 10
-            sliding_averages = (
-                df_avg[indicator]
-                .rolling(window=rolling_window, min_periods=rolling_window)
-                .mean()
-                .astype(float)
-                .tolist()
-            )
-            model_label = "Model Average"
-            # Only add rolling average if we have enough data points
-            if len([x for x in sliding_averages if pd.notna(x)]) > 0:
-                # Sliding average dashed line
-                fig.add_scatter(
-                    x=years,
-                    y=sliding_averages,
-                    mode="lines",
-                    name="10 years rolling average",
-                    line=dict(dash="dash"),
-                    marker=dict(color="#d62728"),
-                    hovertemplate=f"10-year average: %{{y:.2f}} {unit}<br>Year: %{{x}}<extra></extra>"
-                )
-        else:
-            df_model = df
-            # Transform to list to avoid pandas encoding
-            indicators = df_model[indicator].astype(float).tolist()
-            years = df_model["year"].astype(int).tolist()
-            # Compute the 10-year rolling average
-            rolling_window = 10
-            sliding_averages = (
-                df_model[indicator]
-                .rolling(window=rolling_window, min_periods=rolling_window)
-                .mean()
-                .astype(float)
-                .tolist()
-            )
-            model_label = f"Model : {df['model'].unique()[0]}"
-            # Only add rolling average if we have enough data points
-            if len([x for x in sliding_averages if pd.notna(x)]) > 0:
-                # Sliding average dashed line
-                fig.add_scatter(
-                    x=years,
-                    y=sliding_averages,
-                    mode="lines",
-                    name="10 years rolling average",
-                    line=dict(dash="dash"),
-                    marker=dict(color="#d62728"),
-                    hovertemplate=f"10-year average: %{{y:.2f}} {unit}<br>Year: %{{x}}<extra></extra>"
-                )
-        # Indicator per year plot
-        fig.add_scatter(
-            x=years,
-            y=indicators,
-            name=f"Yearly {indicator_label}",
-            mode="lines",
-            marker=dict(color="#1f77b4"),
-            hovertemplate=f"{indicator_label}: %{{y:.2f}} {unit}<br>Year: %{{x}}<extra></extra>"
-        )
-        fig.update_layout(
-            title=f"Plot of {indicator_label} in {location} ({model_label})",
-            xaxis_title="Year",
-            yaxis_title=f"{indicator_label} ({unit})",
-            template="plotly_white",
-        )
-        return fig
-    return plot_data
-indicator_evolution_at_location: Plot = {
-    "name": "Indicator evolution at location",
-    "description": "Plot an evolution of the indicator at a certain location",
-    "params": ["indicator_column", "location", "model"],
-    "plot_function": plot_indicator_evolution_at_location,
-    "sql_query": indicator_per_year_at_location_query,
-}
-def plot_indicator_number_of_days_per_year_at_location(
-    params: dict,
-) -> Callable[..., Figure]:
-    """Generates a function to plot the number of days per year for an indicator.
-    This function creates a bar chart showing the frequency of certain climate
-    events (like days above a temperature threshold) per year at a specific location.
-    Args:
-        params (dict): Dictionary containing:
-            - indicator_column (str): The column name for the indicator
-            - location (str): The location to plot
-            - model (str): The climate model to use
-    Returns:
-        Callable[..., Figure]: A function that takes a DataFrame and returns a plotly Figure
-    """
-    indicator = params["indicator_column"]
-    location = params["location"]
-    indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
-    unit = INDICATOR_TO_UNIT.get(indicator, "")
-    def plot_data(df: pd.DataFrame) -> Figure:
-        """Generate the figure from the dataframe
-        Args:
-            df (pd.DataFrame): pandas dataframe with the required data
-        Returns:
-            Figure: Plotly figure
-        """
-        fig = go.Figure()
-        if df['model'].nunique() != 1:
-            df_avg = df.groupby("year", as_index=False)[indicator].mean()
-            # Transform to list to avoid pandas encoding
-            indicators = df_avg[indicator].astype(float).tolist()
-            years = df_avg["year"].astype(int).tolist()
-            model_label = "Model Average"
-        else:
-            df_model = df
-            # Transform to list to avoid pandas encoding
-            indicators = df_model[indicator].astype(float).tolist()
-            years = df_model["year"].astype(int).tolist()
-            model_label = f"Model : {df['model'].unique()[0]}"
-        # Bar plot
-        fig.add_trace(
-            go.Bar(
-                x=years,
-                y=indicators,
-                width=0.5,
-                marker=dict(color="#1f77b4"),
-                hovertemplate=f"{indicator_label}: %{{y:.2f}} {unit}<br>Year: %{{x}}<extra></extra>"
-            )
-        )
-        fig.update_layout(
-            title=f"{indicator_label} in {location} ({model_label})",
-            xaxis_title="Year",
-            yaxis_title=f"{indicator_label} ({unit})",
-            yaxis=dict(range=[0, max(indicators)]),
-            bargap=0.5,
-            template="plotly_white",
-        )
-        return fig
-    return plot_data
-indicator_number_of_days_per_year_at_location: Plot = {
-    "name": "Indicator number of days per year at location",
-    "description": "Plot a barchart of the number of days per year of a certain indicator at a certain location. It is appropriate for frequency indicator.",
-    "params": ["indicator_column", "location", "model"],
-    "plot_function": plot_indicator_number_of_days_per_year_at_location,
-    "sql_query": indicator_per_year_at_location_query,
-}
-def plot_distribution_of_indicator_for_given_year(
-    params: dict,
-) -> Callable[..., Figure]:
-    """Generates a function to plot the distribution of an indicator for a year.
-    This function creates a histogram showing the distribution of a climate
-    indicator across different locations for a specific year.
-    Args:
-        params (dict): Dictionary containing:
-            - indicator_column (str): The column name for the indicator
-            - year (str): The year to plot
-            - model (str): The climate model to use
-    Returns:
-        Callable[..., Figure]: A function that takes a DataFrame and returns a plotly Figure
-    """
-    indicator = params["indicator_column"]
-    year = params["year"]
-    indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
-    unit = INDICATOR_TO_UNIT.get(indicator, "")
-    def plot_data(df: pd.DataFrame) -> Figure:
-        """Generate the figure from the dataframe
-        Args:
-            df (pd.DataFrame): pandas dataframe with the required data
-        Returns:
-            Figure: Plotly figure
-        """
-        fig = go.Figure()
-        if df['model'].nunique() != 1:
-            df_avg = df.groupby(["latitude", "longitude"], as_index=False)[
-                indicator
-            ].mean()
-            # Transform to list to avoid pandas encoding
-            indicators = df_avg[indicator].astype(float).tolist()
-            model_label = "Model Average"
-        else:
-            df_model = df
-            # Transform to list to avoid pandas encoding
-            indicators = df_model[indicator].astype(float).tolist()
-            model_label = f"Model : {df['model'].unique()[0]}"
-        fig.add_trace(
-            go.Histogram(
-                x=indicators,
-                opacity=0.8,
-                histnorm="percent",
-                marker=dict(color="#1f77b4"),
-                hovertemplate=f"{indicator_label}: %{{x:.2f}} {unit}<br>Frequency: %{{y:.2f}}%<extra></extra>"
-            )
-        )
-        fig.update_layout(
-            title=f"Distribution of {indicator_label} in {year} ({model_label})",
-            xaxis_title=f"{indicator_label} ({unit})",
-            yaxis_title="Frequency (%)",
-            plot_bgcolor="rgba(0, 0, 0, 0)",
-            showlegend=False,
-        )
-        return fig
-    return plot_data
-distribution_of_indicator_for_given_year: Plot = {
-    "name": "Distribution of an indicator for a given year",
-    "description": "Plot an histogram of the distribution for a given year of the values of an indicator",
-    "params": ["indicator_column", "model", "year"],
-    "plot_function": plot_distribution_of_indicator_for_given_year,
-    "sql_query": indicator_for_given_year_query,
-}
-def plot_map_of_france_of_indicator_for_given_year(
-    params: dict,
-) -> Callable[..., Figure]:
-    """Generates a function to plot a map of France for an indicator.
-    This function creates a choropleth map of France showing the spatial
-    distribution of a climate indicator for a specific year.
-    Args:
-        params (dict): Dictionary containing:
-            - indicator_column (str): The column name for the indicator
-            - year (str): The year to plot
-            - model (str): The climate model to use
-    Returns:
-        Callable[..., Figure]: A function that takes a DataFrame and returns a plotly Figure
-    """
-    indicator = params["indicator_column"]
-    year = params["year"]
-    indicator_label = " ".join([word.capitalize() for word in indicator.split("_")])
-    unit = INDICATOR_TO_UNIT.get(indicator, "")
-    def plot_data(df: pd.DataFrame) -> Figure:
-        fig = go.Figure()
-        if df['model'].nunique() != 1:
-            df_avg = df.groupby(["latitude", "longitude"], as_index=False)[
-                indicator
-            ].mean()
-            indicators = df_avg[indicator].astype(float).tolist()
-            latitudes = df_avg["latitude"].astype(float).tolist()
-            longitudes = df_avg["longitude"].astype(float).tolist()
-            model_label = "Model Average"
-        else:
-            df_model = df
-            # Transform to list to avoid pandas encoding
-            indicators = df_model[indicator].astype(float).tolist()
-            latitudes = df_model["latitude"].astype(float).tolist()
-            longitudes = df_model["longitude"].astype(float).tolist()
-            model_label = f"Model : {df['model'].unique()[0]}"
-        fig.add_trace(
-            go.Scattermapbox(
-                lat=latitudes,
-                lon=longitudes,
-                mode="markers",
-                marker=dict(
-                    size=10,
-                    color=indicators,  # Color mapped to values
-                    colorscale="Turbo",  # Color scale (can be 'Plasma', 'Jet', etc.)
-                    cmin=min(indicators),  # Minimum color range
-                    cmax=max(indicators),  # Maximum color range
-                    showscale=True,  # Show colorbar
-                ),
-                text=[f"{indicator_label}: {value:.2f} {unit}" for value in indicators],  # Add hover text showing the indicator value
-                hoverinfo="text"  # Only show the custom text on hover
-            )
-        )
-        fig.update_layout(
-            mapbox_style="open-street-map",  # Use OpenStreetMap
-            mapbox_zoom=3,
-            mapbox_center={"lat": 46.6, "lon": 2.0},
-            coloraxis_colorbar=dict(title=f"{indicator_label} ({unit})"),  # Add legend
-            title=f"{indicator_label} in {year} in France ({model_label}) " # Title
-        )
-        return fig
-    return plot_data
-map_of_france_of_indicator_for_given_year: Plot = {
-    "name": "Map of France of an indicator for a given year",
-    "description": "Heatmap on the map of France of the values of an in indicator for a given year",
-    "params": ["indicator_column", "year", "model"],
-    "plot_function": plot_map_of_france_of_indicator_for_given_year,
-    "sql_query": indicator_for_given_year_query,
-}
-PLOTS = [
-    indicator_evolution_at_location,
-    indicator_number_of_days_per_year_at_location,
-    distribution_of_indicator_for_given_year,
-    map_of_france_of_indicator_for_given_year,
-]

climateqa/engine/talk_to_data/sql_query.py DELETED Viewed

@@ -1,114 +0,0 @@
-import asyncio
-from concurrent.futures import ThreadPoolExecutor
-from typing import TypedDict
-import duckdb
-import pandas as pd
-async def execute_sql_query(sql_query: str) -> pd.DataFrame:
-    """Executes a SQL query on the DRIAS database and returns the results.
-    This function connects to the DuckDB database containing DRIAS climate data
-    and executes the provided SQL query. It handles the database connection and
-    returns the results as a pandas DataFrame.
-    Args:
-        sql_query (str): The SQL query to execute
-    Returns:
-        pd.DataFrame: A DataFrame containing the query results
-    Raises:
-        duckdb.Error: If there is an error executing the SQL query
-    """
-    def _execute_query():
-        # Execute the query
-        con = duckdb.connect()
-        results = con.sql(sql_query).fetchdf()
-        # return fetched data
-        return results
-    # Run the query in a thread pool to avoid blocking
-    loop = asyncio.get_event_loop()
-    with ThreadPoolExecutor() as executor:
-        return await loop.run_in_executor(executor, _execute_query)
-class IndicatorPerYearAtLocationQueryParams(TypedDict, total=False):
-    """Parameters for querying an indicator's values over time at a location.
-    This class defines the parameters needed to query climate indicator data
-    for a specific location over multiple years.
-    Attributes:
-        indicator_column (str): The column name for the climate indicator
-        latitude (str): The latitude coordinate of the location
-        longitude (str): The longitude coordinate of the location
-        model (str): The climate model to use (optional)
-    """
-    indicator_column: str
-    latitude: str
-    longitude: str
-    model: str
-def indicator_per_year_at_location_query(
-    table: str, params: IndicatorPerYearAtLocationQueryParams
-) -> str:
-    """SQL Query to get the evolution of an indicator per year at a certain location
-    Args:
-        table (str): sql table of the indicator
-        params (IndicatorPerYearAtLocationQueryParams) : dictionary with the required params for the query
-    Returns:
-        str: the sql query
-    """
-    indicator_column = params.get("indicator_column")
-    latitude = params.get("latitude")
-    longitude = params.get("longitude")
-    if indicator_column is None or latitude is None or longitude is None: # If one parameter is missing, returns an empty query
-        return ""
-    table = f"'hf://datasets/timeki/drias_db/{table.lower()}.parquet'"
-    sql_query = f"SELECT year, {indicator_column}, model\nFROM {table}\nWHERE latitude = {latitude} \nAnd longitude = {longitude} \nOrder by Year"
-    return sql_query
-class IndicatorForGivenYearQueryParams(TypedDict, total=False):
-    """Parameters for querying an indicator's values across locations for a year.
-    This class defines the parameters needed to query climate indicator data
-    across different locations for a specific year.
-    Attributes:
-        indicator_column (str): The column name for the climate indicator
-        year (str): The year to query
-        model (str): The climate model to use (optional)
-    """
-    indicator_column: str
-    year: str
-    model: str
-def indicator_for_given_year_query(
-        table:str, params: IndicatorForGivenYearQueryParams
-) -> str:
-    """SQL Query to get the values of an indicator with their latitudes, longitudes and models for a given year
-    Args:
-        table (str): sql table of the indicator
-        params (IndicatorForGivenYearQueryParams): dictionary with the required params for the query
-    Returns:
-        str: the sql query
-    """
-    indicator_column = params.get("indicator_column")
-    year = params.get('year')
-    if year is None or indicator_column is None: # If one parameter is missing, returns an empty query
-        return ""
-    table = f"'hf://datasets/timeki/drias_db/{table.lower()}.parquet'"
-    sql_query = f"Select {indicator_column}, latitude, longitude, model\nFrom {table}\nWhere year = {year}"
-    return sql_query

climateqa/engine/talk_to_data/talk_to_drias.py DELETED Viewed

@@ -1,317 +0,0 @@
-import os
-from typing import Any, Callable, TypedDict, Optional
-from numpy import sort
-import pandas as pd
-import asyncio
-from plotly.graph_objects import Figure
-from climateqa.engine.llm import get_llm
-from climateqa.engine.talk_to_data import sql_query
-from climateqa.engine.talk_to_data.config import INDICATOR_COLUMNS_PER_TABLE
-from climateqa.engine.talk_to_data.plot import PLOTS, Plot
-from climateqa.engine.talk_to_data.sql_query import execute_sql_query
-from climateqa.engine.talk_to_data.utils import (
-    detect_relevant_plots,
-    detect_year_with_openai,
-    loc2coords,
-    detect_location_with_openai,
-    nearestNeighbourSQL,
-    detect_relevant_tables,
-)
-ROOT_PATH = os.path.dirname(os.path.dirname(os.getcwd()))
-class TableState(TypedDict):
-    """Represents the state of a table in the DRIAS workflow.
-    This class defines the structure for tracking the state of a table during the
-    data processing workflow, including its name, parameters, SQL query, and results.
-    Attributes:
-        table_name (str): The name of the table in the database
-        params (dict[str, Any]): Parameters used for querying the table
-        sql_query (str, optional): The SQL query used to fetch data
-        dataframe (pd.DataFrame | None, optional): The resulting data
-        figure (Callable[..., Figure], optional): Function to generate visualization
-        status (str): The current status of the table processing ('OK' or 'ERROR')
-    """
-    table_name: str
-    params: dict[str, Any]
-    sql_query: Optional[str]
-    dataframe: Optional[pd.DataFrame | None]
-    figure: Optional[Callable[..., Figure]]
-    status: str
-class PlotState(TypedDict):
-    """Represents the state of a plot in the DRIAS workflow.
-    This class defines the structure for tracking the state of a plot during the
-    data processing workflow, including its name and associated tables.
-    Attributes:
-        plot_name (str): The name of the plot
-        tables (list[str]): List of tables used in the plot
-        table_states (dict[str, TableState]): States of the tables used in the plot
-    """
-    plot_name: str
-    tables: list[str]
-    table_states: dict[str, TableState]
-class State(TypedDict):
-    user_input: str
-    plots: list[str]
-    plot_states: dict[str, PlotState]
-    error: Optional[str]
-async def find_relevant_plots(state: State, llm) -> list[str]:
-    print("---- Find relevant plots ----")
-    relevant_plots = await detect_relevant_plots(state['user_input'], llm)
-    return relevant_plots
-async def find_relevant_tables_per_plot(state: State, plot: Plot, llm) -> list[str]:
-    print(f"---- Find relevant tables for {plot['name']} ----")
-    relevant_tables = await detect_relevant_tables(state['user_input'], plot, llm)
-    return relevant_tables
-async def find_param(state: State, param_name:str, table: str) -> dict[str, Any] | None:
-    """Call the appropriate method to retrieve the desired parameter
-    Args:
-        state (State): state of the workflow
-        param_name (str): name of the desired parameter
-        table (str): name of the table
-    Returns:
-        dict[str, Any] | None: the detected parameter values, or None when the parameter name is not handled
-    """
-    if param_name == 'location':
-        location = await find_location(state['user_input'], table)
-        return location
-    if param_name == 'year':
-        year = await find_year(state['user_input'])
-        return {'year': year}
-    return None
-class Location(TypedDict):
-    location: str
-    latitude: Optional[str]
-    longitude: Optional[str]
-async def find_location(user_input: str, table: str) -> Location:
-    print(f"---- Find location in table {table} ----")
-    location = await detect_location_with_openai(user_input)
-    output: Location = {'location' : location}
-    if location:
-        coords = loc2coords(location)
-        neighbour = nearestNeighbourSQL(coords, table)
-        output.update({
-            "latitude": neighbour[0],
-            "longitude": neighbour[1],
-        })
-    return output
-async def find_year(user_input: str) -> str:
-    """Extracts year information from user input using LLM.
-    This function uses an LLM to identify and extract year information from the
-    user's query, which is used to filter data in subsequent queries.
-    Args:
-        user_input (str): The user's query text
-    Returns:
-        str: The extracted year, or empty string if no year found
-    """
-    print(f"---- Find year ---")
-    year = await detect_year_with_openai(user_input)
-    return year
-def find_indicator_column(table: str) -> str:
-    """Retrieves the name of the indicator column within a table.
-    This function maps table names to their corresponding indicator columns
-    using the predefined mapping in INDICATOR_COLUMNS_PER_TABLE.
-    Args:
-        table (str): Name of the table in the database
-    Returns:
-        str: Name of the indicator column for the specified table
-    Raises:
-        KeyError: If the table name is not found in the mapping
-    """
-    print(f"---- Find indicator column in table {table} ----")
-    return INDICATOR_COLUMNS_PER_TABLE[table]
-async def process_table(
-    table: str,
-    params: dict[str, Any],
-    plot: Plot,
-) -> TableState:
-    """Processes a table to extract relevant data and generate visualizations.
-    This function retrieves the SQL query for the specified table, executes it,
-    and generates a visualization based on the results.
-    Args:
-        table (str): The name of the table to process
-        params (dict[str, Any]): Parameters used for querying the table
-        plot (Plot): The plot object containing SQL query and visualization function
-    Returns:
-        TableState: The state of the processed table
-    """
-    table_state: TableState = {
-        'table_name': table,
-        'params': params.copy(),
-        'status': 'OK',
-        'dataframe': None,
-        'sql_query': None,
-        'figure': None
-    }
-    table_state['params']['indicator_column'] = find_indicator_column(table)
-    sql_query = plot['sql_query'](table, table_state['params'])
-    if sql_query == "":
-        table_state['status'] = 'ERROR'
-        return table_state
-    table_state['sql_query'] = sql_query
-    df = await execute_sql_query(sql_query)
-    table_state['dataframe'] = df
-    table_state['figure'] = plot['plot_function'](table_state['params'])
-    return table_state
-async def drias_workflow(user_input: str) -> State:
-    """Performs the complete Talk To Drias workflow: from user input to the generated SQL queries, dataframes and figures
-    Args:
-        user_input (str): initial user input
-    Returns:
-        State: Final state with all the results
-    """
-    state: State = {
-        'user_input': user_input,
-        'plots': [],
-        'plot_states': {},
-        'error': ''
-    }
-    llm = get_llm(provider="openai")
-    plots = await find_relevant_plots(state, llm)
-    state['plots'] = plots
-    if len(state['plots']) < 1:
-        state['error'] = 'There is no plot to answer to the question'
-        return state
-    have_relevant_table = False
-    have_sql_query = False
-    have_dataframe = False
-    for plot_name in state['plots']:
-        plot = next((p for p in PLOTS if p['name'] == plot_name), None) # Find the associated plot object
-        if plot is None:
-            continue
-        plot_state: PlotState = {
-            'plot_name': plot_name,
-            'tables': [],
-            'table_states': {}
-        }
-        plot_state['plot_name'] = plot_name
-        relevant_tables = await find_relevant_tables_per_plot(state, plot, llm)
-        if len(relevant_tables) > 0 :
-            have_relevant_table = True
-        plot_state['tables'] = relevant_tables
-        params = {}
-        for param_name in plot['params']:
-            param = await find_param(state, param_name, relevant_tables[0])
-            if param:
-                params.update(param)
-        tasks = [process_table(table, params, plot) for table in plot_state['tables'][:3]]
-        results = await asyncio.gather(*tasks)
-        # Store results back in plot_state
-        have_dataframe = False
-        have_sql_query = False
-        for table_state in results:
-            if table_state['sql_query']:
-                have_sql_query = True
-            if table_state['dataframe'] is not None and len(table_state['dataframe']) > 0:
-                have_dataframe = True
-            plot_state['table_states'][table_state['table_name']] = table_state
-        state['plot_states'][plot_name] = plot_state
-    if not have_relevant_table:
-        state['error'] = "There is no relevant table in our database to answer your question"
-    elif not have_sql_query:
-        state['error'] = "There is no relevant sql query on our database that can help to answer your question"
-    elif not have_dataframe:
-        state['error'] = "There is no data in our table that can answer to your question"
-    return state
-# def make_write_query_node():
-#     def write_query(state):
-#         print("---- Write query ----")
-#         for table in state["tables"]:
-#             sql_query = QUERIES[state[table]['query_type']](
-#                 table=table,
-#                 indicator_column=state[table]["columns"],
-#                 longitude=state[table]["longitude"],
-#                 latitude=state[table]["latitude"],
-#             )
-#             state[table].update({"sql_query": sql_query})
-#         return state
-#     return write_query
-# def make_fetch_data_node(db_path):
-#     def fetch_data(state):
-#         print("---- Fetch data ----")
-#         for table in state["tables"]:
-#             results = execute_sql_query(db_path, state[table]['sql_query'])
-#             state[table].update(results)
-#         return state
-#     return fetch_data
-## V2
-# def make_fetch_data_node(db_path: str, llm):
-#     def fetch_data(state):
-#         print("---- Fetch data ----")
-#         db = SQLDatabase.from_uri(f"sqlite:///{db_path}")
-#         output = {}
-#         sql_query = write_sql_query(state["query"], db, state["tables"], llm)
-#         # TO DO : Add query checker
-#         print(f"SQL query  : {sql_query}")
-#         output["sql_query"] = sql_query
-#         output.update(fetch_data_from_sql_query(db_path, sql_query))
-#         return output
-#     return fetch_data

climateqa/engine/talk_to_data/utils.py DELETED Viewed

@@ -1,281 +0,0 @@
-import re
-from typing import Annotated, TypedDict
-import duckdb
-from geopy.geocoders import Nominatim
-import ast
-from climateqa.engine.llm import get_llm
-from climateqa.engine.talk_to_data.config import DRIAS_TABLES
-from climateqa.engine.talk_to_data.plot import PLOTS, Plot
-from langchain_core.prompts import ChatPromptTemplate
-async def detect_location_with_openai(sentence):
-    """
-    Detects locations in a sentence using OpenAI's API via LangChain.
-    """
-    llm = get_llm()
-    prompt = f"""
-    Extract all locations (cities, countries, states, or geographical areas) mentioned in the following sentence.
-    Return the result as a Python list. If no locations are mentioned, return an empty list.
-    Sentence: "{sentence}"
-    """
-    response = await llm.ainvoke(prompt)
-    location_list = ast.literal_eval(response.content.strip("```python\n").strip())
-    if location_list:
-        return location_list[0]
-    else:
-        return ""
-class ArrayOutput(TypedDict):
-    """Represents the output of a function that returns an array.
-    This class is used to type-hint functions that return arrays,
-    ensuring consistent return types across the codebase.
-    Attributes:
-        array (str): A syntactically valid Python array string
-    """
-    array: Annotated[str, "Syntactically valid python array."]
-async def detect_year_with_openai(sentence: str) -> str:
-    """
-    Detects years in a sentence using OpenAI's API via LangChain.
-    """
-    llm = get_llm()
-    prompt = """
-    Extract all years mentioned in the following sentence.
-    Return the result as a Python list. If no year are mentioned, return an empty list.
-    Sentence: "{sentence}"
-    """
-    prompt = ChatPromptTemplate.from_template(prompt)
-    structured_llm = llm.with_structured_output(ArrayOutput)
-    chain = prompt | structured_llm
-    response: ArrayOutput = await chain.ainvoke({"sentence": sentence})
-    years_list = eval(response['array'])
-    if len(years_list) > 0:
-        return years_list[0]
-    else:
-        return ""
-def detectTable(sql_query: str) -> list[str]:
-    """Extracts table names from a SQL query.
-    This function uses regular expressions to find all table names
-    referenced in a SQL query's FROM clause.
-    Args:
-        sql_query (str): The SQL query to analyze
-    Returns:
-        list[str]: A list of table names found in the query
-    Example:
-        >>> detectTable("SELECT * FROM temperature_data WHERE year > 2000")
-        ['temperature_data']
-    """
-    pattern = r'(?i)\bFROM\s+((?:`[^`]+`|"[^"]+"|\'[^\']+\'|\w+)(?:\.(?:`[^`]+`|"[^"]+"|\'[^\']+\'|\w+))*)'
-    matches = re.findall(pattern, sql_query)
-    return matches
-def loc2coords(location: str) -> tuple[float, float]:
-    """Converts a location name to geographic coordinates.
-    This function uses the Nominatim geocoding service to convert
-    a location name (e.g., city name) to its latitude and longitude.
-    Args:
-        location (str): The name of the location to geocode
-    Returns:
-        tuple[float, float]: A tuple containing (latitude, longitude)
-    Raises:
-        AttributeError: If the location cannot be found
-    """
-    geolocator = Nominatim(user_agent="city_to_latlong")
-    coords = geolocator.geocode(location)
-    return (coords.latitude, coords.longitude)
-def coords2loc(coords: tuple[float, float]) -> str:
-    """Converts geographic coordinates to a location name.
-    This function uses the Nominatim reverse geocoding service to convert
-    latitude and longitude coordinates to a human-readable location name.
-    Args:
-        coords (tuple[float, float]): A tuple containing (latitude, longitude)
-    Returns:
-        str: The address of the location, or "Unknown Location" if not found
-    Example:
-        >>> coords2loc((48.8566, 2.3522))
-        'Paris, France'
-    """
-    geolocator = Nominatim(user_agent="coords_to_city")
-    try:
-        location = geolocator.reverse(coords)
-        return location.address
-    except Exception as e:
-        print(f"Error: {e}")
-        return "Unknown Location"
-def nearestNeighbourSQL(location: tuple, table: str) -> tuple[str, str]:
-    long = round(location[1], 3)
-    lat = round(location[0], 3)
-    table = f"'hf://datasets/timeki/drias_db/{table.lower()}.parquet'"
-    results = duckdb.sql(
-        f"SELECT latitude, longitude FROM {table} WHERE latitude BETWEEN {lat - 0.3} AND {lat + 0.3} AND longitude BETWEEN {long - 0.3} AND {long + 0.3}"
-    ).fetchdf()
-    if len(results) == 0:
-        return "", ""
-    # cursor.execute(f"SELECT latitude, longitude FROM {table} WHERE latitude BETWEEN {lat - 0.3} AND {lat + 0.3} AND longitude BETWEEN {long - 0.3} AND {long + 0.3}")
-    return results['latitude'].iloc[0], results['longitude'].iloc[0]
-async def detect_relevant_tables(user_question: str, plot: Plot, llm) -> list[str]:
-    """Identifies relevant tables for a plot based on user input.
-    This function uses an LLM to analyze the user's question and the plot
-    description to determine which tables in the DRIAS database would be
-    most relevant for generating the requested visualization.
-    Args:
-        user_question (str): The user's question about climate data
-        plot (Plot): The plot configuration object
-        llm: The language model instance to use for analysis
-    Returns:
-        list[str]: A list of table names that are relevant for the plot
-    Example:
-        >>> detect_relevant_tables(
-        ...     "What will the temperature be like in Paris?",
-        ...     indicator_evolution_at_location,
-        ...     llm
-        ... )
-        ['mean_annual_temperature', 'mean_summer_temperature']
-    """
-    # Get all table names
-    table_names_list = DRIAS_TABLES
-    prompt = (
-        f"You are helping to build a plot following this description : {plot['description']}."
-        f"You are given a list of tables and a user question."
-        f"Based on the description of the plot, which table are appropriate for that kind of plot."
-        f"Write the 3 most relevant tables to use. Answer only a python list of table name."
-        f"### List of tables : {table_names_list}"
-        f"### User question : {user_question}"
-        f"### List of table name : "
-    )
-    table_names = ast.literal_eval(
-        (await llm.ainvoke(prompt)).content.strip("```python\n").strip()
-    )
-    return table_names
-def replace_coordonates(coords, query, coords_tables):
-    n = query.count(str(coords[0]))
-    for i in range(n):
-        query = query.replace(str(coords[0]), str(coords_tables[i][0]), 1)
-        query = query.replace(str(coords[1]), str(coords_tables[i][1]), 1)
-    return query
-async def detect_relevant_plots(user_question: str, llm):
-    plots_description = ""
-    for plot in PLOTS:
-        plots_description += "Name: " + plot["name"]
-        plots_description += " - Description: " + plot["description"] + "\n"
-    prompt = (
-        f"You are helping to answer a quesiton with insightful visualizations."
-        f"You are given an user question and a list of plots with their name and description."
-        f"Based on the descriptions of the plots, which plot is appropriate to answer to this question."
-        f"Write the most relevant tables to use. Answer only a python list of plot name."
-        f"### Descriptions of the plots : {plots_description}"
-        f"### User question : {user_question}"
-        f"### Name of the plot : "
-    )
-    # prompt = (
-    #     f"You are helping to answer a question with insightful visualizations. "
-    #     f"Given a list of plots with their name and description: "
-    #     f"{plots_description} "
-    #     f"The user question is: {user_question}. "
-    #     f"Choose the most relevant plots to answer the question. "
-    #     f"The answer must be a Python list with the names of the relevant plots, and nothing else. "
-    #     f"Ensure the response is in the exact format: ['PlotName1', 'PlotName2']."
-    # )
-    plot_names = ast.literal_eval(
-        (await llm.ainvoke(prompt)).content.strip("```python\n").strip()
-    )
-    return plot_names
-# Next Version
-# class QueryOutput(TypedDict):
-#     """Generated SQL query."""
-#     query: Annotated[str, ..., "Syntactically valid SQL query."]
-# class PlotlyCodeOutput(TypedDict):
-#     """Generated Plotly code"""
-#     code: Annotated[str, ..., "Synatically valid Plotly python code."]
-# def write_sql_query(user_input: str, db: SQLDatabase, relevant_tables: list[str], llm):
-#     """Generate SQL query to fetch information."""
-#     prompt_params = {
-#         "dialect": db.dialect,
-#         "table_info": db.get_table_info(),
-#         "input": user_input,
-#         "relevant_tables": relevant_tables,
-#         "model": "ALADIN63_CNRM-CM5",
-#     }
-#     prompt = ChatPromptTemplate.from_template(query_prompt_template)
-#     structured_llm = llm.with_structured_output(QueryOutput)
-#     chain = prompt | structured_llm
-#     result = chain.invoke(prompt_params)
-#     return result["query"]
-# def fetch_data_from_sql_query(db: str, sql_query: str):
-#     conn = sqlite3.connect(db)
-#     cursor = conn.cursor()
-#     cursor.execute(sql_query)
-#     column_names = [desc[0] for desc in cursor.description]
-#     values = cursor.fetchall()
-#     return {"column_names": column_names, "data": values}
-# def generate_chart_code(user_input: str, sql_query: list[str], llm):
-#     """ "Generate plotly python code for the chart based on the sql query and the user question"""
-#     class PlotlyCodeOutput(TypedDict):
-#         """Generated Plotly code"""
-#         code: Annotated[str, ..., "Synatically valid Plotly python code."]
-#     prompt = ChatPromptTemplate.from_template(plot_prompt_template)
-#     structured_llm = llm.with_structured_output(PlotlyCodeOutput)
-#     chain = prompt | structured_llm
-#     result = chain.invoke({"input": user_input, "sql_query": sql_query})
-#     return result["code"]

climateqa/engine/talk_to_data/vanna_class.py DELETED Viewed

@@ -1,325 +0,0 @@
-from vanna.base import VannaBase
-from pinecone import Pinecone
-from climateqa.engine.embeddings import get_embeddings_function
-import pandas as pd
-import hashlib
-class MyCustomVectorDB(VannaBase):
-    """
-    VectorDB class for storing and retrieving vectors from Pinecone.
-    args :
-        config (dict) : Configuration dictionary containing the Pinecone API key and the index name :
-            - pc_api_key (str) : Pinecone API key
-            - index_name (str) : Pinecone index name
-            - top_k (int) : Number of top results to return (default = 2)
-    """
-    def __init__(self,config):
-        super().__init__(config = config)
-        try :
-            self.api_key = config.get('pc_api_key')
-            self.index_name = config.get('index_name')
-        except :
-            raise Exception("Please provide the Pinecone API key and the index name")
-        self.pc = Pinecone(api_key = self.api_key)
-        self.index = self.pc.Index(self.index_name)
-        self.top_k = config.get('top_k', 2)
-        self.embeddings = get_embeddings_function()
-    def check_embedding(self, id, namespace):
-        fetched = self.index.fetch(ids = [id], namespace = namespace)
-        if fetched['vectors'] == {}:
-            return False
-        return True
-    def generate_hash_id(self, data: str) -> str:
-        """
-        Generate a unique hash ID for the given data.
-        Args:
-            data (str): The input data to hash (e.g., a concatenated string of user attributes).
-        Returns:
-            str: A unique hash ID as a hexadecimal string.
-        """
-        data_bytes = data.encode('utf-8')
-        hash_object = hashlib.sha256(data_bytes)
-        hash_id = hash_object.hexdigest()
-        return hash_id
-    def add_ddl(self, ddl: str, **kwargs) -> str:
-        id = self.generate_hash_id(ddl) + '_ddl'
-        if self.check_embedding(id, 'ddl'):
-            print(f"DDL having id {id} already exists")
-            return id
-        self.index.upsert(
-            vectors = [(id, self.embeddings.embed_query(ddl), {'ddl': ddl})],
-            namespace = 'ddl'
-        )
-        return id
-    def add_documentation(self, doc: str, **kwargs) -> str:
-        id = self.generate_hash_id(doc) + '_doc'
-        if self.check_embedding(id, 'documentation'):
-            print(f"Documentation having id {id} already exists")
-            return id
-        self.index.upsert(
-            vectors = [(id, self.embeddings.embed_query(doc), {'doc': doc})],
-            namespace = 'documentation'
-        )
-        return id
-    def add_question_sql(self, question: str, sql: str, **kwargs) -> str:
-        id = self.generate_hash_id(question) + '_sql'
-        if self.check_embedding(id, 'question_sql'):
-            print(f"Question-SQL pair having id {id} already exists")
-            return id
-        self.index.upsert(
-            vectors = [(id, self.embeddings.embed_query(question + sql), {'question': question, 'sql': sql})],
-            namespace = 'question_sql'
-        )
-        return id
-    def get_related_ddl(self, question: str, **kwargs) -> list:
-        res = self.index.query(
-            vector=self.embeddings.embed_query(question),
-            top_k=self.top_k,
-            namespace='ddl',
-            include_metadata=True
-        )
-        return [match['metadata']['ddl'] for match in res['matches']]
-    def get_related_documentation(self, question: str, **kwargs) -> list:
-        res = self.index.query(
-            vector=self.embeddings.embed_query(question),
-            top_k=self.top_k,
-            namespace='documentation',
-            include_metadata=True
-        )
-        return [match['metadata']['doc'] for match in res['matches']]
-    def get_similar_question_sql(self, question: str, **kwargs) -> list:
-        res = self.index.query(
-            vector=self.embeddings.embed_query(question),
-            top_k=self.top_k,
-            namespace='question_sql',
-            include_metadata=True
-        )
-        return [(match['metadata']['question'], match['metadata']['sql']) for match in res['matches']]
-    def get_training_data(self, **kwargs) -> pd.DataFrame:
-        list_of_data = []
-        namespaces = ['ddl', 'documentation', 'question_sql']
-        for namespace in namespaces:
-            data = self.index.query(
-            top_k=10000,
-            namespace=namespace,
-            include_metadata=True,
-            include_values=False
-            )
-            for match in data['matches']:
-                list_of_data.append(match['metadata'])
-        return pd.DataFrame(list_of_data)
-    def remove_training_data(self, id: str, **kwargs) -> bool:
-        if id.endswith("_ddl"):
-            self.Index.delete(ids=[id], namespace="_ddl")
-            return True
-        if id.endswith("_sql"):
-            self.index.delete(ids=[id], namespace="_sql")
-            return True
-        if id.endswith("_doc"):
-            self.Index.delete(ids=[id], namespace="_doc")
-            return True
-        return False
-    def generate_embedding(self, text, **kwargs):
-        # Implement the method here
-        pass
-    def get_sql_prompt(
-            self,
-            initial_prompt : str,
-            question: str,
-            question_sql_list: list,
-            ddl_list: list,
-            doc_list: list,
-            **kwargs,
-        ):
-            """
-            Example:
-            ```python
-            vn.get_sql_prompt(
-                question="What are the top 10 customers by sales?",
-                question_sql_list=[{"question": "What are the top 10 customers by sales?", "sql": "SELECT * FROM customers ORDER BY sales DESC LIMIT 10"}],
-                ddl_list=["CREATE TABLE customers (id INT, name TEXT, sales DECIMAL)"],
-                doc_list=["The customers table contains information about customers and their sales."],
-            )
-            ```
-            This method is used to generate a prompt for the LLM to generate SQL.
-            Args:
-                question (str): The question to generate SQL for.
-                question_sql_list (list): A list of questions and their corresponding SQL statements.
-                ddl_list (list): A list of DDL statements.
-                doc_list (list): A list of documentation.
-            Returns:
-                any: The prompt for the LLM to generate SQL.
-            """
-            if initial_prompt is None:
-                initial_prompt = f"You are a {self.dialect} expert. " + \
-                "Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. "
-            initial_prompt = self.add_ddl_to_prompt(
-                initial_prompt, ddl_list, max_tokens=self.max_tokens
-            )
-            if self.static_documentation != "":
-                doc_list.append(self.static_documentation)
-            initial_prompt = self.add_documentation_to_prompt(
-                initial_prompt, doc_list, max_tokens=self.max_tokens
-            )
-            # initial_prompt = self.add_sql_to_prompt(
-            #     initial_prompt, question_sql_list, max_tokens=self.max_tokens
-            # )
-            initial_prompt += (
-                "===Response Guidelines \n"
-                "1. If the provided context is sufficient, please generate a valid SQL query without any explanations for the question. \n"
-                "2. If the provided context is almost sufficient but requires knowledge of a specific string in a particular column, please generate an intermediate SQL query to find the distinct strings in that column. Prepend the query with a comment saying intermediate_sql \n"
-                "3. If the provided context is insufficient, please give a sql query based on your knowledge and the context provided. \n"
-                "4. Please use the most relevant table(s). \n"
-                "5. If the question has been asked and answered before, please repeat the answer exactly as it was given before. \n"
-                f"6. Ensure that the output SQL is {self.dialect}-compliant and executable, and free of syntax errors. \n"
-                f"7. Add a description of the table in the result of the sql query, if relevant. \n"
-                "8 Make sure to include the relevant KPI in the SQL query. The query should return impactfull data \n"
-                # f"8. If a set of latitude,longitude is provided, make a intermediate query to find the nearest value in the table and replace the coordinates in the sql query. \n"
-                # "7. Add a description of the table in the result of the sql query."
-                # "7. If the question is about a specific latitude, longitude, query an interval of 0.3 and keep only the first set of coordinate. \n"
-                # "7. Table names should be included in the result of the sql query. Use for example Mean_winter_temperature AS table_name in the query \n"
-            )
-            message_log = [self.system_message(initial_prompt)]
-            for example in question_sql_list:
-                if example is None:
-                    print("example is None")
-                else:
-                    if example is not None and "question" in example and "sql" in example:
-                        message_log.append(self.user_message(example["question"]))
-                        message_log.append(self.assistant_message(example["sql"]))
-            message_log.append(self.user_message(question))
-            return message_log
-# def get_sql_prompt(
-#         self,
-#         initial_prompt : str,
-#         question: str,
-#         question_sql_list: list,
-#         ddl_list: list,
-#         doc_list: list,
-#         **kwargs,
-#     ):
-#         """
-#         Example:
-#         ```python
-#         vn.get_sql_prompt(
-#             question="What are the top 10 customers by sales?",
-#             question_sql_list=[{"question": "What are the top 10 customers by sales?", "sql": "SELECT * FROM customers ORDER BY sales DESC LIMIT 10"}],
-#             ddl_list=["CREATE TABLE customers (id INT, name TEXT, sales DECIMAL)"],
-#             doc_list=["The customers table contains information about customers and their sales."],
-#         )
-#         ```
-#         This method is used to generate a prompt for the LLM to generate SQL.
-#         Args:
-#             question (str): The question to generate SQL for.
-#             question_sql_list (list): A list of questions and their corresponding SQL statements.
-#             ddl_list (list): A list of DDL statements.
-#             doc_list (list): A list of documentation.
-#         Returns:
-#             any: The prompt for the LLM to generate SQL.
-#         """
-#         if initial_prompt is None:
-#             initial_prompt = f"You are a {self.dialect} expert. " + \
-#             "Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. "
-#         initial_prompt = self.add_ddl_to_prompt(
-#             initial_prompt, ddl_list, max_tokens=self.max_tokens
-#         )
-#         if self.static_documentation != "":
-#             doc_list.append(self.static_documentation)
-#         initial_prompt = self.add_documentation_to_prompt(
-#             initial_prompt, doc_list, max_tokens=self.max_tokens
-#         )
-#         initial_prompt += (
-#             "===Response Guidelines \n"
-#             "1. If the provided context is sufficient, please generate a valid SQL query without any explanations for the question. \n"
-#             "2. If the provided context is almost sufficient but requires knowledge of a specific string in a particular column, please generate an intermediate SQL query to find the distinct strings in that column. Prepend the query with a comment saying intermediate_sql \n"
-#             "3. If the provided context is insufficient, please explain why it can't be generated. \n"
-#             "4. Please use the most relevant table(s). \n"
-#             "5. If the question has been asked and answered before, please repeat the answer exactly as it was given before. \n"
-#             f"6. Ensure that the output SQL is {self.dialect}-compliant and executable, and free of syntax errors. \n"
-#         )
-#         message_log = [self.system_message(initial_prompt)]
-#         for example in question_sql_list:
-#             if example is None:
-#                 print("example is None")
-#             else:
-#                 if example is not None and "question" in example and "sql" in example:
-#                     message_log.append(self.user_message(example["question"]))
-#                     message_log.append(self.assistant_message(example["sql"]))
-#         message_log.append(self.user_message(question))
-#         return message_log

climateqa/engine/talk_to_data/workflow/drias.py CHANGED Viewed

@@ -125,11 +125,16 @@ async def drias_workflow(user_input: str) -> State:
                 'plot': plot,
                 'status': 'OK'
             }
-    # Gather all required parameters
     params = {}
-    for param_name in DRIAS_PLOT_PARAMETERS:
-        param = await find_param(state, param_name, mode='DRIAS')
         if param:
             params.update(param)

                 'plot': plot,
                 'status': 'OK'
             }
+    # Gather all required parameters in parallel
+    param_tasks = [
+            find_param(state, param_name, mode='DRIAS')
+            for param_name in DRIAS_PLOT_PARAMETERS
+        ]
+    param_results = await asyncio.gather(*param_tasks)
     params = {}
+    for param in param_results:
         if param:
             params.update(param)

climateqa/engine/talk_to_data/workflow/ipcc.py CHANGED Viewed

@@ -125,12 +125,17 @@ async def ipcc_workflow(user_input: str) -> State:
             }
     # Gather all required parameters
     params = {}
-    for param_name in IPCC_PLOT_PARAMETERS:
-        param = await find_param(state, param_name, mode='IPCC')
         if param:
             params.update(param)
     # Process all outputs in parallel using process_output
     tasks = [
         process_output(output_title, output['table'], output['plot'], params.copy())
@@ -152,10 +157,18 @@ async def ipcc_workflow(user_input: str) -> State:
     # Set error messages if needed
     if not errors['have_relevant_table']:
-        state['error'] = "There is no relevant table in our database to answer your question"
     elif not errors['have_sql_query']:
-        state['error'] = "There is no relevant sql query on our database that can help to answer your question"
     elif not errors['have_dataframe']:
-        state['error'] = "There is no data in our table that can answer to your question"
     return state

             }
     # Gather all required parameters
+    param_tasks = [
+            find_param(state, param_name, mode='IPCC')
+            for param_name in IPCC_PLOT_PARAMETERS
+        ]
+    param_results = await asyncio.gather(*param_tasks)
     params = {}
+    for param in param_results:
         if param:
             params.update(param)
     # Process all outputs in parallel using process_output
     tasks = [
         process_output(output_title, output['table'], output['plot'], params.copy())
     # Set error messages if needed
     if not errors['have_relevant_table']:
+        state['error'] = (
+            "Sorry, I couldn't find any relevant table in our database to answer your question.\n"
+            "Try asking about a different climate indicator like temperature or precipitation."
+        )
     elif not errors['have_sql_query']:
+        state['error'] = (
+            "Sorry, I couldn't generate a relevant SQL query to answer your question.\n"
+            "Try rephrasing your question to focus on a specific location, a year, or a month."
+        )
     elif not errors['have_dataframe']:
+        state['error'] = (
+            "Sorry, there is no data in our tables that can answer your question.\n"
+            "Try asking about a more common location, or a different year."
+        )
     return state

climateqa/engine/vectorstore.py CHANGED Viewed

@@ -1,11 +1,11 @@
-# Pinecone
-# More info at https://docs.pinecone.io/docs/langchain
-# And https://python.langchain.com/docs/integrations/vectorstores/pinecone
 import os
-from pinecone import Pinecone
-from langchain_community.vectorstores import Pinecone as PineconeVectorstore
-# LOAD ENVIRONMENT VARIABLES
 try:
     from dotenv import load_dotenv
     load_dotenv()
@@ -13,44 +13,136 @@ except:
     pass
-def get_pinecone_vectorstore(embeddings,text_key = "content", index_name = os.getenv("PINECONE_API_INDEX")):
-    # # initialize pinecone
-    # pinecone.init(
-    #     api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
-    #     environment=os.getenv("PINECONE_API_ENVIRONMENT"),  # next to api key in console
-    # )
-    # index_name = os.getenv("PINECONE_API_INDEX")
-    # vectorstore = Pinecone.from_existing_index(index_name, embeddings,text_key = text_key)
-    # return vectorstore
-    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
-    index = pc.Index(index_name)
-    vectorstore = PineconeVectorstore(
-        index, embeddings, text_key,
     )
-    return vectorstore
-# def get_pinecone_retriever(vectorstore,k = 10,namespace = "vectors",sources = ["IPBES","IPCC"]):
-#     assert isinstance(sources,list)
-#     # Check if all elements in the list are either IPCC or IPBES
-#     filter = {
-#         "source": { "$in":sources},
-#     }
-#     retriever = vectorstore.as_retriever(search_kwargs={
-#         "k": k,
-#         "namespace":"vectors",
-#         "filter":filter
-#     })
-#     return retriever

+# Azure AI Search: https://python.langchain.com/docs/integrations/vectorstores/azuresearch
 import os
+# Azure AI Search imports
+from langchain_community.vectorstores.azuresearch import AzureSearch
+# Load environment variables
 try:
     from dotenv import load_dotenv
     load_dotenv()
     pass
+class AzureSearchWrapper:
+    """
+    Wrapper class for Azure AI Search vectorstore to handle filter conversion.
+    This wrapper automatically converts dictionary-style filters to Azure Search OData filter format,
+    ensuring seamless compatibility when switching from other providers.
+    """
+    def __init__(self, azure_search_vectorstore):
+        self.vectorstore = azure_search_vectorstore
+    def __getattr__(self, name):
+        """Delegate all other attributes to the wrapped vectorstore."""
+        return getattr(self.vectorstore, name)
+    def _convert_dict_filter_to_odata(self, filter_dict):
+        """
+        Convert dictionary-style filters to Azure Search OData filter format.
+        Args:
+            filter_dict (dict): Dictionary-style filter
+        Returns:
+            str: OData filter string
+        """
+        if not filter_dict:
+            return None
+        conditions = []
+        for key, value in filter_dict.items():
+            if key.endswith('_exclude'):
+                # Handle exclusion filters (e.g., report_type_exclude)
+                base_key = key.replace('_exclude', '')
+                if isinstance(value, list):
+                    if len(value) == 1:
+                        conditions.append(f"{base_key} ne '{value[0]}'")
+                    else:
+                        exclude_conditions = [f"{base_key} ne '{v}'" for v in value]
+                        conditions.append(f"({' and '.join(exclude_conditions)})")
+                else:
+                    conditions.append(f"{base_key} ne '{value}'")
+            elif isinstance(value, list):
+                # Handle list values (equivalent to $in operator)
+                if len(value) == 1:
+                    conditions.append(f"{key} eq '{value[0]}'")
+                else:
+                    list_conditions = [f"{key} eq '{v}'" for v in value]
+                    conditions.append(f"({' or '.join(list_conditions)})")
+            else:
+                # Handle single values
+                conditions.append(f"{key} eq '{value}'")
+        return " and ".join(conditions) if conditions else None
+    def similarity_search_with_score(self, query, k=4, filter=None, **kwargs):
+        """Override similarity_search_with_score to convert filters."""
+        if filter is not None:
+            filter = self._convert_dict_filter_to_odata(filter)
+        return self.vectorstore.hybrid_search_with_score(
+            query=query, k=k, filters=filter, **kwargs
+        )
+    def similarity_search(self, query, k=4, filter=None, **kwargs):
+        """Override similarity_search to convert filters."""
+        if filter is not None:
+            filter = self._convert_dict_filter_to_odata(filter)
+        return self.vectorstore.similarity_search(
+            query=query, k=k, filter=filter, **kwargs
+        )
+    def similarity_search_by_vector(self, embedding, k=4, filter=None, **kwargs):
+        """Override similarity_search_by_vector to convert filters."""
+        if filter is not None:
+            filter = self._convert_dict_filter_to_odata(filter)
+        return self.vectorstore.similarity_search_by_vector(
+            embedding=embedding, k=k, filter=filter, **kwargs
+        )
+    def as_retriever(self, search_type="similarity", search_kwargs=None, **kwargs):
+        """Override as_retriever to handle filter conversion in search_kwargs."""
+        if search_kwargs and "filter" in search_kwargs:
+            # Convert the filter in search_kwargs
+            search_kwargs = search_kwargs.copy()  # Don't modify the original
+            if search_kwargs["filter"] is not None:
+                search_kwargs["filter"] = self._convert_dict_filter_to_odata(search_kwargs["filter"])
+        return self.vectorstore.as_retriever(
+            search_type=search_type, search_kwargs=search_kwargs, **kwargs
+        )
+def get_azure_search_vectorstore(embeddings, text_key="content", index_name=None):
+    """
+    Create an Azure AI Search vectorstore instance.
+    Args:
+        embeddings: The embeddings function to use
+        text_key: The key for text content in the payload (default: "content")
+        index_name: The name of the Azure Search index
+    Returns:
+        AzureSearchWrapper: A wrapped Azure AI Search vectorstore instance with filter compatibility
+    """
+    # Get Azure AI Search configuration from environment variables
+    azure_search_endpoint = os.getenv("AI_SEARCH_INDEX_ENDPOINT")
+    azure_search_key = os.getenv("AI_SEARCH_KEY")
+    if not azure_search_endpoint:
+        raise ValueError("AI_SEARCH_INDEX_ENDPOINT environment variable is required")
+    if not azure_search_key:
+        raise ValueError("AI_SEARCH_KEY environment variable is required")
+    if not index_name:
+        raise ValueError("index_name must be provided for Azure Search")
+    # Create Azure Search vectorstore
+    vectorstore = AzureSearch(
+        azure_search_endpoint=azure_search_endpoint,
+        azure_search_key=azure_search_key,
+        index_name=index_name,
+        embedding_function=embeddings.embed_query,
+        content_key=text_key,
     )
+    # Wrap the vectorstore to handle filter conversion
+    return AzureSearchWrapper(vectorstore)

climateqa/utils.py CHANGED Viewed

@@ -25,7 +25,7 @@ def remove_duplicates_keep_highest_score(documents):
     unique_docs = {}
     for doc in documents:
-        doc_id = doc.metadata.get('doc_id')
         if doc_id in unique_docs:
             if doc.metadata['reranking_score'] > unique_docs[doc_id].metadata['reranking_score']:
                 unique_docs[doc_id] = doc

     unique_docs = {}
     for doc in documents:
+        doc_id = doc.metadata.get('id')
         if doc_id in unique_docs:
             if doc.metadata['reranking_score'] > unique_docs[doc_id].metadata['reranking_score']:
                 unique_docs[doc_id] = doc

front/tabs/tab_ipcc.py CHANGED Viewed

@@ -68,6 +68,8 @@ def show_filter_by_scenario(table_names, index_state, dataframes):
         return gr.update(visible=False)
 def filter_by_scenario(dataframes, figures, table_names, index_state, scenario):
     df = dataframes[index_state]
     if not table_names[index_state].startswith("Map"):
         return df, figures[index_state](df)

         return gr.update(visible=False)
 def filter_by_scenario(dataframes, figures, table_names, index_state, scenario):
+    if len(dataframes) == 0:
+        return None, None
     df = dataframes[index_state]
     if not table_names[index_state].startswith("Map"):
         return df, figures[index_state](df)

requirements.txt CHANGED Viewed

@@ -1,6 +1,9 @@
 gradio==5.0.2
 azure-storage-file-share==12.11.1
 azure-storage-blob==12.23.0
 python-dotenv==1.0.0
 langchain==0.2.1
 langchain_openai==0.1.7

 gradio==5.0.2
 azure-storage-file-share==12.11.1
 azure-storage-blob==12.23.0
+# Azure AI Search support
+azure-search-documents>=11.4.0
+azure-core>=1.29.0
 python-dotenv==1.0.0
 langchain==0.2.1
 langchain_openai==0.1.7

sandbox/20241104 - CQA - StepByStep CQA.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

style.css CHANGED Viewed

@@ -661,7 +661,6 @@ a {
 #sql-query textarea{
     min-height: 200px !important;
 }
 #sql-query span{

 #sql-query textarea{
     min-height: 200px !important;
 }
 #sql-query span{