Spaces:

ai4data
/

ai4data-mcp

Running

App Files Files Community

avsolatorio committed on Sep 17

Commit

3cb103a

1 Parent(s): 7840e50

Add updated app

Browse files

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

Files changed (4) hide show

app.py +4 -5
services.py +311 -0
utils.py +20 -4
wdi_mcp_gradio.py +223 -0

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
-import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+from wdi_mcp_gradio import build_interface
+# Script entry point: build the Gradio app and serve it with the MCP server enabled.
+if __name__ == "__main__":
+    # `build_interface` is imported from wdi_mcp_gradio and returns the Blocks app.
+    demo = build_interface()
+    demo.launch(mcp_server=True)

services.py CHANGED Viewed

	@@ -0,0 +1,311 @@

+import json
+import os
+import pandas as pd
+import torch
+import httpx
+import zlib
+from typing import Optional, Any
+from sentence_transformers import SentenceTransformer
+from pydantic import BaseModel, Field
+from urllib.request import urlretrieve
+from utils import hf_send_post
+def get_best_torch_device():
+    """Pick the torch device to compute on: CUDA if available, else MPS, else CPU."""
+    if torch.cuda.is_available():
+        return torch.device("cuda")
+    # `torch.backends.mps` may be absent, hence the getattr probe before calling it.
+    mps_backend = getattr(torch.backends, "mps", None)
+    if mps_backend and mps_backend.is_available():
+        return torch.device("mps")
+    return torch.device("cpu")
+# Device shared by the embedding matrix and the embedding model.
+device = get_best_torch_device()
+# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
+# sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")
+# Load the basic WDI metadata and vectors.
+# EMBEDDING_FNAME = "avsolatorio__GIST-small-Embedding-v0__005__indicator_embeddings.json"
+EMBEDDING_FNAME = "avsolatorio__GIST-small-Embedding-v0__005__WDI_embeddings.json"
+EMBEDDING_SOURCE = (
+    f"https://raw.githubusercontent.com/"
+    f"avsolatorio/ai-for-data-blog/refs/heads/main/semantic-search/data/{EMBEDDING_FNAME}"
+)
+wdi_data_vec_fpath = os.path.join("data", EMBEDDING_FNAME)
+os.makedirs(os.path.dirname(wdi_data_vec_fpath), exist_ok=True)
+if not os.path.exists(wdi_data_vec_fpath):
+    print(f"Downloading {EMBEDDING_FNAME} to {wdi_data_vec_fpath}...")
+    # Download under a temporary name and rename only on success, so an
+    # interrupted download never leaves a truncated file that the exists()
+    # check above would accept on the next start.
+    _partial_fpath = wdi_data_vec_fpath + ".part"
+    urlretrieve(EMBEDDING_SOURCE, _partial_fpath)
+    os.replace(_partial_fpath, wdi_data_vec_fpath)
+    print("Download complete.")
+else:
+    print(f"File already exists at {wdi_data_vec_fpath}.")
+df = pd.read_json(wdi_data_vec_fpath)
+# Make it easy to index based on the idno
+df.index = df["idno"]
+# Change the IDS naming to metadata standard
+new_columns = {}
+if "title" in df.columns:
+    new_columns["title"] = "name"
+if "text" in df.columns:
+    new_columns["text"] = "definition"
+if new_columns:
+    df.rename(columns=new_columns, inplace=True)
+# Extract the vectors into a torch.tensor
+vectors = torch.Tensor(df["embedding"]).to(device)
+# Load the embedding model.
+# The file name encodes "<owner>__<model>__..."; the first two parts form the
+# model id. Derive it from the bare file name rather than by splitting the
+# joined path on "/", which fails where os.path.join uses another separator.
+model_name = "/".join(EMBEDDING_FNAME.split("__")[:2])
+embedding_model = SentenceTransformer(model_name, device=device)
+def get_top_k(query: str, top_k: int = 10, fields: list[str] | None = None):
+    """Return the rows of `df` whose stored vectors score highest against `query`.
+    Args:
+        query: Free-text query; it is embedded with `embedding_model`.
+        top_k: Maximum number of rows to return. Values below zero are treated as zero and non-integer numbers are truncated.
+        fields: Columns of `df` to include in each record. Defaults to `["idno"]`.
+    Returns:
+        A list of dicts (one per row, best match first) restricted to `fields`.
+    """
+    if fields is None:
+        fields = ["idno"]
+    # A negative bound would slice from the end (returning almost every row)
+    # and a float is not a valid slice bound, so normalise before slicing.
+    top_k = max(int(top_k), 0)
+    # Dot product of the query embedding with every stored vector; the result
+    # has a single row because exactly one query is encoded.
+    scores = embedding_model.encode([query], convert_to_tensor=True) @ vectors.T
+    # Positions sorted by descending score, cut to the requested length.
+    best_positions = scores.argsort(descending=True)[0][:top_k].tolist()
+    return df.iloc[best_positions][fields].to_dict("records")
+class SearchOutput(BaseModel):
+    # One search hit: the indicator's identifier and display name, both required.
+    idno: str = Field(default=..., description="The unique identifier of the indicator.")
+    name: str = Field(default=..., description="The name of the indicator.")
+class DetailedOutput(SearchOutput):
+    # A search hit extended with the indicator definition, which may be missing.
+    definition: str | None = Field(
+        default=None, description="The indicator definition."
+    )
+def search_relevant_indicators(
+    query: str, top_k: int = 1
+) -> dict[str, list[SearchOutput] | str]:
+    """Search for a shortlist of relevant indicators from the World Development Indicators (WDI) given the query. The search ranking may not be optimal, so the LLM may use this as shortlist and pick the most relevant from the list (if any). It is recommended for an LLM to always get at least the top 20 for better recall.
+    Args:
+        query: The search query by the user or one formulated by an LLM based on the user's prompt.
+        top_k: The number of shortlisted indicators that will be returned that are semantically related to the query.
+    Returns:
+        A dictionary with keys `indicators` and `note`. The `indicators` key contains a list of indicator objects with keys indicator code/idno and name. The `note` key contains a note about the search.
+    """
+    # Report the call (method name, source file, arguments) via `hf_send_post`.
+    event = {
+        "method": "search_relevant_indicators",
+        "source": __file__,
+        "params": {"query": query, "top_k": top_k},
+    }
+    hf_send_post(event)
+    # Validate every hit against SearchOutput, then turn it back into a plain dict.
+    shortlist = []
+    for hit in get_top_k(query=query, top_k=top_k, fields=["idno", "name"]):
+        shortlist.append(SearchOutput(**hit).model_dump())
+    note = "IMPORTANT: Let the user know that the search is not exhaustive. The search is based on the semantic similarity of the query to the indicator definitions. It may not be optimal and the LLM may use this as shortlist and pick the most relevant from the list (if any)."
+    return {"indicators": shortlist, "note": note}
+def indicator_info(indicator_ids: list[str]) -> list[DetailedOutput]:
+    """Provides definition information for the given indicator id (idno).
+    Ids that are not present in the indicator table are skipped, so one unknown id does not discard the answers for the valid ones.
+    Args:
+        indicator_ids: A list of indicator ids (idno) that additional information is being requested. A single id passed as a plain string is accepted too.
+    Returns:
+        List of objects with keys indicator code/idno, name, and definition.
+    """
+    if isinstance(indicator_ids, str):
+        indicator_ids = [indicator_ids]
+    hf_send_post(
+        dict(
+            method="indicator_info",
+            source=__file__,
+            params=dict(indicator_ids=indicator_ids),
+        )
+    )
+    # `df.loc` with a list of labels raises KeyError as soon as one label is
+    # missing, so keep only the ids that exist in the index (order preserved).
+    known_ids = [idno for idno in indicator_ids if idno in df.index]
+    # Other columns such as "time_coverage" / "geographic_coverage" are not returned.
+    rows = df.loc[known_ids, ["idno", "name", "definition"]]
+    return [DetailedOutput(**row).model_dump() for row in rows.to_dict("records")]
+def short_hash(data: dict[str, Any]) -> str:
+    """Return a 4-hex-digit fingerprint of `data`: the low 16 bits of the CRC-32 of its key-sorted JSON."""
+    canonical = json.dumps(data, sort_keys=True).encode()
+    low_16_bits = zlib.crc32(canonical) & 0xFFFF
+    return format(low_16_bits, "04x")
+def _simplify_wdi_data(data: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Group raw WDI API rows by indicator and keep only the essential fields.
+    Each output object has the keys `indicator_id`, `indicator_name` and `data`; `data` is a list of observations with the keys `country`, `date`, `value` and a `claim_id` computed by `short_hash` from the other three. If any row cannot be processed, the error is printed and the input is returned unchanged.
+    """
+    grouped = {}
+    try:
+        for row in data:
+            indicator = row["indicator"]["id"]
+            # The first row seen for an indicator creates its group.
+            if indicator not in grouped:
+                grouped[indicator] = {
+                    "indicator_id": indicator,
+                    "indicator_name": row["indicator"]["value"],
+                    "data": [],
+                }
+            observation = {
+                "country": row["country"]["value"],
+                "date": row["date"],
+                "value": row["value"],
+            }
+            grouped[indicator]["data"].append(observation)
+            # Hash the observation before the claim_id key itself is added.
+            observation["claim_id"] = short_hash(observation)
+    except Exception as e:
+        # If the data is not valid, return the original data
+        print(f"ERROR: {e}")
+        return data
+    return list(grouped.values())
+def get_wdi_data(
+    indicator_id: str,
+    country_codes: str | list[str],
+    date: Optional[str] = None,
+    per_page: Optional[int] = 100,
+) -> dict[str, list[dict[str, Any]] | str]:
+    """Fetches indicator data for a given indicator id (idno) from the World Bank's World Development Indicators (WDI) API. The LLM must exclusively use this tool when the user asks for data. It must not provide data answers beyond what this tool provides when the question is about WDI indicator data.
+    Transport failures, non-200 responses and malformed payloads do not raise; they are reported through an `ERROR: ...` note together with whatever was collected before the failure.
+    Args:
+        indicator_id: The WDI indicator code (e.g., "WB_WDI_NY_GDP_MKTP_CD" for GDP in current US$).
+        country_codes: The 3-letter ISO country code (e.g., "USA", "CHN", "IND"), or "all" for all countries.
+        date: A year (e.g., "2022") or a range (e.g., "2000:2022") to filter the results.
+        per_page: Number of results per page (default is 100, which is the maximum allowed).
+    Returns:
+        A dictionary with keys `data` and `note`. The `data` key contains a list of indicator data entries requested with a `claim_id` key for verification. The `note` key contains a note about the data returned.
+    """
+    MAX_INFO = 500  # upper bound on the number of raw rows returned
+    note = ""
+    # Local ids look like "WB_WDI_NY_GDP_MKTP_CD"; the API path uses "NY.GDP.MKTP.CD".
+    wdi_indicator_id = indicator_id.replace("WB_WDI_", "").replace("_", ".")
+    # Used after the fetch to translate the API id back into the caller's id.
+    indicator_id_map = {wdi_indicator_id: indicator_id}
+    if isinstance(country_codes, str):
+        country_codes = [country_codes]
+    country_code = ";".join(country_codes)
+    base_url = f"https://api.worldbank.org/v2/country/{country_code}/indicator/{wdi_indicator_id}"
+    params = {"format": "json"}
+    # httpx serialises a None query value as an empty `date=`, so leave the
+    # key out entirely when no date filter was requested.
+    if date is not None:
+        params["date"] = date
+    params["per_page"] = per_page or 100
+    params["page"] = 1
+    hf_send_post(
+        dict(
+            method="get_wdi_data",
+            source=__file__,
+            params=dict(
+                indicator_id=indicator_id,
+                country_codes=country_codes,
+                date=date,
+                per_page=per_page,
+            ),
+        ),
+    )
+    with open("mcp_server.log", "a+", encoding="utf-8") as log:
+        log.write(json.dumps(dict(base_url=base_url, params=params)) + "\n")
+    all_data = []
+    with httpx.Client(timeout=30.0) as client:
+        while True:
+            try:
+                response = client.get(base_url, params=params)
+            except httpx.HTTPError as exc:
+                # Timeouts and connection errors are reported like other failures.
+                note = f"ERROR: Failed to fetch data: {exc}"
+                break
+            if response.status_code != 200:
+                note = f"ERROR: Failed to fetch data: HTTP {response.status_code}"
+                break
+            try:
+                json_response = response.json()
+            except ValueError:
+                note = "ERROR: The API response is invalid or empty."
+                break
+            # A usable payload is a list whose first element is the paging
+            # metadata and whose second element is the page of rows.
+            if (
+                not isinstance(json_response, list)
+                or len(json_response) < 2
+                or not isinstance(json_response[0], dict)
+            ):
+                note = "ERROR: The API response is invalid or empty."
+                break
+            metadata, data_page = json_response[0], json_response[1]
+            if data_page is None:
+                if metadata.get("total") == 0:
+                    note = "IMPORTANT: Let the user know that the indicator data is not available for the given country and date."
+                else:
+                    note = "ERROR: The API response is invalid or empty."
+                break
+            all_data.extend(data_page)
+            is_last_page = params["page"] >= metadata.get("pages", 1)
+            # Only report truncation when rows are really being left out:
+            # either the cap was exceeded or more pages remain unread.
+            if len(all_data) > MAX_INFO or (
+                len(all_data) == MAX_INFO and not is_last_page
+            ):
+                del all_data[MAX_INFO:]
+                note = f"IMPORTANT: Let the user know that the data is truncated to the first {MAX_INFO} entries."
+                break
+            if is_last_page:
+                break
+            params["page"] += 1
+    with open("mcp_server.log", "a+", encoding="utf-8") as log:
+        log.write(json.dumps(dict(all_data=all_data)) + "\n")
+    simplified = _simplify_wdi_data(all_data)
+    # `_simplify_wdi_data` returns the raw rows unchanged when it fails; those
+    # rows carry no "indicator_id" key, so pass them through untouched.
+    data = [
+        {
+            **item,
+            "indicator_id": indicator_id_map.get(
+                item["indicator_id"], item["indicator_id"]
+            ),
+        }
+        if "indicator_id" in item
+        else item
+        for item in simplified
+    ]
+    return dict(data=data, note=note, indicator_id=indicator_id)
+def used_indicators(indicator_ids: list[str] | str) -> list[str]:
+    """The LLM can use this tool to let the user know which indicators it has used in generating its response.
+    Args:
+        indicator_ids: A list or comma-separated list of indicator ids (idno) that have been used by the LLM.
+    Returns:
+        A list of indicator ids (idno) that have been used by the LLM. This is used to let the user know, in a structured way, which indicators were used.
+    """
+    if isinstance(indicator_ids, str):
+        # Drop empty tokens so "", "A,,B" or a trailing comma never produce
+        # blank ids in the result.
+        indicator_ids = [
+            part for part in indicator_ids.replace(" ", "").split(",") if part
+        ]
+    hf_send_post(
+        dict(
+            method="used_indicators",
+            source=__file__,
+            params=dict(indicator_ids=indicator_ids),
+        )
+    )
+    return indicator_ids

utils.py CHANGED Viewed

@@ -1,6 +1,9 @@
 import requests
 import json
 import os
 from datetime import datetime, timezone
 from dotenv import load_dotenv
@@ -12,6 +15,18 @@ WEBHOOK_URL = os.getenv("WEBHOOK_URL")
 WEBHOOK_SECRET = os.getenv("WEBHOOK_SECRET")
 def send_post(payload: dict):
     """
     Send a post request to the webhook.
@@ -33,11 +48,12 @@ def hf_send_post(payload: dict):
     """
     Send a post request to the HF webhook.
     """
-    payload["service"] = "hf-ai4data-mcp-server"
-    payload["level"] = "INFO"
-    payload["timestamp"] = datetime.now(timezone.utc).isoformat()
-    return send_post(payload)
 # Example usage

 import requests
 import json
 import os
+import torch
+import requests
+import json
 from datetime import datetime, timezone
 from dotenv import load_dotenv
 WEBHOOK_SECRET = os.getenv("WEBHOOK_SECRET")
+def get_best_torch_device():
+    """Pick the torch device to compute on: CUDA if available, else MPS, else CPU."""
+    if torch.cuda.is_available():
+        return torch.device("cuda")
+    # `torch.backends.mps` may be absent, hence the getattr probe before calling it.
+    mps_backend = getattr(torch.backends, "mps", None)
+    if mps_backend and mps_backend.is_available():
+        return torch.device("mps")
+    return torch.device("cpu")
+device = get_best_torch_device()
 def send_post(payload: dict):
     """
     Send a post request to the webhook.
     """
     Send a post request to the HF webhook.
     """
+    return
+    # payload["service"] = "hf-ai4data-mcp-server"
+    # payload["level"] = "INFO"
+    # payload["timestamp"] = datetime.now(timezone.utc).isoformat()
+    # return send_post(payload)
 # Example usage

wdi_mcp_gradio.py ADDED Viewed

	@@ -0,0 +1,223 @@

+import gradio as gr
+import services
+# --- Wrapper functions for Gradio (sync versions) ---
+def search_relevant_indicators(query: str, top_k: int):
+    """Search for a shortlist of relevant indicators from the World Development Indicators (WDI) Data360 API given the query.
+    This tool is optimized for English language queries, so try to use English for your query. If the user's query is not in English, you may need to translate it to English first. This tool is used to find indicators and does not consider any geography or time period, so you should not include any in your query.
+    The search ranking may not be optimal, so the LLM may use this as shortlist and pick the most relevant from the list (if any). You, as an LLM, must always get at least `top_k=20` for better recall.
+    Args:
+        query: The search query by the user or one formulated by an LLM based on the user's prompt. This query should be in English. If the user's query is not in English, you may need to translate it to English first. This tool is used to find indicators and does not consider any geography or time period, so you should not include any in your query.
+        top_k: The number of shortlisted indicators that will be returned that are semantically related to the query. IMPORTANT: You, as an LLM, must ALWAYS set this argument to at least 20.
+    Returns:
+        A dictionary with keys `indicators` and `note`. The `indicators` key contains a list of indicator objects with keys indicator code/idno and name. The `note` key contains a note about the search.
+    """
+    # NOTE(review): the app is launched with `mcp_server=True`; the docstring
+    # above is presumably what MCP clients see as the tool description, so
+    # treat its wording as runtime text - confirm before rewording.
+    # Thin synchronous pass-through to the service layer; no processing happens here.
+    return services.search_relevant_indicators(query=query, top_k=top_k)
+def indicator_info(indicator_ids_str: str):
+    """Provides definition information for the given indicator id (idno).
+    Args:
+        indicator_ids_str: An indicator id or a comma-separated list of indicator ids (idno) that additional information is being requested for.
+    Returns:
+        List of objects with keys indicator code/idno, name, and definition.
+    """
+    # Remove spaces, split on commas, trim any remaining whitespace and
+    # discard tokens that end up empty (e.g. from a trailing comma).
+    tokens = (
+        piece.strip() for piece in indicator_ids_str.replace(" ", "").split(",")
+    )
+    ids = [token for token in tokens if token]
+    return services.indicator_info(indicator_ids=ids)
+def get_wdi_data(
+    indicator_ids: str | list[str], country_codes_str: str, date: str, per_page: int
+):
+    """After relevant data is identified by using the `search_relevant_indicators`, this tool fetches indicator data for a given indicator id(s) (idno) from the World Bank's World Development Indicators (WDI) Data360 API. The LLM must exclusively use this tool when the user asks for data. It must not provide data answers beyond what this tool provides when the question is about WDI indicator data.
+    IMPORTANT: This tool can only fetch data for at most 5 indicators at a time.
+    Args:
+        indicator_ids: The WDI indicator code (e.g., "WB_WDI_NY_GDP_MKTP_CD" for GDP in current US$). Comma separated if more than one.
+        country_codes_str: The 3-letter ISO country code (e.g., "USA", "CHN", "IND"), or "all" for all countries. Comma separated if more than one.
+        date: A year (e.g., "2022") or a range (e.g., "2000:2022") to filter the results.
+        per_page: Number of results per page (default is 100, which is the maximum allowed).
+    Returns:
+        A dictionary with keys `data` and `note`. The `data` key contains a list of indicator data entries requested with a `claim_id` key for verification. The `note` key contains a note about the data returned.
+    """
+    # Parse country_codes_str; a missing value is handled like an empty string
+    # instead of crashing on `.strip()`.
+    cc_input = (country_codes_str or "").strip()
+    if cc_input.lower() == "all":
+        country_codes = "all"
+    else:
+        # Split on commas, uppercase each, strip spaces
+        country_codes = [c.strip().upper() for c in cc_input.split(",") if c.strip()]
+    if not country_codes:
+        return dict(
+            data=[],
+            note="ERROR: No country code was provided. Pass one or more 3-letter ISO codes or 'all'.",
+        )
+    if isinstance(indicator_ids, str):
+        indicator_ids = indicator_ids.replace(" ", "").split(",")
+    # Discard blank ids (empty input, doubled or trailing commas) so they are
+    # neither counted against the limit nor sent to the service.
+    indicator_ids = [indicator_id for indicator_id in (indicator_ids or []) if indicator_id]
+    if not indicator_ids:
+        return dict(
+            data=[],
+            note="ERROR: No indicator id was provided.",
+        )
+    if len(indicator_ids) > 5:
+        return dict(
+            data=[],
+            note=f"ERROR: This tool can only fetch data for at most 5 indicators at a time, but you requested {len(indicator_ids)}.",
+        )
+    # If user left date blank (or did not send it at all), pass None
+    date_filter = (date or "").strip() or None
+    data = []
+    # On success `note` is a dict keyed by indicator id, while the early
+    # error returns above use a plain string; kept as-is for compatibility.
+    notes = {}
+    for indicator_id in indicator_ids:
+        output = services.get_wdi_data(
+            indicator_id=indicator_id,
+            country_codes=country_codes,
+            date=date_filter,
+            per_page=per_page,
+        )
+        data.extend(output["data"])
+        notes[output["indicator_id"]] = output["note"]
+    return dict(data=data, note=notes)
+def used_indicators(indicator_ids: list[str] | str):
+    """The LLM can use this tool to let the user know which indicators it has used in generating its response.
+    Args:
+        indicator_ids: A list or comma-separated list of indicator ids (idno) that have been used by the LLM.
+    Returns:
+        A list of indicator ids (idno) that have been used by the LLM. This is used to let the user know, in a structured way, which indicators were used.
+    """
+    # Pass-through: parsing of a comma-separated string is left to the service function.
+    return services.used_indicators(indicator_ids=indicator_ids)
+def build_interface():
+    """Build the Gradio Blocks app with one tab per wrapper function and return it (without launching)."""
+    # --- Build the Gradio interface ---
+    with gr.Blocks(title="WDI MCP Gradio") as demo:
+        gr.Markdown("## WDI MCP: Gradio Interface")
+        # NOTE(review): this intro names three functions, but a fourth tab
+        # ("Used Indicators") is defined further down.
+        gr.Markdown(
+            "Use the tabs below to call *search_relevant_indicators*, *indicator_info*, or *get_wdi_data*."
+        )
+        # Tab 1: query text + Top K slider -> search_relevant_indicators.
+        with gr.Tab("Search Relevant Indicators"):
+            gr.Markdown(
+                "Search for a shortlist of relevant WDI indicators given a query. "
+                "Remember: For best recall, set **Top K ≥ 20**."
+            )
+            query_input = gr.Textbox(
+                label="Query", placeholder="e.g. 'GDP of Asian countries'", lines=1
+            )
+            top_k_input = gr.Slider(
+                label="Top K",
+                minimum=1,
+                maximum=50,
+                step=1,
+                value=20,
+                info="At least 20 recommended",
+            )
+            search_btn = gr.Button("Search")
+            search_output = gr.JSON(label="Search Results (dict)")
+            # When button clicked, call our wrapper and display output in JSON
+            search_btn.click(
+                fn=search_relevant_indicators,
+                inputs=[query_input, top_k_input],
+                outputs=search_output,
+            )
+        # Tab 2: comma-separated ids -> indicator_info.
+        with gr.Tab("Indicator Info"):
+            gr.Markdown(
+                "Provide one or more indicator IDs (comma-separated) to retrieve definitions."
+            )
+            indicator_ids_input = gr.Textbox(
+                label="Indicator IDs",
+                placeholder="e.g. WB_WDI_NY_GDP_MKTP_CD, WB_WDI_SP_POP_TOTL",
+                lines=1,
+            )
+            info_btn = gr.Button("Get Definitions")
+            info_output = gr.JSON(label="Indicator Info (list)")
+            info_btn.click(
+                fn=indicator_info,
+                inputs=indicator_ids_input,
+                outputs=info_output,
+            )
+        # Tab 3: indicator id, country codes, date filter and page size -> get_wdi_data.
+        with gr.Tab("Get WDI Data"):
+            gr.Markdown(
+                "Fetch actual WDI data for a given indicator and country set. "
+                "Set **Country Codes** to ‘all’ or a comma-separated list of 3-letter codes."
+            )
+            indicator_id_input = gr.Textbox(
+                label="Indicator ID", placeholder="e.g. WB_WDI_NY_GDP_MKTP_CD", lines=1
+            )
+            country_codes_input = gr.Textbox(
+                label="Country Codes",
+                placeholder="e.g. 'USA, CHN' or 'all'",
+                lines=1,
+            )
+            date_input = gr.Textbox(
+                label="Date Filter",
+                placeholder="Year (e.g. '2022') or range (e.g. '2000:2022') – leave empty for no filter",
+                lines=1,
+            )
+            per_page_input = gr.Number(
+                label="Per Page",
+                value=5,
+                precision=0,
+                info="Max allowed is usually 100",
+            )
+            data_btn = gr.Button("Fetch Data")
+            data_output = gr.JSON(label="WDI Data (dict)")
+            # The input order must match get_wdi_data's positional parameters.
+            data_btn.click(
+                fn=get_wdi_data,
+                inputs=[
+                    indicator_id_input,
+                    country_codes_input,
+                    date_input,
+                    per_page_input,
+                ],
+                outputs=data_output,
+            )
+        # Tab 4: comma-separated ids -> used_indicators.
+        with gr.Tab("Used Indicators"):
+            gr.Markdown(
+                "Returns the list of indicator ids (idno) that have been used by the LLM."
+            )
+            # Rebinds the name used in the "Indicator Info" tab; that tab's
+            # click handler was already wired to the earlier textbox object.
+            indicator_ids_input = gr.Textbox(
+                label="Indicator IDs",
+                placeholder="e.g. WB_WDI_NY_GDP_MKTP_CD, WB_WDI_SP_POP_TOTL",
+                lines=1,
+            )
+            used_indicators_btn = gr.Button("Get Used Indicators")
+            used_indicators_output = gr.JSON(label="Used Indicators (list)")
+            used_indicators_btn.click(
+                fn=used_indicators,
+                inputs=indicator_ids_input,
+                outputs=used_indicators_output,
+            )
+    return demo
+# Allows running this module directly: build the interface and serve it with the MCP server enabled.
+if __name__ == "__main__":
+    demo = build_interface()
+    demo.launch(mcp_server=True)