Spaces:

FireBird-Tech
/

auto-analyst-backend

Running on CPU Upgrade

File size: 41,460 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae0e83db",
   "metadata": {},
   "outputs": [],
   "source": [
    "import dspy \n",
    "import duckdb \n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "94387056",
   "metadata": {},
   "outputs": [],
   "source": [
    "styling_instructions =  [\n",
    "    {\n",
    "        \"category\": \"line_charts\",\n",
    "        \"description\": \"Used to visualize trends and changes over time, often with multiple series.\",\n",
    "        \"styling\": {\n",
    "            \"template\": \"plotly_white\",\n",
    "            \"axes_line_width\": 0.2,\n",
    "            \"grid_width\": 1,\n",
    "            \"title\": {\n",
    "                \"bold_html\": True,\n",
    "                \"include\": True\n",
    "            },\n",
    "            \"colors\": \"use multiple colors if more than one line\",\n",
    "            \"annotations\": [\"min\", \"max\"],\n",
    "            \"number_format\": {\n",
    "                \"apply_k_m\": True,\n",
    "                \"thresholds\": {\"K\": 1000, \"M\": 100000},\n",
    "                \"percentage_decimals\": 2,\n",
    "                \"percentage_sign\": True\n",
    "            },\n",
    "            \"default_size\": {\"height\": 1200, \"width\": 1000}\n",
    "        }\n",
    "    },\n",
    "    {\n",
    "        \"category\": \"bar_charts\",\n",
    "        \"description\": \"Useful for comparing discrete categories or groups with bars representing values.\",\n",
    "        \"styling\": {\n",
    "            \"template\": \"plotly_white\",\n",
    "            \"axes_line_width\": 0.2,\n",
    "            \"grid_width\": 1,\n",
    "            \"title\": {\"bold_html\": True, \"include\": True},\n",
    "            \"annotations\": [\"bar values\"],\n",
    "            \"number_format\": {\n",
    "                \"apply_k_m\": True,\n",
    "                \"thresholds\": {\"K\": 1000, \"M\": 100000},\n",
    "                \"percentage_decimals\": 2,\n",
    "                \"percentage_sign\": True\n",
    "            },\n",
    "            \"default_size\": {\"height\": 1200, \"width\": 1000}\n",
    "        }\n",
    "    },\n",
    "    {\n",
    "        \"category\": \"histograms\",\n",
    "        \"description\": \"Display the distribution of a data set, useful for returns or frequency distributions.\",\n",
    "        \"styling\": {\n",
    "            \"template\": \"plotly_white\",\n",
    "            \"bin_size\": 50,\n",
    "            \"axes_line_width\": 0.2,\n",
    "            \"grid_width\": 1,\n",
    "            \"title\": {\"bold_html\": True, \"include\": True},\n",
    "            \"annotations\": [\"x values\"],\n",
    "            \"number_format\": {\n",
    "                \"apply_k_m\": True,\n",
    "                \"thresholds\": {\"K\": 1000, \"M\": 100000},\n",
    "                \"percentage_decimals\": 2,\n",
    "                \"percentage_sign\": True\n",
    "            },\n",
    "            \"default_size\": {\"height\": 1200, \"width\": 1000}\n",
    "        }\n",
    "    },\n",
    "    {\n",
    "        \"category\": \"pie_charts\",\n",
    "        \"description\": \"Show composition or parts of a whole with slices representing categories.\",\n",
    "        \"styling\": {\n",
    "            \"template\": \"plotly_white\",\n",
    "            \"top_categories_to_show\": 10,\n",
    "            \"bundle_rest_as\": \"Others\",\n",
    "            \"axes_line_width\": 0.2,\n",
    "            \"grid_width\": 1,\n",
    "            \"title\": {\"bold_html\": True, \"include\": True},\n",
    "            \"annotations\": [\"x values\"],\n",
    "            \"number_format\": {\n",
    "                \"apply_k_m\": True,\n",
    "                \"thresholds\": {\"K\": 1000, \"M\": 100000},\n",
    "                \"percentage_decimals\": 2,\n",
    "                \"percentage_sign\": True\n",
    "            },\n",
    "            \"default_size\": {\"height\": 1200, \"width\": 1000}\n",
    "        }\n",
    "    },\n",
    "    {\n",
    "        \"category\": \"tabular_and_generic_charts\",\n",
    "        \"description\": \"Applies to charts where number formatting needs flexibility, including mixed or raw data.\",\n",
    "        \"styling\": {\n",
    "            \"template\": \"plotly_white\",\n",
    "            \"axes_line_width\": 0.2,\n",
    "            \"grid_width\": 1,\n",
    "            \"title\": {\"bold_html\": True, \"include\": True},\n",
    "            \"annotations\": [\"x values\"],\n",
    "            \"number_format\": {\n",
    "                \"apply_k_m\": True,\n",
    "                \"thresholds\": {\"K\": 1000, \"M\": 100000},\n",
    "                \"exclude_if_commas_present\": True,\n",
    "                \"exclude_if_not_numeric\": True,\n",
    "                \"percentage_decimals\": 2,\n",
    "                \"percentage_sign\": True\n",
    "            },\n",
    "            \"default_size\": {\"height\": 1200, \"width\": 1000}\n",
    "        }\n",
    "    },\n",
    "    {\n",
    "        \"category\": \"heat_maps\",\n",
    "        \"description\": \"Show data density or intensity using color scales on a matrix or grid.\",\n",
    "        \"styling\": {\n",
    "            \"template\": \"plotly_white\",\n",
    "            \"axes_styles\": {\n",
    "                \"line_color\": \"black\",\n",
    "                \"line_width\": 0.2,\n",
    "                \"grid_width\": 1,\n",
    "                \"format_numbers_as_k_m\": True,\n",
    "                \"exclude_non_numeric_formatting\": True\n",
    "            },\n",
    "            \"title\": {\"bold_html\": True, \"include\": True},\n",
    "            \"default_size\": {\"height\": 1200, \"width\": 1000}\n",
    "        }\n",
    "    },\n",
    "    {\n",
    "        \"category\": \"histogram_distribution\",\n",
    "        \"description\": \"Specialized histogram for return distributions with opacity control.\",\n",
    "        \"styling\": {\n",
    "            \"template\": \"plotly_white\",\n",
    "            \"opacity\": 0.75,\n",
    "            \"axes_styles\": {\n",
    "                \"grid_width\": 1,\n",
    "                \"format_numbers_as_k_m\": True,\n",
    "                \"exclude_non_numeric_formatting\": True\n",
    "            },\n",
    "            \"title\": {\"bold_html\": True, \"include\": True},\n",
    "            \"default_size\": {\"height\": 1200, \"width\": 1000}\n",
    "        }\n",
    "    }\n",
    "]\n",
    "\n",
    "# Convert to list of JSON strings\n",
    "styling_instructions = [str(chart_dict) for chart_dict in styling_instructions]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0f15d1ca",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[\"{'category': 'line_charts', 'description': 'Used to visualize trends and changes over time, often with multiple series.', 'styling': {'template': 'plotly_white', 'axes_line_width': 0.2, 'grid_width': 1, 'title': {'bold_html': True, 'include': True}, 'colors': 'use multiple colors if more than one line', 'annotations': ['min', 'max'], 'number_format': {'apply_k_m': True, 'thresholds': {'K': 1000, 'M': 100000}, 'percentage_decimals': 2, 'percentage_sign': True}, 'default_size': {'height': 1200, 'width': 1000}}}\",\n",
       " \"{'category': 'bar_charts', 'description': 'Useful for comparing discrete categories or groups with bars representing values.', 'styling': {'template': 'plotly_white', 'axes_line_width': 0.2, 'grid_width': 1, 'title': {'bold_html': True, 'include': True}, 'annotations': ['bar values'], 'number_format': {'apply_k_m': True, 'thresholds': {'K': 1000, 'M': 100000}, 'percentage_decimals': 2, 'percentage_sign': True}, 'default_size': {'height': 1200, 'width': 1000}}}\",\n",
       " \"{'category': 'histograms', 'description': 'Display the distribution of a data set, useful for returns or frequency distributions.', 'styling': {'template': 'plotly_white', 'bin_size': 50, 'axes_line_width': 0.2, 'grid_width': 1, 'title': {'bold_html': True, 'include': True}, 'annotations': ['x values'], 'number_format': {'apply_k_m': True, 'thresholds': {'K': 1000, 'M': 100000}, 'percentage_decimals': 2, 'percentage_sign': True}, 'default_size': {'height': 1200, 'width': 1000}}}\",\n",
       " \"{'category': 'pie_charts', 'description': 'Show composition or parts of a whole with slices representing categories.', 'styling': {'template': 'plotly_white', 'top_categories_to_show': 10, 'bundle_rest_as': 'Others', 'axes_line_width': 0.2, 'grid_width': 1, 'title': {'bold_html': True, 'include': True}, 'annotations': ['x values'], 'number_format': {'apply_k_m': True, 'thresholds': {'K': 1000, 'M': 100000}, 'percentage_decimals': 2, 'percentage_sign': True}, 'default_size': {'height': 1200, 'width': 1000}}}\",\n",
       " \"{'category': 'tabular_and_generic_charts', 'description': 'Applies to charts where number formatting needs flexibility, including mixed or raw data.', 'styling': {'template': 'plotly_white', 'axes_line_width': 0.2, 'grid_width': 1, 'title': {'bold_html': True, 'include': True}, 'annotations': ['x values'], 'number_format': {'apply_k_m': True, 'thresholds': {'K': 1000, 'M': 100000}, 'exclude_if_commas_present': True, 'exclude_if_not_numeric': True, 'percentage_decimals': 2, 'percentage_sign': True}, 'default_size': {'height': 1200, 'width': 1000}}}\",\n",
       " \"{'category': 'heat_maps', 'description': 'Show data density or intensity using color scales on a matrix or grid.', 'styling': {'template': 'plotly_white', 'axes_styles': {'line_color': 'black', 'line_width': 0.2, 'grid_width': 1, 'format_numbers_as_k_m': True, 'exclude_non_numeric_formatting': True}, 'title': {'bold_html': True, 'include': True}, 'default_size': {'height': 1200, 'width': 1000}}}\",\n",
       " \"{'category': 'histogram_distribution', 'description': 'Specialized histogram for return distributions with opacity control.', 'styling': {'template': 'plotly_white', 'opacity': 0.75, 'axes_styles': {'grid_width': 1, 'format_numbers_as_k_m': True, 'exclude_non_numeric_formatting': True}, 'title': {'bold_html': True, 'include': True}, 'default_size': {'height': 1200, 'width': 1000}}}\"]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "styling_instructions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "b39cdaf9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<dspy.clients.lm.LM object at 0x000001CF7397DBD0>\n"
     ]
    }
   ],
   "source": [
    "dspy.configure(lm= dspy.LM('openai/gpt-4o-mini', max_tokens =800, api_key=os.getenv('OPENAI_API_KEY')))\n",
    "\n",
    "\n",
    "print(dspy.settings.lm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bcef79e3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tables in DuckDB:\n",
      "- Month1\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'|    | Month Totals   |   Xperra |   SSC  |   Michael |   Zac |   Total |   Unnamed: 6 |   Unnamed: 7 |\\n|---:|:---------------|---------:|-------:|----------:|------:|--------:|-------------:|-------------:|\\n|  0 | Week1          |      900 |      0 |         0 |     0 |     900 |          nan |          nan |\\n|  1 | Week2          |      900 |      0 |         0 |     0 |     900 |          nan |          nan |\\n|  2 | Week3          |     1200 |      0 |         0 |     0 |    1200 |          nan |          nan |'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "- Week1\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'|    | Week/Start Min   | Unnamed: 1   | Start    | 2025-02-24 00:00:00   | End   | 2025-03-03 00:00:00   | Unnamed: 6   | Unnamed: 7   | Unnamed: 8   | Unnamed: 9   | Unnamed: 10   |\\n|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\\n|  0 | Start            | End          | MON      | TUE                   | WED   | THU                   | FRI          | SAT          | SUN          |              |               |\\n|  1 | 00:00:00         | 00:30:00     |          |                       |       |                       |              |              |              |              |               |\\n|  2 | 00:30:00         | 01:00:00     |          |                       |       |                       |              |              |              |              |               |'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "- Week2\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'|    | Week/Start Min   | Unnamed: 1   | Start    | 2025-03-03 00:00:00   | End   | 2025-03-10 00:00:00   | Unnamed: 6   | Unnamed: 7   | Unnamed: 8   | Unnamed: 9   | Unnamed: 10   |\\n|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\\n|  0 | Start            | End          | MON      | TUE                   | WED   | THU                   | FRI          | SAT          | SUN          |              |               |\\n|  1 | 00:00:00         | 00:30:00     |          |                       |       |                       |              |              |              |              |               |\\n|  2 | 00:30:00         | 01:00:00     |          |                       |       |                       |              |              |              |              |               |'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "- Week3\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'|    | Week/Start Min   | Unnamed: 1   | Start    | 2025-03-10 00:00:00   | End   | 2025-03-17 00:00:00   | Unnamed: 6   | Unnamed: 7   | Unnamed: 8   | Unnamed: 9   | Unnamed: 10   |\\n|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\\n|  0 | Start            | End          | MON      | TUE                   | WED   | THU                   | FRI          | SAT          | SUN          |              |               |\\n|  1 | 00:00:00         | 00:30:00     |          |                       |       |                       |              |              |              |              |               |\\n|  2 | 00:30:00         | 01:00:00     |          |                       |       |                       |              |              |              |              |               |'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "- Week4\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'|    | Week/Start Min   | Unnamed: 1   | Start    | 2025-03-17 00:00:00   | End   | 2025-03-24 00:00:00   | Unnamed: 6   | Unnamed: 7   | Unnamed: 8   | Unnamed: 9   | Unnamed: 10   |\\n|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\\n|  0 | Start            | End          | MON      | TUE                   | WED   | THU                   | FRI          | SAT          | SUN          |              |               |\\n|  1 | 00:00:00         | 00:30:00     |          |                       |       |                       |              |              |              |              |               |\\n|  2 | 00:30:00         | 01:00:00     |          |                       |       |                       |              |              |              |              |               |'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "('Week4',)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "excel_path = 'My Timesheet 24th Feb.xlsx'\n",
    "\n",
    "sheet_names = pd.ExcelFile(excel_path).sheet_names\n",
    "\n",
    "conn = duckdb.connect()\n",
    "\n",
    "\n",
    "# Create tables for each sheet\n",
    "for sheet in sheet_names:\n",
    "    df = pd.read_excel(excel_path, sheet_name=sheet)\n",
    "    # Register each DataFrame as a table named after the sheet\n",
    "    conn.register(sheet, df)\n",
    "\n",
    "\n",
    "# Show all tables in DuckDB\n",
    "tables = conn.execute(\"SHOW TABLES\").fetchall()\n",
    "print(\"Tables in DuckDB:\")\n",
    "for table in tables:\n",
    "    # Get the first few rows of each table to show structure\n",
    "    try:\n",
    "        head_data = conn.execute(f\"SELECT * FROM {table[0]} LIMIT 3\").df().to_markdown()\n",
    "        display(head_data)\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"  Error fetching head of {table[0]}: {e}\")\n",
    "    print()  # Add blank line for readability\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# Read Excel file using DuckDB\n",
    "# excel = duckdb.sql(\"SELECT * FROM read_excel('My Timesheet 24th Feb.xlsx')\")\n",
    "\n",
    "\n",
    "\n",
    "# help(excel)\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# Preprocessing steps\n",
    "# 1. Drop empty rows and columns\n",
    "# excel_df.dropna(how='all', inplace=True)  # Remove empty rows\n",
    "# excel_df.dropna(how='all', axis=1, inplace=True)  # Remove empty columns\n",
    "\n",
    "# # 2. Clean column names\n",
    "# excel_df.columns = excel_df.columns.str.strip()  # Remove extra spaces\n",
    "\n",
    "# # 3. Convert Excel data to CSV with UTF-8-sig encoding\n",
    "# csv_buffer = io.StringIO()\n",
    "# excel_df.to_csv(csv_buffer, index=False, encoding='utf-8-sig')\n",
    "# csv_buffer.seek(0)\n",
    "\n",
    "# # Read the processed CSV back into a dataframe\n",
    "# new_df = pd.read_csv(csv_buffer)\n",
    "\n",
    "\n",
    "# excel\n",
    "\n",
    "table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "abf0addd",
   "metadata": {},
   "outputs": [],
   "source": [
    "class data_context_gen(dspy.Signature):\n",
    "    \"\"\"\n",
    "    Generate a compact JSON data context for DuckDB tables ingested from Excel or CSV files.\n",
    "    The JSON must include:\n",
    "    - Exact DuckDB table names\n",
    "    - Source sheet or file name for each table\n",
    "    - Table role (fact/dimension)\n",
    "    - Primary key (pk)\n",
    "    - Columns with type and role (pk, fk, attr, cat, measure, temporal)\n",
    "    - Relationships between tables (foreign keys), with cardinality types (1:1, 1:M, M:1, M:M)\n",
    "    - Business purpose of each table\n",
    "    - Metrics expressed as formulas\n",
    "    - Use cases for the dataset\n",
    "\n",
    "    Example JSON format:\n",
    "    {\n",
    "      \"tables\": {\n",
    "        \"customer_master\": {\n",
    "          \"source\": \"Customer_Master sheet\",\n",
    "          \"role\": \"dimension\",\n",
    "          \"pk\": \"customer_id\",\n",
    "          \"columns\": {\n",
    "            \"customer_id\": {\"type\": \"string\", \"role\": \"pk\"},\n",
    "            \"name\": {\"type\": \"string\", \"role\": \"attr\"},\n",
    "            \"region\": {\"type\": \"string\", \"role\": \"cat\"},\n",
    "            \"signup_date\": {\"type\": \"date\", \"role\": \"temporal\"}\n",
    "          },\n",
    "          \"purpose\": \"Customer attributes for segmentation\"\n",
    "        },\n",
    "        \"sales_data\": {\n",
    "          \"source\": \"Sales_Data sheet\",\n",
    "          \"role\": \"fact\",\n",
    "          \"pk\": \"order_id\",\n",
    "          \"columns\": {\n",
    "            \"order_id\": {\"type\": \"string\", \"role\": \"pk\"},\n",
    "            \"customer_id\": {\"type\": \"string\", \"role\": \"fk\"},\n",
    "            \"product_id\": {\"type\": \"string\", \"role\": \"fk\"},\n",
    "            \"order_date\": {\"type\": \"date\", \"role\": \"temporal\"},\n",
    "            \"quantity\": {\"type\": \"int\", \"role\": \"measure\"},\n",
    "            \"unit_price\": {\"type\": \"decimal\", \"role\": \"measure\"}\n",
    "          },\n",
    "          \"purpose\": \"Transaction records for revenue analysis\"\n",
    "        },\n",
    "        \"product_catalog\": {\n",
    "          \"source\": \"Product_Catalog sheet\",\n",
    "          \"role\": \"dimension\",\n",
    "          \"pk\": \"product_id\",\n",
    "          \"columns\": {\n",
    "            \"product_id\": {\"type\": \"string\", \"role\": \"pk\"},\n",
    "            \"product_name\": {\"type\": \"string\", \"role\": \"attr\"},\n",
    "            \"category\": {\"type\": \"string\", \"role\": \"cat\"},\n",
    "            \"subcategory\": {\"type\": \"string\", \"role\": \"cat\"},\n",
    "            \"brand\": {\"type\": \"string\", \"role\": \"cat\"}\n",
    "          },\n",
    "          \"purpose\": \"Product hierarchy for analysis\"\n",
    "        }\n",
    "      },\n",
    "      \"relationships\": [\n",
    "        {\"from\": \"sales_data.customer_id\", \"to\": \"customer_master.customer_id\", \"type\": \"M:1\"},\n",
    "        {\"from\": \"sales_data.product_id\", \"to\": \"product_catalog.product_id\", \"type\": \"M:1\"}\n",
    "      ],\n",
    "      \"metrics\": [\n",
    "        \"revenue = quantity * unit_price\",\n",
    "        \"customer_lifetime_value\"\n",
    "      ],\n",
    "      \"use_cases\": [\n",
    "        \"cohort analysis\",\n",
    "        \"product performance\",\n",
    "        \"regional sales\"\n",
    "      ]\n",
    "    }\n",
    "\n",
    "    Column roles: pk (primary key), fk (foreign key), attr (attribute), cat (categorical), measure (numerical), temporal (date/time)\n",
    "    Table roles: fact (transactional), dimension (reference data)\n",
    "    Relationship types: 1:1, 1:M, M:1, M:M\n",
    "    \"\"\"\n",
    "    user_description = dspy.InputField(desc=\"User's description of the data, including relationships\")\n",
    "    dataset_view = dspy.InputField(desc=\"Dataset name with sample head(5 rows) view\")\n",
    "    data_context = dspy.OutputField(desc=\"Compact JSON describing DuckDB tables, columns, relationships, metrics and use cases\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "59699a12",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Prediction(\n",
       "    data_context='{\\n  \"tables\": {\\n    \"Month1\": {\\n      \"source\": \"Month1 sheet\",\\n      \"role\": \"fact\",\\n      \"pk\": \"week\",\\n      \"columns\": {\\n        \"week\": {\"type\": \"string\", \"role\": \"pk\"},\\n        \"month_totals\": {\"type\": \"int\", \"role\": \"measure\"},\\n        \"xperra\": {\"type\": \"int\", \"role\": \"measure\"},\\n        \"ssc\": {\"type\": \"int\", \"role\": \"measure\"},\\n        \"michael\": {\"type\": \"int\", \"role\": \"measure\"},\\n        \"zac\": {\"type\": \"int\", \"role\": \"measure\"},\\n        \"total\": {\"type\": \"int\", \"role\": \"measure\"}\\n      },\\n      \"purpose\": \"Monthly totals for performance tracking\"\\n    },\\n    \"Week1\": {\\n      \"source\": \"Week1 sheet\",\\n      \"role\": \"dimension\",\\n      \"pk\": \"week_start\",\\n      \"columns\": {\\n        \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\\n        \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\\n        \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\\n      },\\n      \"purpose\": \"Details of the first week for time analysis\"\\n    },\\n    \"Week2\": {\\n      \"source\": \"Week2 sheet\",\\n      \"role\": \"dimension\",\\n      \"pk\": \"week_start\",\\n      \"columns\": {\\n        \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\\n        \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\\n        \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\\n      },\\n      \"purpose\": \"Details of the second week for time analysis\"\\n    },\\n    \"Week3\": {\\n      \"source\": \"Week3 sheet\",\\n      \"role\": \"dimension\",\\n      \"pk\": \"week_start\",\\n      \"columns\": {\\n        \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\\n        \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\\n        \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\\n      },\\n      \"purpose\": \"Details of the third week for time analysis\"\\n    },\\n    \"Week4\": {\\n      \"source\": \"Week4 sheet\",\\n      \"role\": \"dimension\",\\n      \"pk\": \"week_start\",\\n      \"columns\": {\\n        \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\\n        \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\\n        \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\\n      },\\n      \"purpose\": \"Details of the fourth week for time analysis\"\\n    }\\n  },\\n  \"relationships\": [\\n    {\"from\": \"Month1.week\", \"to\": \"Week1.week_start\", \"type\": \"M:1\"},\\n    {\"from\": \"Month1.week\", \"to\": \"Week2.week_start\", \"type\": \"M:1\"},\\n    {\"from\": \"Month1.week\", \"to\": \"Week3.week_start\", \"type\": \"M:1\"},\\n    {\"from\": \"Month1.week\", \"to\": \"Week4.week_start\", \"type\": \"M:1\"}\\n  ],\\n  \"metrics\": [\\n    \"total_monthly_performance = SUM(month_totals)\",\\n    \"average_weekly_performance = AVG(month_totals)\"\\n  ],\\n  \"use_cases\": [\\n    \"monthly performance tracking\",\\n    \"weekly trend analysis\",\\n    \"resource allocation planning\"\\n  ]\\n}'\n",
       ")"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "user_description = \"These are my worksheets month over month\"\n",
    "\n",
    "data_context_agent = dspy.Predict(data_context_gen)\n",
    "\n",
    "tables = conn.execute(\"SHOW TABLES\").fetchall()\n",
    "\n",
    "dataset_view = \"\"\n",
    "\n",
    "for table in tables:\n",
    "    head_data = conn.execute(f\"SELECT * FROM {table[0]} LIMIT 3\").df().to_markdown()\n",
    "\n",
    "    dataset_view+=\"exact_table_name=\"+table[0]+'\\n:'+head_data+'\\n'\n",
    "\n",
    "\n",
    "response = data_context_agent(user_description=user_description, dataset_view=dataset_view)\n",
    "\n",
    "\n",
    "display(response)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "7ec51128",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tables\": {\n",
      "    \"Month1\": {\n",
      "      \"source\": \"Month1 sheet\",\n",
      "      \"role\": \"fact\",\n",
      "      \"pk\": \"week\",\n",
      "      \"columns\": {\n",
      "        \"week\": {\"type\": \"string\", \"role\": \"pk\"},\n",
      "        \"month_totals\": {\"type\": \"int\", \"role\": \"measure\"},\n",
      "        \"xperra\": {\"type\": \"int\", \"role\": \"measure\"},\n",
      "        \"ssc\": {\"type\": \"int\", \"role\": \"measure\"},\n",
      "        \"michael\": {\"type\": \"int\", \"role\": \"measure\"},\n",
      "        \"zac\": {\"type\": \"int\", \"role\": \"measure\"},\n",
      "        \"total\": {\"type\": \"int\", \"role\": \"measure\"}\n",
      "      },\n",
      "      \"purpose\": \"Monthly totals for performance tracking\"\n",
      "    },\n",
      "    \"Week1\": {\n",
      "      \"source\": \"Week1 sheet\",\n",
      "      \"role\": \"dimension\",\n",
      "      \"pk\": \"week_start\",\n",
      "      \"columns\": {\n",
      "        \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\n",
      "        \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\n",
      "        \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\n",
      "      },\n",
      "      \"purpose\": \"Details of the first week for time analysis\"\n",
      "    },\n",
      "    \"Week2\": {\n",
      "      \"source\": \"Week2 sheet\",\n",
      "      \"role\": \"dimension\",\n",
      "      \"pk\": \"week_start\",\n",
      "      \"columns\": {\n",
      "        \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\n",
      "        \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\n",
      "        \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\n",
      "      },\n",
      "      \"purpose\": \"Details of the second week for time analysis\"\n",
      "    },\n",
      "    \"Week3\": {\n",
      "      \"source\": \"Week3 sheet\",\n",
      "      \"role\": \"dimension\",\n",
      "      \"pk\": \"week_start\",\n",
      "      \"columns\": {\n",
      "        \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\n",
      "        \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\n",
      "        \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\n",
      "      },\n",
      "      \"purpose\": \"Details of the third week for time analysis\"\n",
      "    },\n",
      "    \"Week4\": {\n",
      "      \"source\": \"Week4 sheet\",\n",
      "      \"role\": \"dimension\",\n",
      "      \"pk\": \"week_start\",\n",
      "      \"columns\": {\n",
      "        \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\n",
      "        \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\n",
      "        \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\n",
      "      },\n",
      "      \"purpose\": \"Details of the fourth week for time analysis\"\n",
      "    }\n",
      "  },\n",
      "  \"relationships\": [\n",
      "    {\"from\": \"Month1.week\", \"to\": \"Week1.week_start\", \"type\": \"M:1\"},\n",
      "    {\"from\": \"Month1.week\", \"to\": \"Week2.week_start\", \"type\": \"M:1\"},\n",
      "    {\"from\": \"Month1.week\", \"to\": \"Week3.week_start\", \"type\": \"M:1\"},\n",
      "    {\"from\": \"Month1.week\", \"to\": \"Week4.week_start\", \"type\": \"M:1\"}\n",
      "  ],\n",
      "  \"metrics\": [\n",
      "    \"total_monthly_performance = SUM(month_totals)\",\n",
      "    \"average_weekly_performance = AVG(month_totals)\"\n",
      "  ],\n",
      "  \"use_cases\": [\n",
      "    \"monthly performance tracking\",\n",
      "    \"weekly trend analysis\",\n",
      "    \"resource allocation planning\"\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(response.data_context)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "31ac856e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Month1\n",
      ":|    | Month Totals   |   Xperra |   SSC  |   Michael |   Zac |   Total |   Unnamed: 6 |   Unnamed: 7 |\n",
      "|---:|:---------------|---------:|-------:|----------:|------:|--------:|-------------:|-------------:|\n",
      "|  0 | Week1          |      900 |      0 |         0 |     0 |     900 |          nan |          nan |\n",
      "|  1 | Week2          |      900 |      0 |         0 |     0 |     900 |          nan |          nan |\n",
      "|  2 | Week3          |     1200 |      0 |         0 |     0 |    1200 |          nan |          nan |\n",
      "Week1\n",
      ":|    | Week/Start Min   | Unnamed: 1   | Start    | 2025-02-24 00:00:00   | End   | 2025-03-03 00:00:00   | Unnamed: 6   | Unnamed: 7   | Unnamed: 8   | Unnamed: 9   | Unnamed: 10   |\n",
      "|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\n",
      "|  0 | Start            | End          | MON      | TUE                   | WED   | THU                   | FRI          | SAT          | SUN          |              |               |\n",
      "|  1 | 00:00:00         | 00:30:00     |          |                       |       |                       |              |              |              |              |               |\n",
      "|  2 | 00:30:00         | 01:00:00     |          |                       |       |                       |              |              |              |              |               |\n",
      "Week2\n",
      ":|    | Week/Start Min   | Unnamed: 1   | Start    | 2025-03-03 00:00:00   | End   | 2025-03-10 00:00:00   | Unnamed: 6   | Unnamed: 7   | Unnamed: 8   | Unnamed: 9   | Unnamed: 10   |\n",
      "|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\n",
      "|  0 | Start            | End          | MON      | TUE                   | WED   | THU                   | FRI          | SAT          | SUN          |              |               |\n",
      "|  1 | 00:00:00         | 00:30:00     |          |                       |       |                       |              |              |              |              |               |\n",
      "|  2 | 00:30:00         | 01:00:00     |          |                       |       |                       |              |              |              |              |               |\n",
      "Week3\n",
      ":|    | Week/Start Min   | Unnamed: 1   | Start    | 2025-03-10 00:00:00   | End   | 2025-03-17 00:00:00   | Unnamed: 6   | Unnamed: 7   | Unnamed: 8   | Unnamed: 9   | Unnamed: 10   |\n",
      "|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\n",
      "|  0 | Start            | End          | MON      | TUE                   | WED   | THU                   | FRI          | SAT          | SUN          |              |               |\n",
      "|  1 | 00:00:00         | 00:30:00     |          |                       |       |                       |              |              |              |              |               |\n",
      "|  2 | 00:30:00         | 01:00:00     |          |                       |       |                       |              |              |              |              |               |\n",
      "Week4\n",
      ":|    | Week/Start Min   | Unnamed: 1   | Start    | 2025-03-17 00:00:00   | End   | 2025-03-24 00:00:00   | Unnamed: 6   | Unnamed: 7   | Unnamed: 8   | Unnamed: 9   | Unnamed: 10   |\n",
      "|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\n",
      "|  0 | Start            | End          | MON      | TUE                   | WED   | THU                   | FRI          | SAT          | SUN          |              |               |\n",
      "|  1 | 00:00:00         | 00:30:00     |          |                       |       |                       |              |              |              |              |               |\n",
      "|  2 | 00:30:00         | 01:00:00     |          |                       |       |                       |              |              |              |              |               |\n"
     ]
    }
   ],
   "source": [
    "print(dataset_view)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92a291cf",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "83287b47",
   "metadata": {},
   "outputs": [],
   "source": [
    "class data_maker(dspy.Signature):\n",
    "    \"\"\"\n",
    "    Generate DuckDB SQL queries to fetch data from multiple datasets. Handle joins, aggregations, and filtering across tables.\n",
    "    Use table names as they appear in dataset_descriptions. Common patterns:\n",
    "    \n",
    "    Single table: SELECT * FROM customer_master WHERE region = 'North'\n",
    "    Join tables: SELECT c.name, SUM(s.quantity * s.unit_price) as revenue \n",
    "                 FROM customer_master c JOIN sales_data s ON c.customer_id = s.customer_id\n",
    "    Multi-dataset: SELECT e.first_name, a.hours_worked FROM employee_info e \n",
    "                   JOIN attendance_log a ON e.emp_id = a.emp_id WHERE a.date = '2024-01-15'\n",
    "    Aggregation: SELECT category, COUNT(*) as products FROM product_catalog GROUP BY category\n",
    "    Time-based: SELECT DATE_TRUNC('month', order_date) as month, SUM(quantity) \n",
    "                FROM sales_data WHERE order_date >= '2024-01-01' GROUP BY month\n",
    "    \n",
    "    Always return: df = conn.execute('SQL_QUERY').df() or more\n",
    "    \"\"\"\n",
    "    user_query = dspy.InputField(desc=\"what the user is requesting\")\n",
    "    dataset_descriptions = dspy.InputField(desc=\"Dict of dataset contexts with table names, columns, and relationships\")\n",
    "    duckdb_sql = dspy.OutputField(desc=\"df = conn.execute('SQL query to fetch the right data').df()\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "b06c9724",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_maker_agent = dspy.Predict(data_maker)\n",
    "\n",
    "user_query = \"show me how much I made from Xperra\"\n",
    "\n",
    "dataset_descriptions = str(response.data_context)\n",
    "\n",
    "sql = data_maker_agent(user_query=user_query, dataset_descriptions=dataset_descriptions)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "8c6ef476",
   "metadata": {},
   "outputs": [],
   "source": [
    "exec(sql.duckdb_sql)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "4b1e7b92",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   total_xperra\n",
      "0       27240.0\n"
     ]
    }
   ],
   "source": [
    "print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "9b4b3e34",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Month1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Week1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Week2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Week3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Week4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     name\n",
       "0  Month1\n",
       "1   Week1\n",
       "2   Week2\n",
       "3   Week3\n",
       "4   Week4"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "conn.execute(\"SHOW TABLES\").df()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d532107b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}