{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "ae0e83db", "metadata": {}, "outputs": [], "source": [ "import dspy \n", "import duckdb \n", "import pandas as pd\n", "import numpy as np\n", "import os\n", "from dotenv import load_dotenv\n", "\n", "load_dotenv()\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "94387056", "metadata": {}, "outputs": [], "source": [ "styling_instructions = [\n", " {\n", " \"category\": \"line_charts\",\n", " \"description\": \"Used to visualize trends and changes over time, often with multiple series.\",\n", " \"styling\": {\n", " \"template\": \"plotly_white\",\n", " \"axes_line_width\": 0.2,\n", " \"grid_width\": 1,\n", " \"title\": {\n", " \"bold_html\": True,\n", " \"include\": True\n", " },\n", " \"colors\": \"use multiple colors if more than one line\",\n", " \"annotations\": [\"min\", \"max\"],\n", " \"number_format\": {\n", " \"apply_k_m\": True,\n", " \"thresholds\": {\"K\": 1000, \"M\": 100000},\n", " \"percentage_decimals\": 2,\n", " \"percentage_sign\": True\n", " },\n", " \"default_size\": {\"height\": 1200, \"width\": 1000}\n", " }\n", " },\n", " {\n", " \"category\": \"bar_charts\",\n", " \"description\": \"Useful for comparing discrete categories or groups with bars representing values.\",\n", " \"styling\": {\n", " \"template\": \"plotly_white\",\n", " \"axes_line_width\": 0.2,\n", " \"grid_width\": 1,\n", " \"title\": {\"bold_html\": True, \"include\": True},\n", " \"annotations\": [\"bar values\"],\n", " \"number_format\": {\n", " \"apply_k_m\": True,\n", " \"thresholds\": {\"K\": 1000, \"M\": 100000},\n", " \"percentage_decimals\": 2,\n", " \"percentage_sign\": True\n", " },\n", " \"default_size\": {\"height\": 1200, \"width\": 1000}\n", " }\n", " },\n", " {\n", " \"category\": \"histograms\",\n", " \"description\": \"Display the distribution of a data set, useful for returns or frequency distributions.\",\n", " \"styling\": {\n", " \"template\": \"plotly_white\",\n", " \"bin_size\": 50,\n", " \"axes_line_width\": 0.2,\n", " \"grid_width\": 1,\n", " \"title\": {\"bold_html\": True, \"include\": True},\n", " \"annotations\": [\"x values\"],\n", " \"number_format\": {\n", " \"apply_k_m\": True,\n", " \"thresholds\": {\"K\": 1000, \"M\": 100000},\n", " \"percentage_decimals\": 2,\n", " \"percentage_sign\": True\n", " },\n", " \"default_size\": {\"height\": 1200, \"width\": 1000}\n", " }\n", " },\n", " {\n", " \"category\": \"pie_charts\",\n", " \"description\": \"Show composition or parts of a whole with slices representing categories.\",\n", " \"styling\": {\n", " \"template\": \"plotly_white\",\n", " \"top_categories_to_show\": 10,\n", " \"bundle_rest_as\": \"Others\",\n", " \"axes_line_width\": 0.2,\n", " \"grid_width\": 1,\n", " \"title\": {\"bold_html\": True, \"include\": True},\n", " \"annotations\": [\"x values\"],\n", " \"number_format\": {\n", " \"apply_k_m\": True,\n", " \"thresholds\": {\"K\": 1000, \"M\": 100000},\n", " \"percentage_decimals\": 2,\n", " \"percentage_sign\": True\n", " },\n", " \"default_size\": {\"height\": 1200, \"width\": 1000}\n", " }\n", " },\n", " {\n", " \"category\": \"tabular_and_generic_charts\",\n", " \"description\": \"Applies to charts where number formatting needs flexibility, including mixed or raw data.\",\n", " \"styling\": {\n", " \"template\": \"plotly_white\",\n", " \"axes_line_width\": 0.2,\n", " \"grid_width\": 1,\n", " \"title\": {\"bold_html\": True, \"include\": True},\n", " \"annotations\": [\"x values\"],\n", " \"number_format\": {\n", " \"apply_k_m\": True,\n", " \"thresholds\": {\"K\": 1000, \"M\": 100000},\n", " \"exclude_if_commas_present\": True,\n", " \"exclude_if_not_numeric\": True,\n", " \"percentage_decimals\": 2,\n", " \"percentage_sign\": True\n", " },\n", " \"default_size\": {\"height\": 1200, \"width\": 1000}\n", " }\n", " },\n", " {\n", " \"category\": \"heat_maps\",\n", " \"description\": \"Show data density or intensity using color scales on a matrix or grid.\",\n", " \"styling\": {\n", " \"template\": \"plotly_white\",\n", " \"axes_styles\": {\n", " \"line_color\": \"black\",\n", " \"line_width\": 0.2,\n", " \"grid_width\": 1,\n", " \"format_numbers_as_k_m\": True,\n", " \"exclude_non_numeric_formatting\": True\n", " },\n", " \"title\": {\"bold_html\": True, \"include\": True},\n", " \"default_size\": {\"height\": 1200, \"width\": 1000}\n", " }\n", " },\n", " {\n", " \"category\": \"histogram_distribution\",\n", " \"description\": \"Specialized histogram for return distributions with opacity control.\",\n", " \"styling\": {\n", " \"template\": \"plotly_white\",\n", " \"opacity\": 0.75,\n", " \"axes_styles\": {\n", " \"grid_width\": 1,\n", " \"format_numbers_as_k_m\": True,\n", " \"exclude_non_numeric_formatting\": True\n", " },\n", " \"title\": {\"bold_html\": True, \"include\": True},\n", " \"default_size\": {\"height\": 1200, \"width\": 1000}\n", " }\n", " }\n", "]\n", "\n", "# Convert to list of JSON strings\n", "styling_instructions = [str(chart_dict) for chart_dict in styling_instructions]" ] }, { "cell_type": "code", "execution_count": 5, "id": "0f15d1ca", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[\"{'category': 'line_charts', 'description': 'Used to visualize trends and changes over time, often with multiple series.', 'styling': {'template': 'plotly_white', 'axes_line_width': 0.2, 'grid_width': 1, 'title': {'bold_html': True, 'include': True}, 'colors': 'use multiple colors if more than one line', 'annotations': ['min', 'max'], 'number_format': {'apply_k_m': True, 'thresholds': {'K': 1000, 'M': 100000}, 'percentage_decimals': 2, 'percentage_sign': True}, 'default_size': {'height': 1200, 'width': 1000}}}\",\n", " \"{'category': 'bar_charts', 'description': 'Useful for comparing discrete categories or groups with bars representing values.', 'styling': {'template': 'plotly_white', 'axes_line_width': 0.2, 'grid_width': 1, 'title': {'bold_html': True, 'include': True}, 'annotations': ['bar values'], 'number_format': {'apply_k_m': True, 'thresholds': {'K': 1000, 'M': 100000}, 'percentage_decimals': 2, 'percentage_sign': True}, 'default_size': {'height': 1200, 'width': 1000}}}\",\n", " \"{'category': 'histograms', 'description': 'Display the distribution of a data set, useful for returns or frequency distributions.', 'styling': {'template': 'plotly_white', 'bin_size': 50, 'axes_line_width': 0.2, 'grid_width': 1, 'title': {'bold_html': True, 'include': True}, 'annotations': ['x values'], 'number_format': {'apply_k_m': True, 'thresholds': {'K': 1000, 'M': 100000}, 'percentage_decimals': 2, 'percentage_sign': True}, 'default_size': {'height': 1200, 'width': 1000}}}\",\n", " \"{'category': 'pie_charts', 'description': 'Show composition or parts of a whole with slices representing categories.', 'styling': {'template': 'plotly_white', 'top_categories_to_show': 10, 'bundle_rest_as': 'Others', 'axes_line_width': 0.2, 'grid_width': 1, 'title': {'bold_html': True, 'include': True}, 'annotations': ['x values'], 'number_format': {'apply_k_m': True, 'thresholds': {'K': 1000, 'M': 100000}, 'percentage_decimals': 2, 'percentage_sign': True}, 'default_size': {'height': 1200, 'width': 1000}}}\",\n", " \"{'category': 'tabular_and_generic_charts', 'description': 'Applies to charts where number formatting needs flexibility, including mixed or raw data.', 'styling': {'template': 'plotly_white', 'axes_line_width': 0.2, 'grid_width': 1, 'title': {'bold_html': True, 'include': True}, 'annotations': ['x values'], 'number_format': {'apply_k_m': True, 'thresholds': {'K': 1000, 'M': 100000}, 'exclude_if_commas_present': True, 'exclude_if_not_numeric': True, 'percentage_decimals': 2, 'percentage_sign': True}, 'default_size': {'height': 1200, 'width': 1000}}}\",\n", " \"{'category': 'heat_maps', 'description': 'Show data density or intensity using color scales on a matrix or grid.', 'styling': {'template': 'plotly_white', 'axes_styles': {'line_color': 'black', 'line_width': 0.2, 'grid_width': 1, 'format_numbers_as_k_m': True, 'exclude_non_numeric_formatting': True}, 'title': {'bold_html': True, 'include': True}, 'default_size': {'height': 1200, 'width': 1000}}}\",\n", " \"{'category': 'histogram_distribution', 'description': 'Specialized histogram for return distributions with opacity control.', 'styling': {'template': 'plotly_white', 'opacity': 0.75, 'axes_styles': {'grid_width': 1, 'format_numbers_as_k_m': True, 'exclude_non_numeric_formatting': True}, 'title': {'bold_html': True, 'include': True}, 'default_size': {'height': 1200, 'width': 1000}}}\"]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "styling_instructions" ] }, { "cell_type": "code", "execution_count": 32, "id": "b39cdaf9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "dspy.configure(lm= dspy.LM('openai/gpt-4o-mini', max_tokens =800, api_key=os.getenv('OPENAI_API_KEY')))\n", "\n", "\n", "print(dspy.settings.lm)" ] }, { "cell_type": "code", "execution_count": null, "id": "bcef79e3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Tables in DuckDB:\n", "- Month1\n" ] }, { "data": { "text/plain": [ "'| | Month Totals | Xperra | SSC | Michael | Zac | Total | Unnamed: 6 | Unnamed: 7 |\\n|---:|:---------------|---------:|-------:|----------:|------:|--------:|-------------:|-------------:|\\n| 0 | Week1 | 900 | 0 | 0 | 0 | 900 | nan | nan |\\n| 1 | Week2 | 900 | 0 | 0 | 0 | 900 | nan | nan |\\n| 2 | Week3 | 1200 | 0 | 0 | 0 | 1200 | nan | nan |'" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "- Week1\n" ] }, { "data": { "text/plain": [ "'| | Week/Start Min | Unnamed: 1 | Start | 2025-02-24 00:00:00 | End | 2025-03-03 00:00:00 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | Unnamed: 10 |\\n|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\\n| 0 | Start | End | MON | TUE | WED | THU | FRI | SAT | SUN | | |\\n| 1 | 00:00:00 | 00:30:00 | | | | | | | | | |\\n| 2 | 00:30:00 | 01:00:00 | | | | | | | | | |'" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "- Week2\n" ] }, { "data": { "text/plain": [ "'| | Week/Start Min | Unnamed: 1 | Start | 2025-03-03 00:00:00 | End | 2025-03-10 00:00:00 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | Unnamed: 10 |\\n|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\\n| 0 | Start | End | MON | TUE | WED | THU | FRI | SAT | SUN | | |\\n| 1 | 00:00:00 | 00:30:00 | | | | | | | | | |\\n| 2 | 00:30:00 | 01:00:00 | | | | | | | | | |'" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "- Week3\n" ] }, { "data": { "text/plain": [ "'| | Week/Start Min | Unnamed: 1 | Start | 2025-03-10 00:00:00 | End | 2025-03-17 00:00:00 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | Unnamed: 10 |\\n|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\\n| 0 | Start | End | MON | TUE | WED | THU | FRI | SAT | SUN | | |\\n| 1 | 00:00:00 | 00:30:00 | | | | | | | | | |\\n| 2 | 00:30:00 | 01:00:00 | | | | | | | | | |'" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "- Week4\n" ] }, { "data": { "text/plain": [ "'| | Week/Start Min | Unnamed: 1 | Start | 2025-03-17 00:00:00 | End | 2025-03-24 00:00:00 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | Unnamed: 10 |\\n|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\\n| 0 | Start | End | MON | TUE | WED | THU | FRI | SAT | SUN | | |\\n| 1 | 00:00:00 | 00:30:00 | | | | | | | | | |\\n| 2 | 00:30:00 | 01:00:00 | | | | | | | | | |'" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/plain": [ "('Week4',)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "excel_path = 'My Timesheet 24th Feb.xlsx'\n", "\n", "sheet_names = pd.ExcelFile(excel_path).sheet_names\n", "\n", "conn = duckdb.connect()\n", "\n", "\n", "# Create tables for each sheet\n", "for sheet in sheet_names:\n", " df = pd.read_excel(excel_path, sheet_name=sheet)\n", " # Register each DataFrame as a table named after the sheet\n", " conn.register(sheet, df)\n", "\n", "\n", "# Show all tables in DuckDB\n", "tables = conn.execute(\"SHOW TABLES\").fetchall()\n", "print(\"Tables in DuckDB:\")\n", "for table in tables:\n", " # Get the first few rows of each table to show structure\n", " try:\n", " head_data = conn.execute(f\"SELECT * FROM {table[0]} LIMIT 3\").df().to_markdown()\n", " display(head_data)\n", "\n", " except Exception as e:\n", " print(f\" Error fetching head of {table[0]}: {e}\")\n", " print() # Add blank line for readability\n", "\n", "\n", "\n", "\n", "# Read Excel file using DuckDB\n", "# excel = duckdb.sql(\"SELECT * FROM read_excel('My Timesheet 24th Feb.xlsx')\")\n", "\n", "\n", "\n", "# help(excel)\n", "\n", "\n", "\n", "\n", "\n", "# Preprocessing steps\n", "# 1. Drop empty rows and columns\n", "# excel_df.dropna(how='all', inplace=True) # Remove empty rows\n", "# excel_df.dropna(how='all', axis=1, inplace=True) # Remove empty columns\n", "\n", "# # 2. Clean column names\n", "# excel_df.columns = excel_df.columns.str.strip() # Remove extra spaces\n", "\n", "# # 3. Convert Excel data to CSV with UTF-8-sig encoding\n", "# csv_buffer = io.StringIO()\n", "# excel_df.to_csv(csv_buffer, index=False, encoding='utf-8-sig')\n", "# csv_buffer.seek(0)\n", "\n", "# # Read the processed CSV back into a dataframe\n", "# new_df = pd.read_csv(csv_buffer)\n", "\n", "\n", "# excel\n", "\n", "table" ] }, { "cell_type": "code", "execution_count": 61, "id": "abf0addd", "metadata": {}, "outputs": [], "source": [ "class data_context_gen(dspy.Signature):\n", " \"\"\"\n", " Generate a compact JSON data context for DuckDB tables ingested from Excel or CSV files.\n", " The JSON must include:\n", " - Exact DuckDB table names\n", " - Source sheet or file name for each table\n", " - Table role (fact/dimension)\n", " - Primary key (pk)\n", " - Columns with type and role (pk, fk, attr, cat, measure, temporal)\n", " - Relationships between tables (foreign keys), with cardinality types (1:1, 1:M, M:1, M:M)\n", " - Business purpose of each table\n", " - Metrics expressed as formulas\n", " - Use cases for the dataset\n", "\n", " Example JSON format:\n", " {\n", " \"tables\": {\n", " \"customer_master\": {\n", " \"source\": \"Customer_Master sheet\",\n", " \"role\": \"dimension\",\n", " \"pk\": \"customer_id\",\n", " \"columns\": {\n", " \"customer_id\": {\"type\": \"string\", \"role\": \"pk\"},\n", " \"name\": {\"type\": \"string\", \"role\": \"attr\"},\n", " \"region\": {\"type\": \"string\", \"role\": \"cat\"},\n", " \"signup_date\": {\"type\": \"date\", \"role\": \"temporal\"}\n", " },\n", " \"purpose\": \"Customer attributes for segmentation\"\n", " },\n", " \"sales_data\": {\n", " \"source\": \"Sales_Data sheet\",\n", " \"role\": \"fact\",\n", " \"pk\": \"order_id\",\n", " \"columns\": {\n", " \"order_id\": {\"type\": \"string\", \"role\": \"pk\"},\n", " \"customer_id\": {\"type\": \"string\", \"role\": \"fk\"},\n", " \"product_id\": {\"type\": \"string\", \"role\": \"fk\"},\n", " \"order_date\": {\"type\": \"date\", \"role\": \"temporal\"},\n", " \"quantity\": {\"type\": \"int\", \"role\": \"measure\"},\n", " \"unit_price\": {\"type\": \"decimal\", \"role\": \"measure\"}\n", " },\n", " \"purpose\": \"Transaction records for revenue analysis\"\n", " },\n", " \"product_catalog\": {\n", " \"source\": \"Product_Catalog sheet\",\n", " \"role\": \"dimension\",\n", " \"pk\": \"product_id\",\n", " \"columns\": {\n", " \"product_id\": {\"type\": \"string\", \"role\": \"pk\"},\n", " \"product_name\": {\"type\": \"string\", \"role\": \"attr\"},\n", " \"category\": {\"type\": \"string\", \"role\": \"cat\"},\n", " \"subcategory\": {\"type\": \"string\", \"role\": \"cat\"},\n", " \"brand\": {\"type\": \"string\", \"role\": \"cat\"}\n", " },\n", " \"purpose\": \"Product hierarchy for analysis\"\n", " }\n", " },\n", " \"relationships\": [\n", " {\"from\": \"sales_data.customer_id\", \"to\": \"customer_master.customer_id\", \"type\": \"M:1\"},\n", " {\"from\": \"sales_data.product_id\", \"to\": \"product_catalog.product_id\", \"type\": \"M:1\"}\n", " ],\n", " \"metrics\": [\n", " \"revenue = quantity * unit_price\",\n", " \"customer_lifetime_value\"\n", " ],\n", " \"use_cases\": [\n", " \"cohort analysis\",\n", " \"product performance\",\n", " \"regional sales\"\n", " ]\n", " }\n", "\n", " Column roles: pk (primary key), fk (foreign key), attr (attribute), cat (categorical), measure (numerical), temporal (date/time)\n", " Table roles: fact (transactional), dimension (reference data)\n", " Relationship types: 1:1, 1:M, M:1, M:M\n", " \"\"\"\n", " user_description = dspy.InputField(desc=\"User's description of the data, including relationships\")\n", " dataset_view = dspy.InputField(desc=\"Dataset name with sample head(5 rows) view\")\n", " data_context = dspy.OutputField(desc=\"Compact JSON describing DuckDB tables, columns, relationships, metrics and use cases\")" ] }, { "cell_type": "code", "execution_count": 71, "id": "59699a12", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Prediction(\n", " data_context='{\\n \"tables\": {\\n \"Month1\": {\\n \"source\": \"Month1 sheet\",\\n \"role\": \"fact\",\\n \"pk\": \"week\",\\n \"columns\": {\\n \"week\": {\"type\": \"string\", \"role\": \"pk\"},\\n \"month_totals\": {\"type\": \"int\", \"role\": \"measure\"},\\n \"xperra\": {\"type\": \"int\", \"role\": \"measure\"},\\n \"ssc\": {\"type\": \"int\", \"role\": \"measure\"},\\n \"michael\": {\"type\": \"int\", \"role\": \"measure\"},\\n \"zac\": {\"type\": \"int\", \"role\": \"measure\"},\\n \"total\": {\"type\": \"int\", \"role\": \"measure\"}\\n },\\n \"purpose\": \"Monthly totals for performance tracking\"\\n },\\n \"Week1\": {\\n \"source\": \"Week1 sheet\",\\n \"role\": \"dimension\",\\n \"pk\": \"week_start\",\\n \"columns\": {\\n \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\\n \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\\n \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\\n },\\n \"purpose\": \"Details of the first week for time analysis\"\\n },\\n \"Week2\": {\\n \"source\": \"Week2 sheet\",\\n \"role\": \"dimension\",\\n \"pk\": \"week_start\",\\n \"columns\": {\\n \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\\n \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\\n \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\\n },\\n \"purpose\": \"Details of the second week for time analysis\"\\n },\\n \"Week3\": {\\n \"source\": \"Week3 sheet\",\\n \"role\": \"dimension\",\\n \"pk\": \"week_start\",\\n \"columns\": {\\n \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\\n \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\\n \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\\n },\\n \"purpose\": \"Details of the third week for time analysis\"\\n },\\n \"Week4\": {\\n \"source\": \"Week4 sheet\",\\n \"role\": \"dimension\",\\n \"pk\": \"week_start\",\\n \"columns\": {\\n \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\\n \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\\n \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\\n },\\n \"purpose\": \"Details of the fourth week for time analysis\"\\n }\\n },\\n \"relationships\": [\\n {\"from\": \"Month1.week\", \"to\": \"Week1.week_start\", \"type\": \"M:1\"},\\n {\"from\": \"Month1.week\", \"to\": \"Week2.week_start\", \"type\": \"M:1\"},\\n {\"from\": \"Month1.week\", \"to\": \"Week3.week_start\", \"type\": \"M:1\"},\\n {\"from\": \"Month1.week\", \"to\": \"Week4.week_start\", \"type\": \"M:1\"}\\n ],\\n \"metrics\": [\\n \"total_monthly_performance = SUM(month_totals)\",\\n \"average_weekly_performance = AVG(month_totals)\"\\n ],\\n \"use_cases\": [\\n \"monthly performance tracking\",\\n \"weekly trend analysis\",\\n \"resource allocation planning\"\\n ]\\n}'\n", ")" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "user_description = \"These are my worksheets month over month\"\n", "\n", "data_context_agent = dspy.Predict(data_context_gen)\n", "\n", "tables = conn.execute(\"SHOW TABLES\").fetchall()\n", "\n", "dataset_view = \"\"\n", "\n", "for table in tables:\n", " head_data = conn.execute(f\"SELECT * FROM {table[0]} LIMIT 3\").df().to_markdown()\n", "\n", " dataset_view+=\"exact_table_name=\"+table[0]+'\\n:'+head_data+'\\n'\n", "\n", "\n", "response = data_context_agent(user_description=user_description, dataset_view=dataset_view)\n", "\n", "\n", "display(response)\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "7ec51128", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tables\": {\n", " \"Month1\": {\n", " \"source\": \"Month1 sheet\",\n", " \"role\": \"fact\",\n", " \"pk\": \"week\",\n", " \"columns\": {\n", " \"week\": {\"type\": \"string\", \"role\": \"pk\"},\n", " \"month_totals\": {\"type\": \"int\", \"role\": \"measure\"},\n", " \"xperra\": {\"type\": \"int\", \"role\": \"measure\"},\n", " \"ssc\": {\"type\": \"int\", \"role\": \"measure\"},\n", " \"michael\": {\"type\": \"int\", \"role\": \"measure\"},\n", " \"zac\": {\"type\": \"int\", \"role\": \"measure\"},\n", " \"total\": {\"type\": \"int\", \"role\": \"measure\"}\n", " },\n", " \"purpose\": \"Monthly totals for performance tracking\"\n", " },\n", " \"Week1\": {\n", " \"source\": \"Week1 sheet\",\n", " \"role\": \"dimension\",\n", " \"pk\": \"week_start\",\n", " \"columns\": {\n", " \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\n", " \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\n", " \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\n", " },\n", " \"purpose\": \"Details of the first week for time analysis\"\n", " },\n", " \"Week2\": {\n", " \"source\": \"Week2 sheet\",\n", " \"role\": \"dimension\",\n", " \"pk\": \"week_start\",\n", " \"columns\": {\n", " \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\n", " \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\n", " \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\n", " },\n", " \"purpose\": \"Details of the second week for time analysis\"\n", " },\n", " \"Week3\": {\n", " \"source\": \"Week3 sheet\",\n", " \"role\": \"dimension\",\n", " \"pk\": \"week_start\",\n", " \"columns\": {\n", " \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\n", " \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\n", " \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\n", " },\n", " \"purpose\": \"Details of the third week for time analysis\"\n", " },\n", " \"Week4\": {\n", " \"source\": \"Week4 sheet\",\n", " \"role\": \"dimension\",\n", " \"pk\": \"week_start\",\n", " \"columns\": {\n", " \"week_start\": {\"type\": \"datetime\", \"role\": \"pk\"},\n", " \"start\": {\"type\": \"time\", \"role\": \"temporal\"},\n", " \"end\": {\"type\": \"time\", \"role\": \"temporal\"}\n", " },\n", " \"purpose\": \"Details of the fourth week for time analysis\"\n", " }\n", " },\n", " \"relationships\": [\n", " {\"from\": \"Month1.week\", \"to\": \"Week1.week_start\", \"type\": \"M:1\"},\n", " {\"from\": \"Month1.week\", \"to\": \"Week2.week_start\", \"type\": \"M:1\"},\n", " {\"from\": \"Month1.week\", \"to\": \"Week3.week_start\", \"type\": \"M:1\"},\n", " {\"from\": \"Month1.week\", \"to\": \"Week4.week_start\", \"type\": \"M:1\"}\n", " ],\n", " \"metrics\": [\n", " \"total_monthly_performance = SUM(month_totals)\",\n", " \"average_weekly_performance = AVG(month_totals)\"\n", " ],\n", " \"use_cases\": [\n", " \"monthly performance tracking\",\n", " \"weekly trend analysis\",\n", " \"resource allocation planning\"\n", " ]\n", "}\n" ] } ], "source": [ "# print(response.data_context)\n", "\n", "l1 = ['muhammadalisaif2@gmail.com',\n", "faisalnazeer67@gmail.com\n", "muhammadhunainzubair@gmail.com\n", "nabeeluddin266@gmail.com\n", "Mohidyamin@gmail.com\n", "daemazeemdean@gmail.com\n", "fariakhaliq123@gmail.com\n", "ibrahimarain15@gmail.com\n", "umairahmed805805@gmail.com\n", "muhibali369@gmail.com\n", "taharizwan492@gmail.com\n", "qayyumahmed552@gmail.com\n", "abdulmusawir3545@gmail.com\n", "huzaifa.m.awan@gmail.com\n", "bilalhassan2103@gmail.com\n", "muhammadabbas485@gmail.com\n", "eros1030109@gmail.com\n", "muhammadhureran8@gmail.com\n", "abdulmoezmughl@gmail.com\n", "samahabatool7@gmail.com\n", "3meraldgg@gmail.com\n", "abdurrehman.azeem81@hotmail.com\n", "ayesh.bangash810@gmail.com\n", "'umernaeem.12513@gmail.com']" ] }, { "cell_type": "code", "execution_count": 42, "id": "31ac856e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Month1\n", ":| | Month Totals | Xperra | SSC | Michael | Zac | Total | Unnamed: 6 | Unnamed: 7 |\n", "|---:|:---------------|---------:|-------:|----------:|------:|--------:|-------------:|-------------:|\n", "| 0 | Week1 | 900 | 0 | 0 | 0 | 900 | nan | nan |\n", "| 1 | Week2 | 900 | 0 | 0 | 0 | 900 | nan | nan |\n", "| 2 | Week3 | 1200 | 0 | 0 | 0 | 1200 | nan | nan |\n", "Week1\n", ":| | Week/Start Min | Unnamed: 1 | Start | 2025-02-24 00:00:00 | End | 2025-03-03 00:00:00 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | Unnamed: 10 |\n", "|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\n", "| 0 | Start | End | MON | TUE | WED | THU | FRI | SAT | SUN | | |\n", "| 1 | 00:00:00 | 00:30:00 | | | | | | | | | |\n", "| 2 | 00:30:00 | 01:00:00 | | | | | | | | | |\n", "Week2\n", ":| | Week/Start Min | Unnamed: 1 | Start | 2025-03-03 00:00:00 | End | 2025-03-10 00:00:00 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | Unnamed: 10 |\n", "|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\n", "| 0 | Start | End | MON | TUE | WED | THU | FRI | SAT | SUN | | |\n", "| 1 | 00:00:00 | 00:30:00 | | | | | | | | | |\n", "| 2 | 00:30:00 | 01:00:00 | | | | | | | | | |\n", "Week3\n", ":| | Week/Start Min | Unnamed: 1 | Start | 2025-03-10 00:00:00 | End | 2025-03-17 00:00:00 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | Unnamed: 10 |\n", "|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\n", "| 0 | Start | End | MON | TUE | WED | THU | FRI | SAT | SUN | | |\n", "| 1 | 00:00:00 | 00:30:00 | | | | | | | | | |\n", "| 2 | 00:30:00 | 01:00:00 | | | | | | | | | |\n", "Week4\n", ":| | Week/Start Min | Unnamed: 1 | Start | 2025-03-17 00:00:00 | End | 2025-03-24 00:00:00 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | Unnamed: 10 |\n", "|---:|:-----------------|:-------------|:---------|:----------------------|:------|:----------------------|:-------------|:-------------|:-------------|:-------------|:--------------|\n", "| 0 | Start | End | MON | TUE | WED | THU | FRI | SAT | SUN | | |\n", "| 1 | 00:00:00 | 00:30:00 | | | | | | | | | |\n", "| 2 | 00:30:00 | 01:00:00 | | | | | | | | | |\n" ] } ], "source": [ "print(dataset_view)" ] }, { "cell_type": "code", "execution_count": null, "id": "92a291cf", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 67, "id": "83287b47", "metadata": {}, "outputs": [], "source": [ "class data_maker(dspy.Signature):\n", " \"\"\"\n", " Generate DuckDB SQL queries to fetch data from multiple datasets. Handle joins, aggregations, and filtering across tables.\n", " Use table names as they appear in dataset_descriptions. Common patterns:\n", " \n", " Single table: SELECT * FROM customer_master WHERE region = 'North'\n", " Join tables: SELECT c.name, SUM(s.quantity * s.unit_price) as revenue \n", " FROM customer_master c JOIN sales_data s ON c.customer_id = s.customer_id\n", " Multi-dataset: SELECT e.first_name, a.hours_worked FROM employee_info e \n", " JOIN attendance_log a ON e.emp_id = a.emp_id WHERE a.date = '2024-01-15'\n", " Aggregation: SELECT category, COUNT(*) as products FROM product_catalog GROUP BY category\n", " Time-based: SELECT DATE_TRUNC('month', order_date) as month, SUM(quantity) \n", " FROM sales_data WHERE order_date >= '2024-01-01' GROUP BY month\n", " \n", " Always return: df = conn.execute('SQL_QUERY').df() or more\n", " \"\"\"\n", " user_query = dspy.InputField(desc=\"what the user is requesting\")\n", " dataset_descriptions = dspy.InputField(desc=\"Dict of dataset contexts with table names, columns, and relationships\")\n", " duckdb_sql = dspy.OutputField(desc=\"df = conn.execute('SQL query to fetch the right data').df()\")" ] }, { "cell_type": "code", "execution_count": 68, "id": "b06c9724", "metadata": {}, "outputs": [], "source": [ "data_maker_agent = dspy.Predict(data_maker)\n", "\n", "user_query = \"show me how much I made from Xperra\"\n", "\n", "dataset_descriptions = str(response.data_context)\n", "\n", "sql = data_maker_agent(user_query=user_query, dataset_descriptions=dataset_descriptions)\n", "\n" ] }, { "cell_type": "code", "execution_count": 69, "id": "8c6ef476", "metadata": {}, "outputs": [], "source": [ "exec(sql.duckdb_sql)" ] }, { "cell_type": "code", "execution_count": 70, "id": "4b1e7b92", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " total_xperra\n", "0 27240.0\n" ] } ], "source": [ "print(df)" ] }, { "cell_type": "code", "execution_count": 60, "id": "9b4b3e34", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
name
0Month1
1Week1
2Week2
3Week3
4Week4
\n", "
" ], "text/plain": [ " name\n", "0 Month1\n", "1 Week1\n", "2 Week2\n", "3 Week3\n", "4 Week4" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "conn.execute(\"SHOW TABLES\").df()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "d532107b", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 5 }