"""
Complete Tools Registry for Groq Function Calling
Defines all available tools in Groq's function calling format.
"""
TOOLS = [
# ============================================
# BASIC TOOLS (16)
# ============================================
# Data Profiling Tools (3)
{
"type": "function",
"function": {
"name": "profile_dataset",
"description": "Get comprehensive statistics about a dataset including shape, data types, memory usage, null counts, and unique values. Use this as the first step to understand any new dataset.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Absolute or relative path to the CSV or Parquet file"
}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_data_quality_issues",
"description": "Detect data quality issues including outliers (using IQR method), duplicate rows, inconsistent formats, and data anomalies. Returns a prioritized list of issues with severity levels.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "analyze_correlations",
"description": "Compute correlation matrix and identify top correlations. If a target column is specified, shows features most correlated with the target. Useful for feature selection and understanding relationships.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"target": {
"type": "string",
"description": "Optional target column name to analyze correlations with"
}
},
"required": ["file_path"]
}
}
},
# Data Cleaning Tools (3)
{
"type": "function",
"function": {
"name": "clean_missing_values",
"description": "Handle missing values using appropriate strategies based on column type. Strategies include median/mean for numeric, mode for categorical, forward_fill for time series, or drop. In 'auto' mode, first drops columns with >threshold missing (default 40%), then imputes remaining columns. Will not impute ID columns.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"strategy": {
"oneOf": [
{
"type": "string",
"enum": ["auto"],
"description": "Use 'auto' to automatically decide strategies for all columns based on data type. First drops columns with >threshold missing, then imputes remaining columns."
},
{
"type": "object",
"description": "Dictionary mapping column names to strategies ('median', 'mean', 'mode', 'forward_fill', 'drop')",
"additionalProperties": {"type": "string"}
}
],
"description": "Either 'auto' (string) to automatically handle all missing values, or a dictionary mapping specific columns to strategies"
},
"output_path": {
"type": "string",
"description": "Path to save cleaned dataset"
},
"threshold": {
"type": "number",
"description": "For 'auto' mode: drop columns with missing percentage above this threshold (default: 0.4 = 40%). Range: 0.0 to 1.0. For example, 0.7 means drop columns with >70% missing values."
}
},
"required": ["file_path", "strategy", "output_path"]
}
}
},
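# Sketch of the 'auto' strategy described above (illustrative, assuming pandas;
# the real implementation may differ):
#
#     import pandas as pd
#
#     def clean_auto(df: pd.DataFrame, threshold: float = 0.4) -> pd.DataFrame:
#         missing = df.isna().mean()
#         df = df.drop(columns=missing[missing > threshold].index)  # drop very sparse columns
#         for col in df.columns:
#             if pd.api.types.is_numeric_dtype(df[col]):
#                 df[col] = df[col].fillna(df[col].median())        # numeric -> median
#             else:
#                 df[col] = df[col].fillna(df[col].mode().iloc[0])  # categorical -> mode
#         return df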
{
"type": "function",
"function": {
"name": "handle_outliers",
"description": "Detect and handle outliers in numeric columns using IQR method. Methods: 'clip' (cap at boundaries), 'winsorize' (cap at percentiles), or 'remove' (delete rows).",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"method": {
"type": "string",
"enum": ["clip", "winsorize", "remove"],
"description": "Method to handle outliers"
},
"columns": {
"type": "array",
"items": {"type": "string"},
"description": "List of column names to check for outliers. Use 'all' to check all numeric columns."
},
"output_path": {
"type": "string",
"description": "Path to save cleaned dataset"
}
},
"required": ["file_path", "method", "columns", "output_path"]
}
}
},
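# Sketch of the IQR clipping described above (illustrative, assuming pandas;
# the actual implementation may differ):
#
#     import pandas as pd
#
#     def clip_outliers_iqr(df: pd.DataFrame, col: str, k: float = 1.5) -> pd.DataFrame:
#         q1, q3 = df[col].quantile([0.25, 0.75])
#         iqr = q3 - q1
#         # Values outside [q1 - k*iqr, q3 + k*iqr] are capped at the boundaries.
#         return df.assign(**{col: df[col].clip(q1 - k * iqr, q3 + k * iqr)})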
{
"type": "function",
"function": {
"name": "fix_data_types",
"description": "Auto-detect and fix incorrect data types. Handles dates, booleans, categoricals, and numeric columns. Fixes common issues like 'null' strings and mixed types.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"type_mapping": {
"type": "object",
"description": "Optional dictionary mapping column names to target types ('int', 'float', 'string', 'date', 'bool', 'category'). Use 'auto' for automatic detection.",
"additionalProperties": {"type": "string"}
},
"output_path": {
"type": "string",
"description": "Path to save dataset with fixed types"
}
},
"required": ["file_path", "output_path"]
}
}
},
# Data Type Conversion Tools (2)
{
"type": "function",
"function": {
"name": "force_numeric_conversion",
"description": "CRITICAL TOOL: Force convert columns to numeric type even if detected as strings/objects. Essential for datasets with numeric columns stored as strings (with commas, spaces, currency symbols). Use this BEFORE encoding when you see 'no numeric features' errors.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"columns": {
"type": "array",
"items": {"type": "string"},
"description": "List of column names to force convert to numeric. Use ['all'] to auto-detect and convert all non-ID columns that look numeric."
},
"output_path": {
"type": "string",
"description": "Path to save dataset with converted types"
},
"errors": {
"type": "string",
"enum": ["coerce", "raise"],
"description": "How to handle conversion errors. 'coerce' makes invalid values null (recommended), 'raise' throws error."
}
},
"required": ["file_path", "columns", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "smart_type_inference",
"description": "Intelligently infer and fix data types for all columns by analyzing patterns. Goes beyond basic type detection to understand semantic meaning. Use when dataset has widespread type issues.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"output_path": {
"type": "string",
"description": "Path to save dataset with inferred types"
},
"aggressive": {
"type": "boolean",
"description": "If true, attempts aggressive conversion on ambiguous columns. Recommended for messy datasets."
}
},
"required": ["file_path", "output_path"]
}
}
},
# Feature Engineering Tools (2)
{
"type": "function",
"function": {
"name": "create_time_features",
"description": "Extract comprehensive time-based features from datetime columns including year, month, day, day_of_week, quarter, is_weekend, and cyclical encodings (sin/cos for month and hour).",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"date_col": {
"type": "string",
"description": "Name of the datetime column to extract features from"
},
"output_path": {
"type": "string",
"description": "Path to save dataset with new features"
}
},
"required": ["file_path", "date_col", "output_path"]
}
}
},
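# Sketch of the cyclical encoding mentioned above (illustrative, assuming pandas):
#
#     import numpy as np
#     import pandas as pd
#
#     def add_month_cycle(df: pd.DataFrame, date_col: str) -> pd.DataFrame:
#         month = pd.to_datetime(df[date_col]).dt.month
#         # sin/cos place December next to January instead of 11 units away.
#         df["month_sin"] = np.sin(2 * np.pi * month / 12)
#         df["month_cos"] = np.cos(2 * np.pi * month / 12)
#         return df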
{
"type": "function",
"function": {
"name": "encode_categorical",
"description": "Encode categorical variables using one-hot encoding, target encoding, or frequency encoding. Handles high-cardinality columns intelligently. Use method='auto' to automatically choose the best encoding.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"method": {
"type": "string",
"enum": ["one_hot", "target", "frequency", "auto"],
"description": "Encoding method to use. 'auto' automatically selects the best method."
},
"columns": {
"type": "array",
"items": {"type": "string"},
"description": "List of categorical columns to encode. Use ['all'] to encode all categorical columns. If not specified, defaults to all categorical columns."
},
"target_col": {
"type": "string",
"description": "Required for target encoding: name of the target column"
},
"output_path": {
"type": "string",
"description": "Path to save dataset with encoded features"
}
},
"required": ["file_path", "output_path"]
}
}
},
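# Sketch of frequency and target encoding (illustrative, assuming pandas):
#
#     import pandas as pd
#
#     def frequency_encode(df: pd.DataFrame, col: str) -> pd.Series:
#         return df[col].map(df[col].value_counts(normalize=True))
#
#     def target_encode(df: pd.DataFrame, col: str, target: str) -> pd.Series:
#         # Naive version; production code should add smoothing or out-of-fold
#         # encoding to avoid target leakage.
#         return df[col].map(df.groupby(col)[target].mean())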
# Model Training Tools (2)
{
"type": "function",
"function": {
"name": "train_baseline_models",
"description": "Train multiple baseline models (Logistic Regression, Random Forest, XGBoost) and compare their performance. Automatically detects task type (classification/regression) and returns the best model with metrics.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the prepared dataset file"
},
"target_col": {
"type": "string",
"description": "Name of the target column to predict"
},
"task_type": {
"type": "string",
"enum": ["classification", "regression", "auto"],
"description": "Type of ML task. Use 'auto' to detect automatically."
},
"test_size": {
"type": "number",
"description": "Proportion of data to use for testing (default: 0.2)"
},
"random_state": {
"type": "integer",
"description": "Random seed for reproducibility (default: 42)"
}
},
"required": ["file_path", "target_col"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_model_report",
"description": "Generate comprehensive model evaluation report including metrics, confusion matrix (for classification), feature importance, and SHAP values for top features. Saves report as JSON.",
"parameters": {
"type": "object",
"properties": {
"model_path": {
"type": "string",
"description": "Path to saved model file (.pkl or .joblib)"
},
"test_data_path": {
"type": "string",
"description": "Path to test dataset file"
},
"target_col": {
"type": "string",
"description": "Name of the target column"
},
"output_path": {
"type": "string",
"description": "Path to save the report JSON file"
}
},
"required": ["model_path", "test_data_path", "target_col", "output_path"]
}
}
},
# New Data Wrangling Tools (4)
{
"type": "function",
"function": {
"name": "get_smart_summary",
"description": "Generate an LLM-friendly smart summary of a dataset with per-column missing value percentages (sorted by severity), unique value counts, sample data, and numeric statistics. Much more detailed than profile_dataset for decision-making.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the CSV or Parquet file to summarize"
},
"n_samples": {
"type": "integer",
"description": "Number of sample rows to include in the summary (default: 5)"
}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "merge_datasets",
"description": "Merge two datasets using SQL-like join operations (inner, left, right, outer, cross). Supports joining on single or multiple columns with same or different names. Automatically handles duplicate columns with suffixes.",
"parameters": {
"type": "object",
"properties": {
"left_path": {
"type": "string",
"description": "Path to the left (first) dataset file"
},
"right_path": {
"type": "string",
"description": "Path to the right (second) dataset file"
},
"output_path": {
"type": "string",
"description": "Path to save the merged dataset"
},
"how": {
"type": "string",
"enum": ["inner", "left", "right", "outer", "cross"],
"description": "Join type: 'inner' (only matching rows), 'left' (all left + matching right), 'right' (all right + matching left), 'outer' (all rows from both), 'cross' (cartesian product)"
},
"on": {
"type": ["string", "array"],
"items": {"type": "string"},
"description": "Column name(s) to join on (must exist in both datasets). Can be a single column name or list of columns. Use this when join columns have the same name in both datasets."
},
"left_on": {
"type": ["string", "array"],
"items": {"type": "string"},
"description": "Column name(s) in left dataset to join on. Use with right_on when join columns have different names."
},
"right_on": {
"type": ["string", "array"],
"items": {"type": "string"},
"description": "Column name(s) in right dataset to join on. Use with left_on when join columns have different names."
}
},
"required": ["left_path", "right_path", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "concat_datasets",
"description": "Concatenate multiple datasets either vertically (stacking rows, useful for monthly data) or horizontally (adding columns side-by-side). Validates schema compatibility for vertical concat.",
"parameters": {
"type": "object",
"properties": {
"file_paths": {
"type": "array",
"items": {"type": "string"},
"description": "List of paths to dataset files to concatenate (minimum 2 files)"
},
"output_path": {
"type": "string",
"description": "Path to save the concatenated dataset"
},
"axis": {
"type": "string",
"enum": ["vertical", "horizontal"],
"description": "'vertical' to stack rows (union, for monthly data), 'horizontal' to add columns side-by-side (default: 'vertical')"
}
},
"required": ["file_paths", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "reshape_dataset",
"description": "Transform dataset structure using pivot (long→wide format), melt (wide→long format), or transpose (swap rows and columns) operations.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file to reshape"
},
"output_path": {
"type": "string",
"description": "Path to save the reshaped dataset"
},
"operation": {
"type": "string",
"enum": ["pivot", "melt", "transpose"],
"description": "Reshape operation: 'pivot' (long→wide, requires index/columns/values), 'melt' (wide→long, requires id_vars/value_vars), 'transpose' (swap rows/columns)"
},
"index": {
"type": "string",
"description": "Column to use as row index (for pivot operation)"
},
"columns": {
"type": "string",
"description": "Column whose values become new column names (for pivot operation)"
},
"values": {
"type": "string",
"description": "Column whose values populate the pivoted table (for pivot operation)"
},
"id_vars": {
"type": "array",
"items": {"type": "string"},
"description": "Columns to keep as identifiers (for melt operation)"
},
"value_vars": {
"type": "array",
"items": {"type": "string"},
"description": "Columns to unpivot (for melt operation). If not specified, uses all columns except id_vars."
}
},
"required": ["file_path", "output_path", "operation"]
}
}
},
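# Sketch of the pivot and melt operations (illustrative, assuming pandas):
#
#     import pandas as pd
#
#     long_df = pd.DataFrame({
#         "date": ["2024-01", "2024-01", "2024-02"],
#         "metric": ["sales", "cost", "sales"],
#         "value": [100, 60, 120],
#     })
#     wide = long_df.pivot(index="date", columns="metric", values="value")  # long -> wide
#     back = wide.reset_index().melt(id_vars="date", value_name="value")    # wide -> long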
# ============================================
# ADVANCED ANALYSIS TOOLS (5)
# ============================================
{
"type": "function",
"function": {
"name": "perform_eda_analysis",
"description": "Comprehensive Exploratory Data Analysis with visualizations, distribution analysis, and automated insights. Generates HTML report with plots.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Optional target column for supervised analysis"},
"output_dir": {"type": "string", "description": "Directory to save EDA report and plots"}
},
"required": ["file_path", "output_dir"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_model_issues",
"description": "Detect overfitting, underfitting, class imbalance, and other model performance issues. Provides diagnostic recommendations.",
"parameters": {
"type": "object",
"properties": {
"model_path": {"type": "string", "description": "Path to trained model"},
"train_data_path": {"type": "string", "description": "Path to training data"},
"test_data_path": {"type": "string", "description": "Path to test data"},
"target_col": {"type": "string", "description": "Target column name"}
},
"required": ["model_path", "train_data_path", "test_data_path", "target_col"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_anomalies",
"description": "Detect anomalies using Isolation Forest, LOF, or statistical methods. Returns anomaly scores and flags.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"method": {"type": "string", "enum": ["isolation_forest", "lof", "statistical"], "description": "Anomaly detection method"},
"contamination": {"type": "number", "description": "Expected proportion of anomalies (default: 0.1)"},
"output_path": {"type": "string", "description": "Path to save dataset with anomaly scores"}
},
"required": ["file_path", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_and_handle_multicollinearity",
"description": "Detect and handle multicollinearity using VIF (Variance Inflation Factor). Removes highly correlated features automatically.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"threshold": {"type": "number", "description": "VIF threshold (default: 10)"},
"method": {"type": "string", "enum": ["drop", "combine"], "description": "How to handle correlated features"},
"output_path": {"type": "string", "description": "Path to save cleaned dataset"}
},
"required": ["file_path", "output_path"]
}
}
},
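# Sketch of iterative VIF-based pruning (illustrative, assuming statsmodels and
# a DataFrame X of numeric features):
#
#     import pandas as pd
#     from statsmodels.stats.outliers_influence import variance_inflation_factor
#
#     def drop_high_vif(X: pd.DataFrame, threshold: float = 10.0) -> pd.DataFrame:
#         while True:
#             vifs = pd.Series(
#                 [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
#                 index=X.columns,
#             )
#             if vifs.max() <= threshold or X.shape[1] <= 1:
#                 return X
#             X = X.drop(columns=[vifs.idxmax()])  # drop the worst offender, recheck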
{
"type": "function",
"function": {
"name": "perform_statistical_tests",
"description": "Perform statistical hypothesis tests (t-test, chi-square, ANOVA) to analyze relationships between features and target.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"test_type": {"type": "string", "enum": ["auto", "ttest", "chi2", "anova"], "description": "Type of statistical test"}
},
"required": ["file_path", "target_col"]
}
}
},
# ============================================
# ADVANCED FEATURE ENGINEERING (4)
# ============================================
{
"type": "function",
"function": {
"name": "create_interaction_features",
"description": "Create polynomial, PCA, or cross-product interaction features to capture non-linear relationships.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"method": {"type": "string", "enum": ["polynomial", "pca", "cross"], "description": "Interaction method"},
"degree": {"type": "integer", "description": "Polynomial degree (default: 2)"},
"max_features": {"type": "integer", "description": "Maximum new features to create (default: 50)"},
"output_path": {"type": "string", "description": "Path to save enhanced dataset"}
},
"required": ["file_path", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "create_aggregation_features",
"description": "Create aggregation features (mean, sum, count, etc.) grouped by categorical columns. Useful for customer/transaction data.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"group_cols": {"type": "array", "items": {"type": "string"}, "description": "Columns to group by"},
"agg_cols": {"type": "array", "items": {"type": "string"}, "description": "Columns to aggregate"},
"agg_functions": {"type": "array", "items": {"type": "string"}, "description": "Aggregation functions (mean, sum, count, etc.)"},
"output_path": {"type": "string", "description": "Path to save dataset with aggregations"}
},
"required": ["file_path", "group_cols", "agg_cols", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "engineer_text_features",
"description": "Extract features from text columns: TF-IDF, word counts, sentiment, readability scores, and embeddings.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"text_col": {"type": "string", "description": "Text column name"},
"methods": {"type": "array", "items": {"type": "string"}, "description": "Feature extraction methods (tfidf, count, sentiment, readability)"},
"max_features": {"type": "integer", "description": "Max TF-IDF features (default: 100)"},
"output_path": {"type": "string", "description": "Path to save dataset with text features"}
},
"required": ["file_path", "text_col", "methods", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "auto_feature_engineering",
"description": "Use LLM (Gemini/Groq) to automatically generate creative feature engineering ideas and implement them. Works without API key if environment variables are set.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"groq_api_key": {"type": "string", "description": "Groq API key (optional - uses environment variable if not provided)"},
"max_suggestions": {"type": "integer", "description": "Maximum feature suggestions to generate (default: 10)"},
"implement_top_k": {"type": "integer", "description": "Number of top suggestions to implement (default: 5)"},
"output_path": {"type": "string", "description": "Path to save dataset with new features"}
},
"required": ["file_path", "target_col", "output_path"]
}
}
},
# ============================================
# ADVANCED PREPROCESSING (3)
# ============================================
{
"type": "function",
"function": {
"name": "handle_imbalanced_data",
"description": "Handle class imbalance using SMOTE, ADASYN, or class weights. Critical for classification tasks.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"method": {"type": "string", "enum": ["smote", "adasyn", "random_oversample", "random_undersample"], "description": "Balancing method"},
"sampling_strategy": {"type": "string", "description": "Sampling ratio (auto, minority, majority)"},
"output_path": {"type": "string", "description": "Path to save balanced dataset"}
},
"required": ["file_path", "target_col", "method", "output_path"]
}
}
},
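# Sketch of SMOTE oversampling (illustrative, assuming imbalanced-learn and a
# feature matrix X_train / label vector y_train already split for training):
#
#     from imblearn.over_sampling import SMOTE
#
#     X_resampled, y_resampled = SMOTE(sampling_strategy="auto").fit_resample(X_train, y_train)
#     # Synthetic minority rows are interpolated between real minority
#     # neighbours; resample only the training split, never the test set.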
{
"type": "function",
"function": {
"name": "perform_feature_scaling",
"description": "Scale features using StandardScaler, MinMaxScaler, or RobustScaler. Essential for distance-based algorithms.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"method": {"type": "string", "enum": ["standard", "minmax", "robust"], "description": "Scaling method"},
"columns": {"type": "array", "items": {"type": "string"}, "description": "Columns to scale (None = all numeric)"},
"output_path": {"type": "string", "description": "Path to save scaled dataset"}
},
"required": ["file_path", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "split_data_strategically",
"description": "Split data with stratification, time-based splitting, or group-based splitting for better validation.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column for stratification"},
"method": {"type": "string", "enum": ["stratified", "time_based", "group_based"], "description": "Split method"},
"test_size": {"type": "number", "description": "Test set proportion (default: 0.2)"},
"time_col": {"type": "string", "description": "Time column for time-based split"},
"group_col": {"type": "string", "description": "Group column for group-based split"}
},
"required": ["file_path", "method"]
}
}
},
# ============================================
# ADVANCED TRAINING (3)
# ============================================
{
"type": "function",
"function": {
"name": "hyperparameter_tuning",
"description": "Optimize model hyperparameters using Optuna (Bayesian optimization). Finds best model configuration automatically.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to prepared dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"model_type": {"type": "string", "enum": ["random_forest", "xgboost", "lightgbm"], "description": "Model to tune"},
"n_trials": {"type": "integer", "description": "Number of tuning trials (default: 100)"},
"task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "ML task type"},
"output_path": {"type": "string", "description": "Path to save tuned model"}
},
"required": ["file_path", "target_col", "model_type", "output_path"]
}
}
},
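# Sketch of the kind of Optuna loop this tool wraps (illustrative; the search
# space and the X/y data are assumptions):
#
#     import optuna
#     from sklearn.ensemble import RandomForestClassifier
#     from sklearn.model_selection import cross_val_score
#
#     def objective(trial: optuna.Trial) -> float:
#         model = RandomForestClassifier(
#             n_estimators=trial.suggest_int("n_estimators", 50, 500),
#             max_depth=trial.suggest_int("max_depth", 2, 32),
#         )
#         return cross_val_score(model, X, y, cv=5).mean()
#
#     study = optuna.create_study(direction="maximize")
#     study.optimize(objective, n_trials=100)
#     print(study.best_params)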
{
"type": "function",
"function": {
"name": "train_ensemble_models",
"description": "Train ensemble models using stacking, voting, or blending. Combines multiple models for better performance.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to prepared dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"ensemble_method": {"type": "string", "enum": ["stacking", "voting", "blending"], "description": "Ensemble technique"},
"base_models": {"type": "array", "items": {"type": "string"}, "description": "Base model types to ensemble"},
"task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "ML task type"},
"output_path": {"type": "string", "description": "Path to save ensemble model"}
},
"required": ["file_path", "target_col", "ensemble_method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_cross_validation",
"description": "Perform k-fold cross-validation to get robust model performance estimates. Returns mean and std of metrics.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"model_type": {"type": "string", "description": "Model type (random_forest, xgboost, logistic, ridge)"},
"n_splits": {"type": "integer", "description": "Number of CV folds/splits (default: 5)"},
"task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "ML task type"},
"cv_strategy": {"type": "string", "enum": ["kfold", "stratified", "timeseries"], "description": "Cross-validation strategy (default: kfold)"},
"save_oof": {"type": "boolean", "description": "Whether to save out-of-fold predictions (default: false)"}
},
"required": ["file_path", "target_col", "model_type"]
}
}
},
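# Sketch of stratified k-fold CV (illustrative, assuming scikit-learn and an
# already-constructed model, X, and y):
#
#     from sklearn.model_selection import StratifiedKFold, cross_val_score
#
#     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     scores = cross_val_score(model, X, y, cv=cv)
#     print(f"accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")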
# ============================================
# BUSINESS INTELLIGENCE (4)
# ============================================
{
"type": "function",
"function": {
"name": "perform_cohort_analysis",
"description": "Analyze user cohorts over time (retention, revenue, engagement). Essential for SaaS and e-commerce businesses.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to transaction/event data"},
"user_col": {"type": "string", "description": "User ID column"},
"date_col": {"type": "string", "description": "Date/timestamp column"},
"metric_col": {"type": "string", "description": "Metric to analyze (revenue, events, etc.)"},
"cohort_period": {"type": "string", "enum": ["daily", "weekly", "monthly"], "description": "Cohort grouping period"},
"output_path": {"type": "string", "description": "Path to save cohort analysis results"}
},
"required": ["file_path", "user_col", "date_col", "metric_col", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_rfm_analysis",
"description": "RFM (Recency, Frequency, Monetary) analysis for customer segmentation. Identifies best/worst customers.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to transaction data"},
"customer_col": {"type": "string", "description": "Customer ID column"},
"date_col": {"type": "string", "description": "Transaction date column"},
"amount_col": {"type": "string", "description": "Transaction amount column"},
"output_path": {"type": "string", "description": "Path to save RFM segments"}
},
"required": ["file_path", "customer_col", "date_col", "amount_col", "output_path"]
}
}
},
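# Sketch of quantile-based RFM scoring (illustrative, assuming pandas; the
# column names are placeholders):
#
#     import pandas as pd
#
#     snapshot = df["order_date"].max()
#     rfm = df.groupby("customer_id").agg(
#         recency=("order_date", lambda d: (snapshot - d.max()).days),
#         frequency=("order_id", "count"),
#         monetary=("amount", "sum"),
#     )
#     # Score each dimension 1-4 by quartile; low recency is good, so reverse it.
#     rfm["R"] = pd.qcut(rfm["recency"], 4, labels=[4, 3, 2, 1])
#     rfm["F"] = pd.qcut(rfm["frequency"].rank(method="first"), 4, labels=[1, 2, 3, 4])
#     rfm["M"] = pd.qcut(rfm["monetary"], 4, labels=[1, 2, 3, 4])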
{
"type": "function",
"function": {
"name": "detect_causal_relationships",
"description": "Detect potential causal relationships between features using Granger causality and correlation analysis.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target/effect column"},
"feature_cols": {"type": "array", "items": {"type": "string"}, "description": "Potential cause columns"},
"method": {"type": "string", "enum": ["granger", "correlation"], "description": "Causality detection method"}
},
"required": ["file_path", "target_col"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_business_insights",
"description": "Generate automated business insights using descriptive statistics, trends, and anomaly detection. Creates executive summary.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to business data"},
"metric_cols": {"type": "array", "items": {"type": "string"}, "description": "Key business metrics to analyze"},
"date_col": {"type": "string", "description": "Date column for trend analysis"},
"output_path": {"type": "string", "description": "Path to save insights report"}
},
"required": ["file_path", "metric_cols", "output_path"]
}
}
},
# ============================================
# COMPUTER VISION (3)
# ============================================
{
"type": "function",
"function": {
"name": "extract_image_features",
"description": "Extract features from images using pre-trained CNNs (ResNet, VGG). Converts images to feature vectors for ML.",
"parameters": {
"type": "object",
"properties": {
"image_dir": {"type": "string", "description": "Directory containing images"},
"model": {"type": "string", "enum": ["resnet", "vgg", "mobilenet"], "description": "Pre-trained model to use"},
"output_path": {"type": "string", "description": "Path to save feature vectors CSV"}
},
"required": ["image_dir", "model", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_image_clustering",
"description": "Cluster images based on visual similarity using K-means or DBSCAN on extracted features.",
"parameters": {
"type": "object",
"properties": {
"image_dir": {"type": "string", "description": "Directory containing images"},
"n_clusters": {"type": "integer", "description": "Number of clusters (default: auto-detect)"},
"method": {"type": "string", "enum": ["kmeans", "dbscan"], "description": "Clustering method"},
"output_path": {"type": "string", "description": "Path to save clustering results"}
},
"required": ["image_dir", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "analyze_tabular_image_hybrid",
"description": "Combine tabular data with image features for hybrid ML models. Useful for e-commerce/medical data.",
"parameters": {
"type": "object",
"properties": {
"tabular_path": {"type": "string", "description": "Path to tabular data CSV"},
"image_dir": {"type": "string", "description": "Directory with images"},
"image_id_col": {"type": "string", "description": "Column linking tabular data to images"},
"output_path": {"type": "string", "description": "Path to save combined features"}
},
"required": ["tabular_path", "image_dir", "image_id_col", "output_path"]
}
}
},
# ============================================
# NLP/TEXT ANALYTICS (4)
# ============================================
{
"type": "function",
"function": {
"name": "perform_topic_modeling",
"description": "Discover topics in text documents using LDA or NMF. Extract themes from customer reviews, articles, etc.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset with text"},
"text_col": {"type": "string", "description": "Text column name"},
"n_topics": {"type": "integer", "description": "Number of topics to extract (default: 5)"},
"method": {"type": "string", "enum": ["lda", "nmf"], "description": "Topic modeling method"},
"output_path": {"type": "string", "description": "Path to save topics and document-topic matrix"}
},
"required": ["file_path", "text_col", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_named_entity_recognition",
"description": "Extract named entities (person, organization, location) from text using NER models.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset with text"},
"text_col": {"type": "string", "description": "Text column name"},
"output_path": {"type": "string", "description": "Path to save dataset with extracted entities"}
},
"required": ["file_path", "text_col", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "analyze_sentiment_advanced",
"description": "Perform advanced sentiment analysis with aspect-based sentiment (what features customers like/dislike).",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset with text"},
"text_col": {"type": "string", "description": "Text column name"},
"aspects": {"type": "array", "items": {"type": "string"}, "description": "Aspects to analyze sentiment for (e.g., 'price', 'quality')"},
"output_path": {"type": "string", "description": "Path to save sentiment scores"}
},
"required": ["file_path", "text_col", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_text_similarity",
"description": "Calculate text similarity using cosine similarity, Jaccard, or semantic embeddings. Find duplicate/similar documents.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset with text"},
"text_col": {"type": "string", "description": "Text column name"},
"method": {"type": "string", "enum": ["cosine", "jaccard", "semantic"], "description": "Similarity method"},
"threshold": {"type": "number", "description": "Similarity threshold (0-1)"},
"output_path": {"type": "string", "description": "Path to save similarity matrix"}
},
"required": ["file_path", "text_col", "method", "output_path"]
}
}
},
# ============================================
# PRODUCTION/MLOPS (5)
# ============================================
{
"type": "function",
"function": {
"name": "monitor_model_drift",
"description": "Detect data drift and concept drift in production models. Compare training vs production data distributions.",
"parameters": {
"type": "object",
"properties": {
"train_data_path": {"type": "string", "description": "Path to original training data"},
"production_data_path": {"type": "string", "description": "Path to recent production data"},
"features": {"type": "array", "items": {"type": "string"}, "description": "Features to monitor for drift"},
"output_path": {"type": "string", "description": "Path to save drift report"}
},
"required": ["train_data_path", "production_data_path", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "explain_predictions",
"description": "Explain model predictions using SHAP or LIME. Generate feature importance explanations for individual predictions.",
"parameters": {
"type": "object",
"properties": {
"model_path": {"type": "string", "description": "Path to trained model"},
"data_path": {"type": "string", "description": "Path to data to explain"},
"method": {"type": "string", "enum": ["shap", "lime"], "description": "Explanation method"},
"n_samples": {"type": "integer", "description": "Number of samples to explain (default: 10)"},
"output_path": {"type": "string", "description": "Path to save explanations"}
},
"required": ["model_path", "data_path", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_model_card",
"description": "Generate model card documentation with model details, performance metrics, bias analysis, and usage guidelines.",
"parameters": {
"type": "object",
"properties": {
"model_path": {"type": "string", "description": "Path to trained model"},
"train_data_path": {"type": "string", "description": "Path to training data"},
"test_data_path": {"type": "string", "description": "Path to test data"},
"target_col": {"type": "string", "description": "Target column name"},
"output_path": {"type": "string", "description": "Path to save model card JSON"}
},
"required": ["model_path", "train_data_path", "test_data_path", "target_col", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_ab_test_analysis",
"description": "Analyze A/B test results with statistical significance testing. Determine if variant B is better than control A.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to A/B test data"},
"variant_col": {"type": "string", "description": "Column indicating variant (A/B)"},
"metric_col": {"type": "string", "description": "Success metric column"},
"confidence_level": {"type": "number", "description": "Confidence level for significance (default: 0.95)"}
},
"required": ["file_path", "variant_col", "metric_col"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_feature_leakage",
"description": "Detect potential feature leakage by analyzing feature importance and temporal relationships. Prevents data leakage bugs.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"date_col": {"type": "string", "description": "Optional date column for temporal analysis"}
},
"required": ["file_path", "target_col"]
}
}
},
{
"type": "function",
"function": {
"name": "monitor_drift_evidently",
"description": "Generate comprehensive data drift report using Evidently AI. Provides statistical tests per feature, data quality metrics, and interactive HTML dashboard.",
"parameters": {
"type": "object",
"properties": {
"reference_data_path": {"type": "string", "description": "Path to training/reference dataset"},
"current_data_path": {"type": "string", "description": "Path to production/current dataset"},
"output_path": {"type": "string", "description": "Path to save HTML drift report"}
},
"required": ["reference_data_path", "current_data_path"]
}
}
},
{
"type": "function",
"function": {
"name": "explain_with_dtreeviz",
"description": "Generate publication-quality decision tree visualizations using dtreeviz. Shows decision path, feature distributions at each node, and split thresholds.",
"parameters": {
"type": "object",
"properties": {
"model_path": {"type": "string", "description": "Path to trained tree-based model (.pkl)"},
"data_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"instance_index": {"type": "integer", "description": "Index of instance to trace through tree (default: 0)"},
"output_path": {"type": "string", "description": "Path to save SVG visualization"}
},
"required": ["model_path", "data_path", "target_col"]
}
}
},
# ============================================
# TIME SERIES (3)
# ============================================
{
"type": "function",
"function": {
"name": "forecast_time_series",
"description": "Forecast future values using ARIMA, Prophet, or LSTM models. Handles seasonal and trend components.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to time series data"},
"date_col": {"type": "string", "description": "Date/timestamp column"},
"value_col": {"type": "string", "description": "Value column to forecast"},
"forecast_periods": {"type": "integer", "description": "Number of periods to forecast"},
"method": {"type": "string", "enum": ["arima", "prophet", "lstm"], "description": "Forecasting method"},
"output_path": {"type": "string", "description": "Path to save forecast results"}
},
"required": ["file_path", "date_col", "value_col", "forecast_periods", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_seasonality_trends",
"description": "Detect seasonality patterns and trends in time series data using STL decomposition and statistical tests.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to time series data"},
"date_col": {"type": "string", "description": "Date/timestamp column"},
"value_col": {"type": "string", "description": "Value column to analyze"},
"period": {"type": "integer", "description": "Expected seasonal period (e.g., 12 for monthly)"}
},
"required": ["file_path", "date_col", "value_col"]
}
}
},
{
"type": "function",
"function": {
"name": "create_time_series_features",
"description": "Create comprehensive time series features: lags, rolling stats, exponential moving averages, and Fourier features.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to time series data"},
"date_col": {"type": "string", "description": "Date/timestamp column"},
"value_col": {"type": "string", "description": "Value column"},
"lags": {"type": "array", "items": {"type": "integer"}, "description": "Lag periods to create (e.g., [1, 7, 30])"},
"windows": {"type": "array", "items": {"type": "integer"}, "description": "Rolling window sizes (e.g., [7, 30])"},
"output_path": {"type": "string", "description": "Path to save dataset with time series features"}
},
"required": ["file_path", "date_col", "value_col", "output_path"]
}
}
},
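# Sketch of lag and rolling features (illustrative, assuming pandas and a
# time-sorted frame with a placeholder "value" column):
#
#     for lag in (1, 7, 30):
#         df[f"lag_{lag}"] = df["value"].shift(lag)
#     for window in (7, 30):
#         df[f"roll_mean_{window}"] = df["value"].shift(1).rolling(window).mean()
#     # shift(1) before rolling keeps the window strictly in the past, so the
#     # current observation never leaks into its own feature.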
# ============================================
# ADVANCED INSIGHTS TOOLS (6) - NEW
# ============================================
{
"type": "function",
"function": {
"name": "analyze_root_cause",
"description": "Perform root cause analysis to identify why a metric dropped or changed. Analyzes correlations, temporal patterns, and identifies top influencing factors.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Column to analyze (e.g., 'sales')"},
"time_col": {"type": "string", "description": "Optional time column for trend analysis"},
"threshold_drop": {"type": "number", "description": "Percentage drop to flag as significant (default 0.15)"}
},
"required": ["file_path", "target_col"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_trends_and_seasonality",
"description": "Detect trends and seasonal patterns in time series data using statistical methods and autocorrelation.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"value_col": {"type": "string", "description": "Column with values to analyze"},
"time_col": {"type": "string", "description": "Column with timestamps"},
"seasonal_period": {"type": "integer", "description": "Expected seasonal period (auto-detected if None)"}
},
"required": ["file_path", "value_col", "time_col"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_anomalies_advanced",
"description": "Detect anomalies with confidence scores using Isolation Forest or statistical methods.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"columns": {"type": "array", "items": {"type": "string"}, "description": "Columns to analyze (all numeric if None)"},
"contamination": {"type": "number", "description": "Expected proportion of outliers (default 0.1)"},
"method": {"type": "string", "enum": ["isolation_forest", "statistical"], "description": "Detection method"}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_hypothesis_testing",
"description": "Perform statistical hypothesis testing (t-test, ANOVA, chi-square) to compare groups.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"group_col": {"type": "string", "description": "Column defining groups"},
"value_col": {"type": "string", "description": "Column with values to compare"},
"test_type": {"type": "string", "enum": ["t-test", "anova", "chi-square", "auto"], "description": "Test type (auto-detected if 'auto')"}
},
"required": ["file_path", "group_col", "value_col"]
}
}
},
{
"type": "function",
"function": {
"name": "analyze_distribution",
"description": "Analyze distribution of a column including normality tests, skewness, and kurtosis.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"column": {"type": "string", "description": "Column to analyze"},
"tests": {"type": "array", "items": {"type": "string"}, "description": "Tests to perform (normality, skewness)"}
},
"required": ["file_path", "column"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_segment_analysis",
"description": "Perform cluster-based customer/data segmentation using K-means and profile each segment.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"n_segments": {"type": "integer", "description": "Number of segments to create (default 5)"},
"features": {"type": "array", "items": {"type": "string"}, "description": "Features for clustering (all numeric if None)"}
},
"required": ["file_path"]
}
}
},
# ============================================
# AUTOMATED PIPELINE TOOLS (2) - NEW
# ============================================
{
"type": "function",
"function": {
"name": "auto_ml_pipeline",
"description": "Fully automated ML pipeline: auto-detect types, clean missing values, handle outliers, encode categorical, engineer features, and select best features. Zero configuration required!",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to input dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "Task type (auto-detected if 'auto')"},
"output_path": {"type": "string", "description": "Where to save processed data"},
"feature_engineering_level": {"type": "string", "enum": ["basic", "intermediate", "advanced"], "description": "Feature engineering depth"}
},
"required": ["file_path", "target_col"]
}
}
},
{
"type": "function",
"function": {
"name": "auto_feature_selection",
"description": "Automatically select the best features for modeling using mutual information or F-statistics.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column"},
"task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "Task type"},
"max_features": {"type": "integer", "description": "Maximum features to keep (default 50)"},
"method": {"type": "string", "enum": ["mutual_info", "f_test", "auto"], "description": "Selection method"},
"output_path": {"type": "string", "description": "Where to save selected features"}
},
"required": ["file_path", "target_col"]
}
}
},
# ============================================
# VISUALIZATION TOOLS (3) - NEW
# ============================================
{
"type": "function",
"function": {
"name": "generate_all_plots",
"description": "Generate ALL plots for a dataset automatically: data quality, EDA, distributions, and correlations. Creates interactive HTML plots.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Optional target column"},
"output_dir": {"type": "string", "description": "Directory to save plots (default ./outputs/plots)"}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_data_quality_plots",
"description": "Generate data quality visualizations: missing values, data types, and outlier detection plots.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"output_dir": {"type": "string", "description": "Directory to save plots"}
},
"required": ["file_path", "output_dir"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_eda_plots",
"description": "Generate exploratory data analysis plots: correlation heatmap, feature relationships, and pairplots.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Optional target column"},
"output_dir": {"type": "string", "description": "Directory to save plots"}
},
"required": ["file_path", "output_dir"]
}
}
},
# ============================================
# INTERACTIVE PLOTLY VISUALIZATIONS (6)
# ============================================
{
"type": "function",
"function": {
"name": "generate_interactive_scatter",
"description": "Create interactive scatter plot with zoom, pan, and hover capabilities. Great for exploring relationships between variables.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"x_col": {"type": "string", "description": "Column for X-axis"},
"y_col": {"type": "string", "description": "Column for Y-axis"},
"color_col": {"type": "string", "description": "Optional column for color coding points"},
"size_col": {"type": "string", "description": "Optional column for bubble size"},
"output_path": {"type": "string", "description": "Path to save HTML file (default: ./outputs/plots/interactive/scatter.html)"}
},
"required": ["file_path", "x_col", "y_col"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_interactive_histogram",
"description": "Create interactive histogram with box plot overlay. Users can explore distribution interactively.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"column": {"type": "string", "description": "Column to plot distribution"},
"bins": {"type": "integer", "description": "Number of bins (default: 30)"},
"color_col": {"type": "string", "description": "Optional column for grouped histograms"},
"output_path": {"type": "string", "description": "Path to save HTML file"}
},
"required": ["file_path", "column"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_interactive_correlation_heatmap",
"description": "Create interactive correlation heatmap with hover values. Better than static matplotlib version.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"output_path": {"type": "string", "description": "Path to save HTML file"}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_interactive_box_plots",
"description": "Create interactive box plots for outlier detection. Supports grouping by categorical variable.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"columns": {"type": "array", "items": {"type": "string"}, "description": "Columns to plot (all numeric if not specified)"},
"group_by": {"type": "string", "description": "Optional categorical column for grouping"},
"output_path": {"type": "string", "description": "Path to save HTML file"}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_interactive_time_series",
"description": "Create interactive time series plot with range slider and zoom. Perfect for temporal data analysis.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"time_col": {"type": "string", "description": "Column with datetime values"},
"value_cols": {"type": "array", "items": {"type": "string"}, "description": "Columns to plot over time"},
"output_path": {"type": "string", "description": "Path to save HTML file"}
},
"required": ["file_path", "time_col", "value_cols"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_plotly_dashboard",
"description": "Generate complete interactive dashboard with multiple visualizations: correlation heatmap, box plots, scatter plots, histograms. One-stop visualization solution.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Optional target column for supervised analysis"},
"output_dir": {"type": "string", "description": "Directory to save all plots (default: ./outputs/plots/interactive)"}
},
"required": ["file_path"]
}
}
},
# EDA Report Generation (2) - NEW PHASE 2
{
"type": "function",
"function": {
"name": "generate_ydata_profiling_report",
"description": "Generate comprehensive HTML report using ydata-profiling (formerly pandas-profiling). Provides extensive analysis: overview, variable statistics, interactions, correlations (Pearson, Spearman, Cramér's V), missing values matrix, duplicate analysis, and more. Most detailed and comprehensive profiling tool with automated insights and data quality warnings.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to the dataset CSV/Parquet file"},
"output_path": {"type": "string", "description": "Where to save HTML report (default: ./outputs/reports/ydata_profile.html)"},
"minimal": {"type": "boolean", "description": "If true, generates faster minimal report (useful for large datasets, default: false)"},
"title": {"type": "string", "description": "Report title (default: 'Data Profiling Report')"}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_sweetviz_report",
"description": "Generate interactive EDA report using Sweetviz. Provides feature-by-feature analysis, target associations, and dataset comparison. Great for train vs test comparison.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to the dataset CSV/Parquet file"},
"target_col": {"type": "string", "description": "Optional target column for supervised analysis"},
"compare_file_path": {"type": "string", "description": "Optional second dataset for comparison (e.g., test set)"},
"output_path": {"type": "string", "description": "Where to save HTML report (default: ./outputs/reports/sweetviz_report.html)"}
},
"required": ["file_path"]
}
}
},
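# Data Quality Validation (2)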
{
"type": "function",
"function": {
"name": "detect_label_errors",
"description": "Detect potential label errors in classification datasets using cleanlab. Uses confident learning to find mislabeled examples by cross-validating classifiers and identifying disagreements.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to classification dataset"},
"target_col": {"type": "string", "description": "Target/label column name"},
"features": {"type": "array", "items": {"type": "string"}, "description": "Feature columns (None = all numeric)"},
"output_path": {"type": "string", "description": "Path to save flagged rows"}
},
"required": ["file_path", "target_col"]
}
}
},
{
"type": "function",
"function": {
"name": "validate_schema_pandera",
"description": "Validate a DataFrame against a pandera schema. Check column types, nullability, value ranges, and custom constraints.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset to validate"},
"schema_config": {"type": "object", "description": "Schema configuration with column definitions"}
},
"required": ["file_path", "schema_config"]
}
}
},
# ========================================
# CODE INTERPRETER - THE GAME CHANGER 🚀
# ========================================
{
"type": "function",
"function": {
"name": "execute_python_code",
"description": "⭐ CRITICAL TOOL - Execute custom Python code for ANY data science task not covered by existing tools. This is what makes you a TRUE AI AGENT, not just a function-calling bot. Use this when user requests: 1) Custom visualizations (specific Plotly plots, interactive dashboards, unique chart types) 2) Domain-specific calculations 3) Custom data transformations 4) Specific export formats 5) Interactive widgets/filters. Code has access to pandas, polars, numpy, matplotlib, seaborn, plotly. ALWAYS save outputs to files and return file paths.",
"parameters": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "Python code to execute. Auto-imported: pandas as pd, polars as pl, numpy as np, matplotlib.pyplot as plt, seaborn as sns, plotly.express as px, plotly.graph_objects as go. Code should save outputs to files in working_directory. Example: fig.write_html('./outputs/code/plot.html')"
},
"working_directory": {
"type": "string",
"description": "Directory to run code in (default: ./outputs/code). Code can read from ./temp/ and write to this directory."
},
"timeout": {
"type": "integer",
"description": "Maximum execution time in seconds (default: 60)"
}
},
"required": ["code"]
}
}
},
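# Illustrative arguments for execute_python_code: a custom Plotly chart saved
# to disk using only the auto-imported libraries documented above. The
# dataset path and columns are hypothetical:
#   {
#     "code": "df = pd.read_csv('./temp/sales.csv')\nfig = px.bar(df, x='region', y='revenue')\nfig.write_html('./outputs/code/revenue_by_region.html')\nprint('saved ./outputs/code/revenue_by_region.html')",
#     "timeout": 60
#   }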
{
"type": "function",
"function": {
"name": "execute_code_from_file",
"description": "Execute Python code from an existing .py file. Useful when code is too long to pass as string, or when running pre-written scripts. Same capabilities as execute_python_code.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to .py file to execute"
},
"working_directory": {
"type": "string",
"description": "Directory to run code in (default: ./outputs/code)"
},
"timeout": {
"type": "integer",
"description": "Maximum execution time in seconds (default: 60)"
}
},
"required": ["file_path"]
}
}
},
# ============================================
# CLOUD DATA SOURCES (4) - NEW
# ============================================
{
"type": "function",
"function": {
"name": "load_bigquery_table",
"description": "Load data from Google BigQuery table into a Polars DataFrame. Supports sampling via LIMIT and column selection. Returns CSV path for downstream tools. Use profile_bigquery_table first for large tables.",
"parameters": {
"type": "object",
"properties": {
"project_id": {
"type": "string",
"description": "Google Cloud project ID"
},
"dataset": {
"type": "string",
"description": "BigQuery dataset name"
},
"table": {
"type": "string",
"description": "BigQuery table name"
},
"limit": {
"type": "integer",
"description": "Optional row limit for sampling (e.g., 10000 for large tables)"
},
"columns": {
"type": "array",
"items": {"type": "string"},
"description": "Optional list of column names to load"
},
"where_clause": {
"type": "string",
"description": "Optional SQL WHERE clause for filtering (without WHERE keyword)"
}
},
"required": ["project_id", "dataset", "table"]
}
}
},
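# Illustrative arguments for load_bigquery_table (project, dataset, and table
# names are hypothetical). Note the where_clause omits the WHERE keyword, per
# the parameter docs:
#   {
#     "project_id": "my-gcp-project",
#     "dataset": "analytics",
#     "table": "events",
#     "limit": 10000,
#     "columns": ["user_id", "event_type", "created_at"],
#     "where_clause": "created_at >= '2024-01-01'"
#   }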
{
"type": "function",
"function": {
"name": "write_bigquery_table",
"description": "Write predictions or processed data from CSV/Parquet file to BigQuery table. Supports append, overwrite, or fail modes.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to CSV or Parquet file to write"
},
"project_id": {
"type": "string",
"description": "Google Cloud project ID"
},
"dataset": {
"type": "string",
"description": "BigQuery dataset name"
},
"table": {
"type": "string",
"description": "BigQuery table name"
},
"mode": {
"type": "string",
"enum": ["append", "overwrite", "fail"],
"description": "Write mode: append (add rows), overwrite (replace), fail (error if exists)"
}
},
"required": ["file_path", "project_id", "dataset", "table"]
}
}
},
{
"type": "function",
"function": {
"name": "profile_bigquery_table",
"description": "Profile a BigQuery table without loading all data. Returns row count, column types, null counts (sampled), table size, and load recommendations. Use this BEFORE load_bigquery_table for large tables.",
"parameters": {
"type": "object",
"properties": {
"project_id": {
"type": "string",
"description": "Google Cloud project ID"
},
"dataset": {
"type": "string",
"description": "BigQuery dataset name"
},
"table": {
"type": "string",
"description": "BigQuery table name"
}
},
"required": ["project_id", "dataset", "table"]
}
}
},
{
"type": "function",
"function": {
"name": "query_bigquery",
"description": "Execute custom BigQuery SQL query and return results as DataFrame. Useful for complex aggregations, joins, or transformations before analysis.",
"parameters": {
"type": "object",
"properties": {
"project_id": {
"type": "string",
"description": "Google Cloud project ID"
},
"query": {
"type": "string",
"description": "SQL query to execute"
},
"output_path": {
"type": "string",
"description": "Optional path to save results (default: auto-generated)"
},
"limit": {
"type": "integer",
"description": "Optional row limit to append to query"
}
},
"required": ["project_id", "query"]
}
}
},
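# Illustrative arguments for query_bigquery: aggregating inside BigQuery so
# only the summarized rows are downloaded (table and columns hypothetical):
#   {
#     "project_id": "my-gcp-project",
#     "query": "SELECT region, COUNT(*) AS orders, SUM(amount) AS revenue FROM `my-gcp-project.sales.orders` GROUP BY region",
#     "limit": 1000
#   }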
# ============================================
# AUTOGLUON TRAINING (3) - AutoML at Scale
# ============================================
{
"type": "function",
"function": {
"name": "train_with_autogluon",
"description": "Train ML models using AutoGluon AutoML. Automatically trains and ensembles 10+ models (LightGBM, XGBoost, CatBoost, RandomForest, etc.) with stacking. Handles raw data directly - no need to manually encode categoricals or impute missing values. Supports classification (binary/multiclass) and regression. Use this instead of train_baseline_models for best performance.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to CSV/Parquet dataset"
},
"target_col": {
"type": "string",
"description": "Column to predict"
},
"task_type": {
"type": "string",
"enum": ["classification", "regression", "auto"],
"description": "Type of ML task. 'auto' to detect automatically."
},
"time_limit": {
"type": "integer",
"description": "Max training time in seconds (default: 120). Higher = better models."
},
"presets": {
"type": "string",
"enum": ["medium_quality", "good_quality", "best_quality"],
"description": "Quality preset. medium_quality=fast, best_quality=slower but better."
},
"eval_metric": {
"type": "string",
"description": "Metric to optimize. Classification: 'accuracy','f1','roc_auc'. Regression: 'rmse','mae','r2'. Auto-selected if None."
},
"output_dir": {
"type": "string",
"description": "Directory to save trained model (default: ./outputs/autogluon_model)"
},
"infer_limit": {
"type": "number",
"description": "Max inference time per row in seconds. Only models meeting this speed constraint are kept. E.g. 0.01 = 10ms/row."
}
},
"required": ["file_path", "target_col"]
}
}
},
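# Illustrative arguments for train_with_autogluon: binary classification on a
# raw CSV, letting AutoGluon handle encoding and imputation as described
# above (dataset path and target column are hypothetical):
#   {
#     "file_path": "./temp/churn.csv",
#     "target_col": "churned",
#     "task_type": "auto",
#     "time_limit": 300,
#     "presets": "good_quality",
#     "eval_metric": "roc_auc"
#   }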
{
"type": "function",
"function": {
"name": "predict_with_autogluon",
"description": "Make predictions on new data using a trained AutoGluon model. Returns predictions and probability scores for classification tasks.",
"parameters": {
"type": "object",
"properties": {
"model_path": {
"type": "string",
"description": "Path to saved AutoGluon model directory"
},
"data_path": {
"type": "string",
"description": "Path to new data CSV/Parquet for prediction"
},
"output_path": {
"type": "string",
"description": "Path to save predictions CSV"
}
},
"required": ["model_path", "data_path"]
}
}
},
{
"type": "function",
"function": {
"name": "forecast_with_autogluon",
"description": "Forecast time series using AutoGluon TimeSeriesPredictor. Trains and ensembles multiple models including DeepAR, ETS, ARIMA, Theta, and Chronos. Supports covariates, holiday features, model selection, and probabilistic forecasts. Much more powerful than basic ARIMA/Prophet.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to time series CSV/Parquet"
},
"target_col": {
"type": "string",
"description": "Column with values to forecast"
},
"time_col": {
"type": "string",
"description": "Column with timestamps/dates"
},
"forecast_horizon": {
"type": "integer",
"description": "Number of future periods to predict (default: 30)"
},
"id_col": {
"type": "string",
"description": "Column identifying different series (for multi-series forecasting)"
},
"freq": {
"type": "string",
"description": "Frequency: 'D'=daily, 'h'=hourly, 'W'=weekly, 'MS'=monthly. Auto-detected if omitted."
},
"time_limit": {
"type": "integer",
"description": "Max training time in seconds (default: 120)"
},
"presets": {
"type": "string",
"enum": ["fast_training", "medium_quality", "best_quality"],
"description": "Quality preset for forecasting models"
},
"output_path": {
"type": "string",
"description": "Path to save forecast CSV"
},
"static_features_path": {
"type": "string",
"description": "CSV with per-series metadata (one row per series). Improves cross-series learning."
},
"known_covariates_cols": {
"type": "array",
"items": {"type": "string"},
"description": "Columns with future-known values (holidays, promotions, day_of_week)"
},
"holiday_country": {
"type": "string",
"description": "Country code for auto holiday features: 'US', 'UK', 'IN', 'DE', etc."
},
"fill_missing": {
"type": "boolean",
"description": "Auto-fill missing values in time series (default: true)"
},
"models": {
"type": "array",
"items": {"type": "string"},
"description": "Specific models to train: 'ETS', 'AutoARIMA', 'Theta', 'DeepAR', 'PatchTST', 'DLinear', 'TFT', 'SeasonalNaive'"
},
"quantile_levels": {
"type": "array",
"items": {"type": "number"},
"description": "Quantile levels for probabilistic forecasts. E.g. [0.1, 0.5, 0.9] for 10th/50th/90th percentile."
}
},
"required": ["file_path", "target_col", "time_col"]
}
}
},
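# Illustrative arguments for forecast_with_autogluon: a multi-series daily
# forecast with future-known covariates and probabilistic quantiles (dataset
# path and columns are hypothetical). Values for known_covariates_cols must
# also be available over the forecast horizon, otherwise they are not
# "future-known":
#   {
#     "file_path": "./temp/demand.csv",
#     "target_col": "units_sold",
#     "time_col": "date",
#     "id_col": "store_id",
#     "forecast_horizon": 30,
#     "freq": "D",
#     "known_covariates_cols": ["promotion", "day_of_week"],
#     "holiday_country": "US",
#     "quantile_levels": [0.1, 0.5, 0.9]
#   }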
# ============================================
# AUTOGLUON ADVANCED (6) - Post-Training, Analysis, Multi-Label, Backtesting
# ============================================
{
"type": "function",
"function": {
"name": "optimize_autogluon_model",
"description": "Post-training optimization on a trained AutoGluon model. Operations: refit_full (re-train on 100% data for deployment), distill (compress ensemble into single model), calibrate_threshold (optimize binary classification threshold), deploy_optimize (strip artifacts for minimal deployment), delete_models (remove specific models to free resources).",
"parameters": {
"type": "object",
"properties": {
"model_path": {
"type": "string",
"description": "Path to saved AutoGluon model directory"
},
"operation": {
"type": "string",
"enum": ["refit_full", "distill", "calibrate_threshold", "deploy_optimize", "delete_models"],
"description": "Optimization operation to perform"
},
"data_path": {
"type": "string",
"description": "Path to dataset (required for distill, calibrate_threshold)"
},
"metric": {
"type": "string",
"enum": ["f1", "balanced_accuracy", "precision", "recall"],
"description": "Metric for calibrate_threshold optimization"
},
"models_to_delete": {
"type": "array",
"items": {"type": "string"},
"description": "Model names to delete (for delete_models operation)"
},
"output_dir": {
"type": "string",
"description": "Output directory for deploy_optimize"
}
},
"required": ["model_path", "operation"]
}
}
},
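# Illustrative arguments for optimize_autogluon_model: calibrating a binary
# classifier's decision threshold for F1. Per the parameter docs,
# calibrate_threshold also requires data_path (dataset path hypothetical):
#   {
#     "model_path": "./outputs/autogluon_model",
#     "operation": "calibrate_threshold",
#     "data_path": "./temp/churn.csv",
#     "metric": "f1"
#   }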
{
"type": "function",
"function": {
"name": "analyze_autogluon_model",
"description": "Inspect and analyze a trained AutoGluon model. Operations: summary (extended leaderboard with stack levels, memory, inference speed), transform_features (get internally transformed feature matrix), info (comprehensive model metadata and training summary).",
"parameters": {
"type": "object",
"properties": {
"model_path": {
"type": "string",
"description": "Path to saved AutoGluon model directory"
},
"data_path": {
"type": "string",
"description": "Path to dataset (required for transform_features)"
},
"operation": {
"type": "string",
"enum": ["summary", "transform_features", "info"],
"description": "Analysis operation to perform"
}
},
"required": ["model_path"]
}
}
},
{
"type": "function",
"function": {
"name": "extend_autogluon_training",
"description": "Add models or re-fit ensemble on an existing AutoGluon predictor without retraining from scratch. Operations: fit_extra (train additional models/hyperparameters), fit_weighted_ensemble (re-fit ensemble weights on existing base models).",
"parameters": {
"type": "object",
"properties": {
"model_path": {
"type": "string",
"description": "Path to saved AutoGluon model directory"
},
"operation": {
"type": "string",
"enum": ["fit_extra", "fit_weighted_ensemble"],
"description": "Extension operation to perform"
},
"data_path": {
"type": "string",
"description": "Path to training data (required for fit_extra)"
},
"time_limit": {
"type": "integer",
"description": "Additional training time in seconds (default: 60)"
},
"hyperparameters": {
"type": "object",
"description": "Model hyperparameters dict. E.g. {\"GBM\": {\"num_boost_round\": 500}, \"RF\": {}}"
}
},
"required": ["model_path"]
}
}
},
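# Illustrative arguments for extend_autogluon_training: training extra
# LightGBM and RandomForest models on an existing predictor, using the
# hyperparameters format from the parameter docs (an empty dict requests a
# model with its default settings; dataset path hypothetical):
#   {
#     "model_path": "./outputs/autogluon_model",
#     "operation": "fit_extra",
#     "data_path": "./temp/churn.csv",
#     "time_limit": 120,
#     "hyperparameters": {"GBM": {"num_boost_round": 500}, "RF": {}}
#   }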
{
"type": "function",
"function": {
"name": "train_multilabel_autogluon",
"description": "Train multi-label prediction model. Predicts multiple target columns simultaneously by training separate AutoGluon TabularPredictors per label with shared feature engineering. Use when dataset has multiple columns to predict.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to CSV/Parquet dataset"
},
"target_cols": {
"type": "array",
"items": {"type": "string"},
"description": "List of columns to predict (e.g. ['label1', 'label2'])"
},
"time_limit": {
"type": "integer",
"description": "Max training time per label in seconds (default: 120)"
},
"presets": {
"type": "string",
"enum": ["medium_quality", "good_quality", "best_quality"],
"description": "Quality preset"
},
"output_dir": {
"type": "string",
"description": "Where to save trained model"
}
},
"required": ["file_path", "target_cols"]
}
}
},
{
"type": "function",
"function": {
"name": "backtest_timeseries",
"description": "Backtest time series models using multiple validation windows. More robust performance estimation than single train/test split. Trains models with multi-window cross-validation and returns per-window evaluation.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to time series CSV/Parquet"
},
"target_col": {
"type": "string",
"description": "Column with values to forecast"
},
"time_col": {
"type": "string",
"description": "Column with timestamps/dates"
},
"forecast_horizon": {
"type": "integer",
"description": "Periods to predict per window (default: 30)"
},
"id_col": {
"type": "string",
"description": "Column identifying different series"
},
"freq": {
"type": "string",
"description": "Frequency string"
},
"num_val_windows": {
"type": "integer",
"description": "Number of backtesting windows (default: 3)"
},
"time_limit": {
"type": "integer",
"description": "Max training time in seconds"
},
"presets": {
"type": "string",
"enum": ["fast_training", "medium_quality", "best_quality"],
"description": "Quality preset"
},
"output_path": {
"type": "string",
"description": "Path to save backtest predictions CSV"
}
},
"required": ["file_path", "target_col", "time_col"]
}
}
},
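# Illustrative arguments for backtest_timeseries. With num_val_windows=3 and
# forecast_horizon=30, models are scored on three rolling 30-step windows at
# the end of the series rather than a single holdout (dataset path and
# columns hypothetical):
#   {
#     "file_path": "./temp/demand.csv",
#     "target_col": "units_sold",
#     "time_col": "date",
#     "forecast_horizon": 30,
#     "num_val_windows": 3
#   }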
{
"type": "function",
"function": {
"name": "analyze_timeseries_model",
"description": "Analyze a trained AutoGluon time series model. Operations: feature_importance (permutation importance of covariates), plot (forecast vs actuals visualization), make_future_dataframe (generate future timestamp skeleton for prediction with covariates).",
"parameters": {
"type": "object",
"properties": {
"model_path": {
"type": "string",
"description": "Path to saved AutoGluon TimeSeriesPredictor"
},
"data_path": {
"type": "string",
"description": "Path to time series data"
},
"time_col": {
"type": "string",
"description": "Column with timestamps/dates"
},
"id_col": {
"type": "string",
"description": "Column identifying different series"
},
"operation": {
"type": "string",
"enum": ["feature_importance", "plot", "make_future_dataframe"],
"description": "Analysis operation to perform"
},
"output_path": {
"type": "string",
"description": "Path to save output (plot image or CSV)"
}
},
"required": ["model_path", "data_path", "time_col"]
}
}
}
]
def get_tool_by_name(tool_name: str) -> dict:
    """Get tool definition by name."""
    for tool in TOOLS:
        if tool["function"]["name"] == tool_name:
            return tool
    raise ValueError(f"Tool '{tool_name}' not found in registry")


def get_all_tool_names() -> list[str]:
    """Get list of all tool names."""
    return [tool["function"]["name"] for tool in TOOLS]

def get_tools_by_category() -> dict[str, list[str]]:
    """Get tools organized by category.

    The early categories use fixed positional slices and must be kept in sync
    whenever tools are inserted into TOOLS. The final three categories are
    sliced from the end of the list, which stays correct as long as the
    code-interpreter, cloud, and AutoGluon tools remain the last 15 entries.
    """
    return {
        "basic": [t["function"]["name"] for t in TOOLS[:16]],
        "advanced_analysis": [t["function"]["name"] for t in TOOLS[16:21]],
        "advanced_feature_engineering": [t["function"]["name"] for t in TOOLS[21:25]],
        "advanced_preprocessing": [t["function"]["name"] for t in TOOLS[25:28]],
        "advanced_training": [t["function"]["name"] for t in TOOLS[28:31]],
        "business_intelligence": [t["function"]["name"] for t in TOOLS[31:35]],
        "computer_vision": [t["function"]["name"] for t in TOOLS[35:38]],
        "nlp_text_analytics": [t["function"]["name"] for t in TOOLS[38:42]],
        "production_mlops": [t["function"]["name"] for t in TOOLS[42:47]],
        "time_series": [t["function"]["name"] for t in TOOLS[47:50]],
        # The fixed slice previously used for cloud tools (TOOLS[50:54]) went
        # stale when the code-interpreter and AutoGluon tools were added, so
        # these categories are anchored to the end of the list instead.
        "code_interpreter": [t["function"]["name"] for t in TOOLS[-15:-13]],
        "cloud_data_sources": [t["function"]["name"] for t in TOOLS[-13:-9]],
        "autogluon": [t["function"]["name"] for t in TOOLS[-9:]],
    }
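

if __name__ == "__main__":
    # Minimal self-check (added for convenience): running this module directly
    # validates that tool names are unique, that lookup by name round-trips,
    # and prints per-category tool counts.
    names = get_all_tool_names()
    assert len(names) == len(set(names)), "duplicate tool names in registry"
    for name in names:
        assert get_tool_by_name(name)["function"]["name"] == name
    print(f"{len(names)} tools registered")
    for category, tools in get_tools_by_category().items():
        print(f"  {category}: {len(tools)} tools")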