major changes

Files changed:
- .env +1 -0
- .gitattributes +35 -35
- data/sample_dataset.csv +0 -0
- main.py +45 -37
- requirements.txt +14 -11
- routers/__pycache__/chatbot_routes.cpython-310.pyc +0 -0
- routers/__pycache__/discover_routes.cpython-310.pyc +0 -0
- routers/__pycache__/intervene_routes.cpython-310.pyc +0 -0
- routers/__pycache__/prediction_routes.cpython-310.pyc +0 -0
- routers/__pycache__/preprocess_routes.cpython-310.pyc +0 -0
- routers/__pycache__/timeseries_routes.cpython-310.pyc +0 -0
- routers/__pycache__/treatment_routes.cpython-310.pyc +0 -0
- routers/__pycache__/visualize_routes.cpython-310.pyc +0 -0
- routers/chatbot_routes.py +25 -0
- routers/discover_routes.py +42 -42
- routers/intervene_routes.py +53 -53
- routers/prediction_routes.py +27 -0
- routers/preprocess_routes.py +55 -55
- routers/timeseries_routes.py +30 -0
- routers/treatment_routes.py +53 -53
- routers/visualize_routes.py +42 -42
- scripts/generate_data.py +29 -29
- streamlit_app.py +618 -307
- utils/__pycache__/casual_algorithms.cpython-310.pyc +0 -0
- utils/__pycache__/causal_chatbot.cpython-310.pyc +0 -0
- utils/__pycache__/do_calculus.cpython-310.pyc +0 -0
- utils/__pycache__/graph_utils.cpython-310.pyc +0 -0
- utils/__pycache__/prediction_models.cpython-310.pyc +0 -0
- utils/__pycache__/preprocessor.cpython-310.pyc +0 -0
- utils/__pycache__/time_series_causal.cpython-310.pyc +0 -0
- utils/__pycache__/treatment_effects.cpython-310.pyc +0 -0
- utils/casual_algorithms.py +63 -63
- utils/causal_chatbot.py +271 -0
- utils/do_calculus.py +51 -51
- utils/graph_utils.py +107 -60
- utils/prediction_models.py +86 -0
- utils/preprocessor.py +88 -57
- utils/time_series_causal.py +102 -0
- utils/treatment_effects.py +62 -62
.env
ADDED
@@ -0,0 +1 @@

GROQ_API_KEY=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
.gitattributes
CHANGED
@@ -1,35 +1,35 @@
All 35 lines were removed and re-added with identical content; the file is shown once below.

*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
data/sample_dataset.csv
CHANGED
The diff for this file is too large to render. See raw diff.
main.py
CHANGED
@@ -1,38 +1,46 @@
Previous version (38 lines): the rendered diff truncates most of the removed lines; apart from the imports and the final app.run() call, the old content is not recoverable from this page.

New version (46 lines):

# main.py
from flask import Flask, jsonify, request
from flask_cors import CORS
import os
import sys
from dotenv import load_dotenv
load_dotenv()

# Add the 'routers' and 'utils' directories to the Python path
# This allows direct imports like 'from routers.preprocess_routes import preprocess_bp'
script_dir = os.path.dirname(__file__)
sys.path.insert(0, os.path.join(script_dir, 'routers'))
sys.path.insert(0, os.path.join(script_dir, 'utils'))

# Import Blueprints
from routers.preprocess_routes import preprocess_bp
from routers.discover_routes import discover_bp
from routers.intervene_routes import intervene_bp
from routers.treatment_routes import treatment_bp
from routers.visualize_routes import visualize_bp
from routers.prediction_routes import prediction_bp
from routers.timeseries_routes import timeseries_bp
from routers.chatbot_routes import chatbot_bp

app = Flask(__name__)
CORS(app)  # Enable CORS for frontend interaction

# Register Blueprints
app.register_blueprint(preprocess_bp, url_prefix='/preprocess')
app.register_blueprint(discover_bp, url_prefix='/discover')
app.register_blueprint(intervene_bp, url_prefix='/intervene')
app.register_blueprint(treatment_bp, url_prefix='/treatment')
app.register_blueprint(visualize_bp, url_prefix='/visualize')
app.register_blueprint(prediction_bp, url_prefix='/prediction')
app.register_blueprint(timeseries_bp, url_prefix='/timeseries')
app.register_blueprint(chatbot_bp, url_prefix='/chatbot')

@app.route('/')
def home():
    return "Welcome to CausalBox Backend API!"

if __name__ == '__main__':
    # Ensure the 'data' directory exists for storing datasets
    os.makedirs('data', exist_ok=True)
    # Run the Flask app
    app.run(debug=True, host='0.0.0.0', port=5000)
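For a quick check that the backend above is wired up, here is a minimal sketch, assuming the Flask app is running locally on the configured port 5000:

import requests

resp = requests.get("http://localhost:5000/")
print(resp.status_code)  # expected: 200
print(resp.text)         # expected: "Welcome to CausalBox Backend API!"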
requirements.txt
CHANGED
@@ -1,11 +1,14 @@
The first 11 entries are unchanged; the last three (statsmodels, google-generativeai, python-dotenv) are new.

Flask
flask-cors
pandas
numpy
scikit-learn
causal-learn # For PC algorithm
networkx
plotly
streamlit
requests # For Streamlit to communicate with Flask
watchfiles # For auto_refresh.py (if implemented for background tasks)
statsmodels # For statistical models and tests
google-generativeai
python-dotenv
routers/__pycache__/chatbot_routes.cpython-310.pyc
ADDED
Binary file (990 Bytes).

routers/__pycache__/discover_routes.cpython-310.pyc
CHANGED
Binary files a/routers/__pycache__/discover_routes.cpython-310.pyc and b/routers/__pycache__/discover_routes.cpython-310.pyc differ

routers/__pycache__/intervene_routes.cpython-310.pyc
CHANGED
Binary files a/routers/__pycache__/intervene_routes.cpython-310.pyc and b/routers/__pycache__/intervene_routes.cpython-310.pyc differ

routers/__pycache__/prediction_routes.cpython-310.pyc
ADDED
Binary file (1.13 kB).

routers/__pycache__/preprocess_routes.cpython-310.pyc
CHANGED
Binary files a/routers/__pycache__/preprocess_routes.cpython-310.pyc and b/routers/__pycache__/preprocess_routes.cpython-310.pyc differ

routers/__pycache__/timeseries_routes.cpython-310.pyc
ADDED
Binary file (1.29 kB).

routers/__pycache__/treatment_routes.cpython-310.pyc
CHANGED
Binary files a/routers/__pycache__/treatment_routes.cpython-310.pyc and b/routers/__pycache__/treatment_routes.cpython-310.pyc differ

routers/__pycache__/visualize_routes.cpython-310.pyc
CHANGED
Binary files a/routers/__pycache__/visualize_routes.cpython-310.pyc and b/routers/__pycache__/visualize_routes.cpython-310.pyc differ
routers/chatbot_routes.py
ADDED
@@ -0,0 +1,25 @@

# routers/chatbot_routes.py
from flask import Blueprint, request, jsonify
from utils.causal_chatbot import get_chatbot_response  # Import the core chatbot logic

chatbot_bp = Blueprint('chatbot_bp', __name__)

@chatbot_bp.route('/message', methods=['POST'])
def handle_chat_message():
    """
    API endpoint for the chatbot to receive user messages and provide responses.
    """
    data = request.json
    user_message = data.get('user_message')
    # Session context includes processed_data, causal_graph_adj, etc.
    session_context = data.get('session_context', {})

    if not user_message:
        return jsonify({"detail": "No user message provided."}), 400

    try:
        response_text = get_chatbot_response(user_message, session_context)
        return jsonify({"response": response_text}), 200
    except Exception as e:
        print(f"Error in chatbot route: {e}")
        return jsonify({"detail": f"An error occurred in the chatbot: {str(e)}"}), 500
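A minimal sketch of a client call to this endpoint, assuming the backend runs at localhost:5000 and the key in .env is valid for utils.causal_chatbot:

import requests

payload = {
    "user_message": "What does the discovered causal graph tell me?",
    "session_context": {},  # may also carry processed_data, causal_graph_adj, etc.
}
resp = requests.post("http://localhost:5000/chatbot/message", json=payload)
print(resp.json())  # {"response": "..."} on success, {"detail": "..."} with status 400/500 on failure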
routers/discover_routes.py
CHANGED
@@ -1,43 +1,43 @@
All 43 lines were removed and re-added with identical content; the file is shown once below.

# routers/discover_routes.py
from flask import Blueprint, request, jsonify
import pandas as pd
from utils.casual_algorithms import CausalDiscoveryAlgorithms
import logging

discover_bp = Blueprint('discover', __name__)
logger = logging.getLogger(__name__)

causal_discovery_algorithms = CausalDiscoveryAlgorithms()

@discover_bp.route('/', methods=['POST'])
def discover_causal_graph():
    """
    Discover causal graph from input data using selected algorithm.
    Expects 'data' key with list of dicts (preprocessed DataFrame records) and 'algorithm' string.
    Returns graph as adjacency matrix.
    """
    try:
        payload = request.json
        if not payload or 'data' not in payload:
            return jsonify({"detail": "Invalid request payload: 'data' key missing."}), 400

        df = pd.DataFrame(payload["data"])
        algorithm = payload.get("algorithm", "pc").lower()  # Default to PC

        logger.info(f"Received discovery request with algorithm: {algorithm}, data shape: {df.shape}")

        if algorithm == "pc":
            adj_matrix = causal_discovery_algorithms.pc_algorithm(df)
        elif algorithm == "ges":
            adj_matrix = causal_discovery_algorithms.ges_algorithm(df)  # Placeholder
        elif algorithm == "notears":
            adj_matrix = causal_discovery_algorithms.notears_algorithm(df)  # Placeholder
        else:
            return jsonify({"detail": f"Unsupported causal discovery algorithm: {algorithm}"}), 400

        logger.info(f"Causal graph discovered using {algorithm}.")
        return jsonify({"graph": adj_matrix.tolist()})

    except Exception as e:
        logger.exception(f"Error in causal discovery: {str(e)}")
        return jsonify({"detail": f"Causal discovery failed: {str(e)}"}), 500
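A minimal sketch of calling this endpoint end-to-end, assuming the backend runs at localhost:5000 and the repository's sample CSV is available at data/sample_dataset.csv:

import requests

# Preprocess first, then feed the returned records into discovery.
with open("data/sample_dataset.csv", "rb") as f:
    pre = requests.post("http://localhost:5000/preprocess/upload", files={"file": f})
records = pre.json()["data"]

resp = requests.post("http://localhost:5000/discover/",
                     json={"data": records, "algorithm": "pc"})
print(resp.json())  # {"graph": [[...], ...]} -- adjacency matrix over the preprocessed columns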
routers/intervene_routes.py
CHANGED
@@ -1,54 +1,54 @@
All 54 lines were removed and re-added with identical content; the file is shown once below.

# routers/intervene_routes.py
from flask import Blueprint, request, jsonify
import pandas as pd
from utils.do_calculus import DoCalculus  # Will be used for more advanced intervention
import networkx as nx  # Assuming graph is passed or re-discovered
import logging

intervene_bp = Blueprint('intervene', __name__)
logger = logging.getLogger(__name__)

@intervene_bp.route('/', methods=['POST'])
def perform_intervention():
    """
    Perform causal intervention on data.
    Expects 'data' (list of dicts), 'intervention_var' (column name),
    'intervention_value' (numeric), and optionally 'graph' (adjacency matrix).
    Returns intervened data as list of dicts.
    """
    try:
        payload = request.json
        if not payload or 'data' not in payload or 'intervention_var' not in payload or 'intervention_value' not in payload:
            return jsonify({"detail": "Missing required intervention parameters."}), 400

        df = pd.DataFrame(payload["data"])
        intervention_var = payload["intervention_var"]
        intervention_value = payload["intervention_value"]
        graph_adj_matrix = payload.get("graph")  # Optional: pass pre-discovered graph

        logger.info(f"Intervention request: var={intervention_var}, value={intervention_value}, data shape: {df.shape}")

        if intervention_var not in df.columns:
            return jsonify({"detail": f"Intervention variable '{intervention_var}' not found in data"}), 400

        # For a more advanced do-calculus, you'd need the graph structure.
        # Here, a simplified direct intervention is applied first.
        # If graph_adj_matrix is provided, you could convert it to networkx.
        # For full do-calculus, the DoCalculus class would need a proper graph.

        df_intervened = df.copy()
        df_intervened[intervention_var] = intervention_value

        # Placeholder for propagating effects using a graph if provided
        # if graph_adj_matrix:
        #     graph_nx = nx.from_numpy_array(np.array(graph_adj_matrix), create_using=nx.DiGraph)
        #     do_calculus_engine = DoCalculus(graph_nx)
        #     df_intervened = do_calculus_engine.intervene(df_intervened, intervention_var, intervention_value)
        #     logger.info("Propagated effects using do-calculus (simplified).")

        logger.info(f"Intervened data shape: {df_intervened.shape}")
        return jsonify({"intervened_data": df_intervened.to_dict(orient="records")})

    except Exception as e:
        logger.exception(f"Error in intervention: {str(e)}")
        return jsonify({"detail": f"Intervention failed: {str(e)}"}), 500
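A minimal sketch of an intervention request, assuming the backend runs at localhost:5000; the rows below are illustrative stand-ins for the records returned by /preprocess/upload:

import requests

records = [
    {"StudyHours": 9.5, "TuitionHours": 4.8, "FinalExamScore": 75.2},
    {"StudyHours": 11.0, "TuitionHours": 5.3, "FinalExamScore": 82.1},
]
resp = requests.post("http://localhost:5000/intervene/",
                     json={"data": records,
                           "intervention_var": "StudyHours",
                           "intervention_value": 12.0})
print(resp.json()["intervened_data"])  # every row now has StudyHours == 12.0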
routers/prediction_routes.py
ADDED
@@ -0,0 +1,27 @@

# routers/prediction_routes.py
from flask import Blueprint, request, jsonify
import pandas as pd
from utils.prediction_models import train_predict_random_forest

prediction_bp = Blueprint('prediction_bp', __name__)

@prediction_bp.route('/train_predict', methods=['POST'])
def train_predict():
    """
    API endpoint to train a Random Forest model and perform prediction/evaluation.
    """
    data = request.json.get('data')
    target_col = request.json.get('target_col')
    feature_cols = request.json.get('feature_cols')
    prediction_type = request.json.get('prediction_type')

    if not all([data, target_col, feature_cols, prediction_type]):
        return jsonify({"detail": "Missing required parameters for prediction."}), 400

    try:
        results = train_predict_random_forest(data, target_col, feature_cols, prediction_type)
        return jsonify({"results": results}), 200
    except ValueError as e:
        return jsonify({"detail": str(e)}), 400
    except Exception as e:
        return jsonify({"detail": f"An error occurred during prediction: {str(e)}"}), 500
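A minimal sketch of a prediction request, assuming the backend runs at localhost:5000 and the sample CSV is readable locally; "regression" as the prediction_type is an assumption, since the accepted values are defined by train_predict_random_forest in utils/prediction_models.py:

import pandas as pd
import requests

records = pd.read_csv("data/sample_dataset.csv").to_dict(orient="records")
payload = {
    "data": records,
    "target_col": "FinalExamScore",
    "feature_cols": ["StudyHours", "TuitionHours"],
    "prediction_type": "regression",  # assumed value; see train_predict_random_forest
}
resp = requests.post("http://localhost:5000/prediction/train_predict", json=payload)
print(resp.json())  # {"results": {...}} on success, {"detail": "..."} on error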
routers/preprocess_routes.py
CHANGED
@@ -1,56 +1,56 @@
All 56 lines were removed and re-added with identical content; the file is shown once below.

# routers/preprocess_routes.py
from flask import Blueprint, request, jsonify
import pandas as pd
from utils.preprocessor import DataPreprocessor
import logging

preprocess_bp = Blueprint('preprocess', __name__)

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

preprocessor = DataPreprocessor()

@preprocess_bp.route('/upload', methods=['POST'])
def upload_file():
    """
    Upload and preprocess a CSV file.
    Returns preprocessed DataFrame columns and data as JSON.
    Optional limit_rows to reduce response size for testing.
    """
    if 'file' not in request.files:
        return jsonify({"detail": "No file part in the request"}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({"detail": "No selected file"}), 400
    if not file.filename.lower().endswith('.csv'):
        return jsonify({"detail": "Only CSV files are supported"}), 400

    limit_rows = request.args.get('limit_rows', type=int)

    try:
        logger.info(f"Received file: {file.filename}")
        df = pd.read_csv(file)
        logger.info(f"CSV read successfully, shape: {df.shape}")

        processed_df = preprocessor.preprocess(df)
        if limit_rows:
            processed_df = processed_df.head(limit_rows)
            logger.info(f"Limited to {limit_rows} rows.")

        response = {
            "columns": list(processed_df.columns),
            "data": processed_df.to_dict(orient="records")
        }
        logger.info(f"Preprocessed {len(response['data'])} records.")
        return jsonify(response)
    except pd.errors.EmptyDataError:
        logger.error("Empty CSV file uploaded.")
        return jsonify({"detail": "Empty CSV file"}), 400
    except pd.errors.ParserError:
        logger.error("Invalid CSV format.")
        return jsonify({"detail": "Invalid CSV format"}), 400
    except Exception as e:
        logger.exception(f"Unexpected error during file processing: {str(e)}")
        return jsonify({"detail": f"Failed to process file: {str(e)}"}), 500
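A minimal sketch of uploading a CSV to this endpoint, assuming the backend runs at localhost:5000 and the repository's sample dataset exists locally:

import requests

with open("data/sample_dataset.csv", "rb") as f:
    resp = requests.post("http://localhost:5000/preprocess/upload?limit_rows=100",
                         files={"file": ("sample_dataset.csv", f, "text/csv")})
result = resp.json()
print(result["columns"])    # preprocessed column names
print(len(result["data"]))  # at most 100 records because of limit_rows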
routers/timeseries_routes.py
ADDED
@@ -0,0 +1,30 @@

# routers/timeseries_routes.py
from flask import Blueprint, request, jsonify
import pandas as pd
from utils.time_series_causal import perform_granger_causality

timeseries_bp = Blueprint('timeseries_bp', __name__)

@timeseries_bp.route('/discover_causality', methods=['POST'])
def discover_timeseries_causality():
    """
    API endpoint to perform time-series causal discovery (Granger Causality).
    """
    data = request.json.get('data')
    timestamp_col = request.json.get('timestamp_col')
    variables_to_analyze = request.json.get('variables_to_analyze')
    max_lags = request.json.get('max_lags', 1)  # Default to 1 lag

    if not all([data, timestamp_col, variables_to_analyze]):
        return jsonify({"detail": "Missing required parameters for time-series causal discovery."}), 400

    if not isinstance(max_lags, int) or max_lags <= 0:
        return jsonify({"detail": "max_lags must be a positive integer."}), 400

    try:
        results = perform_granger_causality(data, timestamp_col, variables_to_analyze, max_lags)
        return jsonify({"results": results}), 200
    except ValueError as e:
        return jsonify({"detail": str(e)}), 400
    except Exception as e:
        return jsonify({"detail": f"An error occurred during time-series causal discovery: {str(e)}"}), 500
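A minimal sketch of a Granger-causality request, assuming the backend runs at localhost:5000; the column names ("date", "sales", "ad_spend") are purely illustrative and not part of the repository's sample data:

import requests

records = [
    {"date": "2024-01-01", "sales": 100.0, "ad_spend": 10.0},
    {"date": "2024-01-02", "sales": 110.0, "ad_spend": 12.0},
    # ... more rows; Granger tests need enough observations for the chosen number of lags
]
payload = {
    "data": records,
    "timestamp_col": "date",
    "variables_to_analyze": ["sales", "ad_spend"],
    "max_lags": 2,
}
resp = requests.post("http://localhost:5000/timeseries/discover_causality", json=payload)
print(resp.json())  # {"results": {...}} on success, {"detail": "..."} on error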
routers/treatment_routes.py
CHANGED
@@ -1,54 +1,54 @@
All 54 lines were removed and re-added with identical content; the file is shown once below.

# routers/treatment_routes.py
from flask import Blueprint, request, jsonify
import pandas as pd
from utils.treatment_effects import TreatmentEffectAlgorithms
import logging

treatment_bp = Blueprint('treatment', __name__)
logger = logging.getLogger(__name__)

treatment_effect_algorithms = TreatmentEffectAlgorithms()

@treatment_bp.route('/estimate_ate', methods=['POST'])
def estimate_ate():
    """
    Estimate Average Treatment Effect (ATE) or Conditional Treatment Effect (CATE).
    Expects 'data' (list of dicts), 'treatment_col', 'outcome_col', 'covariates' (list of column names),
    and 'method' (string for estimation method).
    Returns ATE/CATE as float or dictionary.
    """
    try:
        payload = request.json
        if not payload or 'data' not in payload or 'treatment_col' not in payload or 'outcome_col' not in payload or 'covariates' not in payload:
            return jsonify({"detail": "Missing required ATE estimation parameters."}), 400

        df = pd.DataFrame(payload["data"])
        treatment_col = payload["treatment_col"]
        outcome_col = payload["outcome_col"]
        covariates = payload["covariates"]
        method = payload.get("method", "linear_regression").lower()  # Default to linear regression

        logger.info(f"ATE/CATE request: treatment={treatment_col}, outcome={outcome_col}, method={method}, data shape: {df.shape}")

        if not all(col in df.columns for col in [treatment_col, outcome_col] + covariates):
            return jsonify({"detail": "Invalid column names provided for ATE estimation."}), 400

        if method == "linear_regression":
            result = treatment_effect_algorithms.linear_regression_ate(df, treatment_col, outcome_col, covariates)
        elif method == "propensity_score_matching":
            result = treatment_effect_algorithms.propensity_score_matching(df, treatment_col, outcome_col, covariates)  # Placeholder
        elif method == "inverse_propensity_weighting":
            result = treatment_effect_algorithms.inverse_propensity_weighting(df, treatment_col, outcome_col, covariates)  # Placeholder
        elif method == "t_learner":
            result = treatment_effect_algorithms.t_learner(df, treatment_col, outcome_col, covariates)  # Placeholder
        elif method == "s_learner":
            result = treatment_effect_algorithms.s_learner(df, treatment_col, outcome_col, covariates)  # Placeholder
        else:
            return jsonify({"detail": f"Unsupported treatment effect estimation method: {method}"}), 400

        logger.info(f"Estimated ATE/CATE using {method}: {result}")
        return jsonify({"result": result})

    except Exception as e:
        logger.exception(f"Error in ATE/CATE estimation: {str(e)}")
        return jsonify({"detail": f"ATE/CATE estimation failed: {str(e)}"}), 500
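A minimal sketch of an ATE request using the default linear-regression method, assuming the backend runs at localhost:5000 and the sample CSV is preprocessed first so that all columns are numeric:

import requests

with open("data/sample_dataset.csv", "rb") as f:
    pre = requests.post("http://localhost:5000/preprocess/upload", files={"file": f})
records = pre.json()["data"]

payload = {
    "data": records,
    "treatment_col": "StudyHours",
    "outcome_col": "FinalExamScore",
    "covariates": ["TuitionHours"],
    "method": "linear_regression",
}
resp = requests.post("http://localhost:5000/treatment/estimate_ate", json=payload)
print(resp.json())  # {"result": <estimated ATE>}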
routers/visualize_routes.py
CHANGED
@@ -1,43 +1,43 @@
All 43 lines were removed and re-added with identical content; the file is shown once below.

# routers/visualize_routes.py
from flask import Blueprint, request, jsonify
import pandas as pd
from utils.graph_utils import visualize_graph
import networkx as nx
import numpy as np
import logging

visualize_bp = Blueprint('visualize', __name__)
logger = logging.getLogger(__name__)

@visualize_bp.route('/graph', methods=['POST'])
def get_graph_visualization():
    """
    Generate a causal graph visualization from an adjacency matrix.
    Expects 'graph' (adjacency matrix as list of lists) and 'nodes' (list of node names).
    Returns Plotly JSON for the graph.
    """
    try:
        payload = request.json
        if not payload or 'graph' not in payload or 'nodes' not in payload:
            return jsonify({"detail": "Missing 'graph' or 'nodes' in request payload."}), 400

        adj_matrix = np.array(payload["graph"])
        nodes = payload["nodes"]

        logger.info(f"Received graph visualization request for {len(nodes)} nodes.")

        # Reconstruct networkx graph from adjacency matrix and node names
        graph_nx = nx.from_numpy_array(adj_matrix, create_using=nx.DiGraph)

        # Map integer node labels back to original column names if necessary
        # Assuming nodes are ordered as they appear in the original dataframe or provided in 'nodes'
        mapping = {i: node_name for i, node_name in enumerate(nodes)}
        graph_nx = nx.relabel_nodes(graph_nx, mapping)

        graph_json = visualize_graph(graph_nx)
        logger.info("Generated graph visualization JSON.")
        return jsonify({"graph": graph_json})

    except Exception as e:
        logger.exception(f"Error generating graph visualization: {str(e)}")
        return jsonify({"detail": f"Failed to generate visualization: {str(e)}"}), 500
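A minimal sketch of requesting and rendering a graph, assuming the backend runs at localhost:5000; the 3x3 adjacency matrix below is hand-written for illustration and would normally come from the /discover/ response:

import json
import requests
import plotly.graph_objects as go

adj = [[0, 1, 1],
       [0, 0, 1],
       [0, 0, 0]]
nodes = ["StudyHours", "TuitionHours", "FinalExamScore"]

resp = requests.post("http://localhost:5000/visualize/graph",
                     json={"graph": adj, "nodes": nodes})
fig = go.Figure(json.loads(resp.json()["graph"]))  # same decoding streamlit_app.py uses
fig.show()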
scripts/generate_data.py
CHANGED
@@ -1,29 +1,29 @@
All 29 lines were removed and re-added with identical content; the file is shown once below.

# scripts/generate_data.py
import numpy as np
import pandas as pd
import os

def generate_dataset(n_samples=1000):
    np.random.seed(42)
    study_hours = np.random.normal(10, 2, n_samples)
    tuition_hours = np.random.normal(5, 1, n_samples)
    parental_education = np.random.choice(['High', 'Medium', 'Low'], n_samples)
    school_type = np.random.choice(['Public', 'Private'], n_samples)
    exam_score = 50 + 2 * study_hours + 1.5 * tuition_hours + np.random.normal(0, 5, n_samples)

    df = pd.DataFrame({
        'StudyHours': study_hours,
        'TuitionHours': tuition_hours,
        'ParentalEducation': parental_education,
        'SchoolType': school_type,
        'FinalExamScore': exam_score
    })

    # Ensure data directory exists
    os.makedirs('../data', exist_ok=True)
    df.to_csv('../data/sample_dataset.csv', index=False)
    return df

if __name__ == "__main__":
    generate_dataset()
    print("Dataset generated and saved to ../data/sample_dataset.csv")
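A minimal sketch of regenerating the sample data with a different size, assuming it is run from inside the scripts/ directory (the output path is relative: ../data/sample_dataset.csv):

from generate_data import generate_dataset

df = generate_dataset(n_samples=500)
print(df.shape)   # (500, 5)
print(df.head())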
streamlit_app.py
CHANGED
@@ -1,308 +1,619 @@
|
|
1 |
-
# streamlit_app.py
|
2 |
-
import streamlit as st
|
3 |
-
import pandas as pd
|
4 |
-
import requests
|
5 |
-
import json
|
6 |
-
import plotly.express as px
|
7 |
-
import plotly.graph_objects as go
|
8 |
-
import numpy as np # For random array in placeholders
|
9 |
-
import os
|
10 |
-
|
11 |
-
# Configuration
|
12 |
-
FLASK_API_URL = "http://localhost:5000" # Ensure this matches your Flask app's host and port
|
13 |
-
|
14 |
-
st.set_page_config(layout="wide", page_title="CausalBox Toolkit")
|
15 |
-
|
16 |
-
st.title("🔬 CausalBox: A Causal Inference Toolkit")
|
17 |
-
st.markdown("Uncover causal relationships, simulate interventions, and estimate treatment effects.")
|
18 |
-
|
19 |
-
# --- Session State Initialization ---
|
20 |
-
if 'processed_data' not in st.session_state:
|
21 |
-
st.session_state.processed_data = None
|
22 |
-
if 'processed_columns' not in st.session_state:
|
23 |
-
st.session_state.processed_columns = None
|
24 |
-
if 'causal_graph_adj' not in st.session_state:
|
25 |
-
st.session_state.causal_graph_adj = None
|
26 |
-
if 'causal_graph_nodes' not in st.session_state:
|
27 |
-
st.session_state.causal_graph_nodes = None
|
28 |
-
|
29 |
-
# --- Data Preprocessing Module ---
|
30 |
-
st.header("1. Data Preprocessor 🧹")
|
31 |
-
st.write("Upload your CSV dataset or use a generated sample dataset.")
|
32 |
-
|
33 |
-
# Option to use generated sample dataset
|
34 |
-
if st.button("Use Sample Dataset (sample_dataset.csv)"):
|
35 |
-
# In a real scenario, Streamlit would serve the file or you'd load it directly if local.
|
36 |
-
# For this setup, we assume the Flask backend can access it or you manually upload it once.
|
37 |
-
# For demonstration, we'll simulate loading a generic DataFrame.
|
38 |
-
# In a full deployment, you'd have a mechanism to either:
|
39 |
-
# a) Have Flask serve the sample file, or
|
40 |
-
# b) Directly load it in Streamlit if the app and data are co-located.
|
41 |
-
try:
|
42 |
-
# Assuming the sample dataset is accessible or you are testing locally with `scripts/generate_data.py`
|
43 |
-
# and then manually uploading this generated file.
|
44 |
-
# For simplicity, we'll create a dummy df here if not actually uploaded.
|
45 |
-
sample_df_path = "data/sample_dataset.csv" # Path relative to main.py or Streamlit app execution
|
46 |
-
if os.path.exists(sample_df_path):
|
47 |
-
sample_df = pd.read_csv(sample_df_path)
|
48 |
-
st.success(f"Loaded sample dataset from {sample_df_path}. Please upload this file if running from different directory.")
|
49 |
-
else:
|
50 |
-
st.warning("Sample dataset not found at data/sample_dataset.csv.")
|
51 |
-
# Dummy DataFrame for demonstration if sample file isn't found
|
52 |
-
sample_df = pd.DataFrame(np.random.rand(10, 5), columns=[f'col_{i}' for i in range(5)])
|
53 |
-
|
54 |
-
# Convert to JSON for Flask API call
|
55 |
-
files = {'file': ('sample_dataset.csv', sample_df.to_csv(index=False), 'text/csv')}
|
56 |
-
response = requests.post(f"{FLASK_API_URL}/preprocess/upload", files=files)
|
57 |
-
|
58 |
-
if response.status_code == 200:
|
59 |
-
result = response.json()
|
60 |
-
st.session_state.processed_data = result['data']
|
61 |
-
st.session_state.processed_columns = result['columns']
|
62 |
-
st.success("Sample dataset preprocessed successfully!")
|
63 |
-
st.dataframe(pd.DataFrame(st.session_state.processed_data).head()) # Display first few rows
|
64 |
-
else:
|
65 |
-
st.error(f"Error preprocessing sample dataset: {response.json().get('detail', 'Unknown error')}")
|
66 |
-
except Exception as e:
|
67 |
-
st.error(f"Could not load or process sample dataset: {e}")
|
68 |
-
|
69 |
-
|
70 |
-
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
|
71 |
-
if uploaded_file is not None:
|
72 |
-
st.info("Uploading and preprocessing data...")
|
73 |
-
files = {'file': (uploaded_file.name, uploaded_file.getvalue(), 'text/csv')}
|
74 |
-
try:
|
75 |
-
response = requests.post(f"{FLASK_API_URL}/preprocess/upload", files=files)
|
76 |
-
if response.status_code == 200:
|
77 |
-
result = response.json()
|
78 |
-
st.session_state.processed_data = result['data']
|
79 |
-
st.session_state.processed_columns = result['columns']
|
80 |
-
st.success("File preprocessed successfully!")
|
81 |
-
st.dataframe(pd.DataFrame(st.session_state.processed_data).head()) # Display first few rows
|
82 |
-
else:
|
83 |
-
st.error(f"Error during preprocessing: {response.json().get('detail', 'Unknown error')}")
|
84 |
-
except requests.exceptions.ConnectionError:
|
85 |
-
st.error(f"Could not connect to Flask API at {FLASK_API_URL}. Please ensure the backend is running.")
|
86 |
-
except Exception as e:
|
87 |
-
st.error(f"An unexpected error occurred: {e}")
|
88 |
-
|
89 |
-
# --- Causal Discovery Module ---
|
90 |
-
st.header("2. Causal Discovery 🕵️♂️")
|
91 |
-
if st.session_state.processed_data:
|
92 |
-
st.write("Learn the causal structure from your preprocessed data.")
|
93 |
-
|
94 |
-
discovery_algo = st.selectbox(
|
95 |
-
"Select Causal Discovery Algorithm:",
|
96 |
-
("PC Algorithm", "GES (Greedy Equivalence Search) - Placeholder", "NOTEARS - Placeholder")
|
97 |
-
)
|
98 |
-
|
99 |
-
if st.button("Discover Causal Graph"):
|
100 |
-
st.info(f"Discovering graph using {discovery_algo}...")
|
101 |
-
algo_map = {
|
102 |
-
"PC Algorithm": "pc",
|
103 |
-
"GES (Greedy Equivalence Search) - Placeholder": "ges",
|
104 |
-
"NOTEARS - Placeholder": "notears"
|
105 |
-
}
|
106 |
-
selected_algo_code = algo_map[discovery_algo]
|
107 |
-
|
108 |
-
try:
|
109 |
-
response = requests.post(
|
110 |
-
f"{FLASK_API_URL}/discover/",
|
111 |
-
json={"data": st.session_state.processed_data, "algorithm": selected_algo_code}
|
112 |
-
)
|
113 |
-
if response.status_code == 200:
|
114 |
-
result = response.json()
|
115 |
-
st.session_state.causal_graph_adj = result['graph']
|
116 |
-
st.session_state.causal_graph_nodes = st.session_state.processed_columns
|
117 |
-
st.success("Causal graph discovered!")
|
118 |
-
st.subheader("Causal Graph Visualization")
|
119 |
-
# Visualization will be handled by the Causal Graph Visualizer section
|
120 |
-
else:
|
121 |
-
st.error(f"Error during causal discovery: {response.json().get('detail', 'Unknown error')}")
|
122 |
-
except requests.exceptions.ConnectionError:
|
123 |
-
st.error(f"Could not connect to Flask API at {FLASK_API_URL}. Please ensure the backend is running.")
|
124 |
-
except Exception as e:
|
125 |
-
st.error(f"An unexpected error occurred: {e}")
|
126 |
-
else:
|
127 |
-
st.info("Please preprocess data first to enable causal discovery.")
|
128 |
-
|
129 |
-
# --- Causal Graph Visualizer Module ---
|
130 |
-
st.header("3. Causal Graph Visualizer 📊")
|
131 |
-
if st.session_state.causal_graph_adj and st.session_state.causal_graph_nodes:
|
132 |
-
st.write("Interactive visualization of the discovered causal graph.")
|
133 |
-
try:
|
134 |
-
response = requests.post(
|
135 |
-
f"{FLASK_API_URL}/visualize/graph",
|
136 |
-
json={"graph": st.session_state.causal_graph_adj, "nodes": st.session_state.causal_graph_nodes}
|
137 |
-
)
|
138 |
-
if response.status_code == 200:
|
139 |
-
graph_json = response.json()['graph']
|
140 |
-
fig = go.Figure(json.loads(graph_json))
|
141 |
-
st.plotly_chart(fig, use_container_width=True)
|
142 |
-
st.markdown("""
|
143 |
-
**Graph Explanation:**
|
144 |
-
* **Nodes:** Represent variables in your dataset.
|
145 |
-
* **Arrows (Edges):** Indicate a direct causal influence from one variable (the tail) to another (the head).
|
146 |
-
* **No Arrow:** Suggests no direct causal relationship was found, or the relationship is mediated by other variables.
|
147 |
-
|
148 |
-
This graph helps answer "Why did it happen?" by showing the structural relationships.
|
149 |
-
""")
|
150 |
-
else:
|
151 |
-
st.error(f"Error visualizing graph: {response.json().get('detail', 'Unknown error')}")
|
152 |
-
except requests.exceptions.ConnectionError:
|
153 |
-
st.error(f"Could not connect to Flask API at {FLASK_API_URL}. Please ensure the backend is running.")
|
154 |
-
except Exception as e:
|
155 |
-
st.error(f"An unexpected error occurred during visualization: {e}")
|
156 |
-
else:
|
157 |
-
st.info("Please discover a causal graph first to visualize it.")
|
158 |
-
|
159 |
-
|
160 |
-
# --- Do-Calculus Engine Module ---
|
161 |
-
st.header("4. Do-Calculus Engine 🧪")
|
162 |
-
if st.session_state.processed_data and st.session_state.causal_graph_adj:
|
163 |
-
st.write("Simulate interventions and observe their effects based on the causal graph.")
|
164 |
-
|
165 |
-
intervention_var = st.selectbox(
|
166 |
-
"Select variable to intervene on:",
|
167 |
-
st.session_state.processed_columns,
|
168 |
-
key="inter_var_select"
|
169 |
-
)
|
170 |
-
# Attempt to infer type for intervention_value input
|
171 |
-
# Simplified approach: assuming numerical for now due to preprocessor output
|
172 |
-
if intervention_var and isinstance(st.session_state.processed_data[0][intervention_var], (int, float)):
|
173 |
-
intervention_value = st.number_input(f"Set '{intervention_var}' to value:", key="inter_val_input")
|
174 |
-
else: # Treat as string/categorical for input, then try to preprocess for API
|
175 |
-
intervention_value = st.text_input(f"Set '{intervention_var}' to value:", key="inter_val_input_text")
|
176 |
-
st.warning("Categorical intervention values might require specific encoding logic on the backend.")
|
177 |
-
|
178 |
-
if st.button("Perform Intervention"):
|
179 |
-
st.info(f"Performing intervention: do('{intervention_var}' = {intervention_value})...")
|
180 |
-
try:
|
181 |
-
response = requests.post(
|
182 |
-
f"{FLASK_API_URL}/intervene/",
|
183 |
-
json={
|
184 |
-
"data": st.session_state.processed_data,
|
185 |
-
"intervention_var": intervention_var,
|
186 |
-
"intervention_value": intervention_value,
|
187 |
-
"graph": st.session_state.causal_graph_adj # Pass graph for advanced do-calculus
|
188 |
-
}
|
189 |
-
)
|
190 |
-
if response.status_code == 200:
|
191 |
-
intervened_data = pd.DataFrame(response.json()['intervened_data'])
|
192 |
-
st.success("Intervention simulated successfully!")
|
193 |
-
st.subheader("Intervened Data (First 10 rows)")
|
194 |
-
st.dataframe(intervened_data.head(10))
|
195 |
-
|
196 |
-
# Simple comparison visualization (e.g., histogram of outcome variable)
|
197 |
-
if st.session_state.processed_columns and 'FinalExamScore' in st.session_state.processed_columns:
|
198 |
-
original_df = pd.DataFrame(st.session_state.processed_data)
|
199 |
-
fig_dist = go.Figure()
|
200 |
-
fig_dist.add_trace(go.Histogram(x=original_df['FinalExamScore'], name='Original', opacity=0.7))
|
201 |
-
fig_dist.add_trace(go.Histogram(x=intervened_data['FinalExamScore'], name='Intervened', opacity=0.0))
|
202 |
-
|
203 |
-
st.plotly_chart(fig_dist, use_container_width=True)
|
204 |
-
st.markdown("""
|
205 |
-
**Intervention Explanation:**
|
206 |
-
* By simulating `do(X=x)`, we are forcing the value of X, effectively breaking its causal links from its parents.
|
207 |
-
* The graph above shows the distribution of a key outcome variable (e.g., `FinalExamScore`) before and after the intervention.
|
208 |
-
* This helps answer "What if we do this instead?" by showing the predicted outcome.
|
209 |
-
""")
|
210 |
-
else:
|
211 |
-
st.info("Consider adding a relevant outcome variable to your dataset for better intervention analysis.")
|
212 |
-
else:
|
213 |
-
st.error(f"Error during intervention: {response.json().get('detail', 'Unknown error')}")
|
214 |
-
except requests.exceptions.ConnectionError:
|
215 |
-
st.error(f"Could not connect to Flask API at {FLASK_API_URL}. Please ensure the backend is running.")
|
216 |
-
except Exception as e:
|
217 |
-
st.error(f"An unexpected error occurred during intervention: {e}")
|
218 |
-
else:
|
219 |
-
st.info("Please preprocess data and discover a causal graph first to perform interventions.")
|
220 |
-
|
221 |
-
# --- Treatment Effect Estimator Module ---
|
222 |
-
st.header("5. Treatment Effect Estimator 🎯")
|
223 |
-
if st.session_state.processed_data:
|
224 |
-
st.write("Estimate Average Treatment Effect (ATE) or Conditional Treatment Effect (CATE).")
|
225 |
-
|
226 |
-
col1, col2 = st.columns(2)
|
227 |
-
with col1:
|
228 |
-
treatment_col = st.selectbox(
|
229 |
-
"Select Treatment Variable:",
|
230 |
-
st.session_state.processed_columns,
|
231 |
-
key="treat_col_select"
|
232 |
-
)
|
233 |
-
with col2:
|
234 |
-
outcome_col = st.selectbox(
|
235 |
-
"Select Outcome Variable:",
|
236 |
-
st.session_state.processed_columns,
|
237 |
-
key="outcome_col_select"
|
238 |
-
)
|
239 |
-
|
240 |
-
all_cols_except_treat_outcome = [col for col in st.session_state.processed_columns if col not in [treatment_col, outcome_col]]
|
241 |
-
covariates = st.multiselect(
|
242 |
-
"Select Covariates (confounders):",
|
243 |
-
all_cols_except_treat_outcome,
|
244 |
-
default=all_cols_except_treat_outcome, # Default to all other columns
|
245 |
-
key="covariates_select"
|
246 |
-
)
|
247 |
-
|
248 |
-
estimation_method = st.selectbox(
|
249 |
-
"Select Estimation Method:",
|
250 |
-
(
|
251 |
-
"Linear Regression ATE",
|
252 |
-
"Propensity Score Matching - Placeholder",
|
253 |
-
"Inverse Propensity Weighting - Placeholder",
|
254 |
-
"T-learner - Placeholder",
|
255 |
-
"S-learner - Placeholder"
|
256 |
-
)
|
257 |
-
)
|
258 |
-
|
259 |
-
if st.button("Estimate Treatment Effect"):
|
260 |
-
st.info(f"Estimating treatment effect using {estimation_method}...")
|
261 |
-
method_map = {
|
262 |
-
"Linear Regression ATE": "linear_regression",
|
263 |
-
"Propensity Score Matching - Placeholder": "propensity_score_matching",
|
264 |
-
"Inverse Propensity Weighting - Placeholder": "inverse_propensity_weighting",
|
265 |
-
"T-learner - Placeholder": "t_learner",
|
266 |
-
"S-learner - Placeholder": "s_learner"
|
267 |
-
}
|
268 |
-
selected_method_code = method_map[estimation_method]
|
269 |
-
|
270 |
-
try:
|
271 |
-
response = requests.post(
|
272 |
-
f"{FLASK_API_URL}/treatment/estimate_ate",
|
273 |
-
json={
|
274 |
-
"data": st.session_state.processed_data,
|
275 |
-
"treatment_col": treatment_col,
|
276 |
-
"outcome_col": outcome_col,
|
277 |
-
"covariates": covariates,
|
278 |
-
"method": selected_method_code
|
279 |
-
}
|
280 |
-
)
|
281 |
-
if response.status_code == 200:
|
282 |
-
ate_result = response.json()['result']
|
283 |
-
st.success(f"Treatment effect estimated using {estimation_method}:")
|
284 |
-
st.write(f"**Estimated ATE: {ate_result:.4f}**")
|
285 |
-
st.markdown("""
|
286 |
-
**Treatment Effect Explanation:**
|
287 |
-
* **Average Treatment Effect (ATE):** Measures the average causal effect of a treatment (e.g., `StudyHours`) on an outcome (e.g., `FinalExamScore`) across the entire population.
|
288 |
-
* It answers "How much does doing X cause a change in Y?".
|
289 |
-
* This estimation attempts to control for confounders (variables that influence both treatment and outcome) to isolate the true causal effect.
|
290 |
-
""")
|
291 |
-
else:
|
292 |
-
st.error(f"Error during ATE estimation: {response.json().get('detail', 'Unknown error')}")
|
293 |
-
except requests.exceptions.ConnectionError:
|
294 |
-
st.error(f"Could not connect to Flask API at {FLASK_API_URL}. Please ensure the backend is running.")
|
295 |
-
except Exception as e:
|
296 |
-
st.error(f"An unexpected error occurred during ATE estimation: {e}")
|
297 |
-
else:
|
298 |
-
st.info("Please preprocess data first to estimate treatment effects.")
|
299 |
-
|
300 |
-
# ---
|
301 |
-
st.header("
|
302 |
-
st.
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
st.info("Developed by CausalBox Team. For support, please contact us.")
|
|
|
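Editor's note on the "Linear Regression ATE" option in the Treatment Effect Estimator above: a common way to estimate the ATE with covariate adjustment is to regress the outcome on the treatment plus the selected covariates and read off the treatment coefficient. The sketch below is only assumed to mirror what the backend's /treatment/estimate_ate route computes; the helper name and column handling are illustrative.

# Minimal sketch: ATE via covariate-adjusted linear regression.
# Assumes a linear outcome model and no unobserved confounding, in which case
# the coefficient on the treatment column can be read as the ATE.
import pandas as pd
from sklearn.linear_model import LinearRegression

def linear_regression_ate(df: pd.DataFrame, treatment_col: str, outcome_col: str, covariates: list) -> float:
    X = df[[treatment_col] + list(covariates)]
    y = df[outcome_col]
    model = LinearRegression().fit(X, y)
    return float(model.coef_[0])  # coefficient on the treatment column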
1 |
+
# streamlit_app.py
|
2 |
+
import streamlit as st
|
3 |
+
import pandas as pd
|
4 |
+
import requests
|
5 |
+
import json
|
6 |
+
import plotly.express as px
|
7 |
+
import plotly.graph_objects as go
|
8 |
+
import numpy as np # For random array in placeholders
|
9 |
+
import os
|
10 |
+
|
11 |
+
# Configuration
|
12 |
+
FLASK_API_URL = "http://localhost:5000" # Ensure this matches your Flask app's host and port
|
13 |
+
|
14 |
+
st.set_page_config(layout="wide", page_title="CausalBox Toolkit")
|
15 |
+
|
16 |
+
st.title("🔬 CausalBox: A Causal Inference Toolkit")
|
17 |
+
st.markdown("Uncover causal relationships, simulate interventions, and estimate treatment effects.")
|
18 |
+
|
19 |
+
# --- Session State Initialization ---
|
20 |
+
if 'processed_data' not in st.session_state:
|
21 |
+
st.session_state.processed_data = None
|
22 |
+
if 'processed_columns' not in st.session_state:
|
23 |
+
st.session_state.processed_columns = None
|
24 |
+
if 'causal_graph_adj' not in st.session_state:
|
25 |
+
st.session_state.causal_graph_adj = None
|
26 |
+
if 'causal_graph_nodes' not in st.session_state:
|
27 |
+
st.session_state.causal_graph_nodes = None
|
28 |
+
|
29 |
+
# --- Data Preprocessing Module ---
|
30 |
+
st.header("1. Data Preprocessor 🧹")
|
31 |
+
st.write("Upload your CSV dataset or use a generated sample dataset.")
|
32 |
+
|
33 |
+
# Option to use generated sample dataset
|
34 |
+
if st.button("Use Sample Dataset (sample_dataset.csv)"):
|
35 |
+
# In a real scenario, Streamlit would serve the file or you'd load it directly if local.
|
36 |
+
# For this setup, we assume the Flask backend can access it or you manually upload it once.
|
37 |
+
# For demonstration, we'll simulate loading a generic DataFrame.
|
38 |
+
# In a full deployment, you'd have a mechanism to either:
|
39 |
+
# a) Have Flask serve the sample file, or
|
40 |
+
# b) Directly load it in Streamlit if the app and data are co-located.
|
41 |
+
try:
|
42 |
+
# Assuming the sample dataset is accessible or you are testing locally with `scripts/generate_data.py`
|
43 |
+
# and then manually uploading this generated file.
|
44 |
+
# For simplicity, we'll create a dummy df here if not actually uploaded.
|
45 |
+
sample_df_path = "data/sample_dataset.csv" # Path relative to main.py or Streamlit app execution
|
46 |
+
if os.path.exists(sample_df_path):
|
47 |
+
sample_df = pd.read_csv(sample_df_path)
|
48 |
+
st.success(f"Loaded sample dataset from {sample_df_path}. Please upload this file if running from different directory.")
|
49 |
+
else:
|
50 |
+
st.warning("Sample dataset not found at data/sample_dataset.csv.")
|
51 |
+
# Dummy DataFrame for demonstration if sample file isn't found
|
52 |
+
sample_df = pd.DataFrame(np.random.rand(10, 5), columns=[f'col_{i}' for i in range(5)])
|
53 |
+
|
54 |
+
# Convert to JSON for Flask API call
|
55 |
+
files = {'file': ('sample_dataset.csv', sample_df.to_csv(index=False), 'text/csv')}
|
56 |
+
response = requests.post(f"{FLASK_API_URL}/preprocess/upload", files=files)
|
57 |
+
|
58 |
+
if response.status_code == 200:
|
59 |
+
result = response.json()
|
60 |
+
st.session_state.processed_data = result['data']
|
61 |
+
st.session_state.processed_columns = result['columns']
|
62 |
+
st.success("Sample dataset preprocessed successfully!")
|
63 |
+
st.dataframe(pd.DataFrame(st.session_state.processed_data).head()) # Display first few rows
|
64 |
+
else:
|
65 |
+
st.error(f"Error preprocessing sample dataset: {response.json().get('detail', 'Unknown error')}")
|
66 |
+
except Exception as e:
|
67 |
+
st.error(f"Could not load or process sample dataset: {e}")
|
68 |
+
|
69 |
+
|
70 |
+
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
|
71 |
+
if uploaded_file is not None:
|
72 |
+
st.info("Uploading and preprocessing data...")
|
73 |
+
files = {'file': (uploaded_file.name, uploaded_file.getvalue(), 'text/csv')}
|
74 |
+
try:
|
75 |
+
response = requests.post(f"{FLASK_API_URL}/preprocess/upload", files=files)
|
76 |
+
if response.status_code == 200:
|
77 |
+
result = response.json()
|
78 |
+
st.session_state.processed_data = result['data']
|
79 |
+
st.session_state.processed_columns = result['columns']
|
80 |
+
st.success("File preprocessed successfully!")
|
81 |
+
st.dataframe(pd.DataFrame(st.session_state.processed_data).head()) # Display first few rows
|
82 |
+
else:
|
83 |
+
st.error(f"Error during preprocessing: {response.json().get('detail', 'Unknown error')}")
|
84 |
+
except requests.exceptions.ConnectionError:
|
85 |
+
st.error(f"Could not connect to Flask API at {FLASK_API_URL}. Please ensure the backend is running.")
|
86 |
+
except Exception as e:
|
87 |
+
st.error(f"An unexpected error occurred: {e}")
|
88 |
+
|
89 |
+
# --- Causal Discovery Module ---
|
90 |
+
st.header("2. Causal Discovery 🕵️♂️")
|
91 |
+
if st.session_state.processed_data:
|
92 |
+
st.write("Learn the causal structure from your preprocessed data.")
|
93 |
+
|
94 |
+
discovery_algo = st.selectbox(
|
95 |
+
"Select Causal Discovery Algorithm:",
|
96 |
+
("PC Algorithm", "GES (Greedy Equivalence Search) - Placeholder", "NOTEARS - Placeholder")
|
97 |
+
)
|
98 |
+
|
99 |
+
if st.button("Discover Causal Graph"):
|
100 |
+
st.info(f"Discovering graph using {discovery_algo}...")
|
101 |
+
algo_map = {
|
102 |
+
"PC Algorithm": "pc",
|
103 |
+
"GES (Greedy Equivalence Search) - Placeholder": "ges",
|
104 |
+
"NOTEARS - Placeholder": "notears"
|
105 |
+
}
|
106 |
+
selected_algo_code = algo_map[discovery_algo]
|
107 |
+
|
108 |
+
try:
|
109 |
+
response = requests.post(
|
110 |
+
f"{FLASK_API_URL}/discover/",
|
111 |
+
json={"data": st.session_state.processed_data, "algorithm": selected_algo_code}
|
112 |
+
)
|
113 |
+
if response.status_code == 200:
|
114 |
+
result = response.json()
|
115 |
+
st.session_state.causal_graph_adj = result['graph']
|
116 |
+
st.session_state.causal_graph_nodes = st.session_state.processed_columns
|
117 |
+
st.success("Causal graph discovered!")
|
118 |
+
st.subheader("Causal Graph Visualization")
|
119 |
+
# Visualization will be handled by the Causal Graph Visualizer section
|
120 |
+
else:
|
121 |
+
st.error(f"Error during causal discovery: {response.json().get('detail', 'Unknown error')}")
|
122 |
+
except requests.exceptions.ConnectionError:
|
123 |
+
st.error(f"Could not connect to Flask API at {FLASK_API_URL}. Please ensure the backend is running.")
|
124 |
+
except Exception as e:
|
125 |
+
st.error(f"An unexpected error occurred: {e}")
|
126 |
+
else:
|
127 |
+
st.info("Please preprocess data first to enable causal discovery.")
|
128 |
+
|
129 |
+
# --- Causal Graph Visualizer Module ---
|
130 |
+
st.header("3. Causal Graph Visualizer 📊")
|
131 |
+
if st.session_state.causal_graph_adj and st.session_state.causal_graph_nodes:
|
132 |
+
st.write("Interactive visualization of the discovered causal graph.")
|
133 |
+
try:
|
134 |
+
response = requests.post(
|
135 |
+
f"{FLASK_API_URL}/visualize/graph",
|
136 |
+
json={"graph": st.session_state.causal_graph_adj, "nodes": st.session_state.causal_graph_nodes}
|
137 |
+
)
|
138 |
+
if response.status_code == 200:
|
139 |
+
graph_json = response.json()['graph']
|
140 |
+
fig = go.Figure(json.loads(graph_json))
|
141 |
+
st.plotly_chart(fig, use_container_width=True)
|
142 |
+
st.markdown("""
|
143 |
+
**Graph Explanation:**
|
144 |
+
* **Nodes:** Represent variables in your dataset.
|
145 |
+
* **Arrows (Edges):** Indicate a direct causal influence from one variable (the tail) to another (the head).
|
146 |
+
* **No Arrow:** Suggests no direct causal relationship was found, or the relationship is mediated by other variables.
|
147 |
+
|
148 |
+
This graph helps answer "Why did it happen?" by showing the structural relationships.
|
149 |
+
""")
|
150 |
+
else:
|
151 |
+
st.error(f"Error visualizing graph: {response.json().get('detail', 'Unknown error')}")
|
152 |
+
except requests.exceptions.ConnectionError:
|
153 |
+
st.error(f"Could not connect to Flask API at {FLASK_API_URL}. Please ensure the backend is running.")
|
154 |
+
except Exception as e:
|
155 |
+
st.error(f"An unexpected error occurred during visualization: {e}")
|
156 |
+
else:
|
157 |
+
st.info("Please discover a causal graph first to visualize it.")
|
158 |
+
|
159 |
+
|
160 |
+
# --- Do-Calculus Engine Module ---
|
161 |
+
st.header("4. Do-Calculus Engine 🧪")
|
162 |
+
if st.session_state.processed_data and st.session_state.causal_graph_adj:
|
163 |
+
st.write("Simulate interventions and observe their effects based on the causal graph.")
|
164 |
+
|
165 |
+
intervention_var = st.selectbox(
|
166 |
+
"Select variable to intervene on:",
|
167 |
+
st.session_state.processed_columns,
|
168 |
+
key="inter_var_select"
|
169 |
+
)
|
170 |
+
# Attempt to infer type for intervention_value input
|
171 |
+
# Simplified approach: assuming numerical for now due to preprocessor output
|
172 |
+
if intervention_var and isinstance(st.session_state.processed_data[0][intervention_var], (int, float)):
|
173 |
+
intervention_value = st.number_input(f"Set '{intervention_var}' to value:", key="inter_val_input")
|
174 |
+
else: # Treat as string/categorical for input, then try to preprocess for API
|
175 |
+
intervention_value = st.text_input(f"Set '{intervention_var}' to value:", key="inter_val_input_text")
|
176 |
+
st.warning("Categorical intervention values might require specific encoding logic on the backend.")
|
177 |
+
|
178 |
+
if st.button("Perform Intervention"):
|
179 |
+
st.info(f"Performing intervention: do('{intervention_var}' = {intervention_value})...")
|
180 |
+
try:
|
181 |
+
response = requests.post(
|
182 |
+
f"{FLASK_API_URL}/intervene/",
|
183 |
+
json={
|
184 |
+
"data": st.session_state.processed_data,
|
185 |
+
"intervention_var": intervention_var,
|
186 |
+
"intervention_value": intervention_value,
|
187 |
+
"graph": st.session_state.causal_graph_adj # Pass graph for advanced do-calculus
|
188 |
+
}
|
189 |
+
)
|
190 |
+
if response.status_code == 200:
|
191 |
+
intervened_data = pd.DataFrame(response.json()['intervened_data'])
|
192 |
+
st.success("Intervention simulated successfully!")
|
193 |
+
st.subheader("Intervened Data (First 10 rows)")
|
194 |
+
st.dataframe(intervened_data.head(10))
|
195 |
+
|
196 |
+
# Simple comparison visualization (e.g., histogram of outcome variable)
|
197 |
+
if st.session_state.processed_columns and 'FinalExamScore' in st.session_state.processed_columns:
|
198 |
+
original_df = pd.DataFrame(st.session_state.processed_data)
|
199 |
+
fig_dist = go.Figure()
|
200 |
+
fig_dist.add_trace(go.Histogram(x=original_df['FinalExamScore'], name='Original', opacity=0.7))
|
201 |
+
fig_dist.add_trace(go.Histogram(x=intervened_data['FinalExamScore'], name='Intervened', opacity=0.7))  # match the 'Original' trace so both distributions stay visible
|
202 |
+
|
203 |
+
st.plotly_chart(fig_dist, use_container_width=True)
|
204 |
+
st.markdown("""
|
205 |
+
**Intervention Explanation:**
|
206 |
+
* By simulating `do(X=x)`, we are forcing the value of X, effectively breaking its causal links from its parents.
|
207 |
+
* The graph above shows the distribution of a key outcome variable (e.g., `FinalExamScore`) before and after the intervention.
|
208 |
+
* This helps answer "What if we do this instead?" by showing the predicted outcome.
|
209 |
+
""")
|
210 |
+
else:
|
211 |
+
st.info("Consider adding a relevant outcome variable to your dataset for better intervention analysis.")
|
212 |
+
else:
|
213 |
+
st.error(f"Error during intervention: {response.json().get('detail', 'Unknown error')}")
|
214 |
+
except requests.exceptions.ConnectionError:
|
215 |
+
st.error(f"Could not connect to Flask API at {FLASK_API_URL}. Please ensure the backend is running.")
|
216 |
+
except Exception as e:
|
217 |
+
st.error(f"An unexpected error occurred during intervention: {e}")
|
218 |
+
else:
|
219 |
+
st.info("Please preprocess data and discover a causal graph first to perform interventions.")
|
220 |
+
|
221 |
+
# --- Treatment Effect Estimator Module ---
|
222 |
+
st.header("5. Treatment Effect Estimator 🎯")
|
223 |
+
if st.session_state.processed_data:
|
224 |
+
st.write("Estimate Average Treatment Effect (ATE) or Conditional Treatment Effect (CATE).")
|
225 |
+
|
226 |
+
col1, col2 = st.columns(2)
|
227 |
+
with col1:
|
228 |
+
treatment_col = st.selectbox(
|
229 |
+
"Select Treatment Variable:",
|
230 |
+
st.session_state.processed_columns,
|
231 |
+
key="treat_col_select"
|
232 |
+
)
|
233 |
+
with col2:
|
234 |
+
outcome_col = st.selectbox(
|
235 |
+
"Select Outcome Variable:",
|
236 |
+
st.session_state.processed_columns,
|
237 |
+
key="outcome_col_select"
|
238 |
+
)
|
239 |
+
|
240 |
+
all_cols_except_treat_outcome = [col for col in st.session_state.processed_columns if col not in [treatment_col, outcome_col]]
|
241 |
+
covariates = st.multiselect(
|
242 |
+
"Select Covariates (confounders):",
|
243 |
+
all_cols_except_treat_outcome,
|
244 |
+
default=all_cols_except_treat_outcome, # Default to all other columns
|
245 |
+
key="covariates_select"
|
246 |
+
)
|
247 |
+
|
248 |
+
estimation_method = st.selectbox(
|
249 |
+
"Select Estimation Method:",
|
250 |
+
(
|
251 |
+
"Linear Regression ATE",
|
252 |
+
"Propensity Score Matching - Placeholder",
|
253 |
+
"Inverse Propensity Weighting - Placeholder",
|
254 |
+
"T-learner - Placeholder",
|
255 |
+
"S-learner - Placeholder"
|
256 |
+
)
|
257 |
+
)
|
258 |
+
|
259 |
+
if st.button("Estimate Treatment Effect"):
|
260 |
+
st.info(f"Estimating treatment effect using {estimation_method}...")
|
261 |
+
method_map = {
|
262 |
+
"Linear Regression ATE": "linear_regression",
|
263 |
+
"Propensity Score Matching - Placeholder": "propensity_score_matching",
|
264 |
+
"Inverse Propensity Weighting - Placeholder": "inverse_propensity_weighting",
|
265 |
+
"T-learner - Placeholder": "t_learner",
|
266 |
+
"S-learner - Placeholder": "s_learner"
|
267 |
+
}
|
268 |
+
selected_method_code = method_map[estimation_method]
|
269 |
+
|
270 |
+
try:
|
271 |
+
response = requests.post(
|
272 |
+
f"{FLASK_API_URL}/treatment/estimate_ate",
|
273 |
+
json={
|
274 |
+
"data": st.session_state.processed_data,
|
275 |
+
"treatment_col": treatment_col,
|
276 |
+
"outcome_col": outcome_col,
|
277 |
+
"covariates": covariates,
|
278 |
+
"method": selected_method_code
|
279 |
+
}
|
280 |
+
)
|
281 |
+
if response.status_code == 200:
|
282 |
+
ate_result = response.json()['result']
|
283 |
+
st.success(f"Treatment effect estimated using {estimation_method}:")
|
284 |
+
st.write(f"**Estimated ATE: {ate_result:.4f}**")
|
285 |
+
st.markdown("""
|
286 |
+
**Treatment Effect Explanation:**
|
287 |
+
* **Average Treatment Effect (ATE):** Measures the average causal effect of a treatment (e.g., `StudyHours`) on an outcome (e.g., `FinalExamScore`) across the entire population.
|
288 |
+
* It answers "How much does doing X cause a change in Y?".
|
289 |
+
* This estimation attempts to control for confounders (variables that influence both treatment and outcome) to isolate the true causal effect.
|
290 |
+
""")
|
291 |
+
else:
|
292 |
+
st.error(f"Error during ATE estimation: {response.json().get('detail', 'Unknown error')}")
|
293 |
+
except requests.exceptions.ConnectionError:
|
294 |
+
st.error(f"Could not connect to Flask API at {FLASK_API_URL}. Please ensure the backend is running.")
|
295 |
+
except Exception as e:
|
296 |
+
st.error(f"An unexpected error occurred during ATE estimation: {e}")
|
297 |
+
else:
|
298 |
+
st.info("Please preprocess data first to estimate treatment effects.")
|
299 |
+
|
300 |
+
# --- Prediction Module ---
|
301 |
+
st.header("6. Prediction Module 📈")
|
302 |
+
if st.session_state.processed_data:
|
303 |
+
st.write("Train a machine learning model for prediction (Regression or Classification).")
|
304 |
+
|
305 |
+
prediction_type = st.selectbox(
|
306 |
+
"Select Prediction Type:",
|
307 |
+
("Regression", "Classification"),
|
308 |
+
key="prediction_type_select"
|
309 |
+
)
|
310 |
+
|
311 |
+
all_columns = st.session_state.processed_columns
|
312 |
+
|
313 |
+
suitable_target_columns = []
|
314 |
+
if st.session_state.processed_data:
|
315 |
+
temp_df = pd.DataFrame(st.session_state.processed_data)
|
316 |
+
for col in all_columns:
|
317 |
+
# For classification, check if column is object type (string), boolean,
|
318 |
+
# or has a limited number of unique integer values (e.g., less than 20 unique values)
|
319 |
+
if prediction_type == 'Classification':
|
320 |
+
if temp_df[col].dtype == 'object' or temp_df[col].dtype == 'bool':
|
321 |
+
suitable_target_columns.append(col)
|
322 |
+
elif pd.api.types.is_integer_dtype(temp_df[col]) and temp_df[col].nunique() < 20: # Heuristic for discrete integers
|
323 |
+
suitable_target_columns.append(col)
|
324 |
+
# For regression, primarily numerical columns
|
325 |
+
elif prediction_type == 'Regression':
|
326 |
+
if pd.api.types.is_numeric_dtype(temp_df[col]):
|
327 |
+
suitable_target_columns.append(col)
|
328 |
+
|
329 |
+
if not suitable_target_columns:
|
330 |
+
st.warning(f"No suitable target columns found for {prediction_type}. Please check your data types.")
|
331 |
+
target_col = None # Set to None to prevent error if no columns are found
|
332 |
+
else:
|
333 |
+
# Try to pre-select the currently chosen target_col if it's still suitable
|
334 |
+
# Otherwise, default to the first suitable column
|
335 |
+
if 'target_col_select' in st.session_state and st.session_state.target_col_select in suitable_target_columns:
|
336 |
+
default_target_index = suitable_target_columns.index(st.session_state.target_col_select)
|
337 |
+
else:
|
338 |
+
default_target_index = 0
|
339 |
+
|
340 |
+
target_col = st.selectbox(
|
341 |
+
"Select Target Variable:",
|
342 |
+
suitable_target_columns,
|
343 |
+
index=default_target_index,
|
344 |
+
key="target_col_select"
|
345 |
+
)
|
346 |
+
|
347 |
+
# Filter out the target column from feature options
|
348 |
+
feature_options = [col for col in all_columns if col != target_col]
|
349 |
+
feature_cols = st.multiselect(
|
350 |
+
"Select Feature Variables:",
|
351 |
+
feature_options,
|
352 |
+
default=feature_options, # Default to all other columns
|
353 |
+
key="feature_cols_select"
|
354 |
+
)
|
355 |
+
|
356 |
+
if st.button("Train Model & Predict", key="train_predict_button"):
|
357 |
+
if not target_col or not feature_cols:
|
358 |
+
st.warning("Please select a target variable and at least one feature variable.")
|
359 |
+
else:
|
360 |
+
st.info(f"Training {prediction_type} model using Random Forest...")
|
361 |
+
try:
|
362 |
+
response = requests.post(
|
363 |
+
f"{FLASK_API_URL}/prediction/train_predict",
|
364 |
+
json={
|
365 |
+
"data": st.session_state.processed_data,
|
366 |
+
"target_col": target_col,
|
367 |
+
"feature_cols": feature_cols,
|
368 |
+
"prediction_type": prediction_type.lower()
|
369 |
+
}
|
370 |
+
)
|
371 |
+
|
372 |
+
if response.status_code == 200:
|
373 |
+
results = response.json()['results']
|
374 |
+
st.success(f"{prediction_type} Model Trained Successfully!")
|
375 |
+
st.subheader("Model Performance")
|
376 |
+
|
377 |
+
if prediction_type == 'Regression':
|
378 |
+
st.write(f"**R-squared:** {results['r2_score']:.4f}")
|
379 |
+
st.write(f"**Mean Squared Error (MSE):** {results['mean_squared_error']:.4f}")
|
380 |
+
st.write(f"**Root Mean Squared Error (RMSE):** {results['root_mean_squared_error']:.4f}")
|
381 |
+
|
382 |
+
st.subheader("Actual vs. Predicted Plot")
|
383 |
+
actual_predicted_df = pd.DataFrame(results['actual_vs_predicted'])
|
384 |
+
fig_reg = px.scatter(actual_predicted_df, x='Actual', y='Predicted',
|
385 |
+
title='Actual vs. Predicted Values',
|
386 |
+
labels={'Actual': f'Actual {target_col}', 'Predicted': f'Predicted {target_col}'})
|
387 |
+
fig_reg.add_trace(go.Scatter(x=[actual_predicted_df['Actual'].min(), actual_predicted_df['Actual'].max()],
|
388 |
+
y=[actual_predicted_df['Actual'].min(), actual_predicted_df['Actual'].max()],
|
389 |
+
mode='lines', name='Ideal Fit', line=dict(dash='dash', color='red')))
|
390 |
+
st.plotly_chart(fig_reg, use_container_width=True)
|
391 |
+
|
392 |
+
st.subheader("Residual Plot")
|
393 |
+
actual_predicted_df['Residuals'] = actual_predicted_df['Actual'] - actual_predicted_df['Predicted']
|
394 |
+
fig_res = px.scatter(actual_predicted_df, x='Predicted', y='Residuals',
|
395 |
+
title='Residual Plot',
|
396 |
+
labels={'Predicted': f'Predicted {target_col}', 'Residuals': 'Residuals'})
|
397 |
+
fig_res.add_hline(y=0, line_dash="dash", line_color="red")
|
398 |
+
st.plotly_chart(fig_res, use_container_width=True)
|
399 |
+
|
400 |
+
elif prediction_type == 'Classification':
|
401 |
+
st.write(f"**Accuracy:** {results['accuracy']:.4f}")
|
402 |
+
st.write(f"**Precision (weighted):** {results['precision']:.4f}")
|
403 |
+
st.write(f"**Recall (weighted):** {results['recall']:.4f}")
|
404 |
+
st.write(f"**F1-Score (weighted):** {results['f1_score']:.4f}")
|
405 |
+
|
406 |
+
st.subheader("Confusion Matrix")
|
407 |
+
conf_matrix = results['confusion_matrix']
|
408 |
+
class_labels = results.get('class_labels', [str(i) for i in range(len(conf_matrix))])
|
409 |
+
fig_cm = px.imshow(conf_matrix,
|
410 |
+
labels=dict(x="Predicted", y="True", color="Count"),
|
411 |
+
x=class_labels,
|
412 |
+
y=class_labels,
|
413 |
+
text_auto=True,
|
414 |
+
color_continuous_scale="Viridis",
|
415 |
+
title="Confusion Matrix")
|
416 |
+
st.plotly_chart(fig_cm, use_container_width=True)
|
417 |
+
|
418 |
+
st.subheader("Classification Report")
|
419 |
+
# Convert dict to DataFrame for nice display
|
420 |
+
report_df = pd.DataFrame(results['classification_report']).transpose()
|
421 |
+
st.dataframe(report_df)
|
422 |
+
|
423 |
+
st.subheader("Feature Importances")
|
424 |
+
feature_importances_df = pd.DataFrame(list(results['feature_importances'].items()), columns=['Feature', 'Importance'])
|
425 |
+
fig_fi = px.bar(feature_importances_df, x='Importance', y='Feature', orientation='h',
|
426 |
+
title='Feature Importances',
|
427 |
+
labels={'Importance': 'Importance Score', 'Feature': 'Feature Name'})
|
428 |
+
fig_fi.update_layout(yaxis={'categoryorder':'total ascending'}) # Sort bars
|
429 |
+
st.plotly_chart(fig_fi, use_container_width=True)
|
430 |
+
else:
|
431 |
+
st.error(f"Error during prediction: {response.json().get('detail', 'Unknown error')}")
|
432 |
+
except requests.exceptions.ConnectionError:
|
433 |
+
st.error(f"Could not connect to Flask API at {FLASK_API_URL}. Please ensure the backend is running.")
|
434 |
+
except Exception as e:
|
435 |
+
st.error(f"An unexpected error occurred during prediction: {e}")
|
436 |
+
else:
|
437 |
+
st.info("Please preprocess data first to use the Prediction Module.")
|
438 |
+
|
439 |
+
# --- Time Series Causal Discovery Module ---
|
440 |
+
st.header("7. Time Series Causal Discovery ⏰")
|
441 |
+
if st.session_state.processed_data:
|
442 |
+
st.write("Infer causal relationships in time-series data using Granger Causality.")
|
443 |
+
st.info("Ensure your dataset includes a timestamp column and that variables are numeric.")
|
444 |
+
|
445 |
+
all_columns = st.session_state.processed_columns
|
446 |
+
|
447 |
+
# Heuristic to suggest potential timestamp columns (object/string type, or first column)
|
448 |
+
potential_ts_cols = [col for col in all_columns if pd.DataFrame(st.session_state.processed_data)[col].dtype == 'object']
|
449 |
+
if not potential_ts_cols and all_columns: # If no object columns, suggest the first column
|
450 |
+
potential_ts_cols = [all_columns[0]]
|
451 |
+
|
452 |
+
timestamp_col = st.selectbox(
|
453 |
+
"Select Timestamp Column:",
|
454 |
+
potential_ts_cols if potential_ts_cols else ["No suitable timestamp column found. Please check data."],
|
455 |
+
key="ts_col_select"
|
456 |
+
)
|
457 |
+
|
458 |
+
# Filter out timestamp column and non-numeric columns for analysis
|
459 |
+
variables_for_ts_analysis = [
|
460 |
+
col for col in all_columns if col != timestamp_col and pd.api.types.is_numeric_dtype(pd.DataFrame(st.session_state.processed_data)[col])
|
461 |
+
]
|
462 |
+
|
463 |
+
variables_to_analyze = st.multiselect(
|
464 |
+
"Select Variables to Analyze for Granger Causality:",
|
465 |
+
variables_for_ts_analysis,
|
466 |
+
default=variables_for_ts_analysis,
|
467 |
+
key="ts_vars_select"
|
468 |
+
)
|
469 |
+
|
470 |
+
max_lags = st.number_input(
|
471 |
+
"Max Lags (for Granger Causality):",
|
472 |
+
min_value=1,
|
473 |
+
value=5, # Default value
|
474 |
+
step=1,
|
475 |
+
help="The maximum number of lagged observations to consider for causality."
|
476 |
+
)
|
477 |
+
|
478 |
+
if st.button("Discover Time Series Causality", key="ts_discover_button"):
|
479 |
+
if not timestamp_col or not variables_to_analyze:
|
480 |
+
st.warning("Please select a timestamp column and at least one variable to analyze.")
|
481 |
+
elif "No suitable timestamp column found" in timestamp_col:
|
482 |
+
st.error("Cannot proceed. Please ensure your data has a suitable timestamp column.")
|
483 |
+
else:
|
484 |
+
st.info("Performing Granger Causality tests...")
|
485 |
+
try:
|
486 |
+
response = requests.post(
|
487 |
+
f"{FLASK_API_URL}/timeseries/discover_causality",
|
488 |
+
json={
|
489 |
+
"data": st.session_state.processed_data,
|
490 |
+
"timestamp_col": timestamp_col,
|
491 |
+
"variables_to_analyze": variables_to_analyze,
|
492 |
+
"max_lags": max_lags
|
493 |
+
}
|
494 |
+
)
|
495 |
+
|
496 |
+
if response.status_code == 200:
|
497 |
+
results = response.json()['results']
|
498 |
+
st.success("Time Series Causal Discovery Complete!")
|
499 |
+
st.subheader("Granger Causality Test Results")
|
500 |
+
|
501 |
+
if results:
|
502 |
+
# Convert results to a DataFrame for better display
|
503 |
+
results_df = pd.DataFrame(results)
|
504 |
+
results_df['p_value'] = results_df['p_value'].round(4) # Round p-values
|
505 |
+
st.dataframe(results_df)
|
506 |
+
|
507 |
+
st.markdown("**Interpretation:** A small p-value (typically < 0.05) suggests that the 'cause' variable Granger-causes the 'effect' variable. This means past values of the 'cause' variable help predict future values of the 'effect' variable, even when past values of the 'effect' variable are considered.")
|
508 |
+
st.markdown(f"*(Note: Granger Causality implies predictive causality, not necessarily true mechanistic causality. Also, ensure your time series are stationary for robust results.)*")
|
509 |
+
|
510 |
+
# Optionally, visualize a simple causality graph
|
511 |
+
st.subheader("Granger Causality Graph")
|
512 |
+
fig_ts_graph = go.Figure()
|
513 |
+
nodes = []
|
514 |
+
edges = []
|
515 |
+
edge_colors = []
|
516 |
+
|
517 |
+
# Add nodes
|
518 |
+
for i, var in enumerate(variables_to_analyze):
|
519 |
+
nodes.append(dict(id=var, label=var, x=np.cos(i*2*np.pi/len(variables_to_analyze)), y=np.sin(i*2*np.pi/len(variables_to_analyze))))
|
520 |
+
|
521 |
+
# Add edges
|
522 |
+
for res in results:
|
523 |
+
if res['p_value'] < 0.05: # Consider it a causal link if p-value is below significance
|
524 |
+
edges.append(dict(source=res['cause'], target=res['effect'], value=1/res['p_value'], title=f"p={res['p_value']:.4f}"))
|
525 |
+
edge_colors.append("blue")
|
526 |
+
else:
|
527 |
+
# Optional: Show non-significant edges in a different color or omit
|
528 |
+
pass
|
529 |
+
|
530 |
+
# Use a simple network graph layout (Spring layout is common)
|
531 |
+
# For a truly interactive graph, you might need a different library or more complex Plotly setup
|
532 |
+
# This is a very basic attempt to visualize; consider more robust solutions like NetworkX + Plotly/Dash
|
533 |
+
|
534 |
+
# Simple way to draw arrows for significant relationships
|
535 |
+
significant_edges = [edge for edge in results if edge['p_value'] < 0.05]
|
536 |
+
if significant_edges:
|
537 |
+
st.write("Visualizing significant (p < 0.05) Granger causal links:")
|
538 |
+
# This needs a more robust way to draw directed edges in plotly if using just scatter/lines.
|
539 |
+
# For now, let's just list them clearly.
|
540 |
+
for edge in significant_edges:
|
541 |
+
st.write(f"➡️ **{edge['cause']}** Granger-causes **{edge['effect']}** (p={edge['p_value']:.4f})")
|
542 |
+
else:
|
543 |
+
st.info("No significant Granger causal links found at p < 0.05.")
|
544 |
+
|
545 |
+
else:
|
546 |
+
st.info("No Granger Causality relationships found or data insufficient.")
|
547 |
+
|
548 |
+
else:
|
549 |
+
st.error(f"Error during time-series causal discovery: {response.json().get('detail', 'Unknown error')}")
|
550 |
+
except requests.exceptions.ConnectionError:
|
551 |
+
st.error(f"Could not connect to Flask API at {FLASK_API_URL}. Please ensure the backend is running.")
|
552 |
+
except Exception as e:
|
553 |
+
st.error(f"An unexpected error occurred during time-series causal discovery: {e}")
|
554 |
+
else:
|
555 |
+
st.info("Please preprocess data first to use the Time Series Causal Discovery Module.")
|
556 |
+
|
557 |
+
# --- CausalBox Chat Assistant ---
|
558 |
+
st.header("8. CausalBox Chat Assistant 🤖")
|
559 |
+
st.write("Ask questions about your loaded dataset, causal concepts, or the discovered causal graph!")
|
560 |
+
|
561 |
+
# Initialize chat history in session state
|
562 |
+
if "messages" not in st.session_state:
|
563 |
+
st.session_state.messages = []
|
564 |
+
|
565 |
+
# Display chat messages from history on app rerun
|
566 |
+
for message in st.session_state.messages:
|
567 |
+
with st.chat_message(message["role"]):
|
568 |
+
st.markdown(message["content"])
|
569 |
+
|
570 |
+
# Accept user input
|
571 |
+
if prompt := st.chat_input("Ask me anything about CausalBox..."):
|
572 |
+
# Add user message to chat history
|
573 |
+
st.session_state.messages.append({"role": "user", "content": prompt})
|
574 |
+
# Display user message in chat message container
|
575 |
+
with st.chat_message("user"):
|
576 |
+
st.markdown(prompt)
|
577 |
+
|
578 |
+
# Prepare session context to send to the backend
|
579 |
+
session_context = {
|
580 |
+
"processed_data": st.session_state.processed_data,
|
581 |
+
"processed_columns": st.session_state.processed_columns,
|
582 |
+
"causal_graph_adj": st.session_state.causal_graph_adj,
|
583 |
+
"causal_graph_nodes": st.session_state.causal_graph_nodes,
|
584 |
+
# Add any other relevant session state variables that the chatbot might need
|
585 |
+
}
|
586 |
+
|
587 |
+
with st.spinner("Thinking..."):
|
588 |
+
try:
|
589 |
+
response = requests.post(
|
590 |
+
f"{FLASK_API_URL}/chatbot/message",
|
591 |
+
json={
|
592 |
+
"user_message": prompt,
|
593 |
+
"session_context": session_context
|
594 |
+
}
|
595 |
+
)
|
596 |
+
|
597 |
+
if response.status_code == 200:
|
598 |
+
chatbot_response_text = response.json().get('response', 'Sorry, I could not generate a response.')
|
599 |
+
else:
|
600 |
+
chatbot_response_text = f"Error from chatbot backend: {response.json().get('detail', 'Unknown error')}"
|
601 |
+
except requests.exceptions.ConnectionError:
|
602 |
+
chatbot_response_text = f"Could not connect to Flask API at {FLASK_API_URL}. Please ensure the backend is running."
|
603 |
+
except Exception as e:
|
604 |
+
chatbot_response_text = f"An unexpected error occurred while getting chatbot response: {e}"
|
605 |
+
|
606 |
+
# Display assistant response in chat message container
|
607 |
+
with st.chat_message("assistant"):
|
608 |
+
st.markdown(chatbot_response_text)
|
609 |
+
# Add assistant response to chat history
|
610 |
+
st.session_state.messages.append({"role": "assistant", "content": chatbot_response_text})
|
611 |
+
|
612 |
+
# --- Future Work (Simplified) ---
|
613 |
+
st.header("Future Work 🚀")
|
614 |
+
st.markdown("""
|
615 |
+
- **🔄 Auto-causal graph refresh:** Monitor dataset updates and automatically refresh the causal graph.
|
616 |
+
""")
|
617 |
+
|
618 |
+
st.markdown("---")
|
619 |
st.info("Developed by CausalBox Team. For support, please contact us.")
|
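Editor's note on the Time Series module above: the app only displays Granger-causality results returned by the backend. A minimal sketch of the kind of pairwise test the /timeseries/discover_causality route is assumed to run (via statsmodels) is shown here; the function name and p-value handling are illustrative, and the series are assumed to be numeric and stationary.

# Minimal sketch: pairwise Granger causality tests with statsmodels.
# Returns one p-value per ordered (cause, effect) pair; stationarity is assumed.
import pandas as pd
from statsmodels.tsa.stattools import grangercausalitytests

def granger_pairs(df: pd.DataFrame, variables: list, max_lags: int = 5) -> list:
    results = []
    for cause in variables:
        for effect in variables:
            if cause == effect:
                continue
            # grangercausalitytests checks whether the second column helps predict the first
            res = grangercausalitytests(df[[effect, cause]], maxlag=max_lags, verbose=False)
            p_value = min(r[0]["ssr_ftest"][1] for r in res.values())  # best p-value across lags
            results.append({"cause": cause, "effect": effect, "p_value": p_value})
    return results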
utils/__pycache__/casual_algorithms.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/casual_algorithms.cpython-310.pyc and b/utils/__pycache__/casual_algorithms.cpython-310.pyc differ
|
|
utils/__pycache__/causal_chatbot.cpython-310.pyc
ADDED
Binary file (10.2 kB). View file
|
|
utils/__pycache__/do_calculus.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/do_calculus.cpython-310.pyc and b/utils/__pycache__/do_calculus.cpython-310.pyc differ
|
|
utils/__pycache__/graph_utils.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/graph_utils.cpython-310.pyc and b/utils/__pycache__/graph_utils.cpython-310.pyc differ
|
|
utils/__pycache__/prediction_models.cpython-310.pyc
ADDED
Binary file (3.09 kB). View file
|
|
utils/__pycache__/preprocessor.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/preprocessor.cpython-310.pyc and b/utils/__pycache__/preprocessor.cpython-310.pyc differ
|
|
utils/__pycache__/time_series_causal.cpython-310.pyc
ADDED
Binary file (2.21 kB). View file
|
|
utils/__pycache__/treatment_effects.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/treatment_effects.cpython-310.pyc and b/utils/__pycache__/treatment_effects.cpython-310.pyc differ
|
|
utils/casual_algorithms.py
CHANGED
@@ -1,64 +1,64 @@
|
|
1 |
-
# utils/causal_algorithms.py
|
2 |
-
import networkx as nx
|
3 |
-
import pandas as pd
|
4 |
-
import numpy as np
|
5 |
-
from causallearn.search.ConstraintBased.PC import pc
|
6 |
-
# from causallearn.search.ScoreBased.GES import ges # Example import for GES
|
7 |
-
# from notears import notears_linear # Example import for NOTEARS
|
8 |
-
|
9 |
-
class CausalDiscoveryAlgorithms:
|
10 |
-
def pc_algorithm(self, df, alpha=0.05):
|
11 |
-
"""
|
12 |
-
Run PC algorithm to learn causal graph.
|
13 |
-
Returns a directed graph's adjacency matrix.
|
14 |
-
Requires numerical data.
|
15 |
-
"""
|
16 |
-
data_array = df.to_numpy()
|
17 |
-
cg = pc(data_array, alpha=alpha, indep_test="fisherz")
|
18 |
-
adj_matrix = cg.G.graph
|
19 |
-
return adj_matrix
|
20 |
-
|
21 |
-
def ges_algorithm(self, df):
|
22 |
-
"""
|
23 |
-
Placeholder for GES (Greedy Equivalence Search) algorithm.
|
24 |
-
Returns a directed graph's adjacency matrix.
|
25 |
-
You would implement or integrate the GES algorithm here.
|
26 |
-
"""
|
27 |
-
# Example: G, edges = ges(data_array)
|
28 |
-
# For now, returning a simplified correlation-based graph for demonstration
|
29 |
-
print("GES algorithm is a placeholder. Using a simplified correlation-based graph.")
|
30 |
-
G = nx.DiGraph()
|
31 |
-
nodes = df.columns
|
32 |
-
G.add_nodes_from(nodes)
|
33 |
-
corr_matrix = df.corr().abs()
|
34 |
-
threshold = 0.3
|
35 |
-
for i, col1 in enumerate(nodes):
|
36 |
-
for col2 in nodes[i+1:]:
|
37 |
-
if corr_matrix.loc[col1, col2] > threshold:
|
38 |
-
if np.random.rand() > 0.5:
|
39 |
-
G.add_edge(col1, col2)
|
40 |
-
else:
|
41 |
-
G.add_edge(col2, col1)
|
42 |
-
return nx.to_numpy_array(G) # Convert to adjacency matrix
|
43 |
-
|
44 |
-
def notears_algorithm(self, df):
|
45 |
-
"""
|
46 |
-
Placeholder for NOTEARS algorithm.
|
47 |
-
Returns a directed graph's adjacency matrix.
|
48 |
-
You would implement or integrate the NOTEARS algorithm here.
|
49 |
-
"""
|
50 |
-
# Example: W_est = notears_linear(data_array)
|
51 |
-
print("NOTEARS algorithm is a placeholder. Using a simplified correlation-based graph.")
|
52 |
-
G = nx.DiGraph()
|
53 |
-
nodes = df.columns
|
54 |
-
G.add_nodes_from(nodes)
|
55 |
-
corr_matrix = df.corr().abs()
|
56 |
-
threshold = 0.3
|
57 |
-
for i, col1 in enumerate(nodes):
|
58 |
-
for col2 in nodes[i+1:]:
|
59 |
-
if corr_matrix.loc[col1, col2] > threshold:
|
60 |
-
if np.random.rand() > 0.5:
|
61 |
-
G.add_edge(col1, col2)
|
62 |
-
else:
|
63 |
-
G.add_edge(col2, col1)
|
64 |
return nx.to_numpy_array(G) # Convert to adjacency matrix
|
|
|
1 |
+
# utils/causal_algorithms.py
|
2 |
+
import networkx as nx
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
from causallearn.search.ConstraintBased.PC import pc
|
6 |
+
# from causallearn.search.ScoreBased.GES import ges # Example import for GES
|
7 |
+
# from notears import notears_linear # Example import for NOTEARS
|
8 |
+
|
9 |
+
class CausalDiscoveryAlgorithms:
|
10 |
+
def pc_algorithm(self, df, alpha=0.05):
|
11 |
+
"""
|
12 |
+
Run PC algorithm to learn causal graph.
|
13 |
+
Returns a directed graph's adjacency matrix.
|
14 |
+
Requires numerical data.
|
15 |
+
"""
|
16 |
+
data_array = df.to_numpy()
|
17 |
+
cg = pc(data_array, alpha=alpha, indep_test="fisherz")
|
18 |
+
adj_matrix = cg.G.graph
|
19 |
+
return adj_matrix
|
20 |
+
|
21 |
+
def ges_algorithm(self, df):
|
22 |
+
"""
|
23 |
+
Placeholder for GES (Greedy Equivalence Search) algorithm.
|
24 |
+
Returns a directed graph's adjacency matrix.
|
25 |
+
You would implement or integrate the GES algorithm here.
|
26 |
+
"""
|
27 |
+
# Example: G, edges = ges(data_array)
|
28 |
+
# For now, returning a simplified correlation-based graph for demonstration
|
29 |
+
print("GES algorithm is a placeholder. Using a simplified correlation-based graph.")
|
30 |
+
G = nx.DiGraph()
|
31 |
+
nodes = df.columns
|
32 |
+
G.add_nodes_from(nodes)
|
33 |
+
corr_matrix = df.corr().abs()
|
34 |
+
threshold = 0.3
|
35 |
+
for i, col1 in enumerate(nodes):
|
36 |
+
for col2 in nodes[i+1:]:
|
37 |
+
if corr_matrix.loc[col1, col2] > threshold:
|
38 |
+
if np.random.rand() > 0.5:
|
39 |
+
G.add_edge(col1, col2)
|
40 |
+
else:
|
41 |
+
G.add_edge(col2, col1)
|
42 |
+
return nx.to_numpy_array(G) # Convert to adjacency matrix
|
43 |
+
|
44 |
+
def notears_algorithm(self, df):
|
45 |
+
"""
|
46 |
+
Placeholder for NOTEARS algorithm.
|
47 |
+
Returns a directed graph's adjacency matrix.
|
48 |
+
You would implement or integrate the NOTEARS algorithm here.
|
49 |
+
"""
|
50 |
+
# Example: W_est = notears_linear(data_array)
|
51 |
+
print("NOTEARS algorithm is a placeholder. Using a simplified correlation-based graph.")
|
52 |
+
G = nx.DiGraph()
|
53 |
+
nodes = df.columns
|
54 |
+
G.add_nodes_from(nodes)
|
55 |
+
corr_matrix = df.corr().abs()
|
56 |
+
threshold = 0.3
|
57 |
+
for i, col1 in enumerate(nodes):
|
58 |
+
for col2 in nodes[i+1:]:
|
59 |
+
if corr_matrix.loc[col1, col2] > threshold:
|
60 |
+
if np.random.rand() > 0.5:
|
61 |
+
G.add_edge(col1, col2)
|
62 |
+
else:
|
63 |
+
G.add_edge(col2, col1)
|
64 |
return nx.to_numpy_array(G) # Convert to adjacency matrix
|
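Editor's note on `pc_algorithm` above: causal-learn's `cg.G.graph` is an endpoint matrix rather than a plain 0/1 adjacency matrix, so downstream code usually converts it before visualization. The sketch below assumes causal-learn's convention that `graph[i, j] == -1` together with `graph[j, i] == 1` encodes the directed edge i -> j; undirected edges are skipped, and the helper name is illustrative.

# Minimal sketch: convert a causal-learn PC endpoint matrix to a NetworkX DiGraph.
# Assumption: graph[i, j] == -1 and graph[j, i] == 1 mean the directed edge i -> j.
import networkx as nx

def pc_matrix_to_digraph(adj_matrix, node_names):
    G = nx.DiGraph()
    G.add_nodes_from(node_names)
    n = len(node_names)
    for i in range(n):
        for j in range(n):
            if adj_matrix[i, j] == -1 and adj_matrix[j, i] == 1:
                G.add_edge(node_names[i], node_names[j])
    return G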
utils/causal_chatbot.py
ADDED
@@ -0,0 +1,271 @@
|
|
1 |
+
# utils/causal_chatbot.py
|
2 |
+
import os
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
from langchain_groq import ChatGroq
|
5 |
+
from langchain_core.tools import tool
|
6 |
+
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
|
7 |
+
from langchain_core.prompts import ChatPromptTemplate
|
8 |
+
from utils.preprocessor import summarize_dataframe_for_chatbot
|
9 |
+
from utils.graph_utils import get_graph_summary_for_chatbot
|
10 |
+
import pandas as pd
|
11 |
+
|
12 |
+
load_dotenv()
|
13 |
+
|
14 |
+
# Configure Groq API Key
|
15 |
+
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
|
16 |
+
if not GROQ_API_KEY:
|
17 |
+
print("ERROR: GROQ_API_KEY environment variable not set.")
|
18 |
+
raise ValueError("GROQ_API_KEY is required.")
|
19 |
+
|
20 |
+
# Debug: Print API key details
|
21 |
+
print(f"Loaded GROQ_API_KEY: {GROQ_API_KEY[:5]}...{GROQ_API_KEY[-5:]}")
|
22 |
+
print(f"API Key Length: {len(GROQ_API_KEY)}")
|
23 |
+
|
24 |
+
# Initialize the Groq model with LangChain
|
25 |
+
try:
|
26 |
+
model = ChatGroq(
|
27 |
+
model_name="llama-3.3-70b-versatile",
|
28 |
+
temperature=0.7,
|
29 |
+
groq_api_key=GROQ_API_KEY
|
30 |
+
)
|
31 |
+
except Exception as e:
|
32 |
+
print(f"Error configuring Groq API: {e}")
|
33 |
+
model = None
|
34 |
+
|
35 |
+
def assess_causal_compatibility(data_json: list) -> str:
|
36 |
+
"""
|
37 |
+
Assesses the dataset's compatibility for causal inference analysis.
|
38 |
+
|
39 |
+
Args:
|
40 |
+
data_json: List of dictionaries representing the dataset.
|
41 |
+
|
42 |
+
Returns:
|
43 |
+
String describing the dataset's suitability for causal analysis.
|
44 |
+
"""
|
45 |
+
if not data_json:
|
46 |
+
return "No dataset provided for compatibility assessment."
|
47 |
+
|
48 |
+
try:
|
49 |
+
df = pd.DataFrame(data_json)
|
50 |
+
num_rows, num_cols = df.shape
|
51 |
+
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
|
52 |
+
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
|
53 |
+
missing_values = df.isnull().sum().sum()
|
54 |
+
|
55 |
+
assessment = [
|
56 |
+
f"Dataset has {num_rows} rows and {num_cols} columns.",
|
57 |
+
f"Numeric columns ({len(numeric_cols)}): {', '.join(numeric_cols) if len(numeric_cols) > 0 else 'None'}.",
|
58 |
+
f"Categorical columns ({len(categorical_cols)}): {', '.join(categorical_cols) if len(categorical_cols) > 0 else 'None'}.",
|
59 |
+
f"Missing values: {missing_values}."
|
60 |
+
]
|
61 |
+
|
62 |
+
# Causal compatibility insights
|
63 |
+
if num_cols < 3:
|
64 |
+
assessment.append("Warning: Dataset has fewer than 3 columns, which may limit causal analysis (e.g., no room for treatment, outcome, and confounders).")
|
65 |
+
if len(numeric_cols) == 0:
|
66 |
+
assessment.append("Warning: No numeric columns detected. Causal inference often requires numeric variables for treatment or outcome.")
|
67 |
+
if missing_values > 0:
|
68 |
+
assessment.append("Note: Missing values detected. Preprocessing (e.g., imputation) may be needed for accurate causal analysis.")
|
69 |
+
if len(numeric_cols) >= 2 and num_rows > 100:
|
70 |
+
assessment.append("Positive: Dataset has multiple numeric columns and sufficient rows, suitable for causal inference with proper preprocessing.")
|
71 |
+
else:
|
72 |
+
assessment.append("Note: Ensure at least two numeric columns (e.g., treatment and outcome) and sufficient data points for robust causal analysis.")
|
73 |
+
|
74 |
+
return "\n".join(assessment)
|
75 |
+
except Exception as e:
|
76 |
+
print(f"Error in assess_causal_compatibility: {e}")
|
77 |
+
return "Unable to assess dataset compatibility due to processing error."
|
78 |
+
|
79 |
+
# Define tools using LangChain's @tool decorator
|
80 |
+
@tool
|
81 |
+
def get_dataset_info() -> dict:
|
82 |
+
"""
|
83 |
+
Provides summary information and causal compatibility assessment for the currently loaded dataset.
|
84 |
+
The dataset is provided by the backend session context.
|
85 |
+
|
86 |
+
Returns:
|
87 |
+
Dictionary containing the dataset summary and compatibility assessment.
|
88 |
+
"""
|
89 |
+
return {"summary": "Dataset will be provided by session context"}
|
90 |
+
|
91 |
+
@tool
|
92 |
+
def get_causal_graph_info() -> dict:
|
93 |
+
"""
|
94 |
+
Provides summary information about the currently discovered causal graph.
|
95 |
+
The graph data is provided by the backend session context.
|
96 |
+
|
97 |
+
Returns:
|
98 |
+
Dictionary containing the graph summary.
|
99 |
+
"""
|
100 |
+
return {"summary": "Graph data will be provided by session context"}
|
101 |
+
|
102 |
+
# Bind tools to the model
|
103 |
+
tools = [get_dataset_info, get_causal_graph_info]
|
104 |
+
if model:
|
105 |
+
model_with_tools = model.bind_tools(tools)
|
106 |
+
|
107 |
+
def get_chatbot_response(user_message: str, session_context: dict) -> str:
|
108 |
+
"""
|
109 |
+
Gets a response from the Groq chatbot, handling tool calls.
|
110 |
+
|
111 |
+
Args:
|
112 |
+
user_message: The message from the user.
|
113 |
+
session_context: Dictionary containing current session data
|
114 |
+
(e.g., processed_data, causal_graph_adj, causal_graph_nodes).
|
115 |
+
|
116 |
+
Returns:
|
117 |
+
The chatbot's response message.
|
118 |
+
"""
|
119 |
+
if model is None:
|
120 |
+
return "Chatbot is not configured correctly. Please check Groq API key."
|
121 |
+
|
122 |
+
try:
|
123 |
+
# Create a prompt template to guide the model's behavior
|
124 |
+
prompt = ChatPromptTemplate.from_messages([
|
125 |
+
("system", """You are CausalBox Assistant, an AI that helps users analyze datasets and causal graphs.
|
126 |
+
Use the provided tools to access dataset or graph information. Do NOT generate or guess parameters for tool calls; the backend will provide all necessary data (e.g., dataset or graph details).
|
127 |
+
For dataset queries (e.g., "read the dataset", "dataset compatibility"), call `get_dataset_info` without arguments.
|
128 |
+
For graph queries (e.g., "describe the causal graph"), call `get_causal_graph_info` without arguments.
|
129 |
+
For other questions (e.g., "what is a confounder?"), respond directly with clear, accurate explanations.
|
130 |
+
|
131 |
+
When you receive tool results, provide a comprehensive analysis and explanation to help the user understand their data and causal analysis possibilities.
|
132 |
+
|
133 |
+
Examples:
|
134 |
+
- User: "Tell me about the dataset" -> Call `get_dataset_info`.
|
135 |
+
- User: "Check dataset compatibility for causal analysis" -> Call `get_dataset_info`.
|
136 |
+
- User: "Describe the causal graph" -> Call `get_causal_graph_info`.
|
137 |
+
- User: "What is a confounder?" -> Respond: "A confounder is a variable that influences both the treatment and outcome, causing a spurious association."
|
138 |
+
"""),
|
139 |
+
("human", "{user_message}")
|
140 |
+
])
|
141 |
+
|
142 |
+
# Chain the prompt with the model
|
143 |
+
chain = prompt | model_with_tools
|
144 |
+
|
145 |
+
# Log the user message and session context
|
146 |
+
print(f"Processing user message: {user_message}")
|
147 |
+
print(f"Session context keys: {list(session_context.keys())}")
|
148 |
+
|
149 |
+
# Invoke the chain with the user message
|
150 |
+
response = chain.invoke({"user_message": user_message})
|
151 |
+
print(f"Model response: {response}")
|
152 |
+
|
153 |
+
# Handle tool calls if present
|
154 |
+
if response.tool_calls:
|
155 |
+
tool_call = response.tool_calls[0]
|
156 |
+
function_name = tool_call["name"]
|
157 |
+
function_args = tool_call["args"]
|
158 |
+
|
159 |
+
print(f"Chatbot calling tool: {function_name} with args: {function_args}")
|
160 |
+
|
161 |
+
# Map session context to tool arguments
|
162 |
+
tool_output = {}
|
163 |
+
if function_name == "get_dataset_info":
|
164 |
+
data_json = session_context.get("processed_data", [])
|
165 |
+
if not isinstance(data_json, list) or not data_json:
|
166 |
+
print(f"Invalid or empty data_json: {data_json}")
|
167 |
+
return "Error: No valid dataset available."
|
168 |
+
tool_output = get_dataset_info.invoke({})
|
169 |
+
tool_output["summary"] = summarize_dataframe_for_chatbot(data_json)
|
170 |
+
tool_output["causal_compatibility"] = assess_causal_compatibility(data_json)
|
171 |
+
elif function_name == "get_causal_graph_info":
|
172 |
+
graph_adj = session_context.get("causal_graph_adj", [])
|
173 |
+
nodes = session_context.get("causal_graph_nodes", [])
|
174 |
+
if not graph_adj or not nodes:
|
175 |
+
print("No causal graph data available")
|
176 |
+
return "Error: No causal graph available."
|
177 |
+
tool_output = get_causal_graph_info.invoke({})
|
178 |
+
tool_output["summary"] = get_graph_summary_for_chatbot(graph_adj, nodes)
|
179 |
+
else:
|
180 |
+
print(f"Unknown tool: {function_name}")
|
181 |
+
return f"Error: Unknown tool {function_name}."
|
182 |
+
|
183 |
+
print(f"Tool output: {tool_output}")
|
184 |
+
|
185 |
+
# Create the tool output text
|
186 |
+
output_text = tool_output["summary"]
|
187 |
+
if tool_output.get("causal_compatibility"):
|
188 |
+
output_text += "\n\nCausal Compatibility Assessment:\n" + tool_output["causal_compatibility"]
|
189 |
+
|
190 |
+
# Create messages for the final response - FIXED VERSION
|
191 |
+
messages = [
|
192 |
+
HumanMessage(content=user_message),
|
193 |
+
AIMessage(content="", tool_calls=[tool_call]),
|
194 |
+
ToolMessage(content=output_text, tool_call_id=tool_call["id"])
|
195 |
+
]
|
196 |
+
|
197 |
+
# Create a follow-up prompt to ensure the model provides a comprehensive response
|
198 |
+
follow_up_prompt = ChatPromptTemplate.from_messages([
|
199 |
+
("system", """You are CausalBox Assistant. Based on the tool results, provide a comprehensive, helpful response to the user's question.
|
200 |
+
Explain the dataset characteristics, causal compatibility, and provide actionable insights for causal analysis.
|
201 |
+
Be specific about what the data shows and what causal analysis approaches would be suitable.
|
202 |
+
Always provide a complete response, not just acknowledgment."""),
|
203 |
+
("human", "{original_question}"),
|
204 |
+
("assistant", "I'll analyze the dataset information for you."),
|
205 |
+
("human", "Here's the dataset analysis: {tool_results}\n\nPlease provide a comprehensive explanation of this data and its suitability for causal analysis.")
|
206 |
+
])
|
207 |
+
|
208 |
+
# Get final response from the model with explicit prompting
|
209 |
+
print("Invoking model with tool response messages")
|
210 |
+
try:
|
211 |
+
final_chain = follow_up_prompt | model
|
212 |
+
final_response = final_chain.invoke({
|
213 |
+
"original_question": user_message,
|
214 |
+
"tool_results": output_text
|
215 |
+
})
|
216 |
+
print(f"Final response content: {final_response.content}")
|
217 |
+
|
218 |
+
if final_response.content and final_response.content.strip():
|
219 |
+
return final_response.content
|
220 |
+
else:
|
221 |
+
# Fallback response if model still returns empty
|
222 |
+
return create_fallback_response(output_text, user_message)
|
223 |
+
|
224 |
+
except Exception as e:
|
225 |
+
print(f"Error in final response generation: {e}")
|
226 |
+
return create_fallback_response(output_text, user_message)
|
227 |
+
|
228 |
+
else:
|
229 |
+
print("No tool calls, returning direct response")
|
230 |
+
if response.content and response.content.strip():
|
231 |
+
return response.content
|
232 |
+
else:
|
233 |
+
return "I'm ready to help you with causal analysis. Please ask me about your dataset, causal graphs, or any causal inference concepts you'd like to understand."
|
234 |
+
|
235 |
+
except Exception as e:
|
236 |
+
print(f"Error communicating with Groq: {e}")
|
237 |
+
return f"Sorry, I'm having trouble processing your request: {str(e)}"
|
238 |
+
|
239 |
+
def create_fallback_response(tool_output: str, user_message: str) -> str:
|
240 |
+
"""
|
241 |
+
Creates a fallback response when the model returns empty content.
|
242 |
+
"""
|
243 |
+
response_parts = ["Based on your dataset analysis:\n"]
|
244 |
+
|
245 |
+
if "Dataset Summary:" in tool_output:
|
246 |
+
response_parts.append("📊 **Dataset Overview:**")
|
247 |
+
summary_part = tool_output.split("Dataset Summary:")[1].split("Causal Compatibility Assessment:")[0]
|
248 |
+
response_parts.append(summary_part.strip())
|
249 |
+
response_parts.append("")
|
250 |
+
|
251 |
+
if "Causal Compatibility Assessment:" in tool_output:
|
252 |
+
response_parts.append("🔍 **Causal Analysis Compatibility:**")
|
253 |
+
compatibility_part = tool_output.split("Causal Compatibility Assessment:")[1]
|
254 |
+
response_parts.append(compatibility_part.strip())
|
255 |
+
response_parts.append("")
|
256 |
+
|
257 |
+
# Add specific insights based on the data
|
258 |
+
if "FinalExamScore" in tool_output:
|
259 |
+
response_parts.append("💡 **Key Insights for Causal Analysis:**")
|
260 |
+
response_parts.append("- Your dataset appears to be education-related with variables like FinalExamScore, StudyHours, and TuitionHours")
|
261 |
+
response_parts.append("- This is excellent for causal analysis as you can explore questions like:")
|
262 |
+
response_parts.append(" • Does increasing study hours causally improve exam scores?")
|
263 |
+
response_parts.append(" • What's the causal effect of tutoring (TuitionHours) on performance?")
|
264 |
+
response_parts.append(" • How does parental education influence student outcomes?")
|
265 |
+
response_parts.append("")
|
266 |
+
response_parts.append("🚀 **Next Steps:**")
|
267 |
+
response_parts.append("- Consider identifying your treatment variable (e.g., TuitionHours)")
|
268 |
+
response_parts.append("- Define your outcome variable (likely FinalExamScore)")
|
269 |
+
response_parts.append("- Identify potential confounders (ParentalEducation, SchoolType)")
|
270 |
+
|
271 |
+
return "\n".join(response_parts)
|
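Editor's note: `get_chatbot_response` expects the same session-context shape the Streamlit frontend sends to the /chatbot/message route. A minimal direct call might look like the sketch below; the sample rows and column names are illustrative only, and GROQ_API_KEY must be set for the module to initialize.

# Minimal sketch: calling the chatbot helper directly (requires GROQ_API_KEY).
from utils.causal_chatbot import get_chatbot_response

session_context = {
    "processed_data": [{"StudyHours": 5, "FinalExamScore": 72}],  # list of row dicts, as sent by the UI
    "processed_columns": ["StudyHours", "FinalExamScore"],
    "causal_graph_adj": None,
    "causal_graph_nodes": None,
}
reply = get_chatbot_response("Tell me about the dataset", session_context)
print(reply)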
utils/do_calculus.py
CHANGED
@@ -1,52 +1,52 @@
-# utils/do_calculus.py
-import pandas as pd
-import numpy as np
-import networkx as nx
-
-class DoCalculus:
-    def __init__(self, graph):
-        self.graph = graph
-
-    def intervene(self, data, intervention_var, intervention_value):
-        """
-        Simulate do(X=x) intervention on a variable.
-        Returns intervened DataFrame.
-        This is a simplified implementation.
-        """
-        intervened_data = data.copy()
-
-        # Direct intervention: set the value
-        intervened_data[intervention_var] = intervention_value
-
-        # Propagate effects (simplified linear model) - needs graph
-        # For a true do-calculus, you'd prune the graph and re-estimate based on parents
-        # For demonstration, this still uses a simplified propagation.
-        try:
-            # Ensure graph is connected and topological sort is possible
-            if self.graph and not nx.is_directed_acyclic_graph(self.graph):
-                print("Warning: Graph is not a DAG. Topological sort may fail or be incorrect for do-calculus.")
-
-            # This simplified propagation is a conceptual placeholder
-            for node in nx.topological_sort(self.graph):
-                if node == intervention_var:
-                    continue  # Do not propagate back to the intervened variable
-
-                parents = list(self.graph.predecessors(node))
-                if parents:
-                    # Very simplified linear model to show propagation
-                    # In reality, you'd use learned coefficients or structural equations
-                    combined_effect = np.zeros(len(intervened_data))
-                    for p in parents:
-                        if p in intervened_data.columns:
-                            # Use a fixed random coefficient for demonstration
-                            coeff = 0.5
-                            combined_effect += intervened_data[p].to_numpy() * coeff
-
-                    # Add a small random noise to simulate uncertainty
-                    intervened_data[node] += combined_effect + np.random.normal(0, 0.1, len(intervened_data))
-        except Exception as e:
-            print(f"Could not perform full propagation due to graph issues or simplification: {e}")
-            # Fallback to direct intervention only if graph logic fails
-            pass  # The direct intervention `intervened_data[intervention_var] = intervention_value` is already applied
-
-        return intervened_data
+# utils/do_calculus.py
+import pandas as pd
+import numpy as np
+import networkx as nx
+
+class DoCalculus:
+    def __init__(self, graph):
+        self.graph = graph
+
+    def intervene(self, data, intervention_var, intervention_value):
+        """
+        Simulate do(X=x) intervention on a variable.
+        Returns intervened DataFrame.
+        This is a simplified implementation.
+        """
+        intervened_data = data.copy()
+
+        # Direct intervention: set the value
+        intervened_data[intervention_var] = intervention_value
+
+        # Propagate effects (simplified linear model) - needs graph
+        # For a true do-calculus, you'd prune the graph and re-estimate based on parents
+        # For demonstration, this still uses a simplified propagation.
+        try:
+            # Ensure graph is connected and topological sort is possible
+            if self.graph and not nx.is_directed_acyclic_graph(self.graph):
+                print("Warning: Graph is not a DAG. Topological sort may fail or be incorrect for do-calculus.")
+
+            # This simplified propagation is a conceptual placeholder
+            for node in nx.topological_sort(self.graph):
+                if node == intervention_var:
+                    continue  # Do not propagate back to the intervened variable
+
+                parents = list(self.graph.predecessors(node))
+                if parents:
+                    # Very simplified linear model to show propagation
+                    # In reality, you'd use learned coefficients or structural equations
+                    combined_effect = np.zeros(len(intervened_data))
+                    for p in parents:
+                        if p in intervened_data.columns:
+                            # Use a fixed random coefficient for demonstration
+                            coeff = 0.5
+                            combined_effect += intervened_data[p].to_numpy() * coeff
+
+                    # Add a small random noise to simulate uncertainty
+                    intervened_data[node] += combined_effect + np.random.normal(0, 0.1, len(intervened_data))
+        except Exception as e:
+            print(f"Could not perform full propagation due to graph issues or simplification: {e}")
+            # Fallback to direct intervention only if graph logic fails
+            pass  # The direct intervention `intervened_data[intervention_var] = intervention_value` is already applied
+
+        return intervened_data
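A minimal usage sketch for the intervention helper above. The toy graph, the column names (StudyHours, FinalExamScore), and the intervened value are illustrative assumptions, not part of the repository.

# Hypothetical example: simulate do(StudyHours = 8) on a toy dataset.
import pandas as pd
import networkx as nx
from utils.do_calculus import DoCalculus

graph = nx.DiGraph([("StudyHours", "FinalExamScore")])  # assumed toy causal graph
data = pd.DataFrame({
    "StudyHours": [2, 4, 6],
    "FinalExamScore": [55.0, 65.0, 75.0],
})

do_calc = DoCalculus(graph)
intervened = do_calc.intervene(data, "StudyHours", 8)
print(intervened)  # StudyHours fixed at 8; downstream columns shifted by the simplified propagation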
utils/graph_utils.py
CHANGED
@@ -1,60 +1,107 @@
-# utils/graph_utils.py
-import networkx as nx
-import plotly.graph_objects as go
+# utils/graph_utils.py
+import networkx as nx
+import plotly.graph_objects as go
+import numpy as np
+
+def visualize_graph(graph):
+    """
+    Visualize a causal graph using Plotly.
+    Returns Plotly figure as JSON.
+    """
+    # Use a fixed seed for layout reproducibility (optional)
+    pos = nx.spring_layout(graph, seed=42)
+
+    edge_x, edge_y = [], []
+    for edge in graph.edges():
+        x0, y0 = pos[edge[0]]
+        x1, y1 = pos[edge[1]]
+        edge_x.extend([x0, x1, None])
+        edge_y.extend([y0, y1, None])
+
+    edge_trace = go.Scatter(
+        x=edge_x, y=edge_y,
+        line=dict(width=1, color='#888'),
+        mode='lines',
+        hoverinfo='none'
+    )
+
+    node_x, node_y = [], []
+    for node in graph.nodes():
+        x, y = pos[node]
+        node_x.append(x)
+        node_y.append(y)
+
+    node_trace = go.Scatter(
+        x=node_x, y=node_y,
+        mode='markers+text',
+        text=list(graph.nodes()),
+        textposition='bottom center',
+        marker=dict(size=15, color='lightblue', line=dict(width=2, color='DarkSlateGrey')),
+        hoverinfo='text'
+    )
+
+    fig = go.Figure(
+        data=[edge_trace, node_trace],
+        layout=go.Layout(
+            showlegend=False,
+            hovermode='closest',
+            margin=dict(b=20, l=5, r=5, t=40),
+            annotations=[dict(
+                text="Python Causal Graph",
+                showarrow=False,
+                xref="paper", yref="paper",
+                x=0.005, y=-0.002,
+                font=dict(size=14, color="lightgray")
+            )],
+            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+            title=dict(text="Causal Graph Visualization", font=dict(size=16))  # Corrected line
+        )
+    )
+    return fig.to_json()
+
+def get_graph_summary_for_chatbot(graph_adj, nodes):
+    """
+    Generates a text summary of the causal graph for the chatbot.
+    """
+    if not graph_adj or not nodes:
+        return "No causal graph discovered yet."
+
+    adj_matrix = np.array(graph_adj)
+    G = nx.DiGraph(adj_matrix)
+
+    # Relabel nodes with actual names
+    mapping = {i: node_name for i, node_name in enumerate(nodes)}
+    G = nx.relabel_nodes(G, mapping)
+
+    num_nodes = G.number_of_nodes()
+    num_edges = G.number_of_edges()
+
+    summary = (
+        f"The causal graph has {num_nodes} variables (nodes) and {num_edges} causal relationships (directed edges).\n"
+        "The variables are: " + ", ".join(nodes) + ".\n"
+    )
+
+    # Add some basic structural info
+    if nx.is_directed_acyclic_graph(G):
+        summary += "The graph is a Directed Acyclic Graph (DAG), which is typical for causal models.\n"
+    else:
+        summary += "The graph contains cycles, which might indicate feedback loops or issues with the discovery algorithm for a DAG model.\n"
+
+    # For small graphs, list all edges
+    if num_edges > 0 and num_edges < 10:  # Avoid listing too many edges for large graphs
+        edge_list = [f"{u} -> {v}" for u, v in G.edges()]
+        summary += "The discovered relationships are: " + ", ".join(edge_list) + ".\n"
+    elif num_edges >= 10:
+        summary += "There are many edges; you can ask for specific relationships (e.g., 'What are the direct causes of X?').\n"
+
+    # Identify source and sink nodes (if any)
+    source_nodes = [n for n, d in G.in_degree() if d == 0]
+    sink_nodes = [n for n, d in G.out_degree() if d == 0]
+
+    if source_nodes:
+        summary += f"Variables with no known causes (source nodes): {', '.join(source_nodes)}.\n"
+    if sink_nodes:
+        summary += f"Variables with no known effects (sink nodes): {', '.join(sink_nodes)}.\n"
+
+    return summary
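A short sketch of how the two helpers above might be exercised together. The graph, node names, and the use of nx.to_numpy_array to build the adjacency-matrix argument are illustrative assumptions; the helper itself only requires a nested-list adjacency matrix plus the node names.

# Hypothetical example: visualize a small graph and summarize it for the chatbot.
import networkx as nx
from utils.graph_utils import visualize_graph, get_graph_summary_for_chatbot

G = nx.DiGraph([("StudyHours", "FinalExamScore"), ("TuitionHours", "FinalExamScore")])
fig_json = visualize_graph(G)  # Plotly figure serialized to JSON, ready to send to the frontend

nodes = ["StudyHours", "TuitionHours", "FinalExamScore"]
adj = nx.to_numpy_array(G, nodelist=nodes).tolist()  # adjacency matrix in the expected nested-list form
print(get_graph_summary_for_chatbot(adj, nodes))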
utils/prediction_models.py
ADDED
@@ -0,0 +1,86 @@
+# utils/prediction_models.py
+import pandas as pd
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
+import numpy as np
+
+def train_predict_random_forest(data_list, target_col, feature_cols, prediction_type='regression'):
+    """
+    Trains a Random Forest model and performs prediction/evaluation.
+
+    Args:
+        data_list (list of dict): List of dictionaries representing the dataset.
+        target_col (str): Name of the target variable.
+        feature_cols (list): List of names of feature variables.
+        prediction_type (str): 'regression' or 'classification'.
+
+    Returns:
+        dict: A dictionary containing model results (metrics, predictions, feature importances).
+    """
+    df = pd.DataFrame(data_list)
+
+    if not all(col in df.columns for col in feature_cols + [target_col]):
+        missing_cols = [col for col in feature_cols + [target_col] if col not in df.columns]
+        raise ValueError(f"Missing columns in data: {missing_cols}")
+
+    X = df[feature_cols]
+    y = df[target_col]
+
+    # Handle categorical features if any
+    X = pd.get_dummies(X, drop_first=True)  # One-hot encode categorical features
+
+    # Split data for robust evaluation
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+    results = {}
+
+    if prediction_type == 'regression':
+        model = RandomForestRegressor(n_estimators=100, random_state=42)
+        model.fit(X_train, y_train)
+        y_pred = model.predict(X_test)
+
+        results['model_type'] = 'Regression'
+        results['r2_score'] = r2_score(y_test, y_pred)
+        results['mean_squared_error'] = mean_squared_error(y_test, y_pred)
+        results['root_mean_squared_error'] = np.sqrt(mean_squared_error(y_test, y_pred))
+        results['actual_vs_predicted'] = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).to_dict(orient='list')
+
+    elif prediction_type == 'classification':
+        # Ensure target variable is suitable for classification (e.g., integer/categorical)
+        # Encode using the categories of the full target column so train/test codes stay consistent
+        if y.dtype == 'object' or y.dtype.name == 'category':
+            y_unique_labels = df[target_col].astype('category').cat.categories.tolist()
+            y_train = pd.Categorical(y_train, categories=y_unique_labels).codes
+            y_test = pd.Categorical(y_test, categories=y_unique_labels).codes
+            results['class_labels'] = y_unique_labels
+        else:
+            y_unique_labels = sorted(y.unique().tolist())
+            results['class_labels'] = y_unique_labels
+
+        model = RandomForestClassifier(n_estimators=100, random_state=42)
+        model.fit(X_train, y_train)
+        y_pred = model.predict(X_test)
+
+        results['model_type'] = 'Classification'
+        results['accuracy'] = accuracy_score(y_test, y_pred)
+
+        # Precision, Recall, F1-score - use 'weighted' average for multi-class
+        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted', zero_division=0)
+        results['precision'] = precision
+        results['recall'] = recall
+        results['f1_score'] = f1
+
+        results['confusion_matrix'] = confusion_matrix(y_test, y_pred).tolist()
+        results['classification_report'] = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
+
+    else:
+        raise ValueError("prediction_type must be 'regression' or 'classification'")
+
+    # Feature Importance (common for both)
+    if hasattr(model, 'feature_importances_'):
+        feature_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
+        results['feature_importances'] = feature_importances.to_dict()
+
+    return results
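A minimal usage sketch for train_predict_random_forest in regression mode. The synthetic records, column names, and coefficients are made up purely for illustration.

# Hypothetical example: train a regression forest on a tiny synthetic dataset.
import numpy as np
from utils.prediction_models import train_predict_random_forest

rng = np.random.default_rng(0)
records = [
    {"StudyHours": float(h), "TuitionHours": float(t),
     "FinalExamScore": 40.0 + 5.0 * h + 3.0 * t + rng.normal(0, 2)}
    for h in range(1, 6) for t in range(0, 3)
]  # 15 assumed rows so the 80/20 split leaves a usable test set

results = train_predict_random_forest(
    data_list=records,
    target_col="FinalExamScore",
    feature_cols=["StudyHours", "TuitionHours"],
    prediction_type="regression",
)
print(results["r2_score"], results["feature_importances"])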
utils/preprocessor.py
CHANGED
@@ -1,57 +1,88 @@
-# utils/preprocessor.py
-from sklearn.preprocessing import StandardScaler, LabelEncoder
-import pandas as pd
-import numpy as np
-import logging
+# utils/preprocessor.py
+from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
+import pandas as pd
+import numpy as np
+import logging
+from sklearn.impute import SimpleImputer
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class DataPreprocessor:
+    def __init__(self):
+        self.scaler = StandardScaler()
+        self.label_encoders = {}
+
+    def preprocess(self, df):
+        """
+        Preprocess DataFrame: handle missing values, encode categorical variables, scale numerical variables.
+        """
+        try:
+            logger.info(f"Input DataFrame shape: {df.shape}, columns: {list(df.columns)}")
+            df_processed = df.copy()
+
+            # Handle missing values
+            logger.info("Handling missing values...")
+            for col in df_processed.columns:
+                if df_processed[col].isnull().any():
+                    if pd.api.types.is_numeric_dtype(df_processed[col]):
+                        df_processed[col] = df_processed[col].fillna(df_processed[col].mean())
+                        logger.info(f"Filled numeric missing values in '{col}' with mean.")
+                    else:
+                        df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0])
+                        logger.info(f"Filled categorical missing values in '{col}' with mode.")
+
+            # Encode categorical variables
+            logger.info("Encoding categorical variables...")
+            for col in df_processed.select_dtypes(include=['object', 'category']).columns:
+                logger.info(f"Encoding column: {col}")
+                self.label_encoders[col] = LabelEncoder()
+                df_processed[col] = self.label_encoders[col].fit_transform(df_processed[col])
+
+            # Scale numerical variables
+            logger.info("Scaling numerical variables...")
+            numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
+            if len(numeric_cols) > 0:
+                # Exclude columns that are now effectively categorical (post-label encoding)
+                # This is a heuristic; ideally, identify original numeric columns.
+                cols_to_scale = [col for col in numeric_cols if col not in self.label_encoders]
+                if cols_to_scale:
+                    df_processed[cols_to_scale] = self.scaler.fit_transform(df_processed[cols_to_scale])
+                    logger.info(f"Scaled numeric columns: {cols_to_scale}")
+
+            logger.info(f"Preprocessed DataFrame shape: {df_processed.shape}")
+            return df_processed
+        except Exception as e:
+            logger.exception(f"Error preprocessing data: {str(e)}")
+            raise
+
+def summarize_dataframe_for_chatbot(data_list):
+    """
+    Generates a text summary of the DataFrame for chatbot interaction.
+    """
+    if not data_list:
+        return "No data loaded."
+    df = pd.DataFrame(data_list)
+    num_rows, num_cols = df.shape
+
+    col_info = []
+    for col in df.columns:
+        dtype = df[col].dtype
+        unique_vals = df[col].nunique()
+        missing_count = df[col].isnull().sum()
+
+        info = f"- {col} (Type: {dtype}"
+        if pd.api.types.is_numeric_dtype(df[col]):
+            info += f", Min: {df[col].min():.2f}, Max: {df[col].max():.2f}"
+        else:
+            info += f", Unique: {unique_vals}"
+
+        if missing_count > 0:
+            info += f", Missing: {missing_count}"
+        info += ")"
+        col_info.append(info)
+    summary = (f"Dataset Summary:\n- Rows: {num_rows}, Columns: {num_cols}\nColumns:\n" + "\n".join(col_info))
+    return summary
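A small sketch of how the preprocessor and the chatbot summary helper above might be used. The DataFrame contents and column names are illustrative assumptions only.

# Hypothetical example: preprocess a tiny DataFrame and summarize it for the chatbot.
import pandas as pd
from utils.preprocessor import DataPreprocessor, summarize_dataframe_for_chatbot

df = pd.DataFrame({
    "StudyHours": [2.0, 4.0, None, 6.0],
    "SchoolType": ["Public", "Private", "Public", None],
    "FinalExamScore": [55.0, 65.0, 60.0, 75.0],
})

pre = DataPreprocessor()
clean = pre.preprocess(df)  # imputes missing values, label-encodes SchoolType, scales numeric columns
print(clean)

print(summarize_dataframe_for_chatbot(df.to_dict(orient="records")))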
utils/time_series_causal.py
ADDED
@@ -0,0 +1,102 @@
+# utils/time_series_causal.py
+import pandas as pd
+from statsmodels.tsa.stattools import grangercausalitytests
+
+def perform_granger_causality(data_list, timestamp_col, variables_to_analyze, max_lags=1):
+    """
+    Performs pairwise Granger Causality tests on the given time-series data.
+
+    Args:
+        data_list (list of dict): List of dictionaries representing the dataset.
+        timestamp_col (str): Name of the timestamp column.
+        variables_to_analyze (list): List of names of variables to test for causality.
+        max_lags (int): The maximum number of lags to use for the Granger causality test.
+
+    Returns:
+        list: A list of dictionaries, each describing a causal relationship found.
+    """
+    df = pd.DataFrame(data_list)
+
+    if timestamp_col not in df.columns:
+        raise ValueError(f"Timestamp column '{timestamp_col}' not found in data.")
+
+    # Ensure timestamp column is datetime and set as index
+    try:
+        df[timestamp_col] = pd.to_datetime(df[timestamp_col])
+        df = df.set_index(timestamp_col).sort_index()
+    except Exception as e:
+        raise ValueError(f"Could not convert timestamp column '{timestamp_col}' to datetime: {e}")
+
+    # Ensure all variables to analyze are numeric
+    for col in variables_to_analyze:
+        if not pd.api.types.is_numeric_dtype(df[col]):
+            raise ValueError(f"Variable '{col}' is not numeric. Granger Causality requires numeric variables.")
+        if df[col].isnull().any():
+            # Handle NaNs: Granger Causality tests require no NaN values.
+            # You might choose to drop rows with NaNs or impute.
+            # For simplicity, here we'll drop them.
+            # print(f"Warning: Variable '{col}' contains NaN values. Rows with NaNs will be dropped.")
+            df = df.dropna(subset=[col])
+
+    # Select only the relevant columns
+    df_selected = df[variables_to_analyze]
+
+    # Granger Causality requires stationarity in theory.
+    # While statsmodels can run on non-stationary data, results should be interpreted cautiously.
+    # You might want to add differencing logic here (e.g., df.diff().dropna())
+    # or a warning for the user.
+    # For now, we proceed directly.
+
+    causal_results = []
+
+    # Iterate through all unique pairs of variables
+    for i in range(len(variables_to_analyze)):
+        for j in range(len(variables_to_analyze)):
+            if i == j:
+                continue  # Skip self-causation tests
+
+            cause_var = variables_to_analyze[i]
+            effect_var = variables_to_analyze[j]
+
+            # Prepare data for grangercausalitytests: [effect_var, cause_var]
+            # grangercausalitytests takes a DataFrame where the first column is the dependent variable (effect)
+            # and the second column is the independent variable (cause)
+            data_for_test = df_selected[[effect_var, cause_var]]
+
+            if data_for_test.empty or len(data_for_test) <= max_lags:
+                # Not enough data points to perform the test with the specified lags
+                # This can happen if NaNs were dropped or the dataset is too small
+                continue
+
+            try:
+                # Perform Granger Causality test
+                # The output is a dictionary. The key 'ssr_ftest' (or 'params_ftest')
+                # usually contains the p-value.
+                test_result = grangercausalitytests(data_for_test, max_lags, verbose=False)
+
+                # Extract p-value for the optimal lag or the test that interests you
+                # Commonly, the F-test p-value for the last lag tested is used
+                # test_result is a dictionary where keys are lag numbers
+                # Each lag has a tuple of (test_statistics, p_values).
+                # (F-test, Chi2-test, LR-test, SSR-test) -> [statistic, p-value, df_denom, df_num]
+
+                # Let's consider the F-test for the last lag as a general indicator
+                last_lag_p_value = test_result[max_lags][0]['ssr_ftest'][1]  # F-test p-value
+
+                causal_results.append({
+                    "cause": cause_var,
+                    "effect": effect_var,
+                    "p_value": last_lag_p_value,
+                    "test_type": "Granger Causality (F-test)",
+                    "max_lags": max_lags
+                })
+            except ValueError as ve:
+                # Handle cases where the test cannot be performed (e.g., singular matrix)
+                print(f"Could not perform Granger Causality for {cause_var} -> {effect_var} with max_lags={max_lags}: {ve}")
+                continue  # Skip this pair
+            except Exception as e:
+                print(f"An unexpected error occurred for {cause_var} -> {effect_var}: {e}")
+                continue
+
+    return causal_results
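A minimal sketch of calling perform_granger_causality on a short synthetic series. The series construction, column names, and lag choice are assumptions made only for illustration.

# Hypothetical example: pairwise Granger tests on two synthetic daily series.
import numpy as np
import pandas as pd
from utils.time_series_causal import perform_granger_causality

rng = np.random.default_rng(0)
n = 60
x = rng.normal(size=n).cumsum()
y = np.roll(x, 1) * 0.8 + rng.normal(scale=0.5, size=n)  # y roughly follows x with a one-step lag
dates = pd.date_range("2024-01-01", periods=n, freq="D")

records = pd.DataFrame({"date": dates.astype(str), "x": x, "y": y}).to_dict(orient="records")
results = perform_granger_causality(records, timestamp_col="date",
                                    variables_to_analyze=["x", "y"], max_lags=2)
for r in results:
    print(f"{r['cause']} -> {r['effect']}: p = {r['p_value']:.4f}")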
utils/treatment_effects.py
CHANGED
@@ -1,63 +1,63 @@
-# utils/treatment_effects.py
-from sklearn.linear_model import LinearRegression, LogisticRegression
-import pandas as pd
-import numpy as np
-# For matching-based methods, you might need libraries like dowhy or causalml
-# import statsmodels.api as sm  # Example for regression diagnostics
-
-class TreatmentEffectAlgorithms:
-    def linear_regression_ate(self, df, treatment_col, outcome_col, covariates):
-        """
-        Estimate ATE using linear regression.
-        """
-        X = df[covariates + [treatment_col]]
-        y = df[outcome_col]
-        model = LinearRegression()
-        model.fit(X, y)
-        ate = model.coef_[-1]  # Coefficient of treatment_col
-        return float(ate)
-
-    def propensity_score_matching(self, df, treatment_col, outcome_col, covariates):
-        """
-        Placeholder for Propensity Score Matching.
-        You would implement or integrate a matching algorithm here.
-        """
-        print("Propensity Score Matching is a placeholder. Returning a dummy ATE.")
-        # Simplified: Estimate propensity scores
-        X_propensity = df[covariates]
-        T_propensity = df[treatment_col]
-        prop_model = LogisticRegression(solver='liblinear')
-        prop_model.fit(X_propensity, T_propensity)
-        propensity_scores = prop_model.predict_proba(X_propensity)[:, 1]
-
-        # Dummy ATE calculation for demonstration
-        treated_outcome = df[df[treatment_col] == 1][outcome_col].mean()
-        control_outcome = df[df[treatment_col] == 0][outcome_col].mean()
-        return float(treated_outcome - control_outcome)  # Simplified dummy ATE
-
-    def inverse_propensity_weighting(self, df, treatment_col, outcome_col, covariates):
-        """
-        Placeholder for Inverse Propensity Weighting (IPW).
-        You would implement or integrate IPW here.
-        """
-        print("Inverse Propensity Weighting is a placeholder. Returning a dummy ATE.")
-        # Dummy ATE for demonstration
-        return np.random.rand() * 10  # Random dummy value
-
-    def t_learner(self, df, treatment_col, outcome_col, covariates):
-        """
-        Placeholder for T-learner.
-        You would implement a T-learner using two separate models.
-        """
-        print("T-learner is a placeholder. Returning a dummy ATE.")
-        # Dummy ATE for demonstration
-        return np.random.rand() * 10 + 5  # Random dummy value
-
-    def s_learner(self, df, treatment_col, outcome_col, covariates):
-        """
-        Placeholder for S-learner.
-        You would implement an S-learner using a single model.
-        """
-        print("S-learner is a placeholder. Returning a dummy ATE.")
-        # Dummy ATE for demonstration
-        return np.random.rand() * 10 - 2  # Random dummy value
+# utils/treatment_effects.py
+from sklearn.linear_model import LinearRegression, LogisticRegression
+import pandas as pd
+import numpy as np
+# For matching-based methods, you might need libraries like dowhy or causalml
+# import statsmodels.api as sm  # Example for regression diagnostics
+
+class TreatmentEffectAlgorithms:
+    def linear_regression_ate(self, df, treatment_col, outcome_col, covariates):
+        """
+        Estimate ATE using linear regression.
+        """
+        X = df[covariates + [treatment_col]]
+        y = df[outcome_col]
+        model = LinearRegression()
+        model.fit(X, y)
+        ate = model.coef_[-1]  # Coefficient of treatment_col
+        return float(ate)
+
+    def propensity_score_matching(self, df, treatment_col, outcome_col, covariates):
+        """
+        Placeholder for Propensity Score Matching.
+        You would implement or integrate a matching algorithm here.
+        """
+        print("Propensity Score Matching is a placeholder. Returning a dummy ATE.")
+        # Simplified: Estimate propensity scores
+        X_propensity = df[covariates]
+        T_propensity = df[treatment_col]
+        prop_model = LogisticRegression(solver='liblinear')
+        prop_model.fit(X_propensity, T_propensity)
+        propensity_scores = prop_model.predict_proba(X_propensity)[:, 1]
+
+        # Dummy ATE calculation for demonstration
+        treated_outcome = df[df[treatment_col] == 1][outcome_col].mean()
+        control_outcome = df[df[treatment_col] == 0][outcome_col].mean()
+        return float(treated_outcome - control_outcome)  # Simplified dummy ATE
+
+    def inverse_propensity_weighting(self, df, treatment_col, outcome_col, covariates):
+        """
+        Placeholder for Inverse Propensity Weighting (IPW).
+        You would implement or integrate IPW here.
+        """
+        print("Inverse Propensity Weighting is a placeholder. Returning a dummy ATE.")
+        # Dummy ATE for demonstration
+        return np.random.rand() * 10  # Random dummy value
+
+    def t_learner(self, df, treatment_col, outcome_col, covariates):
+        """
+        Placeholder for T-learner.
+        You would implement a T-learner using two separate models.
+        """
+        print("T-learner is a placeholder. Returning a dummy ATE.")
+        # Dummy ATE for demonstration
+        return np.random.rand() * 10 + 5  # Random dummy value
+
+    def s_learner(self, df, treatment_col, outcome_col, covariates):
+        """
+        Placeholder for S-learner.
+        You would implement an S-learner using a single model.
+        """
+        print("S-learner is a placeholder. Returning a dummy ATE.")
+        # Dummy ATE for demonstration
+        return np.random.rand() * 10 - 2  # Random dummy value
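A minimal usage sketch for the regression-adjustment estimator above (the only non-placeholder method). The data-generating process, column names, and true effect size are illustrative assumptions.

# Hypothetical example: estimate the ATE of a binary treatment with linear regression adjustment.
import numpy as np
import pandas as pd
from utils.treatment_effects import TreatmentEffectAlgorithms

rng = np.random.default_rng(0)
n = 200
confounder = rng.normal(size=n)
treatment = (confounder + rng.normal(size=n) > 0).astype(int)   # treatment depends on the confounder
outcome = 2.0 * treatment + 1.5 * confounder + rng.normal(size=n)  # assumed true ATE of 2.0

df = pd.DataFrame({"T": treatment, "C": confounder, "Y": outcome})
algos = TreatmentEffectAlgorithms()
ate = algos.linear_regression_ate(df, treatment_col="T", outcome_col="Y", covariates=["C"])
print(f"Estimated ATE: {ate:.2f}")  # should land near 2.0 because the confounder is adjusted for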