causalscience committed
Commit 46b4d92 · verified · 1 Parent(s): ee87011

Aug 25 Bug Fixes

Files changed (1)
  1. models/granger.py +300 -139
models/granger.py CHANGED
@@ -1,139 +1,300 @@
-# causalscience/models/granger.py
-
-import numpy as np
-import pandas as pd
-import statsmodels.api as sm
-from statsmodels.tsa.stattools import grangercausalitytests, adfuller
-import matplotlib.pyplot as plt
-from io import BytesIO
-from PIL import Image
-
-
-def adf_stationarity_test(series, alpha=0.05):
-    """
-    Perform Augmented Dickey-Fuller test to check stationarity.
-
-    Args:
-        series (pd.Series): Time series data.
-        alpha (float): Significance level.
-
-    Returns:
-        p_value (float): p-value of the test.
-        is_stationary (bool): True if series is stationary.
-    """
-    result = adfuller(series.dropna(), autolag='AIC')
-    p_value = result[1]
-    return p_value, p_value < alpha
-
-
-def difference_series(series, order=1):
-    """
-    Difference the series to make it stationary.
-    """
-    return series.diff(periods=order).dropna()
-
-
-def make_data_stationary(df, columns, max_diff=2, alpha=0.05):
-    """
-    Iteratively difference columns to achieve stationarity.
-
-    Args:
-        df (pd.DataFrame): Input DataFrame.
-        columns (list[str]): Columns to transform.
-        max_diff (int): Maximum differencing order.
-        alpha (float): Significance level.
-
-    Returns:
-        df_out (pd.DataFrame): Transformed DataFrame.
-        transformations (dict): Info on differencing orders.
-    """
-    df_out = df.copy()
-    transformations = {}
-    for col in columns:
-        pval, stationary = adf_stationarity_test(df_out[col], alpha)
-        if stationary:
-            transformations[col] = f"Already stationary (p={pval:.4f})"
-            continue
-        for d in range(1, max_diff + 1):
-            diff_series = difference_series(df_out[col], order=d)
-            pval_d, stat_d = adf_stationarity_test(diff_series, alpha)
-            if stat_d:
-                df_out[col] = diff_series
-                transformations[col] = f"Differenced order {d} (p={pval_d:.4f})"
-                break
-        else:
-            transformations[col] = f"No stationarity by {max_diff} diffs"
-    df_out = df_out.dropna()
-    return df_out, transformations
-
-
-def recommend_var_lag(df, maxlags=7, criterion='aic'):
-    """
-    Recommend lag order for VAR model by information criterion.
-    """
-    model = sm.tsa.VAR(df.dropna())
-    results = model.select_order(maxlags=maxlags)
-    return results.selected_orders.get(criterion)
-
-
-def run_granger_analysis(df, max_lags=7, criterion='aic', apply_transformation=False,
-                         columns_to_transform=None, max_diff=2, alpha=0.05):
-    """
-    Run Granger causality analysis between first two columns.
-
-    Args:
-        df (pd.DataFrame): Time series DataFrame.
-        max_lags (int): Max lags to test.
-        criterion (str): Criterion for lag selection.
-        apply_transformation (bool): Whether to difference to stationarity.
-        columns_to_transform (list[str]): Columns to transform.
-        max_diff (int): Max differencing.
-        alpha (float): Significance level.
-
-    Returns:
-        summary_text (str): Text output of tests.
-        plot_img (PIL.Image.Image): Time series plot.
-        transformed_csv (str or None): Path to stationary data CSV.
-    """
-    transformed_csv = None
-    df_work = df.copy()
-    if apply_transformation and columns_to_transform:
-        df_work, trans = make_data_stationary(df_work, columns_to_transform, max_diff, alpha)
-        transformed_csv = 'transformed_data.csv'
-        df_work.to_csv(transformed_csv, index=False)
-    if df_work.shape[1] < 2:
-        raise ValueError("Need at least two series for Granger causality")
-
-    lag_order = recommend_var_lag(df_work, maxlags=max_lags, criterion=criterion) or min(max_lags, 1)
-    data_test = df_work.iloc[:, :2]
-    test_output = []
-    def _capture():
-        import sys, io
-        buf = io.StringIO()
-        sys_stdout = sys.stdout
-        try:
-            sys.stdout = buf
-            grangercausalitytests(data_test, maxlag=lag_order, verbose=True)
-        finally:
-            sys.stdout = sys_stdout
-        return buf.getvalue()
-    summary_text = f"Recommended Lag: {lag_order}\n" + _capture()
-
-    # Plot
-    fig, ax1 = plt.subplots()
-    ax2 = ax1.twinx()
-    col1, col2 = data_test.columns[:2]
-    ax1.plot(data_test[col1], label=col1)
-    ax2.plot(data_test[col2], label=col2)
-    ax1.set_xlabel('Time')
-    ax1.set_ylabel(col1)
-    ax2.set_ylabel(col2)
-    ax1.legend(loc='upper left')
-    ax2.legend(loc='upper right')
-    buf = BytesIO()
-    fig.savefig(buf, format='png', bbox_inches='tight')
-    plt.close(fig)
-    buf.seek(0)
-    plot_img = Image.open(buf)
-
-    return summary_text, plot_img, transformed_csv
+# causalscience/models/granger.py
+
+import numpy as np
+import pandas as pd
+import statsmodels.api as sm
+from statsmodels.tsa.stattools import grangercausalitytests, adfuller
+import matplotlib.pyplot as plt
+from io import BytesIO
+from PIL import Image
+
+
+def adf_stationarity_test(series, alpha=0.05):
+    """
+    Perform Augmented Dickey-Fuller test to check stationarity.
+
+    Args:
+        series (pd.Series): Time series data.
+        alpha (float): Significance level.
+
+    Returns:
+        p_value (float): p-value of the test.
+        is_stationary (bool): True if series is stationary.
+    """
+    result = adfuller(series.dropna(), autolag='AIC')
+    p_value = result[1]
+    return p_value, p_value < alpha
+
+
+def difference_series(series, order=1):
+    """
+    Difference the series to make it stationary.
+    """
+    return series.diff(periods=order).dropna()
+
+
+def make_data_stationary(df, columns_to_transform, max_diff=2, alpha=0.05):
+    """
+    Iteratively difference specified columns to achieve stationarity.
+
+    Args:
+        df (pd.DataFrame): Input DataFrame.
+        columns_to_transform (list[str] or None): Columns to transform. If None, no transformation is done.
+        max_diff (int): Maximum differencing order.
+        alpha (float): Significance level.
+
+    Returns:
+        df_out (pd.DataFrame): Transformed DataFrame.
+        transformations (dict): Info on differencing orders for transformed columns.
+    """
+    df_out = df.copy()
+    transformations = {}
+
+    if not columns_to_transform:  # If no columns are specified for transformation
+        return df_out, transformations
+
+    for col in columns_to_transform:
+        if col not in df_out.columns:
+            transformations[col] = "Error: Column not found in DataFrame."
+            continue  # Skip to the next column
+
+        # Ensure the column data is numeric for the ADF test
+        if not pd.api.types.is_numeric_dtype(df_out[col]):
+            transformations[col] = "Skipped: Non-numeric data."
+            continue
+
+        pval, stationary = adf_stationarity_test(df_out[col], alpha)
+        if stationary:
+            transformations[col] = f"Already stationary (p={pval:.4f})"
+            continue
+
+        original_series_for_diff = df[col].copy()  # Always difference from the original non-differenced series
+
+        for d in range(1, max_diff + 1):
+            # Apply differencing iteratively on the original series for this column
+            diff_series_data = difference_series(original_series_for_diff, order=d)
+
+            if diff_series_data.empty:
+                transformations[col] = f"Differenced order {d} resulted in empty series. Stationarity not achieved."
+                # df_out[col] = diff_series_data  # Store the empty series if desired, or original
+                break  # Stop differencing this column
+
+            pval_d, stat_d = adf_stationarity_test(diff_series_data, alpha)
+            if stat_d:
+                df_out[col] = diff_series_data  # Update the column in df_out with the stationary series
+                transformations[col] = f"Differenced order {d} (p={pval_d:.4f})"
+                break
+        else:  # This else belongs to the for loop (if break was not hit)
+            transformations[col] = f"Stationarity not achieved within {max_diff} differencing orders (last p={pval_d:.4f}). Using last differenced series."
+            # df_out[col] = diff_series_data  # Ensure the last differenced series is used even if not stationary
+
+    # df_out = df_out.dropna()  # Dropna AFTER all transformations are applied if needed,
+    # but this might heavily reduce data if different columns have different diff orders.
+    # It's often better to let VAR handle NaNs or for the user to decide.
+    # For Granger, two series are selected *after* this.
+    return df_out, transformations
+
+
+def recommend_var_lag(df, maxlags=7, criterion='aic'):
+    """
+    Recommend lag order for VAR model by information criterion.
+    Assumes df contains only the series for VAR (typically 2 for Granger).
+    """
+    # Ensure no NaNs are passed to the VAR model, common after differencing
+    df_dropna = df.dropna()
+    if df_dropna.shape[0] < maxlags + 1:  # Check if enough data points remain after dropping NaNs
+        # Not enough data to reliably select a lag, or even fit the model.
+        # Default to a small lag, or raise an error.
+        # print(f"Warning: Not enough data points ({df_dropna.shape[0]}) after dropping NaNs for maxlags={maxlags}. Defaulting lag to 1.")
+        return 1
+    if df_dropna.empty:
+        # print("Warning: DataFrame is empty after dropping NaNs. Cannot recommend VAR lag. Defaulting to 1.")
+        return 1
+
+
+    model = sm.tsa.VAR(df_dropna)
+    try:
+        results = model.select_order(maxlags=maxlags)
+        selected_lag = results.selected_orders.get(criterion)
+        return selected_lag if selected_lag is not None else 1  # Default to 1 if criterion not found or is None
+    except Exception as e:
+        # print(f"Error during VAR lag selection: {e}. Defaulting lag to 1.")
+        return 1
+
+
+def run_granger_analysis(df, target_col1, target_col2, max_lags=7, criterion='aic',
+                         apply_transformation=False, columns_to_transform=None,
+                         max_diff=2, alpha=0.05):
+    """
+    Run Granger causality analysis between two specified columns.
+
+    Args:
+        df (pd.DataFrame): Time series DataFrame.
+        target_col1 (str): Name of the first column for Granger analysis.
+        target_col2 (str): Name of the second column for Granger analysis.
+        max_lags (int): Max lags to test.
+        criterion (str): Criterion for lag selection.
+        apply_transformation (bool): Whether to difference to stationarity.
+        columns_to_transform (list[str] or None): Specific columns to attempt to make stationary.
+            If None, and apply_transformation is True, it could
+            default to target_col1 and target_col2, or be an error.
+            Best to be explicit from the UI.
+        max_diff (int): Max differencing.
+        alpha (float): Significance level for ADF test.
+
+    Returns:
+        summary_text (str): Text output of tests.
+        plot_img (PIL.Image.Image or None): Time series plot of the two target series.
+        transformed_csv_path (str or None): Path to CSV of the (potentially transformed) DataFrame.
+        transformation_info (dict): Log of transformations applied.
+    """
+    if not target_col1 or not target_col2:
+        raise ValueError("target_col1 and target_col2 must be specified.")
+    if target_col1 not in df.columns or target_col2 not in df.columns:
+        raise ValueError(f"One or both target columns ('{target_col1}', '{target_col2}') not found in DataFrame columns: {list(df.columns)}")
+    if target_col1 == target_col2:
+        raise ValueError("target_col1 and target_col2 must be different.")
+
+    df_work = df.copy()
+    transformation_info = {}
+    transformed_csv_path = None
+
+    if apply_transformation:
+        # If columns_to_transform is not provided, default to transforming the target columns.
+        # This is a design choice; an alternative would be to raise an error or transform all numeric columns.
+        cols_for_stat = columns_to_transform
+        if not cols_for_stat:  # If an empty list or None was passed, and apply_transformation is True
+            cols_for_stat = [target_col1, target_col2]
+            transformation_info["Note"] = f"No specific columns for transformation provided; applying to target series: {target_col1}, {target_col2}."
+
+        # Ensure only existing columns are in cols_for_stat, especially if user-provided
+        valid_cols_for_stat = [col for col in cols_for_stat if col in df_work.columns]
+        if len(valid_cols_for_stat) < len(cols_for_stat):
+            missing = set(cols_for_stat) - set(valid_cols_for_stat)
+            transformation_info["Warning_Transformation"] = f"Columns not found for stationarity transformation and were skipped: {list(missing)}"
+
+        if valid_cols_for_stat:
+            df_work, trans_log = make_data_stationary(df_work, valid_cols_for_stat, max_diff, alpha)
+            transformation_info.update(trans_log)  # Merge the detailed log
+            transformed_csv_path = 'transformed_data.csv'
+            try:
+                df_work.to_csv(transformed_csv_path, index=False)
+            except Exception as e:
+                transformation_info["CSV_Save_Error"] = f"Could not save transformed data: {str(e)}"
+                transformed_csv_path = None  # Indicate saving failed
+        else:
+            transformation_info["Note_Transformation"] = "No valid columns found or specified for stationarity transformation."
+
+
+    # Select the two target series for Granger causality AFTER potential transformations.
+    # Ensure they still exist and are numeric.
+    if target_col1 not in df_work.columns or target_col2 not in df_work.columns:
+        # This could happen if differencing made a column all NaN and it got dropped by some operation
+        raise ValueError(f"Target columns '{target_col1}' or '{target_col2}' are no longer in the DataFrame after transformations. Check transformation log.")
+
+    series1_data = df_work[target_col1]
+    series2_data = df_work[target_col2]
+
+    if not pd.api.types.is_numeric_dtype(series1_data) or \
+       not pd.api.types.is_numeric_dtype(series2_data):
+        raise ValueError(f"Target columns '{target_col1}' and/or '{target_col2}' must be numeric for Granger causality. Check data types after transformations.")
+
+    data_for_test = pd.DataFrame({target_col1: series1_data, target_col2: series2_data})
+
+    # For VAR lag selection, we need to drop NaNs from the pair of series
+    data_for_var_lag_selection = data_for_test.dropna()
+
+    if data_for_var_lag_selection.shape[0] < max_lags + 1:  # Check if enough data points
+        # This check is now more critical as differencing can reduce data significantly
+        transformation_info["LagSelectionWarning"] = (
+            f"Not enough non-NaN data points ({data_for_var_lag_selection.shape[0]}) "
+            f"in '{target_col1}' & '{target_col2}' pair for maxlags={max_lags} after transformations/NaN removal. "
+            f"Defaulting lag to 1 or minimum possible."
+        )
+        # Attempt to determine a lag with available data, or default to 1
+        effective_max_lags = min(max_lags, max(1, data_for_var_lag_selection.shape[0] // 3 - 1))  # Heuristic
+        if effective_max_lags < 1: effective_max_lags = 1
+
+        lag_order = recommend_var_lag(data_for_var_lag_selection, maxlags=effective_max_lags, criterion=criterion)
+        lag_order = lag_order or 1  # Ensure lag_order is at least 1
+    elif data_for_var_lag_selection.empty:
+        transformation_info["LagSelectionError"] = (
+            f"DataFrame for VAR lag selection between '{target_col1}' & '{target_col2}' is empty after transformations/NaN removal. "
+            "Cannot perform Granger causality. Defaulting lag to 1 for report."
+        )
+        lag_order = 1
+    else:
+        lag_order = recommend_var_lag(data_for_var_lag_selection, maxlags=max_lags, criterion=criterion)
+        lag_order = lag_order or min(max_lags, 1)  # Ensure lag_order is at least 1 if recommend_var_lag returns None
+
+    granger_input_df = data_for_test[[target_col1, target_col2]]  # Ensure correct column order for interpretation
+
+    summary_text_parts = [f"Granger Causality Analysis for '{target_col1}' and '{target_col2}'"]
+    summary_text_parts.append(f"Recommended Lag Order (based on VAR on processed series, criterion: {criterion}): {lag_order}\n")
+
+    if granger_input_df.dropna().shape[0] < lag_order + 1:
+        summary_text_parts.append(
+            f"Critical Warning: After NaN removal for the pair ('{target_col1}', '{target_col2}'), "
+            f"only {granger_input_df.dropna().shape[0]} observations remain. "
+            f"This may be insufficient for Granger causality tests with lag {lag_order}. Results might be unreliable or fail.\n"
+        )
+        # Return early or let grangercausalitytests try and fail.
+        # For now, let it try; it might still work for lag 1 if the data is very short.
+
+    # Capture output from grangercausalitytests
+    capture_buffer = None
+    try:
+        import sys, io
+        original_stdout = sys.stdout
+        sys.stdout = capture_buffer = io.StringIO()
+        # Note: grangercausalitytests expects the first column to be Y (effect)
+        # and the second to be X (cause) for the test "X -> Y".
+        # The function runs tests for both directions.
+        # The order here (target_col1, then target_col2) means the first set of tests is "target_col2 -> target_col1".
+        grangercausalitytests(granger_input_df, maxlag=lag_order, verbose=True)
+    except Exception as e:
+        summary_text_parts.append(f"Error during grangercausalitytests execution: {str(e)}\n")
+    finally:
+        if capture_buffer:
+            summary_text_parts.append(capture_buffer.getvalue())
+        sys.stdout = original_stdout  # Restore stdout
+
+    summary_text = "\n".join(summary_text_parts)
+
+    # Plot the two selected series (potentially transformed)
+    plot_img = None
+    try:
+        fig, ax1 = plt.subplots()
+        ax2 = ax1.twinx()
+
+        # Use .dropna() for plotting to avoid issues if leading/trailing NaNs exist from differencing,
+        # but only for the series being plotted, without changing data_for_test
+        plot_series1 = data_for_test[target_col1].dropna()
+        plot_series2 = data_for_test[target_col2].dropna()
+
+        ax1.plot(plot_series1.index, plot_series1, label=target_col1, color='blue')
+        ax2.plot(plot_series2.index, plot_series2, label=target_col2, color='red')
+
+        ax1.set_xlabel('Time / Index')
+        ax1.set_ylabel(target_col1, color='blue')
+        ax2.set_ylabel(target_col2, color='red')
+
+        # Combine legends
+        lines1, labels1 = ax1.get_legend_handles_labels()
+        lines2, labels2 = ax2.get_legend_handles_labels()
+        ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper left')
+
+        plt.title(f"Time Series: {target_col1} vs {target_col2}")
+        fig.tight_layout()  # Adjust layout
+
+        buf = BytesIO()
+        fig.savefig(buf, format='png', bbox_inches='tight')
+        plt.close(fig)
+        buf.seek(0)
+        plot_img = Image.open(buf)
+    except Exception as e:
+        # Add to transformation_info as it's a non-critical error for the text output
+        transformation_info["Plotting_Error"] = f"Could not generate plot: {str(e)}"
+
+
+    return summary_text, plot_img, transformed_csv_path, transformation_info
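
For context on the signature change in this commit: run_granger_analysis now takes explicit target columns and returns a fourth value, transformation_info. The following is a minimal usage sketch, not part of the commit; the synthetic data, the column names "gdp" and "unemployment", and the import path are illustrative assumptions.

import numpy as np
import pandas as pd
from models.granger import run_granger_analysis  # import path assumed from the repo layout

# Synthetic example data: a random-walk driver and a lagged, noisy response
rng = np.random.default_rng(0)
x = np.cumsum(rng.normal(size=200))
y = 0.5 * np.roll(x, 2) + rng.normal(size=200)
df = pd.DataFrame({"gdp": x, "unemployment": y})

summary_text, plot_img, csv_path, info = run_granger_analysis(
    df,
    target_col1="gdp",
    target_col2="unemployment",
    max_lags=5,
    apply_transformation=True,   # difference the targets until the ADF test passes (up to max_diff)
    columns_to_transform=None,   # defaults to the two target columns; a "Note" entry is added to info
)

print(summary_text)              # captured grangercausalitytests output plus the recommended lag
print(info)                      # per-column differencing log, warnings, and any CSV/plot errors
if plot_img is not None:
    plot_img.save("granger_plot.png")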