Spaces:
Sleeping
Sleeping
Delete Utils
Browse files- utils/__pycache__/helpers.cpython-312.pyc +0 -0
- utils/__pycache__/plotting.cpython-312.pyc +0 -0
- utils/helpers.py +0 -28
- utils/plotting.py +0 -77
utils/__pycache__/helpers.cpython-312.pyc
DELETED
|
Binary file (1.23 kB)
|
|
|
utils/__pycache__/plotting.cpython-312.pyc
DELETED
|
Binary file (4.08 kB)
|
|
|
utils/helpers.py
DELETED
|
@@ -1,28 +0,0 @@
|
|
| 1 |
-
# causalscience/utils/helpers.py
|
| 2 |
-
|
| 3 |
-
import pandas as pd
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
def detect_column_type(df: pd.DataFrame, column: str) -> str:
|
| 7 |
-
"""
|
| 8 |
-
Determine if a column is boolean (binary) or timeseries.
|
| 9 |
-
|
| 10 |
-
Args:
|
| 11 |
-
df (pd.DataFrame): Input DataFrame.
|
| 12 |
-
column (str): Column name to inspect.
|
| 13 |
-
|
| 14 |
-
Returns:
|
| 15 |
-
'boolean' if the column has exactly two unique values,
|
| 16 |
-
'timeseries' if all values can be parsed as dates,
|
| 17 |
-
None otherwise.
|
| 18 |
-
"""
|
| 19 |
-
unique_vals = df[column].dropna().unique()
|
| 20 |
-
if len(unique_vals) == 2:
|
| 21 |
-
return 'boolean'
|
| 22 |
-
try:
|
| 23 |
-
dt = pd.to_datetime(df[column], errors='coerce')
|
| 24 |
-
if dt.notna().all():
|
| 25 |
-
return 'timeseries'
|
| 26 |
-
except Exception:
|
| 27 |
-
pass
|
| 28 |
-
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/plotting.py
DELETED
|
@@ -1,77 +0,0 @@
|
|
| 1 |
-
# causalscience/utils/plotting.py
|
| 2 |
-
|
| 3 |
-
import numpy as np
|
| 4 |
-
import matplotlib.pyplot as plt
|
| 5 |
-
from io import BytesIO
|
| 6 |
-
from PIL import Image
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
def calculate_standardized_differences(data, vars_list, treatment, weights=None):
    """
    Calculate standardized mean differences for covariate balance.

    For each covariate the (weighted) mean difference between treated and
    control rows is divided by the pooled standard deviation
    ``sqrt((var_t + var_c) / 2)``.

    Args:
        data (pd.DataFrame): Dataset including treatment indicator.
        vars_list (list[str]): Numeric covariate column names.
        treatment (str): Treatment indicator column (0/1).
        weights (list or np.array, optional): Weights for each observation,
            aligned positionally with the rows of ``data``. Defaults to
            equal weights.

    Returns:
        pd.DataFrame: Columns ['variable', 'std_diff'], one row per entry of
        ``vars_list``. ``std_diff`` is NaN when the pooled standard deviation
        is zero or when either arm has no rows / zero total weight.
    """
    # Imported locally: this function is the only pandas user in the block.
    import pandas as pd

    if weights is None:
        weights = np.ones(len(data))
    else:
        weights = np.asarray(weights)

    # Positional boolean masks, so they index both the frame and the weights.
    treated = (data[treatment] == 1).to_numpy()
    control = (data[treatment] == 0).to_numpy()

    # Loop-invariant: the per-arm weights do not depend on the covariate.
    w_treated = weights[treated]
    w_control = weights[control]

    # np.average raises ZeroDivisionError when the weights sum to zero, which
    # is what happens for an empty arm; report NaN in that case instead.
    comparable = w_treated.sum() != 0 and w_control.sum() != 0

    results = []
    for var in vars_list:
        std_diff = np.nan
        if comparable:
            x_t = data.loc[treated, var].to_numpy()
            x_c = data.loc[control, var].to_numpy()
            mean_t = np.average(x_t, weights=w_treated)
            mean_c = np.average(x_c, weights=w_control)
            var_t = np.average((x_t - mean_t) ** 2, weights=w_treated)
            var_c = np.average((x_c - mean_c) ** 2, weights=w_control)
            pooled_sd = np.sqrt((var_t + var_c) / 2)
            if pooled_sd != 0:
                std_diff = (mean_t - mean_c) / pooled_sd
        results.append({'variable': var, 'std_diff': std_diff})

    # Explicit columns keep the documented schema even when vars_list is empty.
    return pd.DataFrame(results, columns=['variable', 'std_diff'])
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
def love_plot(std_diffs_list, labels, threshold=0.1, abs_val=False):
    """
    Generate a Love plot for covariate balance.

    Each DataFrame in ``std_diffs_list`` is drawn as one scatter series
    (std diff on the x-axis, variable name on the y-axis), paired
    positionally with ``labels``; extra entries in the longer of the two
    are ignored. Marker shapes cycle when there are more than four series.

    Args:
        std_diffs_list (list[pd.DataFrame]): List of std diff DataFrames,
            each with 'variable' and 'std_diff' columns.
        labels (list[str]): Labels for each dataset.
        threshold (float, optional): Threshold lines for balance.
        abs_val (bool, optional): Plot absolute std diffs. When True only
            the positive threshold line is drawn, since absolute values
            cannot fall below zero.

    Returns:
        PIL.Image.Image: Love plot rendered as PNG and opened from an
        in-memory buffer.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    markers = ['o', 's', '^', 'd']

    try:
        for i, (std_df, label) in enumerate(zip(std_diffs_list, labels)):
            values = std_df['std_diff'].abs() if abs_val else std_df['std_diff']
            ax.scatter(values, std_df['variable'], label=label,
                       marker=markers[i % len(markers)], s=100)

        ax.axvline(x=threshold, color='gray', linestyle='--', alpha=0.5)
        if not abs_val:
            # A negative bound is only meaningful for signed differences.
            ax.axvline(x=-threshold, color='gray', linestyle='--', alpha=0.5)

        ax.set_xlabel('Standardized Mean Difference')
        ax.set_title('Love Plot: Covariate Balance')
        ax.legend()

        buf = BytesIO()
        fig.tight_layout()
        fig.savefig(buf, format='png', bbox_inches='tight')
    finally:
        # Always release the figure; pyplot keeps it alive until closed.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|