| import argparse |
| import os |
| |
| import argparse |
| import os |
| import numpy as np |
| import pandas as pd |
|
|
# Clamp bound that keeps probabilities away from exact 0 (and, for the
# logit, exact 1) before a logarithm is taken.
EPS: float = 1e-12
|
|
| |
def logit(p: np.ndarray) -> np.ndarray:
    """Return the element-wise log-odds ``log(p / (1 - p))`` of ``p``.

    Values are clamped to ``[EPS, 1 - EPS]`` first, so an exact 0 or 1
    cannot yield an infinite result.
    """
    bounded = np.clip(p, EPS, 1.0 - EPS)
    odds = bounded / (1.0 - bounded)
    return np.log(odds)
|
|
def entropy(p: np.ndarray) -> np.ndarray:
    """Return the Shannon entropy ``-Σ p·log(p)`` summed along axis 1.

    Values are clamped to ``[EPS, 1]`` before the natural log is taken, so
    zero entries contribute a negligible term instead of ``nan``.
    """
    safe = np.clip(p, EPS, 1.0)
    weighted_logs = safe * np.log(safe)
    return -np.sum(weighted_logs, axis=1)
|
|
def top2_margin(p: np.ndarray) -> np.ndarray:
    """Return, for each row of ``p``, the largest value minus the second largest."""
    ascending = np.sort(p, axis=1)
    best = ascending[:, -1]
    runner_up = ascending[:, -2]
    return best - runner_up
|
|
| |
def main(argv=None):
    """Append probability-derived features to a raw-probability CSV and save it.

    Reads the CSV named by ``--input``, which must contain the columns
    ``fin_p_neg``/``fin_p_neu``/``fin_p_pos`` and
    ``rob_p_neg``/``rob_p_neu``/``rob_p_pos``. For each of the two prefixes
    it adds, in this order: the three per-class logits, the maximum
    probability, the top-1 minus top-2 margin, and the Shannon entropy.
    The augmented frame is then written to CSV.

    Output location:
        * ``--out_file`` if given (its parent directory is created when
          missing); ``--out_dir``/``--out_subdir`` are ignored in that case.
        * otherwise ``<out_dir>/<out_subdir>/<input stem>_prob_features.csv``.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``.

    Raises:
        ValueError: If any required probability column is missing.
    """
    ap = argparse.ArgumentParser(
        description="Compute probability-derived features (logit, max prob, margin, entropy)."
    )
    ap.add_argument("--input", required=True,
                    help="Path to FinSent_*_raw_probs.csv from FinBERT/RoBERTa step.")
    ap.add_argument("--out_file", default=None,
                    help="Output CSV (default: adds _prob_features to filename).")
    ap.add_argument("--out_dir", default="outputs", help="Base output directory")
    ap.add_argument("--out_subdir", default="prob features",
                    help="Subdirectory under out_dir to save prob features")
    args = ap.parse_args(argv)

    df = pd.read_csv(args.input)

    classes = ("neg", "neu", "pos")
    models = ("fin", "rob")

    # Validate up front so a malformed input fails with a clear message
    # rather than a KeyError halfway through feature construction.
    required = [f"{model}_p_{cls}" for model in models for cls in classes]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    # Same feature set for both prefixes; iterating keeps the column names
    # and their insertion order identical for "fin" and "rob".
    for model in models:
        probs = df[[f"{model}_p_{cls}" for cls in classes]].to_numpy(dtype=float)
        logits = logit(probs)
        for idx, cls in enumerate(classes):
            df[f"{model}_logit_{cls}"] = logits[:, idx]
        df[f"{model}_max_prob"] = np.max(probs, axis=1)
        df[f"{model}_margin"] = top2_margin(probs)
        df[f"{model}_entropy"] = entropy(probs)

    if args.out_file:
        out_path = args.out_file
        # to_csv does not create directories; make sure the target exists so
        # the run does not fail only at the final write.
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
    else:
        save_dir = os.path.join(args.out_dir, args.out_subdir) if args.out_subdir else args.out_dir
        # An empty save_dir means "current directory"; os.makedirs("") raises.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        base = os.path.basename(os.path.splitext(args.input)[0])
        out_path = os.path.join(save_dir, f"{base}_prob_features.csv")

    df.to_csv(out_path, index=False)
    print(f"[✓] Saved probability-derived features to: {out_path}")
    print("Added columns: fin/rob logits, max_prob, margin, entropy")
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()