Spaces:
Sleeping
Sleeping
| # Generated by Claude Code -- 2026-02-08 | |
| """Data augmentation for the conjunction prediction dataset. | |
| The fundamental problem: only 67 high-risk events out of 13,154 in training (0.5%). | |
| This module provides two augmentation strategies: | |
| 1. SPACE-TRACK INTEGRATION: Merge real high-risk CDMs from Space-Track's cdm_public | |
| feed. These have fewer features (16 vs 103) but provide real positive examples. | |
| 2. TIME-SERIES AUGMENTATION: Create synthetic variants of existing high-risk events | |
| by applying realistic perturbations: | |
| - Gaussian noise on covariance/position/velocity features | |
| - Temporal jittering (shift CDM creation times slightly) | |
| - Feature dropout (randomly zero out some features, simulating missing data) | |
| - Sequence truncation (remove early CDMs, simulating late detection) | |
| Both strategies are physics-aware: they don't generate impossible configurations | |
| (e.g., negative miss distances or covariance values). | |
| """ | |
| import numpy as np | |
| import pandas as pd | |
| from pathlib import Path | |
| def augment_event_noise( | |
| event_df: pd.DataFrame, | |
| noise_scale: float = 0.05, | |
| n_augments: int = 5, | |
| rng: np.random.Generator = None, | |
| ) -> list[pd.DataFrame]: | |
| """ | |
| Create n_augments noisy variants of a single conjunction event. | |
| Applies Gaussian noise to numeric features, scaled by each column's | |
| standard deviation within the event. Preserves event_id structure and | |
| ensures physical constraints (non-negative distances, etc.). | |
| """ | |
| if rng is None: | |
| rng = np.random.default_rng(42) | |
| # Identify numeric columns to perturb (exclude IDs and targets) | |
| exclude = {"event_id", "time_to_tca", "risk", "mission_id", "source"} | |
| numeric_cols = event_df.select_dtypes(include=[np.number]).columns | |
| perturb_cols = [c for c in numeric_cols if c not in exclude] | |
| augmented = [] | |
| for i in range(n_augments): | |
| aug = event_df.copy() | |
| for col in perturb_cols: | |
| values = aug[col].values.astype(float) | |
| col_std = np.std(values) | |
| if col_std < 1e-10: | |
| col_std = np.abs(np.mean(values)) * 0.01 + 1e-10 | |
| noise = rng.normal(0, noise_scale * col_std, size=len(values)) | |
| aug[col] = values + noise | |
| # Physical constraints | |
| if "miss_distance" in aug.columns: | |
| aug["miss_distance"] = aug["miss_distance"].clip(lower=0) | |
| if "relative_speed" in aug.columns: | |
| aug["relative_speed"] = aug["relative_speed"].clip(lower=0) | |
| # Ensure covariance sigma columns stay positive | |
| sigma_cols = [c for c in perturb_cols if "sigma" in c.lower()] | |
| for col in sigma_cols: | |
| aug[col] = aug[col].clip(lower=0) | |
| augmented.append(aug) | |
| return augmented | |
| def augment_event_truncate( | |
| event_df: pd.DataFrame, | |
| min_keep: int = 3, | |
| n_augments: int = 3, | |
| rng: np.random.Generator = None, | |
| ) -> list[pd.DataFrame]: | |
| """ | |
| Create truncated variants by removing early CDMs. | |
| Simulates late-detection scenarios where only the most recent CDMs | |
| are available (closer to TCA). | |
| """ | |
| if rng is None: | |
| rng = np.random.default_rng(42) | |
| # Sort by time_to_tca descending (first CDM = furthest from TCA) | |
| event_df = event_df.sort_values("time_to_tca", ascending=False) | |
| n_cdms = len(event_df) | |
| if n_cdms <= min_keep: | |
| return [] | |
| augmented = [] | |
| for _ in range(n_augments): | |
| # Keep between min_keep and n_cdms-1 CDMs (always keep the last few) | |
| n_keep = rng.integers(min_keep, n_cdms) | |
| aug = event_df.iloc[-n_keep:].copy() | |
| augmented.append(aug) | |
| return augmented | |
def augment_positive_events(
    df: pd.DataFrame,
    target_ratio: float = 0.05,
    noise_scale: float = 0.05,
    seed: int = 42,
) -> pd.DataFrame:
    """
    Augment the positive (high-risk) class to reach target_ratio.

    Args:
        df: full training DataFrame with event_id, risk columns
        target_ratio: desired fraction of high-risk events (default 5%)
        noise_scale: std dev of Gaussian noise as fraction of feature std
        seed: random seed

    Returns:
        Augmented DataFrame with new synthetic positive events appended
    """
    rng = np.random.default_rng(seed)

    # An event is "positive" (high-risk) when the risk of its final CDM
    # exceeds -5 — the log10 collision-probability threshold used here.
    event_risks = df.groupby("event_id")["risk"].last()
    pos_event_ids = event_risks[event_risks > -5].index.tolist()
    neg_event_ids = event_risks[event_risks <= -5].index.tolist()
    n_pos = len(pos_event_ids)
    n_neg = len(neg_event_ids)
    n_total = n_pos + n_neg

    # Solve (n_pos + k) / (n_total + k) = target_ratio for k, the number of
    # synthetic events to add; ceil so we never fall short of the target.
    # (The previous formula, r * n_total / (1 - r), overshot slightly
    # because it ignored the positives already present.)
    n_needed = max(0, int(np.ceil(
        (target_ratio * n_total - n_pos) / (1 - target_ratio)
    )))
    if n_needed == 0:
        print(f"Already at target ratio ({n_pos}/{n_total} = {n_pos/n_total:.1%})")
        return df
    print(f"Augmenting: {n_pos} positive events → {n_pos + n_needed} "
          f"(target {target_ratio:.0%} of {n_total + n_needed})")

    # Generate augmented events with fresh, unique event_ids.
    max_event_id = df["event_id"].max()
    augmented_dfs = []
    generated = 0
    while generated < n_needed:
        # Pick a random positive event as the augmentation source.
        src_event_id = rng.choice(pos_event_ids)
        src_event = df[df["event_id"] == src_event_id]

        # Noise augmentation always; truncation (late-detection) ~30% of the
        # time, and only when the event has enough CDMs to truncate.
        aug_variants = augment_event_noise(
            src_event, noise_scale=noise_scale, n_augments=1, rng=rng
        )
        if rng.random() < 0.3 and len(src_event) > 3:
            aug_variants.extend(
                augment_event_truncate(src_event, n_augments=1, rng=rng)
            )

        for aug_df in aug_variants:
            if generated >= n_needed:
                break
            max_event_id += 1
            aug_df = aug_df.copy()
            aug_df["event_id"] = max_event_id
            aug_df["source"] = "augmented"
            augmented_dfs.append(aug_df)
            generated += 1

    if augmented_dfs:
        augmented = pd.concat(augmented_dfs, ignore_index=True)
        result = pd.concat([df, augmented], ignore_index=True)
        # Report the achieved class balance as a sanity check.
        event_risks = result.groupby("event_id")["risk"].last()
        new_pos = (event_risks > -5).sum()
        new_total = len(event_risks)
        print(f"Result: {new_pos} positive / {new_total} total "
              f"({new_pos/new_total:.1%})")
        return result
    return df
def integrate_spacetrack_positives(
    kelvins_df: pd.DataFrame,
    spacetrack_path: Path,
) -> pd.DataFrame:
    """
    Merge Space-Track emergency CDMs in as extra positive training examples.

    Space-Track's cdm_public feed carries only 16 features versus Kelvins'
    103, so missing features are filled with 0; the model is expected to
    learn from whatever features are present. Returns kelvins_df unchanged
    when no Space-Track file exists at spacetrack_path.
    """
    if not spacetrack_path.exists():
        print(f"No Space-Track data at {spacetrack_path}")
        return kelvins_df

    from src.data.merge_sources import (
        load_spacetrack_cdms, group_into_events, merge_datasets
    )

    spacetrack_events = group_into_events(load_spacetrack_cdms(spacetrack_path))
    return merge_datasets(kelvins_df, spacetrack_events)
def build_augmented_training_set(
    data_dir: Path,
    target_positive_ratio: float = 0.05,
    noise_scale: float = 0.05,
    seed: int = 42,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Assemble the full augmented training set from every available source.

    Pipeline:
      1. Load the ESA Kelvins train/test split.
      2. Fold Space-Track emergency CDMs into the training half (if present).
      3. Synthesize extra positive events via time-series augmentation.
      4. Return (augmented_train, original_test).

    The test split is NEVER augmented — it stays Kelvins-only so that
    evaluation remains fair.
    """
    from src.data.cdm_loader import load_dataset

    rule = "=" * 60
    print(rule)
    print(" Building Augmented Training Set")
    print(rule)

    # Step 1: Kelvins base data, defragmented and tagged with its source.
    print("\n1. Loading ESA Kelvins dataset ...")
    train_df, test_df = load_dataset(data_dir / "cdm")
    train_df, test_df = train_df.copy(), test_df.copy()
    train_df["source"] = "kelvins"
    test_df["source"] = "kelvins"

    # Initial class balance (positive = final-CDM risk above -5).
    risk_by_event = train_df.groupby("event_id")["risk"].last()
    pos_before = (risk_by_event > -5).sum()
    total_before = len(risk_by_event)
    print(f" Initial: {pos_before} positive / {total_before} total "
          f"({pos_before/total_before:.2%})")

    # Step 2: fold in real Space-Track positives when the feed is available.
    st_path = data_dir / "cdm_spacetrack" / "cdm_spacetrack_emergency.csv"
    if st_path.exists():
        print("\n2. Integrating Space-Track emergency CDMs ...")
        train_df = integrate_spacetrack_positives(train_df, st_path)
    else:
        print("\n2. No Space-Track data found (skipping)")

    # Step 3: synthetic positives to reach the requested class balance.
    print(f"\n3. Augmenting positive events (target ratio: {target_positive_ratio:.0%}) ...")
    train_df = augment_positive_events(
        train_df,
        target_ratio=target_positive_ratio,
        noise_scale=noise_scale,
        seed=seed,
    )

    # Final summary, recomputed on the augmented frame.
    risk_by_event = train_df.groupby("event_id")["risk"].last()
    source_by_event = train_df.groupby("event_id")["source"].first()
    n_kelvins = (source_by_event == "kelvins").sum()
    n_spacetrack = (source_by_event == "spacetrack").sum()
    n_augmented = (source_by_event == "augmented").sum()
    pos_after = (risk_by_event > -5).sum()
    total_after = len(risk_by_event)

    print(f"\n{rule}")
    print(" Final Training Set:")
    print(f" Kelvins events: {n_kelvins}")
    print(f" Space-Track events: {n_spacetrack}")
    print(f" Augmented events: {n_augmented}")
    print(f" Total events: {total_after}")
    print(f" Positive events: {pos_after} ({pos_after/total_after:.1%})")
    print(f" Total CDM rows: {len(train_df)}")
    print(rule)
    return train_df, test_df