Spaces:

bhanug2026
/

aviation-disruption-intelligence

Sleeping

aviation-disruption-intelligence / src /utils /generate_base_data.py

bhanug2026

Initial commit

47c6cfd 4 days ago

21.7 kB

	"""
	src/utils/generate_base_data.py
	================================
	Generates realistic synthetic historical base datasets for model training.
	Simulates 18 months of aviation disruption data (Jan 2024 – Jun 2025)
	with realistic correlations between conflict signals, disruptions, and prices.

	Run: python -m src.utils.generate_base_data --bootstrap
	"""

	import numpy as np
	import pandas as pd
	from pathlib import Path
	from datetime import datetime, timedelta
	import sys

	# Fixed seed so that repeated runs of this script produce the same datasets.
	SEED = 42
	# Single module-level generator shared by every function in this file.
	# Because the random stream is shared, the values each generator produces
	# depend on the order in which the generators are called.
	rng = np.random.default_rng(SEED)

	# Output directory: three levels above this file, then data/processed.
	PROCESSED_DIR = Path(__file__).parent.parent.parent / "data" / "processed"
	# Created as an import-time side effect so the CSV writes in main() find it.
	PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

	# ── Helper ────────────────────────────────────────────────────────────────────

	def _sin_wave(n, period, amplitude=1.0, phase=0.0):
	    """Return ``n`` samples of a sine wave.

	    Sample ``i`` is ``amplitude * sin(2*pi*i/period + phase)``, so ``period``
	    is expressed in samples and ``phase`` in radians.
	    """
	    steps = np.arange(n)
	    angle = 2 * np.pi * steps / period + phase
	    return amplitude * np.sin(angle)


	# ── Conflict Events ───────────────────────────────────────────────────────────

	# Scripted conflict episodes that drive the synthetic conflict signal and the
	# oil-price shocks. Each tuple is (start_day_offset, duration_days, intensity, label):
	#   - start_day_offset: index into the day-level arrays built by the generators,
	#     all of which start on 2024-01-01
	#   - duration_days: number of days the episode covers
	#   - intensity: plateau value written into the signal by build_conflict_signal
	#   - label: human-readable description; the code in this file unpacks it
	#     into `_` and never reads it
	# NOTE(review): some labels name a month that differs from the date implied by
	# the offset (e.g. offset 140 falls in late May 2024 but is labelled Apr 2024) —
	# confirm whether the labels or the offsets are authoritative.
	CONFLICT_EVENTS = [
	    # (start_day_offset, duration_days, intensity, label)
	    (0, 30, 0.8, "Iran-US escalation Jan 2024"),
	    (45, 15, 0.6, "Gaza airspace closure Feb 2024"),
	    (90, 20, 0.7, "Ukraine FIR disruptions Mar 2024"),
	    (140, 10, 0.5, "Yemen Houthi attacks Apr 2024"),
	    (180, 25, 0.9, "Iran-Israel direct exchange May 2024"),
	    (220, 12, 0.5, "Pakistan-India tensions Jun 2024"),
	    (270, 30, 0.7, "Middle East escalation Sep 2024"),
	    (320, 20, 0.6, "Ukraine winter offensive Nov 2024"),
	    (380, 15, 0.8, "Iran-US tensions Jan 2025"),
	    (420, 40, 1.0, "Major conflict peak Feb-Mar 2025"),
	    (470, 25, 0.7, "Post-conflict recovery Apr 2025"),
	    (510, 20, 0.6, "Regional tensions May 2025"),
	]

	# Airports simulated by generate_flight_disruptions.
	# Each tuple is (airport_code, airport_name, country, region), matching the
	# unpacking in that function. The region string decides the conflict
	# multiplier there: "Middle East", "Eastern Europe" and "South Asia" are
	# treated as conflict regions, anything else as unaffected.
	# NOTE(review): codes here are 4-letter (ICAO-style) while ROUTES uses
	# 3-letter codes — the two tables are never joined in this file.
	AIRPORTS = [
	    ("OIII", "Tehran", "IR", "Middle East"),
	    ("OMDB", "Dubai", "AE", "Middle East"),
	    ("LLBG", "Tel Aviv", "IL", "Middle East"),
	    ("HECA", "Cairo", "EG", "Middle East"),
	    ("OJAM", "Amman", "JO", "Middle East"),
	    ("UKBB", "Kyiv", "UA", "Eastern Europe"),
	    ("UUEE", "Moscow", "RU", "Eastern Europe"),
	    ("EPWA", "Warsaw", "PL", "Eastern Europe"),
	    ("LHBP", "Budapest", "HU", "Eastern Europe"),
	    ("OPKC", "Karachi", "PK", "South Asia"),
	    ("VIDP", "Delhi", "IN", "South Asia"),
	    ("EGLL", "London", "GB", "Western Europe"),
	    ("LFPG", "Paris", "FR", "Western Europe"),
	    ("EDDF", "Frankfurt", "DE", "Western Europe"),
	    ("EHAM", "Amsterdam", "NL", "Western Europe"),
	]

	# Airline reference table.
	# NOTE(review): nothing in the visible part of this file reads AIRLINES;
	# the field order looks like (code, name, country, region) by analogy with
	# AIRPORTS — confirm against whatever module consumes it.
	AIRLINES = [
	    ("EK", "Emirates", "AE", "Middle East"),
	    ("EY", "Etihad", "AE", "Middle East"),
	    ("QR", "Qatar Airways", "QA", "Middle East"),
	    ("TK", "Turkish Airlines", "TR", "Turkey"),
	    ("LH", "Lufthansa", "DE", "Western Europe"),
	    ("BA", "British Airways", "GB", "Western Europe"),
	    ("AF", "Air France", "FR", "Western Europe"),
	    ("PS", "Ukraine Int'l", "UA", "Eastern Europe"),
	    ("PK", "Pakistan Int'l", "PK", "South Asia"),
	    ("IR", "Iran Air", "IR", "Middle East"),
	    ("AY", "Finnair", "FI", "Western Europe"),
	    ("KL", "KLM", "NL", "Western Europe"),
	]

	# Routes priced by generate_flight_prices.
	# Each tuple is (origin, destination, region_type, price_base, price_max),
	# matching the unpacking in that function:
	#   - region_type: a route counts as conflict-exposed when this string
	#     contains "ME"
	#   - price_base / price_max: the weekly price is clipped to
	#     [price_base * 0.7, price_max * 1.4] before booking premiums are added
	ROUTES = [
	    ("LHR", "DXB", "LH-ME", 350, 650),
	    ("CDG", "DXB", "LH-ME", 330, 620),
	    ("FRA", "DXB", "LH-ME", 320, 610),
	    ("JFK", "DXB", "NA-ME", 580, 950),
	    ("LHR", "TLV", "LH-ME", 280, 550),
	    ("CDG", "TLV", "LH-ME", 260, 530),
	    ("LHR", "BKK", "LH-AS", 420, 780),
	    ("LHR", "KHI", "LH-SA", 310, 600),
	    ("DXB", "DEL", "ME-SA", 180, 380),
	    ("IST", "DXB", "ME-ME", 150, 320),
	]


	def build_conflict_signal(n_days: int) -> np.ndarray:
	    """Return a noisy day-level conflict intensity series clipped to 0..1.

	    Every entry of CONFLICT_EVENTS that starts inside the horizon contributes
	    a trapezoid: a linear ramp up, a plateau at the event intensity, and a
	    linear ramp down. Ramps last ``min(5, duration // 3)`` days. Gaussian
	    noise (sigma 0.05) from the shared module generator is added at the end.
	    """
	    signal = np.zeros(n_days)
	    for start, dur, intensity, _label in CONFLICT_EVENTS:
	        if start >= n_days:
	            continue
	        stop = min(start + dur, n_days)
	        ramp = min(5, dur // 3)

	        # Day offsets within the event, truncated at the end of the horizon.
	        offsets = np.arange(stop - start)
	        levels = np.full(offsets.shape, float(intensity))

	        # The rising edge takes priority over the falling edge, as in an if/elif.
	        rising = offsets < ramp
	        falling = ~rising & (offsets > dur - ramp)
	        levels[rising] = intensity * offsets[rising] / ramp
	        levels[falling] = intensity * (dur - offsets[falling]) / ramp

	        signal[start:stop] = levels

	    # Add noise
	    signal += rng.normal(0, 0.05, n_days)
	    return np.clip(signal, 0, 1)


	# ── Generate Flight Disruptions ───────────────────────────────────────────────

	def generate_flight_disruptions() -> pd.DataFrame:
	    """
	    Build the synthetic airport-level disruption dataset.

	    One row per (airport, 6-hour period) over 18 months:
	    ~15 airports × 4 periods/day × 548 days ≈ 32,880 rows.

	    Returns:
	        DataFrame with one record per airport and period. The binary target
	        ``is_high_disruption`` is 1 when ``disruption_index`` exceeds 50.

	    Notes:
	        * ``airspace_risk_score`` is clamped to 0..4 on both sides.
	        * ``cancellation_rate_24h`` / ``avg_delay_24h`` are trailing means
	          over the airport's last (up to) four 6-hour periods, including the
	          current one.
	        * ``disruption_index_lag6h`` is the airport's disruption index from
	          the previous period; the very first period has no history and
	          falls back to its own value.
	    """
	    from collections import deque  # stdlib; local so the file's import block is untouched

	    start_date = datetime(2024, 1, 1)
	    n_days = 548  # Jan 2024 – Jun 2025
	    periods_per_day = 4  # 00:00, 06:00, 12:00, 18:00
	    conflict_regions = ("Middle East", "Eastern Europe", "South Asia")

	    conflict_signal = build_conflict_signal(n_days)

	    # Oil price simulation: Brent crude, clipped to $60–$130
	    oil_base = 82.0
	    oil_trend = np.linspace(0, 10, n_days)  # slight upward trend
	    oil_cycle = _sin_wave(n_days, 60, amplitude=8)
	    oil_shock = np.zeros(n_days)
	    for start, dur, intensity, _ in CONFLICT_EVENTS:
	        if start < n_days:
	            end = min(start + dur, n_days)
	            oil_shock[start:end] += intensity * 12  # conflict → oil spike
	    oil_price_daily = oil_base + oil_trend + oil_cycle + oil_shock + rng.normal(0, 1.5, n_days)
	    oil_price_daily = np.clip(oil_price_daily, 60, 130)

	    # Per-airport history used for the trailing-24h and lag features, so they
	    # are genuinely derived from earlier periods instead of copying the
	    # current value (which would leak the target into the features).
	    cancel_history = {airport[0]: deque(maxlen=periods_per_day) for airport in AIRPORTS}
	    delay_history = {airport[0]: deque(maxlen=periods_per_day) for airport in AIRPORTS}
	    previous_index = {}

	    records = []
	    for day_idx in range(n_days):
	        date = start_date + timedelta(days=day_idx)
	        conflict = conflict_signal[day_idx]
	        oil = oil_price_daily[day_idx]
	        # During the first week the comparison point is day 0.
	        oil_prev7 = oil_price_daily[max(0, day_idx - 7)]
	        oil_change_pct = (oil - oil_prev7) / oil_prev7 * 100

	        for period in range(periods_per_day):
	            hour = period * 6
	            ts = date + timedelta(hours=hour)

	            for airport_code, airport_name, country, region in AIRPORTS:
	                # Regional conflict modifier
	                is_conflict_region = region in conflict_regions
	                regional_mult = 2.5 if is_conflict_region else 0.8

	                # Cancellation rate (0..0.95)
	                cancel_base = 0.05 + conflict * regional_mult * 0.35
	                cancel_noise = rng.beta(1.5, 8) * 0.15
	                cancellation_rate = np.clip(cancel_base + cancel_noise, 0, 0.95)

	                # Delay minutes
	                delay_base = 15 + conflict * regional_mult * 80
	                delay_minutes = max(0, rng.normal(delay_base, 10))

	                # Airspace risk score (0..4); clamp both ends because the
	                # jitter can push a quiet day below zero.
	                if is_conflict_region:
	                    risk_raw = conflict * 4 * regional_mult * 0.7
	                    airspace_risk_score = np.clip(
	                        risk_raw + rng.uniform(-0.3, 0.3), 0.0, 4.0
	                    )
	                else:
	                    airspace_risk_score = rng.uniform(0, 0.8)

	                # Sentiment score (higher = more negative news)
	                sentiment_base = conflict * regional_mult * 80
	                sentiment_score = np.clip(
	                    rng.normal(sentiment_base, 10), -100, 100
	                )

	                # Number of conflict events in region
	                conflict_event_count = int(
	                    rng.poisson(conflict * regional_mult * 5)
	                )

	                # Fuel pressure indicator
	                fuel_pressure = (oil_change_pct / 20 + conflict * 0.3) * 50
	                fuel_pressure_indicator = np.clip(fuel_pressure, 0, 100)

	                # Disruption index (composite, 0..100)
	                disruption_index = np.clip(
	                    cancellation_rate * 40 + delay_minutes / 200 * 30 +
	                    airspace_risk_score / 4 * 20 + conflict * regional_mult * 10,
	                    0, 100
	                )

	                # Airport stress score (0..100)
	                airport_stress_score = np.clip(
	                    disruption_index * 0.7 + airspace_risk_score * 5 +
	                    rng.normal(0, 3), 0, 100
	                )

	                # Binary target: is_high_disruption
	                is_high_disruption = int(disruption_index > 50)

	                # Trailing 24h means (current period included).
	                cancel_window = cancel_history[airport_code]
	                delay_window = delay_history[airport_code]
	                cancel_window.append(cancellation_rate)
	                delay_window.append(delay_minutes)
	                cancellation_rate_24h = sum(cancel_window) / len(cancel_window)
	                avg_delay_24h = sum(delay_window) / len(delay_window)

	                # True 6h lag: the previous period's index for this airport.
	                disruption_index_lag6h = previous_index.get(airport_code, disruption_index)
	                previous_index[airport_code] = disruption_index

	                records.append({
	                    "timestamp": ts.isoformat(),
	                    "date": date.strftime("%Y-%m-%d"),
	                    "hour": hour,
	                    "airport_code": airport_code,
	                    "airport_name": airport_name,
	                    "country": country,
	                    "region": region,
	                    "conflict_active": int(conflict > 0.3),
	                    "conflict_intensity": round(conflict, 4),
	                    "conflict_event_count": conflict_event_count,
	                    "cancellation_rate": round(cancellation_rate, 4),
	                    "avg_delay_minutes": round(delay_minutes, 1),
	                    "cancellation_rate_24h": round(cancellation_rate_24h, 4),
	                    "avg_delay_24h": round(avg_delay_24h, 1),
	                    "airspace_risk_score": round(airspace_risk_score, 3),
	                    "sentiment_score": round(sentiment_score, 2),
	                    "sentiment_momentum": round(rng.normal(0, 5), 2),
	                    "oil_price": round(oil, 2),
	                    "oil_price_change_pct": round(oil_change_pct, 3),
	                    "fuel_pressure_indicator": round(fuel_pressure_indicator, 2),
	                    "disruption_index": round(disruption_index, 2),
	                    "disruption_index_lag6h": round(disruption_index_lag6h, 2),
	                    "airport_stress_score": round(airport_stress_score, 2),
	                    "is_high_disruption": is_high_disruption,
	                })

	    df = pd.DataFrame(records)
	    # Plain "|" separator: "\|" is an invalid escape sequence and would print the backslash.
	    print(f"Flight disruptions: {len(df):,} rows | positive rate: {df.is_high_disruption.mean():.2%}")
	    return df


	# ── Generate Flight Prices ────────────────────────────────────────────────────

	def generate_flight_prices() -> pd.DataFrame:
	    """
	    Build the synthetic weekly fare dataset.

	    One row per (route, week, days-to-departure bucket) over 18 months:
	    ~10 routes × 78 weeks × 4 booking horizons ≈ 3,120 rows.

	    The weekly route price is base + seasonality + oil premium + conflict
	    premium + demand noise, clipped to [price_base * 0.7, price_max * 1.4].
	    Each booking horizon then adds a late-booking premium and noise, with a
	    floor of 80.

	    Returns:
	        DataFrame with one record per route, week and booking horizon.
	    """
	    start_date = datetime(2024, 1, 1)
	    n_weeks = 78
	    n_days = n_weeks * 7
	    booking_horizons = (7, 14, 30, 60)

	    conflict_signal = build_conflict_signal(n_days)
	    oil_base = 82.0
	    # NOTE(review): unlike the other generators in this file, this oil series
	    # has no conflict shock and is not clipped — confirm that is intentional
	    # (conflict already enters the fare through conflict_premium below).
	    oil_price_daily = (
	        oil_base
	        + np.linspace(0, 10, n_days)
	        + _sin_wave(n_weeks * 7, 60, 8)
	        + rng.normal(0, 1.5, n_days)
	    )

	    # Seasonal component depends only on the week; build it once, not per route.
	    seasonal_by_week = _sin_wave(n_weeks, 52, 40)

	    records = []
	    for week_idx in range(n_weeks):
	        day_idx = week_idx * 7
	        date = start_date + timedelta(days=day_idx)
	        conflict = conflict_signal[day_idx]
	        oil = oil_price_daily[day_idx]
	        # During the first two weeks the comparison point is day 0.
	        oil_prev = oil_price_daily[max(0, day_idx - 14)]
	        oil_change_pct = (oil - oil_prev) / oil_prev * 100

	        sentiment_score = conflict * 70 + rng.normal(0, 8)
	        sentiment_momentum = rng.normal(0, 5)
	        seasonal = seasonal_by_week[week_idx]

	        for origin, dest, region_type, price_base, price_max in ROUTES:
	            is_conflict_route = "ME" in region_type
	            route_conflict_flag = int(is_conflict_route and conflict > 0.4)

	            # Price model: base + oil spike + conflict premium + seasonality
	            oil_premium = oil_change_pct * (2.5 if is_conflict_route else 1.2)
	            conflict_premium = conflict * (120 if is_conflict_route else 40)
	            demand_shock = rng.normal(0, 25)

	            price = (price_base + seasonal + oil_premium + conflict_premium +
	                     demand_shock)
	            price = np.clip(price, price_base * 0.7, price_max * 1.4)

	            disruption_index = conflict * (2.5 if is_conflict_route else 0.8) * 50
	            disruption_index = np.clip(disruption_index + rng.normal(0, 5), 0, 100)

	            fuel_pressure_indicator = np.clip(
	                (oil_change_pct / 20 + conflict * 0.3) * 50 + rng.normal(0, 3), 0, 100
	            )

	            for days_to_dep in booking_horizons:
	                # Premium only applies inside 30 days of departure.
	                booking_premium = max(0, (30 - days_to_dep) * 2.5)
	                final_price = price + booking_premium + rng.normal(0, 15)
	                final_price = max(80, final_price)

	                records.append({
	                    "timestamp": date.isoformat(),
	                    "week": date.strftime("%Y-W%U"),
	                    "origin": origin,
	                    "destination": dest,
	                    "route": f"{origin}-{dest}",
	                    "region_type": region_type,
	                    "route_conflict_flag": route_conflict_flag,
	                    "days_to_departure": days_to_dep,
	                    "day_of_week": date.weekday(),
	                    "price_usd": round(final_price, 2),
	                    "oil_price": round(oil, 2),
	                    "oil_price_change_pct": round(oil_change_pct, 3),
	                    "disruption_index": round(disruption_index, 2),
	                    "cancellation_rate_24h": round(conflict * 0.3 + rng.uniform(0, 0.1), 4),
	                    "sentiment_score": round(sentiment_score, 2),
	                    "sentiment_momentum": round(sentiment_momentum, 2),
	                    "fuel_pressure_indicator": round(fuel_pressure_indicator, 2),
	                    "conflict_intensity": round(conflict, 4),
	                })

	    df = pd.DataFrame(records)
	    # Plain "|" separator: "\|" is an invalid escape sequence and would print the backslash.
	    print(f"Flight prices: {len(df):,} rows | price range: ${df.price_usd.min():.0f}–${df.price_usd.max():.0f}")
	    return df


	# ── Generate Oil Prices ───────────────────────────────────────────────────────

	def generate_oil_prices() -> pd.DataFrame:
	    """Daily Brent crude oil prices, Jan 2024 – Jun 2025.

	    The series is base + linear trend + 60-day cycle + conflict shocks +
	    Gaussian noise, clipped to 60..130. WTI is Brent minus a random 2–5
	    spread.

	    Returns:
	        DataFrame with one row per day (548 rows). ``rolling_7d_avg`` and
	        ``rolling_30d_avg`` are trailing means over exactly 7 and 30 days
	        (fewer at the start of the series), including the current day.
	    """
	    n_days = 548
	    start = datetime(2024, 1, 1)
	    conflict_signal = build_conflict_signal(n_days)

	    oil_base = 82.0
	    oil_trend = np.linspace(0, 10, n_days)
	    oil_cycle = _sin_wave(n_days, 60, 8)
	    # Each active event lifts the price by intensity * 12; slicing truncates
	    # events that run past the end of the horizon.
	    oil_shock = np.zeros(n_days)
	    for event_start, dur, intensity, _ in CONFLICT_EVENTS:
	        oil_shock[event_start:event_start + dur] += intensity * 12
	    prices = oil_base + oil_trend + oil_cycle + oil_shock + rng.normal(0, 1.5, n_days)
	    prices = np.clip(prices, 60, 130)

	    records = []
	    for i, p in enumerate(prices):
	        date = start + timedelta(days=i)
	        # Day 0 is compared with itself, giving a 0% change.
	        prev = prices[max(0, i - 1)]
	        pct = (p - prev) / prev * 100
	        records.append({
	            "date": date.strftime("%Y-%m-%d"),
	            "brent_usd": round(p, 2),
	            "wti_usd": round(p - rng.uniform(2, 5), 2),
	            "pct_change": round(pct, 3),
	            # A k-day trailing window ending at i starts at i - (k - 1).
	            "rolling_7d_avg": round(np.mean(prices[max(0, i - 6):i + 1]), 2),
	            "rolling_30d_avg": round(np.mean(prices[max(0, i - 29):i + 1]), 2),
	            "conflict_intensity": round(conflict_signal[i], 4),
	        })

	    df = pd.DataFrame(records)
	    print(f"Oil prices: {len(df):,} days")
	    return df


	# ── Generate Airspace Risk ────────────────────────────────────────────────────

	def generate_airspace_risk() -> pd.DataFrame:
	    """Weekly synthetic airspace risk snapshots per country.

	    For each of 78 weeks, every conflict-affected country gets a risk level
	    derived from that week's conflict intensity plus uniform jitter, and
	    every safe country gets a fixed "No Advisory" row.

	    Returns:
	        DataFrame with 78 × (10 + 5) = 1,170 rows. ``risk_score`` is the
	        integer index (0..4) of ``risk_level`` in the advisory scale.

	    Note:
	        The ``source`` column is filled with a fixed label; the values
	        themselves are generated here, not fetched from any service.
	    """
	    RISK_LEVELS = ["No Advisory", "Exercise Caution", "Increased Caution",
	                   "Avoid if Possible", "Do Not Fly"]

	    countries = [
	        ("IR", "Iran", "Middle East"),
	        ("IQ", "Iraq", "Middle East"),
	        ("IL", "Israel", "Middle East"),
	        ("YE", "Yemen", "Middle East"),
	        ("SY", "Syria", "Middle East"),
	        ("UA", "Ukraine", "Eastern Europe"),
	        ("RU", "Russia", "Eastern Europe"),
	        ("PK", "Pakistan", "South Asia"),
	        ("ET", "Ethiopia", "Africa"),
	        ("LY", "Libya", "Africa"),
	    ]

	    SAFE_COUNTRIES = [
	        ("DE", "Germany", "Western Europe"),
	        ("FR", "France", "Western Europe"),
	        ("GB", "United Kingdom", "Western Europe"),
	        ("US", "United States", "North America"),
	        ("AU", "Australia", "Asia-Pacific"),
	    ]

	    start = datetime(2024, 1, 1)
	    n_weeks = 78
	    conflict_signal = build_conflict_signal(n_weeks * 7)
	    max_idx = len(RISK_LEVELS) - 1

	    records = []
	    for week_idx in range(n_weeks):
	        day = start + timedelta(weeks=week_idx)
	        # Sample the day-level signal on the first day of the week.
	        conflict = conflict_signal[week_idx * 7]

	        for code, name, region in countries:
	            risk_float = conflict * 4 * 1.2 + rng.uniform(-0.5, 0.5)
	            # int() truncates toward zero; clamp into the advisory scale.
	            risk_idx = max(0, min(max_idx, int(risk_float)))
	            risk_level = RISK_LEVELS[risk_idx]
	            records.append({
	                "timestamp": day.isoformat(),
	                "country_code": code,
	                "country_name": name,
	                "region": region,
	                "risk_level": risk_level,
	                "risk_score": risk_idx,
	                "description": f"{risk_level}: based on current conflict activity",
	                "source": "SafeAirspace",
	                "is_conflict_affected": 1,
	            })

	        for code, name, region in SAFE_COUNTRIES:
	            records.append({
	                "timestamp": day.isoformat(),
	                "country_code": code,
	                "country_name": name,
	                "region": region,
	                "risk_level": "No Advisory",
	                "risk_score": 0,
	                "description": "No active advisories",
	                "source": "SafeAirspace",
	                "is_conflict_affected": 0,
	            })

	    df = pd.DataFrame(records)
	    print(f"Airspace risk: {len(df):,} rows")
	    return df


	# ── Generate Sentiment (GDELT-style) ─────────────────────────────────────────

	def generate_sentiment() -> pd.DataFrame:
	    """Simulated GDELT-style news sentiment per region, every 6 hours.

	    Tone becomes more negative as conflict intensity rises, scaled by a
	    per-region multiplier; ``sentiment_score`` is ``-tone * 10`` so that
	    higher values mean more conflict news.

	    Returns:
	        DataFrame with 548 days × 4 periods × 4 regions = 8,768 rows, sorted
	        by region then timestamp. ``sentiment_momentum`` is the change in
	        ``sentiment_score`` from the previous period of the same region
	        (0 for the first), rounded to 2 decimals like the score itself.
	    """
	    n_days = 548
	    start = datetime(2024, 1, 1)
	    conflict_signal = build_conflict_signal(n_days)

	    # Insertion order fixes the per-period region order of the random draws.
	    region_mults = {"Middle East": 1.5, "Eastern Europe": 1.2,
	                    "South Asia": 1.0, "Global": 0.7}

	    records = []
	    for day_idx in range(n_days):
	        # The conflict signal is day-level, so look it up once per day.
	        conflict = conflict_signal[day_idx]
	        for hour in (0, 6, 12, 18):
	            ts = start + timedelta(days=day_idx, hours=hour)
	            for region, mult in region_mults.items():
	                # GDELT tone: negative = bad news (0 = neutral, negative = conflict)
	                tone_base = -conflict * mult * 5
	                tone = tone_base + rng.normal(0, 0.8)
	                article_count = max(1, int(rng.poisson(20 + conflict * mult * 40)))
	                records.append({
	                    "timestamp": ts.isoformat(),
	                    "region": region,
	                    "tone_avg": round(tone, 3),
	                    "article_count": article_count,
	                    "sentiment_score": round(-tone * 10, 2),  # positive = more conflict news
	                    "conflict_intensity": round(conflict, 4),
	                })

	    df = pd.DataFrame(records)
	    # Add sentiment momentum; ISO timestamps sort chronologically as strings.
	    df = df.sort_values(["region", "timestamp"]).reset_index(drop=True)
	    # Round so the difference of two 2-decimal scores has no float artifacts.
	    df["sentiment_momentum"] = (
	        df.groupby("region")["sentiment_score"].diff().fillna(0).round(2)
	    )
	    print(f"Sentiment: {len(df):,} rows")
	    return df


	# ── Main ──────────────────────────────────────────────────────────────────────

	def main():
	    """Run every generator in turn and write each result to PROCESSED_DIR as CSV."""
	    banner = "=" * 60
	    print(banner)
	    print("Generating synthetic historical base datasets...")
	    print(banner)

	    # (output filename, generator) pairs, processed in this order because
	    # the generators share one random stream.
	    jobs = (
	        ("flight_disruptions.csv", generate_flight_disruptions),
	        ("flight_prices.csv", generate_flight_prices),
	        ("oil_prices.csv", generate_oil_prices),
	        ("airspace_risk.csv", generate_airspace_risk),
	        ("sentiment.csv", generate_sentiment),
	    )

	    for filename, generator in jobs:
	        print(f"\n→ {filename}")
	        frame = generator()
	        out = PROCESSED_DIR / filename
	        frame.to_csv(out, index=False)
	        print(f" Saved: {out}")

	    print("\n✓ All base datasets generated successfully.")
	    print(f" Location: {PROCESSED_DIR}")


	if __name__ == "__main__":
	    # ── Bootstrap guard ───────────────────────────────────────────────────────
	    # The datasets produced here are SYNTHETIC and meant for initial
	    # development/testing only; the real pipeline reads from data/base/
	    # (real Kaggle CSVs). Generation therefore requires an explicit opt-in so
	    # real data is never overwritten or bypassed by accident.
	    #
	    # Usage: python -m src.utils.generate_base_data --bootstrap
	    #
	    if "--bootstrap" in sys.argv:
	        main()
	    else:
	        refusal_lines = (
	            "ERROR: Refusing to run without --bootstrap flag.",
	            " This script generates synthetic data and should NOT be",
	            " used as the default training source.",
	            "",
	            " Run: python -m src.utils.generate_base_data --bootstrap",
	            " to explicitly opt in to synthetic data generation.",
	        )
	        for line in refusal_lines:
	            print(line)
	        sys.exit(1)