# bioweather / generate_data.py
# emp-admin's picture
# Upload 9 files
# 5f98f88 verified
"""
Bioweather Production Data Generator v2.0
EmpedocLabs © 2025
Generates clinically-plausible weather → headache risk data with:
- 15 distinct biometeo conditions
- Seasonal/geographic variation
- Multi-trigger overlap scoring
- Graded risk (not just if/else buckets)
- 20,000+ samples for robust training
"""
import numpy as np
import pandas as pd
# Seasons are drawn with equal probability; the weights are still passed
# explicitly to ``rng.choice`` so the random stream stays the same.
_SEASONS = ["winter", "spring", "summer", "autumn"]
_SEASON_WEIGHTS = [0.25, 0.25, 0.25, 0.25]

# Per-season sampling parameters:
# (temp mean, temp sd, humidity mean, humidity sd,
#  uv low, uv high [exclusive], wind mean, wind sd)
_SEASON_PROFILES = {
    "winter": (-2, 8, 70, 15, 0, 4, 15, 12),
    "spring": (14, 7, 55, 18, 2, 8, 18, 10),
    "summer": (28, 6, 55, 20, 5, 11, 12, 8),
    "autumn": (12, 8, 65, 15, 1, 6, 16, 10),
}

_BASELINE_RISK = 5.0

_COLUMNS = [
    "temp_c", "pressure_hpa", "humidity", "wind_kph", "uv_index",
    "pressure_drop", "temp_change", "risk_score", "advice_label",
]


def _sample_weather(rng: np.random.Generator):
    """Draw one synthetic weather observation from ``rng``.

    Returns a tuple ``(temp, pressure, humidity, wind, uv, p_drop, t_change)``.
    ``p_drop`` and ``t_change`` are signed changes: negative means a fall.
    """
    # Base weather with seasonal coherence.
    season = str(rng.choice(_SEASONS, p=_SEASON_WEIGHTS))
    t_mu, t_sd, h_mu, h_sd, uv_lo, uv_hi, w_mu, w_sd = _SEASON_PROFILES[season]
    temp = rng.normal(t_mu, t_sd)
    humidity = rng.normal(h_mu, h_sd)
    # NOTE(review): Generator.integers excludes the upper bound, so summer
    # UV tops out at 10 and the clip ceiling of 11 is never reached.
    uv = rng.integers(uv_lo, uv_hi)
    wind = abs(rng.normal(w_mu, w_sd))

    temp = np.clip(temp, -15, 45)
    humidity = np.clip(humidity, 8, 99)
    uv = int(np.clip(uv, 0, 11))
    wind = np.clip(wind, 0, 70)

    pressure = np.clip(rng.normal(1013, 12), 970, 1050)

    # Pressure change: occasional fronts.
    # NOTE(review): the second test uses a fresh draw and only runs when the
    # first one fails, so a ridge occurs with probability 0.90 * 0.08 = 7.2%.
    if rng.random() < 0.10:
        p_drop = rng.normal(-8, 3)      # cold front
    elif rng.random() < 0.08:
        p_drop = rng.normal(7, 2.5)     # high pressure ridge
    else:
        p_drop = rng.normal(0, 2.5)
    p_drop = np.clip(p_drop, -15, 15)

    # Temp change: some days have big swings (random sign, magnitude ~10).
    if rng.random() < 0.07:
        t_change = rng.choice([-1, 1]) * abs(rng.normal(10, 3))
    else:
        t_change = rng.normal(0, 3)
    t_change = np.clip(t_change, -15, 15)

    return temp, pressure, humidity, wind, uv, p_drop, t_change


def _score_conditions(temp, humidity, wind, uv, p_drop, t_change):
    """Map one observation to ``{condition_id: risk contribution}``.

    Only triggered conditions appear in the result, in the order they are
    checked below. An empty dict means no trigger fired.
    """
    scores = {}  # condition_id → contribution

    # 1. Pressure drop (strongest weather trigger per literature)
    if p_drop <= -8:
        scores[1] = 35 + abs(p_drop) * 1.5
    elif p_drop <= -4:
        scores[10] = 15 + abs(p_drop) * 1.2
    elif p_drop <= -2:
        scores[10] = 8 + abs(p_drop) * 0.8

    # 2. Pressure rise
    if p_drop >= 8:
        scores[2] = 25 + p_drop * 1.0
    elif p_drop >= 4:
        scores[11] = 12 + p_drop * 0.7
    elif p_drop >= 2:
        scores[11] = 6 + p_drop * 0.5

    # 3. Sauna effect (heat + humidity)
    if temp >= 28 and humidity >= 65:
        scores[3] = (temp - 28) * 2 + (humidity - 65) * 0.5

    # 4. Wind (strong -> 4, moderate -> 12)
    if wind >= 40:
        scores[4] = 25 + (wind - 40) * 0.8
    elif wind >= 20:
        scores[12] = 10 + (wind - 20) * 0.3

    # 5. UV glare
    if uv >= 8:
        scores[5] = 20 + (uv - 8) * 3
    elif uv >= 6 and temp > 15:
        scores[5] = 8 + (uv - 6) * 2

    # 6. Bitter cold
    if temp <= -5:
        scores[6] = 25 + abs(temp + 5) * 2
    elif temp <= 2:
        scores[6] = 10 + abs(temp - 2) * 1.5

    # 7. Drastic temp drop
    if t_change <= -8:
        scores[7] = 30 + abs(t_change) * 1.5
    elif t_change <= -5:
        scores[7] = 12 + abs(t_change) * 0.8

    # 8. Heat shock
    if t_change >= 8:
        scores[8] = 28 + t_change * 1.2
    elif t_change >= 5:
        scores[8] = 10 + t_change * 0.7

    # 9. Heavy dampness
    if humidity >= 88 and wind <= 12:
        scores[9] = 15 + (humidity - 88) * 0.8

    # 13. Dry air
    if humidity <= 25:
        scores[13] = 18 + (25 - humidity) * 0.8
    elif humidity <= 32:
        scores[13] = 8 + (32 - humidity) * 0.5

    # 14. Stagnant & gloomy
    if uv <= 2 and humidity >= 72 and wind <= 10 and temp < 18:
        scores[14] = 10 + (humidity - 72) * 0.3

    return scores


def generate_production_data(n: int = 25000, seed: int = 42) -> pd.DataFrame:
    """Generate ``n`` synthetic weather rows with a headache-risk score.

    Each row is sampled independently from a generator seeded with ``seed``,
    so the same ``(n, seed)`` pair reproduces the same frame.

    Args:
        n: Number of rows to generate.
        seed: Seed passed to ``np.random.default_rng``.

    Returns:
        A DataFrame with columns ``temp_c``, ``pressure_hpa``, ``humidity``,
        ``wind_kph``, ``uv_index``, ``pressure_drop``, ``temp_change``,
        ``risk_score`` (int, clipped to 0..100) and ``advice_label`` (id of
        the highest-scoring condition, or 0 when none triggered).

    Side effects:
        Prints a three-line summary of the generated data to stdout.
    """
    rng = np.random.default_rng(seed)
    rows = []
    for _ in range(n):
        temp, pressure, humidity, wind, uv, p_drop, t_change = _sample_weather(rng)
        scores = _score_conditions(temp, humidity, wind, uv, p_drop, t_change)

        # Triggers stack additively on top of the baseline; contributions are
        # added one at a time, in trigger order.
        risk = _BASELINE_RISK
        for contribution in scores.values():
            risk += contribution

        # Primary condition = largest single contribution; 0 = clear skies.
        label = max(scores, key=scores.get) if scores else 0

        # Add noise, then squash into an integer 0..100 score.
        risk += rng.normal(0, 2.5)
        risk = int(np.clip(round(risk), 0, 100))

        rows.append([
            round(temp, 1), round(pressure, 1), round(humidity, 1),
            round(wind, 1), uv, round(p_drop, 2), round(t_change, 2),
            risk, label,
        ])

    df = pd.DataFrame(rows, columns=_COLUMNS)
    print(f"✅ Generated {len(df):,} samples")
    print(f" Risk: mean={df['risk_score'].mean():.1f}, std={df['risk_score'].std():.1f}")
    print(f" Conditions: {df['advice_label'].value_counts().sort_index().to_dict()}")
    return df
if __name__ == "__main__":
    # Script entry point: build the default dataset (n=25000, seed=42) and
    # write it as CSV into the current working directory.
    df = generate_production_data()
    df.to_csv("smart_weather_data.csv", index=False)
    print("💾 Saved → smart_weather_data.csv")