fbmc-chronos2 / scripts /collect_entsoe_sample.py
Evgueni Poloukarov
feat: complete Phase 1 ENTSO-E asset-specific outage validation
27cb60a
raw
history blame
4.13 kB
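"""
Collect a one-week ENTSO-E sample for Sept 23-30, 2025.

Queries generation by production type for all 12 Core FBMC zones
(wind, solar, thermal, hydro and nuclear among them) and writes the combined
result to data/raw/sample/entsoe_sample_sept2025.parquet.

The period matches the JAO sample so both datasets can be analysed together.
"""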
import os
import sys
from pathlib import Path
from datetime import datetime, timedelta
import pandas as pd
from entsoe import EntsoePandasClient
from dotenv import load_dotenv
# Make the sibling "src" directory (two levels up from this file, then
# "src") importable by putting it at the front of the module search path.
sys.path.insert(0, os.fspath(Path(__file__).parent.parent / "src"))

# Load environment variables via python-dotenv, then read the API key.
load_dotenv()
API_KEY = os.environ.get('ENTSOE_API_KEY')

# Without a key no query can succeed, so stop with setup instructions.
if not API_KEY:
    print(
        "[ERROR] ENTSOE_API_KEY not found in .env file",
        "Please add: ENTSOE_API_KEY=your_key_here",
        sep="\n",
    )
    sys.exit(1)

# Client shared by every ENTSO-E query in this script.
client = EntsoePandasClient(api_key=API_KEY)
# Core FBMC zones (12 total), keyed by short zone label and mapped to the EIC
# area code that is passed to the ENTSO-E client. Insertion order is the order
# in which the zones are queried.
FBMC_ZONES = {
    'AT': '10YAT-APG------L',     # Austria
    'BE': '10YBE----------2',     # Belgium
    'CZ': '10YCZ-CEPS-----N',     # Czech Republic
    # DE-LU bidding zone. 10Y1001A1001A83F (used previously) is the EIC of
    # Germany as a country and does not cover Luxembourg.
    'DE_LU': '10Y1001A1001A82H',  # Germany-Luxembourg
    'FR': '10YFR-RTE------C',     # France
    'HR': '10YHR-HEP------M',     # Croatia
    'HU': '10YHU-MAVIR----U',     # Hungary
    'NL': '10YNL----------L',     # Netherlands
    'PL': '10YPL-AREA-----S',     # Poland
    'RO': '10YRO-TEL------P',     # Romania
    'SI': '10YSI-ELES-----O',     # Slovenia
    'SK': '10YSK-SEPS-----K',     # Slovakia
}
# Generation types mapping: ENTSO-E PsrType code -> snake_case label.
# Codes follow the ENTSO-E PsrType code list; note that B18 is Wind Offshore
# and B19 is Wind Onshore, and that Fossil Oil is B06 (B03 is Fossil
# Coal-derived gas, which is not part of this mapping).
GENERATION_TYPES = {
    'B16': 'solar',              # Solar
    'B18': 'wind_offshore',      # Wind offshore
    'B19': 'wind_onshore',       # Wind onshore
    'B01': 'biomass',            # Biomass
    'B10': 'hydro_pumped',       # Hydro pumped storage
    'B11': 'hydro_run',          # Hydro run-of-river
    'B12': 'hydro_reservoir',    # Hydro reservoir
    'B14': 'nuclear',            # Nuclear
    'B02': 'fossil_brown_coal',  # Fossil brown coal/lignite
    'B05': 'fossil_coal',        # Fossil hard coal
    'B04': 'fossil_gas',         # Fossil gas
    'B06': 'fossil_oil',         # Fossil oil
}
# Sample period: Sept 23-30, 2025 (matches JAO sample)
START_DATE = pd.Timestamp('2025-09-23', tz='UTC')
END_DATE = pd.Timestamp('2025-09-30', tz='UTC')

# Length of the sample window, derived from the two timestamps so the banner
# below stays correct if the dates are edited.
SAMPLE_DURATION = END_DATE - START_DATE

# Start-up banner summarising what is about to be collected.
print("=" * 70)
print("ENTSOE 1-Week Sample Data Collection")
print("=" * 70)
print(f"Period: {START_DATE.date()} to {END_DATE.date()}")
print(f"Zones: {len(FBMC_ZONES)} Core FBMC zones")
print(
    f"Duration: {SAMPLE_DURATION.days} days = "
    f"{SAMPLE_DURATION // pd.Timedelta(hours=1)} hours"
)
print()
# Collect data zone by zone. This is best effort: a zone that fails or
# returns nothing usable is reported and skipped, and the script only aborts
# when no zone at all produced data.
all_generation = []

for zone_code, zone_eic in FBMC_ZONES.items():
    print(f"\n[{zone_code}] Collecting generation data...")

    # Only the remote call sits inside the try so that unrelated programming
    # errors are not reported as API failures.
    try:
        # Query generation by type; psr_type=None asks for all types at once.
        gen_df = client.query_generation(
            zone_eic,
            start=START_DATE,
            end=END_DATE,
            psr_type=None,
        )
    except Exception as e:
        # Deliberately broad: any failure for one zone must not stop the
        # remaining zones from being collected.
        print(f" [ERROR] {e}")
        continue

    # Check the result type *before* touching it; assigning a 'zone' entry to
    # a non-DataFrame result would silently alter it instead of failing.
    if not isinstance(gen_df, pd.DataFrame):
        print(f" [WARNING] Unexpected format: {type(gen_df)}")
        continue

    # An empty frame carries no information and would only add noise later.
    if gen_df.empty:
        print(" [WARNING] No rows returned - zone skipped")
        continue

    # Tag every row with its zone. assign() returns a new frame, so the
    # object handed back by the client is left unmodified.
    all_generation.append(gen_df.assign(zone=zone_code))
    print(f" [OK] Collected {len(gen_df)} rows")

if not all_generation:
    print("\n[ERROR] No data collected - check API key and zone codes")
    sys.exit(1)
# Combine all zones
print("\n" + "=" * 70)
print("Processing collected data...")

# Stack the per-zone frames row-wise.
combined_df = pd.concat(all_generation, axis=0)

# Turn the index into a regular column named 'timestamp'. Naming the axis
# before resetting gives the same column name whether or not the incoming
# index already had a name (an unnamed index would otherwise become 'index',
# a named one would keep its own name).
combined_df = combined_df.rename_axis('timestamp').reset_index()

print(f" Combined shape: {combined_df.shape}")
print(f" Columns: {list(combined_df.columns)}")

# Save to parquet; the path is relative to the current working directory.
output_dir = Path("data/raw/sample")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "entsoe_sample_sept2025.parquet"
# The former index is a column now, so the fresh RangeIndex is not written.
combined_df.to_parquet(output_file, index=False)

print(f"\n[SUCCESS] Saved to: {output_file}")
print(f" File size: {output_file.stat().st_size / 1024:.1f} KB")
print()
print("=" * 70)
print("ENTSOE Sample Collection Complete")
print("=" * 70)
print("\nNext: Add ENTSOE exploration to Marimo notebook")