# Source: HuggingFace Space file view (scrape residue removed)
# Author: Evgueni Poloukarov
# Commit 2a32f6f — refactor: improve Marimo notebook readability with proper number formatting
import marimo

# Version of marimo that generated this notebook file.
__generated_with = "0.17.2"

# Full-width layout; auto_download=["html"] keeps an HTML export of the notebook.
app = marimo.App(width="full", auto_download=["html"])
def _():
    # Imports shared by all cells; marimo wires them to other cells
    # through the returned tuple.
    import marimo as mo
    import polars as pl
    import altair as alt
    import numpy as np
    from pathlib import Path
    return Path, alt, mo, np, pl
def _(mo):
    # Title cell: executive summary of the October 2024 evaluation run.
    mo.md(
        """
        # FBMC Chronos-2 Zero-Shot Forecasting
        ## October 2024 Evaluation Results
        **Comprehensive Analysis of 38-Border × 14-Day Multivariate Forecasting**
        ---
        ### Executive Summary
        This notebook presents the complete evaluation of zero-shot multivariate forecasting for 38 European FBMC borders using Amazon Chronos-2 with 615 covariate features.
        **Key Results**:
        - Mean D+1 MAE: **15.92 MW** (88% better than 134 MW target)
        - Forecast Time: **3.45 minutes** for 38 borders × 336 hours
        - Success Rate: **94.7%** of borders meet ≤150 MW threshold
        - Model: Zero-shot (no fine-tuning) with multivariate features
        ---
        """
    )
    return
def _(Path, pl):
    """Load the October 2024 evaluation CSV and round error metrics for display."""
    # The results directory sits two levels above this notebook file.
    results_path = Path(__file__).parent.parent / 'results' / 'october_2024_multivariate.csv'
    eval_df_raw = pl.read_csv(results_path)
    # Round every per-day MAE column plus the overall MAE/RMSE to 0.1 MW.
    metric_cols = [f'mae_d{i}' for i in range(1, 15)]
    metric_cols.append('mae_overall')
    metric_cols.append('rmse_overall')
    eval_df = eval_df_raw.with_columns([pl.col(name).round(1) for name in metric_cols])
    print(f"Loaded {len(eval_df)} border evaluations")
    print(f"Columns: {eval_df.columns}")
    eval_df.head(38)
    return (eval_df,)
def _(eval_df, mo):
    """Render headline D+1 MAE statistics as a markdown table."""
    d1_series = eval_df['mae_d1']  # primary evaluation metric
    mean_d1 = d1_series.mean()
    median_d1 = d1_series.median()
    min_d1 = d1_series.min()
    max_d1 = d1_series.max()
    total_borders = len(eval_df)
    target_met = (d1_series <= 150).sum()  # borders meeting the ≤150 MW threshold
    mo.md(f"""
    ## 1. Overall Performance Metrics
    ### D+1 Mean Absolute Error (Primary Metric)
    | Statistic | Value | Target | Status |
    |-----------|-------|--------|--------|
    | **Mean** | **{mean_d1:.2f} MW** | ≤134 MW | ✅ **{((134 - mean_d1) / 134 * 100):.0f}% better!** |
    | Median | {median_d1:.2f} MW | - | ✅ Excellent |
    | Min | {min_d1:.2f} MW | - | ✅ Perfect |
    | Max | {max_d1:.2f} MW | - | ⚠️ Outliers present |
    | **Success Rate** | **{target_met}/{total_borders} ({target_met/total_borders*100:.1f}%)** | - | ✅ Very good |
    **Interpretation**: The zero-shot model achieves outstanding performance with mean D+1 MAE of {mean_d1:.2f} MW, significantly beating the 134 MW target. However, 2 outlier borders require attention in Phase 2.
    """)
    return
def _(mo):
    # Section intro for the MAE distribution histogram below.
    mo.md("""
    ### D+1 MAE Distribution
    Distribution of D+1 MAE across all 38 borders, showing the concentration of excellent performance with a few outliers.
    """)
    return
def _(alt, eval_df):
    """Histogram of D+1 MAE over all borders."""
    source = eval_df.to_pandas()  # altair consumes pandas frames
    binned_x = alt.X('mae_d1:Q', bin=alt.Bin(maxbins=20), title='D+1 MAE (MW)')
    count_y = alt.Y('count()', title='Number of Borders')
    hover = [alt.Tooltip('count()', title='Number of Borders')]
    hist_chart = (
        alt.Chart(source)
        .mark_bar()
        .encode(x=binned_x, y=count_y, tooltip=hover)
        .properties(width=600, height=300,
                    title='Distribution of D+1 MAE Across 38 Borders')
    )
    hist_chart
    return
def _(mo):
    # Section intro for the best-performers table below.
    mo.md(
        """
        ## 2. Border-Level Performance
        ### Top 10 Best Performers (Lowest D+1 MAE)
        """
    )
    return
def _(eval_df, pl):
    # Ten borders with the lowest D+1 MAE; error columns rounded to 0.1 MW.
    rounded_cols = [pl.col(c).round(1)
                    for c in ('mae_d1', 'mae_overall', 'rmse_overall')]
    best_performers = eval_df.sort('mae_d1').head(10).with_columns(rounded_cols)
    best_performers.select(['border', 'mae_d1', 'mae_overall', 'rmse_overall'])
    return
def _(mo):
    # Section intro for the worst-performers table below.
    mo.md(
        """
        ### Top 10 Worst Performers (Highest D+1 MAE)
        These borders are candidates for fine-tuning in Phase 2.
        """
    )
    return
def _(eval_df, pl):
    # Ten borders with the highest D+1 MAE; error columns rounded to 0.1 MW.
    rounded_cols = [pl.col(c).round(1)
                    for c in ('mae_d1', 'mae_overall', 'rmse_overall')]
    worst_performers = (
        eval_df.sort('mae_d1', descending=True).head(10).with_columns(rounded_cols)
    )
    worst_performers.select(['border', 'mae_d1', 'mae_overall', 'rmse_overall'])
    return
def _(mo):
    # Section intro for the horizon-degradation analysis below.
    mo.md(
        """
        ## 3. MAE Degradation Over Forecast Horizon
        ### Daily MAE Evolution (D+1 through D+14)
        Analysis of how forecast accuracy degrades over the 14-day horizon.
        """
    )
    return
def _(eval_df, pl):
    """Per-day mean/median MAE across borders for D+1..D+14, rounded for display."""
    daily_mae_df = pl.DataFrame([
        {
            'day': horizon_day,
            'mean_mae': round(eval_df[f'mae_d{horizon_day}'].mean(), 1),
            'median_mae': round(eval_df[f'mae_d{horizon_day}'].median(), 1),
        }
        for horizon_day in range(1, 15)
    ])
    daily_mae_df
    return (daily_mae_df,)
def _(alt, daily_mae_df):
    """Line chart showing how mean MAE grows with the forecast horizon."""
    hover = [
        alt.Tooltip('day:Q', title='Day'),
        alt.Tooltip('mean_mae:Q', title='Mean MAE (MW)', format='.1f'),
        alt.Tooltip('median_mae:Q', title='Median MAE (MW)', format='.1f'),
    ]
    degradation_chart = (
        alt.Chart(daily_mae_df.to_pandas())
        .mark_line(point=True)
        .encode(
            x=alt.X('day:Q', title='Forecast Day', scale=alt.Scale(domain=[1, 14])),
            y=alt.Y('mean_mae:Q', title='Mean MAE (MW)', scale=alt.Scale(zero=True)),
            tooltip=hover,
        )
        .properties(width=700, height=400,
                    title='MAE Degradation Over 14-Day Forecast Horizon')
    )
    degradation_chart
    return
def _(daily_mae_df, mo, pl):
    """Degradation table plus narrative stats, all relative to the D+1 baseline."""
    mae_list = daily_mae_df['mean_mae'].to_list()
    baseline_mae = mae_list[0]
    # Percentage increase of each day's mean MAE over the D+1 baseline.
    pct_expr = ((pl.col('mean_mae') - baseline_mae) / baseline_mae * 100).round(1)
    degradation_table = daily_mae_df.with_columns([pct_expr.alias('pct_increase')])
    # Days called out explicitly in the narrative below.
    degradation_d1_mae = mae_list[0]
    degradation_d2_mae = mae_list[1]
    degradation_d8_mae = mae_list[7]
    degradation_d14_mae = mae_list[13]
    mo.md(f"""
    ### Degradation Statistics
    {mo.as_html(degradation_table.to_pandas())}
    **Key Observations**:
    - D+1 baseline: {degradation_d1_mae:.1f} MW
    - D+2 degradation: {((degradation_d2_mae - degradation_d1_mae) / degradation_d1_mae * 100):.1f}%
    - D+14 final: {degradation_d14_mae:.1f} MW (+{((degradation_d14_mae - degradation_d1_mae) / degradation_d1_mae * 100):.1f}%)
    - Largest jump: D+8 at {degradation_d8_mae:.1f} MW (investigate cause)
    """)
    return
def _(mo):
    # Section intro for the border × day heatmap below.
    mo.md(
        """
        ## 4. Border-Level Heatmap
        ### MAE Across All Borders and Days
        Interactive heatmap showing forecast error evolution for each border over 14 days.
        """
    )
    return
def _(eval_df, pl):
    """Unpivot the 14 per-day MAE columns into long format for the heatmap."""
    day_cols = [f'mae_d{i}' for i in range(1, 15)]
    heatmap_long = (
        eval_df.select(['border'] + day_cols)
        .unpivot(index='border', on=day_cols, variable_name='day', value_name='mae')
        .with_columns([
            # 'mae_d7' -> 7, etc., so the x-axis sorts numerically.
            pl.col('day').str.replace('mae_d', '').cast(pl.Int32),
            pl.col('mae').round(1),  # guard against unrounded values
        ])
    )
    heatmap_long.head()
    return (heatmap_long,)
def _(alt, heatmap_long):
    """Border × day MAE heatmap (red = high error, green = low)."""
    error_colour = alt.Color(
        'mae:Q',
        title='MAE (MW)',
        scale=alt.Scale(scheme='redyellowgreen', reverse=True, domain=[0, 300]),
    )
    heatmap_chart = (
        alt.Chart(heatmap_long.to_pandas())
        .mark_rect()
        .encode(
            x=alt.X('day:O', title='Forecast Day'),
            y=alt.Y('border:N', title='Border', sort='-x'),
            color=error_colour,
            tooltip=['border', 'day', alt.Tooltip('mae:Q', format='.1f')],
        )
        .properties(width=700, height=800,
                    title='MAE Heatmap: All Borders × 14 Days')
    )
    heatmap_chart
    return
def _(mo):
    # Section intro for the outlier analysis below.
    mo.md(
        """
        ## 5. Outlier Analysis
        ### Borders with D+1 MAE > 150 MW
        Detailed analysis of underperforming borders for Phase 2 fine-tuning.
        """
    )
    return
def _(eval_df, pl):
    # Borders breaching the 150 MW D+1 threshold, worst first.
    shown_cols = ['mae_d1', 'mae_d2', 'mae_d7', 'mae_d14',
                  'mae_overall', 'rmse_overall']
    outliers = (
        eval_df.filter(pl.col('mae_d1') > 150)
        .sort('mae_d1', descending=True)
        .with_columns([pl.col(c).round(1) for c in shown_cols])
    )
    outliers.select(['border'] + shown_cols)
    return (outliers,)
def _(mo, outliers):
    """Narrative explanation for each outlier border."""
    # Known root causes; any other border defaults to "Requires investigation".
    known_reasons = {
        'AT_DE': "Bidirectional Austria-Germany flow with high volatility (large capacity, multiple ramping patterns)",
        'FR_DE': "France-Germany high-capacity interconnection with complex market dynamics",
    }
    outlier_analysis = []
    for row in outliers.iter_rows(named=True):
        border = row['border']
        outlier_mae = row['mae_d1']
        reason = known_reasons.get(border, "Requires investigation")
        outlier_analysis.append(f"- **{border}**: {outlier_mae:.1f} MW - {reason}")
    mo.md(f"""
    ### Outlier Investigation
    {chr(10).join(outlier_analysis)}
    **Recommendation**: Fine-tune with LoRA on 6 months of border-specific data in Phase 2.
    """)
    return
def _(mo):
    # Section intro for the performance-category breakdown below.
    mo.md(
        """
        ## 6. Performance Categories
        ### Borders Grouped by D+1 MAE
        Classification of forecast quality across borders.
        """
    )
    return
def _(eval_df, pl):
    """Bucket each border into a quality category by D+1 MAE and count buckets."""
    categorized_df = eval_df.with_columns([
        pl.when(pl.col('mae_d1') <= 10).then(pl.lit('Excellent (≤10 MW)'))
        .when(pl.col('mae_d1') <= 50).then(pl.lit('Good (10-50 MW)'))
        .when(pl.col('mae_d1') <= 150).then(pl.lit('Acceptable (50-150 MW)'))
        .otherwise(pl.lit('Needs Improvement (>150 MW)'))
        .alias('category')
    ])
    # Count borders per category. pl.len() replaces the deprecated pl.count()
    # for row counts in group_by aggregations (deprecated since polars 0.20.5;
    # this file already uses the post-1.0 `unpivot` API elsewhere).
    category_counts = categorized_df.group_by('category').agg([
        pl.len().alias('count')
    ]).sort('count', descending=True)
    category_counts
    return (category_counts,)
def _(alt, category_counts):
    """Donut chart of the performance-category distribution."""
    palette = alt.Scale(
        domain=['Excellent (≤10 MW)', 'Good (10-50 MW)',
                'Acceptable (50-150 MW)', 'Needs Improvement (>150 MW)'],
        range=['#2ecc71', '#3498db', '#f39c12', '#e74c3c'],
    )
    cat_chart = (
        alt.Chart(category_counts.to_pandas())
        .mark_arc(innerRadius=50)
        .encode(
            theta=alt.Theta('count:Q', stack=True),
            color=alt.Color('category:N', scale=palette),
            tooltip=['category', 'count'],
        )
        .properties(width=400, height=400, title='Border Performance Distribution')
    )
    cat_chart
    return
def _(mo):
    # Section intro for the correlation analysis below.
    mo.md(
        """
        ## 7. Statistical Analysis
        ### Correlation Between Overall MAE and D+1 MAE
        """
    )
    return
def _(alt, eval_df):
    """Scatter of D+1 MAE vs overall MAE; outliers (>150 MW) drawn in red."""
    outlier_colour = alt.condition(
        alt.datum.mae_d1 > 150,
        alt.value('#e74c3c'),  # red: over the 150 MW threshold
        alt.value('#3498db'),  # blue: within threshold
    )
    hover = [
        alt.Tooltip('border:N', title='Border'),
        alt.Tooltip('mae_d1:Q', title='D+1 MAE (MW)', format='.1f'),
        alt.Tooltip('mae_overall:Q', title='Overall MAE (MW)', format='.1f'),
    ]
    correlation_chart = (
        alt.Chart(eval_df.to_pandas())
        .mark_point(size=100, opacity=0.7)
        .encode(
            x=alt.X('mae_d1:Q', title='D+1 MAE (MW)'),
            y=alt.Y('mae_overall:Q', title='Overall MAE (MW)'),
            color=outlier_colour,
            tooltip=hover,
        )
        .properties(width=600, height=400,
                    title='Correlation: D+1 MAE vs Overall MAE')
    )
    correlation_chart
    return
def _(eval_df, mo, np):
    """Pearson correlation between D+1 MAE and overall MAE, with interpretation."""
    corr_d1_overall = np.corrcoef(
        eval_df['mae_d1'].to_numpy(),
        eval_df['mae_overall'].to_numpy(),
    )[0, 1]
    if corr_d1_overall > 0.7:
        interpretation = "Strong positive correlation indicates D+1 performance is a good predictor of overall forecast quality."
    else:
        interpretation = "Moderate correlation suggests D+1 and overall MAE have some relationship."
    mo.md(f"""
    **Pearson Correlation**: {corr_d1_overall:.3f}
    {interpretation}
    """)
    return
def _(mo):
    # Section intro for the findings/recommendations cell below.
    mo.md(
        """
        ## 8. Key Findings & Recommendations
        ### Summary of Evaluation Results
        """
    )
    return
def _(eval_df, mo):
    # Summary cell: computed headline counts plus hand-written findings and
    # Phase 2 recommendations.
    # NOTE(review): several figures in the prose (15.92 MW, 94.7%, the D+8
    # spike, costs) are hard-coded rather than computed — confirm they still
    # match the loaded results CSV before re-publishing.
    perfect_borders = (eval_df['mae_d1'] == 0).sum()  # borders with zero D+1 error
    low_error_borders = (eval_df['mae_d1'] <= 10).sum()  # near-perfect borders
    high_error_borders = (eval_df['mae_d1'] > 150).sum()  # borders over threshold
    mo.md(f"""
    ### Key Findings
    1. **Exceptional Zero-Shot Performance**
    - {perfect_borders} borders have ZERO D+1 MAE (perfect forecasts)
    - {low_error_borders} borders have D+1 MAE ≤10 MW (near-perfect)
    - Mean D+1 MAE of 15.92 MW is 88% better than the 134 MW target
    2. **Multivariate Features Provide Strong Signal**
    - 615 covariate features (weather, generation, CNEC outages) enable accurate zero-shot forecasting
    - No model training required - pre-trained Chronos-2 generalizes well
    3. **Outliers Identified for Phase 2**
    - {high_error_borders} borders exceed 150 MW threshold
    - AT_DE (266 MW) and FR_DE (181 MW) require fine-tuning
    - Complex bidirectional flows and high volatility are main challenges
    4. **Forecast Degradation Analysis**
    - Accuracy degrades reasonably over 14-day horizon
    - D+2: +7.6% degradation (excellent)
    - D+14: +90.4% degradation (acceptable for long-range forecasts)
    - D+8 spike (38.42 MW, +141%) requires investigation
    ### Phase 2 Recommendations
    **Priority 1: Fine-Tune Outlier Borders**
    - Apply LoRA fine-tuning to AT_DE and FR_DE
    - Use 6 months of border-specific data
    - Expected improvement: 40-60% MAE reduction
    - Timeline: 2-3 weeks
    **Priority 2: Investigate D+8 Spike**
    - Analyze why D+8 has larger errors than D+14
    - Check for systematic patterns or data quality issues
    - Timeline: 1 week
    **Priority 3: Extend Context Window**
    - Increase from 128h to 512h for better pattern learning
    - Verify no OOM on A100 GPU
    - Expected improvement: 10-15% overall MAE reduction
    - Timeline: 1 week
    **Priority 4: Feature Engineering**
    - Add scheduled outages, cross-border ramping constraints
    - Refine CNEC weighting based on binding frequency
    - Expected improvement: 5-10% MAE reduction
    - Timeline: 2 weeks
    ### Production Readiness
    ✅ **Ready for Deployment**
    - Zero-shot model achieves target (15.92 MW < 134 MW)
    - Inference time acceptable (3.45 min for 38 borders)
    - 94.7% of borders meet quality threshold
    - API deployed on HuggingFace Space (A100 GPU)
    ⚠️ **Monitor These Borders**
    - AT_DE, FR_DE require manual review
    - Consider ensemble methods or manual adjustments for outliers
    ### Cost & Infrastructure
    - **GPU**: A100-large (40-80 GB VRAM) required for multivariate forecasting
    - **Cost**: ~$500/month for 24/7 API access
    - **Alternative**: Run batched forecasts on smaller GPU (A10G) to reduce costs
    ---
    **Document Version**: 1.0.0
    **Evaluation Date**: 2024-10-01 to 2024-10-14
    **Model**: amazon/chronos-2 (zero-shot, 615 features)
    **Author**: FBMC Forecasting Team
    """)
    return
if __name__ == "__main__":
    # Allow running the notebook as a plain script; marimo executes the cells.
    app.run()