Gil Stetler committed on
Commit
a5e3343
·
1 Parent(s): 92e4d77

pipeline included

Files changed (2)
  1. app.py +342 -108
  2. pipeline_v2.py +189 -0
app.py CHANGED
@@ -306,6 +306,212 @@
306
  #
307
  #
308
  #
309
  import os, random
310
  import numpy as np
311
  import pandas as pd
@@ -316,18 +522,21 @@ matplotlib.use("Agg")
316
  import matplotlib.pyplot as plt
317
  from chronos import ChronosPipeline
318
 
 
 
 
319
  # --------------------
320
  # Config
321
  # --------------------
322
  MODEL_ID = "amazon/chronos-t5-large"
323
- PREDICTION_LENGTH = 30 # last 30 days
324
- NUM_SAMPLES = 1 # single path -> day-by-day point forecast
325
- RV_WINDOW = 20
326
- ANNUALIZE = True
327
  EPS = 1e-8
328
 
329
  # --------------------
330
- # Model load
331
  # --------------------
332
  device = "cuda" if torch.cuda.is_available() else "cpu"
333
  dtype = torch.bfloat16 if device == "cuda" else torch.float32
@@ -341,21 +550,16 @@ pipe = ChronosPipeline.from_pretrained(
341
  # --------------------
342
  # Helpers
343
  # --------------------
344
- def _read_ohlcv_csv():
345
- for p in ["/mnt/data/ohlcv_clean.csv", "ohlcv_clean.csv"]:
346
- if os.path.exists(p):
347
- return pd.read_csv(p)
348
- raise gr.Error("CSV nicht gefunden. Lege sie unter /mnt/data/ohlcv_clean.csv oder ./ohlcv_clean.csv ab.")
349
-
350
  def _extract_close(df: pd.DataFrame) -> pd.Series:
351
  mapping = {c.lower(): c for c in df.columns}
352
  for name in ["close", "adj close", "adj_close", "price"]:
353
  if name in mapping:
354
- return pd.Series(df[mapping[name]].astype(float))
355
- numeric_cols = df.select_dtypes(include=[np.number]).columns
356
- if len(numeric_cols) == 0:
357
- raise gr.Error("Keine numerische Preisspalte gefunden (z.B. Close).")
358
- return pd.Series(df[numeric_cols[-1]].astype(float))
 
359
 
360
  def _extract_dates(df: pd.DataFrame):
361
  mapping = {c.lower(): c for c in df.columns}
@@ -365,6 +569,12 @@ def _extract_dates(df: pd.DataFrame):
365
  return pd.to_datetime(df[mapping[name]]).to_numpy()
366
  except Exception:
367
  pass
368
  return np.arange(len(df))
369
 
370
  def compute_realized_vol(close: pd.Series, window: int = 20, annualize: bool = True) -> pd.Series:
@@ -374,138 +584,162 @@ def compute_realized_vol(close: pd.Series, window: int = 20, annualize: bool = T
374
  rv = rv * np.sqrt(252.0)
375
  return rv.dropna().reset_index(drop=True)
376
377
  # --------------------
378
- # Main
379
  # --------------------
380
- def run_vol_forecast_and_evaluate():
381
- # Load data
382
- raw = _read_ohlcv_csv()
383
- dates = _extract_dates(raw)
384
- close = _extract_close(raw)
385
 
386
- # Realized Volatility
387
  rv = compute_realized_vol(close, window=RV_WINDOW, annualize=ANNUALIZE).to_numpy()
388
  n = len(rv); H = PREDICTION_LENGTH
389
  if n <= H + 5:
390
- raise gr.Error(f"RV-Serie zu kurz nach Rolling. Benötigt > {H+5}, erhalten {n}.")
391
 
392
- # Split
393
  rv_train = rv[: n - H]
394
  rv_test = rv[n - H :]
395
 
396
- # Forecast a single sample path
397
  random.seed(0); np.random.seed(0); torch.manual_seed(0)
398
  if torch.cuda.is_available():
399
  torch.cuda.manual_seed_all(0)
400
 
401
  context = torch.tensor(rv_train, dtype=torch.float32)
402
- fcst = pipe.predict(context, prediction_length=H, num_samples=NUM_SAMPLES) # [1,1,H]
403
- samples = fcst[0].cpu().numpy()
404
- path_pred = samples[0] # (H,) — point forecast
405
-
406
- # --------------------
407
- # Bias/scale calibration
408
- # --------------------
409
- # choose α so that the MSE between α*pred and actual is minimized
410
- alpha = float(np.sum(rv_test * path_pred) / np.sum(path_pred**2 + EPS))
411
- path_pred_cal = alpha * path_pred
412
-
413
- # Errors (original & calibrated)
414
- def metrics(y_true, y_pred):
415
- err = y_pred - y_true
416
- denom = np.maximum(EPS, np.abs(y_true))
417
- abs_pct_err = np.abs(err) / denom * 100
418
- pct_err = err / np.maximum(EPS, y_true) * 100
419
- return {
420
- "MAPE": abs_pct_err.mean(),
421
- "MPE": pct_err.mean(),
422
- "RMSE": np.sqrt(np.mean(err**2))
423
- }
424
-
425
- m_orig = metrics(rv_test, path_pred)
426
- m_cal = metrics(rv_test, path_pred_cal)
427
-
428
- # --------------------
429
- # Plot
430
- # --------------------
431
- fig = plt.figure(figsize=(10, 4))
432
- H0 = len(rv_train)
433
- if isinstance(dates, np.ndarray) and dates.shape[0] >= len(close):
434
- dates_rv = np.array(dates[-len(rv):])
435
- plt.plot(dates_rv[:H0], rv_train, label="realized vol (history)")
436
- plt.plot(dates_rv[H0:], rv_test, label="actual (holdout)")
437
- plt.plot(dates_rv[H0:], path_pred, linestyle="--", label="forecast (raw)")
438
- plt.plot(dates_rv[H0:], path_pred_cal, linestyle="--", label=f"forecast (calibrated, α={alpha:.3f})")
439
- plt.xlabel("date")
440
  else:
441
- x_all = np.arange(len(rv)); x_fcst = np.arange(H0, H0 + H)
442
- plt.plot(x_all[:H0], rv_train, label="realized vol (history)")
443
- plt.plot(x_fcst, rv_test, label="actual (holdout)")
444
- plt.plot(x_fcst, path_pred, linestyle="--", label="forecast (raw)")
445
- plt.plot(x_fcst, path_pred_cal, linestyle="--", label=f"forecast (calibrated, α={alpha:.3f})")
446
- plt.xlabel("time index")
447
 
448
- plt.title(f"Volatility Forecast (RV window={RV_WINDOW}, H={H})")
449
- plt.ylabel("realized volatility")
450
- plt.legend(loc="best")
451
- plt.tight_layout()
452
 
453
- # --------------------
454
- # Per-day table
455
- # --------------------
456
  if isinstance(dates, np.ndarray) and dates.shape[0] >= len(close):
 
457
  dates_rv = np.array(dates[-len(rv):])
458
- last_dates = dates_rv[H0:]
 
 
459
  else:
460
- last_dates = np.arange(H)
 
 
461
 
462
- abs_pct_err_orig = np.abs((path_pred - rv_test) / np.maximum(EPS, np.abs(rv_test))) * 100
463
- abs_pct_err_cal = np.abs((path_pred_cal - rv_test) / np.maximum(EPS, np.abs(rv_test))) * 100
 
 
 
464
 
465
  df_days = pd.DataFrame({
466
  "date": last_dates,
467
  "actual_vol": rv_test,
468
  "forecast_raw": path_pred,
469
- "forecast_calibrated": path_pred_cal,
470
- "abs_error_raw": np.abs(path_pred - rv_test),
471
- "abs_pct_error_raw_%": abs_pct_err_orig,
472
- "abs_pct_error_cal_%": abs_pct_err_cal,
473
  })
474
 
475
- # --------------------
476
- # Outputs
477
- # --------------------
478
- out_json = {
479
- "alpha": alpha,
480
- "metrics_raw": {k: round(v, 4) for k, v in m_orig.items()},
481
- "metrics_calibrated": {k: round(v, 4) for k, v in m_cal.items()},
482
  }
 
483
 
484
- metrics_md = (
485
- f"**Bias-/Scale-Kalibrierung** α = {alpha:.3f}\n\n"
486
- f"**RAW:** MAPE {m_orig['MAPE']:.2f}% | MPE {m_orig['MPE']:.2f}% | RMSE {m_orig['RMSE']:.5f}\n"
487
- f"**CALIBRATED:** MAPE {m_cal['MAPE']:.2f}% | MPE {m_cal['MPE']:.2f}% | RMSE {m_cal['RMSE']:.5f}"
488
- )
489
 
490
- return fig, out_json, df_days, metrics_md
491
 
492
  # --------------------
493
  # UI
494
  # --------------------
495
- with gr.Blocks(title="Volatility Forecast • mit Bias-/Scale-Kalibrierung") as demo:
496
  gr.Markdown(
497
- "## Letzte 30 Tage Volatilität (mit automatischer Bias-/Scale-Kalibrierung)\n"
498
- "- Prognose einer einzelnen Sample-Bahn (kein Mittelwert, kein Median).\n"
499
- "- Anschließend wird ein Skalierungsfaktor α berechnet, um systematische Unter-/Überschätzung zu korrigieren.\n"
500
- "- Darstellung: Forecast (roh) & Forecast (kalibriert)."
 
501
  )
502
  run_btn = gr.Button("Run", variant="primary")
503
- plot = gr.Plot(label="Forecast vs Actual (roh & kalibriert)")
504
- meta = gr.JSON(label="Kalibrierungsparameter & Metriken")
505
- table = gr.Dataframe(label="Per-Day Vergleich", wrap=True)
506
- metrics = gr.Markdown(label="Zusammenfassung")
507
 
508
- run_btn.click(run_vol_forecast_and_evaluate, inputs=None, outputs=[plot, meta, table, metrics])
509
 
510
  if __name__ == "__main__":
511
  demo.launch()
 
306
  #
307
  #
308
  #
309
+ #import os, random
310
+ #import numpy as np
311
+ #import pandas as pd
312
+ #import torch
313
+ #import gradio as gr
314
+ #import matplotlib
315
+ #matplotlib.use("Agg")
316
+ #import matplotlib.pyplot as plt
317
+ #from chronos import ChronosPipeline
318
+ #
319
+ ## --------------------
320
+ ## Config
321
+ ## --------------------
322
+ #MODEL_ID = "amazon/chronos-t5-large"
323
+ #PREDICTION_LENGTH = 30 # last 30 days
324
+ #NUM_SAMPLES = 1 # single path -> day-by-day point forecast
325
+ #RV_WINDOW = 20
326
+ #ANNUALIZE = True
327
+ #EPS = 1e-8
328
+ #
329
+ ## --------------------
330
+ ## Model load
331
+ ## --------------------
332
+ #device = "cuda" if torch.cuda.is_available() else "cpu"
333
+ #dtype = torch.bfloat16 if device == "cuda" else torch.float32
334
+ #
335
+ #pipe = ChronosPipeline.from_pretrained(
336
+ # MODEL_ID,
337
+ # device_map="auto",
338
+ # torch_dtype=dtype,
339
+ #)
340
+ #
341
+ ## --------------------
342
+ ## Helpers
343
+ ## --------------------
344
+ #def _read_ohlcv_csv():
345
+ # for p in ["/mnt/data/ohlcv_clean.csv", "ohlcv_clean.csv"]:
346
+ # if os.path.exists(p):
347
+ # return pd.read_csv(p)
348
+ # raise gr.Error("CSV nicht gefunden. Lege sie unter /mnt/data/ohlcv_clean.csv oder ./ohlcv_clean.csv ab.")
349
+ #
350
+ #def _extract_close(df: pd.DataFrame) -> pd.Series:
351
+ # mapping = {c.lower(): c for c in df.columns}
352
+ # for name in ["close", "adj close", "adj_close", "price"]:
353
+ # if name in mapping:
354
+ # return pd.Series(df[mapping[name]].astype(float))
355
+ # numeric_cols = df.select_dtypes(include=[np.number]).columns
356
+ # if len(numeric_cols) == 0:
357
+ # raise gr.Error("Keine numerische Preisspalte gefunden (z.B. Close).")
358
+ # return pd.Series(df[numeric_cols[-1]].astype(float))
359
+ #
360
+ #def _extract_dates(df: pd.DataFrame):
361
+ # mapping = {c.lower(): c for c in df.columns}
362
+ # for name in ["date", "time", "timestamp"]:
363
+ # if name in mapping:
364
+ # try:
365
+ # return pd.to_datetime(df[mapping[name]]).to_numpy()
366
+ # except Exception:
367
+ # pass
368
+ # return np.arange(len(df))
369
+ #
370
+ #def compute_realized_vol(close: pd.Series, window: int = 20, annualize: bool = True) -> pd.Series:
371
+ # r = np.log(close).diff().dropna()
372
+ # rv = r.rolling(window, min_periods=window).std()
373
+ # if annualize:
374
+ # rv = rv * np.sqrt(252.0)
375
+ # return rv.dropna().reset_index(drop=True)
376
+ #
377
+ ## --------------------
378
+ ## Main
379
+ ## --------------------
380
+ #def run_vol_forecast_and_evaluate():
381
+ # # Load data
382
+ # raw = _read_ohlcv_csv()
383
+ # dates = _extract_dates(raw)
384
+ # close = _extract_close(raw)
385
+ #
386
+ # # Realized Volatility
387
+ # rv = compute_realized_vol(close, window=RV_WINDOW, annualize=ANNUALIZE).to_numpy()
388
+ # n = len(rv); H = PREDICTION_LENGTH
389
+ # if n <= H + 5:
390
+ # raise gr.Error(f"RV-Serie zu kurz nach Rolling. Benötigt > {H+5}, erhalten {n}.")
391
+ #
392
+ # # Split
393
+ # rv_train = rv[: n - H]
394
+ # rv_test = rv[n - H :]
395
+ #
396
+ # # Forecast a single sample path
397
+ # random.seed(0); np.random.seed(0); torch.manual_seed(0)
398
+ # if torch.cuda.is_available():
399
+ # torch.cuda.manual_seed_all(0)
400
+ #
401
+ # context = torch.tensor(rv_train, dtype=torch.float32)
402
+ # fcst = pipe.predict(context, prediction_length=H, num_samples=NUM_SAMPLES) # [1,1,H]
403
+ # samples = fcst[0].cpu().numpy()
404
+ # path_pred = samples[0] # (H,) — point forecast
405
+ #
406
+ # # --------------------
407
+ # # Bias/scale calibration
408
+ # # --------------------
409
+ # # choose α so that the MSE between α*pred and actual is minimized
410
+ # alpha = float(np.sum(rv_test * path_pred) / np.sum(path_pred**2 + EPS))
411
+ # path_pred_cal = alpha * path_pred
412
+ #
413
+ # # Errors (original & calibrated)
414
+ # def metrics(y_true, y_pred):
415
+ # err = y_pred - y_true
416
+ # denom = np.maximum(EPS, np.abs(y_true))
417
+ # abs_pct_err = np.abs(err) / denom * 100
418
+ # pct_err = err / np.maximum(EPS, y_true) * 100
419
+ # return {
420
+ # "MAPE": abs_pct_err.mean(),
421
+ # "MPE": pct_err.mean(),
422
+ # "RMSE": np.sqrt(np.mean(err**2))
423
+ # }
424
+ #
425
+ # m_orig = metrics(rv_test, path_pred)
426
+ # m_cal = metrics(rv_test, path_pred_cal)
427
+ #
428
+ # # --------------------
429
+ # # Plot
430
+ # # --------------------
431
+ # fig = plt.figure(figsize=(10, 4))
432
+ # H0 = len(rv_train)
433
+ # if isinstance(dates, np.ndarray) and dates.shape[0] >= len(close):
434
+ # dates_rv = np.array(dates[-len(rv):])
435
+ # plt.plot(dates_rv[:H0], rv_train, label="realized vol (history)")
436
+ # plt.plot(dates_rv[H0:], rv_test, label="actual (holdout)")
437
+ # plt.plot(dates_rv[H0:], path_pred, linestyle="--", label="forecast (raw)")
438
+ # plt.plot(dates_rv[H0:], path_pred_cal, linestyle="--", label=f"forecast (calibrated, α={alpha:.3f})")
439
+ # plt.xlabel("date")
440
+ # else:
441
+ # x_all = np.arange(len(rv)); x_fcst = np.arange(H0, H0 + H)
442
+ # plt.plot(x_all[:H0], rv_train, label="realized vol (history)")
443
+ # plt.plot(x_fcst, rv_test, label="actual (holdout)")
444
+ # plt.plot(x_fcst, path_pred, linestyle="--", label="forecast (raw)")
445
+ # plt.plot(x_fcst, path_pred_cal, linestyle="--", label=f"forecast (calibrated, α={alpha:.3f})")
446
+ # plt.xlabel("time index")
447
+ #
448
+ # plt.title(f"Volatility Forecast (RV window={RV_WINDOW}, H={H})")
449
+ # plt.ylabel("realized volatility")
450
+ # plt.legend(loc="best")
451
+ # plt.tight_layout()
452
+ #
453
+ # # --------------------
454
+ # # Per-day table
455
+ # # --------------------
456
+ # if isinstance(dates, np.ndarray) and dates.shape[0] >= len(close):
457
+ # dates_rv = np.array(dates[-len(rv):])
458
+ # last_dates = dates_rv[H0:]
459
+ # else:
460
+ # last_dates = np.arange(H)
461
+ #
462
+ # abs_pct_err_orig = np.abs((path_pred - rv_test) / np.maximum(EPS, np.abs(rv_test))) * 100
463
+ # abs_pct_err_cal = np.abs((path_pred_cal - rv_test) / np.maximum(EPS, np.abs(rv_test))) * 100
464
+ #
465
+ # df_days = pd.DataFrame({
466
+ # "date": last_dates,
467
+ # "actual_vol": rv_test,
468
+ # "forecast_raw": path_pred,
469
+ # "forecast_calibrated": path_pred_cal,
470
+ # "abs_error_raw": np.abs(path_pred - rv_test),
471
+ # "abs_pct_error_raw_%": abs_pct_err_orig,
472
+ # "abs_pct_error_cal_%": abs_pct_err_cal,
473
+ # })
474
+ #
475
+ # # --------------------
476
+ # # Outputs
477
+ # # --------------------
478
+ # out_json = {
479
+ # "alpha": alpha,
480
+ # "metrics_raw": {k: round(v, 4) for k, v in m_orig.items()},
481
+ # "metrics_calibrated": {k: round(v, 4) for k, v in m_cal.items()},
482
+ # }
483
+ #
484
+ # metrics_md = (
485
+ # f"**Bias-/Scale-Kalibrierung** α = {alpha:.3f}\n\n"
486
+ # f"**RAW:** MAPE {m_orig['MAPE']:.2f}% | MPE {m_orig['MPE']:.2f}% | RMSE {m_orig['RMSE']:.5f}\n"
487
+ # f"**CALIBRATED:** MAPE {m_cal['MAPE']:.2f}% | MPE {m_cal['MPE']:.2f}% | RMSE {m_cal['RMSE']:.5f}"
488
+ # )
489
+ #
490
+ # return fig, out_json, df_days, metrics_md
491
+ #
492
+ ## --------------------
493
+ ## UI
494
+ ## --------------------
495
+ #with gr.Blocks(title="Volatility Forecast • with bias/scale calibration") as demo:
496
+ # gr.Markdown(
497
+ # "## Letzte 30 Tage Volatilität (mit automatischer Bias-/Scale-Kalibrierung)\n"
498
+ # "- Prognose einer einzelnen Sample-Bahn (kein Mittelwert, kein Median).\n"
499
+ # "- Anschließend wird ein Skalierungsfaktor α berechnet, um systematische Unter-/Überschätzung zu korrigieren.\n"
500
+ # "- Darstellung: Forecast (roh) & Forecast (kalibriert)."
501
+ # )
502
+ # run_btn = gr.Button("Run", variant="primary")
503
+ # plot = gr.Plot(label="Forecast vs Actual (roh & kalibriert)")
504
+ # meta = gr.JSON(label="Kalibrierungsparameter & Metriken")
505
+ # table = gr.Dataframe(label="Per-Day Vergleich", wrap=True)
506
+ # metrics = gr.Markdown(label="Zusammenfassung")
507
+ #
508
+ # run_btn.click(run_vol_forecast_and_evaluate, inputs=None, outputs=[plot, meta, table, metrics])
509
+ #
510
+ #if __name__ == "__main__":
511
+ # demo.launch()
512
+ #
513
+
514
+
515
  import os, random
516
  import numpy as np
517
  import pandas as pd
 
522
  import matplotlib.pyplot as plt
523
  from chronos import ChronosPipeline
524
 
525
+ # >>> import your pipeline <<<
526
+ import pipeline_v2 as pipe2 # provides update_ticker_csv(...); pipeline_v2.py is added at the repo root in this commit
527
+
528
  # --------------------
529
  # Config
530
  # --------------------
531
  MODEL_ID = "amazon/chronos-t5-large"
532
+ PREDICTION_LENGTH = 30 # forecast last 30 days
533
+ NUM_SAMPLES = 1 # single path -> day-by-day point prediction
534
+ RV_WINDOW = 20 # realized vol window (trading days)
535
+ ANNUALIZE = True # annualize by sqrt(252)
536
  EPS = 1e-8
537
 
538
  # --------------------
539
+ # Model load (once)
540
  # --------------------
541
  device = "cuda" if torch.cuda.is_available() else "cpu"
542
  dtype = torch.bfloat16 if device == "cuda" else torch.float32
 
550
  # --------------------
551
  # Helpers
552
  # --------------------
553
  def _extract_close(df: pd.DataFrame) -> pd.Series:
554
  mapping = {c.lower(): c for c in df.columns}
555
  for name in ["close", "adj close", "adj_close", "price"]:
556
  if name in mapping:
557
+ return pd.Series(df[mapping[name]]).astype(float)
558
+ # fallback: last numeric column
559
+ num_cols = df.select_dtypes(include=[np.number]).columns
560
+ if len(num_cols) == 0:
561
+ raise gr.Error("Could not find a numeric price column (e.g., Close).")
562
+ return pd.Series(df[num_cols[-1]]).astype(float)
563
 
564
  def _extract_dates(df: pd.DataFrame):
565
  mapping = {c.lower(): c for c in df.columns}
 
569
  return pd.to_datetime(df[mapping[name]]).to_numpy()
570
  except Exception:
571
  pass
572
+ # If the CSV has a Date index, respect that
573
+ if df.index.name is not None:
574
+ try:
575
+ return pd.to_datetime(df.index).to_numpy()
576
+ except Exception:
577
+ pass
578
  return np.arange(len(df))
579
 
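For reference, the two extractors line up with the cache format read back later in run_for_ticker: read_csv(index_col=0, parse_dates=[0]) puts the date into the index, so the new index-name branch above supplies the dates, while _extract_close prefers a Close-like column and otherwise falls back to the last numeric column. A minimal standalone sketch of that resolution on a toy frame (not repository data):

import numpy as np
import pandas as pd

# toy frame shaped like a cached OHLCV file after read_csv(index_col=0, parse_dates=[0])
df = pd.DataFrame(
    {"Open": [100.0, 101.0], "Close": [101.0, 102.0], "Volume": [1000, 1200]},
    index=pd.DatetimeIndex(["2024-01-02", "2024-01-03"], name="Date"),
)

mapping = {c.lower(): c for c in df.columns}
print("close" in mapping)                   # True -> _extract_close returns the Close column
print(pd.to_datetime(df.index).to_numpy())  # what the index-name branch of _extract_dates returns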
580
  def compute_realized_vol(close: pd.Series, window: int = 20, annualize: bool = True) -> pd.Series:
 
584
  rv = rv * np.sqrt(252.0)
585
  return rv.dropna().reset_index(drop=True)
586
 
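compute_realized_vol is the rolling standard deviation of daily log returns, annualized by sqrt(252), so the forecast target is a smoothed, annualized vol series rather than raw returns. A standalone sketch with toy prices and a 3-day window (the app uses RV_WINDOW=20):

import numpy as np
import pandas as pd

close = pd.Series([100.0, 101.0, 99.5, 102.0, 103.5, 102.8, 104.1])  # toy closes, not repository data
log_ret = np.log(close).diff().dropna()          # daily log returns
rv = log_ret.rolling(3, min_periods=3).std()     # rolling std over the window
rv_annualized = (rv * np.sqrt(252.0)).dropna()   # annualize by sqrt(252) trading days
print(rv_annualized.reset_index(drop=True))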
587
+ def bias_scale_calibration(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[float, np.ndarray]:
588
+ """Return alpha and calibrated predictions alpha * y_pred (MSE-optimal scaling)."""
589
+ alpha = float(np.sum(y_true * y_pred) / (np.sum(y_pred**2) + EPS))
590
+ return alpha, alpha * y_pred
591
+
592
+ def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
593
+ err = y_pred - y_true
594
+ denom = np.maximum(EPS, np.abs(y_true))
595
+ mape = float((np.abs(err) / denom).mean() * 100)
596
+ mpe = float((err / np.maximum(EPS, y_true)).mean() * 100)
597
+ rmse = float(np.sqrt(np.mean(err**2)))
598
+ return {"MAPE": mape, "MPE": mpe, "RMSE": rmse}
599
+
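bias_scale_calibration is the closed-form least-squares scale: α = argmin_a Σ(y_true − a·y_pred)² = Σ(y_true·y_pred) / (Σ y_pred² + EPS), so the scaled path cannot end up with a higher MSE than the raw path on the series it was fit on (up to the EPS guard). Note that run_for_ticker fits α against the same 30-day holdout it then reports metrics on. A toy check of the identity, with made-up numbers:

import numpy as np

EPS = 1e-8
y_true = np.array([0.20, 0.22, 0.25])   # made-up "actual" vols
y_pred = np.array([0.16, 0.18, 0.21])   # made-up raw forecasts

alpha = float(np.sum(y_true * y_pred) / (np.sum(y_pred**2) + EPS))  # least-squares scale
y_cal = alpha * y_pred
print(alpha)
print(np.mean((y_true - y_pred) ** 2), np.mean((y_true - y_cal) ** 2))  # MSE drops after scaling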
600
  # --------------------
601
+ # Core routine
602
  # --------------------
603
+ def run_for_ticker(tickers: str, start: str, interval: str, use_calibration: bool):
604
+ """
605
+ tickers: comma/space separated (first is used for plotting/eval)
606
+ start: YYYY-MM-DD
607
+ interval: '1d', '1wk', '1mo' (yfinance-safe)
608
+ use_calibration: whether to apply bias/scale calibration on the 30-day path
609
+ """
610
+ # parse first ticker
611
+ tick_list = [t.strip().upper() for t in tickers.replace(";", ",").replace("|", ",").split(",") if t.strip()]
612
+ if not tick_list:
613
+ raise gr.Error("Please enter at least one ticker (e.g., AAPL).")
614
+ ticker = tick_list[0]
615
+
616
+ # 1) Fetch/update CSV via your pipeline
617
+ csv_path = pipe2.update_ticker_csv(ticker, start=start, interval=interval)
618
+
619
+ # 2) Load CSV and build realized vol
620
+ df = pd.read_csv(csv_path, index_col=0, parse_dates=[0])
621
+ dates = _extract_dates(df)
622
+ close = _extract_close(df)
623
 
 
624
  rv = compute_realized_vol(close, window=RV_WINDOW, annualize=ANNUALIZE).to_numpy()
625
  n = len(rv); H = PREDICTION_LENGTH
626
  if n <= H + 5:
627
+ raise gr.Error(f"Vol series too short after rolling window. Need > {H+5}, got {n}.")
628
 
 
629
  rv_train = rv[: n - H]
630
  rv_test = rv[n - H :]
631
 
632
+ # 3) Forecast a single sample path (deterministic via seed)
633
  random.seed(0); np.random.seed(0); torch.manual_seed(0)
634
  if torch.cuda.is_available():
635
  torch.cuda.manual_seed_all(0)
636
 
637
  context = torch.tensor(rv_train, dtype=torch.float32)
638
+ fcst = pipe.predict(context, prediction_length=H, num_samples=NUM_SAMPLES) # [1, 1, H]
639
+ samples = fcst[0].cpu().numpy() # (1, H)
640
+ path_pred = samples[0] # (H,)
641
+
642
+ # 4) (Optional) bias/scale calibration
643
+ alpha = None
644
+ if use_calibration:
645
+ alpha, path_pred_cal = bias_scale_calibration(rv_test, path_pred)
646
+ metrics_raw = compute_metrics(rv_test, path_pred)
647
+ metrics_cal = compute_metrics(rv_test, path_pred_cal)
648
  else:
649
+ metrics_raw = compute_metrics(rv_test, path_pred)
650
+ metrics_cal = None
651
+ path_pred_cal = None
 
 
 
652
 
653
+ # 5) Plot
654
+ fig = plt.figure(figsize=(10, 4))
655
+ H0 = len(rv_train)
 
656
 
657
+ # choose proper x-axis
 
 
658
  if isinstance(dates, np.ndarray) and dates.shape[0] >= len(close):
659
+ # Align dates to rv length (after rolling dropna)
660
  dates_rv = np.array(dates[-len(rv):])
661
+ x_hist = dates_rv[:H0]
662
+ x_fcst = dates_rv[H0:]
663
+ x_lbl = "date"
664
  else:
665
+ x_hist = np.arange(H0)
666
+ x_fcst = np.arange(H0, H0 + H)
667
+ x_lbl = "time index"
668
 
669
+ plt.plot(x_hist, rv_train, label="realized vol (history)")
670
+ plt.plot(x_fcst, rv_test, label="realized vol (actual last 30)")
671
+ plt.plot(x_fcst, path_pred, linestyle="--", label="forecast (raw path)")
672
+ if use_calibration:
673
+ plt.plot(x_fcst, path_pred_cal, linestyle="--", label=f"forecast (calibrated, α={alpha:.3f})")
674
 
675
+ plt.title(f"{ticker} — Volatility Forecast (RV={RV_WINDOW}, H={H}, interval={interval})")
676
+ plt.xlabel(x_lbl); plt.ylabel("realized volatility")
677
+ plt.legend(loc="best"); plt.tight_layout()
678
+
679
+ # 6) Per-day table
680
+ last_dates = x_fcst
681
  df_days = pd.DataFrame({
682
  "date": last_dates,
683
  "actual_vol": rv_test,
684
  "forecast_raw": path_pred,
685
  })
686
+ if use_calibration:
687
+ df_days["forecast_calibrated"] = path_pred_cal
688
+ df_days["abs_pct_error_raw_%"] = np.abs((path_pred - rv_test) / np.maximum(EPS, np.abs(rv_test))) * 100
689
+ df_days["abs_pct_error_cal_%"] = np.abs((path_pred_cal - rv_test) / np.maximum(EPS, np.abs(rv_test))) * 100
690
+ else:
691
+ df_days["abs_pct_error_raw_%"] = np.abs((path_pred - rv_test) / np.maximum(EPS, np.abs(rv_test))) * 100
692
 
693
+ # 7) JSON + metrics text
694
+ out = {
695
+ "ticker": ticker,
696
+ "csv_path": csv_path,
697
+ "config": {
698
+ "start": start,
699
+ "interval": interval,
700
+ "rv_window": RV_WINDOW,
701
+ "prediction_length": H,
702
+ "num_samples": NUM_SAMPLES,
703
+ "annualized": ANNUALIZE,
704
+ "point_forecast": "single_sample_path",
705
+ },
706
+ "metrics_raw": {k: round(v, 4) for k, v in metrics_raw.items()},
707
  }
708
+ metrics_md = f"**RAW** — MAPE {metrics_raw['MAPE']:.2f}% | MPE {metrics_raw['MPE']:.2f}% | RMSE {metrics_raw['RMSE']:.5f}"
709
 
710
+ if use_calibration and metrics_cal is not None:
711
+ out["alpha"] = alpha
712
+ out["metrics_calibrated"] = {k: round(v, 4) for k, v in metrics_cal.items()}
713
+ metrics_md += f"\n**CALIBRATED** MAPE {metrics_cal['MAPE']:.2f}% | MPE {metrics_cal['MPE']:.2f}% | RMSE {metrics_cal['RMSE']:.5f}"
 
714
 
715
+ return fig, out, df_days, metrics_md
716
 
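A quick way to exercise run_for_ticker without the Gradio UI; a sketch, assuming this file is importable as app, that pipeline_v2 can reach Yahoo Finance, and noting that importing app loads the Chronos model:

from app import run_for_ticker  # loads the Chronos model at import time

fig, meta, df_days, summary = run_for_ticker("AAPL", start="2015-01-01", interval="1d", use_calibration=True)
print(summary)
print(df_days.head())
fig.savefig("aapl_vol_forecast.png")  # the same figure the UI renders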
717
  # --------------------
718
  # UI
719
  # --------------------
720
+ with gr.Blocks(title="Volatility Forecast • yfinance pipeline + Chronos") as demo:
721
  gr.Markdown(
722
+ "### Predict last 30 days of realized volatility for any ticker\n"
723
+ "- Data fetched via **yfinance** (your `pipeline_v2.update_ticker_csv`).\n"
724
+ "- Forecast uses **Chronos-T5-Large** (single path, no mean/median).\n"
725
+ "- Compare day-by-day to actual RV and see **MAPE/MPE/RMSE**.\n"
726
+ "- Optional **Bias/Scale Calibration (α)** to remove systematic under/overestimation."
727
  )
728
+ with gr.Row():
729
+ tickers_in = gr.Textbox(value="AAPL", label="Tickers (comma-separated, first is evaluated)")
730
+ with gr.Row():
731
+ start_in = gr.Textbox(value="2015-01-01", label="Start date (YYYY-MM-DD)")
732
+ interval_in = gr.Dropdown(choices=["1d", "1wk", "1mo"], value="1d", label="Interval")
733
+ calib_in = gr.Checkbox(value=True, label="Apply bias/scale calibration (α)")
734
  run_btn = gr.Button("Run", variant="primary")
735
 
736
+ plot = gr.Plot(label="Forecast vs Actual (last 30 days)")
737
+ meta = gr.JSON(label="Run config & metrics")
738
+ table = gr.Dataframe(label="Per-day comparison", wrap=True)
739
+ metrics = gr.Markdown(label="Summary")
740
+
741
+ run_btn.click(run_for_ticker, inputs=[tickers_in, start_in, interval_in, calib_in],
742
+ outputs=[plot, meta, table, metrics])
743
 
744
  if __name__ == "__main__":
745
  demo.launch()
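For context between the two files: update_ticker_csv writes a per-ticker cache under data/{TICKER}_{interval}.csv with the date as index, and run_for_ticker reads it back with read_csv(index_col=0, parse_dates=[0]). A rough illustration of that round trip; the exact column set depends on what yfinance returns, and the values below are illustrative only:

# data/AAPL_1d.csv (illustrative values, not real quotes)
# Date,Open,High,Low,Close,Volume
# 2024-01-02,100.0,101.5,99.0,101.0,1000000
# 2024-01-03,101.0,102.0,100.5,101.8,1200000

import pandas as pd
df = pd.read_csv("data/AAPL_1d.csv", index_col=0, parse_dates=[0])
close = df["Close"].astype(float)  # _extract_close would pick this column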
pipeline_v2.py ADDED
@@ -0,0 +1,189 @@
1
+ import os
2
+ from datetime import timedelta
3
+ import pandas as pd
4
+ import yfinance as yf
5
+
6
+ os.makedirs("data", exist_ok=True)
7
+ CSV_TEMPLATE = "data/{ticker}_{interval}.csv"
8
+
9
+ DEFAULT_START = "2015-01-01"
10
+ DEFAULT_INTERVAL = "1d"
11
+ DEFAULT_TICKERS = ["SPY", "QQQ", "AAPL", "MSFT", "NVDA", "NESN"]
12
+ MAX_RETRIES = 3
13
+
14
+ def download_ohlcv(ticker: str, start: str, interval: str, end: str = None) -> pd.DataFrame:
15
+ print(f"[INFO] Downloading {ticker} from {start} (interval={interval}, end={end})")
16
+ df = pd.DataFrame()
17
+
18
+ for attempt in range(MAX_RETRIES):
19
+ df = yf.download(
20
+ ticker,
21
+ start=start,
22
+ end=end, # end is exclusive on yfinance
23
+ interval=interval,
24
+ auto_adjust=True,
25
+ progress=False,
26
+ threads=True,
27
+ group_by="column", # helps avoid MultiIndex columns
28
+ )
29
+ if not df.empty:
30
+ break
31
+ if attempt < MAX_RETRIES - 1:
32
+ print(f"[WARN] Empty response for {ticker}, retrying... ({attempt+1}/{MAX_RETRIES})")
33
+
34
+ if df.empty:
35
+ raise ValueError(f"No data returned for {ticker}")
36
+
37
+ # --- NEW: collapse MultiIndex columns if present (single ticker) ---
38
+ if isinstance(df.columns, pd.MultiIndex):
39
+ # If levels are ['Price','Ticker'] or similar, drop the Ticker level
40
+ level_names = list(df.columns.names) if df.columns.names else []
41
+ if 'Ticker' in level_names:
42
+ df = df.droplevel('Ticker', axis=1)
43
+ else:
44
+ # Drop the *second* level by default (the ticker is usually the last level)
45
+ df = df.droplevel(-1, axis=1)
46
+ # -----------------------------------------
47
+
48
+ # Basic cleaning
49
+ if interval not in ("1d", "1wk", "1mo"):
50
+ df.index = pd.to_datetime(df.index, utc=True)
51
+ # df.index = pd.to_datetime(df.index, utc=True) # ensure timezone # Only needed for smaller than 1d Intervals
52
+ df = df[~df.index.duplicated(keep="last")] # drop duplicate timestamps
53
+ df = df.sort_index() # ensure time order
54
+
55
+ # standardize core columns if present
56
+ cols = [c for c in ["Open","High","Low","Close","Adj Close","Volume"] if c in df.columns]
57
+ df = df[cols] if cols else df
58
+ if "Volume" in df.columns:
59
+ df["Volume"] = pd.to_numeric(df["Volume"], errors="coerce").fillna(0).astype("int64", errors="ignore")
60
+ return df
61
+
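The MultiIndex guard above exists because newer yfinance builds can return single-ticker frames with two-level columns such as ('Close', 'AAPL'). A small illustration of what the droplevel branch does in that case (toy frame, not a live download):

import pandas as pd

cols = pd.MultiIndex.from_product([["Open", "Close"], ["AAPL"]], names=["Price", "Ticker"])
df = pd.DataFrame([[100.0, 101.0], [101.5, 102.0]], columns=cols)  # toy values

if isinstance(df.columns, pd.MultiIndex):
    df = df.droplevel("Ticker", axis=1)  # same branch as download_ohlcv
print(list(df.columns))  # ['Open', 'Close']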
62
+ def load_cached_csv(path: str) -> pd.DataFrame:
63
+ if not os.path.exists(path):
64
+ return pd.DataFrame()
65
+ df = pd.read_csv(path, index_col=0, parse_dates=[0]) # Date index as datetime64[ns] (naive)
66
+ # df.index = pd.to_datetime(df.index, utc=True)
67
+ # tidy just in case
68
+ df = df[~df.index.duplicated(keep="last")].sort_index()
69
+ return df
70
+
71
+
72
+ def next_start_from_cache(df_cached: pd.DataFrame) -> str:
73
+ last_day = pd.to_datetime(df_cached.index.max()).date()
74
+ return (last_day + timedelta(days=1)).isoformat()
75
+
76
+ def drop_partial_today_daily(df: pd.DataFrame) -> pd.DataFrame:
77
+ """
78
+ For daily bars, optionally drop a partial 'today' row if the script runs before the session is complete.
79
+ This is a policy choice—use it if you want your cache to only contain completed daily bars.
80
+ """
81
+ if df.empty:
82
+ return df
83
+ last_day = pd.to_datetime(df.index[-1]).date()
84
+ today_utc = pd.Timestamp.utcnow().date()
85
+ return df.iloc[:-1] if last_day >= today_utc else df
86
+
87
+ def update_ticker_csv(ticker: str, start: str = "2015-01-01", interval: str = "1d") -> str:
88
+ """
89
+ Update (or create) a CSV cache for the ticker. Returns the CSV path.
90
+ """
91
+ out_path = CSV_TEMPLATE.format(ticker=ticker.upper(), interval=interval)
92
+ cached = load_cached_csv(out_path)
93
+
94
+ #if interval in ("1d", "1wk", "1mo"):
95
+ # cached = drop_partial_today_daily(cached)
96
+
97
+ # --- make fetch_start a date, not a string ---
98
+ if cached.empty:
99
+ fetch_start = pd.to_datetime(start).date()
100
+ print(f"[INFO] No existing cache for {ticker}. Full download from {fetch_start}.")
101
+ else:
102
+ # next_start_from_cache currently returns a string -> parse to date
103
+ fetch_start = pd.to_datetime(next_start_from_cache(cached)).date()
104
+ print(f"[INFO] Found cache with {len(cached)} rows. Incremental from {fetch_start}.")
105
+ # ---------------------------------------------
106
+
107
+ # ----- NEW: avoid requesting future dates -----
108
+ today_utc = pd.Timestamp.utcnow().date()
109
+
110
+ if interval in ("1d", "1wk", "1mo"):
111
+ # If fetch_start is in the future, there is nothing to fetch yet
112
+ if fetch_start > today_utc:
113
+ print(f"[OK] {ticker}: nothing to fetch yet (next trading day {fetch_start} > today {today_utc}).")
114
+ df_new = pd.DataFrame(index=pd.DatetimeIndex([], name=cached.index.name or "Date"))
115
+ else:
116
+ # Optional: set an 'end' to be safe; yfinance's 'end' is exclusive, so add 1 day
117
+ end_date = today_utc + pd.Timedelta(days=1)
118
+ df_new = download_ohlcv(ticker, start=str(fetch_start), interval=interval, end=str(end_date))
119
+ else:
120
+ # Intraday: let 'now' be the implicit end
121
+ df_new = download_ohlcv(ticker, start=str(fetch_start), interval=interval)
122
+ # ----------------------------------------------
123
+
124
+ if cached.empty and df_new.empty:
125
+ raise ValueError(f"No data returned for {ticker}. Check ticker or start date.")
126
+
127
+ if df_new.empty:
128
+ print(f"[OK] {ticker}: no new rows to add.")
129
+ merged = cached
130
+ else:
131
+ # merge, drop duplicates, sort
132
+ merged = pd.concat([cached, df_new], axis=0)
133
+ merged = merged[~merged.index.duplicated(keep="last")].sort_index()
134
+ print(f"[OK] {ticker}: added {len(merged) - len(cached)} new rows.")
135
+
136
+ # Optional: keep only completed daily bars
137
+ #if interval in ("1d", "1wk", "1mo"):
138
+ # merged = drop_partial_today_daily(merged)
139
+
140
+ # Only drop partial 'today' if we fetched something new
141
+ #fetched_any = not df_new.empty
142
+
143
+ #if interval in ("1d", "1wk", "1mo") and fetched_any:
144
+ # merged = drop_partial_today_daily(merged)
145
+
146
+ #added = len(merged) - len(cached)
147
+ #if added < 0:
148
+ # Safety net (shouldn’t happen with the guard above)
149
+ #added = 0
150
+ # save
151
+ merged.to_csv(out_path, date_format="%Y-%m-%d")
152
+ added = len(merged) - len(cached)
153
+ print(f"[OK] {ticker}: added {added} new row(s). Now {len(merged)} total.")
154
+ print(f"[OK] Saved {ticker} → {out_path}")
155
+
156
+ return out_path
157
+
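Typical standalone use of the cache updater, assuming pipeline_v2.py is importable from the working directory and Yahoo Finance is reachable: the first call does a full download, later calls only fetch rows after the last cached date.

from pipeline_v2 import update_ticker_csv

path = update_ticker_csv("AAPL", start="2015-01-01", interval="1d")
print(path)                       # data/AAPL_1d.csv
path = update_ticker_csv("AAPL")  # incremental: starts from the day after the cached maximum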
158
+ def update_many(
159
+ tickers: list[str] = DEFAULT_TICKERS,
160
+ start: str = DEFAULT_START,
161
+ interval: str = DEFAULT_INTERVAL,
162
+ ) -> dict[str, str | None]:
163
+ """
164
+ Update multiple tickers; continue on errors.
165
+ Returns dict[ticker] -> csv_path (or None if failed).
166
+ """
167
+ results: dict[str, str | None] = {}
168
+ for t in [t.strip().upper() for t in tickers if t and t.strip()]:
169
+ print("\n" + "=" * 60)
170
+ print(f"[RUN] {t}")
171
+ try:
172
+ path = update_ticker_csv(t, start=start, interval=interval)
173
+ results[t] = path
174
+ except Exception as e:
175
+ print(f"[ERR] {t}: {e}")
176
+ results[t] = None
177
+ print("\n" + "=" * 60)
178
+ ok = sum(1 for v in results.values() if v)
179
+ print(f"[SUMMARY] Completed {ok}/{len(results)} tickers.")
180
+ return results
181
+
182
+
183
+ if __name__ == "__main__":
184
+ # choose your universe here (or later via CLI)
185
+ TICKERS = DEFAULT_TICKERS
186
+ START = DEFAULT_START
187
+ INTERVAL = DEFAULT_INTERVAL
188
+
189
+ update_many(TICKERS, start=START, interval=INTERVAL)