Gil Stetler committed on
Commit
a5e3343
·
1 Parent(s): 92e4d77

pipeline included

Files changed (2)
  1. app.py +342 -108
  2. pipeline_v2.py +189 -0
app.py CHANGED
@@ -306,6 +306,212 @@
306
  #
307
  #
308
  #
309
  import os, random
310
  import numpy as np
311
  import pandas as pd
@@ -316,18 +522,21 @@ matplotlib.use("Agg")
316
  import matplotlib.pyplot as plt
317
  from chronos import ChronosPipeline
318
 
 
 
 
319
  # --------------------
320
  # Config
321
  # --------------------
322
  MODEL_ID = "amazon/chronos-t5-large"
323
- PREDICTION_LENGTH = 30 # last 30 days
324
- NUM_SAMPLES = 1 # single path -> day-by-day point forecast
325
- RV_WINDOW = 20
326
- ANNUALIZE = True
327
  EPS = 1e-8
328
 
329
  # --------------------
330
- # Model load
331
  # --------------------
332
  device = "cuda" if torch.cuda.is_available() else "cpu"
333
  dtype = torch.bfloat16 if device == "cuda" else torch.float32
@@ -341,21 +550,16 @@ pipe = ChronosPipeline.from_pretrained(
341
  # --------------------
342
  # Helpers
343
  # --------------------
344
- def _read_ohlcv_csv():
345
- for p in ["/mnt/data/ohlcv_clean.csv", "ohlcv_clean.csv"]:
346
- if os.path.exists(p):
347
- return pd.read_csv(p)
348
- raise gr.Error("CSV nicht gefunden. Lege sie unter /mnt/data/ohlcv_clean.csv oder ./ohlcv_clean.csv ab.")
349
-
350
  def _extract_close(df: pd.DataFrame) -> pd.Series:
351
  mapping = {c.lower(): c for c in df.columns}
352
  for name in ["close", "adj close", "adj_close", "price"]:
353
  if name in mapping:
354
- return pd.Series(df[mapping[name]].astype(float))
355
- numeric_cols = df.select_dtypes(include=[np.number]).columns
356
- if len(numeric_cols) == 0:
357
- raise gr.Error("Keine numerische Preisspalte gefunden (z.B. Close).")
358
- return pd.Series(df[numeric_cols[-1]].astype(float))
 
359
 
360
  def _extract_dates(df: pd.DataFrame):
361
  mapping = {c.lower(): c for c in df.columns}
@@ -365,6 +569,12 @@ def _extract_dates(df: pd.DataFrame):
365
  return pd.to_datetime(df[mapping[name]]).to_numpy()
366
  except Exception:
367
  pass
368
  return np.arange(len(df))
369
 
370
  def compute_realized_vol(close: pd.Series, window: int = 20, annualize: bool = True) -> pd.Series:
@@ -374,138 +584,162 @@ def compute_realized_vol(close: pd.Series, window: int = 20, annualize: bool = T
374
  rv = rv * np.sqrt(252.0)
375
  return rv.dropna().reset_index(drop=True)
376
377
  # --------------------
378
- # Main
379
  # --------------------
380
- def run_vol_forecast_and_evaluate():
381
- # Load data
382
- raw = _read_ohlcv_csv()
383
- dates = _extract_dates(raw)
384
- close = _extract_close(raw)
385
 
386
- # Realized Volatility
387
  rv = compute_realized_vol(close, window=RV_WINDOW, annualize=ANNUALIZE).to_numpy()
388
  n = len(rv); H = PREDICTION_LENGTH
389
  if n <= H + 5:
390
- raise gr.Error(f"RV-Serie zu kurz nach Rolling. Benötigt > {H+5}, erhalten {n}.")
391
 
392
- # Split
393
  rv_train = rv[: n - H]
394
  rv_test = rv[n - H :]
395
 
396
- # Forecast a single sample path
397
  random.seed(0); np.random.seed(0); torch.manual_seed(0)
398
  if torch.cuda.is_available():
399
  torch.cuda.manual_seed_all(0)
400
 
401
  context = torch.tensor(rv_train, dtype=torch.float32)
402
- fcst = pipe.predict(context, prediction_length=H, num_samples=NUM_SAMPLES) # [1,1,H]
403
- samples = fcst[0].cpu().numpy()
404
- path_pred = samples[0] # (H,) — point forecast
405
-
406
- # --------------------
407
- # Bias/scale calibration
408
- # --------------------
409
- # choose α so that the MSE between α*pred and actual is minimized
410
- alpha = float(np.sum(rv_test * path_pred) / np.sum(path_pred**2 + EPS))
411
- path_pred_cal = alpha * path_pred
412
-
413
- # Errors (original & calibrated)
414
- def metrics(y_true, y_pred):
415
- err = y_pred - y_true
416
- denom = np.maximum(EPS, np.abs(y_true))
417
- abs_pct_err = np.abs(err) / denom * 100
418
- pct_err = err / np.maximum(EPS, y_true) * 100
419
- return {
420
- "MAPE": abs_pct_err.mean(),
421
- "MPE": pct_err.mean(),
422
- "RMSE": np.sqrt(np.mean(err**2))
423
- }
424
-
425
- m_orig = metrics(rv_test, path_pred)
426
- m_cal = metrics(rv_test, path_pred_cal)
427
-
428
- # --------------------
429
- # Plot
430
- # --------------------
431
- fig = plt.figure(figsize=(10, 4))
432
- H0 = len(rv_train)
433
- if isinstance(dates, np.ndarray) and dates.shape[0] >= len(close):
434
- dates_rv = np.array(dates[-len(rv):])
435
- plt.plot(dates_rv[:H0], rv_train, label="realized vol (history)")
436
- plt.plot(dates_rv[H0:], rv_test, label="actual (holdout)")
437
- plt.plot(dates_rv[H0:], path_pred, linestyle="--", label="forecast (raw)")
438
- plt.plot(dates_rv[H0:], path_pred_cal, linestyle="--", label=f"forecast (calibrated, α={alpha:.3f})")
439
- plt.xlabel("date")
440
  else:
441
- x_all = np.arange(len(rv)); x_fcst = np.arange(H0, H0 + H)
442
- plt.plot(x_all[:H0], rv_train, label="realized vol (history)")
443
- plt.plot(x_fcst, rv_test, label="actual (holdout)")
444
- plt.plot(x_fcst, path_pred, linestyle="--", label="forecast (raw)")
445
- plt.plot(x_fcst, path_pred_cal, linestyle="--", label=f"forecast (calibrated, α={alpha:.3f})")
446
- plt.xlabel("time index")
447
 
448
- plt.title(f"Volatility Forecast (RV window={RV_WINDOW}, H={H})")
449
- plt.ylabel("realized volatility")
450
- plt.legend(loc="best")
451
- plt.tight_layout()
452
 
453
- # --------------------
454
- # Per-day table
455
- # --------------------
456
  if isinstance(dates, np.ndarray) and dates.shape[0] >= len(close):
 
457
  dates_rv = np.array(dates[-len(rv):])
458
- last_dates = dates_rv[H0:]
 
 
459
  else:
460
- last_dates = np.arange(H)
 
 
461
 
462
- abs_pct_err_orig = np.abs((path_pred - rv_test) / np.maximum(EPS, np.abs(rv_test))) * 100
463
- abs_pct_err_cal = np.abs((path_pred_cal - rv_test) / np.maximum(EPS, np.abs(rv_test))) * 100
 
 
 
464
 
465
  df_days = pd.DataFrame({
466
  "date": last_dates,
467
  "actual_vol": rv_test,
468
  "forecast_raw": path_pred,
469
- "forecast_calibrated": path_pred_cal,
470
- "abs_error_raw": np.abs(path_pred - rv_test),
471
- "abs_pct_error_raw_%": abs_pct_err_orig,
472
- "abs_pct_error_cal_%": abs_pct_err_cal,
473
  })
474
 
475
- # --------------------
476
- # Outputs
477
- # --------------------
478
- out_json = {
479
- "alpha": alpha,
480
- "metrics_raw": {k: round(v, 4) for k, v in m_orig.items()},
481
- "metrics_calibrated": {k: round(v, 4) for k, v in m_cal.items()},
482
  }
 
483
 
484
- metrics_md = (
485
- f"**Bias-/Scale-Kalibrierung** α = {alpha:.3f}\n\n"
486
- f"**RAW:** MAPE {m_orig['MAPE']:.2f}% | MPE {m_orig['MPE']:.2f}% | RMSE {m_orig['RMSE']:.5f}\n"
487
- f"**CALIBRATED:** MAPE {m_cal['MAPE']:.2f}% | MPE {m_cal['MPE']:.2f}% | RMSE {m_cal['RMSE']:.5f}"
488
- )
489
 
490
- return fig, out_json, df_days, metrics_md
491
 
492
  # --------------------
493
  # UI
494
  # --------------------
495
- with gr.Blocks(title="Volatility Forecast • mit Bias-/Scale-Kalibrierung") as demo:
496
  gr.Markdown(
497
- "## Letzte 30 Tage Volatilität (mit automatischer Bias-/Scale-Kalibrierung)\n"
498
- "- Prognose einer einzelnen Sample-Bahn (kein Mittelwert, kein Median).\n"
499
- "- Anschließend wird ein Skalierungsfaktor α berechnet, um systematische Unter-/Überschätzung zu korrigieren.\n"
500
- "- Darstellung: Forecast (roh) & Forecast (kalibriert)."
 
501
  )
502
  run_btn = gr.Button("Run", variant="primary")
503
- plot = gr.Plot(label="Forecast vs Actual (roh & kalibriert)")
504
- meta = gr.JSON(label="Kalibrierungsparameter & Metriken")
505
- table = gr.Dataframe(label="Per-Day Vergleich", wrap=True)
506
- metrics = gr.Markdown(label="Zusammenfassung")
507
 
508
- run_btn.click(run_vol_forecast_and_evaluate, inputs=None, outputs=[plot, meta, table, metrics])
509
 
510
  if __name__ == "__main__":
511
  demo.launch()
 
306
  #
307
  #
308
  #
309
+ #import os, random
310
+ #import numpy as np
311
+ #import pandas as pd
312
+ #import torch
313
+ #import gradio as gr
314
+ #import matplotlib
315
+ #matplotlib.use("Agg")
316
+ #import matplotlib.pyplot as plt
317
+ #from chronos import ChronosPipeline
318
+ #
319
+ ## --------------------
320
+ ## Config
321
+ ## --------------------
322
+ #MODEL_ID = "amazon/chronos-t5-large"
323
+ #PREDICTION_LENGTH = 30 # last 30 days
324
+ #NUM_SAMPLES = 1 # single path -> day-by-day point forecast
325
+ #RV_WINDOW = 20
326
+ #ANNUALIZE = True
327
+ #EPS = 1e-8
328
+ #
329
+ ## --------------------
330
+ ## Model load
331
+ ## --------------------
332
+ #device = "cuda" if torch.cuda.is_available() else "cpu"
333
+ #dtype = torch.bfloat16 if device == "cuda" else torch.float32
334
+ #
335
+ #pipe = ChronosPipeline.from_pretrained(
336
+ # MODEL_ID,
337
+ # device_map="auto",
338
+ # torch_dtype=dtype,
339
+ #)
340
+ #
341
+ ## --------------------
342
+ ## Helpers
343
+ ## --------------------
344
+ #def _read_ohlcv_csv():
345
+ # for p in ["/mnt/data/ohlcv_clean.csv", "ohlcv_clean.csv"]:
346
+ # if os.path.exists(p):
347
+ # return pd.read_csv(p)
348
+ # raise gr.Error("CSV nicht gefunden. Lege sie unter /mnt/data/ohlcv_clean.csv oder ./ohlcv_clean.csv ab.")
349
+ #
350
+ #def _extract_close(df: pd.DataFrame) -> pd.Series:
351
+ # mapping = {c.lower(): c for c in df.columns}
352
+ # for name in ["close", "adj close", "adj_close", "price"]:
353
+ # if name in mapping:
354
+ # return pd.Series(df[mapping[name]].astype(float))
355
+ # numeric_cols = df.select_dtypes(include=[np.number]).columns
356
+ # if len(numeric_cols) == 0:
357
+ # raise gr.Error("Keine numerische Preisspalte gefunden (z.B. Close).")
358
+ # return pd.Series(df[numeric_cols[-1]].astype(float))
359
+ #
360
+ #def _extract_dates(df: pd.DataFrame):
361
+ # mapping = {c.lower(): c for c in df.columns}
362
+ # for name in ["date", "time", "timestamp"]:
363
+ # if name in mapping:
364
+ # try:
365
+ # return pd.to_datetime(df[mapping[name]]).to_numpy()
366
+ # except Exception:
367
+ # pass
368
+ # return np.arange(len(df))
369
+ #
370
+ #def compute_realized_vol(close: pd.Series, window: int = 20, annualize: bool = True) -> pd.Series:
371
+ # r = np.log(close).diff().dropna()
372
+ # rv = r.rolling(window, min_periods=window).std()
373
+ # if annualize:
374
+ # rv = rv * np.sqrt(252.0)
375
+ # return rv.dropna().reset_index(drop=True)
376
+ #
377
+ ## --------------------
378
+ ## Main
379
+ ## --------------------
380
+ #def run_vol_forecast_and_evaluate():
381
+ # # Load data
382
+ # raw = _read_ohlcv_csv()
383
+ # dates = _extract_dates(raw)
384
+ # close = _extract_close(raw)
385
+ #
386
+ # # Realized Volatility
387
+ # rv = compute_realized_vol(close, window=RV_WINDOW, annualize=ANNUALIZE).to_numpy()
388
+ # n = len(rv); H = PREDICTION_LENGTH
389
+ # if n <= H + 5:
390
+ # raise gr.Error(f"RV-Serie zu kurz nach Rolling. Benötigt > {H+5}, erhalten {n}.")
391
+ #
392
+ # # Split
393
+ # rv_train = rv[: n - H]
394
+ # rv_test = rv[n - H :]
395
+ #
396
+ # # Forecast a single sample path
397
+ # random.seed(0); np.random.seed(0); torch.manual_seed(0)
398
+ # if torch.cuda.is_available():
399
+ # torch.cuda.manual_seed_all(0)
400
+ #
401
+ # context = torch.tensor(rv_train, dtype=torch.float32)
402
+ # fcst = pipe.predict(context, prediction_length=H, num_samples=NUM_SAMPLES) # [1,1,H]
403
+ # samples = fcst[0].cpu().numpy()
404
+ # path_pred = samples[0] # (H,) — point forecast
405
+ #
406
+ # # --------------------
407
+ # # Bias/scale calibration
408
+ # # --------------------
409
+ # # choose α so that the MSE between α*pred and actual is minimized
410
+ # alpha = float(np.sum(rv_test * path_pred) / np.sum(path_pred**2 + EPS))
411
+ # path_pred_cal = alpha * path_pred
412
+ #
413
+ # # Errors (original & calibrated)
414
+ # def metrics(y_true, y_pred):
415
+ # err = y_pred - y_true
416
+ # denom = np.maximum(EPS, np.abs(y_true))
417
+ # abs_pct_err = np.abs(err) / denom * 100
418
+ # pct_err = err / np.maximum(EPS, y_true) * 100
419
+ # return {
420
+ # "MAPE": abs_pct_err.mean(),
421
+ # "MPE": pct_err.mean(),
422
+ # "RMSE": np.sqrt(np.mean(err**2))
423
+ # }
424
+ #
425
+ # m_orig = metrics(rv_test, path_pred)
426
+ # m_cal = metrics(rv_test, path_pred_cal)
427
+ #
428
+ # # --------------------
429
+ # # Plot
430
+ # # --------------------
431
+ # fig = plt.figure(figsize=(10, 4))
432
+ # H0 = len(rv_train)
433
+ # if isinstance(dates, np.ndarray) and dates.shape[0] >= len(close):
434
+ # dates_rv = np.array(dates[-len(rv):])
435
+ # plt.plot(dates_rv[:H0], rv_train, label="realized vol (history)")
436
+ # plt.plot(dates_rv[H0:], rv_test, label="actual (holdout)")
437
+ # plt.plot(dates_rv[H0:], path_pred, linestyle="--", label="forecast (raw)")
438
+ # plt.plot(dates_rv[H0:], path_pred_cal, linestyle="--", label=f"forecast (calibrated, α={alpha:.3f})")
439
+ # plt.xlabel("date")
440
+ # else:
441
+ # x_all = np.arange(len(rv)); x_fcst = np.arange(H0, H0 + H)
442
+ # plt.plot(x_all[:H0], rv_train, label="realized vol (history)")
443
+ # plt.plot(x_fcst, rv_test, label="actual (holdout)")
444
+ # plt.plot(x_fcst, path_pred, linestyle="--", label="forecast (raw)")
445
+ # plt.plot(x_fcst, path_pred_cal, linestyle="--", label=f"forecast (calibrated, α={alpha:.3f})")
446
+ # plt.xlabel("time index")
447
+ #
448
+ # plt.title(f"Volatility Forecast (RV window={RV_WINDOW}, H={H})")
449
+ # plt.ylabel("realized volatility")
450
+ # plt.legend(loc="best")
451
+ # plt.tight_layout()
452
+ #
453
+ # # --------------------
454
+ # # Per-day table
455
+ # # --------------------
456
+ # if isinstance(dates, np.ndarray) and dates.shape[0] >= len(close):
457
+ # dates_rv = np.array(dates[-len(rv):])
458
+ # last_dates = dates_rv[H0:]
459
+ # else:
460
+ # last_dates = np.arange(H)
461
+ #
462
+ # abs_pct_err_orig = np.abs((path_pred - rv_test) / np.maximum(EPS, np.abs(rv_test))) * 100
463
+ # abs_pct_err_cal = np.abs((path_pred_cal - rv_test) / np.maximum(EPS, np.abs(rv_test))) * 100
464
+ #
465
+ # df_days = pd.DataFrame({
466
+ # "date": last_dates,
467
+ # "actual_vol": rv_test,
468
+ # "forecast_raw": path_pred,
469
+ # "forecast_calibrated": path_pred_cal,
470
+ # "abs_error_raw": np.abs(path_pred - rv_test),
471
+ # "abs_pct_error_raw_%": abs_pct_err_orig,
472
+ # "abs_pct_error_cal_%": abs_pct_err_cal,
473
+ # })
474
+ #
475
+ # # --------------------
476
+ # # Outputs
477
+ # # --------------------
478
+ # out_json = {
479
+ # "alpha": alpha,
480
+ # "metrics_raw": {k: round(v, 4) for k, v in m_orig.items()},
481
+ # "metrics_calibrated": {k: round(v, 4) for k, v in m_cal.items()},
482
+ # }
483
+ #
484
+ # metrics_md = (
485
+ # f"**Bias-/Scale-Kalibrierung** α = {alpha:.3f}\n\n"
486
+ # f"**RAW:** MAPE {m_orig['MAPE']:.2f}% | MPE {m_orig['MPE']:.2f}% | RMSE {m_orig['RMSE']:.5f}\n"
487
+ # f"**CALIBRATED:** MAPE {m_cal['MAPE']:.2f}% | MPE {m_cal['MPE']:.2f}% | RMSE {m_cal['RMSE']:.5f}"
488
+ # )
489
+ #
490
+ # return fig, out_json, df_days, metrics_md
491
+ #
492
+ ## --------------------
493
+ ## UI
494
+ ## --------------------
495
+ #with gr.Blocks(title="Volatility Forecast • with bias/scale calibration") as demo:
496
+ # gr.Markdown(
497
+ # "## Letzte 30 Tage Volatilität (mit automatischer Bias-/Scale-Kalibrierung)\n"
498
+ # "- Prognose einer einzelnen Sample-Bahn (kein Mittelwert, kein Median).\n"
499
+ # "- Anschließend wird ein Skalierungsfaktor α berechnet, um systematische Unter-/Überschätzung zu korrigieren.\n"
500
+ # "- Darstellung: Forecast (roh) & Forecast (kalibriert)."
501
+ # )
502
+ # run_btn = gr.Button("Run", variant="primary")
503
+ # plot = gr.Plot(label="Forecast vs Actual (roh & kalibriert)")
504
+ # meta = gr.JSON(label="Kalibrierungsparameter & Metriken")
505
+ # table = gr.Dataframe(label="Per-Day Vergleich", wrap=True)
506
+ # metrics = gr.Markdown(label="Zusammenfassung")
507
+ #
508
+ # run_btn.click(run_vol_forecast_and_evaluate, inputs=None, outputs=[plot, meta, table, metrics])
509
+ #
510
+ #if __name__ == "__main__":
511
+ # demo.launch()
512
+ #
513
+
514
+
515
  import os, random
516
  import numpy as np
517
  import pandas as pd
 
522
  import matplotlib.pyplot as plt
523
  from chronos import ChronosPipeline
524
 
525
+ # >>> import your pipeline <<<
526
+ import pipeline_v2 as pipe2 # provides update_ticker_csv(...); pipeline_v2.py is added at the repo root in this commit
527
+
528
  # --------------------
529
  # Config
530
  # --------------------
531
  MODEL_ID = "amazon/chronos-t5-large"
532
+ PREDICTION_LENGTH = 30 # forecast last 30 days
533
+ NUM_SAMPLES = 1 # single path -> day-by-day point prediction
534
+ RV_WINDOW = 20 # realized vol window (trading days)
535
+ ANNUALIZE = True # annualize by sqrt(252)
536
  EPS = 1e-8
537
 
538
  # --------------------
539
+ # Model load (once)
540
  # --------------------
541
  device = "cuda" if torch.cuda.is_available() else "cpu"
542
  dtype = torch.bfloat16 if device == "cuda" else torch.float32
 
550
  # --------------------
551
  # Helpers
552
  # --------------------
553
  def _extract_close(df: pd.DataFrame) -> pd.Series:
554
  mapping = {c.lower(): c for c in df.columns}
555
  for name in ["close", "adj close", "adj_close", "price"]:
556
  if name in mapping:
557
+ return pd.Series(df[mapping[name]]).astype(float)
558
+ # fallback: last numeric column
559
+ num_cols = df.select_dtypes(include=[np.number]).columns
560
+ if len(num_cols) == 0:
561
+ raise gr.Error("Could not find a numeric price column (e.g., Close).")
562
+ return pd.Series(df[num_cols[-1]]).astype(float)
563
 
564
  def _extract_dates(df: pd.DataFrame):
565
  mapping = {c.lower(): c for c in df.columns}
 
569
  return pd.to_datetime(df[mapping[name]]).to_numpy()
570
  except Exception:
571
  pass
572
+ # If the CSV has a Date index, respect that
573
+ if df.index.name is not None:
574
+ try:
575
+ return pd.to_datetime(df.index).to_numpy()
576
+ except Exception:
577
+ pass
578
  return np.arange(len(df))
579
 
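For reference, the two extractors line up with the cache format read back later in run_for_ticker: read_csv(index_col=0, parse_dates=[0]) puts the date into the index, so the new index-name branch above supplies the dates, while _extract_close prefers a Close-like column and otherwise falls back to the last numeric column. A minimal standalone sketch of that resolution on a toy frame (not repository data):

import numpy as np
import pandas as pd

# toy frame shaped like a cached OHLCV file after read_csv(index_col=0, parse_dates=[0])
df = pd.DataFrame(
    {"Open": [100.0, 101.0], "Close": [101.0, 102.0], "Volume": [1000, 1200]},
    index=pd.DatetimeIndex(["2024-01-02", "2024-01-03"], name="Date"),
)

mapping = {c.lower(): c for c in df.columns}
print("close" in mapping)                   # True -> _extract_close returns the Close column
print(pd.to_datetime(df.index).to_numpy())  # what the index-name branch of _extract_dates returns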
580
  def compute_realized_vol(close: pd.Series, window: int = 20, annualize: bool = True) -> pd.Series:
 
584
  rv = rv * np.sqrt(252.0)
585
  return rv.dropna().reset_index(drop=True)
586
 
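compute_realized_vol is the rolling standard deviation of daily log returns, annualized by sqrt(252), so the forecast target is a smoothed, annualized vol series rather than raw returns. A standalone sketch with toy prices and a 3-day window (the app uses RV_WINDOW=20):

import numpy as np
import pandas as pd

close = pd.Series([100.0, 101.0, 99.5, 102.0, 103.5, 102.8, 104.1])  # toy closes, not repository data
log_ret = np.log(close).diff().dropna()          # daily log returns
rv = log_ret.rolling(3, min_periods=3).std()     # rolling std over the window
rv_annualized = (rv * np.sqrt(252.0)).dropna()   # annualize by sqrt(252) trading days
print(rv_annualized.reset_index(drop=True))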
587
+ def bias_scale_calibration(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[float, np.ndarray]:
588
+ """Return alpha and calibrated predictions alpha * y_pred (MSE-optimal scaling)."""
589
+ alpha = float(np.sum(y_true * y_pred) / (np.sum(y_pred**2) + EPS))
590
+ return alpha, alpha * y_pred
591
+
592
+ def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
593
+ err = y_pred - y_true
594
+ denom = np.maximum(EPS, np.abs(y_true))
595
+ mape = float((np.abs(err) / denom).mean() * 100)
596
+ mpe = float((err / np.maximum(EPS, y_true)).mean() * 100)
597
+ rmse = float(np.sqrt(np.mean(err**2)))
598
+ return {"MAPE": mape, "MPE": mpe, "RMSE": rmse}
599
+
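bias_scale_calibration is the closed-form least-squares scale: α = argmin_a Σ(y_true − a·y_pred)² = Σ(y_true·y_pred) / (Σ y_pred² + EPS), so the scaled path cannot end up with a higher MSE than the raw path on the series it was fit on (up to the EPS guard). Note that run_for_ticker fits α against the same 30-day holdout it then reports metrics on. A toy check of the identity, with made-up numbers:

import numpy as np

EPS = 1e-8
y_true = np.array([0.20, 0.22, 0.25])   # made-up "actual" vols
y_pred = np.array([0.16, 0.18, 0.21])   # made-up raw forecasts

alpha = float(np.sum(y_true * y_pred) / (np.sum(y_pred**2) + EPS))  # least-squares scale
y_cal = alpha * y_pred
print(alpha)
print(np.mean((y_true - y_pred) ** 2), np.mean((y_true - y_cal) ** 2))  # MSE drops after scaling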
600
  # --------------------
601
+ # Core routine
602
  # --------------------
603
+ def run_for_ticker(tickers: str, start: str, interval: str, use_calibration: bool):
604
+ """
605
+ tickers: comma/space separated (first is used for plotting/eval)
606
+ start: YYYY-MM-DD
607
+ interval: '1d', '1wk', '1mo' (yfinance-safe)
608
+ use_calibration: whether to apply bias/scale calibration on the 30-day path
609
+ """
610
+ # parse first ticker
611
+ tick_list = [t.strip().upper() for t in tickers.replace(";", ",").replace("|", ",").split(",") if t.strip()]
612
+ if not tick_list:
613
+ raise gr.Error("Please enter at least one ticker (e.g., AAPL).")
614
+ ticker = tick_list[0]
615
+
616
+ # 1) Fetch/update CSV via your pipeline
617
+ csv_path = pipe2.update_ticker_csv(ticker, start=start, interval=interval)
618
+
619
+ # 2) Load CSV and build realized vol
620
+ df = pd.read_csv(csv_path, index_col=0, parse_dates=[0])
621
+ dates = _extract_dates(df)
622
+ close = _extract_close(df)
623
 
 
624
  rv = compute_realized_vol(close, window=RV_WINDOW, annualize=ANNUALIZE).to_numpy()
625
  n = len(rv); H = PREDICTION_LENGTH
626
  if n <= H + 5:
627
+ raise gr.Error(f"Vol series too short after rolling window. Need > {H+5}, got {n}.")
628
 
 
629
  rv_train = rv[: n - H]
630
  rv_test = rv[n - H :]
631
 
632
+ # 3) Forecast a single sample path (deterministic via seed)
633
  random.seed(0); np.random.seed(0); torch.manual_seed(0)
634
  if torch.cuda.is_available():
635
  torch.cuda.manual_seed_all(0)
636
 
637
  context = torch.tensor(rv_train, dtype=torch.float32)
638
+ fcst = pipe.predict(context, prediction_length=H, num_samples=NUM_SAMPLES) # [1, 1, H]
639
+ samples = fcst[0].cpu().numpy() # (1, H)
640
+ path_pred = samples[0] # (H,)
641
+
642
+ # 4) (Optional) bias/scale calibration
643
+ alpha = None
644
+ if use_calibration:
645
+ alpha, path_pred_cal = bias_scale_calibration(rv_test, path_pred)
646
+ metrics_raw = compute_metrics(rv_test, path_pred)
647
+ metrics_cal = compute_metrics(rv_test, path_pred_cal)
648
  else:
649
+ metrics_raw = compute_metrics(rv_test, path_pred)
650
+ metrics_cal = None
651
+ path_pred_cal = None
 
 
 
652
 
653
+ # 5) Plot
654
+ fig = plt.figure(figsize=(10, 4))
655
+ H0 = len(rv_train)
 
656
 
657
+ # choose proper x-axis
 
 
658
  if isinstance(dates, np.ndarray) and dates.shape[0] >= len(close):
659
+ # Align dates to rv length (after rolling dropna)
660
  dates_rv = np.array(dates[-len(rv):])
661
+ x_hist = dates_rv[:H0]
662
+ x_fcst = dates_rv[H0:]
663
+ x_lbl = "date"
664
  else:
665
+ x_hist = np.arange(H0)
666
+ x_fcst = np.arange(H0, H0 + H)
667
+ x_lbl = "time index"
668
 
669
+ plt.plot(x_hist, rv_train, label="realized vol (history)")
670
+ plt.plot(x_fcst, rv_test, label="realized vol (actual last 30)")
671
+ plt.plot(x_fcst, path_pred, linestyle="--", label="forecast (raw path)")
672
+ if use_calibration:
673
+ plt.plot(x_fcst, path_pred_cal, linestyle="--", label=f"forecast (calibrated, α={alpha:.3f})")
674
 
675
+ plt.title(f"{ticker} — Volatility Forecast (RV={RV_WINDOW}, H={H}, interval={interval})")
676
+ plt.xlabel(x_lbl); plt.ylabel("realized volatility")
677
+ plt.legend(loc="best"); plt.tight_layout()
678
+
679
+ # 6) Per-day table
680
+ last_dates = x_fcst
681
  df_days = pd.DataFrame({
682
  "date": last_dates,
683
  "actual_vol": rv_test,
684
  "forecast_raw": path_pred,
685
  })
686
+ if use_calibration:
687
+ df_days["forecast_calibrated"] = path_pred_cal
688
+ df_days["abs_pct_error_raw_%"] = np.abs((path_pred - rv_test) / np.maximum(EPS, np.abs(rv_test))) * 100
689
+ df_days["abs_pct_error_cal_%"] = np.abs((path_pred_cal - rv_test) / np.maximum(EPS, np.abs(rv_test))) * 100
690
+ else:
691
+ df_days["abs_pct_error_raw_%"] = np.abs((path_pred - rv_test) / np.maximum(EPS, np.abs(rv_test))) * 100
692
 
693
+ # 7) JSON + metrics text
694
+ out = {
695
+ "ticker": ticker,
696
+ "csv_path": csv_path,
697
+ "config": {
698
+ "start": start,
699
+ "interval": interval,
700
+ "rv_window": RV_WINDOW,
701
+ "prediction_length": H,
702
+ "num_samples": NUM_SAMPLES,
703
+ "annualized": ANNUALIZE,
704
+ "point_forecast": "single_sample_path",
705
+ },
706
+ "metrics_raw": {k: round(v, 4) for k, v in metrics_raw.items()},
707
  }
708
+ metrics_md = f"**RAW** — MAPE {metrics_raw['MAPE']:.2f}% | MPE {metrics_raw['MPE']:.2f}% | RMSE {metrics_raw['RMSE']:.5f}"
709
 
710
+ if use_calibration and metrics_cal is not None:
711
+ out["alpha"] = alpha
712
+ out["metrics_calibrated"] = {k: round(v, 4) for k, v in metrics_cal.items()}
713
+ metrics_md += f"\n**CALIBRATED** MAPE {metrics_cal['MAPE']:.2f}% | MPE {metrics_cal['MPE']:.2f}% | RMSE {metrics_cal['RMSE']:.5f}"
 
714
 
715
+ return fig, out, df_days, metrics_md
716
 
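A quick way to exercise run_for_ticker without the Gradio UI; a sketch, assuming this file is importable as app, that pipeline_v2 can reach Yahoo Finance, and noting that importing app loads the Chronos model:

from app import run_for_ticker  # loads the Chronos model at import time

fig, meta, df_days, summary = run_for_ticker("AAPL", start="2015-01-01", interval="1d", use_calibration=True)
print(summary)
print(df_days.head())
fig.savefig("aapl_vol_forecast.png")  # the same figure the UI renders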
717
  # --------------------
718
  # UI
719
  # --------------------
720
+ with gr.Blocks(title="Volatility Forecast • yfinance pipeline + Chronos") as demo:
721
  gr.Markdown(
722
+ "### Predict last 30 days of realized volatility for any ticker\n"
723
+ "- Data fetched via **yfinance** (your `pipeline_v2.update_ticker_csv`).\n"
724
+ "- Forecast uses **Chronos-T5-Large** (single path, no mean/median).\n"
725
+ "- Compare day-by-day to actual RV and see **MAPE/MPE/RMSE**.\n"
726
+ "- Optional **Bias/Scale Calibration (α)** to remove systematic under/overestimation."
727
  )
728
+ with gr.Row():
729
+ tickers_in = gr.Textbox(value="AAPL", label="Tickers (comma-separated, first is evaluated)")
730
+ with gr.Row():
731
+ start_in = gr.Textbox(value="2015-01-01", label="Start date (YYYY-MM-DD)")
732
+ interval_in = gr.Dropdown(choices=["1d", "1wk", "1mo"], value="1d", label="Interval")
733
+ calib_in = gr.Checkbox(value=True, label="Apply bias/scale calibration (α)")
734
  run_btn = gr.Button("Run", variant="primary")
735
 
736
+ plot = gr.Plot(label="Forecast vs Actual (last 30 days)")
737
+ meta = gr.JSON(label="Run config & metrics")
738
+ table = gr.Dataframe(label="Per-day comparison", wrap=True)
739
+ metrics = gr.Markdown(label="Summary")
740
+
741
+ run_btn.click(run_for_ticker, inputs=[tickers_in, start_in, interval_in, calib_in],
742
+ outputs=[plot, meta, table, metrics])
743
 
744
  if __name__ == "__main__":
745
  demo.launch()
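For context between the two files: update_ticker_csv writes a per-ticker cache under data/{TICKER}_{interval}.csv with the date as index, and run_for_ticker reads it back with read_csv(index_col=0, parse_dates=[0]). A rough illustration of that round trip; the exact column set depends on what yfinance returns, and the values below are illustrative only:

# data/AAPL_1d.csv (illustrative values, not real quotes)
# Date,Open,High,Low,Close,Volume
# 2024-01-02,100.0,101.5,99.0,101.0,1000000
# 2024-01-03,101.0,102.0,100.5,101.8,1200000

import pandas as pd
df = pd.read_csv("data/AAPL_1d.csv", index_col=0, parse_dates=[0])
close = df["Close"].astype(float)  # _extract_close would pick this column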
pipeline_v2.py ADDED
@@ -0,0 +1,189 @@
1
+ import os
2
+ from datetime import timedelta
3
+ import pandas as pd
4
+ import yfinance as yf
5
+
6
+ os.makedirs("data", exist_ok=True)
7
+ CSV_TEMPLATE = "data/{ticker}_{interval}.csv"
8
+
9
+ DEFAULT_START = "2015-01-01"
10
+ DEFAULT_INTERVAL = "1d"
11
+ DEFAULT_TICKERS = ["SPY", "QQQ", "AAPL", "MSFT", "NVDA", "NESN"]
12
+ MAX_RETRIES = 3
13
+
14
+ def download_ohlcv(ticker: str, start: str, interval: str, end: str = None) -> pd.DataFrame:
15
+ print(f"[INFO] Downloading {ticker} from {start} (interval={interval}, end={end})")
16
+ df = pd.DataFrame()
17
+
18
+ for attempt in range(MAX_RETRIES):
19
+ df = yf.download(
20
+ ticker,
21
+ start=start,
22
+ end=end, # end is exclusive on yfinance
23
+ interval=interval,
24
+ auto_adjust=True,
25
+ progress=False,
26
+ threads=True,
27
+ group_by="column", # helps avoid MultiIndex columns
28
+ )
29
+ if not df.empty:
30
+ break
31
+ if attempt < MAX_RETRIES - 1:
32
+ print(f"[WARN] Empty response for {ticker}, retrying... ({attempt+1}/{MAX_RETRIES})")
33
+
34
+ if df.empty:
35
+ raise ValueError(f"No data returned for {ticker}")
36
+
37
+ # --- NEW: collapse MultiIndex columns if present (single ticker) ---
38
+ if isinstance(df.columns, pd.MultiIndex):
39
+ # If levels are ['Price','Ticker'] or similar, drop the Ticker level
40
+ level_names = list(df.columns.names) if df.columns.names else []
41
+ if 'Ticker' in level_names:
42
+ df = df.droplevel('Ticker', axis=1)
43
+ else:
44
+ # Drop the *second* level by default (the ticker is usually the last level)
45
+ df = df.droplevel(-1, axis=1)
46
+ # -----------------------------------------
47
+
48
+ # Basic cleaning
49
+ if interval not in ("1d", "1wk", "1mo"):
50
+ df.index = pd.to_datetime(df.index, utc=True)
51
+ # df.index = pd.to_datetime(df.index, utc=True) # ensure timezone # Only needed for smaller than 1d Intervals
52
+ df = df[~df.index.duplicated(keep="last")] # drop duplicate timestamps
53
+ df = df.sort_index() # ensure time order
54
+
55
+ # standardize core columns if present
56
+ cols = [c for c in ["Open","High","Low","Close","Adj Close","Volume"] if c in df.columns]
57
+ df = df[cols] if cols else df
58
+ if "Volume" in df.columns:
59
+ df["Volume"] = pd.to_numeric(df["Volume"], errors="coerce").fillna(0).astype("int64", errors="ignore")
60
+ return df
61
+
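The MultiIndex guard above exists because newer yfinance builds can return single-ticker frames with two-level columns such as ('Close', 'AAPL'). A small illustration of what the droplevel branch does in that case (toy frame, not a live download):

import pandas as pd

cols = pd.MultiIndex.from_product([["Open", "Close"], ["AAPL"]], names=["Price", "Ticker"])
df = pd.DataFrame([[100.0, 101.0], [101.5, 102.0]], columns=cols)  # toy values

if isinstance(df.columns, pd.MultiIndex):
    df = df.droplevel("Ticker", axis=1)  # same branch as download_ohlcv
print(list(df.columns))  # ['Open', 'Close']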
62
+ def load_cached_csv(path: str) -> pd.DataFrame:
63
+ if not os.path.exists(path):
64
+ return pd.DataFrame()
65
+ df = pd.read_csv(path, index_col=0, parse_dates=[0]) # Date index as datetime64[ns] (naive)
66
+ # df.index = pd.to_datetime(df.index, utc=True)
67
+ # tidy just in case
68
+ df = df[~df.index.duplicated(keep="last")].sort_index()
69
+ return df
70
+
71
+
72
+ def next_start_from_cache(df_cached: pd.DataFrame) -> str:
73
+ last_day = pd.to_datetime(df_cached.index.max()).date()
74
+ return (last_day + timedelta(days=1)).isoformat()
75
+
76
+ def drop_partial_today_daily(df: pd.DataFrame) -> pd.DataFrame:
77
+ """
78
+ For daily bars, optionally drop a partial 'today' row if the script runs before the session is complete.
79
+ This is a policy choice—use it if you want your cache to only contain completed daily bars.
80
+ """
81
+ if df.empty:
82
+ return df
83
+ last_day = pd.to_datetime(df.index[-1]).date()
84
+ today_utc = pd.Timestamp.utcnow().date()
85
+ return df.iloc[:-1] if last_day >= today_utc else df
86
+
87
+ def update_ticker_csv(ticker: str, start: str = "2015-01-01", interval: str = "1d") -> str:
88
+ """
89
+ Update (or create) a CSV cache for the ticker. Returns the CSV path.
90
+ """
91
+ out_path = CSV_TEMPLATE.format(ticker=ticker.upper(), interval=interval)
92
+ cached = load_cached_csv(out_path)
93
+
94
+ #if interval in ("1d", "1wk", "1mo"):
95
+ # cached = drop_partial_today_daily(cached)
96
+
97
+ # --- make fetch_start a date, not a string ---
98
+ if cached.empty:
99
+ fetch_start = pd.to_datetime(start).date()
100
+ print(f"[INFO] No existing cache for {ticker}. Full download from {fetch_start}.")
101
+ else:
102
+ # next_start_from_cache currently returns a string -> parse to date
103
+ fetch_start = pd.to_datetime(next_start_from_cache(cached)).date()
104
+ print(f"[INFO] Found cache with {len(cached)} rows. Incremental from {fetch_start}.")
105
+ # ---------------------------------------------
106
+
107
+ # ----- NEW: avoid requesting future dates -----
108
+ today_utc = pd.Timestamp.utcnow().date()
109
+
110
+ if interval in ("1d", "1wk", "1mo"):
111
+ # If fetch_start is in the future, there is nothing to fetch yet
112
+ if fetch_start > today_utc:
113
+ print(f"[OK] {ticker}: nothing to fetch yet (next trading day {fetch_start} > today {today_utc}).")
114
+ df_new = pd.DataFrame(index=pd.DatetimeIndex([], name=cached.index.name or "Date"))
115
+ else:
116
+ # Optional: set an 'end' to be safe; yfinance's 'end' is exclusive, so add 1 day
117
+ end_date = today_utc + pd.Timedelta(days=1)
118
+ df_new = download_ohlcv(ticker, start=str(fetch_start), interval=interval, end=str(end_date))
119
+ else:
120
+ # Intraday: let 'now' be the implicit end
121
+ df_new = download_ohlcv(ticker, start=str(fetch_start), interval=interval)
122
+ # ----------------------------------------------
123
+
124
+ if cached.empty and df_new.empty:
125
+ raise ValueError(f"No data returned for {ticker}. Check ticker or start date.")
126
+
127
+ if df_new.empty:
128
+ print(f"[OK] {ticker}: no new rows to add.")
129
+ merged = cached
130
+ else:
131
+ # merge, drop duplicates, sort
132
+ merged = pd.concat([cached, df_new], axis=0)
133
+ merged = merged[~merged.index.duplicated(keep="last")].sort_index()
134
+ print(f"[OK] {ticker}: added {len(merged) - len(cached)} new rows.")
135
+
136
+ # Optional: keep only completed daily bars
137
+ #if interval in ("1d", "1wk", "1mo"):
138
+ # merged = drop_partial_today_daily(merged)
139
+
140
+ # Only drop partial 'today' if we fetched something new
141
+ #fetched_any = not df_new.empty
142
+
143
+ #if interval in ("1d", "1wk", "1mo") and fetched_any:
144
+ # merged = drop_partial_today_daily(merged)
145
+
146
+ #added = len(merged) - len(cached)
147
+ #if added < 0:
148
+ # Safety net (shouldn’t happen with the guard above)
149
+ #added = 0
150
+ # save
151
+ merged.to_csv(out_path, date_format="%Y-%m-%d")
152
+ added = len(merged) - len(cached)
153
+ print(f"[OK] {ticker}: added {added} new row(s). Now {len(merged)} total.")
154
+ print(f"[OK] Saved {ticker} → {out_path}")
155
+
156
+ return out_path
157
+
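Typical standalone use of the cache updater, assuming pipeline_v2.py is importable from the working directory and Yahoo Finance is reachable: the first call does a full download, later calls only fetch rows after the last cached date.

from pipeline_v2 import update_ticker_csv

path = update_ticker_csv("AAPL", start="2015-01-01", interval="1d")
print(path)                       # data/AAPL_1d.csv
path = update_ticker_csv("AAPL")  # incremental: starts from the day after the cached maximum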
158
+ def update_many(
159
+ tickers: list[str] = DEFAULT_TICKERS,
160
+ start: str = DEFAULT_START,
161
+ interval: str = DEFAULT_INTERVAL,
162
+ ) -> dict[str, str | None]:
163
+ """
164
+ Update multiple tickers; continue on errors.
165
+ Returns dict[ticker] -> csv_path (or None if failed).
166
+ """
167
+ results: dict[str, str | None] = {}
168
+ for t in [t.strip().upper() for t in tickers if t and t.strip()]:
169
+ print("\n" + "=" * 60)
170
+ print(f"[RUN] {t}")
171
+ try:
172
+ path = update_ticker_csv(t, start=start, interval=interval)
173
+ results[t] = path
174
+ except Exception as e:
175
+ print(f"[ERR] {t}: {e}")
176
+ results[t] = None
177
+ print("\n" + "=" * 60)
178
+ ok = sum(1 for v in results.values() if v)
179
+ print(f"[SUMMARY] Completed {ok}/{len(results)} tickers.")
180
+ return results
181
+
182
+
183
+ if __name__ == "__main__":
184
+ # choose your universe here (or later via CLI)
185
+ TICKERS = DEFAULT_TICKERS
186
+ START = DEFAULT_START
187
+ INTERVAL = DEFAULT_INTERVAL
188
+
189
+ update_many(TICKERS, start=START, interval=INTERVAL)