Gil Stetler committed on
Commit 682cd17 · 1 Parent(s): 438b5d2

updated files

Files changed (2)
  1. app.py +39 -32
  2. pipeline_v2.py +50 -175
app.py CHANGED
@@ -512,7 +512,9 @@
 #
 
 
+# app.py
 import os, random
+from typing import Tuple
 import numpy as np
 import pandas as pd
 import torch
@@ -522,8 +524,8 @@ matplotlib.use("Agg")
 import matplotlib.pyplot as plt
 from chronos import ChronosPipeline
 
-# >>> import your pipeline <<<
-import pipeline_v2 as pipe2  # provides update_ticker_csv(...)
+# --- our data pipeline ---
+import pipeline_v2 as pipe2  # update_ticker_csv(...)
 
 # --------------------
 # Config
@@ -551,17 +553,22 @@ pipe = ChronosPipeline.from_pretrained(
 # Helpers
 # --------------------
 def _extract_close(df: pd.DataFrame) -> pd.Series:
+    # Prefer 'Adj Close' > 'Close', else last numeric column
     mapping = {c.lower(): c for c in df.columns}
-    for name in ["close", "adj close", "adj_close", "price"]:
+    for name in ["adj close", "adj_close", "close", "price"]:
         if name in mapping:
             return pd.Series(df[mapping[name]]).astype(float)
-    # fallback: last numeric column
+
     num_cols = df.select_dtypes(include=[np.number]).columns
     if len(num_cols) == 0:
-        raise gr.Error("Could not find a numeric price column (e.g., Close).")
+        raise gr.Error("Could not find a numeric price column (e.g., Close / Adj Close).")
     return pd.Series(df[num_cols[-1]]).astype(float)
 
 def _extract_dates(df: pd.DataFrame):
+    # If index is DatetimeIndex, use it
+    if isinstance(df.index, pd.DatetimeIndex):
+        return df.index.to_numpy()
+    # Else look for a date-like column
     mapping = {c.lower(): c for c in df.columns}
     for name in ["date", "time", "timestamp"]:
         if name in mapping:
@@ -569,12 +576,7 @@ def _extract_dates(df: pd.DataFrame):
                 return pd.to_datetime(df[mapping[name]]).to_numpy()
             except Exception:
                 pass
-    # If the CSV has a Date index, respect that
-    if df.index.name is not None:
-        try:
-            return pd.to_datetime(df.index).to_numpy()
-        except Exception:
-            pass
+    # Fallback to a simple range
    return np.arange(len(df))
 
 def compute_realized_vol(close: pd.Series, window: int = 20, annualize: bool = True) -> pd.Series:
@@ -584,8 +586,7 @@ def compute_realized_vol(close: pd.Series, window: int = 20, annualize: bool = True) -> pd.Series:
         rv = rv * np.sqrt(252.0)
     return rv.dropna().reset_index(drop=True)
 
-def bias_scale_calibration(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[float, np.ndarray]:
-    """Return alpha and calibrated predictions alpha * y_pred (MSE-optimal scaling)."""
+def bias_scale_calibration(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[float, np.ndarray]:
     alpha = float(np.sum(y_true * y_pred) / (np.sum(y_pred**2) + EPS))
     return alpha, alpha * y_pred
 
@@ -602,22 +603,29 @@ def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
 # --------------------
 def run_for_ticker(tickers: str, start: str, interval: str, use_calibration: bool):
     """
-    tickers: comma/space separated (first is used for plotting/eval)
+    tickers: comma/space separated; we use the FIRST for plotting/eval.
     start: YYYY-MM-DD
-    interval: '1d', '1wk', '1mo' (yfinance-safe)
-    use_calibration: whether to apply bias/scale calibration on the 30-day path
+    interval: '1d', '1wk', '1mo'
     """
-    # parse first ticker
-    tick_list = [t.strip().upper() for t in tickers.replace(";", ",").replace("|", ",").split(",") if t.strip()]
+    # Parse first ticker
+    tick_list = [t.strip() for t in tickers.replace(";", ",").replace("|", ",").split(",") if t.strip()]
     if not tick_list:
-        raise gr.Error("Please enter at least one ticker (e.g., AAPL).")
+        raise gr.Error("Please enter at least one ticker, e.g. AAPL")
     ticker = tick_list[0]
 
-    # 1) Fetch/update CSV via your pipeline
-    csv_path = pipe2.update_ticker_csv(ticker, start=start, interval=interval)
+    # 1) Fetch/update CSV via pipeline
+    try:
+        csv_path = pipe2.update_ticker_csv(ticker, start=start, interval=interval)
+    except Exception as e:
+        raise gr.Error(f"Data fetch failed for '{ticker}': {e}")
 
     # 2) Load CSV and build realized vol
-    df = pd.read_csv(csv_path, index_col=0, parse_dates=[0])
+    try:
+        df = pd.read_csv(csv_path, index_col=0, parse_dates=[0])
+    except Exception:
+        # Fallback if index parsing fails
+        df = pd.read_csv(csv_path)
+
     dates = _extract_dates(df)
     close = _extract_close(df)
 
@@ -639,7 +647,7 @@ def run_for_ticker(tickers: str, start: str, interval: str, use_calibration: bool):
     samples = fcst[0].cpu().numpy()   # (1, H)
     path_pred = samples[0]            # (H,)
 
-    # 4) (Optional) bias/scale calibration
+    # 4) Optional bias/scale calibration
     alpha = None
     if use_calibration:
         alpha, path_pred_cal = bias_scale_calibration(rv_test, path_pred)
@@ -654,9 +662,8 @@ def run_for_ticker(tickers: str, start: str, interval: str, use_calibration: bool):
     fig = plt.figure(figsize=(10, 4))
     H0 = len(rv_train)
 
-    # choose proper x-axis
-    if isinstance(dates, np.ndarray) and dates.shape[0] >= len(close):
-        # Align dates to rv length (after rolling dropna)
+    # Align dates to rv length if we have real dates
+    if isinstance(dates, np.ndarray) and len(dates) >= len(close):
         dates_rv = np.array(dates[-len(rv):])
         x_hist = dates_rv[:H0]
         x_fcst = dates_rv[H0:]
@@ -672,7 +679,7 @@ def run_for_ticker(tickers: str, start: str, interval: str, use_calibration: bool):
     if use_calibration:
         plt.plot(x_fcst, path_pred_cal, linestyle="--", label=f"forecast (calibrated, α={alpha:.3f})")
 
-    plt.title(f"{ticker} — Volatility Forecast (RV={RV_WINDOW}, H={H}, interval={interval})")
+    plt.title(f"{ticker.upper()} — Volatility Forecast (RV={RV_WINDOW}, H={H}, interval={interval})")
     plt.xlabel(x_lbl); plt.ylabel("realized volatility")
     plt.legend(loc="best"); plt.tight_layout()
 
@@ -692,7 +699,7 @@ def run_for_ticker(tickers: str, start: str, interval: str, use_calibration: bool):
 
     # 7) JSON + metrics text
     out = {
-        "ticker": ticker,
+        "ticker": ticker.upper(),
         "csv_path": csv_path,
         "config": {
             "start": start,
@@ -720,13 +727,13 @@ def run_for_ticker(tickers: str, start: str, interval: str, use_calibration: bool):
 with gr.Blocks(title="Volatility Forecast • yfinance pipeline + Chronos") as demo:
     gr.Markdown(
         "### Predict last 30 days of realized volatility for any ticker\n"
-        "- Data fetched via **yfinance** (your `pipeline_v2.update_ticker_csv`).\n"
+        "- Fetches data via **yfinance** using your `pipeline_v2.update_ticker_csv`.\n"
         "- Forecast uses **Chronos-T5-Large** (single path, no mean/median).\n"
-        "- Compare day-by-day to actual RV and see **MAPE/MPE/RMSE**.\n"
-        "- Optional **Bias/Scale Calibration (α)** to remove systematic under/overestimation."
+        "- Compares day-by-day to actual RV and reports **MAPE/MPE/RMSE**.\n"
+        "- Optional **Bias/Scale Calibration (α)** to remove systematic bias."
     )
     with gr.Row():
-        tickers_in = gr.Textbox(value="AAPL", label="Tickers (comma-separated, first is evaluated)")
+        tickers_in = gr.Textbox(value="AAPL", label="Tickers (comma-separated; first is evaluated)")
     with gr.Row():
         start_in = gr.Textbox(value="2015-01-01", label="Start date (YYYY-MM-DD)")
         interval_in = gr.Dropdown(choices=["1d", "1wk", "1mo"], value="1d", label="Interval")
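
Aside on the calibration kept in this diff: the alpha returned by bias_scale_calibration is the least-squares scale factor, i.e. the minimizer of sum((y_true - alpha * y_pred) ** 2), which gives alpha = sum(y_true * y_pred) / sum(y_pred ** 2), matching the expression in app.py. Below is a minimal sketch of how it behaves on toy data; the EPS value and the synthetic numbers are assumptions for illustration (EPS is defined elsewhere in app.py and not shown in this diff), only the formula itself comes from the code above.

import numpy as np

EPS = 1e-12  # stand-in value; app.py's real EPS is not shown in this diff

def bias_scale_calibration(y_true: np.ndarray, y_pred: np.ndarray):
    # Least-squares scale: alpha = argmin_a sum((y_true - a * y_pred) ** 2)
    alpha = float(np.sum(y_true * y_pred) / (np.sum(y_pred ** 2) + EPS))
    return alpha, alpha * y_pred

# Toy check: a forecast that systematically underestimates by ~20%
rng = np.random.default_rng(0)
y_true = np.abs(rng.normal(0.25, 0.05, size=30))           # 30 "days" of realized vol
y_pred = 0.8 * y_true + rng.normal(0.0, 0.005, size=30)

alpha, y_cal = bias_scale_calibration(y_true, y_pred)
print(f"alpha = {alpha:.3f}")                               # roughly 1 / 0.8 = 1.25
print(np.mean((y_true - y_cal) ** 2) <= np.mean((y_true - y_pred) ** 2))  # True: MSE never gets worse

Because alpha = 1 is always a feasible choice, the calibrated path can only match or improve the in-sample MSE; it rescales the whole forecast but cannot fix shape errors.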
pipeline_v2.py CHANGED
@@ -1,189 +1,64 @@
+# pipeline_v2.py
 import os
-from datetime import timedelta
+from typing import Tuple
 import pandas as pd
-import yfinance as yf
 
-os.makedirs("data", exist_ok=True)
-CSV_TEMPLATE = "data/{ticker}_{interval}.csv"
+try:
+    import yfinance as yf
+except Exception as e:
+    raise ImportError(
+        "yfinance is not installed. Add `yfinance>=0.2.40` to requirements.txt."
+    ) from e
 
-DEFAULT_START = "2015-01-01"
-DEFAULT_INTERVAL = "1d"
-DEFAULT_TICKERS = ["SPY", "QQQ", "AAPL", "MSFT", "NVDA", "NESN"]
-MAX_RETRIES = 3
 
-def download_ohlcv(ticker: str, start: str, interval: str, end: str = None) -> pd.DataFrame:
-    print(f"[INFO] Downloading {ticker} from {start} (interval={interval}, end={end})")
-    df = pd.DataFrame()
+def _ensure_dir(path: str) -> None:
+    os.makedirs(path, exist_ok=True)
 
-    for attempt in range(MAX_RETRIES):
-        df = yf.download(
-            ticker,
-            start=start,
-            end=end,               # end is exclusive on yfinance
-            interval=interval,
-            auto_adjust=True,
-            progress=False,
-            threads=True,
-            group_by="column",     # helps avoid MultiIndex columns
-        )
-        if not df.empty:
-            break
-        if attempt < MAX_RETRIES - 1:
-            print(f"[WARN] Empty response for {ticker}, retrying... ({attempt+1}/{MAX_RETRIES})")
 
-    if df.empty:
-        raise ValueError(f"No data returned for {ticker}")
+def _sanitize_ticker(t: str) -> str:
+    return t.strip().upper().replace(" ", "").replace("/", "-").replace(".", "-")
 
-    # --- NEW: collapse MultiIndex columns if present (single ticker) ---
-    if isinstance(df.columns, pd.MultiIndex):
-        # If levels are ['Price','Ticker'] or similar, drop the Ticker level
-        level_names = list(df.columns.names) if df.columns.names else []
-        if 'Ticker' in level_names:
-            df = df.droplevel('Ticker', axis=1)
-        else:
-            # Drop the *second* level by default (the ticker is usually the last level)
-            df = df.droplevel(-1, axis=1)
-    # -----------------------------------------
 
-    # Basic cleaning
-    if interval not in ("1d", "1wk", "1mo"):
-        df.index = pd.to_datetime(df.index, utc=True)
-    # df.index = pd.to_datetime(df.index, utc=True)  # ensure timezone  # Only needed for smaller than 1d intervals
-    df = df[~df.index.duplicated(keep="last")]  # drop duplicate timestamps
-    df = df.sort_index()                        # ensure time order
-
-    # standardize core columns if present
-    cols = [c for c in ["Open","High","Low","Close","Adj Close","Volume"] if c in df.columns]
-    df = df[cols] if cols else df
-    if "Volume" in df.columns:
-        df["Volume"] = pd.to_numeric(df["Volume"], errors="coerce").fillna(0).astype("int64", errors="ignore")
-    return df
-
-def load_cached_csv(path: str) -> pd.DataFrame:
-    if not os.path.exists(path):
-        return pd.DataFrame()
-    df = pd.read_csv(path, index_col=0, parse_dates=[0])  # Date index as datetime64[ns] (naive)
-    # df.index = pd.to_datetime(df.index, utc=True)
-    # tidy just in case
-    df = df[~df.index.duplicated(keep="last")].sort_index()
-    return df
-
-
-def next_start_from_cache(df_cached: pd.DataFrame) -> str:
-    last_day = pd.to_datetime(df_cached.index.max()).date()
-    return (last_day + timedelta(days=1)).isoformat()
-
-def drop_partial_today_daily(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    For daily bars, optionally drop a partial 'today' row if the script runs before the session is complete.
-    This is a policy choice—use it if you want your cache to only contain completed daily bars.
+def update_ticker_csv(
+    ticker: str,
+    start: str = "2015-01-01",
+    interval: str = "1d",
+    dst_dir: str = "/mnt/data"   # HF Spaces writeable path
+) -> str:
     """
-    if df.empty:
-        return df
-    last_day = pd.to_datetime(df.index[-1]).date()
-    today_utc = pd.Timestamp.utcnow().date()
-    return df.iloc[:-1] if last_day >= today_utc else df
-
-def update_ticker_csv(ticker: str, start: str = "2015-01-01", interval: str = "1d") -> str:
-    """
-    Update (or create) a CSV cache for the ticker. Returns the CSV path.
+    Download OHLCV for `ticker` using yfinance and save as CSV.
+    Returns the CSV file path.
+
+    Args:
+        ticker: e.g. "AAPL"
+        start: "YYYY-MM-DD"
+        interval: "1d", "1wk", "1mo"
+        dst_dir: directory to write CSVs (default: /mnt/data for Spaces)
     """
-    out_path = CSV_TEMPLATE.format(ticker=ticker.upper(), interval=interval)
-    cached = load_cached_csv(out_path)
-
-    #if interval in ("1d", "1wk", "1mo"):
-    #    cached = drop_partial_today_daily(cached)
-
-    # --- make fetch_start a date, not a string ---
-    if cached.empty:
-        fetch_start = pd.to_datetime(start).date()
-        print(f"[INFO] No existing cache for {ticker}. Full download from {fetch_start}.")
-    else:
-        # next_start_from_cache currently returns a string -> parse to date
-        fetch_start = pd.to_datetime(next_start_from_cache(cached)).date()
-        print(f"[INFO] Found cache with {len(cached)} rows. Incremental from {fetch_start}.")
-    # ---------------------------------------------
-
-    # ----- NEW: avoid requesting future dates -----
-    today_utc = pd.Timestamp.utcnow().date()
-
-    if interval in ("1d", "1wk", "1mo"):
-        # If fetch_start is in the future, there is nothing to fetch yet
-        if fetch_start > today_utc:
-            print(f"[OK] {ticker}: nothing to fetch yet (next trading day {fetch_start} > today {today_utc}).")
-            df_new = pd.DataFrame(index=pd.DatetimeIndex([], name=cached.index.name or "Date"))
-        else:
-            # Optional: set an 'end' to be safe; yfinance's 'end' is exclusive, so add 1 day
-            end_date = today_utc + pd.Timedelta(days=1)
-            df_new = download_ohlcv(ticker, start=str(fetch_start), interval=interval, end=str(end_date))
-    else:
-        # Intraday: let 'now' be the implicit end
-        df_new = download_ohlcv(ticker, start=str(fetch_start), interval=interval)
-    # ----------------------------------------------
-
-    if cached.empty and df_new.empty:
-        raise ValueError(f"No data returned for {ticker}. Check ticker or start date.")
-
-    if df_new.empty:
-        print(f"[OK] {ticker}: no new rows to add.")
-        merged = cached
+    _ensure_dir(dst_dir)
+    tkr = _sanitize_ticker(ticker)
+
+    df = yf.download(
+        tkr,
+        start=start,
+        interval=interval,
+        auto_adjust=False,   # keep explicit Adj Close; we’ll pick Close / Adj Close later
+        progress=False,
+        threads=True,
+    )
+
+    if df is None or df.empty:
+        raise ValueError(f"No data returned for ticker '{tkr}' with start={start}, interval={interval}.")
+
+    # Ensure a clean, single-index Date column
+    if isinstance(df.index, pd.DatetimeIndex):
+        df = df.copy()
+        df.index.name = "Date"
     else:
-        # merge, drop duplicates, sort
-        merged = pd.concat([cached, df_new], axis=0)
-        merged = merged[~merged.index.duplicated(keep="last")].sort_index()
-        print(f"[OK] {ticker}: added {len(merged) - len(cached)} new rows.")
-
-    # Optional: keep only completed daily bars
-    #if interval in ("1d", "1wk", "1mo"):
-    #    merged = drop_partial_today_daily(merged)
-
-    # Only drop partial 'today' if we fetched something new
-    #fetched_any = not df_new.empty
-
-    #if interval in ("1d", "1wk", "1mo") and fetched_any:
-    #    merged = drop_partial_today_daily(merged)
-
-    #added = len(merged) - len(cached)
-    #if added < 0:
-        # Safety net (shouldn’t happen with the guard above)
-        #added = 0
-    # save
-    merged.to_csv(out_path, date_format="%Y-%m-%d")
-    added = len(merged) - len(cached)
-    print(f"[OK] {ticker}: added {added} new row(s). Now {len(merged)} total.")
-    print(f"[OK] Saved {ticker} → {out_path}")
-
-    return out_path
-
-def update_many(
-    tickers: str = DEFAULT_TICKERS,
-    start: str = DEFAULT_START,
-    interval: str = DEFAULT_INTERVAL,
-) -> dict[str, str]:
-    """
-    Update multiple tickers; continue on errors.
-    Returns dict[ticker] -> csv_path (or None if failed).
-    """
-    results: Dict[str, Optional[str]] = {}
-    for t in [t.strip().upper() for t in tickers if t and t.strip()]:
-        print("\n" + "=" * 60)
-        print(f"[RUN] {t}")
-        try:
-            path = update_ticker_csv(t, start=start, interval=interval)
-            results[t] = path
-        except Exception as e:
-            print(f"[ERR] {t}: {e}")
-            results[t] = None
-    print("\n" + "=" * 60)
-    ok = sum(1 for v in results.values() if v)
-    print(f"[SUMMARY] Completed {ok}/{len(results)} tickers.")
-    return results
-
+        df = df.reset_index().rename(columns={df.columns[0]: "Date"}).set_index("Date")
 
-if __name__ == "__main__":
-    # choose your universe here (or later via CLI)
-    TICKERS = DEFAULT_TICKERS
-    START = DEFAULT_START
-    INTERVAL = DEFAULT_INTERVAL
+    # Save
+    csv_path = os.path.join(dst_dir, f"{tkr}_{interval}.csv")
+    df.to_csv(csv_path)
 
-    update_many(TICKERS, start=START, interval=INTERVAL)
+    return csv_path
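
For context, a minimal usage sketch of the rewritten pipeline as app.py consumes it. The ticker, dates, and the realized-vol formula below are illustrative assumptions (compute_realized_vol's body is not part of this diff); the update_ticker_csv call, the CSV-loading line, and the 'Adj Close' preference mirror the code in this commit.

import numpy as np
import pandas as pd
import pipeline_v2 as pipe2

# 1) Fetch OHLCV and write the CSV (defaults to dst_dir="/mnt/data" on Spaces)
csv_path = pipe2.update_ticker_csv("AAPL", start="2015-01-01", interval="1d")

# 2) Load it the way run_for_ticker does: first column parsed as a Date index
df = pd.read_csv(csv_path, index_col=0, parse_dates=[0])

# 3) Prefer 'Adj Close' over 'Close', mirroring _extract_close's new ordering
col = next(c for c in ("Adj Close", "Close") if c in df.columns)
close = df[col].astype(float)

# 4) A plausible realized-vol series: 20-day rolling std of log returns, annualized
#    with sqrt(252), matching the window and annualization factor visible in app.py
rv = np.log(close).diff().rolling(20).std() * np.sqrt(252.0)
print(csv_path, len(df), float(rv.dropna().iloc[-1]))

Note that the new update_ticker_csv always re-downloads the full history and overwrites the CSV, whereas the removed code kept an incremental cache; callers no longer need a populated data/ directory, only a writeable dst_dir.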