Spaces:

Gilette
/

volatilitypredictor

Running

File size: 1,929 Bytes

682cd17
a5e3343
89cf40b
a5e3343
 
682cd17
 
 
 
 
 
a5e3343
 
682cd17
 
a5e3343
 
89cf40b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5e3343
 
682cd17
 
 
 
89cf40b
682cd17
a5e3343
682cd17
 
a5e3343
682cd17
89cf40b
 
 
682cd17
 
89cf40b
682cd17
 
89cf40b
682cd17
 
 
 
 
89cf40b
 
 
 
682cd17
89cf40b
 
 
 
 
 
 
 
a5e3343
89cf40b
a5e3343
89cf40b
 
682cd17

# pipeline_v2.py
import os
import re
import pandas as pd

# yfinance is required by this module: fail at import time with an actionable
# message instead of a NameError on first use. The original exception is
# chained as the cause so the real failure stays visible in the traceback.
# NOTE(review): `except Exception` also catches non-ImportError failures raised
# while importing yfinance; those are reported as "not installed" as well.
try:
    import yfinance as yf
except Exception as e:
    raise ImportError(
        "yfinance is not installed. Add `yfinance>=0.2.40` to requirements.txt."
    ) from e


def _ensure_dir(path: str) -> None:
    """
    Create directory `path` (including missing parents).
    An already existing directory is not an error.
    """
    os.makedirs(name=path, exist_ok=True)


def _ticker_for_query(t: str) -> str:
    """
    Normalize a user-supplied symbol for a yfinance lookup:
    - surrounding whitespace is removed
    - letters are upper-cased
    - '.' and '-' are kept untouched, since they are part of the symbol
      (exchange suffix / share class, e.g. NESN.SW, BRK-B)
    """
    trimmed = t.strip()
    normalized = trimmed.upper()
    return normalized


def _ticker_for_filename(t: str) -> str:
    """
    Turn a ticker into a filesystem-safe name:
    - every character outside [A-Za-z0-9] becomes '_'
    """
    unsafe_char = re.compile(r"[^A-Za-z0-9]")
    return unsafe_char.sub("_", t)


def update_ticker_csv(
    ticker: str,
    start: str = "2015-01-01",
    interval: str = "1d",
    dst_dir: str = "/mnt/data"
) -> str:
    """
    Download OHLCV for `ticker` using yfinance and save as CSV.

    Args:
        ticker: Symbol to download; it is stripped and upper-cased before the
            query, '.' and '-' are preserved (e.g. NESN.SW, BRK-B).
        start: Start date forwarded to `yf.download`.
        interval: Bar interval forwarded to `yf.download`; also part of the
            output file name.
        dst_dir: Output directory, created if it does not exist.

    Returns:
        The CSV file path: `<dst_dir>/<sanitized ticker>_<interval>.csv`.

    Raises:
        ValueError: If `ticker` is blank, or if yfinance returns no data.
    """
    tkr_query = _ticker_for_query(ticker)
    # Reject a blank symbol up front: no directory is created and no network
    # call is made for a request that cannot succeed.
    if not tkr_query:
        raise ValueError(
            "Ticker is empty. Provide a symbol, including the exchange suffix "
            "where needed (e.g., NESN.SW, BMW.DE, VOD.L)."
        )

    _ensure_dir(dst_dir)
    tkr_file = _ticker_for_filename(tkr_query)

    df = yf.download(
        tkr_query,
        start=start,
        interval=interval,
        auto_adjust=False,
        progress=False,
        threads=True,
    )

    if df is None or df.empty:
        raise ValueError(
            f"No data returned for ticker '{tkr_query}' (start={start}, interval={interval}). "
            "Check the symbol and exchange suffix (e.g., NESN.SW, BMW.DE, VOD.L)."
        )

    # Recent yfinance releases return MultiIndex columns (price field, ticker)
    # even for a single symbol, which would be written as stacked CSV header
    # rows. When the last level holds just one ticker it carries no
    # information, so drop it to keep the flat one-row header.
    if isinstance(df.columns, pd.MultiIndex):
        if df.columns.get_level_values(-1).nunique() == 1:
            df.columns = df.columns.droplevel(-1)
            df.columns.name = None

    # Ensure a clean Date index
    if not isinstance(df.index, pd.DatetimeIndex):
        df = df.reset_index()
        if "Date" in df.columns:
            df = df.set_index("Date")
        else:
            # No "Date" column: treat the first column (the former index)
            # as the date column.
            df.columns = ["Date"] + list(df.columns[1:])
            df = df.set_index("Date")

    # Intraday downloads may label the index differently; the CSV always
    # uses "Date" as the index column header.
    df.index.name = "Date"

    csv_path = os.path.join(dst_dir, f"{tkr_file}_{interval}.csv")
    df.to_csv(csv_path)
    return csv_path