NeerajCodz's picture
feat: full project — ML simulation, dashboard UI, models on HF Hub
f381be8
"""
src.data.loader
===============
Data loading utilities for the NASA PCoE Li-ion Battery Dataset.
This module handles:
- Loading and parsing ``metadata.csv`` (including MATLAB-format date vectors)
- Loading individual cycle CSV files (charge / discharge / impedance)
- Aggregating all discharge or charge cycles into a single DataFrame
- Loading impedance scalar features (Re, Rct) from metadata
Excluded batteries: B0049–B0052 (confirmed software crash / corrupt data).
"""
from __future__ import annotations
import ast
import re
from datetime import datetime
from pathlib import Path
from typing import Literal
import numpy as np
import pandas as pd
# ── Project paths ────────────────────────────────────────────────────────────
# Repository root: two directory levels above the folder holding this module.
PROJECT_ROOT = Path(__file__).resolve().parents[2]

# Output location for generated artifacts.
ARTIFACTS_DIR = PROJECT_ROOT.joinpath("artifacts")

# Cleaned dataset layout: one metadata table plus a folder of per-cycle CSVs.
DATASET_DIR = PROJECT_ROOT.joinpath("cleaned_dataset")
DATA_DIR = DATASET_DIR.joinpath("data")
METADATA_PATH = DATASET_DIR.joinpath("metadata.csv")
# ── Constants ────────────────────────────────────────────────────────────────
# Batteries dropped by default when loading metadata.
EXCLUDED_BATTERIES = {"B0049", "B0050", "B0051", "B0052"}

NOMINAL_CAPACITY_AH = 2.0
EOL_30PCT = 1.4  # 30 % fade → 1.4 Ah
EOL_20PCT = 1.6  # 20 % fade → 1.6 Ah

# Battery groups with their EOL thresholds.
# Built declaratively so no loop variable is left behind in the module
# namespace; key insertion order is the 30 % group followed by the 20 % group.
BATTERY_EOL_MAP: dict[str, float] = {
    **dict.fromkeys(
        (
            "B0005", "B0006", "B0007", "B0018",
            "B0025", "B0026", "B0027", "B0028",
            "B0029", "B0030", "B0031", "B0032",
            "B0041", "B0042", "B0043", "B0044",
            "B0045", "B0046", "B0047", "B0048",
            "B0053", "B0054", "B0055", "B0056",
        ),
        EOL_30PCT,
    ),
    **dict.fromkeys(
        (
            "B0033", "B0034", "B0036",
            "B0038", "B0039", "B0040",
        ),
        EOL_20PCT,
    ),
}
# ── MATLAB date-vector parser ───────────────────────────────────────────────
def _parse_matlab_datevec(s: str) -> datetime | None:
    """Parse a MATLAB-style date vector string into a Python datetime.

    Handles formats like:
        ``[2010. 7. 21. 15. 0. 35.093]``
        ``[2.008e+03, 4.000e+00, 2.000e+00, ...]``

    The first six numeric fields are read as year, month, day, hour, minute
    and second.  Year through minute are truncated to integers; the
    fractional part of the seconds field is kept with microsecond resolution
    instead of being discarded.

    Parameters
    ----------
    s : str
        Raw date-vector text.  Fields may be separated by commas and/or
        whitespace and may be wrapped in square brackets.

    Returns
    -------
    datetime or None
        The parsed timestamp, or ``None`` when *s* is not a string, is empty
        or ``"[]"``, has fewer than six fields, contains a non-numeric field,
        or describes an out-of-range date/time.
    """
    # Local import: the module header only brings in ``datetime`` itself.
    from datetime import timedelta

    if not isinstance(s, str) or s.strip() in ("", "[]"):
        return None
    try:
        # Drop the surrounding brackets, then split on any run of commas
        # and/or whitespace; empty tokens at the edges are ignored.
        inner = s.strip().strip("[]").strip()
        parts = [float(tok) for tok in re.split(r"[,\s]+", inner) if tok]
        if len(parts) < 6:
            return None
        yr, mo, dy, hr, mi, sc = parts[:6]
        whole_sc = int(sc)
        # The constructor validates every field using whole seconds.
        base = datetime(int(yr), int(mo), int(dy), int(hr), int(mi), whole_sc)
        # Re-attach the sub-second remainder; adding a timedelta also handles
        # a remainder that rounds up to a full second.
        return base + timedelta(microseconds=round((sc - whole_sc) * 1_000_000))
    except (ValueError, OverflowError):
        # ValueError: non-numeric token, NaN, or out-of-range date field.
        # OverflowError: infinite or absurdly large numeric field.
        return None
# ── Metadata ─────────────────────────────────────────────────────────────────
def load_metadata(
    *,
    exclude_corrupt: bool = True,
    parse_dates: bool = True,
) -> pd.DataFrame:
    """Read ``metadata.csv`` into a DataFrame and normalise its columns.

    The ``Capacity``, ``Re`` and ``Rct`` columns are always coerced to
    numeric, with unparsable entries (such as ``'[]'`` or blanks) becoming
    NaN.

    Parameters
    ----------
    exclude_corrupt : bool
        When True, rows whose ``battery_id`` is in ``EXCLUDED_BATTERIES``
        are removed and the index is renumbered.
    parse_dates : bool
        When True, a ``datetime`` column is added by parsing the raw
        ``start_time`` field as a MATLAB date vector.

    Returns
    -------
    pd.DataFrame
        The metadata table, one row per test/cycle.
    """
    frame = pd.read_csv(METADATA_PATH)

    # Anything that is not a plain number turns into NaN.
    for column in ("Capacity", "Re", "Rct"):
        frame[column] = pd.to_numeric(frame[column], errors="coerce")

    if exclude_corrupt:
        is_corrupt = frame["battery_id"].isin(EXCLUDED_BATTERIES)
        frame = frame[~is_corrupt].reset_index(drop=True)

    if parse_dates:
        frame["datetime"] = frame["start_time"].apply(_parse_matlab_datevec)

    return frame
# ── Individual cycle data ────────────────────────────────────────────────────
def load_cycle_csv(uid: int | str) -> pd.DataFrame:
    """Read the time-series CSV belonging to one cycle.

    Parameters
    ----------
    uid : int or str
        Global unique ID of the cycle.  It is converted to an integer and
        zero-padded to five digits to form the file name, e.g. 1 →
        ``00001.csv``.

    Returns
    -------
    pd.DataFrame
        The raw contents of that cycle's CSV file.

    Raises
    ------
    FileNotFoundError
        If no file with that name exists in the data directory.
    """
    csv_path = DATA_DIR / "{:05d}.csv".format(int(uid))
    if csv_path.exists():
        return pd.read_csv(csv_path)
    raise FileNotFoundError(f"Cycle CSV not found: {csv_path}")
# ── Aggregated cycle loading ─────────────────────────────────────────────────
def load_all_cycles(
    cycle_type: Literal["discharge", "charge", "impedance"],
    *,
    exclude_corrupt: bool = True,
    max_batteries: int | None = None,
    verbose: bool = True,
) -> pd.DataFrame:
    """Load and concatenate all cycles of a given type across all batteries.

    Every per-cycle frame gets ``battery_id``, ``test_id``, ``uid`` and
    ``cycle_number`` (0-based per battery for this cycle type) columns.
    Discharge cycles additionally get ``Capacity``; impedance cycles get
    ``Re`` and ``Rct``.  All of these values come from the metadata row.

    Cycles whose CSV file is missing are skipped.  If any were skipped, a
    single ``RuntimeWarning`` summarising them is emitted after loading so
    that an incomplete result does not go unnoticed.

    Parameters
    ----------
    cycle_type : {"discharge", "charge", "impedance"}
        Value matched against the metadata ``type`` column.
    exclude_corrupt : bool
        Forwarded to ``load_metadata``.
    max_batteries : int or None
        Limit number of batteries processed (useful for debugging).  The
        first ``max_batteries`` IDs in metadata order are kept.
    verbose : bool
        When True, show a ``tqdm`` progress bar.  ``tqdm`` is only imported
        in that case, so it is not required for non-verbose use.

    Returns
    -------
    pd.DataFrame
        Concatenated time-series data with metadata columns appended, or an
        empty DataFrame when no cycle file could be loaded.
    """
    meta = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=False)
    subset = meta[meta["type"] == cycle_type].copy()

    if max_batteries is not None:
        keep_bats = subset["battery_id"].unique()[:max_batteries]
        subset = subset[subset["battery_id"].isin(keep_bats)]

    # Assign cycle_number per battery within this type.
    subset = subset.sort_values(["battery_id", "test_id"]).reset_index(drop=True)
    subset["cycle_number"] = subset.groupby("battery_id").cumcount()

    rows = subset.iterrows()
    if verbose:
        # Imported lazily so the progress-bar dependency stays optional.
        from tqdm import tqdm

        rows = tqdm(rows, total=len(subset), desc=f"Loading {cycle_type}")

    frames: list[pd.DataFrame] = []
    missing_uids: list = []
    for _, row in rows:
        try:
            df = load_cycle_csv(row["uid"])
        except FileNotFoundError:
            # Best-effort: keep going, but remember what was skipped.
            missing_uids.append(row["uid"])
            continue
        df["battery_id"] = row["battery_id"]
        df["test_id"] = row["test_id"]
        df["uid"] = row["uid"]
        df["cycle_number"] = row["cycle_number"]
        if cycle_type == "discharge":
            df["Capacity"] = row["Capacity"]
        if cycle_type == "impedance":
            df["Re"] = row["Re"]
            df["Rct"] = row["Rct"]
        frames.append(df)

    if missing_uids:
        import warnings

        preview = ", ".join(str(u) for u in missing_uids[:5])
        warnings.warn(
            f"{len(missing_uids)} {cycle_type} cycle CSV file(s) listed in "
            f"metadata were not found and were skipped (first UIDs: {preview})",
            RuntimeWarning,
            stacklevel=2,
        )

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
def load_discharge_capacities(
    *,
    exclude_corrupt: bool = True,
    drop_zero: bool = True,
) -> pd.DataFrame:
    """Build a compact per-cycle table of discharge capacities.

    Only the metadata table is read — no individual cycle CSVs — which makes
    this much cheaper than ``load_all_cycles("discharge")``.

    Parameters
    ----------
    exclude_corrupt : bool
        Forwarded to ``load_metadata``.
    drop_zero : bool
        When True, keep only rows whose ``Capacity`` is strictly positive
        (rows with a missing capacity are dropped as well).

    Returns
    -------
    pd.DataFrame
        Columns ``battery_id``, ``cycle_number`` (0-based per battery),
        ``Capacity``, ``ambient_temperature`` and, when present in the
        metadata, ``datetime``.
    """
    metadata = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=True)

    discharge = metadata[metadata["type"] == "discharge"].copy()
    discharge = discharge.sort_values(["battery_id", "test_id"])
    discharge = discharge.reset_index(drop=True)
    discharge["cycle_number"] = discharge.groupby("battery_id").cumcount()

    wanted = ["battery_id", "cycle_number", "Capacity", "ambient_temperature"]
    if "datetime" in discharge.columns:
        wanted = wanted + ["datetime"]

    compact = discharge[wanted].copy()
    if drop_zero:
        is_positive = compact["Capacity"] > 0
        compact = compact[is_positive].dropna(subset=["Capacity"])

    return compact.reset_index(drop=True)
def load_impedance_scalars(*, exclude_corrupt: bool = True) -> pd.DataFrame:
    """Collect the Re / Rct values of every impedance test (metadata only).

    Parameters
    ----------
    exclude_corrupt : bool
        Forwarded to ``load_metadata``.

    Returns
    -------
    pd.DataFrame
        Columns ``battery_id``, ``cycle_number`` (0-based per battery among
        impedance tests), ``Re``, ``Rct``, ``ambient_temperature`` and, when
        present in the metadata, ``datetime``.  Rows missing ``Re`` or
        ``Rct`` are dropped.
    """
    metadata = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=True)

    impedance = metadata[metadata["type"] == "impedance"].copy()
    impedance = impedance.sort_values(["battery_id", "test_id"])
    impedance = impedance.reset_index(drop=True)
    # Numbered before incomplete rows are dropped, so numbering may have gaps.
    impedance["cycle_number"] = impedance.groupby("battery_id").cumcount()

    wanted = ["battery_id", "cycle_number", "Re", "Rct", "ambient_temperature"]
    if "datetime" in impedance.columns:
        wanted = wanted + ["datetime"]

    complete = impedance[wanted].dropna(subset=["Re", "Rct"])
    return complete.reset_index(drop=True)
def get_battery_ids(*, exclude_corrupt: bool = True) -> list[str]:
    """List every distinct battery ID found in the metadata, sorted ascending.

    Parameters
    ----------
    exclude_corrupt : bool
        Forwarded to ``load_metadata``.
    """
    metadata = load_metadata(parse_dates=False, exclude_corrupt=exclude_corrupt)
    unique_ids = metadata["battery_id"].unique().tolist()
    unique_ids.sort()
    return unique_ids
def get_eol_threshold(battery_id: str) -> float:
    """Look up the end-of-life capacity threshold for *battery_id*.

    Batteries without an entry in ``BATTERY_EOL_MAP`` fall back to
    ``EOL_30PCT``.
    """
    try:
        return BATTERY_EOL_MAP[battery_id]
    except KeyError:
        return EOL_30PCT