Spaces:

VincentCroft
/

FaultDetectionDeepLearning

Sleeping

App Files Files Community

VincentCroft committed on Sep 22

Commit

c4598a9

1 Parent(s): 948686f

Add PMU fault training pipeline and improve Gradio app

Browse files

Files changed (3) hide show

app.py +377 -140
fault_classification_pmu.py +358 -0
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -1,161 +1,398 @@
-# lstm_cnn_app.py (modified)
-"""
-Robust Gradio app for CNN-LSTM fault classification.
-Features added:
-- Prefer local model file; optionally download from Hugging Face Hub if HUB_REPO/HUB_FILENAME set.
-- If no model found, app still starts but prediction functions return friendly message.
-- Port selection:
-    * If GRADIO_SERVER_PORT or PORT env var is set, try that.
-    * Otherwise find a free ephemeral port and use it.
-    * If binding fails, fall back to demo.launch() with no explicit port (Gradio picks).
-- Reduces TF logging noise via TF_CPP_MIN_LOG_LEVEL (optional).
 """
 import os
 import socket
 import numpy as np
 import pandas as pd
-import gradio as gr
-from tensorflow.keras.models import load_model
 from huggingface_hub import hf_hub_download
-# Reduce TensorFlow log noise (keeps warnings but hides info/debug)
 os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")
-# CONFIG: change these if your model filename/repo are different
-LOCAL_MODEL_FILE = "lstm_cnn_model.h5"
-HUB_REPO = ""         # e.g., "username/lstm-cnn-model" (leave empty to disable)
-HUB_FILENAME = ""     # e.g., "lstm_cnn_model.h5"
-def download_from_hub(repo: str, filename: str):
     try:
-        print(f"Downloading {filename} from {repo} ...")
-        path = hf_hub_download(repo_id=repo, filename=filename)
-        print("Downloaded to:", path)
-        return path
-    except Exception as e:
-        print("Failed to download from hub:", e)
         return None
-def get_model_path():
-    # Prefer local file
-    if os.path.exists(LOCAL_MODEL_FILE):
-        return LOCAL_MODEL_FILE
-    # Try env override for local path (handy in Spaces)
-    alt = os.environ.get("MODEL_FILE_PATH")
-    if alt and os.path.exists(alt):
-        return alt
-    # Try hub
-    if HUB_REPO and HUB_FILENAME:
-        return download_from_hub(HUB_REPO, HUB_FILENAME)
-    return None
-def try_load_model(path):
     try:
-        m = load_model(path)
-        print("Loaded model:", path)
-        return m
-    except Exception as e:
-        print("Failed to load model:", e)
         return None
-MODEL_PATH = get_model_path()
-MODEL = try_load_model(MODEL_PATH) if MODEL_PATH else None
-def prepare_input_array(arr, n_timesteps=1, n_features=None):
-    arr = np.array(arr)
-    if arr.ndim == 1:
-        if n_features is None:
-            # assume arr is flattened timesteps*features
-            return arr.reshape(1, n_timesteps, -1)
-        return arr.reshape(1, n_timesteps, int(n_features))
-    elif arr.ndim == 2:
-        # treat as (timesteps, features) -> add batch dim
-        if arr.shape[0] == 1:
-            return arr.reshape(1, arr.shape[1], -1)
-        return arr
-    else:
-        return arr
-def predict_text(text, n_timesteps=1, n_features=None):
-    if MODEL is None:
-        return "模型未加载。请上传 'lstm_cnn_model.h5' 到 Space 根目录，或设置 HUB_REPO/HUB_FILENAME。"
-    try:
-        arr = np.fromstring(text, sep=',')
-        x = prepare_input_array(arr, n_timesteps=int(n_timesteps), n_features=(int(n_features) if n_features else None))
-        probs = MODEL.predict(x)
-        label = int(np.argmax(probs, axis=1)[0])
-        return f"预测类别: {label} (概率: {float(np.max(probs)):.4f})"
-    except Exception as e:
-        return f"预测失败: {e}"
-def predict_csv(file, n_timesteps=1, n_features=None):
-    if MODEL is None:
-        return {"error": "模型未加载。请上传 'lstm_cnn_model.h5' 到 Space 根目录，或设置 HUB_REPO/HUB_FILENAME。"}
     try:
-        df = pd.read_csv(file.name)
-        X = df.values
-        if n_features:
-            X = X.reshape(X.shape[0], int(n_timesteps), int(n_features))
-        preds = MODEL.predict(X)
-        labels = preds.argmax(axis=1).tolist()
-        return {"labels": labels, "probs": preds.tolist()}
-    except Exception as e:
-        return {"error": f"预测失败: {e}"}
-# Gradio UI
-with gr.Blocks() as demo:
-    gr.Markdown("# CNN-LSTM Fault Classification")
-    if MODEL is None:
-        gr.Markdown("**注意**：未检测到模型 (lstm_cnn_model.h5)。请上传模型或在代码中设置 HUB_REPO/HUB_FILENAME。应用仍会启动，但预测不可用。")
-    else:
-        gr.Markdown("模型已加载，可以上传 CSV 或粘贴逗号分隔的一行特征进行预测。")
-    with gr.Row():
-        file_in = gr.File(label="上传 CSV（每行 = 一个样本）")
-        text_in = gr.Textbox(lines=2, placeholder="粘贴逗号分隔的一行特征，例如: 0.1,0.2,0.3,...")
-    n_ts = gr.Number(value=1, label="timesteps (整型)")
-    n_feat = gr.Number(value=None, label="features (可选，留空尝试自动推断)")
-    btn = gr.Button("预测")
-    out_text = gr.Textbox(label="单样本预测输出")
-    out_json = gr.JSON(label="批量预测结果 (labels & probs)")
-    def run_predict(file, text, n_timesteps, n_features):
-        if file is not None:
-            return "CSV 预测完成", predict_csv(file, n_timesteps, n_features)
-        if text:
-            return predict_text(text, n_timesteps, n_features), {}
-        return "请提供 CSV 或特征文本", {}
-    btn.click(run_predict, inputs=[file_in, text_in, n_ts, n_feat], outputs=[out_text, out_json])
-# Robust port selection & launch helper
-def find_free_port():
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-    s.bind(('', 0))
-    addr, port = s.getsockname()
-    s.close()
-    return port
-def get_desired_port():
-    # priority: GRADIO_SERVER_PORT -> PORT -> auto find
-    p = os.environ.get("GRADIO_SERVER_PORT") or os.environ.get("PORT")
-    if p:
-        try:
-            return int(p)
-        except:
-            pass
-    # fallback to ephemeral free port
     return find_free_port()
-if __name__ == '__main__':
-    port = None
     try:
-        port = get_desired_port()
-        print(f"Launching server on port {port} (server_name=0.0.0.0)")
-        demo.launch(server_name='0.0.0.0', server_port=port)
-    except OSError as e:
-        print("Failed to bind requested port:", e)
-        print("Falling back to default demo.launch() (no explicit port).")
-        # last fallback: let Gradio choose/handle
         demo.launch()

+"""Gradio front-end for Fault_Classification_PMU_Data models.
+The application loads a CNN-LSTM model (and accompanying scaler/metadata)
+produced by ``fault_classification_pmu.py`` and exposes a streamlined
+prediction interface optimised for Hugging Face Spaces deployment.  It supports
+raw PMU time-series CSV uploads as well as manual comma separated feature
+vectors.
 """
+from __future__ import annotations
+import json
+import os
+import re
+import socket
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Tuple
+# Reduce TensorFlow log verbosity.  This has to run before TensorFlow is first
+# imported (the ``tensorflow.keras`` import below): messages emitted while the
+# library loads are filtered by the value present at that moment, so setting
+# the variable after the import is too late to silence them.
+os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")
+import gradio as gr
+import joblib
+import numpy as np
+import pandas as pd
+from huggingface_hub import hf_hub_download
+from tensorflow.keras.models import load_model
+# --------------------------------------------------------------------------------------
+# Configuration
+# --------------------------------------------------------------------------------------
+# Feature column names used when the metadata file does not provide
+# "feature_columns".  The order matters: text input and pre-windowed CSV rows
+# are reshaped positionally using the length of this list.
+DEFAULT_FEATURE_COLUMNS: List[str] = [
+    "[325] UPMU_SUB22:FREQ",
+    "[326] UPMU_SUB22:DFDT",
+    "[327] UPMU_SUB22:FLAG",
+    "[328] UPMU_SUB22-L1:MAG",
+    "[329] UPMU_SUB22-L1:ANG",
+    "[330] UPMU_SUB22-L2:MAG",
+    "[331] UPMU_SUB22-L2:ANG",
+    "[332] UPMU_SUB22-L3:MAG",
+    "[333] UPMU_SUB22-L3:ANG",
+    "[334] UPMU_SUB22-C1:MAG",
+    "[335] UPMU_SUB22-C1:ANG",
+    "[336] UPMU_SUB22-C2:MAG",
+    "[337] UPMU_SUB22-C2:ANG",
+    "[338] UPMU_SUB22-C3:MAG",
+    "[339] UPMU_SUB22-C3:ANG",
+]
+# Window length and stride used when the metadata file omits them.
+DEFAULT_SEQUENCE_LENGTH = 32
+DEFAULT_STRIDE = 4
+# Artifact file names checked first on the local filesystem; each one can be
+# changed through the environment variable named in the call.
+LOCAL_MODEL_FILE = os.environ.get("PMU_MODEL_FILE", "pmu_cnn_lstm_model.keras")
+LOCAL_SCALER_FILE = os.environ.get("PMU_SCALER_FILE", "pmu_feature_scaler.pkl")
+LOCAL_METADATA_FILE = os.environ.get("PMU_METADATA_FILE", "pmu_metadata.json")
+# Optional Hugging Face Hub repository; an empty value disables downloads.
+# The Hub file names default to the local file names above.
+HUB_REPO = os.environ.get("PMU_HUB_REPO", "")
+HUB_MODEL_FILENAME = os.environ.get("PMU_HUB_MODEL_FILENAME", LOCAL_MODEL_FILE)
+HUB_SCALER_FILENAME = os.environ.get("PMU_HUB_SCALER_FILENAME", LOCAL_SCALER_FILE)
+HUB_METADATA_FILENAME = os.environ.get("PMU_HUB_METADATA_FILENAME", LOCAL_METADATA_FILE)
+# Names (not values) of the environment variables that may hold an alternative
+# filesystem path for each artifact.
+ENV_MODEL_PATH = "PMU_MODEL_PATH"
+ENV_SCALER_PATH = "PMU_SCALER_PATH"
+ENV_METADATA_PATH = "PMU_METADATA_PATH"
+# --------------------------------------------------------------------------------------
+# Utility functions for loading artifacts
+# --------------------------------------------------------------------------------------
+def download_from_hub(filename: str) -> Optional[Path]:
+    """Fetch ``filename`` from the configured Hub repository.
+
+    Returns the local path of the downloaded file, or ``None`` when no
+    repository/file name is configured or the download fails (the failure is
+    printed, never raised).
+    """
+    if not (HUB_REPO and filename):
+        return None
+    try:
+        print(f"Downloading {filename} from {HUB_REPO} ...")
+        local_copy = hf_hub_download(repo_id=HUB_REPO, filename=filename)
+        print("Downloaded", local_copy)
+        result: Optional[Path] = Path(local_copy)
+    except Exception as exc:  # pragma: no cover - logging convenience
+        print("Failed to download", filename, "from", HUB_REPO, ":", exc)
+        result = None
+    return result
+def resolve_artifact(local_name: str, env_var: str, hub_filename: str) -> Optional[Path]:
+    """Locate an artifact on disk, falling back to a Hub download.
+
+    The default local file name is tried first, then the path stored in the
+    environment variable ``env_var``.  If neither exists the result of
+    ``download_from_hub(hub_filename)`` is returned (which may be ``None``).
+    """
+    search_paths: List[Path] = []
+    if local_name:
+        search_paths.append(Path(local_name))
+    override = os.environ.get(env_var)
+    if override:
+        search_paths.append(Path(override))
+    existing = next((p for p in search_paths if p.exists()), None)
+    if existing is not None:
+        return existing
+    return download_from_hub(hub_filename)
+def load_metadata(path: Optional[Path]) -> Dict:
+    """Read the training metadata JSON file.
+
+    Parameters
+    ----------
+    path:
+        Location of the metadata file, or ``None`` when it could not be found.
+
+    Returns
+    -------
+    dict
+        The parsed JSON object.  An empty dict is returned when the file is
+        missing, unreadable, not valid JSON, or does not contain a JSON object
+        at the top level, so callers can always use ``.get`` on the result.
+    """
+    if path is None or not path.exists():
+        return {}
+    try:
+        # The training script writes JSON text; pin the encoding so the result
+        # does not depend on the platform's default locale.
+        payload = json.loads(path.read_text(encoding="utf-8"))
+    except (OSError, ValueError) as exc:  # pragma: no cover - metadata parsing errors
+        print("Failed to read metadata", path, exc)
+        return {}
+    if not isinstance(payload, dict):
+        # A list or scalar would break every later ``METADATA.get(...)`` call.
+        print("Ignoring metadata", path, "- expected a JSON object, got", type(payload).__name__)
+        return {}
+    return payload
+def try_load_model(path: Optional[Path]):
+    """Load the Keras model stored at ``path``.
+
+    Returns the loaded model, or ``None`` when ``path`` is empty or loading
+    raises (the error is printed instead of propagated).
+    """
+    if path:
+        try:
+            loaded = load_model(path)
+            print("Loaded model from", path)
+            return loaded
+        except Exception as exc:  # pragma: no cover - runtime diagnostics
+            print("Failed to load model", path, exc)
+    return None
+def try_load_scaler(path: Optional[Path]):
+    """Load the fitted feature scaler stored at ``path``.
+
+    Returns the deserialised object, or ``None`` when ``path`` is empty or
+    loading raises (the error is printed instead of propagated).
+    """
+    if path:
+        try:
+            # NOTE(review): joblib files are pickle-based, so loading one runs
+            # arbitrary code from the file; only point this at trusted artifacts.
+            loaded = joblib.load(path)
+            print("Loaded scaler from", path)
+            return loaded
+        except Exception as exc:
+            print("Failed to load scaler", path, exc)
+    return None
+# Resolve and load the three artifacts at import time.  Each value is ``None``
+# (or an empty dict for METADATA) when the artifact could not be obtained, so
+# the UI can still start and report the problem.
+MODEL_PATH = resolve_artifact(LOCAL_MODEL_FILE, ENV_MODEL_PATH, HUB_MODEL_FILENAME)
+SCALER_PATH = resolve_artifact(LOCAL_SCALER_FILE, ENV_SCALER_PATH, HUB_SCALER_FILENAME)
+METADATA_PATH = resolve_artifact(LOCAL_METADATA_FILE, ENV_METADATA_PATH, HUB_METADATA_FILENAME)
+MODEL = try_load_model(MODEL_PATH)
+SCALER = try_load_scaler(SCALER_PATH)
+METADATA = load_metadata(METADATA_PATH)
+# Settings taken from the metadata file, with module defaults as fallback.
+FEATURE_COLUMNS: List[str] = METADATA.get("feature_columns", DEFAULT_FEATURE_COLUMNS)
+LABEL_CLASSES: List[str] = [str(label) for label in METADATA.get("label_classes", [])]
+LABEL_COLUMN: str = METADATA.get("label_column", "Fault")
+SEQUENCE_LENGTH: int = int(METADATA.get("sequence_length", DEFAULT_SEQUENCE_LENGTH))
+DEFAULT_WINDOW_STRIDE: int = int(METADATA.get("stride", DEFAULT_STRIDE))
+# Without label names in the metadata, fall back to numeric class names sized
+# from the model's last output dimension.
+if MODEL is not None and not LABEL_CLASSES:
+    LABEL_CLASSES = [str(i) for i in range(MODEL.output_shape[-1])]
+# --------------------------------------------------------------------------------------
+# Pre-processing helpers
+# --------------------------------------------------------------------------------------
+def ensure_ready():
+    """Raise ``RuntimeError`` unless both the model and the scaler are loaded."""
+    if MODEL is not None and SCALER is not None:
+        return
+    raise RuntimeError(
+        "模型或特征缩放器未加载。请将 pmu_cnn_lstm_model.keras 和 pmu_feature_scaler.pkl "
+        "上传到 Space，或设置相关的 Hugging Face Hub 配置。"
+    )
+def parse_text_features(text: str) -> np.ndarray:
+    """Parse a delimited string of numbers into a float32 vector.
+
+    Commas, semicolons and any whitespace (including newlines and tabs) are
+    accepted as separators; empty fields such as a trailing comma are ignored.
+
+    Raises
+    ------
+    ValueError
+        If no value is found, or if any field is not a valid number.  Every
+        field is checked so malformed input is rejected instead of being
+        silently truncated at the first bad token.
+    """
+    tokens = [tok for tok in re.split(r"[,;\s]+", text.strip()) if tok]
+    if not tokens:
+        raise ValueError("未解析到任何特征值，请输入以逗号分隔的数字。")
+    values: List[float] = []
+    for tok in tokens:
+        try:
+            values.append(float(tok))
+        except ValueError:
+            raise ValueError(f"无法将 {tok!r} 解析为数字，请输入以逗号分隔的数字。") from None
+    return np.asarray(values, dtype=np.float32)
+def apply_scaler(sequences: np.ndarray) -> np.ndarray:
+    """Scale ``sequences`` with the loaded scaler, keeping the input shape.
+
+    The array is flattened to rows of ``shape[-1]`` features for the scaler
+    and reshaped back afterwards.  Returned unchanged when no scaler is loaded.
+    """
+    if SCALER is None:
+        return sequences
+    original_shape = sequences.shape
+    as_rows = sequences.reshape(-1, original_shape[-1])
+    return SCALER.transform(as_rows).reshape(original_shape)
+def make_sliding_windows(data: np.ndarray, sequence_length: int, stride: int) -> np.ndarray:
+    """Cut ``data`` into overlapping windows along its first axis.
+
+    Parameters
+    ----------
+    data:
+        Array whose first axis is time (rows).
+    sequence_length:
+        Number of consecutive rows per window; must be positive.
+    stride:
+        Row offset between the starts of consecutive windows; must be positive.
+
+    Returns
+    -------
+    np.ndarray
+        Stacked windows with a new leading axis, i.e. shape
+        ``(n_windows, sequence_length, *data.shape[1:])``.
+
+    Raises
+    ------
+    ValueError
+        If ``sequence_length`` or ``stride`` is not positive, or if ``data``
+        has fewer rows than ``sequence_length``.
+    """
+    # Reject these up front: a zero/negative stride would otherwise surface as
+    # an unrelated ``range``/``np.stack`` error, and a zero length would yield
+    # empty windows.
+    if sequence_length <= 0 or stride <= 0:
+        raise ValueError(
+            f"序列长度 ({sequence_length}) 与步长 ({stride}) 必须为正整数。"
+        )
+    n_rows = data.shape[0]
+    if n_rows < sequence_length:
+        raise ValueError(
+            f"数据行数 ({data.shape[0]}) 小于序列长度 ({sequence_length})，无法创建窗口。"
+        )
+    starts = range(0, n_rows - sequence_length + 1, stride)
+    return np.stack([data[start : start + sequence_length] for start in starts])
+def dataframe_to_sequences(
+    df: pd.DataFrame,
+    *,
+    sequence_length: int,
+    stride: int,
+    feature_columns: Sequence[str],
+    drop_label: bool = True,
+) -> np.ndarray:
+    """Convert an uploaded table into model-ready windows.
+
+    Two layouts are accepted:
+
+    * raw measurements containing every name in ``feature_columns`` - rows are
+      ordered by ``Timestamp`` when that column exists and then cut into
+      sliding windows;
+    * pre-windowed rows whose numeric columns number exactly
+      ``len(feature_columns) * sequence_length`` - each row is reshaped into
+      one window.
+
+    Raises ``ValueError`` when the table matches neither layout.
+    """
+    frame = df.copy()
+    if drop_label and LABEL_COLUMN in frame.columns:
+        frame = frame.drop(columns=[LABEL_COLUMN])
+    if "Timestamp" in frame.columns:
+        frame = frame.sort_values("Timestamp")
+    n_features = len(feature_columns)
+    present = [name for name in feature_columns if name in frame.columns]
+    if present and len(present) == n_features:
+        raw = frame[present].astype(np.float32).to_numpy()
+        return make_sliding_windows(raw, sequence_length, stride)
+    # Fallback: treat each row as one flattened window.  The single-timestep
+    # case (one row == one feature vector) satisfies the same width check.
+    flat = frame.select_dtypes(include=[np.number]).astype(np.float32).to_numpy()
+    n_rows, n_cols = flat.shape
+    if n_cols != n_features * sequence_length:
+        raise ValueError(
+            "CSV 列与预期特征不匹配。请包含完整的 PMU 特征列，或提供整形后的窗口数据。"
+        )
+    return flat.reshape(n_rows, sequence_length, n_features)
+def label_name(index: int) -> str:
+    """Return the class label for ``index``, or ``class_<index>`` if unknown."""
+    if index < 0 or index >= len(LABEL_CLASSES):
+        return f"class_{index}"
+    return str(LABEL_CLASSES[index])
+def format_predictions(probabilities: np.ndarray) -> pd.DataFrame:
+    """Summarise per-window class probabilities as a table.
+
+    One row per window with the most likely label, its probability rounded to
+    four decimals, and a ``" | "``-joined list of the three most likely labels
+    with percentages.
+    """
+    # Class indices per window, most probable first.
+    ranked = np.argsort(probabilities, axis=1)[:, ::-1]
+    records: List[Dict[str, object]] = []
+    for window_idx, prob_row in enumerate(probabilities):
+        ranking = ranked[window_idx]
+        best = int(ranking[0])
+        leaders = " | ".join(
+            f"{label_name(i)} ({prob_row[i]*100:.2f}%)" for i in ranking[:3]
+        )
+        records.append(
+            {
+                "window": window_idx,
+                "predicted_label": label_name(best),
+                "confidence": round(float(prob_row[best]), 4),
+                "top3": leaders,
+            }
+        )
+    return pd.DataFrame(records)
+def probabilities_to_json(probabilities: np.ndarray) -> List[Dict[str, object]]:
+    """Convert per-window probabilities into JSON-serialisable records.
+
+    Each record holds the window index and a mapping from label name to the
+    probability as a plain ``float``.
+    """
+    return [
+        {
+            "window": int(window_idx),
+            "probabilities": {
+                label_name(class_idx): float(prob_row[class_idx])
+                for class_idx in range(prob_row.shape[0])
+            },
+        }
+        for window_idx, prob_row in enumerate(probabilities)
+    ]
+def predict_sequences(sequences: np.ndarray) -> Tuple[str, pd.DataFrame, List[Dict[str, object]]]:
+    """Scale the windows, run the model and package the results.
+
+    Returns a status message, the summary table and the per-window probability
+    records.  Raises ``RuntimeError`` (via ``ensure_ready``) when the model or
+    scaler is missing.
+    """
+    ensure_ready()
+    scaled = apply_scaler(sequences.astype(np.float32))
+    probs = MODEL.predict(scaled, verbose=0)
+    summary = format_predictions(probs)
+    records = probabilities_to_json(probs)
+    return f"共生成 {len(scaled)} 个窗口，模型输出维度 {probs.shape[1]}.", summary, records
+def predict_from_text(text: str, sequence_length: int) -> Tuple[str, pd.DataFrame, List[Dict[str, object]]]:
+    """Classify a single window pasted as delimited text.
+
+    The parsed values must form exactly ``sequence_length`` timesteps of
+    ``len(FEATURE_COLUMNS)`` features each; otherwise ``ValueError`` is raised.
+    """
+    values = parse_text_features(text)
+    n_features = len(FEATURE_COLUMNS)
+    timesteps, leftover = divmod(values.size, n_features)
+    if leftover != 0:
+        raise ValueError(
+            f"输入特征数量 {values.size} 不是特征维度 {n_features} 的整数倍。请按照 {n_features} 个特征为一组输入。"
+        )
+    if timesteps != sequence_length:
+        raise ValueError(
+            f"检测到 {timesteps} 个时间步，与当前设置的序列长度 {sequence_length} 不一致。"
+        )
+    window = values.reshape(1, sequence_length, n_features)
+    inner_status, table, probs = predict_sequences(window)
+    return f"单窗口预测完成。{inner_status}", table, probs
+def predict_from_csv(file_obj, sequence_length: int, stride: int) -> Tuple[str, pd.DataFrame, List[Dict[str, object]]]:
+    """Classify every window extracted from an uploaded CSV file.
+
+    Parameters
+    ----------
+    file_obj:
+        The upload as delivered by the file component: either a filesystem
+        path (``str`` / ``os.PathLike``) or an object exposing the path through
+        a ``name`` attribute.
+    sequence_length, stride:
+        Windowing parameters forwarded to ``dataframe_to_sequences``.
+
+    Returns the status message, summary table and probability records from
+    ``predict_sequences``.
+    """
+    # Check for path-like values first: ``pathlib.Path.name`` is only the base
+    # name, so blindly reading ``.name`` would lose the directory.
+    if isinstance(file_obj, (str, os.PathLike)):
+        csv_path = file_obj
+    else:
+        csv_path = file_obj.name
+    df = pd.read_csv(csv_path)
+    sequences = dataframe_to_sequences(
+        df,
+        sequence_length=sequence_length,
+        stride=stride,
+        feature_columns=FEATURE_COLUMNS,
+    )
+    status, table, probs = predict_sequences(sequences)
+    status = f"CSV 处理完成，生成 {len(sequences)} 个窗口。{status}"
+    return status, table, probs
+# --------------------------------------------------------------------------------------
+# Gradio interface
+# --------------------------------------------------------------------------------------
+def build_interface() -> gr.Blocks:
+    """Assemble the Gradio Blocks UI and wire the prediction callback.
+
+    The layout shows a readiness notice, a collapsible feature description,
+    the CSV/text inputs, the window-length and stride sliders, and three
+    outputs (status text, result table, per-window probabilities).
+    """
+    with gr.Blocks(title="Fault Classification - PMU Data") as demo:
+        gr.Markdown("# Fault Classification (PMU 数据)")
+        # Tell the user up front whether predictions can work at all.
+        if MODEL is None or SCALER is None:
+            gr.Markdown(
+                "⚠️ **模型或缩放器未准备好。** 上传 `pmu_cnn_lstm_model.keras`、"
+                "`pmu_feature_scaler.pkl` 与 `pmu_metadata.json` 至 Space 根目录，或配置环境变量以从 Hugging Face Hub 自动下载。"
+            )
+        else:
+            gr.Markdown(
+                "模型、特征缩放器与元数据均已加载。可以上传原始 PMU CSV 数据，或粘贴单个时间窗口的特征向量进行推理。"
+            )
+        with gr.Accordion("特征说明", open=False):
+            gr.Markdown(
+                f"输入窗口按以下特征顺序排列 (每个时间步共 {len(FEATURE_COLUMNS)} 个特征):\n"
+                + "\n".join(f"- {name}" for name in FEATURE_COLUMNS)
+            )
+            gr.Markdown(
+                f"训练时使用的窗口长度默认为 **{SEQUENCE_LENGTH}**，滑动步长默认为 **{DEFAULT_WINDOW_STRIDE}**。"
+            )
+        with gr.Row():
+            file_in = gr.File(label="上传 PMU CSV", file_types=[".csv"])
+            text_in = gr.Textbox(
+                lines=4,
+                label="或粘贴单个窗口的逗号分隔特征",
+                placeholder="49.97772,1.215825E-38,...",
+            )
+        with gr.Row():
+            # Slider ranges are derived from the configured sequence length;
+            # max(1, ...) keeps them valid even for degenerate metadata.
+            sequence_length_input = gr.Slider(
+                minimum=1,
+                maximum=max(1, SEQUENCE_LENGTH * 2),
+                step=1,
+                value=SEQUENCE_LENGTH,
+                label="序列长度 (timesteps)",
+            )
+            stride_input = gr.Slider(
+                minimum=1,
+                maximum=max(1, SEQUENCE_LENGTH),
+                step=1,
+                value=max(1, DEFAULT_WINDOW_STRIDE),
+                label="CSV 滑动窗口步长",
+            )
+        predict_btn = gr.Button("执行预测", variant="primary")
+        status_out = gr.Textbox(label="状态", interactive=False)
+        table_out = gr.Dataframe(headers=["window", "predicted_label", "confidence", "top3"], label="预测结果", interactive=False)
+        probs_out = gr.JSON(label="各窗口概率分布")
+        def _run_prediction(file_obj, text, sequence_length, stride):
+            # Click handler: an uploaded CSV takes precedence over pasted text.
+            # Any exception is turned into a status message with empty outputs.
+            sequence_length = int(sequence_length)
+            stride = int(stride)
+            try:
+                if file_obj is not None:
+                    return predict_from_csv(file_obj, sequence_length, stride)
+                if text and text.strip():
+                    return predict_from_text(text, sequence_length)
+                return "请上传 CSV 或输入文本特征。", pd.DataFrame(), []
+            except Exception as exc:
+                return f"预测失败: {exc}", pd.DataFrame(), []
+        predict_btn.click(
+            _run_prediction,
+            inputs=[file_in, text_in, sequence_length_input, stride_input],
+            outputs=[status_out, table_out, probs_out],
+        )
+    return demo
+# --------------------------------------------------------------------------------------
+# Launch helpers
+# --------------------------------------------------------------------------------------
+def find_free_port() -> int:
+    """Return a TCP port number the OS reports as free at the time of the call.
+
+    A socket is bound to port 0 so the OS assigns a port, then closed again.
+    """
+    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    with probe:
+        probe.bind(("", 0))
+        _host, port = probe.getsockname()
+    return port
+def choose_port() -> Optional[int]:
+    """Pick the server port.
+
+    ``GRADIO_SERVER_PORT`` and then ``PORT`` are consulted; the first one that
+    holds an integer wins.  Unset, empty or non-numeric values are skipped,
+    and a free port is requested from the OS as the last resort.
+    """
+    for name in ("GRADIO_SERVER_PORT", "PORT"):
+        raw = os.environ.get(name)
+        if not raw:
+            continue
+        try:
+            return int(raw)
+        except ValueError:
+            continue
+    return find_free_port()
+def main():
+    """Build the UI and start the server.
+
+    The app is launched on all interfaces at the chosen port; if that raises
+    ``OSError`` the launch is retried with Gradio's own defaults.
+    """
+    app = build_interface()
+    try:
+        selected_port = choose_port()
+        print(f"Launching Gradio app on port {selected_port}")
+        app.launch(server_name="0.0.0.0", server_port=selected_port)
+    except OSError as exc:
+        print("Failed to launch on requested port:", exc)
+        app.launch()
+if __name__ == "__main__":
+    main()

fault_classification_pmu.py ADDED Viewed

	@@ -0,0 +1,358 @@

+"""Fault classification training utilities for PMU data.
+This module trains a CNN-LSTM model on high-frequency PMU measurements to
+classify transmission line faults.  It implements a full training pipeline
+including preprocessing, sequence generation, model definition, evaluation,
+and artifact export so the resulting model can be served via the Gradio app
+in this repository or on Hugging Face Spaces.
+Example
+-------
+python fault_classification_pmu.py \
+    --data-path data/Fault_Classification_PMU_Data.csv \
+    --label-column FaultType \
+    --model-out pmu_cnn_lstm_model.keras \
+    --scaler-out pmu_feature_scaler.pkl \
+    --metadata-out pmu_metadata.json
+The script accepts CSV input where each row contains a timestamped PMU
+measurement and a categorical fault label.  Features default to the 15 PMU
+channels used in the project documentation, but any subset can be provided
+via the ``--feature-columns`` argument.  Data is automatically standardised
+and windowed to create temporal sequences that feed into the neural network.
+The exported metadata JSON file contains the feature ordering, label names,
+sequence length, and stride.  The Gradio front-end consumes this file to
+replicate the same preprocessing steps during inference.
+"""
+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+from typing import List, Sequence, Tuple
+import joblib
+import numpy as np
+import pandas as pd
+from sklearn.metrics import classification_report, confusion_matrix
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from tensorflow.keras import callbacks, layers, models, optimizers
+# Default PMU feature set as described in the user provided table.  Timestamp is
+# intentionally omitted because it is not a model input feature.
+# The same 15 names, in the same order, are listed in app.py as the fallback
+# used when no metadata file is available; keep the two lists in sync.
+DEFAULT_FEATURE_COLUMNS: List[str] = [
+    "[325] UPMU_SUB22:FREQ",
+    "[326] UPMU_SUB22:DFDT",
+    "[327] UPMU_SUB22:FLAG",
+    "[328] UPMU_SUB22-L1:MAG",
+    "[329] UPMU_SUB22-L1:ANG",
+    "[330] UPMU_SUB22-L2:MAG",
+    "[331] UPMU_SUB22-L2:ANG",
+    "[332] UPMU_SUB22-L3:MAG",
+    "[333] UPMU_SUB22-L3:ANG",
+    "[334] UPMU_SUB22-C1:MAG",
+    "[335] UPMU_SUB22-C1:ANG",
+    "[336] UPMU_SUB22-C2:MAG",
+    "[337] UPMU_SUB22-C2:ANG",
+    "[338] UPMU_SUB22-C3:MAG",
+    "[339] UPMU_SUB22-C3:ANG",
+]
+def _resolve_features(df: pd.DataFrame, feature_columns: Sequence[str] | None, label_column: str) -> List[str]:
+    """Decide which columns of ``df`` are used as model features, and in what order.
+
+    Parameters
+    ----------
+    df:
+        The loaded dataset.
+    feature_columns:
+        Explicit feature list.  When given it is validated and returned as is.
+    label_column:
+        Name of the target column; never selected as a feature automatically.
+
+    Returns
+    -------
+    list[str]
+        With no explicit list: the documented PMU columns present in ``df``
+        (in their documented order) followed by any other numeric column,
+        excluding the label and timestamp columns.
+
+    Raises
+    ------
+    ValueError
+        If an explicitly requested column is missing, or no feature column
+        can be detected.
+    """
+    if feature_columns:
+        missing = [c for c in feature_columns if c not in df.columns]
+        if missing:
+            raise ValueError(f"Feature columns not present in CSV: {missing}")
+        return list(feature_columns)
+    # Prefer the documented PMU ordering when the columns exist, falling back to
+    # any remaining numeric columns.
+    preferred = [c for c in DEFAULT_FEATURE_COLUMNS if c in df.columns]
+    already_taken = set(preferred)
+    # Compare case-insensitively so variants such as "TIMESTAMP" are skipped too.
+    excluded = {label_column.lower(), "timestamp"}
+    remainder = [
+        c
+        for c in df.columns
+        if c not in already_taken
+        and str(c).lower() not in excluded
+        # Text columns (ids, notes, ...) cannot be cast to float32 later on.
+        and pd.api.types.is_numeric_dtype(df[c])
+    ]
+    ordered = preferred + remainder
+    if not ordered:
+        raise ValueError("No feature columns detected. Specify --feature-columns explicitly.")
+    return ordered
+def load_dataset(
+    csv_path: Path,
+    *,
+    feature_columns: Sequence[str] | None,
+    label_column: str,
+) -> Tuple[np.ndarray, np.ndarray, List[str]]:
+    """Read the training CSV and split it into features and labels.
+
+    Parameters
+    ----------
+    csv_path:
+        CSV file with one PMU measurement per row.
+    feature_columns:
+        Explicit feature ordering, or ``None`` to auto-detect.
+    label_column:
+        Column holding the categorical fault label.
+
+    Returns
+    -------
+    features: np.ndarray
+        float32 array of shape (n_samples, n_features).
+    labels: np.ndarray
+        1-D array of labels converted to strings.
+    columns: list[str]
+        Feature ordering actually used.
+
+    Raises
+    ------
+    ValueError
+        If ``label_column`` is not a column of the CSV.
+    """
+    table = pd.read_csv(csv_path)
+    if label_column not in table.columns:
+        raise ValueError(f"Label column '{label_column}' not found in {csv_path}")
+    selected = _resolve_features(table, feature_columns, label_column)
+    feature_matrix = table[selected].astype(np.float32).values
+    label_vector = table[label_column].astype(str).values
+    return feature_matrix, label_vector, selected
+def create_sequences(
+    features: np.ndarray,
+    labels: np.ndarray,
+    *,
+    sequence_length: int,
+    stride: int,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Slice the rows into overlapping windows with one label per window.
+
+    Each window takes the label of its last row, matching fault detection use
+    cases where the most recent measurement dictates the system state.
+
+    Raises ``ValueError`` for a non-positive ``sequence_length`` or ``stride``,
+    for mismatched row counts, or when there are too few rows for one window.
+    """
+    if sequence_length <= 0:
+        raise ValueError("sequence_length must be > 0")
+    if stride <= 0:
+        raise ValueError("stride must be > 0")
+    n_rows = features.shape[0]
+    if n_rows != labels.shape[0]:
+        raise ValueError("Features and labels must contain the same number of rows")
+    if n_rows < sequence_length:
+        raise ValueError("Not enough samples to create a single sequence")
+    starts = range(0, n_rows - sequence_length + 1, stride)
+    windows = [features[begin : begin + sequence_length] for begin in starts]
+    window_labels = [labels[begin + sequence_length - 1] for begin in starts]
+    return np.stack(windows), np.array(window_labels)
+def build_cnn_lstm(
+    input_shape: Tuple[int, int],
+    num_classes: int,
+    *,
+    conv_filters: int = 128,
+    kernel_size: int = 3,
+    lstm_units: int = 128,
+    dropout: float = 0.3,
+) -> models.Model:
+    """Build and compile the CNN-LSTM classifier.
+
+    Two ``Conv1D`` + ``BatchNormalization`` stages (dilation 1, then 2) feed a
+    dropout layer, a single LSTM, another dropout layer and a softmax output
+    with ``num_classes`` units.  The model is compiled with Adam (1e-3),
+    sparse categorical cross-entropy and an accuracy metric.
+    """
+    inputs = layers.Input(shape=input_shape)
+    hidden = inputs
+    for dilation in (1, 2):
+        hidden = layers.Conv1D(
+            conv_filters,
+            kernel_size,
+            dilation_rate=dilation,
+            padding="same",
+            activation="relu",
+        )(hidden)
+        hidden = layers.BatchNormalization()(hidden)
+    hidden = layers.Dropout(dropout)(hidden)
+    hidden = layers.LSTM(lstm_units, return_sequences=False)(hidden)
+    hidden = layers.Dropout(dropout)(hidden)
+    outputs = layers.Dense(num_classes, activation="softmax")(hidden)
+    network = models.Model(inputs, outputs)
+    network.compile(
+        optimizer=optimizers.Adam(learning_rate=1e-3),
+        loss="sparse_categorical_crossentropy",
+        metrics=["accuracy"],
+    )
+    return network
+def train_model(
+    sequences: np.ndarray,
+    labels: np.ndarray,
+    *,
+    validation_split: float,
+    batch_size: int,
+    epochs: int,
+) -> Tuple[models.Model, LabelEncoder, dict]:
+    """Fit the CNN-LSTM on a stratified train/validation split.
+
+    Returns the trained model, the fitted label encoder and a metrics dict
+    with the Keras training history (``"history"``) and, under
+    ``"validation"``, the true labels, predicted labels and class names for
+    the validation split.
+    """
+    label_encoder = LabelEncoder()
+    encoded = label_encoder.fit_transform(labels)
+    split = train_test_split(
+        sequences, encoded, test_size=validation_split, stratify=encoded, random_state=42
+    )
+    X_train, X_val, y_train, y_val = split
+    n_classes = len(label_encoder.classes_)
+    model = build_cnn_lstm(input_shape=sequences.shape[1:], num_classes=n_classes)
+    # Halve the learning rate on plateaus and stop early, keeping the weights
+    # of the epoch with the best validation loss.
+    lr_schedule = callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, min_lr=1e-5)
+    early_stop = callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
+    history = model.fit(
+        X_train,
+        y_train,
+        validation_data=(X_val, y_val),
+        epochs=epochs,
+        batch_size=batch_size,
+        callbacks=[lr_schedule, early_stop],
+        verbose=2,
+    )
+    val_predictions = model.predict(X_val, verbose=0).argmax(axis=1)
+    validation = {
+        "y_true": y_val,
+        "y_pred": val_predictions,
+        "class_names": label_encoder.classes_.tolist(),
+    }
+    return model, label_encoder, {"history": history.history, "validation": validation}
+def standardise_sequences(sequences: np.ndarray) -> Tuple[np.ndarray, StandardScaler]:
+    """Standard-scale every feature using statistics over all windows and timesteps.
+
+    Returns the scaled array (same shape as the input) and the fitted scaler.
+    """
+    original_shape = sequences.shape
+    fitted = StandardScaler()
+    as_rows = sequences.reshape(-1, original_shape[-1])
+    scaled_rows = fitted.fit_transform(as_rows)
+    return scaled_rows.reshape(original_shape), fitted
+def export_artifacts(
+    *,
+    model: models.Model,
+    scaler: StandardScaler,
+    label_encoder: LabelEncoder,
+    feature_columns: Sequence[str],
+    label_column: str,
+    sequence_length: int,
+    stride: int,
+    model_path: Path,
+    scaler_path: Path,
+    metadata_path: Path,
+    metrics: dict,
+) -> None:
+    """Persist the trained model, scaler and metadata JSON for deployment.
+
+    Parent directories are created as needed.  The metadata records the
+    feature order, label names, windowing parameters, artifact paths, training
+    history, a text classification report and the confusion matrix.
+
+    ``metrics`` is the dict returned by ``train_model``; a precomputed
+    ``metrics["validation"]["confusion_matrix"]`` is used when present.
+    """
+    model_path.parent.mkdir(parents=True, exist_ok=True)
+    scaler_path.parent.mkdir(parents=True, exist_ok=True)
+    metadata_path.parent.mkdir(parents=True, exist_ok=True)
+    model.save(model_path)
+    joblib.dump(scaler, scaler_path)
+    validation = metrics["validation"]
+    y_true = validation["y_true"]
+    y_pred = validation["y_pred"]
+    class_names = [str(name) for name in label_encoder.classes_]
+    # Encoded labels are 0..n-1.  Passing them explicitly keeps the report and
+    # matrix aligned with ``class_names`` even if a class is absent from the
+    # validation split.
+    class_ids = list(range(len(class_names)))
+    matrix = validation.get("confusion_matrix")
+    if matrix is None:
+        matrix = confusion_matrix(y_true, y_pred, labels=class_ids)
+    # History entries can be NumPy scalars (for example a logged learning
+    # rate), which ``json.dumps`` rejects; store plain floats instead.
+    history = {
+        name: [float(value) for value in values]
+        for name, values in metrics["history"].items()
+    }
+    metadata = {
+        "feature_columns": list(feature_columns),
+        "label_classes": label_encoder.classes_.tolist(),
+        "label_column": label_column,
+        "sequence_length": sequence_length,
+        "stride": stride,
+        "model_path": str(model_path),
+        "scaler_path": str(scaler_path),
+        "training_history": history,
+        "classification_report": classification_report(
+            y_true, y_pred, labels=class_ids, target_names=class_names
+        ),
+        "confusion_matrix": np.asarray(matrix).tolist(),
+    }
+    metadata_path.write_text(json.dumps(metadata, indent=2))
+def run_training(args: argparse.Namespace) -> None:
+    """Run the full pipeline: load, window, scale, train, export and report."""
+    csv_path = Path(args.data_path)
+    model_out, scaler_out, metadata_out = (
+        Path(args.model_out),
+        Path(args.scaler_out),
+        Path(args.metadata_out),
+    )
+    features, labels, feature_columns = load_dataset(
+        csv_path, feature_columns=args.feature_columns, label_column=args.label_column
+    )
+    windows, window_labels = create_sequences(
+        features, labels, sequence_length=args.sequence_length, stride=args.stride
+    )
+    windows, scaler = standardise_sequences(windows)
+    model, label_encoder, metrics = train_model(
+        windows,
+        window_labels,
+        validation_split=args.validation_split,
+        batch_size=args.batch_size,
+        epochs=args.epochs,
+    )
+    export_artifacts(
+        model=model,
+        scaler=scaler,
+        label_encoder=label_encoder,
+        feature_columns=feature_columns,
+        label_column=args.label_column,
+        sequence_length=args.sequence_length,
+        stride=args.stride,
+        model_path=model_out,
+        scaler_path=scaler_out,
+        metadata_path=metadata_out,
+        metrics=metrics,
+    )
+    print("Training complete")
+    print(f"Model saved to       : {model_out}")
+    print(f"Scaler saved to      : {scaler_out}")
+    print(f"Metadata saved to    : {metadata_out}")
+    print("Validation metrics:")
+    validation = metrics["validation"]
+    print(
+        classification_report(
+            validation["y_true"], validation["y_pred"], target_names=validation["class_names"]
+        )
+    )
+def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
+    """Parse the training script's command-line options.
+
+    ``argv`` defaults to ``sys.argv[1:]`` when ``None`` (argparse behaviour).
+    Only ``--data-path`` is required; every other option has a default.
+    """
+    cli = argparse.ArgumentParser(description="Train a CNN-LSTM model for PMU fault classification")
+    add = cli.add_argument
+    add("--data-path", required=True, help="Path to Fault_Classification_PMU_Data CSV")
+    add("--label-column", default="Fault", help="Name of the target label column (default: Fault)")
+    add(
+        "--feature-columns",
+        nargs="*",
+        default=None,
+        help="Optional explicit list of feature columns. Defaults to all non-label columns",
+    )
+    add("--sequence-length", type=int, default=32, help="Number of timesteps per training window")
+    add("--stride", type=int, default=4, help="Step size between consecutive windows")
+    add("--validation-split", type=float, default=0.2, help="Validation set fraction")
+    add("--batch-size", type=int, default=128, help="Training batch size")
+    add("--epochs", type=int, default=50, help="Maximum number of training epochs")
+    add("--model-out", default="pmu_cnn_lstm_model.keras", help="Path to save trained Keras model")
+    add("--scaler-out", default="pmu_feature_scaler.pkl", help="Path to save fitted StandardScaler")
+    add("--metadata-out", default="pmu_metadata.json", help="Path to save metadata JSON")
+    return cli.parse_args(argv)
+def main(argv: Sequence[str] | None = None) -> None:
+    """Command-line entry point: parse ``argv`` and run the training pipeline."""
+    run_training(parse_args(argv))
+if __name__ == "__main__":
+    main()

requirements.txt CHANGED Viewed

@@ -5,3 +5,4 @@ pandas
 scikit-learn
 huggingface_hub
 matplotlib

 scikit-learn
 huggingface_hub
 matplotlib
+joblib