"""Fault classification training utilities for PMU and PV datasets. This module trains deep learning models on high-frequency PMU measurements and supports classical machine learning baselines so the resulting artefacts can be served via the Gradio app in this repository or on Hugging Face Spaces. It implements a full training pipeline including preprocessing, sequence generation, model definition (CNN-LSTM, Temporal Convolutional Network, or Support Vector Machine), evaluation, and export of deployment metadata. Example ------- python fault_classification_pmu.py \ --data-path data/Fault_Classification_PMU_Data.csv \ --label-column FaultType \ --model-type tcn \ --model-out pmu_tcn_model.keras \ --scaler-out pmu_feature_scaler.pkl \ --metadata-out pmu_metadata.json The script accepts CSV input where each row contains a timestamped PMU measurement and a categorical fault label. Features default to the 14 PMU channels used in the project documentation, but any subset can be provided via the ``--feature-columns`` argument. Data is automatically standardised and windowed to create temporal sequences that feed into the neural network. The exported metadata JSON file contains the feature ordering, label names, sequence length, stride, and chosen architecture. The Gradio front-end consumes this file to replicate the same preprocessing steps during inference. """ from __future__ import annotations import argparse import json import os import shutil from datetime import datetime from pathlib import Path from typing import Dict, List, Optional, Sequence, Tuple import math os.environ.setdefault("CUDA_VISIBLE_DEVICES", "-1") os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2") os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0") import joblib import numpy as np import pandas as pd from pandas.api.types import is_numeric_dtype from sklearn.metrics import accuracy_score, classification_report, confusion_matrix from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, StandardScaler from sklearn.svm import SVC from tensorflow.keras import callbacks, layers, models, optimizers class ProgressCallback(callbacks.Callback): """Custom callback to provide training progress updates.""" def __init__( self, total_epochs, status_file_path=None, *, status_update_interval: float = 10.0, batch_log_frequency: int = 10, ): super().__init__() self.total_epochs = total_epochs self.status_file_path = status_file_path self.status_update_interval = max(1.0, float(status_update_interval)) self.batch_log_frequency = max(1, int(batch_log_frequency)) self.current_epoch = 0 self.train_start_time: Optional[float] = None self.last_status_report: Optional[float] = None self.total_batches_per_epoch = 0 self.batches_seen = 0 # ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ def _now(self) -> float: import time return time.perf_counter() def _training_elapsed(self, now: Optional[float] = None) -> float: if self.train_start_time is None: return 0.0 if now is None: now = self._now() return max(0.0, now - self.train_start_time) def _report_status(self, message: str, *, force: bool = False) -> None: now = self._now() if not force and self.last_status_report is not None: if now - self.last_status_report < self.status_update_interval: return print(message, flush=True) if self.status_file_path: try: with open(self.status_file_path, "w") as f: f.write(message) except Exception: # Silently ignore status file failures; progress should still stream to stdout pass self.last_status_report = now # ------------------------------------------------------------------ # Keras callback overrides # ------------------------------------------------------------------ def on_train_begin(self, logs=None): params = self.params or {} steps = params.get("steps") or params.get("steps_per_epoch") if steps: self.total_batches_per_epoch = int(steps) else: samples = params.get("samples") batch_size = params.get("batch_size") or 0 if samples and batch_size: self.total_batches_per_epoch = math.ceil(samples / batch_size) else: self.total_batches_per_epoch = 0 self.batches_seen = 0 self.last_status_report = None self.train_start_time = self._now() def on_epoch_begin(self, epoch, logs=None): import time now = self._now() if self.train_start_time is None: self.train_start_time = now self.current_epoch = epoch + 1 self.batches_seen = 0 progress_pct = (self.current_epoch / self.total_epochs) * 100 elapsed_time = self._training_elapsed(now) status_msg = ( f"Training epoch {self.current_epoch}/{self.total_epochs} " f"({progress_pct:.1f}%) - {elapsed_time:.1f}s elapsed" ) self._report_status(status_msg, force=True) if self.current_epoch == 1: wall_clock = time.strftime("%H:%M:%S") print(f"Starting first epoch at {wall_clock}", flush=True) def on_batch_begin(self, batch, logs=None): if self.current_epoch == 1 and batch % self.batch_log_frequency == 0: elapsed = self._training_elapsed() print(f"Epoch {self.current_epoch}, Batch {batch} started - {elapsed:.1f}s elapsed", flush=True) def on_batch_end(self, batch, logs=None): self.batches_seen = batch + 1 if self.current_epoch == 1 and batch % self.batch_log_frequency == 0: logs = logs or {} loss = logs.get("loss", 0) elapsed = self._training_elapsed() print( f"Epoch {self.current_epoch}, Batch {batch} completed - Loss: {loss:.4f}, {elapsed:.1f}s elapsed", flush=True, ) total_batches = self.total_batches_per_epoch or 0 if not total_batches: params = self.params or {} total_batches = ( params.get("steps") or params.get("steps_per_epoch") or 0 ) if total_batches: epoch_fraction = min(1.0, (batch + 1) / total_batches) else: epoch_fraction = 0.0 overall_progress = ( (self.current_epoch - 1 + epoch_fraction) / self.total_epochs * 100 ) elapsed_time = self._training_elapsed() status_msg = ( f"Epoch {self.current_epoch}/{self.total_epochs} - Batch {batch + 1}/{total_batches or '?'} " f"({overall_progress:.1f}%) - {elapsed_time:.1f}s elapsed" ) self._report_status(status_msg) def on_epoch_end(self, epoch, logs=None): logs = logs or {} loss = logs.get("loss", 0) val_loss = logs.get("val_loss", 0) accuracy = logs.get("accuracy", logs.get("acc", 0)) val_accuracy = logs.get("val_accuracy", logs.get("val_acc", 0)) _ = epoch # Suppress unused variable warning elapsed_time = self._training_elapsed() status_msg = ( f"Epoch {self.current_epoch}/{self.total_epochs} completed - " f"Loss: {loss:.4f}, Val Loss: {val_loss:.4f}, " f"Acc: {accuracy:.4f}, Val Acc: {val_accuracy:.4f} - {elapsed_time:.1f}s total" ) self._report_status(status_msg, force=True) def on_train_end(self, logs=None): total_elapsed = self._training_elapsed() final_message = ( f"Training finished after {self.total_epochs} epoch(s) - " f"{total_elapsed:.1f}s total elapsed" ) self._report_status(final_message, force=True) # Default PMU feature set as described in the user provided table. Timestamp is # intentionally omitted because it is not a model input feature. DEFAULT_FEATURE_COLUMNS: List[str] = [ "[325] UPMU_SUB22:FREQ", "[326] UPMU_SUB22:DFDT", "[327] UPMU_SUB22:FLAG", "[328] UPMU_SUB22-L1:MAG", "[329] UPMU_SUB22-L1:ANG", "[330] UPMU_SUB22-L2:MAG", "[331] UPMU_SUB22-L2:ANG", "[332] UPMU_SUB22-L3:MAG", "[333] UPMU_SUB22-L3:ANG", "[334] UPMU_SUB22-C1:MAG", "[335] UPMU_SUB22-C1:ANG", "[336] UPMU_SUB22-C2:MAG", "[337] UPMU_SUB22-C2:ANG", "[338] UPMU_SUB22-C3:MAG", "[339] UPMU_SUB22-C3:ANG", ] LABEL_GUESS_CANDIDATES: Tuple[str, ...] = ("Fault", "FaultType", "Label", "Target", "Class") def _normalise_column_name(name: str) -> str: return str(name).strip().lower() def _resolve_label_column(df: pd.DataFrame, requested: str) -> str: columns = [str(col) for col in df.columns] if not columns: raise ValueError("Provided dataframe does not contain any columns.") requested = str(requested or "").strip() if requested and requested in df.columns: return requested if requested: for col in df.columns: if str(col).strip() == requested: return str(col) lowered = requested.lower() lowered_map = {_normalise_column_name(col): str(col) for col in df.columns} if lowered in lowered_map: return lowered_map[lowered] lowered_map = {_normalise_column_name(col): str(col) for col in df.columns} for guess in LABEL_GUESS_CANDIDATES: key = guess.lower() if key in lowered_map: return lowered_map[key] for col in reversed(df.columns): if not is_numeric_dtype(df[col]): return str(col) available = ", ".join(columns) raise ValueError( f"Label column '{requested or ' '}' not found in provided dataframe. " f"Available columns: {available}" ) def _resolve_features(df: pd.DataFrame, feature_columns: Sequence[str] | None, label_column: str) -> List[str]: if feature_columns: missing = [c for c in feature_columns if c not in df.columns] if missing: raise ValueError(f"Feature columns not present in CSV: {missing}") return list(feature_columns) # Prefer the documented PMU ordering when the columns exist, falling back to # any remaining numeric columns. preferred = [c for c in DEFAULT_FEATURE_COLUMNS if c in df.columns] excluded = {label_column, label_column.lower(), "timestamp", "Timestamp"} remainder = [c for c in df.columns if c not in preferred and c not in excluded] ordered = preferred + remainder if not ordered: raise ValueError("No feature columns detected. Specify --feature-columns explicitly.") return ordered def load_dataset( csv_path: Path, *, feature_columns: Sequence[str] | None, label_column: str, ) -> Tuple[np.ndarray, np.ndarray, List[str], str]: """Load the dataset from CSV. Parameters ---------- csv_path: Path to the CSV file containing PMU measurements. feature_columns: Optional explicit ordering of feature columns. label_column: Name of the column containing the categorical fault label. Returns ------- features: np.ndarray 2-D array of shape (n_samples, n_features). labels: np.ndarray 1-D array of label strings. columns: list[str] Actual feature ordering used. resolved_label: str The column name that supplied the labels. """ df = pd.read_csv(csv_path, sep=None, engine="python") resolved_label = _resolve_label_column(df, label_column) columns = _resolve_features(df, feature_columns, resolved_label) features = df[columns].astype(np.float32).values labels = df[resolved_label].astype(str).values return features, labels, columns, resolved_label def load_dataset_from_dataframe( df: pd.DataFrame, *, feature_columns: Sequence[str] | None, label_column: str, ) -> Tuple[np.ndarray, np.ndarray, List[str], str]: """Load dataset arrays directly from a DataFrame.""" resolved_label = _resolve_label_column(df, label_column) columns = _resolve_features(df, feature_columns, resolved_label) features = df[columns].astype(np.float32).values labels = df[resolved_label].astype(str).values return features, labels, columns, resolved_label def create_sequences( features: np.ndarray, labels: np.ndarray, *, sequence_length: int, stride: int, ) -> Tuple[np.ndarray, np.ndarray]: """Create overlapping sequences suitable for sequence models. The label assigned to a sequence corresponds to the label of the final timestep in the window. This choice aligns with fault detection use cases where the most recent measurement dictates the state of the system. """ if sequence_length <= 0: raise ValueError("sequence_length must be > 0") if stride <= 0: raise ValueError("stride must be > 0") if features.shape[0] != labels.shape[0]: raise ValueError("Features and labels must contain the same number of rows") if features.shape[0] < sequence_length: raise ValueError("Not enough samples to create a single sequence") sequences: List[np.ndarray] = [] seq_labels: List[str] = [] for start in range(0, features.shape[0] - sequence_length + 1, stride): end = start + sequence_length sequences.append(features[start:end]) seq_labels.append(labels[end - 1]) return np.stack(sequences), np.array(seq_labels) def build_cnn_lstm( input_shape: Tuple[int, int], num_classes: int, *, conv_filters: int = 128, kernel_size: int = 3, lstm_units: int = 128, dropout: float = 0.3, ) -> models.Model: """Construct a compact yet expressive CNN-LSTM architecture.""" inputs = layers.Input(shape=input_shape) x = layers.Conv1D(conv_filters, kernel_size, padding="same", activation="relu")(inputs) x = layers.BatchNormalization()(x) x = layers.Conv1D(conv_filters, kernel_size, dilation_rate=2, padding="same", activation="relu")(x) x = layers.BatchNormalization()(x) x = layers.Dropout(dropout)(x) x = layers.LSTM(lstm_units, return_sequences=False)(x) x = layers.Dropout(dropout)(x) outputs = layers.Dense(num_classes, activation="softmax")(x) model = models.Model(inputs, outputs) model.compile( optimizer=optimizers.Adam(learning_rate=1e-3), loss="sparse_categorical_crossentropy", metrics=["accuracy"], ) return model def build_tcn( input_shape: Tuple[int, int], num_classes: int, *, filters: int = 64, kernel_size: int = 3, dilations: Sequence[int] = (1, 2, 4, 8), dropout: float = 0.2, ) -> models.Model: """Construct a lightweight Temporal Convolutional Network.""" inputs = layers.Input(shape=input_shape) x = inputs for dilation in dilations: residual = x x = layers.Conv1D( filters, kernel_size, padding="causal", activation="relu", dilation_rate=dilation, )(x) x = layers.BatchNormalization()(x) x = layers.Dropout(dropout)(x) x = layers.Conv1D( filters, kernel_size, padding="causal", activation="relu", dilation_rate=dilation, )(x) x = layers.BatchNormalization()(x) if residual.shape[-1] != filters: residual = layers.Conv1D(filters, 1, padding="same")(residual) x = layers.Add()([x, residual]) x = layers.Activation("relu")(x) x = layers.GlobalAveragePooling1D()(x) x = layers.Dropout(dropout)(x) outputs = layers.Dense(num_classes, activation="softmax")(x) model = models.Model(inputs, outputs) model.compile( optimizer=optimizers.Adam(learning_rate=1e-3), loss="sparse_categorical_crossentropy", metrics=["accuracy"], ) return model def train_model( sequences: np.ndarray, labels: np.ndarray, *, validation_split: float, batch_size: int, epochs: int, model_type: str = "cnn_lstm", tensorboard_log_dir: Optional[Path] = None, status_file_path: Optional[Path] = None, ) -> Tuple[object, LabelEncoder, Dict[str, object]]: """Train a sequence model and return training history and validation outputs.""" model_type = model_type.lower().strip() if model_type not in {"cnn_lstm", "tcn", "svm"}: raise ValueError("model_type must be either 'cnn_lstm', 'tcn', or 'svm'") # Handle status file for progress tracking status_file = status_file_path if status_file_path else None label_encoder = LabelEncoder() y = label_encoder.fit_transform(labels) if model_type == "svm": features = sequences.reshape(sequences.shape[0], -1) else: features = sequences tb_dir: Optional[str] = None if model_type != "svm" and tensorboard_log_dir is not None: tensorboard_log_dir.mkdir(parents=True, exist_ok=True) tb_dir = str(tensorboard_log_dir.resolve()) else: tensorboard_log_dir = None # Check if we can use stratification (each class needs at least 2 samples) unique_labels, label_counts = np.unique(y, return_counts=True) min_samples_per_class = np.min(label_counts) print(f"Label distribution: {dict(zip(unique_labels, label_counts))}") print(f"Minimum samples per class: {min_samples_per_class}") print(f"Total sequences: {len(sequences)}, Features per sequence: {sequences.shape[1:]}") # Check for potential memory issues import sys data_size_mb = sequences.nbytes / (1024 * 1024) print(f"Data size: {data_size_mb:.2f} MB") if data_size_mb > 1000: # > 1GB print("Warning: Large dataset detected. Consider reducing batch size or sequence length.") # Validate data ranges if np.any(np.isnan(sequences)) or np.any(np.isinf(sequences)): print("Warning: NaN or Inf values detected in sequences") sequences = np.nan_to_num(sequences, nan=0.0, posinf=1e6, neginf=-1e6) # Use stratification only if each class has at least 2 samples if min_samples_per_class >= 2: X_train, X_val, y_train, y_val = train_test_split( features, y, test_size=validation_split, stratify=y, random_state=42 ) else: print(f"Warning: Some classes have only {min_samples_per_class} sample(s). Using simple random split instead of stratified split.") # If validation split would result in empty validation set for some classes, # reduce validation split or use a minimum number of samples total_samples = len(y) if validation_split * total_samples < len(unique_labels): # Ensure at least one sample per class in validation if possible adjusted_split = max(0.1, len(unique_labels) / total_samples) adjusted_split = min(adjusted_split, 0.3) # Cap at 30% print(f"Adjusting validation split from {validation_split} to {adjusted_split}") validation_split = adjusted_split X_train, X_val, y_train, y_val = train_test_split( features, y, test_size=validation_split, random_state=42 ) if model_type == "cnn_lstm": print("Building CNN-LSTM model...") # Optimize model for large datasets if len(sequences) > 100000: print("Using lightweight CNN-LSTM for large dataset") model = build_cnn_lstm( input_shape=sequences.shape[1:], num_classes=len(label_encoder.classes_), conv_filters=64, # Reduce from 128 lstm_units=64, # Reduce from 128 dropout=0.2 # Reduce dropout ) else: model = build_cnn_lstm( input_shape=sequences.shape[1:], num_classes=len(label_encoder.classes_) ) print(f"CNN-LSTM model built. Input shape: {sequences.shape[1:]}, Classes: {len(label_encoder.classes_)}") print(f"Model parameters: {model.count_params():,}") # Adjust callbacks for dataset size if len(sequences) > 100000: callbacks_list = [ ProgressCallback(total_epochs=epochs, status_file_path=str(status_file) if status_file else None), callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-5), callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True), # More aggressive ] print("Using aggressive callbacks for large dataset") else: callbacks_list = [ ProgressCallback(total_epochs=epochs, status_file_path=str(status_file) if status_file else None), callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, min_lr=1e-5), callbacks.EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True), ] if tensorboard_log_dir is not None: callbacks_list.insert(-2, callbacks.TensorBoard(log_dir=tb_dir, histogram_freq=0, write_graph=False)) # Reduce TensorBoard overhead print(f"Starting CNN-LSTM training with {len(X_train)} training samples, {len(X_val)} validation samples") print(f"Batch size: {batch_size}, Epochs: {epochs}") if status_file: with open(status_file, 'w') as f: f.write(f"CNN-LSTM training started - {len(X_train)} train, {len(X_val)} val samples, batch_size={batch_size}") history = model.fit( X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size, callbacks=callbacks_list, verbose=2, ) print("CNN-LSTM training completed, starting prediction...") if status_file: with open(status_file, 'w') as f: f.write("CNN-LSTM training completed, evaluating model...") print(f"Making predictions on {len(X_val)} validation samples...") if status_file: with open(status_file, 'w') as f: f.write(f"Making predictions on {len(X_val)} validation samples...") y_pred = model.predict(X_val, verbose=0).argmax(axis=1) print("Predictions completed") training_history: Dict[str, object] = history.history elif model_type == "tcn": print("Building TCN model...") model = build_tcn(input_shape=sequences.shape[1:], num_classes=len(label_encoder.classes_)) print(f"TCN model built. Input shape: {sequences.shape[1:]}, Classes: {len(label_encoder.classes_)}") callbacks_list = [ ProgressCallback(total_epochs=epochs, status_file_path=str(status_file) if status_file else None), callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, min_lr=1e-5), callbacks.EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True), ] if tensorboard_log_dir is not None: callbacks_list.insert(-2, callbacks.TensorBoard(log_dir=tb_dir, histogram_freq=0, write_graph=False)) # Reduce TensorBoard overhead print(f"Starting TCN training with {len(X_train)} training samples, {len(X_val)} validation samples") print(f"Batch size: {batch_size}, Epochs: {epochs}") if status_file: with open(status_file, 'w') as f: f.write(f"TCN training started - {len(X_train)} train, {len(X_val)} val samples, batch_size={batch_size}") history = model.fit( X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size, callbacks=callbacks_list, verbose=2, ) print("TCN training completed, starting prediction...") if status_file: with open(status_file, 'w') as f: f.write("TCN training completed, evaluating model...") print(f"Making TCN predictions on {len(X_val)} validation samples...") if status_file: with open(status_file, 'w') as f: f.write(f"Making TCN predictions on {len(X_val)} validation samples...") y_pred = model.predict(X_val, verbose=0).argmax(axis=1) print("TCN predictions completed") training_history = history.history else: # svm print("Training SVM model...", flush=True) if status_file: with open(status_file, 'w') as f: f.write("Training SVM model...") model = SVC(kernel="rbf", probability=True, class_weight="balanced") model.fit(X_train, y_train) print("SVM training completed. Evaluating...", flush=True) if status_file: with open(status_file, 'w') as f: f.write("SVM training completed. Evaluating...") y_pred = model.predict(X_val) training_history = { "train_accuracy": float(model.score(X_train, y_train)), "val_accuracy": float(accuracy_score(y_val, y_pred)), } cm = confusion_matrix(y_val, y_pred) metrics: Dict[str, object] = { "history": training_history, "validation": { "y_true": y_val, "y_pred": y_pred, "class_names": label_encoder.classes_.tolist(), "confusion_matrix": cm, }, "model_type": model_type, "input_shape": list(sequences.shape[1:]), "tensorboard_log_dir": tb_dir, } return model, label_encoder, metrics def standardise_sequences(sequences: np.ndarray) -> Tuple[np.ndarray, StandardScaler]: """Apply standard scaling per feature across all timesteps.""" scaler = StandardScaler() flattened = sequences.reshape(-1, sequences.shape[-1]) scaled = scaler.fit_transform(flattened) return scaled.reshape(sequences.shape), scaler def export_artifacts( *, model: object, scaler: StandardScaler, label_encoder: LabelEncoder, feature_columns: Sequence[str], label_column: str, sequence_length: int, stride: int, model_path: Path, scaler_path: Path, metadata_path: Path, metrics: dict, ) -> None: """Persist trained assets to disk for deployment.""" model_path.parent.mkdir(parents=True, exist_ok=True) scaler_path.parent.mkdir(parents=True, exist_ok=True) metadata_path.parent.mkdir(parents=True, exist_ok=True) model_type = str(metrics.get("model_type", "cnn_lstm")) if model_type == "svm": joblib.dump(model, model_path) else: model.save(model_path) joblib.dump(scaler, scaler_path) validation = metrics["validation"] report_dict = classification_report( validation["y_true"], validation["y_pred"], target_names=label_encoder.classes_, output_dict=True, ) metadata = { "feature_columns": list(feature_columns), "label_classes": label_encoder.classes_.tolist(), "label_column": label_column, "sequence_length": sequence_length, "stride": stride, "model_path": str(model_path), "scaler_path": str(scaler_path), "training_history": metrics["history"], "classification_report": report_dict, "model_type": model_type, "model_format": "joblib" if model_type == "svm" else "keras", "input_shape": metrics.get("input_shape"), "tensorboard_log_dir": metrics.get("tensorboard_log_dir"), } confusion = validation.get("confusion_matrix") if confusion is None: confusion = confusion_matrix(validation["y_true"], validation["y_pred"]) metadata["confusion_matrix"] = np.asarray(confusion).tolist() metadata_path.write_text(json.dumps(metadata, indent=2)) def train_from_dataframe( df: pd.DataFrame, *, label_column: str, feature_columns: Sequence[str] | None = None, sequence_length: int = 32, stride: int = 4, validation_split: float = 0.2, batch_size: int = 128, epochs: int = 50, model_type: str = "cnn_lstm", model_path: Path | str = "pmu_cnn_lstm_model.keras", scaler_path: Path | str = "pmu_feature_scaler.pkl", metadata_path: Path | str = "pmu_metadata.json", enable_tensorboard: bool = True, tensorboard_root: Path | str | None = None, ) -> dict: """Train a PMU fault classification model using an in-memory dataframe.""" model_path = Path(model_path) scaler_path = Path(scaler_path) metadata_path = Path(metadata_path) # Create status file for progress tracking status_file = model_path.parent / "training_status.txt" print(f"Training progress will be written to: {status_file}") tensorboard_log_dir: Optional[Path] = None if enable_tensorboard and model_type.lower() != "svm": base_dir = Path(tensorboard_root) if tensorboard_root is not None else Path("tensorboard_runs") timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S") tensorboard_log_dir = base_dir / f"run-{timestamp}" features, labels, used_columns, resolved_label = load_dataset_from_dataframe( df, feature_columns=feature_columns, label_column=label_column ) print(f"Input data: {len(features)} samples") print(f"Creating sequences with length={sequence_length}, stride={stride}") sequences, seq_labels = create_sequences( features, labels, sequence_length=sequence_length, stride=stride, ) print(f"Generated {len(sequences)} sequences") # Validate sequence count and adjust parameters if necessary if len(sequences) < 10: raise ValueError( f"Only {len(sequences)} sequences generated. Need at least 10 for training. " f"Try reducing sequence_length (currently {sequence_length}) or stride (currently {stride}), " "or provide more data." ) # If very few sequences, recommend SVM instead of deep learning if len(sequences) < 100 and model_type in ['cnn_lstm', 'tcn']: print(f"Warning: Only {len(sequences)} sequences available. Consider using SVM for small datasets.") sequences, scaler = standardise_sequences(sequences) # Adjust training parameters based on data size original_batch_size = batch_size original_epochs = epochs original_validation_split = validation_split # Handle large datasets (>100K sequences) - optimize for memory and speed if len(sequences) > 100000: print(f"Large dataset detected ({len(sequences)} sequences). Optimizing parameters...") batch_size = min(batch_size * 2, 512) # Increase batch size for efficiency epochs = min(epochs, 30) # Reduce epochs for large datasets print(f"Adjusted parameters for large dataset:") print(f" Batch size: {original_batch_size} -> {batch_size}") print(f" Epochs: {original_epochs} -> {epochs}") # Force garbage collection import gc gc.collect() elif len(sequences) < 100: # For very small datasets batch_size = max(min(batch_size, len(sequences) // 4), 4) # Ensure batch_size >= 4 epochs = min(epochs, 20) # Reduce epochs to prevent overfitting validation_split = min(validation_split, 0.3) # Reduce validation split print(f"Adjusted parameters for small dataset:") print(f" Batch size: {original_batch_size} -> {batch_size}") print(f" Epochs: {original_epochs} -> {epochs}") print(f" Validation split: {original_validation_split} -> {validation_split}") model, label_encoder, metrics = train_model( sequences, seq_labels, validation_split=validation_split, batch_size=batch_size, epochs=epochs, model_type=model_type, tensorboard_log_dir=tensorboard_log_dir, status_file_path=status_file, ) export_artifacts( model=model, scaler=scaler, label_encoder=label_encoder, feature_columns=used_columns, label_column=resolved_label, sequence_length=sequence_length, stride=stride, model_path=model_path, scaler_path=scaler_path, metadata_path=metadata_path, metrics=metrics, ) tensorboard_zip_path: Optional[str] = None if tensorboard_log_dir and tensorboard_log_dir.exists(): try: tensorboard_zip_path = shutil.make_archive( base_name=str(tensorboard_log_dir.parent / tensorboard_log_dir.name), format="zip", root_dir=str(tensorboard_log_dir.parent), base_dir=tensorboard_log_dir.name, ) tensorboard_zip_path = str(Path(tensorboard_zip_path).resolve()) except Exception: tensorboard_zip_path = None report_dict = classification_report( metrics["validation"]["y_true"], metrics["validation"]["y_pred"], target_names=metrics["validation"]["class_names"], output_dict=True, ) confusion = metrics["validation"].get("confusion_matrix") if confusion is None: confusion = confusion_matrix(metrics["validation"]["y_true"], metrics["validation"]["y_pred"]) return { "num_samples": int(df.shape[0]), "num_sequences": int(sequences.shape[0]), "feature_columns": used_columns, "class_names": label_encoder.classes_.tolist(), "model_path": str(model_path.resolve()), "scaler_path": str(scaler_path.resolve()), "metadata_path": str(metadata_path.resolve()), "history": metrics["history"], "model_type": metrics.get("model_type", model_type), "classification_report": report_dict, "confusion_matrix": np.asarray(confusion).tolist(), "tensorboard_log_dir": metrics.get("tensorboard_log_dir"), "tensorboard_zip_path": tensorboard_zip_path, "label_column": resolved_label, } def run_training(args: argparse.Namespace) -> None: csv_path = Path(args.data_path) model_out = Path(args.model_out) scaler_out = Path(args.scaler_out) metadata_out = Path(args.metadata_out) features, labels, feature_columns, resolved_label = load_dataset( csv_path, feature_columns=args.feature_columns, label_column=args.label_column ) sequences, seq_labels = create_sequences( features, labels, sequence_length=args.sequence_length, stride=args.stride, ) sequences, scaler = standardise_sequences(sequences) tensorboard_log_dir: Optional[Path] = None if args.tensorboard and args.model_type != "svm": if args.tensorboard_log_dir: tensorboard_log_dir = Path(args.tensorboard_log_dir) else: tensorboard_log_dir = Path("tensorboard_runs") / datetime.utcnow().strftime("%Y%m%d-%H%M%S") model, label_encoder, metrics = train_model( sequences, seq_labels, validation_split=args.validation_split, batch_size=args.batch_size, epochs=args.epochs, model_type=args.model_type, tensorboard_log_dir=tensorboard_log_dir, status_file_path=None, # No status file for CLI usage ) export_artifacts( model=model, scaler=scaler, label_encoder=label_encoder, feature_columns=feature_columns, label_column=resolved_label, sequence_length=args.sequence_length, stride=args.stride, model_path=model_out, scaler_path=scaler_out, metadata_path=metadata_out, metrics=metrics, ) print("Training complete") print(f"Model architecture : {args.model_type}") print(f"Model saved to : {model_out}") print(f"Scaler saved to : {scaler_out}") print(f"Metadata saved to : {metadata_out}") print("Validation metrics:") report = classification_report( metrics["validation"]["y_true"], metrics["validation"]["y_pred"], target_names=metrics["validation"]["class_names"] ) print(report) if metrics.get("tensorboard_log_dir"): tb_dir = metrics["tensorboard_log_dir"] print(f"TensorBoard logs written to: {tb_dir}") print(f"Launch TensorBoard with: tensorboard --logdir \"{tb_dir}\"") def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser(description="Train a sequence model for PMU fault classification") parser.add_argument("--data-path", required=True, help="Path to Fault_Classification_PMU_Data CSV") parser.add_argument( "--label-column", default="Fault", help="Name of the target label column (default: Fault)", ) parser.add_argument( "--feature-columns", nargs="*", default=None, help="Optional explicit list of feature columns. Defaults to all non-label columns", ) parser.add_argument("--sequence-length", type=int, default=32, help="Number of timesteps per training window") parser.add_argument("--stride", type=int, default=4, help="Step size between consecutive windows") parser.add_argument("--validation-split", type=float, default=0.2, help="Validation set fraction") parser.add_argument("--batch-size", type=int, default=128, help="Training batch size") parser.add_argument("--epochs", type=int, default=50, help="Maximum number of training epochs") parser.add_argument( "--model-type", choices=["cnn_lstm", "tcn", "svm"], default="cnn_lstm", help="Model architecture to train (choices: cnn_lstm, tcn, svm)", ) parser.add_argument("--model-out", default="pmu_cnn_lstm_model.keras", help="Path to save trained Keras model") parser.add_argument("--scaler-out", default="pmu_feature_scaler.pkl", help="Path to save fitted StandardScaler") parser.add_argument("--metadata-out", default="pmu_metadata.json", help="Path to save metadata JSON") parser.add_argument( "--tensorboard-log-dir", default=None, help="Optional directory to write TensorBoard logs (defaults to tensorboard_runs/)", ) parser.add_argument( "--no-tensorboard", dest="tensorboard", action="store_false", help="Disable TensorBoard logging for neural network models", ) parser.set_defaults(tensorboard=True) return parser.parse_args(argv) def main(argv: Sequence[str] | None = None) -> None: args = parse_args(argv) run_training(args) if __name__ == "__main__": main()