VincentCroft committed on
Commit e9cfe70 · Parent: 61d758d

Fix Gradio download button initialization

Files changed (3):
  1. app.py +965 -84
  2. model/.gitkeep +0 -0
  3. requirements.txt +1 -0
app.py CHANGED
@@ -18,12 +18,13 @@ os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")
 
 import re
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Sequence, Tuple
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
 
 import gradio as gr
 import joblib
 import numpy as np
 import pandas as pd
+import requests
 from huggingface_hub import hf_hub_download
 from tensorflow.keras.models import load_model
 
@@ -44,6 +45,9 @@ LOCAL_MODEL_FILE = os.environ.get("PMU_MODEL_FILE", "pmu_cnn_lstm_model.keras")
 LOCAL_SCALER_FILE = os.environ.get("PMU_SCALER_FILE", "pmu_feature_scaler.pkl")
 LOCAL_METADATA_FILE = os.environ.get("PMU_METADATA_FILE", "pmu_metadata.json")
 
+MODEL_OUTPUT_DIR = Path(os.environ.get("PMU_MODEL_DIR", "model")).resolve()
+MODEL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
 HUB_REPO = os.environ.get("PMU_HUB_REPO", "")
 HUB_MODEL_FILENAME = os.environ.get("PMU_HUB_MODEL_FILENAME", LOCAL_MODEL_FILE)
 HUB_SCALER_FILENAME = os.environ.get("PMU_HUB_SCALER_FILENAME", LOCAL_SCALER_FILE)
@@ -75,6 +79,8 @@ def download_from_hub(filename: str) -> Optional[Path]:
 def resolve_artifact(local_name: str, env_var: str, hub_filename: str) -> Optional[Path]:
     print(f"Resolving artifact: {local_name}, env: {env_var}, hub: {hub_filename}")
     candidates = [Path(local_name)] if local_name else []
+    if local_name:
+        candidates.append(MODEL_OUTPUT_DIR / Path(local_name).name)
     env_value = os.environ.get(env_var)
     if env_value:
         candidates.append(Path(env_value))
@@ -173,8 +179,12 @@ DEFAULT_WINDOW_STRIDE: int = DEFAULT_STRIDE
 MODEL_TYPE: str = "cnn_lstm"
 MODEL_FORMAT: str = "keras"
 
+def _model_output_path(filename: str) -> str:
+    return str(MODEL_OUTPUT_DIR / Path(filename).name)
+
+
 MODEL_FILENAME_BY_TYPE: Dict[str, str] = {
-    "cnn_lstm": LOCAL_MODEL_FILE,
+    "cnn_lstm": Path(LOCAL_MODEL_FILE).name,
    "tcn": "pmu_tcn_model.keras",
    "svm": "pmu_svm_model.joblib",
 }
@@ -183,6 +193,213 @@ REQUIRED_PMU_COLUMNS: Tuple[str, ...] = tuple(DEFAULT_FEATURE_COLUMNS)
 TRAINING_UPLOAD_DIR = Path(os.environ.get("PMU_TRAINING_UPLOAD_DIR", "training_uploads"))
 TRAINING_UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
 
+TRAINING_DATA_REPO = os.environ.get("PMU_TRAINING_DATA_REPO", "VincentCroft/ThesisModelData")
+TRAINING_DATA_BRANCH = os.environ.get("PMU_TRAINING_DATA_BRANCH", "main")
+TRAINING_DATA_DIR = Path(os.environ.get("PMU_TRAINING_DATA_DIR", "training_dataset"))
+TRAINING_DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+GITHUB_CONTENT_CACHE: Dict[str, List[Dict[str, Any]]] = {}
+
+
+APP_CSS = """
+#available-files-grid .wrap {
+    display: grid;
+    grid-template-columns: repeat(4, minmax(0, 1fr));
+    gap: 0.5rem;
+    max-height: 24rem;
+    overflow-y: auto;
+    padding-right: 0.25rem;
+}
+
+#available-files-grid {
+    position: relative;
+}
+
+#available-files-grid .wrap > div {
+    min-width: 0;
+}
+
+#available-files-grid .wrap label {
+    margin: 0;
+    display: flex;
+    align-items: center;
+    padding: 0.45rem 0.65rem;
+    border-radius: 0.65rem;
+    background-color: rgba(255, 255, 255, 0.05);
+    border: 1px solid rgba(255, 255, 255, 0.08);
+    transition: background-color 0.2s ease, border-color 0.2s ease;
+    min-height: 2.5rem;
+}
+
+#available-files-grid .wrap label:hover {
+    background-color: rgba(90, 200, 250, 0.16);
+    border-color: rgba(90, 200, 250, 0.4);
+}
+
+#available-files-grid .wrap label span {
+    overflow: hidden;
+    text-overflow: ellipsis;
+    white-space: nowrap;
+}
+
+#available-files-grid .gradio-loading {
+    position: absolute;
+    inset: 0;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    background: rgba(10, 14, 23, 0.72);
+    border-radius: 0.75rem;
+    z-index: 10;
+}
+
+#date-browser-row {
+    gap: 0.75rem;
+}
+
+#date-browser-row .date-browser-column {
+    flex: 1 1 0%;
+    min-width: 0;
+}
+
+#date-browser-row .date-browser-column > .gradio-dropdown,
+#date-browser-row .date-browser-column > .gradio-button {
+    width: 100%;
+}
+
+#date-browser-row .date-browser-column > .gradio-dropdown > div {
+    width: 100%;
+}
+
+#date-browser-row .date-browser-column .gradio-button {
+    justify-content: center;
+}
+
+#training-files-summary textarea {
+    max-height: 12rem;
+    overflow-y: auto;
+}
+
+#download-selected-button {
+    width: 100%;
+}
+
+#download-selected-button .gradio-button {
+    width: 100%;
+    justify-content: center;
+}
+
+#artifact-download-row {
+    gap: 0.75rem;
+}
+
+#artifact-download-row .artifact-download-button {
+    flex: 1 1 0%;
+    min-width: 0;
+}
+
+#artifact-download-row .artifact-download-button .gradio-button {
+    width: 100%;
+    justify-content: center;
+}
+"""
+
+
+def _github_cache_key(path: str) -> str:
+    return path or "__root__"
+
+
+def _github_api_url(path: str) -> str:
+    clean_path = path.strip("/")
+    base = f"https://api.github.com/repos/{TRAINING_DATA_REPO}/contents"
+    if clean_path:
+        return f"{base}/{clean_path}?ref={TRAINING_DATA_BRANCH}"
+    return f"{base}?ref={TRAINING_DATA_BRANCH}"
+
+
+def list_remote_directory(path: str = "", *, force_refresh: bool = False) -> List[Dict[str, Any]]:
+    key = _github_cache_key(path)
+    if not force_refresh and key in GITHUB_CONTENT_CACHE:
+        return GITHUB_CONTENT_CACHE[key]
+
+    url = _github_api_url(path)
+    response = requests.get(url, timeout=30)
+    if response.status_code != 200:
+        raise RuntimeError(
+            f"GitHub API request failed for `{path or '.'}` (status {response.status_code})."
+        )
+
+    payload = response.json()
+    if not isinstance(payload, list):
+        raise RuntimeError("Unexpected GitHub API payload. Expected a directory listing.")
+
+    GITHUB_CONTENT_CACHE[key] = payload
+    return payload
+
+
+def list_remote_years(force_refresh: bool = False) -> List[str]:
+    entries = list_remote_directory("", force_refresh=force_refresh)
+    years = [item["name"] for item in entries if item.get("type") == "dir"]
+    return sorted(years)
+
+
+def list_remote_months(year: str, *, force_refresh: bool = False) -> List[str]:
+    if not year:
+        return []
+    entries = list_remote_directory(year, force_refresh=force_refresh)
+    months = [item["name"] for item in entries if item.get("type") == "dir"]
+    return sorted(months)
+
+
+def list_remote_days(year: str, month: str, *, force_refresh: bool = False) -> List[str]:
+    if not year or not month:
+        return []
+    entries = list_remote_directory(f"{year}/{month}", force_refresh=force_refresh)
+    days = [item["name"] for item in entries if item.get("type") == "dir"]
+    return sorted(days)
+
+
+def list_remote_files(year: str, month: str, day: str, *, force_refresh: bool = False) -> List[str]:
+    if not year or not month or not day:
+        return []
+    entries = list_remote_directory(
+        f"{year}/{month}/{day}", force_refresh=force_refresh
+    )
+    files = [item["name"] for item in entries if item.get("type") == "file"]
+    return sorted(files)
+
+
+def download_repository_file(year: str, month: str, day: str, filename: str) -> Path:
+    if not filename:
+        raise ValueError("Filename cannot be empty when downloading repository data.")
+
+    relative_parts = [part for part in (year, month, day, filename) if part]
+    if len(relative_parts) < 4:
+        raise ValueError("Provide year, month, day, and filename to download a CSV.")
+
+    relative_path = "/".join(relative_parts)
+    raw_url = (
+        f"https://raw.githubusercontent.com/{TRAINING_DATA_REPO}/"
+        f"{TRAINING_DATA_BRANCH}/{relative_path}"
+    )
+
+    response = requests.get(raw_url, stream=True, timeout=120)
+    if response.status_code != 200:
+        raise RuntimeError(
+            f"Failed to download `{relative_path}` (status {response.status_code})."
+        )
+
+    target_dir = TRAINING_DATA_DIR.joinpath(year, month, day)
+    target_dir.mkdir(parents=True, exist_ok=True)
+    target_path = target_dir / filename
+
+    with open(target_path, "wb") as handle:
+        for chunk in response.iter_content(chunk_size=1 << 20):
+            if chunk:
+                handle.write(chunk)
+
+    return target_path
+
 
 def _normalise_header(name: str) -> str:
     return str(name).strip().lower()
@@ -218,7 +435,7 @@ def guess_label_from_columns(columns: Sequence[str], preferred: Optional[str] =
 def summarise_training_files(paths: Sequence[str], notes: Sequence[str]) -> str:
     lines = [Path(path).name for path in paths]
     lines.extend(notes)
-    return "\n".join(lines) if lines else "No training files selected."
+    return "\n".join(lines) if lines else "No training files available."
 
 
 def read_training_status(status_file_path: str) -> str:
@@ -259,6 +476,36 @@ def _persist_uploaded_file(file_obj) -> Optional[Path]:
     return destination
 
 
+def prepare_training_paths(
+    paths: Sequence[str], current_label: str, cleanup_missing: bool = False
+):
+    valid_paths: List[str] = []
+    notes: List[str] = []
+    columns_map: Dict[str, str] = {}
+    for path in paths:
+        try:
+            df = load_measurement_csv(path)
+        except Exception as exc:  # pragma: no cover - user file diagnostics
+            notes.append(f"⚠️ Skipped {Path(path).name}: {exc}")
+            if cleanup_missing:
+                try:
+                    Path(path).unlink(missing_ok=True)
+                except Exception:
+                    pass
+            continue
+        valid_paths.append(str(path))
+        for col in df.columns:
+            columns_map[_normalise_header(col)] = str(col)
+
+    summary = summarise_training_files(valid_paths, notes)
+    preferred = current_label or LABEL_COLUMN
+    dropdown_choices = sorted(columns_map.values()) if columns_map else [preferred or LABEL_COLUMN]
+    guessed = guess_label_from_columns(dropdown_choices, preferred)
+    dropdown_value = guessed or preferred or LABEL_COLUMN
+
+    return valid_paths, summary, gr.update(choices=dropdown_choices, value=dropdown_value)
+
+
 def append_training_files(new_files, existing_paths: Sequence[str], current_label: str):
     if isinstance(existing_paths, (str, Path)):
         paths: List[str] = [str(existing_paths)]
@@ -275,32 +522,411 @@ def append_training_files(new_files, existing_paths: Sequence[str], current_labe
         if path_str not in paths:
             paths.append(path_str)
 
-    valid_paths: List[str] = []
+    return prepare_training_paths(paths, current_label, cleanup_missing=True)
+
+
+def load_repository_training_files(current_label: str, force_refresh: bool = False):
+    if force_refresh:
+        # Clearing the cache is enough because downloads are now on-demand.
+        for cached in list(TRAINING_DATA_DIR.glob("*")):
+            # On refresh we keep previously downloaded files; no deletion required.
+            # The flag triggers downstream UI updates only.
+            break
+
+    csv_paths = sorted(
+        str(path)
+        for path in TRAINING_DATA_DIR.rglob("*.csv")
+        if path.is_file()
+    )
+    if not csv_paths:
+        message = (
+            "No local database CSVs are available yet. Use the database browser "
+            "below to download specific days before training."
+        )
+        default_label = current_label or LABEL_COLUMN or "Fault"
+        return (
+            [],
+            message,
+            gr.update(choices=[default_label], value=default_label),
+            message,
+        )
+
+    valid_paths, summary, label_update = prepare_training_paths(
+        csv_paths, current_label, cleanup_missing=False
+    )
+
+    info = (
+        f"Ready with {len(valid_paths)} CSV file(s) cached locally under "
+        f"the database cache `{TRAINING_DATA_DIR}`."
+    )
+
+    return valid_paths, summary, label_update, info
+
+
+def refresh_remote_browser(force_refresh: bool = False):
+    if force_refresh:
+        GITHUB_CONTENT_CACHE.clear()
+    try:
+        years = list_remote_years(force_refresh=force_refresh)
+        if years:
+            message = "Select a year, month, and day to list available CSV files."
+        else:
+            message = (
+                "⚠️ No directories were found in the database root. Verify the upstream "
+                "structure."
+            )
+        return (
+            gr.update(choices=years, value=None),
+            gr.update(choices=[], value=None),
+            gr.update(choices=[], value=None),
+            gr.update(choices=[], value=[]),
+            message,
+        )
+    except Exception as exc:
+        return (
+            gr.update(choices=[], value=None),
+            gr.update(choices=[], value=None),
+            gr.update(choices=[], value=None),
+            gr.update(choices=[], value=[]),
+            f"⚠️ Failed to query database: {exc}",
+        )
+
+
+def on_year_change(year: Optional[str]):
+    if not year:
+        return (
+            gr.update(choices=[], value=None),
+            gr.update(choices=[], value=None),
+            gr.update(choices=[], value=[]),
+            "Select a year to continue.",
+        )
+    try:
+        months = list_remote_months(year)
+        message = (
+            f"Year `{year}` selected. Choose a month to drill down."
+            if months
+            else f"⚠️ No months available under `{year}`."
+        )
+        return (
+            gr.update(choices=months, value=None),
+            gr.update(choices=[], value=None),
+            gr.update(choices=[], value=[]),
+            message,
+        )
+    except Exception as exc:
+        return (
+            gr.update(choices=[], value=None),
+            gr.update(choices=[], value=None),
+            gr.update(choices=[], value=[]),
+            f"⚠️ Failed to list months: {exc}",
+        )
+
+
+def on_month_change(year: Optional[str], month: Optional[str]):
+    if not year or not month:
+        return (
+            gr.update(choices=[], value=None),
+            gr.update(choices=[], value=[]),
+            "Select a month to continue.",
+        )
+    try:
+        days = list_remote_days(year, month)
+        message = (
+            f"Month `{year}/{month}` ready. Pick a day to view files."
+            if days
+            else f"⚠️ No day folders found under `{year}/{month}`."
+        )
+        return (
+            gr.update(choices=days, value=None),
+            gr.update(choices=[], value=[]),
+            message,
+        )
+    except Exception as exc:
+        return (
+            gr.update(choices=[], value=None),
+            gr.update(choices=[], value=[]),
+            f"⚠️ Failed to list days: {exc}",
+        )
+
+
+def on_day_change(year: Optional[str], month: Optional[str], day: Optional[str]):
+    if not year or not month or not day:
+        return (
+            gr.update(choices=[], value=[]),
+            "Select a day to load file names.",
+        )
+    try:
+        files = list_remote_files(year, month, day)
+        message = (
+            f"{len(files)} file(s) available for `{year}/{month}/{day}`."
+            if files
+            else f"⚠️ No CSV files found under `{year}/{month}/{day}`."
+        )
+        return (
+            gr.update(choices=files, value=[]),
+            message,
+        )
+    except Exception as exc:
+        return (
+            gr.update(choices=[], value=[]),
+            f"⚠️ Failed to list files: {exc}",
+        )
+
+
+def download_selected_files(
+    year: Optional[str],
+    month: Optional[str],
+    day: Optional[str],
+    filenames: Sequence[str],
+    current_label: str,
+):
+    if not filenames:
+        message = "Select at least one CSV before downloading."
+        local = load_repository_training_files(current_label)
+        return (*local, gr.update(), message)
+
+    success: List[str] = []
     notes: List[str] = []
-    columns_map: Dict[str, str] = {}
-    for path in paths:
+    for filename in filenames:
         try:
-            df = load_measurement_csv(path)
-        except Exception as exc:  # pragma: no cover - user file diagnostics
-            notes.append(f"⚠️ Skipped {Path(path).name}: {exc}")
+            path = download_repository_file(year or "", month or "", day or "", filename)
+            success.append(str(path))
+        except Exception as exc:
+            notes.append(f"⚠️ {filename}: {exc}")
+
+    local = load_repository_training_files(current_label)
+
+    message_lines = []
+    if success:
+        message_lines.append(
+            f"Downloaded {len(success)} file(s) to the database cache `{TRAINING_DATA_DIR}`."
+        )
+    if notes:
+        message_lines.extend(notes)
+    if not message_lines:
+        message_lines.append("No files were downloaded.")
+
+    return (*local, gr.update(value=[]), "\n".join(message_lines))
+
+
+def download_day_bundle(
+    year: Optional[str],
+    month: Optional[str],
+    day: Optional[str],
+    current_label: str,
+):
+    if not (year and month and day):
+        local = load_repository_training_files(current_label)
+        return (
+            *local,
+            gr.update(),
+            "Select a year, month, and day before downloading an entire day.",
+        )
+
+    try:
+        files = list_remote_files(year, month, day)
+    except Exception as exc:
+        local = load_repository_training_files(current_label)
+        return (
+            *local,
+            gr.update(),
+            f"⚠️ Failed to list CSVs for `{year}/{month}/{day}`: {exc}",
+        )
+
+    if not files:
+        local = load_repository_training_files(current_label)
+        return (
+            *local,
+            gr.update(),
+            f"No CSV files were found for `{year}/{month}/{day}`.",
+        )
+
+    result = list(download_selected_files(year, month, day, files, current_label))
+    result[-1] = (
+        f"Downloaded all {len(files)} CSV file(s) for `{year}/{month}/{day}`.\n"
+        f"{result[-1]}"
+    )
+    return tuple(result)
+
+
+def download_month_bundle(
+    year: Optional[str], month: Optional[str], current_label: str
+):
+    if not (year and month):
+        local = load_repository_training_files(current_label)
+        return (
+            *local,
+            gr.update(),
+            "Select a year and month before downloading an entire month.",
+        )
+
+    try:
+        days = list_remote_days(year, month)
+    except Exception as exc:
+        local = load_repository_training_files(current_label)
+        return (
+            *local,
+            gr.update(),
+            f"⚠️ Failed to enumerate days for `{year}/{month}`: {exc}",
+        )
+
+    if not days:
+        local = load_repository_training_files(current_label)
+        return (
+            *local,
+            gr.update(),
+            f"No day folders were found for `{year}/{month}`.",
+        )
+
+    downloaded = 0
+    notes: List[str] = []
+    for day in days:
+        try:
+            files = list_remote_files(year, month, day)
+        except Exception as exc:
+            notes.append(f"⚠️ Failed to list `{year}/{month}/{day}`: {exc}")
+            continue
+        if not files:
+            notes.append(f"⚠️ No CSV files in `{year}/{month}/{day}`.")
+            continue
+        for filename in files:
             try:
-                Path(path).unlink(missing_ok=True)
-            except Exception:
-                pass
+                download_repository_file(year, month, day, filename)
+                downloaded += 1
+            except Exception as exc:
+                notes.append(
+                    f"⚠️ {year}/{month}/{day}/{filename}: {exc}"
+                )
+
+    local = load_repository_training_files(current_label)
+    message_lines = []
+    if downloaded:
+        message_lines.append(
+            f"Downloaded {downloaded} CSV file(s) for `{year}/{month}` into the "
+            f"database cache `{TRAINING_DATA_DIR}`."
+        )
+    message_lines.extend(notes)
+    if not message_lines:
+        message_lines.append("No files were downloaded.")
+
+    return (*local, gr.update(value=[]), "\n".join(message_lines))
+
+
+def download_year_bundle(year: Optional[str], current_label: str):
+    if not year:
+        local = load_repository_training_files(current_label)
+        return (
+            *local,
+            gr.update(),
+            "Select a year before downloading an entire year of CSVs.",
+        )
+
+    try:
+        months = list_remote_months(year)
+    except Exception as exc:
+        local = load_repository_training_files(current_label)
+        return (
+            *local,
+            gr.update(),
+            f"⚠️ Failed to enumerate months for `{year}`: {exc}",
+        )
+
+    if not months:
+        local = load_repository_training_files(current_label)
+        return (
+            *local,
+            gr.update(),
+            f"No month folders were found for `{year}`.",
+        )
+
+    downloaded = 0
+    notes: List[str] = []
+    for month in months:
+        try:
+            days = list_remote_days(year, month)
+        except Exception as exc:
+            notes.append(f"⚠️ Failed to list `{year}/{month}`: {exc}")
             continue
-        valid_paths.append(path)
-        for col in df.columns:
-            columns_map[_normalise_header(col)] = str(col)
+        if not days:
+            notes.append(f"⚠️ No day folders in `{year}/{month}`.")
+            continue
+        for day in days:
+            try:
+                files = list_remote_files(year, month, day)
+            except Exception as exc:
+                notes.append(f"⚠️ Failed to list `{year}/{month}/{day}`: {exc}")
+                continue
+            if not files:
+                notes.append(f"⚠️ No CSV files in `{year}/{month}/{day}`.")
+                continue
+            for filename in files:
+                try:
+                    download_repository_file(year, month, day, filename)
+                    downloaded += 1
+                except Exception as exc:
+                    notes.append(
+                        f"⚠️ {year}/{month}/{day}/{filename}: {exc}"
+                    )
+
+    local = load_repository_training_files(current_label)
+    message_lines = []
+    if downloaded:
+        message_lines.append(
+            f"Downloaded {downloaded} CSV file(s) for `{year}` into the "
+            f"database cache `{TRAINING_DATA_DIR}`."
+        )
+    message_lines.extend(notes)
+    if not message_lines:
+        message_lines.append("No files were downloaded.")
+
+    return (*local, gr.update(value=[]), "\n".join(message_lines))
 
-    paths = valid_paths
-    summary = summarise_training_files(paths, notes)
-    column_choices = sorted(columns_map.values())
-    preferred = current_label or LABEL_COLUMN
-    guessed = guess_label_from_columns(column_choices, preferred)
-    dropdown_choices = column_choices if column_choices else [preferred or LABEL_COLUMN]
-    dropdown_value = guessed or preferred or LABEL_COLUMN
 
-    return paths, summary, gr.update(choices=dropdown_choices, value=dropdown_value)
+def clear_downloaded_cache(current_label: str):
+    status_message = ""
+    try:
+        if TRAINING_DATA_DIR.exists():
+            shutil.rmtree(TRAINING_DATA_DIR)
+        TRAINING_DATA_DIR.mkdir(parents=True, exist_ok=True)
+        status_message = (
+            f"Cleared all downloaded CSVs from database cache `{TRAINING_DATA_DIR}`."
+        )
+    except Exception as exc:
+        status_message = f"⚠️ Failed to clear database cache: {exc}"
+
+    local = load_repository_training_files(current_label, force_refresh=True)
+    remote = list(refresh_remote_browser(force_refresh=False))
+    if status_message:
+        previous = remote[-1]
+        if isinstance(previous, str) and previous:
+            remote[-1] = f"{status_message}\n{previous}"
+        else:
+            remote[-1] = status_message
+
+    return (*local, *remote)
+
+
+def normalise_output_directory(directory: Optional[str]) -> Path:
+    base = Path(directory or MODEL_OUTPUT_DIR)
+    base = base.expanduser()
+    if not base.is_absolute():
+        base = (Path.cwd() / base).resolve()
+    return base
+
+
+def resolve_output_path(
+    directory: Optional[Union[Path, str]], filename: Optional[str], fallback: str
+) -> Path:
+    if isinstance(directory, Path):
+        base = directory
+    else:
+        base = normalise_output_directory(directory)
+    candidate = Path(filename or "").expanduser()
+    if str(candidate):
+        if candidate.is_absolute():
+            return candidate
+        return (base / candidate).resolve()
+    return (base / fallback).resolve()
 
 
 def clear_training_files():
@@ -360,8 +986,9 @@ following ordered columns:
 15. `[338] UPMU_SUB22-C3:MAG` – phase C current magnitude
 16. `[339] UPMU_SUB22-C3:ANG` – phase C current angle
 
-Upload as many hourly CSV exports as needed—the training tab concatenates them
-before building sliding windows.
+The training tab automatically downloads the latest CSV exports from the
+`VincentCroft/ThesisModelData` repository and concatenates them before building
+sliding windows.
 
 ## Models Developed
 
@@ -729,7 +1356,17 @@ def build_interface() -> gr.Blocks:
         button_secondary_background_fill="#3f3f46",
         button_secondary_text_color="#f5f5f5",
     )
-    with gr.Blocks(title="Fault Classification - PMU Data", theme=theme) as demo:
+
+    def _normalise_directory_string(value: Optional[Union[str, Path]]) -> str:
+        if value is None:
+            return ""
+        path = Path(value).expanduser()
+        try:
+            return str(path.resolve())
+        except Exception:
+            return str(path)
+
+    with gr.Blocks(title="Fault Classification - PMU Data", theme=theme, css=APP_CSS) as demo:
         gr.Markdown("# Fault Classification for PMU & PV Data")
         gr.Markdown(
             "🖥️ TensorFlow is locked to CPU execution so the Space can run without CUDA drivers."
@@ -819,34 +1456,67 @@
         with gr.Tab("Training"):
            gr.Markdown("## Train or Fine-tune the Model")
            gr.Markdown(
-                "Upload one or more PMU CSV files to create a combined training dataset. "
-                "The files will be concatenated in upload order before generating sliding windows."
+                "Training data is automatically downloaded from the database. "
+                "Refresh the cache if new files are added upstream."
            )
 
            training_files_state = gr.State([])
            with gr.Row():
-                training_file_drop = gr.Files(
-                    label="Drag and drop PMU training CSVs",
-                    file_types=[".csv"],
-                    file_count="multiple",
-                    type="filepath",
-                )
-                with gr.Column(scale=1, min_width=180):
-                    training_upload = gr.UploadButton(
-                        "📂 Add training CSVs",
-                        file_types=[".csv"],
-                        file_count="multiple",
-                        type="filepath",
-                        variant="primary",
+                with gr.Column(scale=3):
+                    training_files_summary = gr.Textbox(
+                        label="Database training CSVs",
+                        value="Training dataset not loaded yet.",
+                        lines=4,
+                        interactive=False,
+                        elem_id="training-files-summary",
+                    )
+                with gr.Column(scale=2, min_width=240):
+                    dataset_info = gr.Markdown(
+                        "No local database CSVs downloaded yet.",
+                    )
+                    dataset_refresh = gr.Button(
+                        "🔄 Reload dataset from database",
+                        variant="secondary",
+                    )
+                    clear_cache_button = gr.Button(
+                        "🧹 Clear downloaded cache",
+                        variant="secondary",
                    )
-                clear_training = gr.Button("Clear list", variant="secondary")
 
-            training_files_summary = gr.Textbox(
-                label="Selected training CSVs",
-                value="No training files selected.",
-                lines=4,
-                interactive=False,
-            )
+            with gr.Accordion("📂 DataBaseBrowser", open=False):
+                gr.Markdown(
+                    "Browse the upstream database by date and download only the CSVs you need."
+                )
+                with gr.Row(elem_id="date-browser-row"):
+                    with gr.Column(scale=1, elem_classes=["date-browser-column"]):
+                        year_selector = gr.Dropdown(label="Year", choices=[])
+                        year_download_button = gr.Button(
+                            "⬇️ Download year CSVs", variant="secondary"
+                        )
+                    with gr.Column(scale=1, elem_classes=["date-browser-column"]):
+                        month_selector = gr.Dropdown(label="Month", choices=[])
+                        month_download_button = gr.Button(
+                            "⬇️ Download month CSVs", variant="secondary"
+                        )
+                    with gr.Column(scale=1, elem_classes=["date-browser-column"]):
+                        day_selector = gr.Dropdown(label="Day", choices=[])
+                        day_download_button = gr.Button(
+                            "⬇️ Download day CSVs", variant="secondary"
+                        )
+                available_files = gr.CheckboxGroup(
+                    label="Available CSV files",
+                    choices=[],
+                    value=[],
+                    elem_id="available-files-grid",
+                )
+                download_button = gr.Button(
+                    "⬇️ Download selected CSVs",
+                    variant="secondary",
+                    elem_id="download-selected-button",
+                )
+                repo_status = gr.Markdown(
+                    "Click 'Reload dataset from database' to fetch the directory tree."
+                )
 
            with gr.Row():
                label_input = gr.Dropdown(
@@ -879,10 +1549,8 @@ def build_interface() -> gr.Blocks:
                    label="Stride",
                )
 
-            model_default = (
-                str(MODEL_PATH)
-                if MODEL_PATH
-                else MODEL_FILENAME_BY_TYPE.get(MODEL_TYPE, LOCAL_MODEL_FILE)
+            model_default = MODEL_FILENAME_BY_TYPE.get(
+                MODEL_TYPE, Path(LOCAL_MODEL_FILE).name
            )
 
            with gr.Row():
@@ -909,16 +1577,54 @@
                )
 
            with gr.Row():
-                model_name = gr.Textbox(value=model_default, label="Model output filename")
+                output_directory = gr.Textbox(
+                    value=str(MODEL_OUTPUT_DIR),
+                    label="Output directory",
+                )
+                model_name = gr.Textbox(
+                    value=model_default,
+                    label="Model output filename",
+                )
                scaler_name = gr.Textbox(
-                    value=str(SCALER_PATH or LOCAL_SCALER_FILE),
+                    value=Path(LOCAL_SCALER_FILE).name,
                    label="Scaler output filename",
                )
                metadata_name = gr.Textbox(
-                    value=str(METADATA_PATH or LOCAL_METADATA_FILE),
+                    value=Path(LOCAL_METADATA_FILE).name,
                    label="Metadata output filename",
                )
 
+            with gr.Row(elem_id="artifact-download-row"):
+                model_download_button = gr.DownloadButton(
+                    "⬇️ Download model file",
+                    value=None,
+                    visible=False,
+                    elem_classes=["artifact-download-button"],
+                )
+                scaler_download_button = gr.DownloadButton(
+                    "⬇️ Download scaler file",
+                    value=None,
+                    visible=False,
+                    elem_classes=["artifact-download-button"],
+                )
+                metadata_download_button = gr.DownloadButton(
+                    "⬇️ Download metadata file",
+                    value=None,
+                    visible=False,
+                    elem_classes=["artifact-download-button"],
+                )
+                tensorboard_download_button = gr.DownloadButton(
+                    "⬇️ Download TensorBoard logs",
+                    value=None,
+                    visible=False,
+                    elem_classes=["artifact-download-button"],
+                )
+
+            model_download_button.file_name = Path(LOCAL_MODEL_FILE).name
+            scaler_download_button.file_name = Path(LOCAL_SCALER_FILE).name
+            metadata_download_button.file_name = Path(LOCAL_METADATA_FILE).name
+            tensorboard_download_button.file_name = "tensorboard_logs.zip"
+
            tensorboard_toggle = gr.Checkbox(
                value=True,
                label="Enable TensorBoard logging (creates downloadable archive)",
@@ -926,8 +1632,10 @@
 
            def _suggest_model_filename(choice: str, current_value: str):
                choice_key = (choice or "cnn_lstm").lower().replace("-", "_")
-                suggested = MODEL_FILENAME_BY_TYPE.get(choice_key, LOCAL_MODEL_FILE)
-                known_defaults = {Path(name).name for name in MODEL_FILENAME_BY_TYPE.values()}
+                suggested = MODEL_FILENAME_BY_TYPE.get(
+                    choice_key, Path(LOCAL_MODEL_FILE).name
+                )
+                known_defaults = set(MODEL_FILENAME_BY_TYPE.values())
                current_name = Path(current_value).name if current_value else ""
                if current_name and current_name not in known_defaults:
                    return gr.update()
@@ -948,10 +1656,6 @@
            report_output = gr.Dataframe(label="Classification report", interactive=False)
            history_output = gr.JSON(label="Training history")
            confusion_output = gr.Dataframe(label="Confusion matrix", interactive=False)
-            tensorboard_file = gr.File(
-                label="TensorBoard logs (.zip)",
-                interactive=False,
-            )
 
            # Message area at the bottom for progress updates
            with gr.Accordion("📋 Progress Messages", open=True):
@@ -978,21 +1682,56 @@
                validation_split,
                batch_size,
                epochs,
+                output_dir,
                model_filename,
                scaler_filename,
                metadata_filename,
                enable_tensorboard,
            ):
+                def _download_state(path: Optional[Union[str, Path]]):
+                    if not path:
+                        return gr.update(value=None, visible=False)
+                    candidate = Path(path)
+                    if candidate.exists():
+                        return gr.update(value=str(candidate), visible=True)
+                    return gr.update(value=None, visible=False)
+
                try:
+                    base_dir = normalise_output_directory(output_dir)
+                    base_dir.mkdir(parents=True, exist_ok=True)
+
+                    model_path = resolve_output_path(
+                        base_dir,
+                        model_filename,
+                        Path(LOCAL_MODEL_FILE).name,
+                    )
+                    scaler_path = resolve_output_path(
+                        base_dir,
+                        scaler_filename,
+                        Path(LOCAL_SCALER_FILE).name,
+                    )
+                    metadata_path = resolve_output_path(
+                        base_dir,
+                        metadata_filename,
+                        Path(LOCAL_METADATA_FILE).name,
+                    )
+
+                    model_path.parent.mkdir(parents=True, exist_ok=True)
+                    scaler_path.parent.mkdir(parents=True, exist_ok=True)
+                    metadata_path.parent.mkdir(parents=True, exist_ok=True)
+
                    # Create status file path for progress tracking
-                    status_file = Path(model_filename).parent / "training_status.txt"
+                    status_file = model_path.parent / "training_status.txt"
 
                    # Initialize status
                    with open(status_file, 'w') as f:
                        f.write("Starting training setup...")
 
                    if not file_paths:
-                        raise ValueError("Add at least one training CSV via the uploader before starting.")
+                        raise ValueError(
+                            "No training CSVs were found in the database cache. "
+                            "Use 'Reload dataset from database' and try again."
+                        )
 
                    with open(status_file, 'w') as f:
                        f.write("Loading and validating CSV files...")
@@ -1000,7 +1739,9 @@
                    available_paths = [path for path in file_paths if Path(path).exists()]
                    missing_paths = [Path(path).name for path in file_paths if not Path(path).exists()]
                    if not available_paths:
-                        raise ValueError("None of the referenced CSV files are available. Please upload them again.")
+                        raise ValueError(
+                            "Database training dataset is unavailable. Reload the dataset and retry."
+                        )
 
                    dfs = [load_measurement_csv(path) for path in available_paths]
                    combined = pd.concat(dfs, ignore_index=True)
@@ -1038,9 +1779,9 @@
                        batch_size=int(batch_size),
                        epochs=int(epochs),
                        model_type=model_choice,
-                        model_path=Path(model_filename),
-                        scaler_path=Path(scaler_filename),
-                        metadata_path=Path(metadata_filename),
+                        model_path=model_path,
+                        scaler_path=scaler_path,
+                        metadata_path=metadata_path,
                        enable_tensorboard=bool(enable_tensorboard),
                    )
 
@@ -1084,7 +1825,10 @@
                        report_df,
                        result["history"],
                        confusion_df,
-                        tensorboard_zip,
+                        _download_state(result["model_path"]),
+                        _download_state(result["scaler_path"]),
+                        _download_state(result["metadata_path"]),
+                        _download_state(tensorboard_zip),
                        gr.update(value=result.get("label_column", label_column)),
                    )
                except Exception as exc:
@@ -1093,13 +1837,19 @@
                        pd.DataFrame(),
                        {},
                        pd.DataFrame(),
-                        None,
+                        _download_state(None),
+                        _download_state(None),
+                        _download_state(None),
+                        _download_state(None),
                        gr.update(),
                    )
 
-            def _check_progress(model_filename, current_messages):
+            def _check_progress(output_dir, model_filename, current_messages):
                """Check training progress by reading status file and accumulate messages."""
-                status_file = Path(model_filename).parent / "training_status.txt"
+                model_path = resolve_output_path(
+                    output_dir, model_filename, Path(LOCAL_MODEL_FILE).name
+                )
+                status_file = model_path.parent / "training_status.txt"
                status_message = read_training_status(str(status_file))
 
                # Add timestamp to the message
@@ -1131,6 +1881,7 @@
                    validation_train,
                    batch_train,
                    epochs_train,
+                    output_directory,
                    model_name,
                    scaler_name,
                    metadata_name,
@@ -1141,7 +1892,10 @@
                    report_output,
                    history_output,
                    confusion_output,
-                    tensorboard_file,
+                    model_download_button,
+                    scaler_download_button,
+                    metadata_download_button,
+                    tensorboard_download_button,
                    label_input,
                ],
                concurrency_limit=EVENT_CONCURRENCY_LIMIT,
@@ -1149,25 +1903,152 @@
 
        progress_button.click(
            _check_progress,
-            inputs=[model_name, progress_messages],
+            inputs=[output_directory, model_name, progress_messages],
            outputs=[progress_messages],
        )
 
-        training_upload.upload(
-            append_training_files,
-            inputs=[training_upload, training_files_state, label_input],
-            outputs=[training_files_state, training_files_summary, label_input],
+        year_selector.change(
+            on_year_change,
+            inputs=[year_selector],
+            outputs=[month_selector, day_selector, available_files, repo_status],
            concurrency_limit=EVENT_CONCURRENCY_LIMIT,
        )
-        training_file_drop.upload(
-            append_training_files,
-            inputs=[training_file_drop, training_files_state, label_input],
-            outputs=[training_files_state, training_files_summary, label_input],
+
+        month_selector.change(
+            on_month_change,
+            inputs=[year_selector, month_selector],
+            outputs=[day_selector, available_files, repo_status],
            concurrency_limit=EVENT_CONCURRENCY_LIMIT,
        )
-        clear_training.click(
-            clear_training_files,
-            outputs=[training_files_state, training_files_summary, label_input, training_file_drop],
+
+        day_selector.change(
+            on_day_change,
+            inputs=[year_selector, month_selector, day_selector],
+            outputs=[available_files, repo_status],
+            concurrency_limit=EVENT_CONCURRENCY_LIMIT,
+        )
+
+        download_button.click(
+            download_selected_files,
+            inputs=[
+                year_selector,
+                month_selector,
+                day_selector,
+                available_files,
+                label_input,
+            ],
+            outputs=[
+                training_files_state,
+                training_files_summary,
+                label_input,
+                dataset_info,
+                available_files,
+                repo_status,
+            ],
+            concurrency_limit=EVENT_CONCURRENCY_LIMIT,
+        )
+
+        year_download_button.click(
+            download_year_bundle,
+            inputs=[year_selector, label_input],
+            outputs=[
+                training_files_state,
+                training_files_summary,
+                label_input,
+                dataset_info,
+                available_files,
+                repo_status,
+            ],
+            concurrency_limit=EVENT_CONCURRENCY_LIMIT,
+        )
+
+        month_download_button.click(
+            download_month_bundle,
+            inputs=[year_selector, month_selector, label_input],
+            outputs=[
+                training_files_state,
+                training_files_summary,
+                label_input,
+                dataset_info,
+                available_files,
+                repo_status,
+            ],
+            concurrency_limit=EVENT_CONCURRENCY_LIMIT,
+        )
+
+        day_download_button.click(
+            download_day_bundle,
+            inputs=[year_selector, month_selector, day_selector, label_input],
+            outputs=[
+                training_files_state,
+                training_files_summary,
+                label_input,
+                dataset_info,
+                available_files,
+                repo_status,
+            ],
+            concurrency_limit=EVENT_CONCURRENCY_LIMIT,
+        )
+
+        def _reload_dataset(current_label):
+            local = load_repository_training_files(current_label, force_refresh=True)
+            remote = refresh_remote_browser(force_refresh=True)
+            return (*local, *remote)
+
+        dataset_refresh.click(
+            _reload_dataset,
+            inputs=[label_input],
+            outputs=[
+                training_files_state,
+                training_files_summary,
+                label_input,
+                dataset_info,
+                year_selector,
+                month_selector,
+                day_selector,
+                available_files,
+                repo_status,
+            ],
+            concurrency_limit=EVENT_CONCURRENCY_LIMIT,
+        )
+
+        clear_cache_button.click(
+            clear_downloaded_cache,
+            inputs=[label_input],
+            outputs=[
+                training_files_state,
+                training_files_summary,
+                label_input,
+                dataset_info,
+                year_selector,
+                month_selector,
+                day_selector,
+                available_files,
+                repo_status,
+            ],
+            concurrency_limit=EVENT_CONCURRENCY_LIMIT,
+        )
+
+        def _initialise_dataset():
+            local = load_repository_training_files(LABEL_COLUMN, force_refresh=False)
+            remote = refresh_remote_browser(force_refresh=False)
+            return (*local, *remote)
+
+        demo.load(
+            _initialise_dataset,
+            inputs=None,
+            outputs=[
+                training_files_state,
+                training_files_summary,
+                label_input,
+                dataset_info,
+                year_selector,
+                month_selector,
+                day_selector,
+                available_files,
+                repo_status,
+            ],
+            queue=False,
        )
 
    return demo
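Note on the fix itself: the `gr.DownloadButton` components above are created with `value=None` and `visible=False`, and the training handler reveals them via `gr.update(...)` only once the corresponding artifact actually exists on disk, instead of binding the buttons to file paths that may not exist at render time. A minimal, self-contained sketch of that pattern (the component and file names here are illustrative, not the app's exact wiring):

    import gradio as gr
    from pathlib import Path

    with gr.Blocks() as demo:
        train = gr.Button("Train")
        # Start hidden with no file bound; attach a real path only after training.
        download = gr.DownloadButton("⬇️ Download model file", value=None, visible=False)

        def _after_training():
            artifact = Path("model/pmu_cnn_lstm_model.keras")  # hypothetical output path
            if artifact.exists():
                return gr.update(value=str(artifact), visible=True)
            return gr.update(value=None, visible=False)

        train.click(_after_training, outputs=[download])

    demo.launch()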
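For context, the new database browser walks the `VincentCroft/ThesisModelData` repository as a year/month/day tree through the public GitHub contents API and caches each JSON listing in-process (unauthenticated API calls are rate-limited by GitHub, which is one reason the app caches listings). A condensed sketch of the same listing flow, with the error handling and caching trimmed:

    import requests

    REPO = "VincentCroft/ThesisModelData"
    BRANCH = "main"

    def list_dir(path: str = ""):
        # One directory level per call, pinned to a branch; equivalent to the
        # app's manually built "?ref=" query string.
        url = f"https://api.github.com/repos/{REPO}/contents/{path}".rstrip("/")
        response = requests.get(url, params={"ref": BRANCH}, timeout=30)
        response.raise_for_status()
        return response.json()  # a list of {"name": ..., "type": ...} entries

    years = sorted(e["name"] for e in list_dir() if e["type"] == "dir")
    print(years)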
model/.gitkeep ADDED
File without changes
requirements.txt CHANGED
@@ -6,3 +6,4 @@ scikit-learn
 huggingface_hub
 matplotlib
 joblib
+requests
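
`requests` is the only new dependency; it backs both the contents-API listings and the raw-file downloads. The download path streams each CSV to the local cache in 1 MiB chunks rather than buffering whole files in memory. A trimmed sketch mirroring `download_repository_file`:

    import requests
    from pathlib import Path

    def fetch_csv(relative_path: str, dest: Path) -> Path:
        # Raw-file URL; streaming keeps memory flat for large hourly exports.
        url = (
            "https://raw.githubusercontent.com/"
            f"VincentCroft/ThesisModelData/main/{relative_path}"
        )
        response = requests.get(url, stream=True, timeout=120)
        response.raise_for_status()
        dest.parent.mkdir(parents=True, exist_ok=True)
        with open(dest, "wb") as handle:
            for chunk in response.iter_content(chunk_size=1 << 20):  # 1 MiB
                if chunk:  # skip keep-alive chunks
                    handle.write(chunk)
        return dest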