| """ |
| Data registration script for the predictive maintenance project. |
| |
| This script is the analogue of the notebook's `data_register.py`: |
| - It ensures the Hugging Face dataset repo exists. |
| - It uploads the raw engine dataset from the local `data/` folder |
| into the dataset repo so it can be consumed by other stages |
| (EDA, data preparation, model training) using a consistent source. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import sys |
| from pathlib import Path |
|
|
| try: |
| import config |
| from hf_data_utils import register_raw_engine_data_to_hf |
| except ImportError as e: |
| print(f"ERROR: Failed to import modules: {e}", file=sys.stderr) |
| print(f"Python path: {sys.path}", file=sys.stderr) |
| sys.exit(1) |
|
|
|
|
| def main() -> None: |
| try: |
| print(f"PROJECT_ROOT: {config.PROJECT_ROOT}") |
| print(f"RAW_DATA_FILE: {config.RAW_DATA_FILE}") |
| print(f"RAW_DATA_FILE exists: {config.RAW_DATA_FILE.exists()}") |
| print(f"HF_DATASET_REPO: {config.HF_DATASET_REPO}") |
| print(f"HF_TOKEN is set: {bool(config.HF_TOKEN)}") |
| |
| if not config.RAW_DATA_FILE.exists(): |
| raise FileNotFoundError( |
| f"Expected raw data at {config.RAW_DATA_FILE}, " |
| "but the file does not exist. Make sure `engine_data.csv` " |
| "is placed in the `data/` folder." |
| ) |
|
|
| if not config.HF_TOKEN: |
| raise ValueError( |
| "HF_TOKEN is not set. Please set it as an environment variable " |
| "or in GitHub Secrets." |
| ) |
|
|
| print(f"Registering raw engine dataset from: {config.RAW_DATA_FILE}") |
| print(f"Target HF dataset repo: {config.HF_DATASET_REPO}") |
| register_raw_engine_data_to_hf() |
| print("✅ Dataset registration to Hugging Face completed.") |
| except Exception as e: |
| print(f"ERROR: {type(e).__name__}: {e}", file=sys.stderr) |
| import traceback |
| traceback.print_exc() |
| sys.exit(1) |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|
|
|