pedroapfilho committed on
Commit
ad1969b
·
unverified ·
1 Parent(s): f7d4206

Use HF cache for dataset downloads instead of custom local_dir

Browse files

On HF Spaces, snapshot_download without local_dir leverages the
co-located HF cache, avoiding file duplication to ephemeral storage.

Files changed (2) hide show
  1. app.py +1 -2
  2. src/lora_trainer.py +9 -14
app.py CHANGED
@@ -330,10 +330,9 @@ def lora_download_hf(dataset_id, hf_token, training_state):
330
  return "Enter a dataset ID (e.g. pedroapfilho/lofi-tracks)", training_state
331
 
332
  token = hf_token.strip() if hf_token else None
333
- output_dir = str(Path("lora_training") / "hf_datasets")
334
 
335
  local_dir, dl_status = download_hf_dataset(
336
- dataset_id.strip(), output_dir, hf_token=token
337
  )
338
 
339
  if not local_dir:
 
330
  return "Enter a dataset ID (e.g. pedroapfilho/lofi-tracks)", training_state
331
 
332
  token = hf_token.strip() if hf_token else None
 
333
 
334
  local_dir, dl_status = download_hf_dataset(
335
+ dataset_id.strip(), hf_token=token
336
  )
337
 
338
  if not local_dir:
src/lora_trainer.py CHANGED
@@ -16,35 +16,30 @@ AUDIO_EXTENSIONS = ["*.wav", "*.mp3", "*.flac", "*.ogg", "*.opus"]
16
 
17
  def download_hf_dataset(
18
  dataset_id: str,
19
- output_dir: str,
20
  hf_token: Optional[str] = None,
21
  ) -> Tuple[str, str]:
22
  """
23
  Download an audio dataset from HuggingFace Hub.
24
 
25
- Uses snapshot_download to fetch only audio files from the repo,
26
- skipping non-audio content like READMEs, metadata, etc.
 
27
 
28
  Args:
29
  dataset_id: HuggingFace dataset repo ID (e.g. "pedroapfilho/lofi-tracks")
30
- output_dir: Local directory to download into
31
  hf_token: Optional HuggingFace token for private repos
32
 
33
  Returns:
34
- Tuple of (local_dir, status_message)
35
  """
36
  try:
37
  from huggingface_hub import snapshot_download
38
 
39
- output_path = Path(output_dir)
40
- output_path.mkdir(parents=True, exist_ok=True)
41
 
42
- logger.info(f"Downloading dataset '{dataset_id}' to {output_dir}...")
43
-
44
- local_dir = snapshot_download(
45
  repo_id=dataset_id,
46
  repo_type="dataset",
47
- local_dir=str(output_path / dataset_id.replace("/", "_")),
48
  token=hf_token or None,
49
  allow_patterns=AUDIO_EXTENSIONS,
50
  )
@@ -52,12 +47,12 @@ def download_hf_dataset(
52
  audio_count = sum(
53
  1
54
  for ext in AUDIO_EXTENSIONS
55
- for _ in Path(local_dir).rglob(ext)
56
  )
57
 
58
- status = f"Downloaded {audio_count} audio files from {dataset_id}"
59
  logger.info(status)
60
- return local_dir, status
61
 
62
  except ImportError:
63
  msg = "huggingface_hub is not installed. Run: pip install huggingface_hub"
 
16
 
17
  def download_hf_dataset(
18
  dataset_id: str,
 
19
  hf_token: Optional[str] = None,
20
  ) -> Tuple[str, str]:
21
  """
22
  Download an audio dataset from HuggingFace Hub.
23
 
24
+ Uses snapshot_download without local_dir so HF's built-in cache
25
+ handles storage. On HF Spaces this is co-located with HF storage,
26
+ avoiding unnecessary file duplication.
27
 
28
  Args:
29
  dataset_id: HuggingFace dataset repo ID (e.g. "pedroapfilho/lofi-tracks")
 
30
  hf_token: Optional HuggingFace token for private repos
31
 
32
  Returns:
33
+ Tuple of (cached_dir, status_message)
34
  """
35
  try:
36
  from huggingface_hub import snapshot_download
37
 
38
+ logger.info(f"Fetching dataset '{dataset_id}' via HF cache...")
 
39
 
40
+ cached_dir = snapshot_download(
 
 
41
  repo_id=dataset_id,
42
  repo_type="dataset",
 
43
  token=hf_token or None,
44
  allow_patterns=AUDIO_EXTENSIONS,
45
  )
 
47
  audio_count = sum(
48
  1
49
  for ext in AUDIO_EXTENSIONS
50
+ for _ in Path(cached_dir).rglob(ext)
51
  )
52
 
53
+ status = f"Loaded {audio_count} audio files from {dataset_id}"
54
  logger.info(status)
55
+ return cached_dir, status
56
 
57
  except ImportError:
58
  msg = "huggingface_hub is not installed. Run: pip install huggingface_hub"