pedroapfilho committed on
Commit
ad1969b
·
unverified ·
1 Parent(s): f7d4206

Use HF cache for dataset downloads instead of custom local_dir

Browse files

On HF Spaces, snapshot_download without local_dir leverages the
co-located HF cache, avoiding file duplication to ephemeral storage.

Files changed (2) hide show
  1. app.py +1 -2
  2. src/lora_trainer.py +9 -14
app.py CHANGED
@@ -330,10 +330,9 @@ def lora_download_hf(dataset_id, hf_token, training_state):
330
  return "Enter a dataset ID (e.g. pedroapfilho/lofi-tracks)", training_state
331
 
332
  token = hf_token.strip() if hf_token else None
333
- output_dir = str(Path("lora_training") / "hf_datasets")
334
 
335
  local_dir, dl_status = download_hf_dataset(
336
- dataset_id.strip(), output_dir, hf_token=token
337
  )
338
 
339
  if not local_dir:
 
330
  return "Enter a dataset ID (e.g. pedroapfilho/lofi-tracks)", training_state
331
 
332
  token = hf_token.strip() if hf_token else None
 
333
 
334
  local_dir, dl_status = download_hf_dataset(
335
+ dataset_id.strip(), hf_token=token
336
  )
337
 
338
  if not local_dir:
src/lora_trainer.py CHANGED
@@ -16,35 +16,30 @@ AUDIO_EXTENSIONS = ["*.wav", "*.mp3", "*.flac", "*.ogg", "*.opus"]
16
 
17
  def download_hf_dataset(
18
  dataset_id: str,
19
- output_dir: str,
20
  hf_token: Optional[str] = None,
21
  ) -> Tuple[str, str]:
22
  """
23
  Download an audio dataset from HuggingFace Hub.
24
 
25
- Uses snapshot_download to fetch only audio files from the repo,
26
- skipping non-audio content like READMEs, metadata, etc.
 
27
 
28
  Args:
29
  dataset_id: HuggingFace dataset repo ID (e.g. "pedroapfilho/lofi-tracks")
30
- output_dir: Local directory to download into
31
  hf_token: Optional HuggingFace token for private repos
32
 
33
  Returns:
34
- Tuple of (local_dir, status_message)
35
  """
36
  try:
37
  from huggingface_hub import snapshot_download
38
 
39
- output_path = Path(output_dir)
40
- output_path.mkdir(parents=True, exist_ok=True)
41
 
42
- logger.info(f"Downloading dataset '{dataset_id}' to {output_dir}...")
43
-
44
- local_dir = snapshot_download(
45
  repo_id=dataset_id,
46
  repo_type="dataset",
47
- local_dir=str(output_path / dataset_id.replace("/", "_")),
48
  token=hf_token or None,
49
  allow_patterns=AUDIO_EXTENSIONS,
50
  )
@@ -52,12 +47,12 @@ def download_hf_dataset(
52
  audio_count = sum(
53
  1
54
  for ext in AUDIO_EXTENSIONS
55
- for _ in Path(local_dir).rglob(ext)
56
  )
57
 
58
- status = f"Downloaded {audio_count} audio files from {dataset_id}"
59
  logger.info(status)
60
- return local_dir, status
61
 
62
  except ImportError:
63
  msg = "huggingface_hub is not installed. Run: pip install huggingface_hub"
 
16
 
17
  def download_hf_dataset(
18
  dataset_id: str,
 
19
  hf_token: Optional[str] = None,
20
  ) -> Tuple[str, str]:
21
  """
22
  Download an audio dataset from HuggingFace Hub.
23
 
24
+ Uses snapshot_download without local_dir so HF's built-in cache
25
+ handles storage. On HF Spaces this is co-located with HF storage,
26
+ avoiding unnecessary file duplication.
27
 
28
  Args:
29
  dataset_id: HuggingFace dataset repo ID (e.g. "pedroapfilho/lofi-tracks")
 
30
  hf_token: Optional HuggingFace token for private repos
31
 
32
  Returns:
33
+ Tuple of (cached_dir, status_message)
34
  """
35
  try:
36
  from huggingface_hub import snapshot_download
37
 
38
+ logger.info(f"Fetching dataset '{dataset_id}' via HF cache...")
 
39
 
40
+ cached_dir = snapshot_download(
 
 
41
  repo_id=dataset_id,
42
  repo_type="dataset",
 
43
  token=hf_token or None,
44
  allow_patterns=AUDIO_EXTENSIONS,
45
  )
 
47
  audio_count = sum(
48
  1
49
  for ext in AUDIO_EXTENSIONS
50
+ for _ in Path(cached_dir).rglob(ext)
51
  )
52
 
53
+ status = f"Loaded {audio_count} audio files from {dataset_id}"
54
  logger.info(status)
55
+ return cached_dir, status
56
 
57
  except ImportError:
58
  msg = "huggingface_hub is not installed. Run: pip install huggingface_hub"