akazemian commited on
Commit
e58f9a0
·
verified ·
1 Parent(s): 969d75c

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. library.csv +2 -2
  2. sync_library_and_hf.py +55 -9
library.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ca91ae491fda82472e0496faf7e3a38a28753ebdedea38992459e18caf00221
3
- size 27430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94dec2938a2b0d919b028ff23391e8b17edd9158e986776821404c3735b7d013
3
+ size 1891411
sync_library_and_hf.py CHANGED
@@ -1,11 +1,20 @@
1
  #!/usr/bin/env python3
2
  # sync_library_and_hf.py
3
-
 
 
 
 
 
 
 
 
 
4
  import argparse, datetime, uuid, posixpath, sys, traceback, os, hashlib
5
  from pathlib import Path
6
  from typing import List, Tuple, Set
7
  from urllib.parse import unquote
8
-
9
  import pandas as pd
10
  from huggingface_hub import (
11
  HfApi,
@@ -43,15 +52,55 @@ def key_from_manifest_filename(fname: str) -> str:
43
  base = Path(fname).name
44
  return _strip_ext(base, AUDIO_EXTS)
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def load_manifest_map(csv_path: Path) -> dict[str, tuple[str, str]]:
47
  """
48
  Returns {basename_key: (dataset, category)} from the manifest.
49
  Manifest must have columns: file_name, dataset, category
50
  """
51
- if not csv_path.exists():
52
- print(f"[manifest] WARNING: not found: {csv_path}")
53
- return {}
54
- dfm = pd.read_csv(csv_path)
 
55
  dfm = dfm.rename(columns={'audio_category':'category'})
56
  required = {"file_name", "dataset", "category"}
57
  missing = required - set(dfm.columns)
@@ -68,7 +117,6 @@ def load_manifest_map(csv_path: Path) -> dict[str, tuple[str, str]]:
68
  print(f"[manifest] loaded {len(m)} keys from {csv_path}")
69
  return m
70
 
71
-
72
def now_iso() -> str:
    """Current local time as an ISO-8601 string, truncated to whole seconds."""
    stamp = datetime.datetime.now()
    return stamp.isoformat(timespec="seconds")
74
 
@@ -208,8 +256,6 @@ def backfill_hf_paths_by_filename(df_db: pd.DataFrame, hf_repo: str, idx: pd.Dat
208
  updated += 1
209
  return updated
210
 
211
-
212
-
213
  def append_to_remote_index(remote_index: pd.DataFrame, new_rows: List[dict]) -> pd.DataFrame:
214
  if not new_rows:
215
  return remote_index
 
1
  #!/usr/bin/env python3
2
  # sync_library_and_hf.py
3
+ '''
4
+ RUN BELOW FOR NEW HTML FILES TO UPDATE OLD ONES ON DATASET REPO
5
+ python sync_library_and_hf.py
6
+ --db-path library.csv
7
+ --repo-id akazemian/audio-html
8
+ --model-name wavcoch_audio-preds-sr=16000
9
+ --index-filename index.csv
10
+ --wipe-remote
11
+ --wipe-local
12
+ '''
13
  import argparse, datetime, uuid, posixpath, sys, traceback, os, hashlib
14
  from pathlib import Path
15
  from typing import List, Tuple, Set
16
  from urllib.parse import unquote
17
+ import os
18
  import pandas as pd
19
  from huggingface_hub import (
20
  HfApi,
 
52
  base = Path(fname).name
53
  return _strip_ext(base, AUDIO_EXTS)
54
 
55
+ def create_file_specific_manifest(csv_path: Path) -> pd.DataFrame:
56
+
57
+ audio_dir = "/data/atlask/BAU-Quant/val"
58
+ manifest = pd.read_csv(csv_path)
59
+
60
+ # 1) Build a files dataframe
61
+ files = pd.DataFrame({"file_name": os.listdir(audio_dir)})
62
+ # keep only audio files if needed
63
+ files = files[files["file_name"].str.lower().str.endswith((".wav", ".mp3", ".flac", ".ogg", ".m4a", ".opus"))].copy()
64
+ files["file_path"] = files["file_name"].apply(lambda f: os.path.join(audio_dir, f))
65
+
66
+ # Normalize to a join key: drop extension, then strip `_chunk...`
67
+ files["key"] = (
68
+ files["file_name"]
69
+ .str.replace(r"\.[^.]+$", "", regex=True) # remove extension
70
+ .str.replace(r"_chunk.*$", "", regex=True) # remove _chunk suffix if present
71
+ )
72
+
73
+ # 2) Prepare manifest with the same key
74
+ man = manifest.copy()
75
+
76
+ # If manifest['file_name'] includes extensions / chunk suffixes, normalize the same way:
77
+ man["key"] = (
78
+ man["file_name"]
79
+ .str.replace(r"\.[^.]+$", "", regex=True)
80
+ .str.replace(r"_chunk.*$", "", regex=True)
81
+ )
82
+
83
+ # If duplicates exist in manifest for the same key, decide how to resolve:
84
+ # e.g., keep first occurrence
85
+ man = man.drop_duplicates(subset="key", keep="first")
86
+
87
+ # 3) Merge once (vectorized)
88
+ cols_to_take = ["sr", "dataset", "audio_category", "split", "duration_s"]
89
+ out = files.merge(man[["key"] + cols_to_take], on="key", how="left")
90
+
91
+ # 4) Final column order
92
+ return out[["sr", "file_name", "file_path", "dataset", "audio_category", "split"]]
93
+
94
  def load_manifest_map(csv_path: Path) -> dict[str, tuple[str, str]]:
95
  """
96
  Returns {basename_key: (dataset, category)} from the manifest.
97
  Manifest must have columns: file_name, dataset, category
98
  """
99
+ # if not csv_path.exists():
100
+ # print(f"[manifest] WARNING: not found: {csv_path}")
101
+ # return {}
102
+ # dfm = pd.read_csv(csv_path)
103
+ dfm = create_file_specific_manifest(csv_path)
104
  dfm = dfm.rename(columns={'audio_category':'category'})
105
  required = {"file_name", "dataset", "category"}
106
  missing = required - set(dfm.columns)
 
117
  print(f"[manifest] loaded {len(m)} keys from {csv_path}")
118
  return m
119
 
 
120
  def now_iso() -> str:
121
  return datetime.datetime.now().isoformat(timespec="seconds")
122
 
 
256
  updated += 1
257
  return updated
258
 
 
 
259
  def append_to_remote_index(remote_index: pd.DataFrame, new_rows: List[dict]) -> pd.DataFrame:
260
  if not new_rows:
261
  return remote_index