Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- library.csv +2 -2
- sync_library_and_hf.py +55 -9
library.csv
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:94dec2938a2b0d919b028ff23391e8b17edd9158e986776821404c3735b7d013
|
| 3 |
+
size 1891411
|
sync_library_and_hf.py
CHANGED
|
@@ -1,11 +1,20 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
# sync_library_and_hf.py
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import argparse, datetime, uuid, posixpath, sys, traceback, os, hashlib
|
| 5 |
from pathlib import Path
|
| 6 |
from typing import List, Tuple, Set
|
| 7 |
from urllib.parse import unquote
|
| 8 |
-
|
| 9 |
import pandas as pd
|
| 10 |
from huggingface_hub import (
|
| 11 |
HfApi,
|
|
@@ -43,15 +52,55 @@ def key_from_manifest_filename(fname: str) -> str:
|
|
| 43 |
base = Path(fname).name
|
| 44 |
return _strip_ext(base, AUDIO_EXTS)
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
def load_manifest_map(csv_path: Path) -> dict[str, tuple[str, str]]:
|
| 47 |
"""
|
| 48 |
Returns {basename_key: (dataset, category)} from the manifest.
|
| 49 |
Manifest must have columns: file_name, dataset, category
|
| 50 |
"""
|
| 51 |
-
if not csv_path.exists():
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
dfm = pd.read_csv(csv_path)
|
|
|
|
| 55 |
dfm = dfm.rename(columns={'audio_category':'category'})
|
| 56 |
required = {"file_name", "dataset", "category"}
|
| 57 |
missing = required - set(dfm.columns)
|
|
@@ -68,7 +117,6 @@ def load_manifest_map(csv_path: Path) -> dict[str, tuple[str, str]]:
|
|
| 68 |
print(f"[manifest] loaded {len(m)} keys from {csv_path}")
|
| 69 |
return m
|
| 70 |
|
| 71 |
-
|
| 72 |
def now_iso() -> str:
|
| 73 |
return datetime.datetime.now().isoformat(timespec="seconds")
|
| 74 |
|
|
@@ -208,8 +256,6 @@ def backfill_hf_paths_by_filename(df_db: pd.DataFrame, hf_repo: str, idx: pd.Dat
|
|
| 208 |
updated += 1
|
| 209 |
return updated
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
| 213 |
def append_to_remote_index(remote_index: pd.DataFrame, new_rows: List[dict]) -> pd.DataFrame:
|
| 214 |
if not new_rows:
|
| 215 |
return remote_index
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
# sync_library_and_hf.py
|
| 3 |
+
'''
|
| 4 |
+
RUN BELOW FOR NEW HTML FILES TO UPDATE OLD ONES ON DATASET REPO
|
| 5 |
+
python sync_library_and_hf.py
|
| 6 |
+
--db-path library.csv
|
| 7 |
+
--repo-id akazemian/audio-html
|
| 8 |
+
--model-name wavcoch_audio-preds-sr=16000
|
| 9 |
+
--index-filename index.csv
|
| 10 |
+
--wipe-remote
|
| 11 |
+
--wipe-local
|
| 12 |
+
'''
|
| 13 |
import argparse, datetime, uuid, posixpath, sys, traceback, os, hashlib
|
| 14 |
from pathlib import Path
|
| 15 |
from typing import List, Tuple, Set
|
| 16 |
from urllib.parse import unquote
|
| 17 |
+
import os
|
| 18 |
import pandas as pd
|
| 19 |
from huggingface_hub import (
|
| 20 |
HfApi,
|
|
|
|
| 52 |
base = Path(fname).name
|
| 53 |
return _strip_ext(base, AUDIO_EXTS)
|
| 54 |
|
| 55 |
+
def create_file_specific_manifest(
    csv_path: Path,
    audio_dir: str = "/data/atlask/BAU-Quant/val",
) -> pd.DataFrame:
    """Build a per-file manifest by joining on-disk audio files with a CSV manifest.

    Lists the audio files in *audio_dir*, derives a join key from each file
    name (extension stripped, then any trailing ``_chunk...`` suffix stripped),
    and left-merges metadata from the manifest at *csv_path* on that key.

    Args:
        csv_path: Path to the manifest CSV. Must contain at least the columns
            ``file_name``, ``sr``, ``dataset``, ``audio_category``, ``split``
            and ``duration_s``.
        audio_dir: Directory whose audio files are enumerated. Defaults to the
            previously hard-coded location for backward compatibility.

    Returns:
        DataFrame with columns ``sr``, ``file_name``, ``file_path``,
        ``dataset``, ``audio_category``, ``split`` — one row per audio file
        found in *audio_dir*. Manifest columns are NaN for files with no
        matching manifest key.
    """
    manifest = pd.read_csv(csv_path)

    # 1) Build a dataframe of the audio files present on disk.
    files = pd.DataFrame({"file_name": os.listdir(audio_dir)})
    # Keep only audio files (case-insensitive extension match).
    audio_suffixes = (".wav", ".mp3", ".flac", ".ogg", ".m4a", ".opus")
    files = files[files["file_name"].str.lower().str.endswith(audio_suffixes)].copy()
    files["file_path"] = files["file_name"].apply(lambda f: os.path.join(audio_dir, f))

    def _to_key(col: pd.Series) -> pd.Series:
        # Normalize a file-name column to a join key: drop the extension,
        # then strip a `_chunk...` suffix if present.
        return (
            col
            .str.replace(r"\.[^.]+$", "", regex=True)   # remove extension
            .str.replace(r"_chunk.*$", "", regex=True)  # remove _chunk suffix
        )

    files["key"] = _to_key(files["file_name"])

    # 2) Prepare the manifest with the same key. When duplicates share a key,
    #    keep the first occurrence.
    man = manifest.copy()
    man["key"] = _to_key(man["file_name"])
    man = man.drop_duplicates(subset="key", keep="first")

    # 3) Single vectorized left-merge of manifest metadata onto the file list.
    cols_to_take = ["sr", "dataset", "audio_category", "split", "duration_s"]
    out = files.merge(man[["key"] + cols_to_take], on="key", how="left")

    # 4) Final column order expected by downstream consumers.
    return out[["sr", "file_name", "file_path", "dataset", "audio_category", "split"]]
|
| 93 |
+
|
| 94 |
def load_manifest_map(csv_path: Path) -> dict[str, tuple[str, str]]:
|
| 95 |
"""
|
| 96 |
Returns {basename_key: (dataset, category)} from the manifest.
|
| 97 |
Manifest must have columns: file_name, dataset, category
|
| 98 |
"""
|
| 99 |
+
# if not csv_path.exists():
|
| 100 |
+
# print(f"[manifest] WARNING: not found: {csv_path}")
|
| 101 |
+
# return {}
|
| 102 |
+
# dfm = pd.read_csv(csv_path)
|
| 103 |
+
dfm = create_file_specific_manifest(csv_path)
|
| 104 |
dfm = dfm.rename(columns={'audio_category':'category'})
|
| 105 |
required = {"file_name", "dataset", "category"}
|
| 106 |
missing = required - set(dfm.columns)
|
|
|
|
| 117 |
print(f"[manifest] loaded {len(m)} keys from {csv_path}")
|
| 118 |
return m
|
| 119 |
|
|
|
|
| 120 |
def now_iso() -> str:
    """Current local timestamp as an ISO-8601 string, truncated to whole seconds."""
    current = datetime.datetime.now()
    return current.isoformat(timespec="seconds")
|
| 122 |
|
|
|
|
| 256 |
updated += 1
|
| 257 |
return updated
|
| 258 |
|
|
|
|
|
|
|
| 259 |
def append_to_remote_index(remote_index: pd.DataFrame, new_rows: List[dict]) -> pd.DataFrame:
|
| 260 |
if not new_rows:
|
| 261 |
return remote_index
|