akazemian commited on
Commit
e58f9a0
·
verified ·
1 Parent(s): 969d75c

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. library.csv +2 -2
  2. sync_library_and_hf.py +55 -9
library.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ca91ae491fda82472e0496faf7e3a38a28753ebdedea38992459e18caf00221
3
- size 27430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94dec2938a2b0d919b028ff23391e8b17edd9158e986776821404c3735b7d013
3
+ size 1891411
sync_library_and_hf.py CHANGED
@@ -1,11 +1,20 @@
1
  #!/usr/bin/env python3
2
  # sync_library_and_hf.py
3
-
 
 
 
 
 
 
 
 
 
4
  import argparse, datetime, uuid, posixpath, sys, traceback, os, hashlib
5
  from pathlib import Path
6
  from typing import List, Tuple, Set
7
  from urllib.parse import unquote
8
-
9
  import pandas as pd
10
  from huggingface_hub import (
11
  HfApi,
@@ -43,15 +52,55 @@ def key_from_manifest_filename(fname: str) -> str:
43
  base = Path(fname).name
44
  return _strip_ext(base, AUDIO_EXTS)
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def load_manifest_map(csv_path: Path) -> dict[str, tuple[str, str]]:
47
  """
48
  Returns {basename_key: (dataset, category)} from the manifest.
49
  Manifest must have columns: file_name, dataset, category
50
  """
51
- if not csv_path.exists():
52
- print(f"[manifest] WARNING: not found: {csv_path}")
53
- return {}
54
- dfm = pd.read_csv(csv_path)
 
55
  dfm = dfm.rename(columns={'audio_category':'category'})
56
  required = {"file_name", "dataset", "category"}
57
  missing = required - set(dfm.columns)
@@ -68,7 +117,6 @@ def load_manifest_map(csv_path: Path) -> dict[str, tuple[str, str]]:
68
  print(f"[manifest] loaded {len(m)} keys from {csv_path}")
69
  return m
70
 
71
-
72
def now_iso() -> str:
    """Current local time as an ISO-8601 string, truncated to whole seconds."""
    stamp = datetime.datetime.now()
    return stamp.isoformat(timespec="seconds")
74
 
@@ -208,8 +256,6 @@ def backfill_hf_paths_by_filename(df_db: pd.DataFrame, hf_repo: str, idx: pd.Dat
208
  updated += 1
209
  return updated
210
 
211
-
212
-
213
  def append_to_remote_index(remote_index: pd.DataFrame, new_rows: List[dict]) -> pd.DataFrame:
214
  if not new_rows:
215
  return remote_index
 
1
  #!/usr/bin/env python3
2
  # sync_library_and_hf.py
3
+ '''
4
+ RUN BELOW FOR NEW HTML FILES TO UPDATE OLD ONES ON DATASET REPO
5
+ python sync_library_and_hf.py
6
+ --db-path library.csv
7
+ --repo-id akazemian/audio-html
8
+ --model-name wavcoch_audio-preds-sr=16000
9
+ --index-filename index.csv
10
+ --wipe-remote
11
+ --wipe-local
12
+ '''
13
  import argparse, datetime, uuid, posixpath, sys, traceback, os, hashlib
14
  from pathlib import Path
15
  from typing import List, Tuple, Set
16
  from urllib.parse import unquote
17
+ import os
18
  import pandas as pd
19
  from huggingface_hub import (
20
  HfApi,
 
52
  base = Path(fname).name
53
  return _strip_ext(base, AUDIO_EXTS)
54
 
55
+ def create_file_specific_manifest(csv_path: Path) -> pd.DataFrame:
56
+
57
+ audio_dir = "/data/atlask/BAU-Quant/val"
58
+ manifest = pd.read_csv(csv_path)
59
+
60
+ # 1) Build a files dataframe
61
+ files = pd.DataFrame({"file_name": os.listdir(audio_dir)})
62
+ # keep only audio files if needed
63
+ files = files[files["file_name"].str.lower().str.endswith((".wav", ".mp3", ".flac", ".ogg", ".m4a", ".opus"))].copy()
64
+ files["file_path"] = files["file_name"].apply(lambda f: os.path.join(audio_dir, f))
65
+
66
+ # Normalize to a join key: drop extension, then strip `_chunk...`
67
+ files["key"] = (
68
+ files["file_name"]
69
+ .str.replace(r"\.[^.]+$", "", regex=True) # remove extension
70
+ .str.replace(r"_chunk.*$", "", regex=True) # remove _chunk suffix if present
71
+ )
72
+
73
+ # 2) Prepare manifest with the same key
74
+ man = manifest.copy()
75
+
76
+ # If manifest['file_name'] includes extensions / chunk suffixes, normalize the same way:
77
+ man["key"] = (
78
+ man["file_name"]
79
+ .str.replace(r"\.[^.]+$", "", regex=True)
80
+ .str.replace(r"_chunk.*$", "", regex=True)
81
+ )
82
+
83
+ # If duplicates exist in manifest for the same key, decide how to resolve:
84
+ # e.g., keep first occurrence
85
+ man = man.drop_duplicates(subset="key", keep="first")
86
+
87
+ # 3) Merge once (vectorized)
88
+ cols_to_take = ["sr", "dataset", "audio_category", "split", "duration_s"]
89
+ out = files.merge(man[["key"] + cols_to_take], on="key", how="left")
90
+
91
+ # 4) Final column order
92
+ return out[["sr", "file_name", "file_path", "dataset", "audio_category", "split"]]
93
+
94
  def load_manifest_map(csv_path: Path) -> dict[str, tuple[str, str]]:
95
  """
96
  Returns {basename_key: (dataset, category)} from the manifest.
97
  Manifest must have columns: file_name, dataset, category
98
  """
99
+ # if not csv_path.exists():
100
+ # print(f"[manifest] WARNING: not found: {csv_path}")
101
+ # return {}
102
+ # dfm = pd.read_csv(csv_path)
103
+ dfm = create_file_specific_manifest(csv_path)
104
  dfm = dfm.rename(columns={'audio_category':'category'})
105
  required = {"file_name", "dataset", "category"}
106
  missing = required - set(dfm.columns)
 
117
  print(f"[manifest] loaded {len(m)} keys from {csv_path}")
118
  return m
119
 
 
120
  def now_iso() -> str:
121
  return datetime.datetime.now().isoformat(timespec="seconds")
122
 
 
256
  updated += 1
257
  return updated
258
 
 
 
259
  def append_to_remote_index(remote_index: pd.DataFrame, new_rows: List[dict]) -> pd.DataFrame:
260
  if not new_rows:
261
  return remote_index