# upload_htmls_and_index.py
"""Upload per-model HTML report folders to a Hugging Face dataset repo,
then build and upload an ``index.csv`` derived from the local ``library.csv``.

Workflow:
  1. For each model subfolder under ``REPORTS_ROOT``, call
     ``upload_large_folder`` on the PARENT folder with an ``allow_patterns``
     filter (older huggingface_hub versions don't support ``path_in_repo``).
  2. Filter ``library.csv`` to rows whose ``path`` is an ``.html`` file under
     ``REPORTS_ROOT``, derive the repo-relative POSIX path, and write
     ``index.csv``.
  3. Push ``index.csv`` to the dataset repo as a small separate commit.
"""

import posixpath
from pathlib import Path

import pandas as pd

# Local root holding one subfolder per model, each full of HTML reports.
REPORTS_ROOT = Path("/data/atlask/Model-Preds-Html/AudioSet-Audio").resolve()
# Target Hugging Face dataset repository.
DATASET_REPO = "akazemian/audio-html"

# Columns guaranteed to exist in library.csv before filtering.
LIBRARY_COLUMNS = [
    "id", "filename", "path", "tags", "keywords",
    "notes", "uploaded_at", "category", "dataset",
]
# Column order written to index.csv (no model_name column).
INDEX_COLUMNS = [
    "id", "filename", "relpath", "category", "dataset",
    "tags", "keywords", "notes", "uploaded_at",
]


def ensure_cols(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Ensure every name in *cols* exists as a column of *df*.

    Missing columns are added in place with empty-string values; *df* is
    returned for call-chaining (same object, mutated).
    """
    for c in cols:
        if c not in df.columns:
            df[c] = ""
    return df


def local_to_relpath(local_path: str, root: Path = REPORTS_ROOT) -> str:
    """Return *local_path* relative to *root*, normalized to POSIX form.

    The HF repo layout mirrors the folder structure under ``REPORTS_ROOT``,
    so repo paths must use forward slashes regardless of the local OS.

    Raises:
        ValueError: if *local_path* does not live under *root*.
    """
    rel = Path(local_path).resolve().relative_to(root)
    return posixpath.join(*rel.parts)


def build_index(library: pd.DataFrame) -> pd.DataFrame:
    """Build the index DataFrame from a raw ``library.csv`` DataFrame.

    Keeps only rows pointing at ``.html`` files under ``REPORTS_ROOT``,
    adds a ``relpath`` column (path inside the HF dataset), and returns a
    copy restricted to ``INDEX_COLUMNS``.
    """
    library = ensure_cols(library, LIBRARY_COLUMNS)
    paths = library["path"].astype(str)
    keep = paths.str.endswith(".html") & paths.str.startswith(str(REPORTS_ROOT))
    idx = library[keep].copy()
    idx["relpath"] = idx["path"].apply(local_to_relpath)
    return idx[INDEX_COLUMNS].copy()


def upload_model_folders(api) -> None:
    """Upload each model subfolder's HTML files via ``upload_large_folder``.

    The call targets the parent ``REPORTS_ROOT`` with a per-model
    ``allow_patterns`` filter because older huggingface_hub releases lack a
    ``path_in_repo`` parameter on ``upload_large_folder``.
    """
    for sub in sorted(p for p in REPORTS_ROOT.iterdir() if p.is_dir()):
        model = sub.name
        print(f"[HF] upload_large_folder: {REPORTS_ROOT} (include {model}/**/*.html) -> {DATASET_REPO}")
        api.upload_large_folder(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            folder_path=str(REPORTS_ROOT),          # parent folder
            allow_patterns=[f"{model}/**/*.html"],  # only this model's files
        )
        print(f"✓ uploaded {model}")


def main() -> None:
    """Run the full upload-and-index pipeline."""
    # Imported here so the pure helpers above stay importable even when
    # huggingface_hub is not installed (e.g. for unit tests).
    from huggingface_hub import CommitOperationAdd, HfApi

    api = HfApi()

    # (A) Upload the per-model HTML trees.
    upload_model_folders(api)

    # (B) Build index.csv from the existing library.csv (no model_name).
    library = pd.read_csv("library.csv")
    index_df = build_index(library)
    index_df.to_csv("index.csv", index=False)

    # (C) Upload index.csv to the dataset repo (small, separate commit).
    api.create_commit(
        repo_id=DATASET_REPO,
        repo_type="dataset",
        operations=[CommitOperationAdd(path_in_repo="index.csv", path_or_fileobj="index.csv")],
        commit_message=f"Add/update index.csv ({len(index_df)} rows)",
    )
    print("Done: uploaded HTMLs (large-folder) and index.csv")


if __name__ == "__main__":
    main()