# upload_htmls_and_index.py
import posixpath
from pathlib import Path
import pandas as pd
from urllib.parse import unquote # add at top
from huggingface_hub import HfApi
REPORTS_ROOT = Path("/data/atlask/Model-Preds-Html/AudioSet-Audio").resolve()
DATASET_REPO = "akazemian/audio-html"
# --- replace ONLY your upload block with this (keep the rest of the file) ---
from huggingface_hub import HfApi

api = HfApi()
REPORTS_ROOT = REPORTS_ROOT.resolve()  # your existing constant

# Upload per model subfolder, but call upload_large_folder on the PARENT
# (older huggingface_hub versions don't support path_in_repo)
model_dirs = sorted(child for child in REPORTS_ROOT.iterdir() if child.is_dir())
for model_dir in model_dirs:
    model = model_dir.name
    print(f"[HF] upload_large_folder: {REPORTS_ROOT} (include {model}/**/*.html) -> {DATASET_REPO}")
    api.upload_large_folder(
        repo_id=DATASET_REPO,
        repo_type="dataset",
        folder_path=str(REPORTS_ROOT),          # parent folder
        allow_patterns=[f"{model}/**/*.html"],  # only this model's files
    )
    print(f"✓ uploaded {model}")
# --- end replacement ---
# (B) Build index.csv from your existing library.csv (no model_name)
# NOTE(review): assumes library.csv is in the current working directory — confirm.
library = pd.read_csv("library.csv")
def ensure_cols(df, cols):
    """Return *df* guaranteed to contain every name in *cols* as a column.

    Columns that are absent are added in place with empty-string values;
    columns already present are left untouched.
    """
    for name in (c for c in cols if c not in df.columns):
        df[name] = ""
    return df
# Columns index.csv construction relies on; fill any that library.csv lacks.
REQUIRED_LIBRARY_COLS = [
    "id", "filename", "path", "tags", "keywords",
    "notes", "uploaded_at", "category", "dataset",
]
library = ensure_cols(library, REQUIRED_LIBRARY_COLS)
def local_to_relpath(local_path: str, root=None) -> str:
    """Convert an absolute local path into a POSIX relpath for the HF repo.

    Parameters
    ----------
    local_path : str
        Path to a file under *root*. Raises ``ValueError`` (from
        ``Path.relative_to``) if it is not under *root*.
    root : str | Path | None
        Base directory. Defaults to the module-level ``REPORTS_ROOT``
        (backward compatible with the original hard-coded behavior).

    Returns
    -------
    str
        The path relative to *root*, always with forward slashes.
    """
    base = REPORTS_ROOT if root is None else Path(root).resolve()
    rel = Path(local_path).resolve().relative_to(base)
    # as_posix() normalizes separators and, unlike posixpath.join(*rel.parts),
    # does not blow up with an empty argument list when local_path == root.
    return rel.as_posix()
# Only keep rows that actually point to .html files under REPORTS_ROOT
paths = library["path"].astype(str)
# Require the trailing separator: a bare startswith(str(REPORTS_ROOT)) would
# also match sibling directories (e.g. ".../AudioSet-Audio-v2"), and those
# rows would later crash relative_to() inside local_to_relpath.
keep = paths.str.endswith(".html", na=False) & \
       paths.str.startswith(str(REPORTS_ROOT) + "/", na=False)
idx = library[keep].copy()
# Derive relpath inside the HF dataset from the absolute local path
idx["relpath"] = idx["path"].apply(local_to_relpath)
index_cols = ["id","filename","relpath","category","dataset","tags","keywords","notes","uploaded_at"]
index_df = idx[index_cols].copy()
index_df.to_csv("index.csv", index=False)
# (C) Upload index.csv to the dataset repo (small, separate commit)
from huggingface_hub import CommitOperationAdd

# Push the freshly written index.csv as its own small commit.
index_operation = CommitOperationAdd(path_in_repo="index.csv", path_or_fileobj="index.csv")
api.create_commit(
    repo_id=DATASET_REPO,
    repo_type="dataset",
    operations=[index_operation],
    commit_message=f"Add/update index.csv ({len(index_df)} rows)",
)
print("Done: uploaded HTMLs (large-folder) and index.csv")