# upload_htmls_and_index.py
import posixpath
from pathlib import Path
import pandas as pd
from urllib.parse import unquote # add at top
from huggingface_hub import HfApi
REPORTS_ROOT = Path("/data/atlask/Model-Preds-Html/AudioSet-Audio").resolve()
DATASET_REPO = "akazemian/audio-html"
# --- replace ONLY your upload block with this (keep the rest of the file) ---
from huggingface_hub import HfApi

api = HfApi()
REPORTS_ROOT = REPORTS_ROOT.resolve()  # your existing constant

# Upload per model subfolder, but call upload_large_folder on the PARENT
# (older huggingface_hub versions don't support path_in_repo)
model_dirs = sorted(child for child in REPORTS_ROOT.iterdir() if child.is_dir())
for model_dir in model_dirs:
    model = model_dir.name
    print(f"[HF] upload_large_folder: {REPORTS_ROOT} (include {model}/**/*.html) -> {DATASET_REPO}")
    api.upload_large_folder(
        repo_id=DATASET_REPO,
        repo_type="dataset",
        folder_path=str(REPORTS_ROOT),          # parent folder
        allow_patterns=[f"{model}/**/*.html"],  # only this model's files
    )
    print(f"✓ uploaded {model}")
# --- end replacement ---
# (B) Build index.csv from your existing library.csv (no model_name)
# NOTE(review): assumes library.csv is in the current working directory — confirm.
library = pd.read_csv("library.csv")
def ensure_cols(df, cols):
    """Return *df* guaranteed to contain every name in *cols* as a column.

    Columns that are absent are added in place with empty-string values;
    columns already present are left untouched.
    """
    for name in (c for c in cols if c not in df.columns):
        df[name] = ""
    return df
# Columns index.csv construction relies on; fill any that library.csv lacks.
REQUIRED_LIBRARY_COLS = [
    "id", "filename", "path", "tags", "keywords",
    "notes", "uploaded_at", "category", "dataset",
]
library = ensure_cols(library, REQUIRED_LIBRARY_COLS)
def local_to_relpath(local_path: str, root=None) -> str:
    """Convert an absolute local path into a POSIX relpath for the HF repo.

    Parameters
    ----------
    local_path : str
        Path to a file under *root*. Raises ``ValueError`` (from
        ``Path.relative_to``) if it is not under *root*.
    root : str | Path | None
        Base directory. Defaults to the module-level ``REPORTS_ROOT``
        (backward compatible with the original hard-coded behavior).

    Returns
    -------
    str
        The path relative to *root*, always with forward slashes.
    """
    base = REPORTS_ROOT if root is None else Path(root).resolve()
    rel = Path(local_path).resolve().relative_to(base)
    # as_posix() normalizes separators and, unlike posixpath.join(*rel.parts),
    # does not blow up with an empty argument list when local_path == root.
    return rel.as_posix()
# Only keep rows that actually point to .html files under REPORTS_ROOT
paths = library["path"].astype(str)
# Require the trailing separator: a bare startswith(str(REPORTS_ROOT)) would
# also match sibling directories (e.g. ".../AudioSet-Audio-v2"), and those
# rows would later crash relative_to() inside local_to_relpath.
keep = paths.str.endswith(".html", na=False) & \
       paths.str.startswith(str(REPORTS_ROOT) + "/", na=False)
idx = library[keep].copy()
# Derive relpath inside the HF dataset from the absolute local path
idx["relpath"] = idx["path"].apply(local_to_relpath)
index_cols = ["id","filename","relpath","category","dataset","tags","keywords","notes","uploaded_at"]
index_df = idx[index_cols].copy()
index_df.to_csv("index.csv", index=False)
# (C) Upload index.csv to the dataset repo (small, separate commit)
from huggingface_hub import CommitOperationAdd

# Push the freshly written index.csv as its own small commit.
index_operation = CommitOperationAdd(path_in_repo="index.csv", path_or_fileobj="index.csv")
api.create_commit(
    repo_id=DATASET_REPO,
    repo_type="dataset",
    operations=[index_operation],
    commit_message=f"Add/update index.csv ({len(index_df)} rows)",
)
print("Done: uploaded HTMLs (large-folder) and index.csv")