# NOTE: removed scraped Hugging Face Space page residue ("Spaces: Sleeping") that
# preceded the script; the actual source file begins below.
# upload_htmls_and_index.py
"""Upload per-model HTML reports to a Hugging Face dataset repo and build index.csv."""

import posixpath
from pathlib import Path
from urllib.parse import unquote  # currently unused; kept in case other parts of the file rely on it

import pandas as pd
from huggingface_hub import HfApi  # was imported twice; deduplicated

# Local root holding one subfolder per model, each containing HTML report files.
# resolve() once here — the later re-resolve was redundant and has been removed.
REPORTS_ROOT = Path("/data/atlask/Model-Preds-Html/AudioSet-Audio").resolve()
# Target Hugging Face dataset repository for both the HTML files and index.csv.
DATASET_REPO = "akazemian/audio-html"

api = HfApi()
# Upload each model's HTML reports one subfolder at a time. The call targets the
# PARENT directory because older huggingface_hub releases don't support
# path_in_repo; allow_patterns restricts every call to a single model's files.
model_dirs = sorted(p for p in REPORTS_ROOT.iterdir() if p.is_dir())
for model_dir in model_dirs:
    model = model_dir.name
    print(f"[HF] upload_large_folder: {REPORTS_ROOT} (include {model}/**/*.html) -> {DATASET_REPO}")
    api.upload_large_folder(
        repo_id=DATASET_REPO,
        repo_type="dataset",
        folder_path=str(REPORTS_ROOT),          # parent folder, not the subfolder
        allow_patterns=[f"{model}/**/*.html"],  # scope to this model only
    )
    print(f"✓ uploaded {model}")
# (B) Build index.csv from your existing library.csv (no model_name)
# library.csv is expected in the current working directory; its "path" column
# should hold absolute local paths (filtered further below).
library = pd.read_csv("library.csv")
def ensure_cols(df, cols):
    """Guarantee every name in *cols* exists as a column of *df*.

    Columns that are absent are created in place with empty-string values;
    the same (mutated) frame is returned for chaining.
    """
    missing = [name for name in cols if name not in df.columns]
    for name in missing:
        df[name] = ""
    return df
# Make sure every column the index needs is present, even if library.csv
# was written by an older version of the tooling.
library = ensure_cols(library, [
    "id", "filename", "path", "tags", "keywords",
    "notes", "uploaded_at", "category", "dataset",
])
def local_to_relpath(local_path: str, root=None) -> str:
    """Return *local_path* relative to *root*, normalized to POSIX separators.

    HF repo paths are always forward-slash separated, so the relative path is
    rendered with ``Path.as_posix()`` regardless of the host OS.

    Args:
        local_path: absolute (or resolvable) local file path.
        root: base directory; defaults to the module-level REPORTS_ROOT.

    Returns:
        The POSIX-style relative path ("." when local_path equals root).

    Raises:
        ValueError: if *local_path* does not live under *root*.
    """
    base = REPORTS_ROOT if root is None else Path(root).resolve()
    rel = Path(local_path).resolve().relative_to(base)
    # as_posix() handles the zero-component case (returns "."), which the old
    # posixpath.join(*rel.parts) crashed on with TypeError.
    return rel.as_posix()
# Restrict the index to rows whose local path is an .html file located under
# REPORTS_ROOT (convert the column to str once instead of twice).
path_str = library["path"].astype(str)
is_html = path_str.str.endswith(".html", na=False)
under_root = path_str.str.startswith(str(REPORTS_ROOT), na=False)
idx = library[is_html & under_root].copy()

# Translate each absolute local path into its repo-relative POSIX path.
idx["relpath"] = idx["path"].apply(local_to_relpath)

index_cols = ["id", "filename", "relpath", "category", "dataset",
              "tags", "keywords", "notes", "uploaded_at"]
index_df = idx[index_cols].copy()
index_df.to_csv("index.csv", index=False)
# (C) Upload index.csv to the dataset repo as its own small commit.
from huggingface_hub import CommitOperationAdd

index_operation = CommitOperationAdd(
    path_in_repo="index.csv",
    path_or_fileobj="index.csv",
)
api.create_commit(
    repo_id=DATASET_REPO,
    repo_type="dataset",
    operations=[index_operation],
    commit_message=f"Add/update index.csv ({len(index_df)} rows)",
)
print("Done: uploaded HTMLs (large-folder) and index.csv")