#!/usr/bin/env python
"""
This script clones the `datasets` repository into the current directory and parses all currently
available metadata, from the `README.md` YAML headers and the automatically generated JSON files.
It dumps the results into a `metadata_{current-commit-of-datasets}.json` file.
"""
import json
from pathlib import Path
from subprocess import check_call, check_output
from typing import Dict, Optional, Tuple

import yaml
from apputils import new_state

DATASETS_BRANCH = "tasks-alignment-with-models"


def metadata_from_readme(f: Path) -> Optional[Dict]:
    """Parse the YAML header of a dataset README, or return None if there is none."""
    with f.open() as fi:
        content = [line.rstrip() for line in fi]

    # A metadata header is a YAML block delimited by a pair of `---` lines at the top of the file.
    if content and content[0] == "---" and "---" in content[1:]:
        yamlblock = "\n".join(content[1 : content[1:].index("---") + 1])
        return yaml.safe_load(yamlblock) or dict()
    return None


def load_ds_datas() -> Tuple[str, Dict]:
    drepo = Path("datasets")
    # Reuse an existing checkout if there is one, otherwise clone the target branch.
    if drepo.exists() and drepo.is_dir():
        check_call(["git", "pull"], cwd=drepo)
    else:
        check_call(["git", "clone", "-b", DATASETS_BRANCH, "https://github.com/huggingface/datasets.git"])

    head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=drepo)

    datasets_md = dict()

    for ddir in sorted((drepo / "datasets").iterdir(), key=lambda d: d.name):
        # A missing or unparseable README yields a fresh, empty metadata state.
        try:
            metadata = metadata_from_readme(ddir / "README.md")
        except (OSError, yaml.YAMLError):
            metadata = None
        if metadata is None or len(metadata) == 0:
            metadata = new_state()

        # Not every dataset ships a `dataset_infos.json`; record None in that case.
        try:
            with (ddir / "dataset_infos.json").open() as fi:
                infos = json.load(fi)
        except (OSError, json.JSONDecodeError):
            infos = None

        datasets_md[ddir.name] = dict(metadata=metadata, infos=infos)

    return head_sha.decode().strip(), datasets_md


if __name__ == "__main__":
    head_sha, datas = load_ds_datas()
    fn = f"metadata_{head_sha}.json"
    print(f"writing to '{fn}'")
    with open(fn, "w") as fi:
        json.dump(datas, fi)
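
# For reference, a rough sketch of the structure written to the output file. The
# dataset name and the metadata/infos contents below are illustrative assumptions;
# only the top-level {name: {"metadata": ..., "infos": ...}} shape is fixed by the
# code above:
#
#     {
#         "some_dataset": {
#             "metadata": {"languages": ["en"], "task_categories": ["..."]},
#             "infos": {"default": {"description": "...", "features": {...}}}
#         }
#     }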