Spaces:
Sleeping
Sleeping
| import os | |
| import io | |
| import json | |
| import pandas as pd | |
| from huggingface_hub import HfApi | |
| def _as_parquet_bytes(record: dict) -> bytes: | |
| df = pd.DataFrame([record]) | |
| buf = io.BytesIO() | |
| df.to_parquet(buf, index=False) | |
| return buf.getvalue() | |
def persist_to_hf(
    dataset_repo: str,
    record: dict,
    anon_pdf_bytes: bytes,
    parquet_path: str,
    json_path: str,
    pdf_path: str,
) -> dict:
    """Upload one record (as Parquet and JSON) plus its PDF to a Hub dataset.

    All payloads are serialized first and then pushed in a single commit, so
    a record that cannot be encoded, or an upload that fails, does not leave
    the dataset repo holding only some of the three files.

    Args:
        dataset_repo: Repo id of the target dataset repository.
        record: The record to store. Written as a one-row Parquet file and
            as pretty-printed UTF-8 JSON.
        anon_pdf_bytes: PDF content, uploaded unchanged to ``pdf_path``.
        parquet_path: Destination path of the Parquet file inside the repo.
        json_path: Destination path of the JSON file inside the repo.
        pdf_path: Destination path of the PDF inside the repo.

    Returns:
        ``{"error": "HF_TOKEN not set"}`` when the ``HF_TOKEN`` environment
        variable is missing or empty (nothing is uploaded in that case);
        otherwise a dict with ``status``, ``dataset_repo`` and the list of
        uploaded ``files``.

    Raises:
        Serialization errors and any exception raised by the Hub client are
        not caught here and propagate to the caller.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        return {"error": "HF_TOKEN not set"}

    # Function-scope import: the module header only brings in HfApi.
    from huggingface_hub import CommitOperationAdd

    # Serialize everything before touching the network, so an unencodable
    # record fails before anything has been written to the repo.
    pq_bytes = _as_parquet_bytes(record)
    js_bytes = json.dumps(record, ensure_ascii=False, indent=2).encode("utf-8")

    operations = [
        CommitOperationAdd(path_in_repo=path, path_or_fileobj=payload)
        for path, payload in (
            (parquet_path, pq_bytes),
            (json_path, js_bytes),
            (pdf_path, anon_pdf_bytes),
        )
    ]

    # One commit for all three files: either every file lands or none does.
    HfApi(token=token).create_commit(
        repo_id=dataset_repo,
        repo_type="dataset",
        operations=operations,
        commit_message="Add candidate record (parquet, JSON) and anonymized PDF",
    )

    return {
        "status": "ok",
        "dataset_repo": dataset_repo,
        "files": [parquet_path, json_path, pdf_path],
    }