"""hub.py — Hugging Face Hub helpers for the gen-synth-data project.

Pushes/pulls seed data, READMEs, pipeline code, and Argilla feedback
datasets to and from the Hub.

Original author: Ben Burtenshaw (commit dfd3683).
"""
import json
import os
from tempfile import NamedTemporaryFile, mktemp

import argilla as rg
from huggingface_hub import HfApi

from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH
hf_api = HfApi()
with open("DATASET_README_BASE.md") as f:
DATASET_README_BASE = f.read()
def create_readme(domain_seed_data, project_name, domain):
    """Render a dataset README and return the path of the temp file holding it.

    Args:
        domain_seed_data: dict that may contain ``"perspectives"`` (list of
            str), ``"topics"`` (list of str), and ``"examples"`` (list of
            dicts with ``"question"``/``"answer"`` keys).
        project_name: project name, rendered as the README title.
        domain: domain label, rendered under the title.

    Returns:
        Path to a temporary Markdown file containing the rendered README.
        The caller is responsible for uploading / deleting it.
    """
    # Accumulate fragments and join once instead of repeated string +=.
    parts = [DATASET_README_BASE, f"# {project_name}\n\n## Domain: {domain}"]

    perspectives = domain_seed_data.get("perspectives")
    topics = domain_seed_data.get("topics")
    examples = domain_seed_data.get("examples")
    if perspectives:
        parts.append("\n\n## Perspectives\n\n")
        parts.extend(f"- {p}\n" for p in perspectives)
    if topics:
        parts.append("\n\n## Topics\n\n")
        parts.extend(f"- {t}\n" for t in topics)
    if examples:
        parts.append("\n\n## Examples\n\n")
        parts.extend(
            f"### {example['question']}\n\n{example['answer']}\n\n"
            for example in examples
        )

    # NamedTemporaryFile(delete=False) replaces the deprecated, race-prone
    # mktemp(): the file is created atomically and its name returned.
    with NamedTemporaryFile("w", suffix=".md", delete=False, encoding="utf-8") as f:
        f.write("".join(parts))
        return f.name
def setup_dataset_on_hub(repo_id, hub_token):
    """Ensure an (initially empty) dataset repo exists on the Hub.

    Args:
        repo_id: full ``namespace/name`` identifier of the dataset repo.
        hub_token: Hub API token with write access to the namespace.
    """
    # exist_ok makes the call idempotent: re-running against an existing
    # repo is a no-op rather than an error.
    hf_api.create_repo(
        repo_type="dataset",
        repo_id=repo_id,
        exist_ok=True,
        token=hub_token,
    )
def push_dataset_to_hub(
    domain_seed_data_path,
    project_name,
    domain,
    pipeline_path,
    hub_username,
    hub_token: str,
):
    """Create the dataset repo (if needed) and upload seed data plus README.

    Args:
        domain_seed_data_path: local path to the seed-data JSON file.
        project_name: project name; combined with ``hub_username`` to form
            the repo id.
        domain: domain label forwarded to the README generator.
        pipeline_path: unused here; kept for signature compatibility.
        hub_username: Hub namespace that owns the repo.
        hub_token: Hub API token used for repo creation and uploads.
    """
    repo_id = f"{hub_username}/{project_name}"
    setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)
    # upload the raw seed data file
    hf_api.upload_file(
        path_or_fileobj=domain_seed_data_path,
        path_in_repo="seed_data.json",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )
    # Parse the seed data with a context manager so the file handle is
    # closed deterministically (the original bare open() leaked it).
    with open(domain_seed_data_path) as f:
        domain_seed_data = json.load(f)
    # render and upload the README
    hf_api.upload_file(
        path_or_fileobj=create_readme(
            domain_seed_data=domain_seed_data, project_name=project_name, domain=domain
        ),
        path_in_repo="README.md",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )
def push_pipeline_to_hub(
    pipeline_path,
    hub_username,
    hub_token: str,
    project_name,
):
    """Upload the pipeline script and its remote-code modules to the repo.

    Args:
        pipeline_path: local path to the pipeline script; stored as
            ``pipeline.py`` in the repo.
        hub_username: Hub namespace that owns the repo.
        hub_token: Hub API token used for the uploads.
        project_name: project name; combined with ``hub_username`` to form
            the repo id.
    """
    repo_id = f"{hub_username}/{project_name}"
    # The pipeline script is renamed to pipeline.py; every remote-code
    # module keeps its original path inside the repo.
    uploads = [(pipeline_path, "pipeline.py")]
    uploads.extend((code_path, code_path) for code_path in REMOTE_CODE_PATHS)
    for local_path, repo_path in uploads:
        hf_api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=repo_path,
            token=hub_token,
            repo_id=repo_id,
            repo_type="dataset",
        )
    print(f"Dataset uploaded to {repo_id}")
def pull_seed_data_from_repo(repo_id, hub_token):
    """Download the seed-data file from a Hub dataset repo and parse it.

    Args:
        repo_id: full ``namespace/name`` identifier of the dataset repo.
        hub_token: Hub API token used for the download.

    Returns:
        The parsed JSON seed data.
    """
    # hf_hub_download stores the file in the local HF cache and returns
    # that path; the original ignored the return value and opened
    # SEED_DATA_PATH relative to the CWD, which breaks whenever the cache
    # is not the working directory.
    local_path = hf_api.hf_hub_download(
        repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
    )
    with open(local_path) as f:
        return json.load(f)
def push_argilla_dataset_to_hub(
    name: str, repo_id: str, url: str, api_key: str, workspace: str = "admin"
):
    """Pull a FeedbackDataset from an Argilla server and publish it on the Hub.

    Args:
        name: name of the dataset inside the Argilla workspace.
        repo_id: target ``namespace/name`` repo on the Hub.
        url: base URL of the Argilla server.
        api_key: Argilla API key.
        workspace: Argilla workspace holding the dataset (default ``admin``).
    """
    rg.init(api_url=url, api_key=api_key)
    remote_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
    # pull() materializes the records locally before pushing them to the Hub
    remote_dataset.pull().push_to_huggingface(repo_id=repo_id)
def push_pipeline_params(
    pipeline_params,
    hub_username,
    hub_token: str,
    project_name,
):
    """Serialize pipeline parameters to JSON and upload them to the repo.

    Args:
        pipeline_params: JSON-serializable parameters object.
        hub_username: Hub namespace that owns the repo.
        hub_token: Hub API token used for the upload.
        project_name: project name; combined with ``hub_username`` to form
            the repo id.
    """
    repo_id = f"{hub_username}/{project_name}"
    # NamedTemporaryFile(delete=False) replaces the deprecated, race-prone
    # mktemp(); the file is removed once the upload finishes (the original
    # left it behind).
    with NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(pipeline_params, f)
        temp_path = f.name
    try:
        hf_api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo="pipeline_params.json",
            token=hub_token,
            repo_id=repo_id,
            repo_type="dataset",
        )
    finally:
        os.remove(temp_path)
    print(f"Pipeline params uploaded to {repo_id}")