"""hub.py — Hugging Face Hub helpers for the gen-synth-data project.

Pushes/pulls seed data, READMEs, pipeline code, and Argilla feedback
datasets to and from the Hub.

Original author: Ben Burtenshaw (commit dfd3683).
"""
import json
import os
from tempfile import NamedTemporaryFile, mktemp

import argilla as rg
from huggingface_hub import HfApi

from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH
hf_api = HfApi()
with open("DATASET_README_BASE.md") as f:
DATASET_README_BASE = f.read()
def create_readme(domain_seed_data, project_name, domain):
    """Render a dataset README and return the path of the temp file holding it.

    Args:
        domain_seed_data: dict that may contain ``"perspectives"`` (list of
            str), ``"topics"`` (list of str), and ``"examples"`` (list of
            dicts with ``"question"``/``"answer"`` keys).
        project_name: project name, rendered as the README title.
        domain: domain label, rendered under the title.

    Returns:
        Path to a temporary Markdown file containing the rendered README.
        The caller is responsible for uploading / deleting it.
    """
    # Accumulate fragments and join once instead of repeated string +=.
    parts = [DATASET_README_BASE, f"# {project_name}\n\n## Domain: {domain}"]

    perspectives = domain_seed_data.get("perspectives")
    topics = domain_seed_data.get("topics")
    examples = domain_seed_data.get("examples")
    if perspectives:
        parts.append("\n\n## Perspectives\n\n")
        parts.extend(f"- {p}\n" for p in perspectives)
    if topics:
        parts.append("\n\n## Topics\n\n")
        parts.extend(f"- {t}\n" for t in topics)
    if examples:
        parts.append("\n\n## Examples\n\n")
        parts.extend(
            f"### {example['question']}\n\n{example['answer']}\n\n"
            for example in examples
        )

    # NamedTemporaryFile(delete=False) replaces the deprecated, race-prone
    # mktemp(): the file is created atomically and its name returned.
    with NamedTemporaryFile("w", suffix=".md", delete=False, encoding="utf-8") as f:
        f.write("".join(parts))
        return f.name
def setup_dataset_on_hub(repo_id, hub_token):
    """Ensure an (initially empty) dataset repo exists on the Hub.

    Args:
        repo_id: full ``namespace/name`` identifier of the dataset repo.
        hub_token: Hub API token with write access to the namespace.
    """
    # exist_ok makes the call idempotent: re-running against an existing
    # repo is a no-op rather than an error.
    hf_api.create_repo(
        repo_type="dataset",
        repo_id=repo_id,
        exist_ok=True,
        token=hub_token,
    )
def push_dataset_to_hub(
    domain_seed_data_path,
    project_name,
    domain,
    pipeline_path,
    hub_username,
    hub_token: str,
):
    """Create the dataset repo (if needed) and upload seed data plus README.

    Args:
        domain_seed_data_path: local path to the seed-data JSON file.
        project_name: project name; combined with ``hub_username`` to form
            the repo id.
        domain: domain label forwarded to the README generator.
        pipeline_path: unused here; kept for signature compatibility.
        hub_username: Hub namespace that owns the repo.
        hub_token: Hub API token used for repo creation and uploads.
    """
    repo_id = f"{hub_username}/{project_name}"
    setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)
    # upload the raw seed data file
    hf_api.upload_file(
        path_or_fileobj=domain_seed_data_path,
        path_in_repo="seed_data.json",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )
    # Parse the seed data with a context manager so the file handle is
    # closed deterministically (the original bare open() leaked it).
    with open(domain_seed_data_path) as f:
        domain_seed_data = json.load(f)
    # render and upload the README
    hf_api.upload_file(
        path_or_fileobj=create_readme(
            domain_seed_data=domain_seed_data, project_name=project_name, domain=domain
        ),
        path_in_repo="README.md",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )
def push_pipeline_to_hub(
    pipeline_path,
    hub_username,
    hub_token: str,
    project_name,
):
    """Upload the pipeline script and its remote-code modules to the repo.

    Args:
        pipeline_path: local path to the pipeline script; stored as
            ``pipeline.py`` in the repo.
        hub_username: Hub namespace that owns the repo.
        hub_token: Hub API token used for the uploads.
        project_name: project name; combined with ``hub_username`` to form
            the repo id.
    """
    repo_id = f"{hub_username}/{project_name}"
    # The pipeline script is renamed to pipeline.py; every remote-code
    # module keeps its original path inside the repo.
    uploads = [(pipeline_path, "pipeline.py")]
    uploads.extend((code_path, code_path) for code_path in REMOTE_CODE_PATHS)
    for local_path, repo_path in uploads:
        hf_api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=repo_path,
            token=hub_token,
            repo_id=repo_id,
            repo_type="dataset",
        )
    print(f"Dataset uploaded to {repo_id}")
def pull_seed_data_from_repo(repo_id, hub_token):
    """Download the seed-data file from a Hub dataset repo and parse it.

    Args:
        repo_id: full ``namespace/name`` identifier of the dataset repo.
        hub_token: Hub API token used for the download.

    Returns:
        The parsed JSON seed data.
    """
    # hf_hub_download stores the file in the local HF cache and returns
    # that path; the original ignored the return value and opened
    # SEED_DATA_PATH relative to the CWD, which breaks whenever the cache
    # is not the working directory.
    local_path = hf_api.hf_hub_download(
        repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
    )
    with open(local_path) as f:
        return json.load(f)
def push_argilla_dataset_to_hub(
    name: str, repo_id: str, url: str, api_key: str, workspace: str = "admin"
):
    """Pull a FeedbackDataset from an Argilla server and publish it on the Hub.

    Args:
        name: name of the dataset inside the Argilla workspace.
        repo_id: target ``namespace/name`` repo on the Hub.
        url: base URL of the Argilla server.
        api_key: Argilla API key.
        workspace: Argilla workspace holding the dataset (default ``admin``).
    """
    rg.init(api_url=url, api_key=api_key)
    remote_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
    # pull() materializes the records locally before pushing them to the Hub
    remote_dataset.pull().push_to_huggingface(repo_id=repo_id)
def push_pipeline_params(
    pipeline_params,
    hub_username,
    hub_token: str,
    project_name,
):
    """Serialize pipeline parameters to JSON and upload them to the repo.

    Args:
        pipeline_params: JSON-serializable parameters object.
        hub_username: Hub namespace that owns the repo.
        hub_token: Hub API token used for the upload.
        project_name: project name; combined with ``hub_username`` to form
            the repo id.
    """
    repo_id = f"{hub_username}/{project_name}"
    # NamedTemporaryFile(delete=False) replaces the deprecated, race-prone
    # mktemp(); the file is removed once the upload finishes (the original
    # left it behind).
    with NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(pipeline_params, f)
        temp_path = f.name
    try:
        hf_api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo="pipeline_params.json",
            token=hub_token,
            repo_id=repo_id,
            repo_type="dataset",
        )
    finally:
        os.remove(temp_path)
    print(f"Pipeline params uploaded to {repo_id}")