import os

from datasets.download.download_config import DownloadConfig
from datasets.utils.file_utils import cached_path
from datasets.utils.hub import hf_hub_url


def get_readme_path(dataset_name):
    """Download the dataset's README.md from the Hugging Face Hub and return its local cached path."""
    readme_path = hf_hub_url(dataset_name, "README.md")
    return cached_path(readme_path, download_config=DownloadConfig())


def update_readme(dataset_name, subreddit, date_to_fetch):
    """Regenerate the README text and append it to the cached README file."""
    path = get_readme_path(dataset_name=dataset_name)
    readme_text = f"""
# Dataset Name
{dataset_name}

## Update Frequency
The dataset is updated daily and covers the period from `{os.environ["START_DATE"]}` to `{date_to_fetch}`.

## Dataset Overview
The goal is to provide an open dataset of `{subreddit}` submissions, sourced from the Pushshift API.

## Data Collection
The data is collected with sequential calls that follow the pagination of the Pushshift requests.

## Attribution
Data sourced from the Pushshift API.
"""

    append_readme(path=path, readme_text=readme_text)
    return readme_text


def append_readme(path, readme_text):
    """Insert readme_text after the generation marker, adding the marker first if it is missing."""
    generated_below_marker = "--- Generated Below ---"
    with open(path, "r") as file:
        content = file.read()

    if generated_below_marker in content:
        # Insert the generated text immediately after the existing marker,
        # keeping any previously generated content below it.
        index = content.index(generated_below_marker) + len(generated_below_marker)
        content = content[:index] + "\n" + readme_text + "\n" + content[index:]
    else:
        # No marker yet: append the marker, then the generated text.
        content += "\n" + generated_below_marker + "\n" + readme_text + "\n"

    with open(path, "w") as file:
        file.write(content)
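

# A minimal, self-contained usage sketch (an illustrative addition, not part of
# the original script). It exercises append_readme against a temporary file so
# it runs offline; calling update_readme instead would download the real
# README.md from the Hub and requires START_DATE to be set in the environment.
if __name__ == "__main__":
    import tempfile

    with tempfile.NamedTemporaryFile("w", suffix=".md", delete=False) as tmp:
        tmp.write("# Existing README\n")
        tmp_path = tmp.name

    # The first call appends the marker and the text; the second call inserts
    # the new text right after the marker, above the previously generated block.
    append_readme(path=tmp_path, readme_text="Generated section v1")
    append_readme(path=tmp_path, readme_text="Generated section v2")

    with open(tmp_path) as file:
        print(file.read())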