import os

from datasets.download.download_config import DownloadConfig
from datasets.utils.file_utils import cached_path
from datasets.utils.hub import hf_hub_url


def get_readme_path(dataset_name):
    # Resolve the Hub URL of the dataset's README.md and download it into the
    # local cache, returning the cached file path.
    readme_path = hf_hub_url(dataset_name, "README.md")
    return cached_path(readme_path, download_config=DownloadConfig())


def update_readme(dataset_name, subreddit, date_to_fetch):
    # Build the generated README section and append it below the marker.
    path = get_readme_path(dataset_name=dataset_name)
    readme_text = f"""
# Dataset Name
{dataset_name}

## Update Frequency
The dataset is updated daily and covers the period from `{os.environ["START_DATE"]}` to `{date_to_fetch}`.

## Dataset Overview
The goal is to have an open dataset of `{subreddit}` submissions, taken from the Pushshift API.

## Data Collection
The data is collected with sequential calls that follow the pagination of the Pushshift requests.

## Attribution
Data sourced from the Pushshift API.
"""
    append_readme(path=path, readme_text=readme_text)
    return readme_text


def append_readme(path, readme_text):
    # Insert the generated text directly after the marker, keeping any earlier
    # generated sections below it; if the marker is missing, append it first.
    generated_below_marker = "--- Generated Below ---"
    with open(path, "r") as file:
        content = file.read()

    if generated_below_marker in content:
        index = content.index(generated_below_marker) + len(generated_below_marker)
        content = content[:index] + "\n" + readme_text + "\n" + content[index:]
    else:
        content += "\n" + generated_below_marker + "\n" + readme_text + "\n"

    with open(path, "w") as file:
        file.write(content)
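

# Example usage: a minimal sketch, not part of the original module. The repo id,
# subreddit, and dates below are hypothetical placeholders; update_readme reads
# START_DATE from the environment, so it is assumed to be set by the calling job.
# Note that get_readme_path downloads the README from the Hub, so this requires
# network access and a dataset repo that actually exists.
if __name__ == "__main__":
    os.environ.setdefault("START_DATE", "2022-11-01")  # hypothetical start date
    text = update_readme(
        dataset_name="user/pushshift-demo",  # hypothetical Hub dataset id
        subreddit="askreddit",               # hypothetical subreddit
        date_to_fetch="2022-12-01",          # hypothetical end of the covered period
    )
    print(text)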