File size: 1,566 Bytes
ed3130d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
import os
from datasets.download.download_config import DownloadConfig
from datasets.utils.file_utils import cached_path
from datasets.utils.hub import hf_hub_url
def get_readme_path(dataset_name):
    """Fetch the dataset's README.md from the Hub and return its local cached path."""
    return cached_path(
        hf_hub_url(dataset_name, "README.md"),
        download_config=DownloadConfig(),
    )
def update_readme(dataset_name, subreddit, date_to_fetch):
    """Generate the README markdown for the dataset and append it to the cached README.

    The covered period starts at the `START_DATE` environment variable (must be
    set) and ends at ``date_to_fetch``. Returns the generated markdown text.
    """
    local_readme = get_readme_path(dataset_name=dataset_name)
    start_date = os.environ["START_DATE"]
    readme_text = f"""
# Dataset Name
{dataset_name}
## Update Frequency
The dataset is updated daily and covers the period from `{start_date}` to {date_to_fetch}
## Dataset Overview
The goal is to have an open dataset of `{subreddit}` submissions. This has been taken from the Pushshift API.
## Data Collection
This has been collected with sequential calls that follow the pagination of the pushshift request.
## Attribution
Data sourced from the Pushshift API.
"""
    append_readme(path=local_readme, readme_text=readme_text)
    return readme_text
def append_readme(path, readme_text):
    """Insert *readme_text* into the README at *path* below a generated-content marker.

    If the marker line is already present, the new text is inserted immediately
    after the marker (above any previously generated text). Otherwise the marker
    and the text are appended to the end of the file.
    """
    generated_below_marker = "--- Generated Below ---"
    # Explicit UTF-8: without it, open() uses the locale default encoding,
    # which can corrupt non-ASCII README content (e.g. cp1252 on Windows).
    with open(path, "r", encoding="utf-8") as file:
        content = file.read()
    if generated_below_marker in content:
        # Splice the new text right after the marker, keeping older output below.
        index = content.index(generated_below_marker) + len(generated_below_marker)
        content = content[:index] + "\n" + readme_text + "\n" + content[index:]
    else:
        content += "\n" + generated_below_marker + "\n" + readme_text + "\n"
    with open(path, "w", encoding="utf-8") as file:
        file.write(content)
|