import os

from datasets.download.download_config import DownloadConfig
from datasets.utils.file_utils import cached_path
from datasets.utils.hub import hf_hub_url


def get_readme_path(dataset_name):
    """Download the dataset's README.md from the Hugging Face Hub and return its local cached path."""
    readme_path = hf_hub_url(dataset_name, "README.md")
    return cached_path(readme_path, download_config=DownloadConfig())


def update_readme(dataset_name, subreddit, date_to_fetch):
    """Regenerate the README text and append it to the cached README file."""
    path = get_readme_path(dataset_name=dataset_name)
    readme_text = f"""
# Dataset Name
{dataset_name}

## Update Frequency
The dataset is updated daily and covers the period from `{os.environ["START_DATE"]}` to `{date_to_fetch}`.

## Dataset Overview
The goal is to provide an open dataset of `{subreddit}` submissions, sourced from the Pushshift API.

## Data Collection
The data is collected with sequential calls that follow the pagination of the Pushshift requests.

## Attribution
Data sourced from the Pushshift API.
"""

    append_readme(path=path, readme_text=readme_text)
    return readme_text


def append_readme(path, readme_text):
    """Insert readme_text after the generation marker, adding the marker first if it is missing."""
    generated_below_marker = "--- Generated Below ---"
    with open(path, "r") as file:
        content = file.read()

    if generated_below_marker in content:
        # Insert the generated text immediately after the existing marker,
        # keeping any previously generated content below it.
        index = content.index(generated_below_marker) + len(generated_below_marker)
        content = content[:index] + "\n" + readme_text + "\n" + content[index:]
    else:
        # No marker yet: append the marker, then the generated text.
        content += "\n" + generated_below_marker + "\n" + readme_text + "\n"

    with open(path, "w") as file:
        file.write(content)
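

# A minimal, self-contained usage sketch (an illustrative addition, not part of
# the original script). It exercises append_readme against a temporary file so
# it runs offline; calling update_readme instead would download the real
# README.md from the Hub and requires START_DATE to be set in the environment.
if __name__ == "__main__":
    import tempfile

    with tempfile.NamedTemporaryFile("w", suffix=".md", delete=False) as tmp:
        tmp.write("# Existing README\n")
        tmp_path = tmp.name

    # The first call appends the marker and the text; the second call inserts
    # the new text right after the marker, above the previously generated block.
    append_readme(path=tmp_path, readme_text="Generated section v1")
    append_readme(path=tmp_path, readme_text="Generated section v2")

    with open(tmp_path) as file:
        print(file.read())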