|
import os |
|
from datetime import datetime |
|
from pathlib import Path |
|
from shutil import rmtree |
|
|
|
import pytz |
|
from huggingface_hub import HfApi, Repository |
|
|
|
# Update cadence injected by the deployment environment (e.g. "hourly",
# "daily"); lower-cased because it is interpolated mid-sentence in the README.
frequency = os.environ.get("FREQUENCY", '').lower()
# Sentinel line separating the hand-written README header from the
# auto-generated section that append_to_readme() rewrites on every run.
GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"
# Hugging Face account/space owner name, interpolated into generated links.
# Read eagerly (no default) so a missing secret fails loudly at import time.
username = os.environ["USERNAME"]
# Hugging Face auth token; also required, read eagerly for the same reason.
hf_token = os.environ["HF_TOKEN"]
# NOTE(review): appears unused in this file — presumably a leftover from an
# older Repository-clone workflow (cf. unused Repository/rmtree imports);
# confirm before removing.
local_repo_path = "./readme_repo"
|
|
|
|
|
def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
    """
    Update the README file of a specified dataset repository with new information.

    Downloads the current README.md from the dataset repo, regenerates the
    machine-owned section via append_to_readme(), and uploads the result as a
    single commit.

    Args:
        dataset_name (str): Name of the dataset repository (repo_id).
        subreddit (str): Name of the subreddit being used for dataset creation.
        new_rows (int): Number of new rows added in the latest update.
    """
    # Fix: the module-level hf_token was read from the environment but never
    # used, so every hub call silently depended on locally cached credentials.
    # Pass the token explicitly so the script works in a clean CI environment.
    api = HfApi(token=hf_token)

    readme_path = api.hf_hub_download(repo_id=dataset_name, repo_type="dataset", filename="README.md")

    # Read explicitly as UTF-8 instead of the platform default encoding;
    # README content routinely contains non-ASCII characters.
    with open(readme_path, "r", encoding="utf-8") as file:
        old_readme = file.read()

    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

    api.upload_file(
        path_or_fileobj=new_readme.encode(),
        path_in_repo="README.md",
        repo_id=dataset_name,
        repo_type="dataset",
        commit_message=f'Pushing {new_rows} new rows'
    )
|
|
|
|
|
def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
    """
    Append new information to the existing README content.

    Everything after GENERATED_BELOW_MARKER is treated as machine-owned: if
    the marker is already present, the text following it is replaced
    wholesale; otherwise the marker plus the generated section is appended to
    the end of the README.

    Args:
        subreddit (str): Name of the subreddit.
        new_rows (int): Number of new rows added.
        old_readme (str): Existing README content.

    Returns:
        str: Updated README content.
    """
    # Timestamp truncated to the top of the current hour, in UTC, so repeated
    # runs within the same hour render the same "most recent update" string.
    latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')

    # Generated section. Interpolates the module-level `frequency` and
    # `username` globals in addition to the function's own arguments.
    readme_text = f"""
## Dataset Overview
The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions. I'm leveraging PRAW and the Reddit API to get downloads.

There is a limit of 1000 in an API call and limited search functionality, so this is run {frequency} to get new submissions.

## Creation Details
This dataset was created by [{username}/dataset-creator-reddit-{subreddit}](https://huggingface.co/spaces/{username}/dataset-creator-reddit-{subreddit})

## Update Frequency
The dataset is updated {frequency} with the most recent update being `{latest_hour_str}` where we added **{new_rows} new rows**.

## Licensing
[Reddit Licensing terms](https://www.redditinc.com/policies/data-api-terms) as accessed on October 25:
[License information]

## Opt-out
To opt-out of this dataset please make a pull request with your justification and add your ids in filter_ids.json

1. Go to [filter_ids.json](https://huggingface.co/spaces/reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates/blob/main/filter_ids.json)
2. Click Edit
3. Add your ids, 1 per row
4. Comment with your justification
"""

    if GENERATED_BELOW_MARKER in old_readme:
        # Marker already present: keep everything up to and including the
        # marker, discard the previously generated tail, and write the fresh
        # section in its place.
        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
        new_readme = old_readme[:index] + "\n\n" + readme_text
    else:
        # First run for this README: append the marker, then the generated
        # section (with a trailing newline in this branch only).
        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"

    return new_readme
|
|