Spaces:

reddit-tools-HF
/

dataset-creator-reddit-bestofredditorupdates

Running

App Files Files Community

dataset-creator-reddit-bestofredditorupdates / main.py

derek-thomas HF staff

Init commit

749d1d8 over 1 year ago

raw

history blame

No virus

5.48 kB

	import os
	import time
	from datetime import datetime, timedelta

	import pandas as pd
	from datasets import Dataset, DatasetDict, load_dataset
	from huggingface_hub import login

	from my_logger import setup_logger
	from utilities.pushshift_data import scrape_submissions_by_day, submissions_to_dataframe

	# Hugging Face hub repo id of the dataset; passed to load_dataset() and push_to_hub() in main()
	dataset_name = "derek-thomas/askreddit_test"
	# Path to README.md (not referenced anywhere else in the visible part of this file)
	dataset_readme_path = "README.md"
	# Subreddit to scrape; read at import time, so a missing SUBREDDIT variable raises KeyError
	subreddit = os.environ["SUBREDDIT"]

	# Authenticate with Hugging Face using an auth token (required environment variable);
	# the same token is reused later when pushing the dataset
	auth_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
	login(auth_token, add_to_git_credential=True)

	# Module-level logger shared by every function in this file
	logger = setup_logger(__name__)


	def update_readme(dataset_name, subreddit, date_to_fetch):
	    """
	    Render the README text describing the dataset and its latest update.

	    Args:
	        dataset_name (str): Name shown as the README title.
	        subreddit (str): Subreddit name mentioned in the overview section.
	        date_to_fetch (str): Date whose data was just added; recorded in the change log.

	    Returns:
	        str: The formatted README text.

	    Raises:
	        KeyError: If the "START_DATE" environment variable is not set.
	    """
	    # Read the environment and the clock once, then interpolate the values below.
	    start_date = os.environ["START_DATE"]
	    today = datetime.now().strftime('%Y-%m-%d')

	    return f"""
	# {dataset_name}

	## Dataset Overview
	The goal is to have an open dataset of `{subreddit}` submissions. This has been taken from the Pushshift API.

	## Data Collection
	This has been collected with sequential calls that follow the pagination of the pushshift request.


	## Data Structure
	- `all_days`: All the data after `{start_date}`

	## Update Frequency
	The dataset is updated daily and covers the period from `{start_date}` to two days ago.

	## Attribution
	Data sourced from the Pushshift API.

	## Change Log
	<details>
	<summary>Click to expand</summary>

	- {today}: Added data for {date_to_fetch} to the 'all_days' split and saved as CSV

	</details>
	"""


	def main(date_to_fetch):
	    """
	    Fetch one day of subreddit submissions, merge them into the hub dataset and push the result.

	    Args:
	        date_to_fetch (str): The date to fetch subreddit data for, in the format "YYYY-MM-DD".

	    Returns:
	        datetime.date: Most recent date treated as covered by the dataset after this run.
	        It is never earlier than `date_to_fetch`.
	    """

	    # Load the existing dataset from the Hugging Face hub or create a new one
	    try:
	        logger.info("Loading existing dataset")
	        dataset = load_dataset(dataset_name)
	        # Check that the split exists before indexing it: the code further down already
	        # copes with a missing 'all_days' split, so a repo without it must not crash here.
	        if "all_days" in dataset and "__index_level_0__" in dataset["all_days"].column_names:
	            dataset = dataset.remove_columns(["__index_level_0__"])
	    except FileNotFoundError:
	        logger.info("Creating new dataset")
	        dataset = DatasetDict()

	    # Scrape the requested day and convert the submissions to a DataFrame
	    logger.info(f"Fetching data for {date_to_fetch}")
	    submissions = scrape_submissions_by_day(subreddit, date_to_fetch)
	    df = submissions_to_dataframe(submissions)
	    logger.info(f"Data fetched for {date_to_fetch}")
	    most_recent_date = datetime.strptime(date_to_fetch, '%Y-%m-%d').date()

	    # Append DataFrame to split 'all_days' or create new split
	    if "all_days" in dataset:
	        logger.info("Appending data to split 'all_days'")
	        # Merge the new submissions
	        old_data = dataset['all_days'].to_pandas()
	        new_data = pd.concat([old_data, df], ignore_index=True)

	        # Drop duplicates just in case
	        new_data = new_data.drop_duplicates(subset=['id'], keep="first")

	        # An empty frame has no maximum (max() would be NaN), so only inspect
	        # 'created_utc' when there is at least one row.
	        if not new_data.empty:
	            # assumes 'created_utc' holds strings that start with "YYYY-MM-DD " — TODO confirm
	            new_data_most_recent_date_raw = new_data['created_utc'].max()
	            new_data_most_recent_date_dt = datetime.strptime(
	                new_data_most_recent_date_raw.split(' ')[0], '%Y-%m-%d'
	            ).date()
	            # Step back one day from the newest timestamp (original intent: guard against
	            # rounding error), but never report a date earlier than the day just fetched.
	            most_recent_date = max(new_data_most_recent_date_dt - timedelta(days=1), most_recent_date)

	        # Convert back to dataset. drop_duplicates leaves gaps in the index; without
	        # preserve_index=False that index would be stored as an extra
	        # '__index_level_0__' column, which is what the loader above has to strip.
	        dataset["all_days"] = Dataset.from_pandas(new_data, preserve_index=False)
	    else:
	        logger.info("Creating new split 'all_days'")
	        dataset["all_days"] = Dataset.from_pandas(df)
	    # Log appending or creating split 'all_days'
	    logger.info("Appended or created split 'all_days'")

	    # Push the augmented dataset to the Hugging Face hub
	    logger.info(f"Pushing data for {date_to_fetch} to the Hugging Face hub")
	    readme_text = update_readme(dataset_name, subreddit, date_to_fetch)
	    # NOTE(review): this only sets an attribute on the in-memory object; confirm whether
	    # push_to_hub actually publishes it as the dataset card.
	    dataset.description = readme_text
	    dataset.push_to_hub(dataset_name, token=auth_token)
	    logger.info(f"Processed and pushed data for {date_to_fetch} to the Hugging Face Hub")
	    return most_recent_date


	def run_main_continuously():
	    """
	    Run `main` day by day, starting from the date in the "START_DATE" environment variable
	    and continuing until two days ago. Once caught up, sleep until tomorrow at the same
	    time of day this function was started, then check again. This function never returns.

	    Raises:
	        KeyError: If the "START_DATE" environment variable is not set.
	        ValueError: If "START_DATE" is not in the format "YYYY-MM-DD".
	    """
	    # Index the environment directly (as the README template does) so a missing variable
	    # fails with a KeyError naming it, rather than an opaque TypeError from strptime(None).
	    start_date_str = os.environ["START_DATE"]
	    start_date = datetime.strptime(start_date_str, "%Y-%m-%d").date()

	    # Remember the time of day at startup; the daily wake-up is scheduled for this time.
	    start_time = datetime.now().time()

	    while True:
	        # Read the clock once per iteration. Deriving `today` and `now` from separate reads
	        # could straddle midnight and yield a negative sleep, which time.sleep rejects.
	        now = datetime.now()
	        today = now.date()
	        two_days_ago = today - timedelta(days=2)

	        if start_date <= two_days_ago:
	            logger.info(f"Running main function for date: {start_date}")
	            most_recent_date = main(str(start_date))
	            # Resume from the day after the most recent date main() reports as covered.
	            start_date = most_recent_date + timedelta(days=1)
	        else:
	            tomorrow = today + timedelta(days=1)
	            start_of_tomorrow = datetime.combine(tomorrow, start_time)
	            # Always positive: `tomorrow` is strictly after the date of `now`.
	            wait_until_tomorrow = (start_of_tomorrow - now).total_seconds()
	            logger.info(f"Waiting until tomorrow: {wait_until_tomorrow} seconds")
	            time.sleep(wait_until_tomorrow)


	# Script entry point: start the endless fetch-and-push loop only when this file is run directly.
	if __name__ == '__main__':
	run_main_continuously()