Commit cdbb4c0
1 Parent(s): 84af1d7

Adding new rows to readme, and running at 5am GMT

Files changed:
- main.py +4 -2
- utilities/readme_update.py +2 -5
main.py CHANGED

@@ -48,7 +48,8 @@ def main():
     dataset['train'] = Dataset.from_pandas(new_df, preserve_index=False)
 
     # Update README
-
+    new_rows = len(new_df) - len(old_df)
+    update_readme(dataset_name=dataset_name, subreddit=subreddit, latest_date=date, new_rows=new_rows)
 
     # Push the augmented dataset to the Hugging Face hub
     logger.debug(f"Pushing data for {date} to the Hugging Face hub")
@@ -60,7 +61,8 @@ def schedule_daily_task():
     """
     Schedule the daily_task to run at the specific time every day.
     """
-    start_time = (datetime.now() + timedelta(minutes=1)).time().strftime('%H:%M')  # Now + 30 seconds
+    # start_time = (datetime.now() + timedelta(minutes=1)).time().strftime('%H:%M')  # Now + 30 seconds
+    start_time = '05:00'
     logger.info(f'Scheduling tasks to run every day at: {start_time}')
     schedule.every().day.at(start_time).do(main)
 
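For context, a minimal runnable sketch of how a job like this is typically driven with the `schedule` library. Only the `schedule.every().day.at(start_time).do(main)` line appears in the diff; the polling loop, the sleep interval, and the placeholder `main` body are assumptions. Note that `schedule` interprets '05:00' in the process's local time, so the commit's "5am GMT" holds only if the host clock runs in GMT/UTC.

import time
import schedule

def main():
    # Placeholder for the real daily job (refresh the dataset, update
    # the README, push to the Hugging Face hub); assumed, not in the diff.
    print('running daily task')

def schedule_daily_task():
    """
    Schedule the daily_task to run at the specific time every day.
    """
    start_time = '05:00'  # interpreted as the host's local time
    schedule.every().day.at(start_time).do(main)
    while True:                 # assumed polling loop
        schedule.run_pending()  # fire any job whose time has come
        time.sleep(60)          # assumed poll interval

schedule_daily_task()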
utilities/readme_update.py CHANGED

@@ -10,7 +10,7 @@ def get_readme_path(dataset_name):
     return cached_path(readme_path, download_config=DownloadConfig())
 
 
-def update_readme(dataset_name, subreddit, latest_date):
+def update_readme(dataset_name, subreddit, latest_date, new_rows):
     path = get_readme_path(dataset_name=dataset_name)
     readme_text = f"""
 ## Dataset Overview
@@ -18,11 +18,8 @@ The goal is to have an open dataset of `{subreddit}` submissions. Im leveraging
 
 There is a limit of 1000 in an API call and limited search functionality, so this is run every day to get new submissions.
 
-# Dataset Name
-{dataset_name}
-
 ## Update Frequency
-The dataset is updated daily with the most recent day being
+The dataset is updated daily with the most recent day being `{latest_date}`where we added `{new_rows}` new rows.
 """
 
     append_readme(path=path, readme_text=readme_text)
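To make the README flow concrete, a self-contained sketch of the append pattern `update_readme` uses. `append_readme`'s file-append behavior, the local path, and the sample values are assumptions for illustration; the real module resolves the card through `cached_path`. The sketch also inserts the space before `where` that the committed f-string omits.

def append_readme(path, readme_text):
    # Assumed behavior: append the generated block to the card on disk.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(readme_text)

def update_readme_sketch(path, subreddit, latest_date, new_rows):
    readme_text = f"""
## Dataset Overview
The goal is to have an open dataset of `{subreddit}` submissions.

## Update Frequency
The dataset is updated daily with the most recent day being `{latest_date}` where we added `{new_rows}` new rows.
"""
    append_readme(path=path, readme_text=readme_text)

# Hypothetical values for illustration only.
update_readme_sketch('README.md', 'askreddit', '2023-04-01', new_rows=128)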