Commit cdbb4c0
1 Parent(s): 84af1d7

Adding new rows to readme, and running at 5am GMT

Files changed:
- main.py +4 -2
- utilities/readme_update.py +2 -5
main.py CHANGED

@@ -48,7 +48,8 @@ def main():
     dataset['train'] = Dataset.from_pandas(new_df, preserve_index=False)
 
     # Update README
-
+    new_rows = len(new_df) - len(old_df)
+    update_readme(dataset_name=dataset_name, subreddit=subreddit, latest_date=date, new_rows=new_rows)
 
     # Push the augmented dataset to the Hugging Face hub
     logger.debug(f"Pushing data for {date} to the Hugging Face hub")
@@ -60,7 +61,8 @@ def schedule_daily_task():
     """
     Schedule the daily_task to run at the specific time every day.
     """
-    start_time = (datetime.now() + timedelta(minutes=1)).time().strftime('%H:%M')  # Now + 30 seconds
+    # start_time = (datetime.now() + timedelta(minutes=1)).time().strftime('%H:%M')  # Now + 30 seconds
+    start_time = '05:00'
     logger.info(f'Scheduling tasks to run every day at: {start_time}')
     schedule.every().day.at(start_time).do(main)
 
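For context, a minimal runnable sketch of how a job like this is typically driven with the `schedule` library. Only the `schedule.every().day.at(start_time).do(main)` line appears in the diff; the polling loop, the sleep interval, and the placeholder `main` body are assumptions. Note that `schedule` interprets '05:00' in the process's local time, so the commit's "5am GMT" holds only if the host clock runs in GMT/UTC.

import time
import schedule

def main():
    # Placeholder for the real daily job (refresh the dataset, update
    # the README, push to the Hugging Face hub); assumed, not in the diff.
    print('running daily task')

def schedule_daily_task():
    """
    Schedule the daily_task to run at the specific time every day.
    """
    start_time = '05:00'  # interpreted as the host's local time
    schedule.every().day.at(start_time).do(main)
    while True:                 # assumed polling loop
        schedule.run_pending()  # fire any job whose time has come
        time.sleep(60)          # assumed poll interval

schedule_daily_task()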
utilities/readme_update.py CHANGED

@@ -10,7 +10,7 @@ def get_readme_path(dataset_name):
     return cached_path(readme_path, download_config=DownloadConfig())
 
 
-def update_readme(dataset_name, subreddit, latest_date):
+def update_readme(dataset_name, subreddit, latest_date, new_rows):
     path = get_readme_path(dataset_name=dataset_name)
     readme_text = f"""
 ## Dataset Overview
@@ -18,11 +18,8 @@ The goal is to have an open dataset of `{subreddit}` submissions. Im leveraging
 
 There is a limit of 1000 in an API call and limited search functionality, so this is run every day to get new submissions.
 
-# Dataset Name
-{dataset_name}
-
 ## Update Frequency
-The dataset is updated daily with the most recent day being
+The dataset is updated daily with the most recent day being `{latest_date}`where we added `{new_rows}` new rows.
 """
 
     append_readme(path=path, readme_text=readme_text)
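To make the README flow concrete, a self-contained sketch of the append pattern `update_readme` uses. `append_readme`'s file-append behavior, the local path, and the sample values are assumptions for illustration; the real module resolves the card through `cached_path`. The sketch also inserts the space before `where` that the committed f-string omits.

def append_readme(path, readme_text):
    # Assumed behavior: append the generated block to the card on disk.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(readme_text)

def update_readme_sketch(path, subreddit, latest_date, new_rows):
    readme_text = f"""
## Dataset Overview
The goal is to have an open dataset of `{subreddit}` submissions.

## Update Frequency
The dataset is updated daily with the most recent day being `{latest_date}` where we added `{new_rows}` new rows.
"""
    append_readme(path=path, readme_text=readme_text)

# Hypothetical values for illustration only.
update_readme_sketch('README.md', 'askreddit', '2023-04-01', new_rows=128)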