derek-thomas (HF staff) committed
Commit 5ec6657
Parent: 89f0e00

Updating README.md the right way :'(

Files changed (3):
  1. main.py +4 -2
  2. requirements.txt +1 -2
  3. utilities/readme_update.py +56 -30
main.py CHANGED
@@ -8,7 +8,7 @@ from datasets import Dataset
 
 from utilities.user_defined_functions import get_latest_data, merge_data, load_or_create_dataset
 from utilities.my_logger import setup_logger
-from utilities.readme_update import update_readme
+from utilities.readme_update import update_dataset_readme
 
 # Set dataset name, path to README.md, and existing dataset details
 subreddit = os.environ["SUBREDDIT"]
@@ -47,13 +47,15 @@ def main():
     dataset['train'] = Dataset.from_pandas(df, preserve_index=False)
 
     # Update README
-    update_readme(dataset_name=dataset_name, subreddit=subreddit, latest_date=date, new_rows=new_rows)
     logger.info(f"Adding {new_rows} rows for {date}.")
 
     # Push the augmented dataset to the Hugging Face hub
     logger.debug(f"Pushing data for {date} to the Hugging Face hub")
     dataset.push_to_hub(dataset_name, token=auth_token)
     logger.info(f"Processed and pushed data for {date} to the Hugging Face Hub")
+    logger.info(f"Updating README...")
+    update_dataset_readme(dataset_name=dataset_name, subreddit=subreddit, new_rows=new_rows)
+    logger.info(f"Updated README.")
     # files_cleaned = dataset.cleanup_cache_files()
     # logger.info(f"Removed {files_cleaned} cache files")
 
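The practical effect of this change is that the README is now rewritten only after the data has been pushed to the Hub, and the caller no longer passes `latest_date`, since the new helper stamps the README with the current UTC hour itself. A minimal sketch of the old vs. new call site, using the names from this diff:

```python
# Old call site (removed in this commit): the caller supplied the date.
# update_readme(dataset_name=dataset_name, subreddit=subreddit, latest_date=date, new_rows=new_rows)

# New call site: runs after dataset.push_to_hub(...); the helper derives the
# timestamp on its own via datetime.now(pytz.utc) inside readme_update.py.
update_dataset_readme(dataset_name=dataset_name, subreddit=subreddit, new_rows=new_rows)
```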
requirements.txt CHANGED
@@ -1,8 +1,7 @@
 praw==7.7.1
 gradio==3.50.2
 nbdev==2.3.12
-# datasets==2.14.6
-git+https://github.com/huggingface/datasets.git@81b3ccfc016f6a39837334a0173dac3f59112856
+datasets==2.14.6
 requests==2.28.2
 loguru==0.7.0
 rich==13.3.4
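The git-pinned `datasets` snapshot is replaced with the released 2.14.6. After reinstalling dependencies, a quick sanity check (a sketch, not part of the commit) can confirm the pinned release is what actually got installed:

```python
# Confirm the pinned release is installed rather than the old git snapshot.
import datasets

assert datasets.__version__ == "2.14.6", f"unexpected datasets version: {datasets.__version__}"
```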
utilities/readme_update.py CHANGED
@@ -2,26 +2,63 @@ import os
 from datetime import datetime
 
 import pytz
-from datasets.download.download_config import DownloadConfig
-from datasets.utils.file_utils import cached_path
-from datasets.utils.hub import hf_hub_url
+from huggingface_hub import HfApi, Repository
 
 frequency = os.environ.get("FREQUENCY", '').lower()
-
-
-def get_readme_path(dataset_name):
-    readme_path = hf_hub_url(dataset_name, "README.md")
-    return cached_path(readme_path, download_config=DownloadConfig())
-
-
-def update_readme(dataset_name, subreddit, latest_date, new_rows):
-    path = get_readme_path(dataset_name=dataset_name)
+GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"
+hf_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
+local_repo_path = "./readme_repo"
+
+
+def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
+    """
+    Update the README file of a specified dataset repository with new information.
+
+    Args:
+        dataset_name (str): Name of the dataset repository.
+        subreddit (str): Name of the subreddit being used for dataset creation.
+        new_rows (int): Number of new rows added in the latest update.
+        hf_token (str): Hugging Face authentication token.
+        local_repo_path (str): Local path to clone the repository.
+    """
+    # Initialize HfApi
+    api = HfApi()
+
+    # Clone the repository locally
+    repo = Repository(local_repo_path, clone_from=dataset_name, repo_type='dataset', use_auth_token=hf_token)
+
+    # Read the README file
+    with open(f"{local_repo_path}/README.md", "r") as file:
+        old_readme = file.read()
+
+    # Modify the README
+    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)
+
+    # Write the updated README back to the repository
+    with open(f"{local_repo_path}/README.md", "w") as file:
+        file.write(new_readme)
+
+    # Push the changes
+    repo.push_to_hub(blocking=True, commit_message=f'Pushing {new_rows}')
+
+
+def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
+    """
+    Append new information to the existing README content.
+
+    Args:
+        subreddit (str): Name of the subreddit.
+        new_rows (int): Number of new rows added.
+        old_readme (str): Existing README content.
+
+    Returns:
+        str: Updated README content.
+    """
     latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
     latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')
 
     readme_text = f"""
 ## Dataset Overview
-The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions. Im leveraging PRAW and the reddit API to get downloads.
+The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions. I'm leveraging PRAW and the Reddit API to get downloads.
 
 There is a limit of 1000 in an API call and limited search functionality, so this is run {frequency} to get new submissions.
 
@@ -33,27 +70,16 @@ The dataset is updated {frequency} with the most recent update being `{latest_ho
 
 ## Licensing
 [Reddit Licensing terms](https://www.redditinc.com/policies/data-api-terms) as accessed on October 25:
-> The Content created with or submitted to our Services by Users (“User Content”) is owned by Users and not by Reddit. Subject to your complete and ongoing compliance with the Data API Terms, Reddit grants you a non-exclusive, non-transferable, non-sublicensable, and revocable license to copy and display the User Content using the Data API solely as necessary to develop, deploy, distribute, and run your App to your App Users. You may not modify the User Content except to format it for such display. You will comply with any requirements or restrictions imposed on usage of User Content by their respective owners, which may include "all rights reserved" notices, Creative Commons licenses, or other terms and conditions that may be agreed upon between you and the owners. Except as expressly permitted by this section, no other rights or licenses are granted or implied, including any right to use User Content for other purposes, such as for training a machine learning or AI model, without the express permission of rightsholders in the applicable User Content
-
-My take is that you can't use this data for *training* without getting permission.
+[License information]
 
 ## Opt-out
 To opt-out of this dataset please make a request in the community tab
 """
 
-    append_readme(path=path, readme_text=readme_text)
-
-
-def append_readme(path, readme_text):
-    generated_below_marker = "--- Generated Part of README Below ---"
-    with open(path, "r") as file:
-        content = file.read()
-
-    if generated_below_marker in content:
-        index = content.index(generated_below_marker) + len(generated_below_marker)
-        content = content[:index] + "\n\n" + readme_text
+    if GENERATED_BELOW_MARKER in old_readme:
+        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
+        new_readme = old_readme[:index] + "\n\n" + readme_text
     else:
-        content += "\n\n" + generated_below_marker + "\n\n" + readme_text + "\n"
+        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"
 
-    with open(path, "w") as file:
-        file.write(content)
+    return new_readme
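For reference, a hedged usage sketch of the new helper. The token and frequency are read from the environment at import time (module-level `hf_token` and `frequency`), so they must be set before the import; the repo id below is a hypothetical placeholder, not one from this commit.

```python
import os

# Both variables are read at module import time in readme_update.py, so set them first.
os.environ["HUGGINGFACE_AUTH_TOKEN"] = "<write-enabled HF token>"  # placeholder
os.environ["FREQUENCY"] = "daily"

from utilities.readme_update import update_dataset_readme

# Hypothetical dataset repo id: clones it to ./readme_repo, rewrites the generated
# section below GENERATED_BELOW_MARKER, and pushes the commit back to the Hub.
update_dataset_readme(
    dataset_name="your-username/reddit-submissions",  # placeholder repo id
    subreddit="askreddit",
    new_rows=128,
)
```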