derek-thomas (HF staff) committed
Commit 5d9e0b8
Parent: 2870060

Major updates from sister repo

app.py CHANGED
@@ -2,6 +2,7 @@ import os
 from pathlib import Path
 
 import gradio as gr
+from bs4 import BeautifulSoup
 from rich.console import Console
 from rich.syntax import Syntax
 
@@ -11,6 +12,10 @@ subreddit = os.environ["SUBREDDIT"]
 username = os.environ["USERNAME"]
 dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"
 
+frequency = os.environ.get("FREQUENCY", '').lower()
+if frequency not in ["daily", "hourly"]:
+    raise gr.Error("FREQUENCY environment variable must be 'daily' or 'hourly'")
+
 
 def log_file_to_html_string():
     log_file = "mylog.log"
@@ -27,27 +32,75 @@ def log_file_to_html_string():
     output = "".join(lines)
     syntax = Syntax(output, "python", theme="monokai", word_wrap=True)
 
-    console.print(syntax)
+    console.print(syntax);
    html_content = console.export_html(inline_styles=True)
 
-    style_replace = """
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(html_content, 'lxml')
+
+    # Modify the <pre> tag
+    pre_tag = soup.pre
+    pre_tag['class'] = 'scrollable'
+    del pre_tag['style']
+
+    # Add your custom styles and the .scrollable CSS to the <style> tag
+    style_tag = soup.style
+    style_content = """
     pre, code {
         background-color: #272822;
     }
-    </style>"""
-    html_content = html_content.replace('</style>', style_replace)
-    return html_content
+    .scrollable {
+        font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace;
+        height: 500px;
+        overflow: auto;
+    }
+    """
+    style_tag.append(style_content)
 
+    return soup.prettify()
 
-markdown = f"""
-# Reddit Scraper
-This is a reddit scraper which builds and updates [{dataset_name}](https://huggingface.co/datasets/{dataset_name}). Check the README for more details.
+
+intro_md = f"""
+# Reddit Dataset Creator
+This is a reddit dataset creator which builds and updates [{dataset_name}](https://huggingface.co/datasets/{dataset_name})
+which pulls from [/r/{subreddit}](http://www.reddit.com/r/{subreddit}). Check the dataset for more details.
 
 As shown in the below diagram this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
 """
 
+how_to_md = f"""
+# How to make your own space and dataset
+1. Create a [reddit application](https://www.reddit.com/prefs/apps), use 'Script for personal use'
+    - Redirect URI can be anything, I use 'http://www.example.com/unused/redirect/uri'
+    - You need the `secret` and the `Client ID` from the reddit application.
+    - `REDDIT_USER_AGENT` can be any descriptive string, probably any undescriptive string too.
+2. Get your writable [huggingface token](https://huggingface.co/settings/tokens)
+3. <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-amitheasshole?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg" alt="Duplicate Space"></a>
+and fill in the information
+"""
+
+how_does_it_work_md = f"""
+# Core Components
+There are 2 core components [main](main.py) and [app](app.py).
+Main does a few things:
+- Pulls from a datasource
+- Updates a dataset on the hub
+- Updates the README of the dataset
+- Writes a local log file (inaccessible outside the spaces container)
+
+App
+- Visualizes the log file from Main
+
+# Running it
+This uses a docker space so that I can execute supervisor. Supervisor allows me to kick off 2 processes and manage the
+log files. I use gradio for `app` and map that to the open port of huggingface spaces.
+
+The only communication between `app` and `main` is the log file.
+"""
+
 with gr.Blocks() as demo:
-    gr.Markdown(markdown)
+    with gr.Tab("Application"):
+        gr.Markdown(intro_md)
     gr.Image(proj_dir / 'media' / 'reddit_scraper.drawio.png')
     gr.Markdown("# Logs")
     output = gr.HTML(log_file_to_html_string, every=1)
@@ -58,6 +111,10 @@ with gr.Blocks() as demo:
         document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary)'
     }
     """, )
+    with gr.Tab("How to Create?"):
+        gr.Markdown(how_to_md)
+    with gr.Tab("How does it work?"):
+        gr.Markdown(how_does_it_work_md)
 
 if __name__ == '__main__':
     demo.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)
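The rewritten `log_file_to_html_string` above renders the log with rich, exports it as HTML, and then uses BeautifulSoup to replace the inline-styled `<pre>` with a `.scrollable` class so the `gr.HTML` panel keeps a fixed height. A minimal standalone sketch of that pattern; the helper name, console setup, and sample input are illustrative, not from the repo:

```python
from bs4 import BeautifulSoup
from rich.console import Console
from rich.syntax import Syntax


def render_log_as_scrollable_html(log_text: str) -> str:
    # Render the log with rich, recording output so it can be exported as HTML
    console = Console(record=True)
    console.print(Syntax(log_text, "python", theme="monokai", word_wrap=True))
    html = console.export_html(inline_styles=True)

    # Re-style the exported <pre> block: drop rich's inline style, tag it .scrollable
    soup = BeautifulSoup(html, "lxml")
    pre_tag = soup.pre
    pre_tag["class"] = "scrollable"
    del pre_tag["style"]
    soup.style.append(".scrollable { height: 500px; overflow: auto; }")
    return soup.prettify()


if __name__ == "__main__":
    print(render_log_as_scrollable_html("INFO - app - example log line\n" * 40))
```

The Space's version also re-adds the monokai `pre, code` background, which the sketch leaves out for brevity.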
main.py CHANGED
@@ -1,14 +1,15 @@
 import os
 import time
-from datetime import datetime, timedelta
+from datetime import datetime
 
 import pandas as pd
 import schedule
-from datasets import DatasetDict, load_dataset, Dataset
+from datasets import Dataset, DatasetDict, load_dataset
 from huggingface_hub import login
 
-from utilities.data_collator import merge_and_filter_data
+from utilities.data_collator import get_latest_data, merge_and_filter_data
 from utilities.my_logger import setup_logger
+from utilities.praw_downloader import dummy_data
 from utilities.readme_update import update_readme
 
 # Set dataset name, path to README.md, and existing dataset details
@@ -17,6 +18,10 @@ username = os.environ["USERNAME"]
 dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"
 dataset_readme_path = "README.md"
 
+frequency = os.environ.get("FREQUENCY", '').lower()
+if frequency not in ["daily", "hourly"]:
+    raise ValueError("FREQUENCY environment variable must be 'daily' or 'hourly'")
+
 # Authenticate with Hugging Face using an auth token
 auth_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
 login(auth_token, add_to_git_credential=True)
@@ -27,13 +32,21 @@ logger = setup_logger(__name__)
 def get_dataset():
     # Load the existing dataset from the Hugging Face hub or create a new one
     try:
-        dataset = load_dataset(dataset_name, download_mode="reuse_cache_if_exists", ignore_verifications=True)
+        dataset = load_dataset(dataset_name)
         logger.debug("Loading existing dataset")
-        if "__index_level_0__" in dataset["train"].column_names:
-            dataset = dataset.remove_columns(["__index_level_0__"])
     except FileNotFoundError:
         logger.warning("Creating new dataset")
+
+        # Creating Initial Repo
         dataset = DatasetDict()
+        dataset['train'] = Dataset.from_dict(dummy_data)
+        dataset.push_to_hub(repo_id=dataset_name, token=auth_token)
+
+        # Pulling from Initial Repo
+        dataset = load_dataset(dataset_name)
+
+        # Remove dummy data
+        del dataset['train']
     return dataset
 
 
@@ -43,12 +56,17 @@ def main():
     dataset = get_dataset()
 
     # Get Latest Data and merge with historic data
-    old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
-    new_df = merge_and_filter_data(old_df=old_df)
-    dataset['train'] = Dataset.from_pandas(new_df, preserve_index=False)
+    new_df = get_latest_data()
+    if 'train' in dataset.keys():
+        old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
+        df = merge_and_filter_data(old_df=old_df, new_df=new_df)
+        new_rows = len(df) - len(old_df)
+    else:
+        df = new_df
+        new_rows = len(new_df)
+    dataset['train'] = Dataset.from_pandas(df, preserve_index=False)
 
     # Update README
-    new_rows = len(new_df) - len(old_df)
     update_readme(dataset_name=dataset_name, subreddit=subreddit, latest_date=date, new_rows=new_rows)
     logger.info(f"Adding {new_rows} rows for {date}.")
 
@@ -58,14 +76,17 @@ def main():
     logger.info(f"Processed and pushed data for {date} to the Hugging Face Hub")
 
 
-def schedule_daily_task():
+def schedule_periodic_task():
     """
-    Schedule the daily_task to run at the specific time every day.
+    Schedule the main task to run at the user-defined frequency
     """
-    # start_time = (datetime.now() + timedelta(minutes=1)).time().strftime('%H:%M')  # Now + 30 seconds
-    start_time = '05:00'
-    logger.info(f'Scheduling tasks to run every day at: {start_time}')
-    schedule.every().day.at(start_time).do(main)
+    if frequency == 'hourly':
+        logger.info(f'Scheduling tasks to run every hour at the top of the hour')
+        schedule.every().hour.at(":00").do(main)
+    elif frequency == 'daily':
+        start_time = '05:00'
+        logger.info(f'Scheduling tasks to run every day at: {start_time} UTC+00')
+        schedule.every().day.at(start_time).do(main)
 
     while True:
         schedule.run_pending()
@@ -73,4 +94,4 @@ def schedule_daily_task():
 
 
 if __name__ == "__main__":
-    schedule_daily_task()
+    schedule_periodic_task()
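The scheduling change replaces the fixed daily job with a branch on `FREQUENCY`, using the `schedule` library's hourly and daily triggers. A stripped-down sketch of that loop, with a print stand-in for `main()` and a conventional sleep added to avoid busy-waiting:

```python
import os
import time

import schedule


def job():
    # Stand-in for main(): pull new submissions and push the updated dataset
    print("running scheduled pull...")


frequency = os.environ.get("FREQUENCY", "").lower()
if frequency == "hourly":
    schedule.every().hour.at(":00").do(job)   # top of every hour
elif frequency == "daily":
    schedule.every().day.at("05:00").do(job)  # 05:00, which the repo logs as UTC+00
else:
    raise ValueError("FREQUENCY environment variable must be 'daily' or 'hourly'")

while True:
    schedule.run_pending()
    time.sleep(1)
```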
my_logger.py DELETED
@@ -1,22 +0,0 @@
-import logging
-
-
-def setup_logger(name: str):
-    logger = logging.getLogger(name)
-    logger.setLevel(logging.DEBUG)
-
-    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-
-    # Create a file handler to write logs to a file
-    file_handler = logging.FileHandler('mylog.log')
-    file_handler.setLevel(logging.DEBUG)
-    file_handler.setFormatter(formatter)
-    logger.addHandler(file_handler)
-
-    # Create a stream handler to write logs to the console
-    stream_handler = logging.StreamHandler()
-    stream_handler.setLevel(logging.DEBUG)
-    stream_handler.setFormatter(formatter)
-    logger.addHandler(stream_handler)
-
-    return logger
requirements.txt CHANGED
@@ -6,4 +6,6 @@ requests==2.28.2
 loguru==0.7.0
 rich==13.3.4
 supervisor==4.2.5
-schedule==1.2.0
+schedule==1.2.0
+beautifulsoup4==4.12.2
+lxml==4.9.3
utilities/data_collator.py CHANGED
@@ -29,7 +29,7 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
 
     # Find row with the longest content for each 'id'
     idx_longest_content = df.groupby('id')['content_length'].idxmax().values
-    df_longest_content = df.loc[idx_longest_content][['id', 'content']]
+    df_longest_content = df.loc[idx_longest_content].drop(columns=['score'])
 
     # Find row with the highest score for each 'id'
     idx_highest_score = df.groupby('id')['score'].idxmax().values
@@ -41,20 +41,28 @@ def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
     return df_merged
 
 
-def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
+
+
+def merge_and_filter_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
     """
-    Merges the provided dataset with the latest data, sorts them by 'date_utc',
-    filters out redundant IDs, and returns the merged and filtered dataset.
+    Merges two dataframes, sorts them by 'date_utc', and filters out redundant IDs.
+
+    The function first concatenates the old and new dataframes. Then, it sorts the
+    resulting dataframe by the 'date_utc' column. Finally, it filters out redundant IDs
+    using the `filter_redundant_ids` function.
 
     Args:
-    - dataset (Type[Dataset]): The dataset to be merged with the latest data.
+    - old_df (pd.DataFrame): The original dataframe.
+    - new_df (pd.DataFrame): The new dataframe to be merged with the original dataframe.
 
     Returns:
-    - Type[Dataset]: The merged and filtered dataset.
+    - pd.DataFrame: The merged, sorted, and filtered dataframe.
     """
-    latest_df = get_latest_data()
 
-    df = pd.concat([old_df, latest_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
+    # Concatenate old and new dataframes, sort by 'date_utc', and reset index
+    df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
+
+    # Filter out redundant IDs
    df = filter_redundant_ids(df)
     return df
 
@@ -62,9 +70,10 @@ def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
 if __name__ == '__main__':
     # Mock data
     data = {
-        'id': [1, 1, 2, 2, 3, 3],
-        'content': ['short', 'longer content', 'medium', 'really long content here', 'tiny', 'big'],
-        'score': [10, 5, 20, 15, 30, 25]
+        'id': [1, 1, 2, 2, 3],
+        'content': ['short', 'much longer content', 'mid', 'size', 'constant'],
+        'score': [10, 5, 7, 9, 6],
+        'another_column': ['a', 'a', 'b', 'b', 'c']
     }
 
     df = pd.DataFrame(data)
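For context on the `df_longest_content` change: `filter_redundant_ids` keeps, for each `id`, the longest `content` seen and pairs it with the highest `score`, and it now carries every non-`score` column from the longest-content row rather than only `id` and `content` (which is why the mock data gains `another_column`). A rough sketch of that dedup idea, under the assumption that the two per-`id` picks are joined back together on `id`; the real helper's merge details may differ:

```python
import pandas as pd


def filter_redundant_ids_sketch(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["content_length"] = df["content"].str.len()

    # Per id: the row with the longest content, keeping all columns except its score
    longest = df.loc[df.groupby("id")["content_length"].idxmax()].drop(columns=["score"])
    # Per id: the highest score observed across duplicates
    best_score = df.loc[df.groupby("id")["score"].idxmax()][["id", "score"]]

    return longest.merge(best_score, on="id").drop(columns=["content_length"])


if __name__ == "__main__":
    data = {
        "id": [1, 1, 2, 2, 3],
        "content": ["short", "much longer content", "mid", "size", "constant"],
        "score": [10, 5, 7, 9, 6],
        "another_column": ["a", "a", "b", "b", "c"],
    }
    print(filter_redundant_ids_sketch(pd.DataFrame(data)))
```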
utilities/praw_downloader.py CHANGED
@@ -9,6 +9,22 @@ from utilities.my_logger import setup_logger
 # Setup logging
 logger = setup_logger(__name__)
 
+# Get subreddit
+subreddit_var = os.getenv("SUBREDDIT")
+reddit_pull_limit = int(os.getenv("REDDIT_PULL_LIMIT"))
+
+# Dummy row for when we create a new repo
+dummy_data = {
+    "content": ["This is a sample post content. Just for demonstration purposes!"],
+    "poster": ["sampleUser123"],
+    "date_utc": [datetime.strptime("2023-10-26 14:30:45", '%Y-%m-%d %H:%M:%S')],
+    "flair": ["Discussion"],
+    "title": ["Sample Post Title: How to Use Hugging Face?"],
+    "score": [457],
+    "permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
+    "id": ['id']
+}
+
 
 def get_reddit_instance() -> praw.Reddit:
     """Initialize and return a Reddit instance using PRAW."""
@@ -36,12 +52,12 @@ def extract_submission_data(submission: praw.models.Submission) -> Dict[str, Any]:
 def praw_downloader() -> List[Dict[str, str]]:
     """Main function to extract and save all submissions from the subreddit."""
     reddit = get_reddit_instance()
-    subreddit = reddit.subreddit('bestofredditorupdates')
+    subreddit = reddit.subreddit(subreddit_var)
 
-    logger.info('Starting to fetch submissions from bestofredditorupdates.')
+    logger.info(f'Starting to fetch submissions from {os.getenv("SUBREDDIT")}.')
 
     submissions = []
-    for submission in subreddit.new(limit=200):  # Set limit=None to get all posts
+    for submission in subreddit.new(limit=reddit_pull_limit):  # Set limit=None to get all posts
         # logger.debug(f'Processing post {submission.id} - {submission.title}')
         data = extract_submission_data(submission)
         submissions.append(data)
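The downloader now reads the subreddit name and pull limit from `SUBREDDIT` and `REDDIT_PULL_LIMIT` instead of hard-coding `bestofredditorupdates` and `limit=200`. A minimal sketch of that flow; the credential variable names below are assumptions, since `get_reddit_instance` and `extract_submission_data` are not shown in this hunk:

```python
import os

import praw


def fetch_new_submissions() -> list[dict]:
    # Assumed credential env var names; the Space's get_reddit_instance may use different ones
    reddit = praw.Reddit(
        client_id=os.environ["REDDIT_CLIENT_ID"],
        client_secret=os.environ["REDDIT_CLIENT_SECRET"],
        user_agent=os.environ["REDDIT_USER_AGENT"],
    )
    subreddit = reddit.subreddit(os.environ["SUBREDDIT"])
    limit = int(os.environ["REDDIT_PULL_LIMIT"])

    # subreddit.new() yields the most recent submissions, newest first
    return [
        {"id": s.id, "title": s.title, "score": s.score, "permalink": s.permalink}
        for s in subreddit.new(limit=limit)
    ]
```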
utilities/readme_update.py CHANGED
@@ -1,9 +1,13 @@
 import os
+from datetime import datetime
 
+import pytz
 from datasets.download.download_config import DownloadConfig
 from datasets.utils.file_utils import cached_path
 from datasets.utils.hub import hf_hub_url
 
+frequency = os.environ.get("FREQUENCY", '').lower()
+
 
 def get_readme_path(dataset_name):
     readme_path = hf_hub_url(dataset_name, "README.md")
@@ -12,27 +16,32 @@ def get_readme_path(dataset_name):
 
 def update_readme(dataset_name, subreddit, latest_date, new_rows):
     path = get_readme_path(dataset_name=dataset_name)
+    latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
+    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')
+
     readme_text = f"""
 ## Dataset Overview
-The goal is to have an open dataset of `{subreddit}` submissions. Im leveraging PRAW and the reddit API to get downloads.
+The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions. I'm leveraging PRAW and the reddit API to get downloads.
 
-There is a limit of 1000 in an API call and limited search functionality, so this is run every day to get new submissions.
+There is a limit of 1000 in an API call and limited search functionality, so this is run {frequency} to get new submissions.
 
 ## Creation Details
-THis was created by [derek-thomas/dataset-creator-reddit](https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit)
+This dataset was created by [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/spaces/derek-thomas/dataset-creator-reddit-{subreddit})
 
 ## Update Frequency
-The dataset is updated daily with the most recent day being `{latest_date}` where we added `{new_rows}` new rows.
+The dataset is updated {frequency} with the most recent update being `{latest_hour_str}` where we added **{new_rows} new rows**.
 
 ## Licensing
 [Reddit Licensing terms](https://www.redditinc.com/policies/data-api-terms) as accessed on October 25:
 > The Content created with or submitted to our Services by Users (“User Content”) is owned by Users and not by Reddit. Subject to your complete and ongoing compliance with the Data API Terms, Reddit grants you a non-exclusive, non-transferable, non-sublicensable, and revocable license to copy and display the User Content using the Data API solely as necessary to develop, deploy, distribute, and run your App to your App Users. You may not modify the User Content except to format it for such display. You will comply with any requirements or restrictions imposed on usage of User Content by their respective owners, which may include "all rights reserved" notices, Creative Commons licenses, or other terms and conditions that may be agreed upon between you and the owners. Except as expressly permitted by this section, no other rights or licenses are granted or implied, including any right to use User Content for other purposes, such as for training a machine learning or AI model, without the express permission of rightsholders in the applicable User Content
 
 My take is that you can't use this data for *training* without getting permission.
+
+## Opt-out
+To opt-out of this dataset please make a request in the community tab
 """
 
     append_readme(path=path, readme_text=readme_text)
-    return readme_text
 
 
 def append_readme(path, readme_text):
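Since the dataset can now update hourly, the README timestamp is truncated to the hour in UTC rather than reported as a bare date. A quick check of what `latest_hour_str` produces, taken directly from the lines above:

```python
from datetime import datetime

import pytz

latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')
print(latest_hour_str)  # e.g. 2023-10-26 14:00:00 UTC+0000
```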