derek-thomas HF staff committed on
Commit
285612d
1 Parent(s): 1d46c26

Major updates, moving away from pushshift.io into PRAW

Browse files
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
  .idea/
2
  notebooks/.ipynb_checkpoints
3
- mylog.log
 
 
1
  .idea/
2
  notebooks/.ipynb_checkpoints
3
+ mylog.log
4
+ .env
Dockerfile CHANGED
@@ -1,8 +1,10 @@
1
  # Use the official Python base image
2
- FROM python:3.9
3
 
4
  # Install Git LFS
5
- RUN apt-get update && apt-get install -y git-lfs
 
 
6
 
7
  # https://discuss.huggingface.co/t/permission-denied-for-writing-files-within-spaces/29799
8
  RUN useradd -m -u 1000 user
@@ -29,7 +31,9 @@ COPY . .
29
  COPY supervisord.conf .
30
 
31
  # Set permissions on the log file
 
32
  RUN touch $HOME/app/mylog.log $HOME/app/supervisord.log && chmod a+rwx $HOME/app/mylog.log $HOME/app/supervisord.log
 
33
  # RUN mkdir -m 777 -p /.cache/huggingface/hub/
34
 
35
 
 
1
  # Use the official Python base image
2
+ FROM python:3.10
3
 
4
  # Install Git LFS
5
+ RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
6
+ RUN apt-get -o Acquire::AllowInsecureRepositories=true update && apt-get install -y git-lfs
7
+ #RUN apt-get update && apt-get install -y git-lfs
8
 
9
  # https://discuss.huggingface.co/t/permission-denied-for-writing-files-within-spaces/29799
10
  RUN useradd -m -u 1000 user
 
31
  COPY supervisord.conf .
32
 
33
  # Set permissions on the log file
34
+ USER root
35
  RUN touch $HOME/app/mylog.log $HOME/app/supervisord.log && chmod a+rwx $HOME/app/mylog.log $HOME/app/supervisord.log
36
+ USER user
37
  # RUN mkdir -m 777 -p /.cache/huggingface/hub/
38
 
39
 
app.py CHANGED
@@ -9,7 +9,7 @@ proj_dir = Path(__name__).parent
9
 
10
  subreddit = os.environ["SUBREDDIT"]
11
  username = os.environ["USERNAME"]
12
- dataset_name = f"{username}/dataset-creator-{subreddit}"
13
 
14
 
15
  def log_file_to_html_string():
@@ -37,7 +37,7 @@ markdown = f"""
37
  # Reddit Scraper
38
  This is a reddit scraper which builds [{dataset_name}](https://huggingface.co/datasets/{dataset_name}).
39
 
40
- As shown below this space pulls data from pushshift.io, processes it, and puts it in a corresponding dataset.
41
  """
42
 
43
  with gr.Blocks() as demo:
 
9
 
10
  subreddit = os.environ["SUBREDDIT"]
11
  username = os.environ["USERNAME"]
12
+ dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"
13
 
14
 
15
  def log_file_to_html_string():
 
37
  # Reddit Scraper
38
  This is a reddit scraper which builds [{dataset_name}](https://huggingface.co/datasets/{dataset_name}).
39
 
40
+ As shown below this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
41
  """
42
 
43
  with gr.Blocks() as demo:
archive/subreddit_downloader.py DELETED
@@ -1,145 +0,0 @@
1
- import csv
2
- import json
3
- import sys
4
- import time
5
- import traceback
6
- from datetime import datetime
7
-
8
- import requests
9
-
10
- username = "" # put the username you want to download in the quotes
11
- subreddit = "BestofRedditorUpdates" # put the subreddit you want to download in the quotes
12
- thread_id = "" # put the id of the thread you want to download in the quotes, it's the first 5 to 7 character string of letters and numbers from the url, like 107xayi
13
- # leave either one blank to download an entire user's or subreddit's history
14
- # or fill in both to download a specific users history from a specific subreddit
15
-
16
- # change this to one of "human", "csv" or "json"
17
- # - human: the score, creation date, author, link and then the comment/submission body on a second line. Objects are separated by lines of dashes
18
- # - csv: a comma seperated value file with the fields score, date, title, author, link and then body or url
19
- # - json: the full json object
20
- output_format = "csv"
21
-
22
- # default start time is the current time and default end time is all history
23
- # you can change out the below lines to set a custom start and end date. The script works backwards, so the end date has to be before the start date
24
- # start_time = datetime.utcnow() # datetime.strptime("10/05/2021", "%m/%d/%Y")
25
- start_time = datetime.strptime("04/02/2023", "%m/%d/%Y")
26
- end_time = None # datetime.strptime("09/25/2021", "%m/%d/%Y")
27
-
28
- convert_to_ascii = False # don't touch this unless you know what you're doing
29
- convert_thread_id_to_base_ten = True # don't touch this unless you know what you're doing
30
-
31
-
32
- def write_csv_line(writer, obj, is_submission):
33
- output_list = []
34
- output_list.append(str(obj['score']))
35
- output_list.append(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
36
- if is_submission:
37
- output_list.append(obj['title'])
38
- output_list.append(f"u/{obj['author']}")
39
- output_list.append(f"https://www.reddit.com{obj['permalink']}")
40
- if is_submission:
41
- if obj['is_self']:
42
- if 'selftext' in obj:
43
- output_list.append(obj['selftext'])
44
- else:
45
- output_list.append("")
46
- else:
47
- output_list.append(obj['url'])
48
- else:
49
- output_list.append(obj['body'])
50
- writer.writerow(output_list)
51
-
52
-
53
- def write_json_line(handle, obj):
54
- handle.write(json.dumps(obj))
55
- handle.write("\n")
56
-
57
-
58
- def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii):
59
- print(f"Saving to {filename}")
60
-
61
- count = 0
62
- if output_format == "human" or output_format == "json":
63
- if convert_to_ascii:
64
- handle = open(filename, 'w', encoding='ascii')
65
- else:
66
- handle = open(filename, 'w', encoding='UTF-8')
67
- else:
68
- handle = open(filename, 'w', encoding='UTF-8', newline='')
69
- writer = csv.writer(handle)
70
-
71
- previous_epoch = int(start_datetime.timestamp())
72
- break_out = False
73
- while True:
74
- new_url = url_base + str(previous_epoch)
75
- json_text = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
76
- time.sleep(1) # pushshift has a rate limit, if we send requests too fast it will start returning error messages
77
- try:
78
- json_data = json_text.json()
79
- except json.decoder.JSONDecodeError:
80
- time.sleep(1)
81
- continue
82
-
83
- if 'data' not in json_data:
84
- break
85
- objects = json_data['data']
86
- if len(objects) == 0:
87
- break
88
-
89
- for obj in objects:
90
- previous_epoch = obj['created_utc'] - 1
91
- if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
92
- break_out = True
93
- break
94
- count += 1
95
- try:
96
- if output_format == "csv":
97
- write_csv_line(writer, obj, is_submission)
98
- elif output_format == "json":
99
- write_json_line(handle, obj)
100
- except Exception as err:
101
- if 'permalink' in obj:
102
- print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
103
- else:
104
- print(f"Couldn't print object, missing permalink: {obj['id']}")
105
- print(err)
106
- print(traceback.format_exc())
107
-
108
- if break_out:
109
- break
110
-
111
- print(f"Saved {count} through {datetime.fromtimestamp(previous_epoch).strftime('%Y-%m-%d')}")
112
-
113
- print(f"Saved {count}")
114
- handle.close()
115
-
116
-
117
- if __name__ == "__main__":
118
- filter_string = None
119
- if username == "" and subreddit == "" and thread_id == "":
120
- print("Fill in username, subreddit or thread id")
121
- sys.exit(0)
122
- if output_format not in ("human", "csv", "json"):
123
- print("Output format must be one of human, csv, json")
124
- sys.exit(0)
125
-
126
- filters = []
127
- if username:
128
- filters.append(f"author={username}")
129
- if subreddit:
130
- filters.append(f"subreddit={subreddit}")
131
- if thread_id:
132
- if convert_thread_id_to_base_ten:
133
- filters.append(f"link_id={int(thread_id, 36)}")
134
- else:
135
- filters.append(f"link_id=t3_{thread_id}")
136
- filter_string = '&'.join(filters)
137
-
138
- url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="
139
-
140
- if not thread_id:
141
- download_from_url("posts.txt", url_template.format("submission", filter_string), output_format, start_time,
142
- end_time, True, convert_to_ascii)
143
- # download_from_url("comments.txt", url_template.format("comment", filter_string), output_format, start_time,
144
- # end_time, False, convert_to_ascii)
145
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py CHANGED
@@ -3,17 +3,18 @@ import time
3
  from datetime import datetime, timedelta
4
 
5
  import pandas as pd
6
- from datasets import Dataset, DatasetDict, load_dataset
 
7
  from huggingface_hub import login
8
 
9
- from my_logger import setup_logger
10
- from utilities.pushshift_data import scrape_submissions_by_day, submissions_to_dataframe
11
  from utilities.readme_update import update_readme
12
 
13
  # Set dataset name, path to README.md, and existing dataset details
14
  subreddit = os.environ["SUBREDDIT"]
15
  username = os.environ["USERNAME"]
16
- dataset_name = f"{username}/dataset-creator-{subreddit}"
17
  dataset_readme_path = "README.md"
18
 
19
  # Authenticate with Hugging Face using an auth token
@@ -23,94 +24,6 @@ login(auth_token, add_to_git_credential=True)
23
  logger = setup_logger(__name__)
24
 
25
 
26
- def main(dataset, date_to_fetch):
27
- """
28
- Runs the main data processing function to fetch and process subreddit data for the specified date.
29
-
30
- Args:
31
- dataset (datasets.DatasetDict): The Hugging Face dataset to fetch and process subreddit data for.
32
- date_to_fetch (str): The date to fetch subreddit data for, in YYYY-MM-DD format.
33
-
34
- Returns:
35
- most_recent_date (str): The most recent date in the updated dataset.
36
- """
37
- # Call get_subreddit_day with the calculated date
38
- logger.info(f"Fetching data for {str(date_to_fetch)}")
39
- submissions = scrape_submissions_by_day(subreddit, str(date_to_fetch))
40
- df = submissions_to_dataframe(submissions)
41
- logger.debug(f"Data fetched for {str(date_to_fetch)}")
42
- most_recent_date = date_to_fetch
43
-
44
- # Append DataFrame to split 'all_days' or create new split
45
- if "all_days" in dataset:
46
- logger.debug("Appending data to split 'all_days'")
47
- # Merge the new submissions
48
- old_data = dataset['all_days'].to_pandas()
49
- new_data = pd.concat([old_data, df], ignore_index=True)
50
- if '__index_level_0__' in new_data.columns:
51
- new_data = new_data.drop('__index_level_0__', axis=1)
52
-
53
- # Drop duplicates just in case
54
- new_data = new_data.drop_duplicates(subset=['id'], keep="first")
55
-
56
- # Figure out dates when we restart
57
- old_data_most_recent_date = old_data['date'].max()
58
- old_data_most_recent_date = datetime.strptime(old_data_most_recent_date, '%Y-%m-%d').date()
59
- most_recent_date = max(old_data_most_recent_date, most_recent_date)
60
-
61
- if len(old_data) == len(new_data):
62
- logger.warning("Data in hub is much more recent, using that next!")
63
- return most_recent_date
64
-
65
- # Convert back to dataset
66
- dataset["all_days"] = Dataset.from_pandas(new_data)
67
-
68
- # Update README
69
- update_readme(dataset_name, subreddit, date_to_fetch)
70
- else:
71
- logger.debug("Creating new split 'all_days'")
72
- dataset["all_days"] = Dataset.from_pandas(df)
73
- # Log appending or creating split 'all'
74
- logger.debug("Appended or created split 'all_days'")
75
-
76
- # Push the augmented dataset to the Hugging Face hub
77
- logger.debug(f"Pushing data for {date_to_fetch} to the Hugging Face hub")
78
- dataset.push_to_hub(dataset_name, token=auth_token)
79
- logger.info(f"Processed and pushed data for {date_to_fetch} to the Hugging Face Hub")
80
- return most_recent_date
81
-
82
-
83
- def run_main_continuously():
84
- """
85
- This function runs the given `main_function` continuously, starting from the date specified
86
- in the environment variable "START_DATE" until two days ago. Once it reaches two days ago,
87
- it will wait until tomorrow to start again at the same time as when it started today.
88
- """
89
- start_date_str = os.environ.get("START_DATE")
90
- start_date = datetime.strptime(start_date_str, "%Y-%m-%d").date()
91
-
92
- # Calculate the start time for running the main_function every day.
93
- start_time = datetime.now().time()
94
-
95
- dataset = get_dataset()
96
-
97
- while True:
98
- today = datetime.now().date()
99
- two_days_ago = today - timedelta(days=2)
100
-
101
- if start_date <= two_days_ago:
102
- logger.warning(f"Running main function for date: {start_date}")
103
- most_recent_date = main(dataset, start_date)
104
- start_date = most_recent_date + timedelta(days=1)
105
- else:
106
- tomorrow = today + timedelta(days=1)
107
- now = datetime.now()
108
- start_of_tomorrow = datetime.combine(tomorrow, start_time)
109
- wait_until_tomorrow = (start_of_tomorrow - now).total_seconds()
110
- logger.info(f"Waiting until tomorrow: {wait_until_tomorrow} seconds")
111
- time.sleep(wait_until_tomorrow)
112
-
113
-
114
  def get_dataset():
115
  # Load the existing dataset from the Hugging Face hub or create a new one
116
  try:
@@ -124,5 +37,38 @@ def get_dataset():
124
  return dataset
125
 
126
 
127
- if __name__ == '__main__':
128
- run_main_continuously()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from datetime import datetime, timedelta
4
 
5
  import pandas as pd
6
+ import schedule
7
+ from datasets import DatasetDict, load_dataset, Dataset
8
  from huggingface_hub import login
9
 
10
+ from utilities.data_collator import merge_and_filter_data
11
+ from utilities.my_logger import setup_logger
12
  from utilities.readme_update import update_readme
13
 
14
  # Set dataset name, path to README.md, and existing dataset details
15
  subreddit = os.environ["SUBREDDIT"]
16
  username = os.environ["USERNAME"]
17
+ dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"
18
  dataset_readme_path = "README.md"
19
 
20
  # Authenticate with Hugging Face using an auth token
 
24
  logger = setup_logger(__name__)
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def get_dataset():
28
  # Load the existing dataset from the Hugging Face hub or create a new one
29
  try:
 
37
  return dataset
38
 
39
 
40
+ def main():
41
+ date = datetime.now().strftime('%Y-%m-%d')
42
+ logger.warning(f"Running main function for date: {date}")
43
+ dataset = get_dataset()
44
+
45
+ # Get Latest Data and merge with historic data
46
+ old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
47
+ new_df = merge_and_filter_data(old_df=old_df)
48
+ dataset['train'] = Dataset.from_pandas(new_df, preserve_index=False)
49
+
50
+ # Update README
51
+ update_readme(dataset_name=dataset_name, subreddit=subreddit, latest_date=date)
52
+
53
+ # Push the augmented dataset to the Hugging Face hub
54
+ logger.debug(f"Pushing data for {date} to the Hugging Face hub")
55
+ dataset.push_to_hub(dataset_name, token=auth_token)
56
+ logger.info(f"Processed and pushed data for {date} to the Hugging Face Hub")
57
+
58
+
59
+ def schedule_daily_task():
60
+ """
61
+ Run main() once immediately, then schedule it to run at the same time every day.
62
+ """
63
+ start_time = (datetime.now() + timedelta(seconds=5)).time().strftime('%H:%M') # Now + 5 seconds, truncated to HH:MM
64
+ logger.info(f'Scheduling tasks to run every day at: {start_time}')
65
+ main()
66
+ schedule.every().day.at(start_time).do(main)
67
+
68
+ while True:
69
+ schedule.run_pending()
70
+ time.sleep(1)
71
+
72
+
73
+ if __name__ == "__main__":
74
+ schedule_daily_task()
media/reddit_scraper.drawio.html DELETED
@@ -1,11 +0,0 @@
1
- <!--[if IE]><meta http-equiv="X-UA-Compatible" content="IE=5,IE=9" ><![endif]-->
2
- <!DOCTYPE html>
3
- <html>
4
- <head>
5
- <title>reddit_scraper</title>
6
- <meta charset="utf-8"/>
7
- </head>
8
- <body><div class="mxgraph" style="max-width:100%;border:1px solid transparent;" data-mxgraph="{&quot;highlight&quot;:&quot;#0000ff&quot;,&quot;nav&quot;:true,&quot;resize&quot;:true,&quot;toolbar&quot;:&quot;zoom layers tags lightbox&quot;,&quot;edit&quot;:&quot;_blank&quot;,&quot;xml&quot;:&quot;&lt;mxfile host=\&quot;app.diagrams.net\&quot; modified=\&quot;2023-04-14T12:12:14.014Z\&quot; agent=\&quot;Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36\&quot; etag=\&quot;puEjOIZigDmpONhGThsE\&quot; version=\&quot;21.1.7\&quot; type=\&quot;device\&quot;&gt;\n &lt;diagram name=\&quot;Page-1\&quot; id=\&quot;14ddc1Tw5ZQC4xUkB2ri\&quot;&gt;\n &lt;mxGraphModel dx=\&quot;1034\&quot; dy=\&quot;783\&quot; grid=\&quot;1\&quot; gridSize=\&quot;10\&quot; guides=\&quot;1\&quot; tooltips=\&quot;1\&quot; connect=\&quot;1\&quot; arrows=\&quot;1\&quot; fold=\&quot;1\&quot; page=\&quot;1\&quot; pageScale=\&quot;1\&quot; pageWidth=\&quot;850\&quot; pageHeight=\&quot;1100\&quot; math=\&quot;0\&quot; shadow=\&quot;0\&quot;&gt;\n &lt;root&gt;\n &lt;mxCell id=\&quot;0\&quot; /&gt;\n &lt;mxCell id=\&quot;1\&quot; parent=\&quot;0\&quot; /&gt;\n &lt;mxCell id=\&quot;KhBTRBst3V2Bs5u7l5Na-3\&quot; value=\&quot;\&quot; style=\&quot;edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;\&quot; edge=\&quot;1\&quot; parent=\&quot;1\&quot; source=\&quot;KhBTRBst3V2Bs5u7l5Na-1\&quot; target=\&quot;KhBTRBst3V2Bs5u7l5Na-2\&quot;&gt;\n &lt;mxGeometry relative=\&quot;1\&quot; as=\&quot;geometry\&quot; /&gt;\n &lt;/mxCell&gt;\n &lt;mxCell id=\&quot;KhBTRBst3V2Bs5u7l5Na-7\&quot; value=\&quot;HF API\&quot; style=\&quot;edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];\&quot; vertex=\&quot;1\&quot; connectable=\&quot;0\&quot; parent=\&quot;KhBTRBst3V2Bs5u7l5Na-3\&quot;&gt;\n &lt;mxGeometry x=\&quot;-0.125\&quot; y=\&quot;1\&quot; relative=\&quot;1\&quot; as=\&quot;geometry\&quot;&gt;\n 
&lt;mxPoint as=\&quot;offset\&quot; /&gt;\n &lt;/mxGeometry&gt;\n &lt;/mxCell&gt;\n &lt;mxCell id=\&quot;KhBTRBst3V2Bs5u7l5Na-1\&quot; value=\&quot;HF SPACE&amp;lt;br&amp;gt;&amp;lt;a href=&amp;quot;SPACE_LINK&amp;quot;&amp;gt;SPACE_NAME&amp;lt;/a&amp;gt;\&quot; style=\&quot;rounded=1;whiteSpace=wrap;html=1;fillColor=#ffe6cc;strokeColor=#d79b00;\&quot; vertex=\&quot;1\&quot; parent=\&quot;1\&quot;&gt;\n &lt;mxGeometry x=\&quot;340\&quot; y=\&quot;360\&quot; width=\&quot;160\&quot; height=\&quot;80\&quot; as=\&quot;geometry\&quot; /&gt;\n &lt;/mxCell&gt;\n &lt;mxCell id=\&quot;KhBTRBst3V2Bs5u7l5Na-2\&quot; value=\&quot;HF DATASET &amp;lt;br&amp;gt;&amp;lt;a href=&amp;quot;DATASET_LINK&amp;quot;&amp;gt;DATASET_NAME&amp;lt;/a&amp;gt;\&quot; style=\&quot;rounded=1;whiteSpace=wrap;html=1;fillColor=#ffe6cc;strokeColor=#d79b00;\&quot; vertex=\&quot;1\&quot; parent=\&quot;1\&quot;&gt;\n &lt;mxGeometry x=\&quot;110\&quot; y=\&quot;360\&quot; width=\&quot;160\&quot; height=\&quot;80\&quot; as=\&quot;geometry\&quot; /&gt;\n &lt;/mxCell&gt;\n &lt;mxCell id=\&quot;KhBTRBst3V2Bs5u7l5Na-4\&quot; value=\&quot;&amp;lt;a href=&amp;quot;pushshift.io&amp;quot;&amp;gt;Pushshift.io&amp;lt;/a&amp;gt;&amp;lt;br&amp;gt;Hosts Reddit Data\&quot; style=\&quot;rounded=1;whiteSpace=wrap;html=1;fillColor=#d5e8d4;strokeColor=#82b366;\&quot; vertex=\&quot;1\&quot; parent=\&quot;1\&quot;&gt;\n &lt;mxGeometry x=\&quot;590\&quot; y=\&quot;360\&quot; width=\&quot;160\&quot; height=\&quot;80\&quot; as=\&quot;geometry\&quot; /&gt;\n &lt;/mxCell&gt;\n &lt;mxCell id=\&quot;KhBTRBst3V2Bs5u7l5Na-8\&quot; value=\&quot;\&quot; style=\&quot;endArrow=classic;startArrow=classic;html=1;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;exitX=1;exitY=0.5;exitDx=0;exitDy=0;\&quot; edge=\&quot;1\&quot; parent=\&quot;1\&quot; source=\&quot;KhBTRBst3V2Bs5u7l5Na-1\&quot; target=\&quot;KhBTRBst3V2Bs5u7l5Na-4\&quot;&gt;\n &lt;mxGeometry width=\&quot;50\&quot; height=\&quot;50\&quot; relative=\&quot;1\&quot; 
as=\&quot;geometry\&quot;&gt;\n &lt;mxPoint x=\&quot;470\&quot; y=\&quot;530\&quot; as=\&quot;sourcePoint\&quot; /&gt;\n &lt;mxPoint x=\&quot;520\&quot; y=\&quot;480\&quot; as=\&quot;targetPoint\&quot; /&gt;\n &lt;/mxGeometry&gt;\n &lt;/mxCell&gt;\n &lt;mxCell id=\&quot;KhBTRBst3V2Bs5u7l5Na-9\&quot; value=\&quot;HTTP\&quot; style=\&quot;edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];\&quot; vertex=\&quot;1\&quot; connectable=\&quot;0\&quot; parent=\&quot;KhBTRBst3V2Bs5u7l5Na-8\&quot;&gt;\n &lt;mxGeometry x=\&quot;0.225\&quot; y=\&quot;1\&quot; relative=\&quot;1\&quot; as=\&quot;geometry\&quot;&gt;\n &lt;mxPoint x=\&quot;-9\&quot; y=\&quot;1\&quot; as=\&quot;offset\&quot; /&gt;\n &lt;/mxGeometry&gt;\n &lt;/mxCell&gt;\n &lt;/root&gt;\n &lt;/mxGraphModel&gt;\n &lt;/diagram&gt;\n&lt;/mxfile&gt;\n&quot;}"></div>
9
- <script type="text/javascript" src="https://viewer.diagrams.net/js/viewer-static.min.js"></script>
10
- </body>
11
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
media/reddit_scraper.drawio.png CHANGED
notebooks/data_processing.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/explore.ipynb DELETED
@@ -1,323 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "id": "730ba509",
7
- "metadata": {},
8
- "outputs": [],
9
- "source": [
10
- "from IPython.core.interactiveshell import InteractiveShell\n",
11
- "InteractiveShell.ast_node_interactivity = \"all\""
12
- ]
13
- },
14
- {
15
- "cell_type": "code",
16
- "execution_count": null,
17
- "id": "d9acd4b6",
18
- "metadata": {},
19
- "outputs": [],
20
- "source": [
21
- "from pathlib import Path\n",
22
- "import sys\n",
23
- "proj_dir = Path.cwd().parent\n",
24
- "\n",
25
- "sys.path.append(str(proj_dir))\n"
26
- ]
27
- },
28
- {
29
- "cell_type": "code",
30
- "execution_count": null,
31
- "id": "62452860",
32
- "metadata": {},
33
- "outputs": [],
34
- "source": [
35
- "from utilities.pushshift_data import scrape_submissions_by_day, submissions_to_dataframe, get_post_count_for_day"
36
- ]
37
- },
38
- {
39
- "cell_type": "code",
40
- "execution_count": 4,
41
- "id": "a956a623",
42
- "metadata": {},
43
- "outputs": [
44
- {
45
- "data": {
46
- "application/vnd.jupyter.widget-view+json": {
47
- "model_id": "17df3f2812084d3591e914ffcfd948b0",
48
- "version_major": 2,
49
- "version_minor": 0
50
- },
51
- "text/plain": [
52
- "0it [00:00, ?it/s]"
53
- ]
54
- },
55
- "metadata": {},
56
- "output_type": "display_data"
57
- },
58
- {
59
- "name": "stderr",
60
- "output_type": "stream",
61
- "text": [
62
- "2023-04-12 16:23:59,392 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 20:00:00\n",
63
- "2023-04-12 16:24:03,524 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 14:37:16\n",
64
- "2023-04-12 16:24:08,443 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 05:02:52\n",
65
- "2023-04-12 16:24:13,409 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 00:43:35\n",
66
- "2023-04-12 16:24:17,548 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-02-28 20:28:35\n",
67
- "2023-04-12 16:24:21,490 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-02-28 20:00:48\n",
68
- "2023-04-12 16:24:23,658 - INFO - Finished scraping 4106 submissions in 28.86 seconds\n"
69
- ]
70
- }
71
- ],
72
- "source": [
73
- "subreddit_to_scrape = \"askreddit\"\n",
74
- "day_to_scrape = \"2013-03-01\"\n",
75
- "submissions = scrape_submissions_by_day(subreddit_to_scrape, day_to_scrape)"
76
- ]
77
- },
78
- {
79
- "cell_type": "code",
80
- "execution_count": 5,
81
- "id": "b1cc845b",
82
- "metadata": {},
83
- "outputs": [
84
- {
85
- "data": {
86
- "text/html": [
87
- "<div>\n",
88
- "<style scoped>\n",
89
- " .dataframe tbody tr th:only-of-type {\n",
90
- " vertical-align: middle;\n",
91
- " }\n",
92
- "\n",
93
- " .dataframe tbody tr th {\n",
94
- " vertical-align: top;\n",
95
- " }\n",
96
- "\n",
97
- " .dataframe thead th {\n",
98
- " text-align: right;\n",
99
- " }\n",
100
- "</style>\n",
101
- "<table border=\"1\" class=\"dataframe\">\n",
102
- " <thead>\n",
103
- " <tr style=\"text-align: right;\">\n",
104
- " <th></th>\n",
105
- " <th>permalink</th>\n",
106
- " <th>selftext</th>\n",
107
- " <th>url</th>\n",
108
- " <th>created_utc</th>\n",
109
- " <th>author</th>\n",
110
- " <th>num_comments</th>\n",
111
- " <th>score</th>\n",
112
- " <th>title</th>\n",
113
- " <th>id</th>\n",
114
- " <th>downs</th>\n",
115
- " <th>ups</th>\n",
116
- " </tr>\n",
117
- " </thead>\n",
118
- " <tbody>\n",
119
- " <tr>\n",
120
- " <th>0</th>\n",
121
- " <td>/r/AskReddit/comments/19hbm0/in_the_way_that_p...</td>\n",
122
- " <td>Basically, do other parts of the world have th...</td>\n",
123
- " <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
124
- " <td>2013-03-01 19:58:55</td>\n",
125
- " <td>sjr63</td>\n",
126
- " <td>1</td>\n",
127
- " <td>1</td>\n",
128
- " <td>In the way that popular English and American m...</td>\n",
129
- " <td>19hbm0</td>\n",
130
- " <td>0</td>\n",
131
- " <td>1</td>\n",
132
- " </tr>\n",
133
- " <tr>\n",
134
- " <th>1</th>\n",
135
- " <td>/r/AskReddit/comments/19hblp/could_i_buy_an_an...</td>\n",
136
- " <td></td>\n",
137
- " <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
138
- " <td>2013-03-01 19:58:50</td>\n",
139
- " <td>WeirdPlane</td>\n",
140
- " <td>13</td>\n",
141
- " <td>1</td>\n",
142
- " <td>Could I buy an Android phone without a plan an...</td>\n",
143
- " <td>19hblp</td>\n",
144
- " <td>0</td>\n",
145
- " <td>1</td>\n",
146
- " </tr>\n",
147
- " <tr>\n",
148
- " <th>2</th>\n",
149
- " <td>/r/AskReddit/comments/19hblj/how_do_i_reddit/</td>\n",
150
- " <td>Yeah.\n",
151
- "\n",
152
- "How do I reddit? I don't use or read re...</td>\n",
153
- " <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
154
- " <td>2013-03-01 19:58:47</td>\n",
155
- " <td>xxnovaroxgg</td>\n",
156
- " <td>14</td>\n",
157
- " <td>0</td>\n",
158
- " <td>How do I reddit</td>\n",
159
- " <td>19hblj</td>\n",
160
- " <td>0</td>\n",
161
- " <td>0</td>\n",
162
- " </tr>\n",
163
- " <tr>\n",
164
- " <th>3</th>\n",
165
- " <td>/r/AskReddit/comments/19hbjx/xpost_rsurvival_h...</td>\n",
166
- " <td>My brothers, dad and I have always been huge L...</td>\n",
167
- " <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
168
- " <td>2013-03-01 19:58:07</td>\n",
169
- " <td>tuffstough</td>\n",
170
- " <td>0</td>\n",
171
- " <td>1</td>\n",
172
- " <td>(x-post r/survival) Have any redditors seen Le...</td>\n",
173
- " <td>19hbjx</td>\n",
174
- " <td>0</td>\n",
175
- " <td>1</td>\n",
176
- " </tr>\n",
177
- " <tr>\n",
178
- " <th>4</th>\n",
179
- " <td>/r/AskReddit/comments/19hbjk/female_redditors_...</td>\n",
180
- " <td>I'm curious, guys tend to get asked the usual ...</td>\n",
181
- " <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
182
- " <td>2013-03-01 19:57:58</td>\n",
183
- " <td>redditredditx3</td>\n",
184
- " <td>13</td>\n",
185
- " <td>2</td>\n",
186
- " <td>Female Redditors, which part of the male physi...</td>\n",
187
- " <td>19hbjk</td>\n",
188
- " <td>0</td>\n",
189
- " <td>2</td>\n",
190
- " </tr>\n",
191
- " </tbody>\n",
192
- "</table>\n",
193
- "</div>"
194
- ],
195
- "text/plain": [
196
- " permalink \\\n",
197
- "0 /r/AskReddit/comments/19hbm0/in_the_way_that_p... \n",
198
- "1 /r/AskReddit/comments/19hblp/could_i_buy_an_an... \n",
199
- "2 /r/AskReddit/comments/19hblj/how_do_i_reddit/ \n",
200
- "3 /r/AskReddit/comments/19hbjx/xpost_rsurvival_h... \n",
201
- "4 /r/AskReddit/comments/19hbjk/female_redditors_... \n",
202
- "\n",
203
- " selftext \\\n",
204
- "0 Basically, do other parts of the world have th... \n",
205
- "1 \n",
206
- "2 Yeah.\n",
207
- "\n",
208
- "How do I reddit? I don't use or read re... \n",
209
- "3 My brothers, dad and I have always been huge L... \n",
210
- "4 I'm curious, guys tend to get asked the usual ... \n",
211
- "\n",
212
- " url created_utc \\\n",
213
- "0 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:55 \n",
214
- "1 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:50 \n",
215
- "2 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:47 \n",
216
- "3 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:07 \n",
217
- "4 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:57:58 \n",
218
- "\n",
219
- " author num_comments score \\\n",
220
- "0 sjr63 1 1 \n",
221
- "1 WeirdPlane 13 1 \n",
222
- "2 xxnovaroxgg 14 0 \n",
223
- "3 tuffstough 0 1 \n",
224
- "4 redditredditx3 13 2 \n",
225
- "\n",
226
- " title id downs ups \n",
227
- "0 In the way that popular English and American m... 19hbm0 0 1 \n",
228
- "1 Could I buy an Android phone without a plan an... 19hblp 0 1 \n",
229
- "2 How do I reddit 19hblj 0 0 \n",
230
- "3 (x-post r/survival) Have any redditors seen Le... 19hbjx 0 1 \n",
231
- "4 Female Redditors, which part of the male physi... 19hbjk 0 2 "
232
- ]
233
- },
234
- "execution_count": 5,
235
- "metadata": {},
236
- "output_type": "execute_result"
237
- }
238
- ],
239
- "source": [
240
- "df = submissions_to_dataframe(submissions)\n",
241
- "df.head()"
242
- ]
243
- },
244
- {
245
- "cell_type": "code",
246
- "execution_count": null,
247
- "id": "518addff",
248
- "metadata": {},
249
- "outputs": [],
250
- "source": []
251
- },
252
- {
253
- "cell_type": "code",
254
- "execution_count": null,
255
- "id": "6e5490dc",
256
- "metadata": {},
257
- "outputs": [],
258
- "source": [
259
- "start_date = datetime.strptime(\"2013-01-01\", \"%Y-%m-%d\")\n",
260
- "start_date"
261
- ]
262
- },
263
- {
264
- "cell_type": "code",
265
- "execution_count": null,
266
- "id": "bf13555a",
267
- "metadata": {},
268
- "outputs": [],
269
- "source": [
270
- "df[\"created_utc\"] = pd.to_datetime(df[\"created_utc\"], unit=\"s\").dt.tz_localize(\"UTC\").dt.strftime('%Y-%m-%d %H:%M:%S')"
271
- ]
272
- },
273
- {
274
- "cell_type": "code",
275
- "execution_count": null,
276
- "id": "48e413f3",
277
- "metadata": {},
278
- "outputs": [],
279
- "source": [
280
- "df.head()"
281
- ]
282
- },
283
- {
284
- "cell_type": "code",
285
- "execution_count": null,
286
- "id": "9e83befa",
287
- "metadata": {},
288
- "outputs": [],
289
- "source": [
290
- "df.dtypes"
291
- ]
292
- },
293
- {
294
- "cell_type": "code",
295
- "execution_count": null,
296
- "id": "ba84be68",
297
- "metadata": {},
298
- "outputs": [],
299
- "source": []
300
- }
301
- ],
302
- "metadata": {
303
- "kernelspec": {
304
- "display_name": "Python 3 (ipykernel)",
305
- "language": "python",
306
- "name": "python3"
307
- },
308
- "language_info": {
309
- "codemirror_mode": {
310
- "name": "ipython",
311
- "version": 3
312
- },
313
- "file_extension": ".py",
314
- "mimetype": "text/x-python",
315
- "name": "python",
316
- "nbconvert_exporter": "python",
317
- "pygments_lexer": "ipython3",
318
- "version": "3.9.16"
319
- }
320
- },
321
- "nbformat": 4,
322
- "nbformat_minor": 5
323
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
notebooks/validate.ipynb DELETED
@@ -1,617 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "id": "730ba509",
7
- "metadata": {},
8
- "outputs": [],
9
- "source": [
10
- "from IPython.core.interactiveshell import InteractiveShell\n",
11
- "InteractiveShell.ast_node_interactivity = \"all\""
12
- ]
13
- },
14
- {
15
- "cell_type": "code",
16
- "execution_count": 2,
17
- "id": "d9acd4b6",
18
- "metadata": {},
19
- "outputs": [],
20
- "source": [
21
- "from pathlib import Path\n",
22
- "import sys\n",
23
- "proj_dir = Path.cwd().parent\n",
24
- "\n",
25
- "sys.path.append(str(proj_dir))\n"
26
- ]
27
- },
28
- {
29
- "cell_type": "code",
30
- "execution_count": 4,
31
- "id": "62452860",
32
- "metadata": {},
33
- "outputs": [],
34
- "source": [
35
- "from datasets import load_dataset"
36
- ]
37
- },
38
- {
39
- "cell_type": "code",
40
- "execution_count": 28,
41
- "id": "00affc9a",
42
- "metadata": {},
43
- "outputs": [
44
- {
45
- "data": {
46
- "application/vnd.jupyter.widget-view+json": {
47
- "model_id": "a106bb47c1194b15bc289d2ef24258af",
48
- "version_major": 2,
49
- "version_minor": 0
50
- },
51
- "text/plain": [
52
- "Downloading readme: 0%| | 0.00/804 [00:00<?, ?B/s]"
53
- ]
54
- },
55
- "metadata": {},
56
- "output_type": "display_data"
57
- },
58
- {
59
- "name": "stderr",
60
- "output_type": "stream",
61
- "text": [
62
- "Using custom data configuration derek-thomas--dataset-creator-askreddit-a3c1289ebaf83d16\n"
63
- ]
64
- },
65
- {
66
- "name": "stdout",
67
- "output_type": "stream",
68
- "text": [
69
- "Downloading and preparing dataset None/None to /Users/derekthomas/.cache/huggingface/datasets/derek-thomas___parquet/derek-thomas--dataset-creator-askreddit-a3c1289ebaf83d16/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...\n"
70
- ]
71
- },
72
- {
73
- "data": {
74
- "application/vnd.jupyter.widget-view+json": {
75
- "model_id": "705d55e70bf442f98a51dd0618a5c2c6",
76
- "version_major": 2,
77
- "version_minor": 0
78
- },
79
- "text/plain": [
80
- "Downloading data files: 0%| | 0/1 [00:00<?, ?it/s]"
81
- ]
82
- },
83
- "metadata": {},
84
- "output_type": "display_data"
85
- },
86
- {
87
- "data": {
88
- "application/vnd.jupyter.widget-view+json": {
89
- "model_id": "139220a81674444997f7657a4c2e1a01",
90
- "version_major": 2,
91
- "version_minor": 0
92
- },
93
- "text/plain": [
94
- "Downloading data: 0%| | 0.00/702k [00:00<?, ?B/s]"
95
- ]
96
- },
97
- "metadata": {},
98
- "output_type": "display_data"
99
- },
100
- {
101
- "data": {
102
- "application/vnd.jupyter.widget-view+json": {
103
- "model_id": "1a361406937144cebd4ff6168e56ec3d",
104
- "version_major": 2,
105
- "version_minor": 0
106
- },
107
- "text/plain": [
108
- "Extracting data files: 0%| | 0/1 [00:00<?, ?it/s]"
109
- ]
110
- },
111
- "metadata": {},
112
- "output_type": "display_data"
113
- },
114
- {
115
- "data": {
116
- "application/vnd.jupyter.widget-view+json": {
117
- "model_id": "",
118
- "version_major": 2,
119
- "version_minor": 0
120
- },
121
- "text/plain": [
122
- "Generating all_days split: 0%| | 0/3272 [00:00<?, ? examples/s]"
123
- ]
124
- },
125
- "metadata": {},
126
- "output_type": "display_data"
127
- },
128
- {
129
- "name": "stdout",
130
- "output_type": "stream",
131
- "text": [
132
- "Dataset parquet downloaded and prepared to /Users/derekthomas/.cache/huggingface/datasets/derek-thomas___parquet/derek-thomas--dataset-creator-askreddit-a3c1289ebaf83d16/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.\n"
133
- ]
134
- },
135
- {
136
- "data": {
137
- "application/vnd.jupyter.widget-view+json": {
138
- "model_id": "4df7107473904386aebd66c543858abd",
139
- "version_major": 2,
140
- "version_minor": 0
141
- },
142
- "text/plain": [
143
- " 0%| | 0/1 [00:00<?, ?it/s]"
144
- ]
145
- },
146
- "metadata": {},
147
- "output_type": "display_data"
148
- }
149
- ],
150
- "source": [
151
- "dataset = load_dataset('derek-thomas/dataset-creator-askreddit', download_mode=\"reuse_cache_if_exists\", ignore_verifications=True)"
152
- ]
153
- },
154
- {
155
- "cell_type": "code",
156
- "execution_count": 29,
157
- "id": "ba84be68",
158
- "metadata": {},
159
- "outputs": [
160
- {
161
- "data": {
162
- "text/html": [
163
- "<div>\n",
164
- "<style scoped>\n",
165
- " .dataframe tbody tr th:only-of-type {\n",
166
- " vertical-align: middle;\n",
167
- " }\n",
168
- "\n",
169
- " .dataframe tbody tr th {\n",
170
- " vertical-align: top;\n",
171
- " }\n",
172
- "\n",
173
- " .dataframe thead th {\n",
174
- " text-align: right;\n",
175
- " }\n",
176
- "</style>\n",
177
- "<table border=\"1\" class=\"dataframe\">\n",
178
- " <thead>\n",
179
- " <tr style=\"text-align: right;\">\n",
180
- " <th></th>\n",
181
- " <th>score</th>\n",
182
- " <th>num_comments</th>\n",
183
- " <th>title</th>\n",
184
- " <th>permalink</th>\n",
185
- " <th>selftext</th>\n",
186
- " <th>url</th>\n",
187
- " <th>created_utc</th>\n",
188
- " <th>author</th>\n",
189
- " <th>id</th>\n",
190
- " <th>downs</th>\n",
191
- " <th>ups</th>\n",
192
- " <th>date</th>\n",
193
- " <th>time</th>\n",
194
- " </tr>\n",
195
- " </thead>\n",
196
- " <tbody>\n",
197
- " <tr>\n",
198
- " <th>0</th>\n",
199
- " <td>2</td>\n",
200
- " <td>4</td>\n",
201
- " <td>Reddit, if someone had to describe you to a st...</td>\n",
202
- " <td>/r/AskReddit/comments/15sn6y/reddit_if_someone...</td>\n",
203
- " <td>They would be talking about you without your p...</td>\n",
204
- " <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
205
- " <td>2013-01-01 23:59:40+00:00</td>\n",
206
- " <td>[deleted]</td>\n",
207
- " <td>15sn6y</td>\n",
208
- " <td>0</td>\n",
209
- " <td>2</td>\n",
210
- " <td>2013-01-01</td>\n",
211
- " <td>23:59:40</td>\n",
212
- " </tr>\n",
213
- " <tr>\n",
214
- " <th>1</th>\n",
215
- " <td>5</td>\n",
216
- " <td>24</td>\n",
217
- " <td>What kind of car does the average \\nRedditor d...</td>\n",
218
- " <td>/r/AskReddit/comments/15sn6m/what_kind_of_car_...</td>\n",
219
- " <td>I've always wanted to know what kind of car th...</td>\n",
220
- " <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
221
- " <td>2013-01-01 23:59:31+00:00</td>\n",
222
- " <td>PaytonAdams</td>\n",
223
- " <td>15sn6m</td>\n",
224
- " <td>0</td>\n",
225
- " <td>5</td>\n",
226
- " <td>2013-01-01</td>\n",
227
- " <td>23:59:31</td>\n",
228
- " </tr>\n",
229
- " <tr>\n",
230
- " <th>2</th>\n",
231
- " <td>1</td>\n",
232
- " <td>5</td>\n",
233
- " <td>What movies have made you go back to the theat...</td>\n",
234
- " <td>/r/AskReddit/comments/15sn6b/what_movies_have_...</td>\n",
235
- " <td></td>\n",
236
- " <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
237
- " <td>2013-01-01 23:59:20+00:00</td>\n",
238
- " <td>[deleted]</td>\n",
239
- " <td>15sn6b</td>\n",
240
- " <td>0</td>\n",
241
- " <td>1</td>\n",
242
- " <td>2013-01-01</td>\n",
243
- " <td>23:59:20</td>\n",
244
- " </tr>\n",
245
- " <tr>\n",
246
- " <th>3</th>\n",
247
- " <td>0</td>\n",
248
- " <td>18</td>\n",
249
- " <td>Worst fear(s)?</td>\n",
250
- " <td>/r/AskReddit/comments/15sn4u/worst_fears/</td>\n",
251
- " <td>So what is your worst fear, reddit?</td>\n",
252
- " <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
253
- " <td>2013-01-01 23:58:37+00:00</td>\n",
254
- " <td>[deleted]</td>\n",
255
- " <td>15sn4u</td>\n",
256
- " <td>0</td>\n",
257
- " <td>0</td>\n",
258
- " <td>2013-01-01</td>\n",
259
- " <td>23:58:37</td>\n",
260
- " </tr>\n",
261
- " <tr>\n",
262
- " <th>4</th>\n",
263
- " <td>11</td>\n",
264
- " <td>29</td>\n",
265
- " <td>If there was a type of ink that lasted only fo...</td>\n",
266
- " <td>/r/AskReddit/comments/15sn44/if_there_was_a_ty...</td>\n",
267
- " <td></td>\n",
268
- " <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
269
- " <td>2013-01-01 23:58:15+00:00</td>\n",
270
- " <td>Honeybeard</td>\n",
271
- " <td>15sn44</td>\n",
272
- " <td>0</td>\n",
273
- " <td>11</td>\n",
274
- " <td>2013-01-01</td>\n",
275
- " <td>23:58:15</td>\n",
276
- " </tr>\n",
277
- " <tr>\n",
278
- " <th>...</th>\n",
279
- " <td>...</td>\n",
280
- " <td>...</td>\n",
281
- " <td>...</td>\n",
282
- " <td>...</td>\n",
283
- " <td>...</td>\n",
284
- " <td>...</td>\n",
285
- " <td>...</td>\n",
286
- " <td>...</td>\n",
287
- " <td>...</td>\n",
288
- " <td>...</td>\n",
289
- " <td>...</td>\n",
290
- " <td>...</td>\n",
291
- " <td>...</td>\n",
292
- " </tr>\n",
293
- " <tr>\n",
294
- " <th>3267</th>\n",
295
- " <td>0</td>\n",
296
- " <td>11</td>\n",
297
- " <td>Smokers of Reddit- What are your reasons for s...</td>\n",
298
- " <td>/r/AskReddit/comments/15qzen/smokers_of_reddit...</td>\n",
299
- " <td>I'm very curious as to what causes someone to ...</td>\n",
300
- " <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
301
- " <td>2013-01-01 00:01:36+00:00</td>\n",
302
- " <td>kelsofb</td>\n",
303
- " <td>15qzen</td>\n",
304
- " <td>0</td>\n",
305
- " <td>0</td>\n",
306
- " <td>2013-01-01</td>\n",
307
- " <td>00:01:36</td>\n",
308
- " </tr>\n",
309
- " <tr>\n",
310
- " <th>3268</th>\n",
311
- " <td>1</td>\n",
312
- " <td>4</td>\n",
313
- " <td>Hi</td>\n",
314
- " <td>/r/AskReddit/comments/15qzei/hi/</td>\n",
315
- " <td></td>\n",
316
- " <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
317
- " <td>2013-01-01 00:01:34+00:00</td>\n",
318
- " <td>ImJE5US</td>\n",
319
- " <td>15qzei</td>\n",
320
- " <td>0</td>\n",
321
- " <td>1</td>\n",
322
- " <td>2013-01-01</td>\n",
323
- " <td>00:01:34</td>\n",
324
- " </tr>\n",
325
- " <tr>\n",
326
- " <th>3269</th>\n",
327
- " <td>1</td>\n",
328
- " <td>2</td>\n",
329
- " <td>At the stroke of midnight I was writing this p...</td>\n",
330
- " <td>/r/AskReddit/comments/15qzdx/at_the_stroke_of_...</td>\n",
331
- " <td></td>\n",
332
- " <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
333
- " <td>2013-01-01 00:01:15+00:00</td>\n",
334
- " <td>Sangfroid_Sonder</td>\n",
335
- " <td>15qzdx</td>\n",
336
- " <td>0</td>\n",
337
- " <td>1</td>\n",
338
- " <td>2013-01-01</td>\n",
339
- " <td>00:01:15</td>\n",
340
- " </tr>\n",
341
- " <tr>\n",
342
- " <th>3270</th>\n",
343
- " <td>1</td>\n",
344
- " <td>2</td>\n",
345
- " <td>With all the rape stories in the news, why don...</td>\n",
346
- " <td>/r/AskReddit/comments/15qzdc/with_all_the_rape...</td>\n",
347
- " <td></td>\n",
348
- " <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
349
- " <td>2013-01-01 00:00:58+00:00</td>\n",
350
- " <td>[deleted]</td>\n",
351
- " <td>15qzdc</td>\n",
352
- " <td>0</td>\n",
353
- " <td>1</td>\n",
354
- " <td>2013-01-01</td>\n",
355
- " <td>00:00:58</td>\n",
356
- " </tr>\n",
357
- " <tr>\n",
358
- " <th>3271</th>\n",
359
- " <td>0</td>\n",
360
- " <td>3</td>\n",
361
- " <td>Do beautiful people have low entropy?</td>\n",
362
- " <td>/r/AskReddit/comments/15qzd3/do_beautiful_peop...</td>\n",
363
- " <td>I have been reading about entropy and arrows o...</td>\n",
364
- " <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
365
- " <td>2013-01-01 00:00:53+00:00</td>\n",
366
- " <td>[deleted]</td>\n",
367
- " <td>15qzd3</td>\n",
368
- " <td>0</td>\n",
369
- " <td>0</td>\n",
370
- " <td>2013-01-01</td>\n",
371
- " <td>00:00:53</td>\n",
372
- " </tr>\n",
373
- " </tbody>\n",
374
- "</table>\n",
375
- "<p>3272 rows × 13 columns</p>\n",
376
- "</div>"
377
- ],
378
- "text/plain": [
379
- " score num_comments title \\\n",
380
- "0 2 4 Reddit, if someone had to describe you to a st... \n",
381
- "1 5 24 What kind of car does the average \\nRedditor d... \n",
382
- "2 1 5 What movies have made you go back to the theat... \n",
383
- "3 0 18 Worst fear(s)? \n",
384
- "4 11 29 If there was a type of ink that lasted only fo... \n",
385
- "... ... ... ... \n",
386
- "3267 0 11 Smokers of Reddit- What are your reasons for s... \n",
387
- "3268 1 4 Hi \n",
388
- "3269 1 2 At the stroke of midnight I was writing this p... \n",
389
- "3270 1 2 With all the rape stories in the news, why don... \n",
390
- "3271 0 3 Do beautiful people have low entropy? \n",
391
- "\n",
392
- " permalink \\\n",
393
- "0 /r/AskReddit/comments/15sn6y/reddit_if_someone... \n",
394
- "1 /r/AskReddit/comments/15sn6m/what_kind_of_car_... \n",
395
- "2 /r/AskReddit/comments/15sn6b/what_movies_have_... \n",
396
- "3 /r/AskReddit/comments/15sn4u/worst_fears/ \n",
397
- "4 /r/AskReddit/comments/15sn44/if_there_was_a_ty... \n",
398
- "... ... \n",
399
- "3267 /r/AskReddit/comments/15qzen/smokers_of_reddit... \n",
400
- "3268 /r/AskReddit/comments/15qzei/hi/ \n",
401
- "3269 /r/AskReddit/comments/15qzdx/at_the_stroke_of_... \n",
402
- "3270 /r/AskReddit/comments/15qzdc/with_all_the_rape... \n",
403
- "3271 /r/AskReddit/comments/15qzd3/do_beautiful_peop... \n",
404
- "\n",
405
- " selftext \\\n",
406
- "0 They would be talking about you without your p... \n",
407
- "1 I've always wanted to know what kind of car th... \n",
408
- "2 \n",
409
- "3 So what is your worst fear, reddit? \n",
410
- "4 \n",
411
- "... ... \n",
412
- "3267 I'm very curious as to what causes someone to ... \n",
413
- "3268 \n",
414
- "3269 \n",
415
- "3270 \n",
416
- "3271 I have been reading about entropy and arrows o... \n",
417
- "\n",
418
- " url \\\n",
419
- "0 http://www.reddit.com/r/AskReddit/comments/15s... \n",
420
- "1 http://www.reddit.com/r/AskReddit/comments/15s... \n",
421
- "2 http://www.reddit.com/r/AskReddit/comments/15s... \n",
422
- "3 http://www.reddit.com/r/AskReddit/comments/15s... \n",
423
- "4 http://www.reddit.com/r/AskReddit/comments/15s... \n",
424
- "... ... \n",
425
- "3267 http://www.reddit.com/r/AskReddit/comments/15q... \n",
426
- "3268 http://www.reddit.com/r/AskReddit/comments/15q... \n",
427
- "3269 http://www.reddit.com/r/AskReddit/comments/15q... \n",
428
- "3270 http://www.reddit.com/r/AskReddit/comments/15q... \n",
429
- "3271 http://www.reddit.com/r/AskReddit/comments/15q... \n",
430
- "\n",
431
- " created_utc author id downs ups \\\n",
432
- "0 2013-01-01 23:59:40+00:00 [deleted] 15sn6y 0 2 \n",
433
- "1 2013-01-01 23:59:31+00:00 PaytonAdams 15sn6m 0 5 \n",
434
- "2 2013-01-01 23:59:20+00:00 [deleted] 15sn6b 0 1 \n",
435
- "3 2013-01-01 23:58:37+00:00 [deleted] 15sn4u 0 0 \n",
436
- "4 2013-01-01 23:58:15+00:00 Honeybeard 15sn44 0 11 \n",
437
- "... ... ... ... ... ... \n",
438
- "3267 2013-01-01 00:01:36+00:00 kelsofb 15qzen 0 0 \n",
439
- "3268 2013-01-01 00:01:34+00:00 ImJE5US 15qzei 0 1 \n",
440
- "3269 2013-01-01 00:01:15+00:00 Sangfroid_Sonder 15qzdx 0 1 \n",
441
- "3270 2013-01-01 00:00:58+00:00 [deleted] 15qzdc 0 1 \n",
442
- "3271 2013-01-01 00:00:53+00:00 [deleted] 15qzd3 0 0 \n",
443
- "\n",
444
- " date time \n",
445
- "0 2013-01-01 23:59:40 \n",
446
- "1 2013-01-01 23:59:31 \n",
447
- "2 2013-01-01 23:59:20 \n",
448
- "3 2013-01-01 23:58:37 \n",
449
- "4 2013-01-01 23:58:15 \n",
450
- "... ... ... \n",
451
- "3267 2013-01-01 00:01:36 \n",
452
- "3268 2013-01-01 00:01:34 \n",
453
- "3269 2013-01-01 00:01:15 \n",
454
- "3270 2013-01-01 00:00:58 \n",
455
- "3271 2013-01-01 00:00:53 \n",
456
- "\n",
457
- "[3272 rows x 13 columns]"
458
- ]
459
- },
460
- "execution_count": 29,
461
- "metadata": {},
462
- "output_type": "execute_result"
463
- }
464
- ],
465
- "source": [
466
- "df = dataset['all_days'].to_pandas()\n",
467
- "df"
468
- ]
469
- },
470
- {
471
- "cell_type": "code",
472
- "execution_count": 16,
473
- "id": "28df4b06",
474
- "metadata": {},
475
- "outputs": [
476
- {
477
- "data": {
478
- "text/plain": [
479
- "score Int64\n",
480
- "num_comments Int64\n",
481
- "title string\n",
482
- "permalink string\n",
483
- "selftext string\n",
484
- "url string\n",
485
- "created_utc string\n",
486
- "author string\n",
487
- "id string\n",
488
- "downs Int64\n",
489
- "ups Int64\n",
490
- "dtype: object"
491
- ]
492
- },
493
- "execution_count": 16,
494
- "metadata": {},
495
- "output_type": "execute_result"
496
- }
497
- ],
498
- "source": [
499
- "df.convert_dtypes().dtypes"
500
- ]
501
- },
502
- {
503
- "cell_type": "code",
504
- "execution_count": 18,
505
- "id": "e322b6c0",
506
- "metadata": {},
507
- "outputs": [],
508
- "source": [
509
- "import pandas as pd"
510
- ]
511
- },
512
- {
513
- "cell_type": "code",
514
- "execution_count": 21,
515
- "id": "ed1b06c3",
516
- "metadata": {},
517
- "outputs": [],
518
- "source": [
519
- "df['created_utc'] = pd.to_datetime(df['created_utc'])\n",
520
- "df['date'] = df['created_utc'].dt.date\n",
521
- "df['time'] = df['created_utc'].dt.time"
522
- ]
523
- },
524
- {
525
- "cell_type": "code",
526
- "execution_count": 33,
527
- "id": "ff477737",
528
- "metadata": {},
529
- "outputs": [
530
- {
531
- "data": {
532
- "text/plain": [
533
- "2013-01-01 3272\n",
534
- "Name: date, dtype: int64"
535
- ]
536
- },
537
- "execution_count": 33,
538
- "metadata": {},
539
- "output_type": "execute_result"
540
- }
541
- ],
542
- "source": [
543
- "df.date.value_counts()"
544
- ]
545
- },
546
- {
547
- "cell_type": "code",
548
- "execution_count": 26,
549
- "id": "1d11b967",
550
- "metadata": {},
551
- "outputs": [],
552
- "source": [
553
- "new_df = df.drop_duplicates(subset=['id'], keep=\"first\")"
554
- ]
555
- },
556
- {
557
- "cell_type": "code",
558
- "execution_count": 27,
559
- "id": "eec00dd6",
560
- "metadata": {},
561
- "outputs": [
562
- {
563
- "data": {
564
- "text/plain": [
565
- "<Axes: >"
566
- ]
567
- },
568
- "execution_count": 27,
569
- "metadata": {},
570
- "output_type": "execute_result"
571
- },
572
- {
573
- "data": {
574
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlUAAAGdCAYAAAA7VYb2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA+sklEQVR4nO3de3hU1b3/8U+CuXFJAiqEaJC0FgG5CgoRpVpiIlALlMJBsFqLUGmwYqwXWosBLxRURAGLHItIJZXSI4jAiaRwNCqRSyRVQan6UPEUE34WSSSUZCDr94dndjO5T7ImMzt5v55nnpC916z92WvW3vNlzyVhxhgjAAAANEt4sAMAAAC0BhRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGDBOcEOEEyVlZU6evSoOnXqpLCwsGDHAQAAjWCM0ddff63ExESFh4fO9aE2XVQdPXpUSUlJwY4BAACa4PPPP9eFF14Y7BiONl1UderUSdI3D0psbKy1fj0ej7Zv3660tDRFRERY6zfQ3Jpbcnd2ifzB5ObsknvzuzW35O7sUuvIv2nTJt12223O83ioaNNFlfclv9jYWOtFVfv27RUbG+uqCevW3JK7s0vkDyY3Z5fcm9+tuSV3Z5daT35JIffWndB5IRIAAMDFKKoAAAAs8LuoysvL0w033KDExESFhYVp06ZNzjqPx6P77rtP/fv3V4cOHZSYmKibb75ZR48e9enj+PHjmjZtmmJjYxUfH6/p06fr5MmTPm3ee+89XX311YqOjlZSUpIWL15cI8uGDRvUu3dvRUdHq3///tq2bZu/uwMAAGCF30VVWVmZBg4cqBUrVtRYd+rUKb377rv6zW9+o3fffVcvv/yyDh06pB/84Ac+7aZNm6YDBw4oNzdXW7ZsUV5enmbOnOmsLy0tVVpami666CIVFBToscceU1ZWllatWuW02bVrl2688UZNnz5d+/fv1/jx4zV+/Hh98MEH/u4SAABAs/n9RvXRo0dr9OjRta6Li4tTbm6uz7Lly5friiuu0JEjR9SjRw99+OGHysnJ0d69ezV06FBJ0rJlyzRmzBg9/vjjSkxM1Lp161RRUaHVq1crMjJSl156qQoLC7VkyRKn+Hrqqad0/fXX65577pEkPfTQQ8rNzdXy5cu1cuVKf3cLAACgWQL+6b+SkhKFhYUpPj5ekpSfn6/4+HinoJKk1NRUhYeHa/fu3ZowYYLy8/M1cuRIRUZGOm3S09O1aNEiffXVV+rcubPy8/OVmZnps6309HSflyOrKy8vV3l5ufN7aWmppG9etvR4PBb2Vk5/VX+6hVtzS+7OLpE/mNycXXJvfrfmltydXWo9+UNRQIuq06dP67777tONN97ofGVBUVGRunbt6hvinHPUpUsXFRUVOW2Sk5N92nTr1s1Z17lzZxUVFTnLqrbx9lGbhQsXav78+TWWb9++3fl4pk3Vr9q5hVtzS+7OLpE/mNycXXJvfrfmltydXXJ//lAUsKLK4/Fo8uTJMsbod7/7XaA245e5c+f6XN0qLS1VUlKS0tLSrH9PVW5urq677jpXfQeIW3NL7s4ukT+Y3Jxdcm9+t+aW3J1dah35X3nllWDHqFVAiipvQfXZZ59p586dPgVLQkKCjh075tP+zJkzOn78uBISEpw2xcXFPm28vzfUxru+NlFRUYqKiqqxPCIiIiATK1D9Bppbc0vuzi6RP5jcnF1yb3635pbcnV1yf/5QZP17qrwF1ccff6y//OUvOvfcc33Wp6Sk6MSJEyooKHCW7dy5U5WVlRo2bJjTJi8vz+d109zcXF1yySXq3Lmz02bHjh0+fefm5iolJcX2LgEAADTI76Lq5MmTKiwsVGFhoSTp8OHDKiws1JEjR+TxePSjH/1I+/bt07p163T27FkVFRWpqKhIFRUVkqQ+ffro+uuv14wZM7Rnzx69/fbbmj17tqZMmaLE
xERJ0tSpUxUZGanp06frwIEDWr9+vZ566imfl+7uvPNO5eTk6IknntBHH32krKws7du3T7Nnz7YwLAAAAP7xu6jat2+fBg8erMGDB0uSMjMzNXjwYM2bN0//+Mc/tHnzZv3v//6vBg0apO7duzu3Xbt2OX2sW7dOvXv31qhRozRmzBhdddVVPt9BFRcXp+3bt+vw4cMaMmSI7r77bs2bN8/nu6yuvPJKZWdna9WqVRo4cKD+/Oc/a9OmTerXr19zxgMAAKBJ/H5P1TXXXCNjTJ3r61vn1aVLF2VnZ9fbZsCAAXrzzTfrbTNp0iRNmjSpwe0BAAAEGn/7DwAAwAKKKgAA4Jee928NdoSQRFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGABRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGABRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAA1KLn/VuDHQEuQ1EFAABgAUUVAACoFVfr/ENRBQAAYAFFFQAAgAUUVQAA+KFf1mvBjoAQRVEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQCgWfiCSOAbFFUAEEQUJEDrQVEFAABgAUUVAACABRRVAACgWXrev5WXskVRBQAAYIXfRVVeXp5uuOEGJSYmKiwsTJs2bfJZb4zRvHnz1L17d8XExCg1NVUff/yxT5vjx49r2rRpio2NVXx8vKZPn66TJ0/6tHnvvfd09dVXKzo6WklJSVq8eHGNLBs2bFDv3r0VHR2t/v37a9u2bf7uDgAAgBV+F1VlZWUaOHCgVqxYUev6xYsX6+mnn9bKlSu1e/dudejQQenp6Tp9+rTTZtq0aTpw4IByc3O1ZcsW5eXlaebMmc760tJSpaWl6aKLLlJBQYEee+wxZWVladWqVU6bXbt26cYbb9T06dO1f/9+jR8/XuPHj9cHH3zg7y4BAAA02zn+3mH06NEaPXp0reuMMVq6dKkeeOABjRs3TpK0du1adevWTZs2bdKUKVP04YcfKicnR3v37tXQoUMlScuWLdOYMWP0+OOPKzExUevWrVNFRYVWr16tyMhIXXrppSosLNSSJUuc4uupp57S9ddfr3vuuUeS9NBDDyk3N1fLly/XypUrmzQYANDaed/38vffjg1yEqD18buoqs/hw4dVVFSk1NRUZ1lcXJyGDRum/Px8TZkyRfn5+YqPj3cKKklKTU1VeHi4du/erQkTJig/P18jR45UZGSk0yY9PV2LFi3SV199pc6dOys/P1+ZmZk+209PT6/xcmRV5eXlKi8vd34vLS2VJHk8Hnk8nubuvsPbl80+W4Jbc0vuzi6RP5iCnT2qnWnWtv3NH9XO+NW+sX3621+wx70x6tqvqHD7Y9iS/Bn7Oseg2vJAzKu6hPK4hxljTJPvHBamjRs3avz48ZK+eUluxIgROnr0qLp37+60mzx5ssLCwrR+/Xo9+uijeuGFF3To0CGfvrp27ar58+dr1qxZSktLU3Jysp599lln/cGDB3XppZfq4MGD6tOnjyIjI/XCCy/oxhtvdNo888wzmj9/voqLi2vNm5WVpfnz59dYnp2drfbt2zd1GAAAQAs6deqUpk6dqpKSEsXGxgY7jsPqlapQN3fuXJ+rW6WlpUpKSlJaWprVB8Xj8Sg3N1fXXXedIiIirPUbaG7NLbk7u0T+YAp29n5Zr+mDrPQm39/f/P2yXpOkZm2ztj797a+p497c8fJHXdsasiBHDw2tdOV8l/wb+7rGoPryQMyrung8Hr3yyisB305TWC2qEhISJEnFxcU+V6qKi4s1aNAgp82xY8d87nfmzBkdP37cuX9CQkKNq03e
3xtq411fm6ioKEVFRdVYHhEREZADI1D9Bppbc0vuzi6RP5iClb38bJiV7daWv+f9W2u8d6r8bJjT3pbm7IO/425rvJqzrfLKf4+hW+e71Lj8dY5BteWBmFduZPV7qpKTk5WQkKAdO3Y4y0pLS7V7926lpKRIklJSUnTixAkVFBQ4bXbu3KnKykoNGzbMaZOXl+fzumlubq4uueQSde7c2WlTdTveNt7tAAAAtCS/i6qTJ0+qsLBQhYWFkr55c3phYaGOHDmisLAwzZkzRw8//LA2b96s999/XzfffLMSExOd91316dNH119/vWbMmKE9e/bo7bff1uzZszVlyhQlJiZKkqZOnarIyEhNnz5dBw4c0Pr16/XUU0/5vHR35513KicnR0888YQ++ugjZWVlad++fZo9e3bzRwUBwzfuAqhNU88NbjinuCGjTW1tf6vyu6jat2+fBg8erMGDB0uSMjMzNXjwYM2bN0+SdO+99+qOO+7QzJkzdfnll+vkyZPKyclRdHS008e6devUu3dvjRo1SmPGjNFVV13l8x1UcXFx2r59uw4fPqwhQ4bo7rvv1rx583y+y+rKK69Udna2Vq1apYEDB+rPf/6zNm3apH79+jV5MACgLs19omjLTzRAW+H3e6quueYa1feBwbCwMC1YsEALFiyos02XLl2UnZ1d73YGDBigN998s942kyZN0qRJk+oPDAAhpLb3OQV7u8HKBLQ2/O0/AECbF4g/CMzVybaHogoAAMACiioAaGW4QhLaAnFVDKGBogohg5MM3Ka1Pzm25n0DAoGiCkHByRpAa9Gazmf+7ktr2ncbKKoAAAAsoKgCAIQkroLAbSiqAAAALKCoAgA0CleO3IXHq+VRVAEAWlQoPNmHQobqassUijlRN4oqAAAACyiqENJa+/cAAQBaD4oqAAhB/GeiZTDOsImiCgAAwAKKKuD/8FIjYB/HlDvwONlBURVCmNQAALgXRRUAVMN/cAA0BUUVAIQQCjrAvSiqAABWURhCapvzgKIKVrTFgwcAgoHzbeiiqELQcYIAALQGFFXwCwUQYAfHEmxiPoUGiiq4DicPAI3F+QItiaIKACziSbx2jEtwMf4tg6IK9eJAdBceLwAIHooqAAAACyiqAABAg7gS3jCKKgCAX/jj40DtKKrQZrnpScFNWQGgraKoQqvVmguRxu4bVxRQH+YGYBdFVRD484SI4ONxAAA0BkUVAAAtgP+gtX4UVQAAABZQVAFNwP84EWr6Zb0W7AghjfcXoiVQVLUCnCgAAAg+iiqgDWsrBXlr28/Wtj9Aa0FRhZDEkwYAwG0oqhAQvH+h9eHxBID6UVQBAABYQFHVSnFVAWgcPjUHwBaKKgCtBv+ZgBdzAcFAUeVCnCxaDx5LNBdX2oDQQVGFNodCpn6MD1qSm4tCN2dHYFBUAS2MT0YCCHWco5qGogoO/tcFwO2CXQzwn6a2jaKqBXGghSYeF/fisUMgMK/QVBRVAOACbeUKSGP3sS2MBdyHogoAgAZQxKExKKqANsLNTwq2srt5DACEPutF1dmzZ/Wb3/xGycnJiomJ0be//W099NBDMsY4bYwxmjdvnrp3766YmBilpqbq448/9unn+PHjmjZtmmJjYxUfH6/p06fr5MmTPm3ee+89XX311YqOjlZSUpIWL15se3cAK3gyBxAKOBcFlvWiatGiRfrd736n5cuX68MPP9SiRYu0ePFiLVu2zGmzePFiPf3001q5cqV2796tDh06KD09XadPn3baTJs2TQcOHFBubq62bNmivLw8zZw501lfWlqqtLQ0XXTRRSooKNBjjz2mrKwsrVq1yvYutUr1HVj9sl7jwAMAwE/Wi6pdu3Zp3LhxGjt2rHr27Kkf/ehHSktL0549eyR9c5Vq6dKleuCBBzRu3DgNGDBAa9eu1dGjR7Vp0yZJ0ocffqicnBw999xzGjZsmK666iotW7ZML730ko4ePSpJWrdunSoqKrR69Wpd
eumlmjJlin7xi19oyZIltncJaJS2Voi2xv1tzj61hvFoDfsABNM5tju88sortWrVKv3tb39Tr1699Ne//lVvvfWWU+wcPnxYRUVFSk1Nde4TFxenYcOGKT8/X1OmTFF+fr7i4+M1dOhQp01qaqrCw8O1e/duTZgwQfn5+Ro5cqQiIyOdNunp6Vq0aJG++uorde7cuUa28vJylZeXO7+XlpZKkjwejzwej7Ux8PZVvc+odkYej8f52S/rNX2QlV5jfV2/+9OurvvWe59w4/OzatamZPL2UVu7qu0bWla9n9oy1DbmDY1zbT/ry9yUZbWtq7od7/K65kxdfTflcWjuftSXoaH8zdWcY6Hq8vrmfW1zoqEMtY1vYzPU1b6+5dW3VTV/VLj/x1Rjs9SWzdtHQ49DfVmq5m6oXUPj2Jhxri1zY8altsy1ZW9oTgTyXNPQstr2qbHjVusY+Dl+NgXqPGNDmKn6ZicLKisr9atf/UqLFy9Wu3btdPbsWT3yyCOaO3eupG+uZI0YMUJHjx5V9+7dnftNnjxZYWFhWr9+vR599FG98MILOnTokE/fXbt21fz58zVr1iylpaUpOTlZzz77rLP+4MGDuvTSS3Xw4EH16dOnRrasrCzNnz+/xvLs7Gy1b9/e1hAAAIAAOnXqlKZOnaqSkhLFxsYGO86/Gcv++Mc/mgsvvND88Y9/NO+9955Zu3at6dKli1mzZo0xxpi3337bSDJHjx71ud+kSZPM5MmTjTHGPPLII6ZXr141+j7//PPNM888Y4wx5rrrrjMzZ870WX/gwAEjyRw8eLDWbKdPnzYlJSXO7fPPPzeSzJdffmkqKiqs3crKysymTZtMWVmZz/Jev3q11p/V19f1uz/tGlpW2/r+D2w2mzZtMv0f2NzojPVlqi9DbVnqWtaYDLWNeUP7UNvPpoxlQ2NT275V33Zdc8bG3KhrHPzdj/oyNJS/ubfmHAv1zYWKin/P+7KysiaNVUPHdWPmnb/zs+p9qh639e1vff0091ip73GoK4t3znhz15e5MZmamrkx49LQmDd0fAfqXOPP8V21XWPOlw2Ngb/jZ/NWVlZmsrOzjSRTUlLS7LrFJusv/91zzz26//77NWXKFElS//799dlnn2nhwoW65ZZblJCQIEkqLi72uVJVXFysQYMGSZISEhJ07Ngxn37PnDmj48ePO/dPSEhQcXGxTxvv79421UVFRSkqKqrG8oiICEVERDRhb+tXvd/ys2GKiIio8bP6+rp+96ddQ8tqXV8Z5vxsbMb6MnnHoLZ2tWWpa1n1furK4G1XX7+1La9vW3WOVSOX1bau6naq36euuWjjcWjufjQmQ6COpeZkqrq8vnlf37FZV9+1jW9jMzRnfvrMpyrHrb/HlI0sDT0O9WWpmruhdg1lamrmxoxLbZlry97QnLB9rvHn+K5tn+o7XzY4Bn6OX1th/Y3qp06dUni4b7ft2rVTZWWlJCk5OVkJCQnasWOHs760tFS7d+9WSkqKJCklJUUnTpxQQUGB02bnzp2qrKzUsGHDnDZ5eXk+r63m5ubqkksuqfX9VG7Cm0UB92uLx3Fb3GegKutF1Q033KBHHnlEW7du1d///ndt3LhRS5Ys0YQJEyRJYWFhmjNnjh5++GFt3rxZ77//vm6++WYlJiZq/PjxkqQ+ffro+uuv14wZM7Rnzx69/fbbmj17tqZMmaLExERJ0tSpUxUZGanp06frwIEDWr9+vZ566illZmba3iW4DCd29+MxBOBG1l/+W7ZsmX7zm9/o5z//uY4dO6bExET97Gc/07x585w29957r8rKyjRz5kydOHFCV111lXJychQdHe20WbdunWbPnq1Ro0YpPDxcEydO1NNPP+2sj4uL0/bt25WRkaEhQ4bovPPO07x583y+ywpA6PAWSn//7dggJ2m+nvdvbRX7AcAu60VVp06dtHTpUi1durTONmFhYVqwYIEWLFhQZ5suXbooOzu73m0NGDBA
b775ZlOjAgHHky9jADuYR3AD/vYfAACABRRVAAAAFlBUAQAAWEBRBYSAflmvNfm+tX1Sjk/PAUDLo6gCAACwgKIKLcLGlROuvgAAQhlFFQDUg2IegcYcaz0oqgAACCEUWe5FUQXUgRMbAMAfFFUAgioYxSsFM4BAoKiCVW3tycr2/jbnqxUAAMFFUQVAUtsriAG34RgNfRRVAdbz/q2t+kBozfvWWvGYAUBgUFQBAABYQFEFAABgAUVVK9KUlxp5KQg28UZ7AG0ZRRUAAIAFFFVAI7X2Dx0AAJqHogpAi6M4BdAaUVQBAABYQFEFICRw9QqA21FUAWi1KNQAtCSKKrgaT5oAgFBBUYVGoXgBAKB+FFVAAFCEMgYA2h6KKgAAAAsoqtoArhgAABB4FFUAmoWiHQC+QVEFuAgFDACELooqAAAACyiqXICrEwAAhD6KKgAAAAsoqlyMK1itV8/7twb98Q329v3hpqwAWi+KKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAIYs3oANwE4oqAAAACyiqAAAALKCoAgAAsICiCoBfeJ8TANSOogoAAMACiioAAAALKKoAuEoo/F1EAKgNRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgQUCKqn/84x+66aabdO655yomJkb9+/fXvn37nPXGGM2bN0/du3dXTEyMUlNT9fHHH/v0cfz4cU2bNk2xsbGKj4/X9OnTdfLkSZ827733nq6++mpFR0crKSlJixcvDsTuAAAANMh6UfXVV19pxIgRioiI0H//93/r4MGDeuKJJ9S5c2enzeLFi/X0009r5cqV2r17tzp06KD09HSdPn3aaTNt2jQdOHBAubm52rJli/Ly8jRz5kxnfWlpqdLS0nTRRRepoKBAjz32mLKysrRq1SrbuwQAANCgc2x3uGjRIiUlJen55593liUnJzv/NsZo6dKleuCBBzRu3DhJ0tq1a9WtWzdt2rRJU6ZM0YcffqicnBzt3btXQ4cOlSQtW7ZMY8aM0eOPP67ExEStW7dOFRUVWr16tSIjI3XppZeqsLBQS5Ys8Sm+AAAAWoL1omrz5s1KT0/XpEmT9MYbb+iCCy7Qz3/+c82YMUOSdPjwYRUVFSk1NdW5T1xcnIYNG6b8/HxNmTJF+fn5io+PdwoqSUpNTVV4eLh2796tCRMmKD8/XyNHjlRkZKTTJj09XYsWLdJXX33lc2XMq7y8XOXl5c7vpaWlkiSPxyOPx2NtDLx9eTweRbUzPv+u7adXXev9aVd9W7W1r3NZuPH5aStLfe0ak6+uLFXbVR/zQGVuTJbG7G+NfqqMfVPHL5CZGzs/vfkbm6Wx87PB8Wtm5saMkb/zs8WO72pzp6Uy+zs/65x34Q3f1/b4NXZ+1pe5tuyNnZ+BPL6rauhx8PtYaca5xqZA9GlLmDHG2OwwOjpakpSZmalJkyZp7969uvPOO7Vy5Urdcsst2rVrl0aMGKGjR4+qe/fuzv0mT56ssLAwrV+/Xo8++qheeOEFHTp0yKfvrl27av78+Zo1a5bS0tKUnJysZ5991ll/8OBBXXrppTp48KD69OlTI1tWVpbmz59fY3l2drbat29vawgAAEAAnTp1SlOnTlVJSYliY2ODHeffjGUREREmJSXFZ9kdd9xhhg8fbowx5u233zaSzNGjR33aTJo0yUyePNkYY8wjjzxievXqVaPv888/3zzzzDPGGGOuu+46M3PmTJ/1Bw4cMJLMwYMHa812+vRpU1JS4tw+//xzI8l8+eWXpqKiwtqtrKzMbNq0yZSVlZlev3rV9PrVq6aioqLOn96bjXa13aexy/o/sNls2rTJ9H9gs9UsDfXT0LLG3Lf6mAcqc2OyNGZ/qy+vOvZNHb9AZm5o3Lzj781ve342NH7NOaa8Y1/b3GnO/Gyp47v63GmpzM09vqvPmabOz2Ac37Udr/7Mz0Ae37WNRV3Ha1lZmV/j25zxs3krKysz2dnZRpIpKSlp
WrESINZf/uvevbv69u3rs6xPnz76r//6L0lSQkKCJKm4uNjnSlVxcbEGDRrktDl27JhPH2fOnNHx48ed+yckJKi4uNinjfd3b5vqoqKiFBUVVWN5RESEIiIiGruLjRYREaHys2E+/67tp1dd6/1pV31btbWvc1llmPPTZpb62jUmX13jV7Vd9TEPVObGZmlof2v0U2Xsmzp+gczc2Pnpzd/YLI2dnw2OXzMzN2aM/J2fLXZ8V5s7LZXZ3/lZ57yrbPi+tsevsfOzvsy1ZW/s/Azk8V1VQ49DREREgxlsnWvaCuuf/hsxYkSNl+3+9re/6aKLLpL0zZvWExIStGPHDmd9aWmpdu/erZSUFElSSkqKTpw4oYKCAqfNzp07VVlZqWHDhjlt8vLyfF5bzc3N1SWXXFLr+6kAAAACyXpRddddd+mdd97Ro48+qk8++UTZ2dlatWqVMjIyJElhYWGaM2eOHn74YW3evFnvv/++br75ZiUmJmr8+PGSvrmydf3112vGjBnas2eP3n77bc2ePVtTpkxRYmKiJGnq1KmKjIzU9OnTdeDAAa1fv15PPfWUMjMzbe8SAABAg6y//Hf55Zdr48aNmjt3rhYsWKDk5GQtXbpU06ZNc9rce++9Kisr08yZM3XixAldddVVysnJcd7kLknr1q3T7NmzNWrUKIWHh2vixIl6+umnnfVxcXHavn27MjIyNGTIEJ133nmaN28eX6cAAACCwnpRJUnf//739f3vf7/O9WFhYVqwYIEWLFhQZ5suXbooOzu73u0MGDBAb775ZpNzAgAA2MLf/gMAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAABaiZ73bw12hDaNogoAAMACiioAAAALKKoAAECj8RJj3SiqAAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgCghQT7Db7B3j7Q2lFUAQAAWEBRBQAAYAFFFQAA8BsvJ9dEUQUAAGABRRUABAn/0wdaF4oqAAgxNootCjag5VFUAQBCSlMLQgpJBNs5wQ4AAABClz/FalsvbLlSBQBoUxrzxN/WiwM0DUUVAMAVAlXo1NcvxRX8QVEFAG1YWysaWnp/29r4tnUUVQgJnHjgBnXNU+avHW4ex8Zmd8M+ejO6IWuo4Y3qAADrRUFt7Xrev1V//+1Yv3I1pu+WevL3bieqXYtsDi7ElSoAAAALuFIFAEHmvQJi4ypOc/GST9vFY998XKkCgCbgCcgX48EYgCtVaAV63r9VUe2MFl8R7CQAGhLqhUeo5AuVHPAPV6oAwA+h9GQXSllsa837htaLogquxAkXgdbz/q3MM7QYG19AWtcnLpnHLYeiCiElWAc/Jx2gdhwbjAEaj6IqyFr7wdra988teBwQKMwt4N94o3oI4KRkT7+s1ySFNdiuNY95a963YGFMEWqYk6GJK1UAAKDJKPD+jaIKABBUPCmjteDlPzRboE6InGjr/1tpjE/o4rGxh7H0xXiENq5UoU7+Hrwt8Vfa2+IJpS3uM0IP87BlhPo4h3q+YKOochl/JnRLFDmhrrn7ZuO7YxCa6vr+nlB6XL1ZQikTgLpRVLUxbngiQcvgMQeCi2Ow9aGoQoM48NFcLTmHmK8AgoWiCkCbQLGFlsR8a5soquA3/paUf+oaq2COIY9f28LjDbQMvlIhRIXiSTAUMwFtVSi8N5JzAuCLK1UB9M2fTGkZbfHkxh9fBhrGfG0deBzdgStVLYwDA27DnAWAxgn4larf/va3CgsL05w5c5xlp0+fVkZGhs4991x17NhREydOVHFxsc/9jhw5orFjx6p9+/bq2rWr7rnnHp05c8anzeuvv67LLrtMUVFRuvjii7VmzZpA7w7agOZ+OWlbuoJGwQWgNm313BDQomrv3r169tlnNWDAAJ/ld911l1599VVt2LBBb7zxho4ePaof/vCHzvqzZ89q7Nixqqio0K5du/TCCy9ozZo1mjdvntPm8OHD
Gjt2rK699loVFhZqzpw5uu222/Taay33kpsbtNWJ3Rq1xGPZWuYLH6aA1HrmM9wjYEXVyZMnNW3aNP3nf/6nOnfu7CwvKSnR73//ey1ZskTf+973NGTIED3//PPatWuX3nnnHUnS9u3bdfDgQb344osaNGiQRo8erYceekgrVqxQRUWFJGnlypVKTk7WE088oT59+mj27Nn60Y9+pCeffDJQuwQgiHiCRGMxV+ofg5Z8v29bE7D3VGVkZGjs2LFKTU3Vww8/7CwvKCiQx+NRamqqs6x3797q0aOH8vPzNXz4cOXn56t///7q1q2b0yY9PV2zZs3SgQMHNHjwYOXn5/v04W1T9WXG6srLy1VeXu78XlpaKknyeDzyeDzN3WWHt6+ocOOzLKpd3b831M7p8//WVV1evZ+mLvPmrZq7of30dx+qL6+6T/7mr6p69rq2VX17/u5DbRq7H/W2q5K/oX21+ZjXlau+x6zexzLcd67Wl6G2dnUtqz7vJemSX29RVLv6j4Xa9rmued/QuDdGbdvyPoFFtau7XUPL62vj73Hrz7aawu9552fuhvr1p11Tx8Cf4zVQGerro6FtVZ3zXoGYC9W3Ecp92hJmjLE+ki+99JIeeeQR7d27V9HR0brmmms0aNAgLV26VNnZ2br11lt9ihtJuuKKK3Tttddq0aJFmjlzpj777DOfl/JOnTqlDh06aNu2bRo9erR69eqlW2+9VXPnznXabNu2TWPHjtWpU6cUExNTI1dWVpbmz59fY3l2drbat29vcQQAAECgnDp1SlOnTlVJSYliY2ODHcdh/UrV559/rjvvvFO5ubmKjo623X2zzJ07V5mZmc7vpaWlSkpKUlpamtUHxePxKDc3V7/ZF67yyjBJ0gdZ6T6XXKv/XtfyhtrVtr6py6LCjR4aWumTuyn82Tep5qXoxuavqnr2urZVfXvNfRz87aP69r3LhyzIcfIXzLu+3n21+ZjXlauu+VXX8v2//p4z56vmry9D1e021K6x/TXmZY265v11112nwY/sbPD+/vTd2EzN0dTjtiWy1betqnOmOeebpm6/vuUNsXWubE6GpvTpXV51zkdEREgK3EuCVc+9tng8Hr3yyivW+7XBelFVUFCgY8eO6bLLLnOWnT17Vnl5eVq+fLlee+01VVRU6MSJE4qPj3faFBcXKyEhQZKUkJCgPXv2+PTr/XRg1TbVPzFYXFys2NjYWq9SSVJUVJSioqJqLI+IiHAmlk3llWEqPxvmbMP779p+r2t5Q+1qW9+cZdVzN4U/+yapWVmr82ava1vVt9fcx8HfPqpv32n3fyfm8sraH9OGttfcZdVzfec32yXVzFLXcqePavnry1B1uw21a2x/jZkjjX28msKfeWObv8dtS2ard94183zT1O1L/57PTWUjeyAeh8bO8cGP7NShR74vqeZ5yWaWtsT6G9VHjRql999/X4WFhc5t6NChmjZtmvPviIgI7dixw7nPoUOHdOTIEaWkpEiSUlJS9P777+vYsWNOm9zcXMXGxqpv375Om6p9eNt4+0DrEuw3ngZ7+21ZQ2Mfyo9NKGcDYJ/1K1WdOnVSv379fJZ16NBB5557rrN8+vTpyszMVJcuXRQbG6s77rhDKSkpGj58uCQpLS1Nffv21Y9//GMtXrxYRUVFeuCBB5SRkeFcabr99tu1fPly3XvvvfrpT3+qnTt36k9/+pO2bg3Nk1hTT65NuR8ncrgB87Tt4LFGWxGUb1R/8sknFR4erokTJ6q8vFzp6el65plnnPXt2rXTli1bNGvWLKWkpKhDhw665ZZbtGDBAqdNcnKytm7dqrvuuktPPfWULrzwQj333HNKT7f/+i0ANAbFA9C2tUhR9frrr/v8Hh0drRUrVmjFihV13ueiiy7Stm3b6u33mmuu0f79+21EdC1O4m0PjzlsYB4B9vEHlV2qNZ8QW/O+tQW2Hj/mAdA0HDvBQ1EVYlrzwdCa9w0AQh3n4MCjqAoS
JjeAtow/lYLWiKIKrRJFKwCgpVFUAQAAWBCUr1RA68ZVotaDxxIIrGAfY8HefmvDlSqgmdriSakt7jMANIQrVWhTKAZqctOYuCkrQgfzBi2FogpB0xZPdG1xnwGgreDlPyDIAlVoUcABQMuiqEKbFoqFRyhmaqrWtC8A0BBe/kObEApP7oHMEAr7BwBtHVeqgACh0AGAtoWiqo3gCd4/jJd/QuFPjvCYAQg2iiqgDaHwqFsoFIYA3I2iCkCLorAD0FpRVAEAAFhAUQWgTeFKGYBAoagCAACwgKIKAADAAoqqNoyXQQAAsIeiCgAAwAKKKgBBx1VTAK0BRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGABRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGABRRUAAIAFFFUAAAAWWC+qFi5cqMsvv1ydOnVS165dNX78eB06dMinzenTp5WRkaFzzz1XHTt21MSJE1VcXOzT5siRIxo7dqzat2+vrl276p577tGZM2d82rz++uu67LLLFBUVpYsvvlhr1qyxvTsAAACNYr2oeuONN5SRkaF33nlHubm58ng8SktLU1lZmdPmrrvu0quvvqoNGzbojTfe0NGjR/XDH/7QWX/27FmNHTtWFRUV2rVrl1544QWtWbNG8+bNc9ocPnxYY8eO1bXXXqvCwkLNmTNHt912m1577TXbuwQAANCgc2x3mJOT4/P7mjVr1LVrVxUUFGjkyJEqKSnR73//e2VnZ+t73/ueJOn5559Xnz599M4772j48OHavn27Dh48qL/85S/q1q2bBg0apIceekj33XefsrKyFBkZqZUrVyo5OVlPPPGEJKlPnz5666239OSTTyo9Pd32bgEAANTLelFVXUlJiSSpS5cukqSCggJ5PB6lpqY6bXr37q0ePXooPz9fw4cPV35+vvr3769u3bo5bdLT0zVr1iwdOHBAgwcPVn5+vk8f3jZz5sypM0t5ebnKy8ud30tLSyVJHo9HHo+n2fvq5e0rKtxY67MlePO6Lbfk7uwS+YPJzdkl9+Z3a27J3dmlls1v87k1kH3aEtCiqrKyUnPmzNGIESPUr18/SVJRUZEiIyMVHx/v07Zbt24qKipy2lQtqLzrvevqa1NaWqp//etfiomJqZFn4cKFmj9/fo3l27dvV/v27Zu2k/V4aGil9T5bgltzS+7OLpE/mNycXXJvfrfmltydXWqZ/Nu2bQv4NkJJQIuqjIwMffDBB3rrrbcCuZlGmzt3rjIzM53fS0tLlZSUpLS0NMXGxlrbjsfjUW5urn6zL1zllWHW+g20qHCjh4ZWui635O7sEvmDyc3ZJffmd2tuyd3ZpZbN/0GW/bfjeDwevfLKK9b7tSFgRdXs2bO1ZcsW5eXl6cILL3SWJyQkqKKiQidOnPC5WlVcXKyEhASnzZ49e3z68346sGqb6p8YLC4uVmxsbK1XqSQpKipKUVFRNZZHREQoIiLC/51sQHllmMrPuu+Ac2tuyd3ZJfIHk5uzS+7N79bckruzSy2TPxDPraHM+qf/jDGaPXu2Nm7cqJ07dyo5Odln/ZAhQxQREaEdO3Y4yw4dOqQjR44oJSVFkpSSkqL3339fx44dc9rk5uYqNjZWffv2ddpU7cPbxtsHAABAS7J+pSojI0PZ2dl65ZVX1KlTJ+c9UHFxcYqJiVFcXJymT5+uzMxMdenSRbGxsbrjjjuUkpKi4cOHS5LS0tLUt29f/fjHP9bixYtVVFSkBx54QBkZGc6Vpttvv13Lly/Xvffeq5/+9KfauXOn/vSnP2nr1q22dwkAAKBB1q9U/e53v1NJSYmuueYade/e3bmtX7/eafPkk0/q+9//viZOnKiRI0cq
ISFBL7/8srO+Xbt22rJli9q1a6eUlBTddNNNuvnmm7VgwQKnTXJysrZu3arc3FwNHDhQTzzxhJ577jm+TgEAAASF9StVxjT8Ec3o6GitWLFCK1asqLPNRRdd1OCnBq655hrt37/f74wAAAC28bf/AAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAAAACyiqAAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAAAACyiqAAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAAAACyiqAAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAAAACyiqAAAALHB9UbVixQr17NlT0dHRGjZsmPbs2RPsSAAAoA1ydVG1fv16ZWZm6sEHH9S7776rgQMHKj09XceOHQt2NAAA0Ma4uqhasmSJZsyYoVtvvVV9+/bVypUr1b59e61evTrY0QAAQBtzTrADNFVFRYUKCgo0d+5cZ1l4eLhSU1OVn59f633Ky8tVXl7u/F5SUiJJOn78uDwej7VsHo9Hp06d0jmecJ2tDLPWb6CdU2l06lSl63JL7s4ukT+Y3Jxdcm9+t+aW3J1datn8//znP6336X2OlSRjjPX+m8W41D/+8Q8jyezatctn+T333GOuuOKKWu/z4IMPGkncuHHjxo0bt1Zw+/zzz1ui5Gg0116paoq5c+cqMzPT+b2yslLHjx/Xueeeq7Awe9V6aWmpkpKS9Pnnnys2NtZav4Hm1tySu7NL5A8mN2eX3Jvfrbkld2eXWk/+gwcPKjExMdhxfLi2qDrvvPPUrl07FRcX+ywvLi5WQkJCrfeJiopSVFSUz7L4+PhARVRsbKwrJ6xbc0vuzi6RP5jcnF1yb3635pbcnV1yf/4LLrhA4eGh9dbw0Erjh8jISA0ZMkQ7duxwllVWVmrHjh1KSUkJYjIAANAWufZKlSRlZmbqlltu0dChQ3XFFVdo6dKlKisr06233hrsaAAAoI1xdVH1H//xH/p//+//ad68eSoqKtKgQYOUk5Ojbt26BTVXVFSUHnzwwRovNYY6t+aW3J1dIn8wuTm75N78bs0tuTu7RP5ACjMm1D6PCAAA4D6ufU8VAABAKKGoAgAAsICiCgAAwAKKKgAAAAvaTFG1cOFCXX755erUqZO6du2q8ePH69ChQz5tTp8+rYyMDJ177rnq2LGjJk6cWOPLRX/xi19oyJAhioqK0qBBg2ps59ChQ7r22mvVrVs3RUdH61vf+pYeeOCBRv1twRUrVqhnz56Kjo7WsGHDtGfPHp/csbGxOu+889SxY0eFhYXpxIkTIZvba+HCheratavCw8MVFhamqKgojRo1Sh999JHTJlTze8f+nHPOUVhYmM/t9ttvD+nsknTvvffWyO29bdiwIeTzL1y4UAMGDFBERITCw8MVERGh0aNH++QLZv68vDzdcMMNSkxMVFhYmDZt2uST/fLLL1dMTIwiIyMVGRmpsLAwFRYWWs1e1SeffKJOnTo1+guNg3m+sZ1b+veYR0ZGql27dmrXrp06d+6scePGOeebUM3uzR8bG1vnuSbU87fU+SZQ+SXp008/1YQJE3T++ecrNjZWkydPrpGvQcH+OzktJT093Tz//PPmgw8+MIWFhWbMmDGmR48e5uTJk06b22+/3SQlJZkdO3aYffv2meHDh5srr7zSp5877rjDLF++3Pz4xz82AwcOrLGdTz/91KxevdoUFhaav//97+aVV14xXbt2NXPnzq0330svvWQiIyPN6tWrzYED
B8yMGTNMfHy8ufbaa53cv/zlL80ll1xi4uPjjSTz1VdfhWzu4uJiZ9xvueUWs2bNGrN161YzYsQIExMTYy644AJz5syZkM7vnTNDhw41P/zhD82oUaPMBRdcYD799FNTUlIS0tmNMSYtLc0sXbrUvP766+Yvf/mLGTVqlImLizMdOnQwX3/9dcjnT01NNeeff74ZNWqU2bBhg7n66qtNTEyMueyyy8zZs2eDnn/btm3m17/+tXn55ZeNJLNx40ZnnXfuPProo2bWrFmmf//+RvL9W6U2sntVVFSYoUOHmtGjR5u4uLh6cxsT/PON7dxVj9cHH3zQ/P73vzfXXnutSUhIMKNHjzZJSUnmzJkzIZvdmG/mzCWXXGImTpzoHK8XXHCBOXr0qNNHKOdvqfNNoPKfPHnSfOtb3zITJkww7733nnnvvffMuHHjzOWXX+6cbxqjzRRV1R07dsxIMm+88YYxxpgTJ06YiIgIs2HDBqfNhx9+aCSZ/Pz8Gvd/8MEH633Aq7rrrrvMVVddVW+bK664wmRkZDi/nz171iQmJpqFCxfWmtt7knNL7tryf/LJJ67I/93vftfceeedrp0zXt78Y8aMcUX+1157zYSHhzsFbNW5k5ubG/T8VVUvqqrbt2+fkWSee+45Y4z9sb/33nvNTTfdZJ5//vlGPcGEyvkmULmrZl+9erWRZPbv3x/y2b3nmqr5A3W+CeTYV80fqPON7fzVzzfezGFhYSY3N7fB/r3azMt/1ZWUlEiSunTpIkkqKCiQx+NRamqq06Z3797q0aOH8vPzm7ydTz75RDk5Ofrud79bZ5uKigoVFBT4bDs8PFypqak1tu3N7eWW3F5ffPGFpG/+ZlNSUpJr8q9bt069evWSJK1du1anTp1yTXavt99+W5J04403Sgr9uVNeXu68ZCz9e+6Hh4frrbfeCmp+f3399deSpLi4OEl2x37nzp3asGGDVqxY0aj2oXK+CWTuqtl37typ5ORkFRcXuyL7unXrdN555+nKK6+UJMXExEhy19hLgT3fBCJ/9fONJEVHRzvnm8Zqk0VVZWWl5syZoxEjRqhfv36SpKKiIkVGRtZ4bbZbt24qKiryextXXnmloqOj9Z3vfEdXX321FixYUGfbL7/8UmfPnq3xTfDVt+3N7c3sltyS9Mwzz6hDhw4aOHCgYmJi9MYbbygyMtIV+adOnaq1a9dqwIAB6tWrl3JycnTTTTe5IruXd+7ExMTopptukhT6c2f48OHq0KGD7rvvPp08eVJ33HGHunfvrsrKSn3xxRdBze+PyspKPfTQQ5Kkiy++WJK9sf/nP/+pn/zkJ1qzZk2j/zBuKJxvAplbkpYvX67evXtLkvbu3avc3Fz985//DPnsU6dO1YsvvqgdO3aoY8eOioyM1MKFCyW5Z+ylwJ5vApW/6vnm1KlTKisr0y9/+UudPXvWuRjQGG2yqMrIyNAHH3ygl156KWDbWL9+vd59911lZ2dr69atevzxxyVJb775pjp27Ojc1q1b1+g+vbnnzZsXqNgByS1J06ZN07hx45SQkKARI0Zo8uTJOn36tCvyz5w5U5s3b9Znn32mHTt2aO3atdq4caP/b2AMQnavn/3sZzpy5Ijuvvtuq5mrsp3//PPP14YNG/Tqq6+qU6dOysnJ0fDhw3XZZZcF5C/TB2r8MzIyanwoxpYZM2Zo6tSpGjlyZK3rQ/V8E8jckrR//34lJCRow4YN6tWrlyZPnqyKigob0QOafebMmUpPT9fKlSv11VdfOeeaTz/91Ep2KfBjLwX2fBOo/FXPNx07dlRcXJxOnDjh9/nG1X/7rylmz56tLVu2KC8vTxdeeKGzPCEhQRUVFTpx4oRPJV1cXKyEhAS/t5OUlCRJ6tu3r86ePauZM2fq7rvv1tChQ30+AdStWzdFRUWpXbt2NZ6kq267au7PPvvMNbm9fv3rX+vNN9/Url27dMEFF6hz587auHGjK/JXnzOdO3eW9M0l
5VDP7s3/8ssvq127dvrFL37hLHfD2KelpWn06NHauHGjtm3bpoEDByohIUHf+ta3gpq/sbxz549//KPPk4Ct7Dt37tTmzZudAtAYo8rKSp1zzjlatWqVbrzxxpA83wQqtzf79u3b9eabbyo5OVk/+MEP1LlzZ33yySchn92b3zv2Xbt2lfTNS9JuGHtv/kCebwKZPy0tTZ9++qm+/PJLnXPOOYqPj3fON43VZq5UGWM0e/Zsbdy40XmNvaohQ4YoIiJCO3bscJYdOnRIR44cUUpKSrO2XVlZKY/Ho8rKSsXExOjiiy92bp06dVJkZKSGDBnis+3Kykrt2LFDw4cPd2Vu77ZrG3fzzQckVF5eHtL565oz3gP2qquuCtnsku/Yf/vb39a4ceN0/vnnO+1Deeyr53/99dc1cOBA7dy5U8eOHdMPfvCDoOZvSPW54y3YvGxlz8/PV2FhoXNbsGCBOnXqpMLCQk2YMCFkzze2c9d3vHrPN4mJiSGb3Zuzen7vuaZ79+4hPfbV8wfyfBOo/FWdd955io+P9znfNFqj39LucrNmzTJxcXHm9ddfN1988YVzO3XqlNPm9ttvNz169DA7d+40+/btMykpKSYlJcWnn48//tjs37/f/OxnPzO9evUy+/fvN/v37zfl5eXGGGNefPFFs379enPw4EHz6aefmvXr15vExEQzbdq0evO99NJLJioqyqxZs8YcPHjQzJw508THx5tbbrnFyf3Xv/7V5Obmmscff9xIMnl5eWb//v3mJz/5ScjlLioqMsYYM23aNBMdHW2effZZs3fvXrN582Zz3XXXmc6dOzsfZQ3FcS8qKjKzZs0ynTp1MrfeeqvJyckxe/bsMWvWrDE9e/Y0I0eODOnsxvx7zr/44otGklm3bp0r5nzV/DExMWbFihUmPz/fLFu2zMTHx5s77rgjJPJ//fXXTl+SzJIlS8z+/fvNZ5995oz95s2bTW5urvnDH/5gJJm1a9ea/fv3my+++MJK9uoa+0moYJ9vbOeuerzedtttJicnxznfjBkzxnTp0sUUFxeH5Jh75/vUqVOdc6X3XNOjRw8zYsQIp49Qzt9S55tA5TfGmNWrV5v8/HzzySefmD/84Q+mS5cuJjMzs8G+q2ozRZX+72PB1W/PP/+80+Zf//qX+fnPf246d+5s2rdvbyZMmGC++OILn36++93v1trP4cOHjTHfPHCXXXaZ6dixo+nQoYPp27evefTRR82//vWvBjMuW7bM9OjRw0RGRporrrjCvPPOO3XmrnpbtWpVyOVuaNwfffTRkB73+rKPHj3a+dhtqGavL3+oz/mG8q9evTok8v/P//xPrf3ecsstDR6zDz74oJXs1TX2CaausW+p843t3MbUPV+GDx9uPvroI2OMnfkSiOz15X/mmWecNm7Mb/t8E6j8xhhz3333mW7dupmIiAjzne98xzzxxBOmsrKyUX17hRljjAAAANAsbeY9VQAAAIFEUQUAAGABRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABb8f58N2gQoK7TXAAAAAElFTkSuQmCC\n",
575
- "text/plain": [
576
- "<Figure size 640x480 with 1 Axes>"
577
- ]
578
- },
579
- "metadata": {},
580
- "output_type": "display_data"
581
- }
582
- ],
583
- "source": [
584
- "new_df.date.hist(bins=400)"
585
- ]
586
- },
587
- {
588
- "cell_type": "code",
589
- "execution_count": null,
590
- "id": "1acf60dc",
591
- "metadata": {},
592
- "outputs": [],
593
- "source": []
594
- }
595
- ],
596
- "metadata": {
597
- "kernelspec": {
598
- "display_name": "Python 3 (ipykernel)",
599
- "language": "python",
600
- "name": "python3"
601
- },
602
- "language_info": {
603
- "codemirror_mode": {
604
- "name": "ipython",
605
- "version": 3
606
- },
607
- "file_extension": ".py",
608
- "mimetype": "text/x-python",
609
- "name": "python",
610
- "nbconvert_exporter": "python",
611
- "pygments_lexer": "ipython3",
612
- "version": "3.10.8"
613
- }
614
- },
615
- "nbformat": 4,
616
- "nbformat_minor": 5
617
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,9 +1,9 @@
1
- praw==7.7.0
2
- gradio==3.23
3
  nbdev==2.3.12
4
- datasets==2.11.0
5
  requests==2.28.2
6
  loguru==0.7.0
7
  rich==13.3.4
8
- gradio==3.23.0
9
- supervisor==4.2.5
 
1
+ praw==7.7.1
2
+ gradio==3.50.2
3
  nbdev==2.3.12
4
+ datasets==2.14.6
5
  requests==2.28.2
6
  loguru==0.7.0
7
  rich==13.3.4
8
+ supervisor==4.2.5
9
+ schedule==1.2.0
utilities/data_collator.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ from utilities.praw_downloader import praw_downloader
4
+ from utilities.praw_processor import preprocess_praw_data
5
+
6
+
7
def get_latest_data():
    """Download the newest submissions and return them as a preprocessed DataFrame."""
    # Raw submission dictionaries from PRAW go straight into the preprocessor.
    return preprocess_praw_data(submissions=praw_downloader())
11
+
12
+
13
def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows with redundant ids, retaining the one with the longest content.

    The input DataFrame is left untouched: no helper column is added to it.
    Rows whose 'content' is missing are treated as having length 0, so an id
    whose rows all lack content still keeps exactly one row.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id' and 'content'.

    Returns:
    - pd.DataFrame: A filtered DataFrame with unique ids, where each id is associated
      with the longest content available. Rows come back in 'id' group order and
      keep their original index labels.
    """

    # Compute the lengths in a separate Series instead of writing a temporary
    # column into the caller's DataFrame. Missing content yields NaN from
    # .str.len(); count it as 0 so idxmax always has a value to pick.
    content_length = df['content'].str.len().fillna(0)

    # For each 'id', the index label of the row with the longest content
    # (the first such row wins on ties).
    idx_to_keep = content_length.groupby(df['id']).idxmax().values

    # Keep only those rows
    return df.loc[idx_to_keep]
38
+
39
+
40
def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges the provided data with the latest download, filters out redundant
    IDs, and returns the result ordered by 'date_utc'.

    Args:
    - old_df (pd.DataFrame): The previously collected submissions to be merged
      with the latest data.

    Returns:
    - pd.DataFrame: The merged and filtered submissions, one row per id, in
      'date_utc' order.
    """
    latest_df = get_latest_data()

    df = pd.concat([old_df, latest_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)

    # filter_redundant_ids selects rows per 'id' group, so its result comes back
    # in id order rather than date order. The index labels are still the
    # chronological positions assigned by reset_index above, so sorting on the
    # index restores the 'date_utc' ordering without changing any label.
    df = filter_redundant_ids(df).sort_index()
    return df
utilities/my_logger.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+
4
def setup_logger(name: str) -> logging.Logger:
    """Return a DEBUG-level logger that writes to 'mylog.log' and to the console.

    Args:
        name: Name of the logger, typically the calling module's ``__name__``.

    Returns:
        The configured logger. Calling this function again with the same name
        returns the same logger without attaching a second set of handlers,
        so records are not written twice.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # logging.getLogger hands back the same object for the same name, so any
    # handlers attached by an earlier call are still there. Attach only once.
    if logger.handlers:
        return logger

    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    handlers = (
        # File handler: persists logs; UTF-8 so non-ASCII post titles are
        # written the same way regardless of the platform default encoding.
        logging.FileHandler('mylog.log', encoding='utf-8'),
        # Stream handler: mirrors the same records to the console.
        logging.StreamHandler(),
    )
    for handler in handlers:
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger
utilities/praw_downloader.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime
3
+ from typing import Any, Dict, List
4
+
5
+ import praw
6
+
7
+ from utilities.my_logger import setup_logger
8
+
9
+ # Setup logging
10
+ logger = setup_logger(__name__)
11
+
12
+
13
def get_reddit_instance() -> praw.Reddit:
    """Build a PRAW Reddit client from the REDDIT_* environment variables."""
    # Credentials are read from the environment; a variable that is not set
    # is passed through as None.
    credentials = {
        'client_id': os.getenv('REDDIT_CLIENT_ID'),
        'client_secret': os.getenv('REDDIT_CLIENT_SECRET'),
        'user_agent': os.getenv('REDDIT_USER_AGENT'),
    }
    return praw.Reddit(ratelimit_seconds=20, **credentials)
21
+
22
+
23
def extract_submission_data(submission: praw.models.Submission) -> Dict[str, Any]:
    """Extract and return relevant data from a given Reddit submission.

    Args:
        submission: The submission to read. Only the attributes used below
            (selftext, author, created_utc, link_flair_text, title, ups,
            permalink) are accessed.

    Returns:
        A dictionary with the keys 'content', 'poster', 'date_utc', 'flair',
        'title', 'score' and 'permalink'. 'date_utc' is the creation time
        formatted as 'YYYY-MM-DD HH:MM:SS' in UTC; 'poster' is the string form
        of the author.
    """
    # Imported here because the module only imports `datetime` itself.
    from datetime import timezone

    # datetime.utcfromtimestamp is deprecated since Python 3.12; converting
    # with an explicit UTC timezone yields the same formatted string.
    created = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)

    return {
        "content": submission.selftext,
        "poster": str(submission.author),
        "date_utc": created.strftime('%Y-%m-%d %H:%M:%S'),
        "flair": submission.link_flair_text,
        "title": submission.title,
        "score": submission.ups,
        "permalink": submission.permalink,
    }
34
+
35
+
36
def praw_downloader(
    subreddit_name: str = 'bestofredditorupdates',
    limit: "int | None" = 200,
) -> List[Dict[str, Any]]:
    """Fetch the newest submissions of a subreddit as plain dictionaries.

    Nothing is written to disk here; the extracted records are returned to the
    caller.

    Args:
        subreddit_name: Name of the subreddit to read. Defaults to the
            subreddit this project was originally hard-wired to.
        limit: Maximum number of submissions requested from the subreddit's
            "new" listing. It is passed straight to PRAW; None requests as
            many as the listing provides.

    Returns:
        One dictionary per submission, as built by extract_submission_data.
    """
    reddit = get_reddit_instance()
    subreddit = reddit.subreddit(subreddit_name)

    logger.info(f'Starting to fetch submissions from {subreddit_name}.')

    submissions = []
    for submission in subreddit.new(limit=limit):
        logger.debug(f'Processing post {submission.id} - {submission.title}')
        submissions.append(extract_submission_data(submission))

    logger.info(f'Finished downloading {len(submissions)} submissions.')
    return submissions
51
+
52
+
53
+ if __name__ == "__main__":
54
+ praw_downloader()
utilities/praw_processor.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ import pandas as pd
4
+
5
+ from utilities.my_logger import setup_logger
6
+
7
+ # Setup logging
8
+ logger = setup_logger(__name__)
9
+
10
+
11
def preprocess_praw_data(submissions: List[Dict]) -> pd.DataFrame:
    """
    Preprocesses praw data into a DataFrame.

    Parameters:
    - submissions: List of submission dictionaries. Each one needs at least the
      keys 'date_utc' and 'permalink'.

    Returns:
    - pd.DataFrame: Preprocessed DataFrame with 'date_utc' parsed to datetimes,
      any 'poster_link' column removed, and an 'id' column taken from the
      permalink. An empty input yields an empty DataFrame.
    """

    # Convert the submissions list to a DataFrame
    praw_df = pd.DataFrame(submissions)

    # Nothing was downloaded: the frame has no columns at all, so the column
    # operations below would fail. Hand the empty frame back instead.
    if praw_df.empty:
        return praw_df

    # Convert 'date_utc' column to datetime format
    praw_df['date_utc'] = pd.to_datetime(praw_df['date_utc'])

    # Remove 'poster_link' column if it exists
    praw_df = praw_df.drop(columns=['poster_link'], errors='ignore')

    # The submission id is the path segment at index 4 of the permalink,
    # e.g. '/r/<subreddit>/comments/<id>/<slug>/' split on '/'.
    praw_df['id'] = praw_df['permalink'].str.split('/').str[4]

    return praw_df
utilities/readme_update.py CHANGED
@@ -10,24 +10,20 @@ def get_readme_path(dataset_name):
10
  return cached_path(readme_path, download_config=DownloadConfig())
11
 
12
 
13
- def update_readme(dataset_name, subreddit, date_to_fetch):
14
  path = get_readme_path(dataset_name=dataset_name)
15
  readme_text = f"""
 
 
 
 
 
16
  # Dataset Name
17
  {dataset_name}
18
 
19
  ## Update Frequency
20
- The dataset is updated daily and covers the period from `{os.environ["START_DATE"]}` to {date_to_fetch}
21
-
22
- ## Dataset Overview
23
- The goal is to have an open dataset of `{subreddit}` submissions. This has been taken from the Pushshift API.
24
-
25
- ## Data Collection
26
- This has been collected with sequential calls that follow the pagination of the pushshift request.
27
-
28
- ## Attribution
29
- Data sourced from the Pushshift API.
30
- """
31
 
32
  append_readme(path=path, readme_text=readme_text)
33
  return readme_text
 
10
  return cached_path(readme_path, download_config=DownloadConfig())
11
 
12
 
13
+ def update_readme(dataset_name, subreddit, latest_date):
14
  path = get_readme_path(dataset_name=dataset_name)
15
  readme_text = f"""
16
+ ## Dataset Overview
17
+ The goal is to have an open dataset of `{subreddit}` submissions. Im leveraging PRAW and the reddit API to get downloads.
18
+
19
+ There is a limit of 1000 in an API call and limited search functionality, so this is run every day to get new submissions.
20
+
21
  # Dataset Name
22
  {dataset_name}
23
 
24
  ## Update Frequency
25
+ The dataset is updated daily with the most recent day being: {latest_date}
26
+ """
 
 
 
 
 
 
 
 
 
27
 
28
  append_readme(path=path, readme_text=readme_text)
29
  return readme_text