derek-thomas (HF staff) committed
Commit 749d1d8
1 Parent(s): 19a5703

Init commit

.gitignore ADDED
@@ -0,0 +1,2 @@
+ .idea/
+ notebooks/.ipynb_checkpoints
Dockerfile ADDED
@@ -0,0 +1,34 @@
+ # Use the official Python base image
+ FROM python:3.9
+
+ # Install Git LFS
+ RUN apt-get update && apt-get install -y git-lfs
+
+ # Set the working directory
+ WORKDIR /app
+
+ # Copy requirements.txt into the container
+ COPY requirements.txt .
+
+ # Install the required packages
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Set the git credential helper to "store"
+ RUN git config --global credential.helper store
+
+ # Copy the rest of the application files into the container
+ COPY . .
+
+ # Set environment variables (Replace with your actual values)
+ ENV HUGGINGFACE_AUTH_TOKEN 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
+ ENV SUBREDDIT askreddit
+ ENV START_DATE 2013-01-01
+
+ # Copy supervisord.conf into the container
+ COPY supervisord.conf .
+
+ # Expose the desired port
+ EXPOSE 7860
+
+ # Run supervisord
+ CMD ["supervisord", "-c", "supervisord.conf"]
app.py ADDED
@@ -0,0 +1,24 @@
+ import gradio as gr
+ from rich.console import Console
+ from rich.syntax import Syntax
+
+
+ def log_file_to_html_string():
+     log_file = "mylog.log"
+
+     console = Console(record=True, width=150)
+     with open(log_file, "rt") as f:
+         syntax = Syntax(f.read(), "python", theme="monokai", word_wrap=True)
+
+     console.print(syntax)
+     html_content = console.export_html(inline_styles=True)
+
+     return html_content
+
+
+ with gr.Blocks() as demo:
+     name = gr.Markdown("# Reddit Scraper")
+     output = gr.HTML(log_file_to_html_string, every=1)
+
+ if __name__ == '__main__':
+     demo.launch(server_name="0.0.0.0", show_error=True, server_port=7860, enable_queue=True)
archive/subreddit_downloader.py ADDED
@@ -0,0 +1,145 @@
+ import csv
+ import json
+ import sys
+ import time
+ import traceback
+ from datetime import datetime
+
+ import requests
+
+ username = ""  # put the username you want to download in the quotes
+ subreddit = "BestofRedditorUpdates"  # put the subreddit you want to download in the quotes
+ thread_id = ""  # put the id of the thread you want to download in the quotes, it's the first 5 to 7 character string of letters and numbers from the url, like 107xayi
+ # leave either one blank to download an entire user's or subreddit's history
+ # or fill in both to download a specific user's history from a specific subreddit
+
+ # change this to one of "human", "csv" or "json"
+ # - human: the score, creation date, author, link and then the comment/submission body on a second line. Objects are separated by lines of dashes
+ # - csv: a comma separated value file with the fields score, date, title, author, link and then body or url
+ # - json: the full json object
+ output_format = "csv"
+
+ # default start time is the current time and default end time is all history
+ # you can change out the below lines to set a custom start and end date. The script works backwards, so the end date has to be before the start date
+ # start_time = datetime.utcnow()  # datetime.strptime("10/05/2021", "%m/%d/%Y")
+ start_time = datetime.strptime("04/02/2023", "%m/%d/%Y")
+ end_time = None  # datetime.strptime("09/25/2021", "%m/%d/%Y")
+
+ convert_to_ascii = False  # don't touch this unless you know what you're doing
+ convert_thread_id_to_base_ten = True  # don't touch this unless you know what you're doing
+
+
+ def write_csv_line(writer, obj, is_submission):
+     output_list = []
+     output_list.append(str(obj['score']))
+     output_list.append(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
+     if is_submission:
+         output_list.append(obj['title'])
+     output_list.append(f"u/{obj['author']}")
+     output_list.append(f"https://www.reddit.com{obj['permalink']}")
+     if is_submission:
+         if obj['is_self']:
+             if 'selftext' in obj:
+                 output_list.append(obj['selftext'])
+             else:
+                 output_list.append("")
+         else:
+             output_list.append(obj['url'])
+     else:
+         output_list.append(obj['body'])
+     writer.writerow(output_list)
+
+
+ def write_json_line(handle, obj):
+     handle.write(json.dumps(obj))
+     handle.write("\n")
+
+
+ def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii):
+     print(f"Saving to {filename}")
+
+     count = 0
+     if output_format == "human" or output_format == "json":
+         if convert_to_ascii:
+             handle = open(filename, 'w', encoding='ascii')
+         else:
+             handle = open(filename, 'w', encoding='UTF-8')
+     else:
+         handle = open(filename, 'w', encoding='UTF-8', newline='')
+         writer = csv.writer(handle)
+
+     previous_epoch = int(start_datetime.timestamp())
+     break_out = False
+     while True:
+         new_url = url_base + str(previous_epoch)
+         json_text = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
+         time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages
+         try:
+             json_data = json_text.json()
+         except json.decoder.JSONDecodeError:
+             time.sleep(1)
+             continue
+
+         if 'data' not in json_data:
+             break
+         objects = json_data['data']
+         if len(objects) == 0:
+             break
+
+         for obj in objects:
+             previous_epoch = obj['created_utc'] - 1
+             if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
+                 break_out = True
+                 break
+             count += 1
+             try:
+                 if output_format == "csv":
+                     write_csv_line(writer, obj, is_submission)
+                 elif output_format == "json":
+                     write_json_line(handle, obj)
+             except Exception as err:
+                 if 'permalink' in obj:
+                     print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
+                 else:
+                     print(f"Couldn't print object, missing permalink: {obj['id']}")
+                 print(err)
+                 print(traceback.format_exc())
+
+         if break_out:
+             break
+
+         print(f"Saved {count} through {datetime.fromtimestamp(previous_epoch).strftime('%Y-%m-%d')}")
+
+     print(f"Saved {count}")
+     handle.close()
+
+
+ if __name__ == "__main__":
+     filter_string = None
+     if username == "" and subreddit == "" and thread_id == "":
+         print("Fill in username, subreddit or thread id")
+         sys.exit(0)
+     if output_format not in ("human", "csv", "json"):
+         print("Output format must be one of human, csv, json")
+         sys.exit(0)
+
+     filters = []
+     if username:
+         filters.append(f"author={username}")
+     if subreddit:
+         filters.append(f"subreddit={subreddit}")
+     if thread_id:
+         if convert_thread_id_to_base_ten:
+             filters.append(f"link_id={int(thread_id, 36)}")
+         else:
+             filters.append(f"link_id=t3_{thread_id}")
+     filter_string = '&'.join(filters)
+
+     url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="
+
+     if not thread_id:
+         download_from_url("posts.txt", url_template.format("submission", filter_string), output_format, start_time,
+                           end_time, True, convert_to_ascii)
+         # download_from_url("comments.txt", url_template.format("comment", filter_string), output_format, start_time,
+         #                   end_time, False, convert_to_ascii)
main.py ADDED
@@ -0,0 +1,145 @@
+ import os
+ import time
+ from datetime import datetime, timedelta
+
+ import pandas as pd
+ from datasets import Dataset, DatasetDict, load_dataset
+ from huggingface_hub import login
+
+ from my_logger import setup_logger
+ from utilities.pushshift_data import scrape_submissions_by_day, submissions_to_dataframe
+
+ # Set dataset name, path to README.md, and existing dataset details
+ dataset_name = "derek-thomas/askreddit_test"
+ dataset_readme_path = "README.md"
+ subreddit = os.environ["SUBREDDIT"]
+
+ # Authenticate with Hugging Face using an auth token
+ auth_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
+ login(auth_token, add_to_git_credential=True)
+
+ logger = setup_logger(__name__)
+
+
+ def update_readme(dataset_name, subreddit, date_to_fetch):
+     readme_text = f"""
+ # {dataset_name}
+
+ ## Dataset Overview
+ The goal is to have an open dataset of `{subreddit}` submissions. This has been taken from the Pushshift API.
+
+ ## Data Collection
+ This has been collected with sequential calls that follow the pagination of the Pushshift request.
+
+
+ ## Data Structure
+ - `all_days`: All the data after `{os.environ["START_DATE"]}`
+
+ ## Update Frequency
+ The dataset is updated daily and covers the period from `{os.environ["START_DATE"]}` to two days ago.
+
+ ## Attribution
+ Data sourced from the Pushshift API.
+
+ ## Change Log
+ <details>
+ <summary>Click to expand</summary>
+
+ - **{datetime.now().strftime('%Y-%m-%d')}:** Added data for {date_to_fetch} to the 'all_days' split and saved as CSV
+
+ </details>
+ """
+
+     return readme_text
+
+
+ def main(date_to_fetch):
+     """
+     Runs the main data processing function to fetch and process subreddit data for the specified date.
+
+     Args:
+         date_to_fetch (str): The date to fetch subreddit data for, in the format "YYYY-MM-DD".
+
+     Returns:
+         most_recent_date (str): Most recent date in dataset
+     """
+
+     # Load the existing dataset from the Hugging Face hub or create a new one
+     try:
+         logger.info("Loading existing dataset")
+         dataset = load_dataset(dataset_name)
+         if "__index_level_0__" in dataset["all_days"].column_names:
+             dataset = dataset.remove_columns(["__index_level_0__"])
+     except FileNotFoundError:
+         logger.info("Creating new dataset")
+         dataset = DatasetDict()
+
+     # Call scrape_submissions_by_day with the requested date
+     logger.info(f"Fetching data for {date_to_fetch}")
+     submissions = scrape_submissions_by_day(subreddit, date_to_fetch)
+     df = submissions_to_dataframe(submissions)
+     logger.info(f"Data fetched for {date_to_fetch}")
+     most_recent_date = datetime.strptime(date_to_fetch, '%Y-%m-%d').date()
+
+     # Append DataFrame to split 'all_days' or create new split
+     if "all_days" in dataset:
+         logger.info("Appending data to split 'all_days'")
+         # Merge the new submissions
+         old_data = dataset['all_days'].to_pandas()
+         new_data = pd.concat([old_data, df], ignore_index=True)
+
+         # Drop duplicates just in case
+         new_data = new_data.drop_duplicates(subset=['id'], keep="first")
+         new_data_most_recent_date_raw = new_data['created_utc'].max()
+         new_data_most_recent_date_dt = datetime.strptime(new_data_most_recent_date_raw.split(' ')[0], '%Y-%m-%d').date()
+         # Adding timedelta in case there is rounding error
+         most_recent_date = max(new_data_most_recent_date_dt - timedelta(days=1), most_recent_date)
+
+         # Convert back to dataset
+         dataset["all_days"] = Dataset.from_pandas(new_data)
+     else:
+         logger.info("Creating new split 'all_days'")
+         dataset["all_days"] = Dataset.from_pandas(df)
+     # Log appending or creating split 'all_days'
+     logger.info("Appended or created split 'all_days'")
+
+     # Push the augmented dataset to the Hugging Face hub
+     logger.info(f"Pushing data for {date_to_fetch} to the Hugging Face hub")
+     readme_text = update_readme(dataset_name, subreddit, date_to_fetch)
+     dataset.description = readme_text
+     dataset.push_to_hub(dataset_name, token=auth_token)
+     logger.info(f"Processed and pushed data for {date_to_fetch} to the Hugging Face Hub")
+     return most_recent_date
+
+
+ def run_main_continuously():
+     """
+     Runs `main` continuously, starting from the date specified in the environment variable
+     "START_DATE" until two days ago. Once it reaches two days ago, it waits until tomorrow
+     to start again at the same time as when it started today.
+     """
+     start_date_str = os.environ.get("START_DATE")
+     start_date = datetime.strptime(start_date_str, "%Y-%m-%d").date()
+
+     # Calculate the start time for running main every day.
+     start_time = datetime.now().time()
+
+     while True:
+         today = datetime.now().date()
+         two_days_ago = today - timedelta(days=2)
+
+         if start_date <= two_days_ago:
+             logger.info(f"Running main function for date: {start_date}")
+             most_recent_date = main(str(start_date))
+             start_date = most_recent_date + timedelta(days=1)
+         else:
+             tomorrow = today + timedelta(days=1)
+             now = datetime.now()
+             start_of_tomorrow = datetime.combine(tomorrow, start_time)
+             wait_until_tomorrow = (start_of_tomorrow - now).total_seconds()
+             logger.info(f"Waiting until tomorrow: {wait_until_tomorrow} seconds")
+             time.sleep(wait_until_tomorrow)
+
+
+ if __name__ == '__main__':
+     run_main_continuously()
my_logger.py ADDED
@@ -0,0 +1,22 @@
+ import logging
+
+
+ def setup_logger(name: str):
+     logger = logging.getLogger(name)
+     logger.setLevel(logging.DEBUG)
+
+     formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+     # Create a file handler to write logs to a file
+     file_handler = logging.FileHandler('mylog.log')
+     file_handler.setLevel(logging.DEBUG)
+     file_handler.setFormatter(formatter)
+     logger.addHandler(file_handler)
+
+     # Create a stream handler to write logs to the console
+     stream_handler = logging.StreamHandler()
+     stream_handler.setLevel(logging.DEBUG)
+     stream_handler.setFormatter(formatter)
+     logger.addHandler(stream_handler)
+
+     return logger
notebooks/explore.ipynb ADDED
@@ -0,0 +1,323 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "730ba509",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from IPython.core.interactiveshell import InteractiveShell\n",
+     "InteractiveShell.ast_node_interactivity = \"all\""
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "d9acd4b6",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from pathlib import Path\n",
+     "import sys\n",
+     "proj_dir = Path.cwd().parent\n",
+     "\n",
+     "sys.path.append(str(proj_dir))\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "62452860",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from utilities.pushshift_data import scrape_submissions_by_day, submissions_to_dataframe, get_post_count_for_day"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "id": "a956a623",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "17df3f2812084d3591e914ffcfd948b0",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "0it [00:00, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "2023-04-12 16:23:59,392 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 20:00:00\n",
+       "2023-04-12 16:24:03,524 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 14:37:16\n",
+       "2023-04-12 16:24:08,443 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 05:02:52\n",
+       "2023-04-12 16:24:13,409 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 00:43:35\n",
+       "2023-04-12 16:24:17,548 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-02-28 20:28:35\n",
+       "2023-04-12 16:24:21,490 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-02-28 20:00:48\n",
+       "2023-04-12 16:24:23,658 - INFO - Finished scraping 4106 submissions in 28.86 seconds\n"
+      ]
+     }
+    ],
+    "source": [
+     "subreddit_to_scrape = \"askreddit\"\n",
+     "day_to_scrape = \"2013-03-01\"\n",
+     "submissions = scrape_submissions_by_day(subreddit_to_scrape, day_to_scrape)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "id": "b1cc845b",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/html": [
+        "<div>\n",
+        "<style scoped>\n",
+        "    .dataframe tbody tr th:only-of-type {\n",
+        "        vertical-align: middle;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe tbody tr th {\n",
+        "        vertical-align: top;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe thead th {\n",
+        "        text-align: right;\n",
+        "    }\n",
+        "</style>\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>permalink</th>\n",
+        "      <th>selftext</th>\n",
+        "      <th>url</th>\n",
+        "      <th>created_utc</th>\n",
+        "      <th>author</th>\n",
+        "      <th>num_comments</th>\n",
+        "      <th>score</th>\n",
+        "      <th>title</th>\n",
+        "      <th>id</th>\n",
+        "      <th>downs</th>\n",
+        "      <th>ups</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>0</th>\n",
+        "      <td>/r/AskReddit/comments/19hbm0/in_the_way_that_p...</td>\n",
+        "      <td>Basically, do other parts of the world have th...</td>\n",
+        "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
+        "      <td>2013-03-01 19:58:55</td>\n",
+        "      <td>sjr63</td>\n",
+        "      <td>1</td>\n",
+        "      <td>1</td>\n",
+        "      <td>In the way that popular English and American m...</td>\n",
+        "      <td>19hbm0</td>\n",
+        "      <td>0</td>\n",
+        "      <td>1</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>1</th>\n",
+        "      <td>/r/AskReddit/comments/19hblp/could_i_buy_an_an...</td>\n",
+        "      <td></td>\n",
+        "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
+        "      <td>2013-03-01 19:58:50</td>\n",
+        "      <td>WeirdPlane</td>\n",
+        "      <td>13</td>\n",
+        "      <td>1</td>\n",
+        "      <td>Could I buy an Android phone without a plan an...</td>\n",
+        "      <td>19hblp</td>\n",
+        "      <td>0</td>\n",
+        "      <td>1</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>2</th>\n",
+        "      <td>/r/AskReddit/comments/19hblj/how_do_i_reddit/</td>\n",
+        "      <td>Yeah.\n",
+        "\n",
+        "How do I reddit? I don't use or read re...</td>\n",
+        "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
+        "      <td>2013-03-01 19:58:47</td>\n",
+        "      <td>xxnovaroxgg</td>\n",
+        "      <td>14</td>\n",
+        "      <td>0</td>\n",
+        "      <td>How do I reddit</td>\n",
+        "      <td>19hblj</td>\n",
+        "      <td>0</td>\n",
+        "      <td>0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>3</th>\n",
+        "      <td>/r/AskReddit/comments/19hbjx/xpost_rsurvival_h...</td>\n",
+        "      <td>My brothers, dad and I have always been huge L...</td>\n",
+        "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
+        "      <td>2013-03-01 19:58:07</td>\n",
+        "      <td>tuffstough</td>\n",
+        "      <td>0</td>\n",
+        "      <td>1</td>\n",
+        "      <td>(x-post r/survival) Have any redditors seen Le...</td>\n",
+        "      <td>19hbjx</td>\n",
+        "      <td>0</td>\n",
+        "      <td>1</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>4</th>\n",
+        "      <td>/r/AskReddit/comments/19hbjk/female_redditors_...</td>\n",
+        "      <td>I'm curious, guys tend to get asked the usual ...</td>\n",
+        "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
+        "      <td>2013-03-01 19:57:58</td>\n",
+        "      <td>redditredditx3</td>\n",
+        "      <td>13</td>\n",
+        "      <td>2</td>\n",
+        "      <td>Female Redditors, which part of the male physi...</td>\n",
+        "      <td>19hbjk</td>\n",
+        "      <td>0</td>\n",
+        "      <td>2</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "text/plain": [
+        " permalink \\\n",
+        "0 /r/AskReddit/comments/19hbm0/in_the_way_that_p... \n",
+        "1 /r/AskReddit/comments/19hblp/could_i_buy_an_an... \n",
+        "2 /r/AskReddit/comments/19hblj/how_do_i_reddit/ \n",
+        "3 /r/AskReddit/comments/19hbjx/xpost_rsurvival_h... \n",
+        "4 /r/AskReddit/comments/19hbjk/female_redditors_... \n",
+        "\n",
+        " selftext \\\n",
+        "0 Basically, do other parts of the world have th... \n",
+        "1 \n",
+        "2 Yeah.\n",
+        "\n",
+        "How do I reddit? I don't use or read re... \n",
+        "3 My brothers, dad and I have always been huge L... \n",
+        "4 I'm curious, guys tend to get asked the usual ... \n",
+        "\n",
+        " url created_utc \\\n",
+        "0 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:55 \n",
+        "1 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:50 \n",
+        "2 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:47 \n",
+        "3 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:07 \n",
+        "4 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:57:58 \n",
+        "\n",
+        " author num_comments score \\\n",
+        "0 sjr63 1 1 \n",
+        "1 WeirdPlane 13 1 \n",
+        "2 xxnovaroxgg 14 0 \n",
+        "3 tuffstough 0 1 \n",
+        "4 redditredditx3 13 2 \n",
+        "\n",
+        " title id downs ups \n",
+        "0 In the way that popular English and American m... 19hbm0 0 1 \n",
+        "1 Could I buy an Android phone without a plan an... 19hblp 0 1 \n",
+        "2 How do I reddit 19hblj 0 0 \n",
+        "3 (x-post r/survival) Have any redditors seen Le... 19hbjx 0 1 \n",
+        "4 Female Redditors, which part of the male physi... 19hbjk 0 2 "
+       ]
+      },
+      "execution_count": 5,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "df = submissions_to_dataframe(submissions)\n",
+     "df.head()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "518addff",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "6e5490dc",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "start_date = datetime.strptime(\"2013-01-01\", \"%Y-%m-%d\")\n",
+     "start_date"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "bf13555a",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "df[\"created_utc\"] = pd.to_datetime(df[\"created_utc\"], unit=\"s\").dt.tz_localize(\"UTC\").dt.strftime('%Y-%m-%d %H:%M:%S')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "48e413f3",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "df.head()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "9e83befa",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "df.dtypes"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "ba84be68",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.9.16"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ praw==7.7.0
+ gradio==3.23.0
+ nbdev==2.3.12
+ datasets==2.11.0
+ requests==2.28.2
+ loguru==0.7.0
+ rich==13.3.4
+ supervisor==4.2.5
supervisord.conf ADDED
@@ -0,0 +1,20 @@
+ [supervisord]
+ nodaemon=true
+
+ [program:main]
+ command=python main.py
+ stdout_logfile=/dev/stdout
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+ autostart=true
+ # autorestart=true
+
+ [program:app]
+ command=python app.py
+ stdout_logfile=/dev/null
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+ autostart=true
+ autorestart=true
utilities/pushshift_data.py ADDED
@@ -0,0 +1,162 @@
+ import time
+ from datetime import datetime, timedelta, timezone
+ from typing import Any, Dict, List, Optional
+
+ import pandas as pd
+ import requests
+
+ from my_logger import setup_logger
+
+ logger = setup_logger(__name__)
+
+
+ def get_pushshift_data(subreddit: str, before: Optional[int] = None,
+                        after: Optional[int] = None, aggs: Optional[str] = None) -> Optional[Dict[str, Any]]:
+     """
+     Fetch data from the Pushshift API for the specified subreddit.
+
+     :param subreddit: The name of the subreddit to scrape.
+     :param before: The upper limit for the created_utc attribute of the submissions.
+     :param after: The lower limit for the created_utc attribute of the submissions.
+     :param aggs: The aggregation summary option to use.
+     :return: A dictionary containing the fetched data and aggregations if available.
+     """
+     url = "https://api.pushshift.io/reddit/search/submission/"
+     params = {
+         "subreddit": subreddit,
+         "size": 1000,
+         "sort": "created_utc",
+         "sort_type": "desc",
+     }
+     if before is not None:
+         params["before"] = before
+     if after is not None:
+         params["after"] = after
+     if aggs is not None:
+         params["aggs"] = aggs
+
+     response = requests.get(url, params=params)
+     if response.status_code == 200:
+         return response.json()
+     else:
+         logger.error(f"Error fetching data: {response.status_code}")
+         return None
+
+
+ def get_post_count_for_day(subreddit: str, day_to_scrape: str) -> int:
+     """
+     Get the total number of posts for a specific day in the specified subreddit using the Pushshift API.
+
+     :param subreddit: The name of the subreddit to get the post count for.
+     :param day_to_scrape: The date for which to get the post count (format: "YYYY-MM-DD").
+     :return: The total number of posts for the specified day.
+     """
+     date_obj = datetime.strptime(day_to_scrape, "%Y-%m-%d")
+     after = int(date_obj.timestamp())
+     before = int((date_obj + timedelta(days=1)).timestamp())
+
+     response = get_pushshift_data(subreddit, before=before, after=after, aggs="created_utc")
+     if response is not None:
+         aggs = response.get("aggs", {}).get("created_utc", [])
+         if aggs:
+             return aggs[0]["doc_count"]
+     return 0
+
+
+ def fetch_data(subreddit: str, before: int, after: int) -> Optional[Dict[str, Any]]:
+     url = "https://api.pushshift.io/reddit/search/submission/"
+     params = {
+         "subreddit": subreddit,
+         "size": 1000,
+         "sort": "created_utc",
+         "sort_type": "desc",
+         "before": before,
+         "after": after,
+     }
+
+     response = requests.get(url, params=params)
+     if response.status_code == 200:
+         return response.json()
+     else:
+         logger.error(f"Error fetching data: {response.status_code}")
+         return None
+
+
+ def convert_timestamp_to_datetime(timestamp: int) -> str:
+     # Convert the timestamp to a datetime object
+     datetime_obj = datetime.utcfromtimestamp(timestamp)
+
+     # Add timezone information
+     datetime_obj_utc = datetime_obj.replace(tzinfo=timezone.utc)
+
+     # Convert the datetime object to a formatted string
+     datetime_str = datetime_obj_utc.strftime('%Y-%m-%d %H:%M:%S')
+
+     return datetime_str
+
+
+ def scrape_submissions_by_day(subreddit_to_scrape: str, day_to_scrape: str) -> List[Dict[str, Any]]:
+     start_time = time.time()
+     scraped_submissions = []
+     date_obj = datetime.strptime(day_to_scrape, "%Y-%m-%d")
+
+     if date_obj > datetime.now() - timedelta(days=7):
+         logger.error("The specified date might not be available in the Pushshift API yet. "
+                      "Please try an earlier date or wait for the API to be updated.")
+         return scraped_submissions
+
+     after = int(date_obj.timestamp())
+     before = int((date_obj + timedelta(days=1)).timestamp())
+
+     # todo get_post_count_for_day didn't seem to work
+     # post_count = get_post_count_for_day(subreddit_to_scrape, day_to_scrape)
+     # total_requests = (post_count + 99) // 100  # Estimate the total number of requests
+
+     actual_requests = 0
+     while after < before:
+         after_str, before_str = convert_timestamp_to_datetime(after), convert_timestamp_to_datetime(before)
+         logger.info(f"Fetching data between timestamps {after_str} and {before_str}")
+         data = get_pushshift_data(subreddit_to_scrape, before=before, after=after)
+         if data is None or len(data["data"]) == 0:
+             break
+
+         scraped_submissions.extend(data["data"])
+         before = data["data"][-1]["created_utc"]
+
+         actual_requests += 1
+         time.sleep(1)
+
+     elapsed_time = time.time() - start_time
+     if actual_requests:
+         logger.info(
+             f"{actual_requests}it [{elapsed_time // 60:02}:{elapsed_time % 60:.2f} {elapsed_time / actual_requests:.2f}s/it]")
+     logger.info(
+         f"Finished scraping {len(scraped_submissions)} submissions in {elapsed_time:.2f} seconds in {actual_requests} requests")
+     return scraped_submissions
+
+
+ def submissions_to_dataframe(submissions: List[Dict[str, Any]]) -> pd.DataFrame:
+     """
+     Parse a list of submissions into a pandas DataFrame.
+
+     :param submissions: A list of dictionaries containing the scraped submission data.
+     :return: A pandas DataFrame containing the submission data.
+     """
+     cols = ['score', 'num_comments', 'title', 'permalink', 'selftext', 'url', 'created_utc', 'author', 'id',
+             'downs', 'ups']
+     df = pd.DataFrame(submissions)
+     df = df.convert_dtypes()
+     df = df[cols]
+     # Convert the "created_utc" column to a datetime column with timezone information
+     df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC').dt.strftime(
+         '%Y-%m-%d %H:%M:%S')
+     return df
+
+
+ if __name__ == '__main__':
+     subreddit_to_scrape = "askreddit"
+     day_to_scrape = "2013-03-01"
+     submissions = scrape_submissions_by_day(subreddit_to_scrape, day_to_scrape)
+     df = submissions_to_dataframe(submissions)
+     print(df.head().to_string())
+     logger.info(f"Scraped {len(submissions)} submissions from r/{subreddit_to_scrape} on {day_to_scrape}")