import csv
import json
import sys
import time
import traceback
from datetime import datetime
import requests
username = "" # put the username you want to download in the quotes
subreddit = "BestofRedditorUpdates" # put the subreddit you want to download in the quotes
thread_id = "" # put the id of the thread you want to download in the quotes, it's the first 5 to 7 character string of letters and numbers from the url, like 107xayi
# leave either one blank to download an entire user's or subreddit's history
# or fill in both to download a specific user's history from a specific subreddit
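# for example, in a url like https://www.reddit.com/r/<subreddit>/comments/107xayi/<post_title>/ the thread id is 107xayi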
# change this to one of "human", "csv" or "json"
# - human: the score, creation date, author, link and then the comment/submission body on a second line. Objects are separated by lines of dashes
# - csv: a comma separated value file with the fields score, date, title, author, link and then body or url
# - json: the full json object
output_format = "csv"
# default start time is the current time and default end time is all history
# you can swap in the commented-out lines below to set a custom start and end date. The script works backwards, so the end date has to be before the start date
# start_time = datetime.utcnow() # datetime.strptime("10/05/2021", "%m/%d/%Y")
start_time = datetime.strptime("04/02/2023", "%m/%d/%Y")
end_time = None # datetime.strptime("09/25/2021", "%m/%d/%Y")
convert_to_ascii = False # don't touch this unless you know what you're doing
convert_thread_id_to_base_ten = True # don't touch this unless you know what you're doing
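# thread ids are base 36 strings, so with the conversion on, int("107xayi", 36) == 2190093498
# and the filter becomes link_id=2190093498 instead of link_id=t3_107xayi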
def write_csv_line(writer, obj, is_submission):
    output_list = []
    output_list.append(str(obj['score']))
    output_list.append(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
    if is_submission:
        output_list.append(obj['title'])
    output_list.append(f"u/{obj['author']}")
    output_list.append(f"https://www.reddit.com{obj['permalink']}")
    if is_submission:
        # self posts carry their text in selftext, link posts only have a url
        if obj['is_self']:
            if 'selftext' in obj:
                output_list.append(obj['selftext'])
            else:
                output_list.append("")
        else:
            output_list.append(obj['url'])
    else:
        output_list.append(obj['body'])
    writer.writerow(output_list)
def write_json_line(handle, obj):
    handle.write(json.dumps(obj))
    handle.write("\n")
def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii):
    print(f"Saving to {filename}")

    count = 0
    if output_format == "human" or output_format == "json":
        if convert_to_ascii:
            handle = open(filename, 'w', encoding='ascii')
        else:
            handle = open(filename, 'w', encoding='UTF-8')
    else:
        handle = open(filename, 'w', encoding='UTF-8', newline='')
        writer = csv.writer(handle)

    previous_epoch = int(start_datetime.timestamp())
    break_out = False
    while True:
        # page backwards: each request asks for objects created before the oldest one seen so far
        new_url = url_base + str(previous_epoch)
        response = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
        time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages
        try:
            json_data = response.json()
        except json.decoder.JSONDecodeError:
            time.sleep(1)
            continue

        if 'data' not in json_data:
            break
        objects = json_data['data']
        if len(objects) == 0:
            break

        for obj in objects:
            previous_epoch = obj['created_utc'] - 1
            if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
                break_out = True
                break
            count += 1
            try:
                if output_format == "csv":
                    write_csv_line(writer, obj, is_submission)
                elif output_format == "json":
                    write_json_line(handle, obj)
                elif output_format == "human":
                    write_human_line(handle, obj, is_submission)
            except Exception as err:
                if 'permalink' in obj:
                    print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
                else:
                    print(f"Couldn't print object, missing permalink: {obj['id']}")
                print(err)
                print(traceback.format_exc())

        if break_out:
            break
        print(f"Saved {count} through {datetime.fromtimestamp(previous_epoch).strftime('%Y-%m-%d')}")

    print(f"Saved {count}")
    handle.close()
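# for reference, the pushshift response is a json object whose "data" key holds a list of
# submissions/comments; the fields read above are created_utc, score, author, permalink and id,
# plus title/is_self/selftext/url for submissions or body for comments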
if __name__ == "__main__":
filter_string = None
if username == "" and subreddit == "" and thread_id == "":
print("Fill in username, subreddit or thread id")
sys.exit(0)
if output_format not in ("human", "csv", "json"):
print("Output format must be one of human, csv, json")
sys.exit(0)
filters = []
if username:
filters.append(f"author={username}")
if subreddit:
filters.append(f"subreddit={subreddit}")
if thread_id:
if convert_thread_id_to_base_ten:
filters.append(f"link_id={int(thread_id, 36)}")
else:
filters.append(f"link_id=t3_{thread_id}")
filter_string = '&'.join(filters)
url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="
    if not thread_id:
        download_from_url("posts.txt", url_template.format("submission", filter_string), output_format, start_time,
            end_time, True, convert_to_ascii)
    # download_from_url("comments.txt", url_template.format("comment", filter_string), output_format, start_time,
    #     end_time, False, convert_to_ascii)