|
import os
from datetime import datetime, timezone
from typing import Any, Dict, List, Tuple

import praw

from utilities.my_logger import setup_logger
|
|
|
|
|
# Module-level logger configured via the project's logging helper.
logger = setup_logger(__name__)
|
|
|
|
|
# Target subreddit name; may be None if the SUBREDDIT env var is unset
# (praw_downloader will then fail when resolving the subreddit).
subreddit_var = os.getenv("SUBREDDIT")

# Max number of submissions fetched per run. The "100" fallback prevents an
# opaque import-time TypeError (int(None)) when REDDIT_PULL_LIMIT is unset.
reddit_pull_limit = int(os.getenv("REDDIT_PULL_LIMIT", "100"))
|
|
|
|
|
def get_reddit_instance() -> praw.Reddit:
    """Build and return a PRAW Reddit client from environment credentials."""
    credentials = {
        'client_id': os.getenv('REDDIT_CLIENT_ID'),
        'client_secret': os.getenv('REDDIT_CLIENT_SECRET'),
        'user_agent': os.getenv('REDDIT_USER_AGENT'),
    }
    # ratelimit_seconds=20: let PRAW wait out short rate limits rather than
    # raising immediately (see PRAW configuration docs).
    return praw.Reddit(ratelimit_seconds=20, **credentials)
|
|
|
|
|
def extract_submission_data(submission: "praw.models.Submission") -> Dict[str, Any]:
    """Extract and return relevant data from a given Reddit submission.

    Only plain attribute reads are performed, so any object exposing the
    same attribute names works (convenient for testing).
    """
    return {
        "content": submission.selftext,
        # author can be None for deleted accounts; str() keeps it serializable.
        "poster": str(submission.author),
        # fromtimestamp(..., tz=timezone.utc) replaces datetime.utcfromtimestamp,
        # which is deprecated since Python 3.12; the formatted string is identical.
        "date_utc": datetime.fromtimestamp(submission.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
        "flair": submission.link_flair_text,
        "title": submission.title,
        "score": submission.ups,
        "permalink": submission.permalink,
        "nsfw": submission.over_18,
    }
|
|
|
def extract_comment_data(comment: "praw.models.Comment") -> Dict[str, Any]:
    """Extract and return relevant data from a given Reddit comment.

    Only plain attribute reads are performed, so any object exposing the
    same attribute names works (convenient for testing).
    """
    return {
        'content': comment.body,
        # author can be None for deleted accounts; str() keeps it serializable.
        'poster': str(comment.author),
        # fromtimestamp(..., tz=timezone.utc) replaces datetime.utcfromtimestamp,
        # which is deprecated since Python 3.12; the formatted string is identical.
        'date_utc': datetime.fromtimestamp(comment.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
        'flair': comment.author_flair_text,
        'ups': comment.ups,
        'score': comment.score,
        'permalink': comment.permalink,
        'depth': comment.depth,
        'link_id': comment.link_id,
        'parent_id': comment.parent_id,
        'id': comment.id,
    }
|
|
|
|
|
def praw_downloader() -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Fetch recent submissions and their full comment trees from the subreddit.

    Returns:
        A ``(submissions, comments)`` pair: lists of dicts produced by
        :func:`extract_submission_data` and :func:`extract_comment_data`.
        (The previous annotation ``Tuple[List[Dict[str, str]]]`` described a
        one-element tuple and the wrong value type.)
    """
    reddit = get_reddit_instance()
    subreddit = reddit.subreddit(subreddit_var)

    logger.info(f'Starting to fetch submissions from {subreddit_var}.')

    submissions: List[Dict[str, Any]] = []
    comments_list: List[Dict[str, Any]] = []
    for submission in subreddit.new(limit=reddit_pull_limit):
        submissions.append(extract_submission_data(submission))

        # replace_more(limit=None) resolves every MoreComments placeholder in
        # one call; this replaces the old retry-until-IndexError loop (with a
        # bare ``except:`` and debug prints) around repeated replace_more().
        submission.comments.replace_more(limit=None)
        for comment in submission.comments.list():
            comments_list.append(extract_comment_data(comment))

    logger.info(f'Finished downloading {len(submissions)} submissions, {len(comments_list)} comments')
    return submissions, comments_list
|
|
|
|
|
# Script entry point: run the downloader (return value intentionally discarded).
if __name__ == "__main__":

    praw_downloader()
|
|