File size: 3,178 Bytes
285612d 67a3546 285612d 5d9e0b8 285612d 83f6dc4 285612d 67a3546 99ec3d4 67a3546 285612d 67a3546 285612d 5d9e0b8 285612d 5d9e0b8 285612d 8631363 5d9e0b8 f4c06f2 285612d b4826e4 285612d 2825d75 285612d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
import os
from datetime import datetime, timezone
from typing import Any, Dict, List, Tuple

import praw

from utilities.my_logger import setup_logger
# Setup logging
logger = setup_logger(__name__)
# Get subreddit
# Module configuration, read once at import time.
# NOTE(review): int() raises TypeError if REDDIT_PULL_LIMIT is unset —
# confirm the deployment environment always provides both variables.
subreddit_var = os.getenv("SUBREDDIT")
reddit_pull_limit = int(os.getenv("REDDIT_PULL_LIMIT"))
def get_reddit_instance() -> praw.Reddit:
    """Build and return a PRAW ``Reddit`` client from environment credentials.

    Credentials (client id/secret, user agent) come from the environment;
    ``ratelimit_seconds=20`` tells PRAW to wait up to 20 seconds when Reddit
    rate-limits a request instead of failing immediately.
    """
    credentials = {
        'client_id': os.getenv('REDDIT_CLIENT_ID'),
        'client_secret': os.getenv('REDDIT_CLIENT_SECRET'),
        'user_agent': os.getenv('REDDIT_USER_AGENT'),
        'ratelimit_seconds': 20,
    }
    return praw.Reddit(**credentials)
def extract_submission_data(submission: "praw.models.Submission") -> Dict[str, Any]:
    """Extract and return relevant data from a given Reddit submission.

    Args:
        submission: A PRAW submission (any object with the read attributes works).

    Returns:
        A dict with the submission's text, author name, UTC timestamp string
        (``YYYY-MM-DD HH:MM:SS``), flair, title, score, permalink and NSFW flag.
    """
    return {
        "content": submission.selftext,
        "poster": str(submission.author),
        # fromtimestamp(..., tz=timezone.utc) replaces the deprecated
        # datetime.utcfromtimestamp; formatted output is identical.
        "date_utc": datetime.fromtimestamp(submission.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
        "flair": submission.link_flair_text,
        "title": submission.title,
        "score": submission.ups,
        "permalink": submission.permalink,
        "nsfw": submission.over_18,
    }
def extract_comment_data(comment: "praw.models.Comment") -> Dict[str, Any]:
    """Extract and return relevant data from a given Reddit comment.

    Args:
        comment: A PRAW comment (any object with the read attributes works).

    Returns:
        A dict with the comment body, author name, UTC timestamp string
        (``YYYY-MM-DD HH:MM:SS``), flair, vote counts, permalink, tree
        position (``depth``, ``link_id``, ``parent_id``) and id.
    """
    return {
        'content': comment.body,
        'poster': str(comment.author),
        # fromtimestamp(..., tz=timezone.utc) replaces the deprecated
        # datetime.utcfromtimestamp; formatted output is identical.
        'date_utc': datetime.fromtimestamp(comment.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
        'flair': comment.author_flair_text,
        'ups': comment.ups,
        'score': comment.score,
        'permalink': comment.permalink,
        'depth': comment.depth,
        'link_id': comment.link_id,
        'parent_id': comment.parent_id,
        'id': comment.id,
    }
def praw_downloader() -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Fetch recent submissions and their full comment trees from the subreddit.

    Pulls up to ``reddit_pull_limit`` newest submissions from the subreddit
    named by the ``SUBREDDIT`` env var and flattens every comment tree.

    Returns:
        A ``(submissions, comments)`` tuple: each element is a list of dicts
        as produced by :func:`extract_submission_data` /
        :func:`extract_comment_data`.
    """
    reddit = get_reddit_instance()
    subreddit = reddit.subreddit(subreddit_var)
    logger.info(f'Starting to fetch submissions from {subreddit_var}.')
    submissions: List[Dict[str, Any]] = []
    comments_list: List[Dict[str, Any]] = []
    for submission in subreddit.new(limit=reddit_pull_limit):  # Set limit=None to get all posts
        submissions.append(extract_submission_data(submission))
        # Fully expand the comment tree: limit=None keeps resolving
        # "MoreComments" stubs until none remain, replacing the previous
        # bare-except loop that relied on an IndexError to terminate.
        submission.comments.replace_more(limit=None)
        logger.debug(f'Opened all comments for post: {submission.title}')
        for comment in submission.comments.list():
            comments_list.append(extract_comment_data(comment))
    logger.info(f'Finished downloading {len(submissions)} submissions, {len(comments_list)} comments')
    return submissions, comments_list
# Run the downloader when this file is executed as a script.
if __name__ == "__main__":
    praw_downloader()
|