import praw
import pandas as pd
import datetime
import re
import json
import os
import os.path
from typing import List, Dict, Any, Optional
from dotenv import load_dotenv


class EnhancedRedditScraper:
    """
    An enhanced Reddit scraper that provides more advanced functionality
    than the basic RedditScraperAgent.
    """

    def __init__(self, client_id: str, client_secret: str, user_agent: str):
        """
        Initialize the Reddit scraper with API credentials.

        Args:
            client_id: Reddit API client ID
            client_secret: Reddit API client secret
            user_agent: User agent string for Reddit API
        """
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )
        self.last_search_results = []

    def scrape_subreddit(self,
                         subreddit_name: str,
                         keywords: List[str],
                         limit: int = 100,
                         sort_by: str = "hot",
                         include_comments: bool = False,
                         min_score: int = 0,
                         include_selftext: bool = True) -> List[Dict[str, Any]]:
        """
        Scrape a subreddit for posts containing specified keywords.

        Args:
            subreddit_name: Name of the subreddit to scrape
            keywords: List of keywords to search for
            limit: Maximum number of posts to retrieve
            sort_by: How to sort posts ('hot', 'new', 'top', 'rising')
            include_comments: Whether to search post comments
            min_score: Minimum score (upvotes) for posts
            include_selftext: Whether to search post content (selftext)

        Returns:
            List of matching post dictionaries
        """
        subreddit = self.reddit.subreddit(subreddit_name)
        results = []

        # Choose the right sort method
        if sort_by == "hot":
            submissions = subreddit.hot(limit=limit)
        elif sort_by == "new":
            submissions = subreddit.new(limit=limit)
        elif sort_by == "top":
            submissions = subreddit.top(limit=limit)
        elif sort_by == "rising":
            submissions = subreddit.rising(limit=limit)
        else:
            submissions = subreddit.hot(limit=limit)

        # Process each submission
        for submission in submissions:
            # Check if post meets the minimum score requirement
            if submission.score < min_score:
                continue

            # Check for keywords in title or selftext
            title_match = any(keyword.lower() in submission.title.lower()
                              for keyword in keywords)

            selftext_match = False
            if include_selftext:
                selftext_match = any(keyword.lower() in submission.selftext.lower()
                                     for keyword in keywords)

            comment_match = False
            comments_data = []

            # Search comments if enabled
            if include_comments:
                submission.comments.replace_more(limit=3)  # Load some MoreComments
                for comment in submission.comments.list()[:20]:  # Limit to first 20 comments
                    if any(keyword.lower() in comment.body.lower() for keyword in keywords):
                        comment_match = True
                        comments_data.append({
                            'author': str(comment.author),
                            'body': comment.body,
                            'score': comment.score,
                            'created_utc': datetime.datetime.fromtimestamp(
                                comment.created_utc).strftime('%Y-%m-%d %H:%M:%S')
                        })

            # Add post to results if it matches criteria
            if title_match or selftext_match or comment_match:
                created_time = datetime.datetime.fromtimestamp(submission.created_utc)

                post_data = {
                    'title': submission.title,
                    'text': submission.selftext,
                    'url': submission.url,
                    'score': submission.score,
                    'id': submission.id,
                    'author': str(submission.author),
                    'created_utc': created_time.strftime('%Y-%m-%d %H:%M:%S'),
                    'upvote_ratio': submission.upvote_ratio,
                    'num_comments': submission.num_comments,
                    'permalink': f"https://www.reddit.com{submission.permalink}",
                }

                if include_comments and comments_data:
                    post_data['matching_comments'] = comments_data

                results.append(post_data)

        # Store last search results
        self.last_search_results = results
        return results
    def search_multiple_subreddits(self,
                                   subreddits: List[str],
                                   keywords: List[str],
                                   **kwargs) -> Dict[str, List[Dict[str, Any]]]:
        """
        Search multiple subreddits for the same keywords.

        Args:
            subreddits: List of subreddit names to search
            keywords: List of keywords to search for
            **kwargs: Additional arguments to pass to scrape_subreddit

        Returns:
            Dictionary mapping subreddit names to their results
        """
        results = {}
        for subreddit in subreddits:
            results[subreddit] = self.scrape_subreddit(subreddit, keywords, **kwargs)
        return results

    def save_results_to_csv(self, filename: str) -> str:
        """
        Save the last search results to a CSV file.

        Args:
            filename: Name of the file to save (without extension)

        Returns:
            Path to the saved file
        """
        if not self.last_search_results:
            raise ValueError("No search results to save. Run a search first.")

        df = pd.DataFrame(self.last_search_results)

        # Clean up comment data for CSV format
        if 'matching_comments' in df.columns:
            df['matching_comments'] = df['matching_comments'].apply(
                lambda x: json.dumps(x) if isinstance(x, list) else ''
            )

        # Add timestamp to filename
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        full_filename = f"{filename}_{timestamp}.csv"

        df.to_csv(full_filename, index=False)
        return os.path.abspath(full_filename)

    def save_results_to_json(self, filename: str) -> str:
        """
        Save the last search results to a JSON file.

        Args:
            filename: Name of the file to save (without extension)

        Returns:
            Path to the saved file
        """
        if not self.last_search_results:
            raise ValueError("No search results to save. Run a search first.")

        # Add timestamp to filename
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        full_filename = f"{filename}_{timestamp}.json"

        with open(full_filename, 'w', encoding='utf-8') as f:
            json.dump(self.last_search_results, f, ensure_ascii=False, indent=2)

        return os.path.abspath(full_filename)


# Example usage
if __name__ == "__main__":
    # Load environment variables from .env file
    load_dotenv()

    # Get credentials from environment variables or use defaults for development
    client_id = os.environ.get("REDDIT_CLIENT_ID", "")
    client_secret = os.environ.get("REDDIT_CLIENT_SECRET", "")
    user_agent = os.environ.get("REDDIT_USER_AGENT", "RedditScraperApp/1.0")

    if not client_id or not client_secret:
        print("Warning: Reddit API credentials not found in environment variables.")
        print("Please set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET in .env file")
        print("or as environment variables for proper functionality.")
        # For development only, you could set default credentials here

    # Create the scraper instance
    scraper = EnhancedRedditScraper(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent
    )

    # Simple example
    try:
        results = scraper.scrape_subreddit(
            subreddit_name="cuny",
            keywords=["question", "help", "confused"],
            limit=25,
            sort_by="hot",
            include_comments=True
        )

        print(f"Found {len(results)} matching posts")

        # Save results to file
        if results:
            csv_path = scraper.save_results_to_csv("reddit_results")
            json_path = scraper.save_results_to_json("reddit_results")
            print(f"Results saved to {csv_path} and {json_path}")

    except Exception as e:
        print(f"Error: {str(e)}")
        print("This may be due to missing or invalid API credentials.")
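
    # A further sketch: querying several subreddits in one call via
    # search_multiple_subreddits. The subreddit names and keywords below are
    # illustrative placeholders, not part of the original example; any extra
    # keyword arguments are forwarded to scrape_subreddit.
    try:
        multi_results = scraper.search_multiple_subreddits(
            subreddits=["cuny", "AskNYC"],  # placeholder subreddit names
            keywords=["question", "help"],
            limit=10,
            sort_by="new",
            include_comments=False
        )
        for name, posts in multi_results.items():
            print(f"r/{name}: {len(posts)} matching posts")
    except Exception as e:
        print(f"Error: {str(e)}")
        print("This may be due to missing or invalid API credentials.")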