import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import praw
import time
from datetime import datetime, timedelta
import json
import os
from typing import List, Dict, Any, Optional, Tuple
import concurrent.futures
from functools import lru_cache
import hashlib
import pytz
import sqlite3
import networkx as nx
from pathlib import Path

# Advanced features optional - will gracefully degrade if not available
try:
    from advanced_reddit_scraper import (
        AdvancedRedditScraper,
        ExponentialBackoff,
        CommentHierarchyTracker,
        CheckpointManager
    )
    ADVANCED_FEATURES = True
except ImportError:
    ADVANCED_FEATURES = False


def load_env_file(env_path: str = ".env") -> Dict[str, str]:
    """
    Load environment variables from .env file

    Args:
        env_path: Path to .env file

    Returns:
        Dictionary of environment variables
    """
    env_vars = {}
    env_file = Path(env_path)
    if env_file.exists():
        with open(env_file, 'r') as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith('#') and '=' in line:
                    key, value = line.split('=', 1)
                    key = key.strip()
                    value = value.strip().strip('"').strip("'")
                    env_vars[key] = value
    return env_vars


st.set_page_config(
    page_title="Reddit Research Dashboard",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)

st.markdown("""
""", unsafe_allow_html=True)


class OptimizedRedditScraper:
    """
    Optimized Reddit scraper with batch processing, caching, and temporal analytics
    """

    def __init__(self, client_id: str, client_secret: str, user_agent: str):
        """Initialize with Reddit API credentials"""
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent,
            check_for_async=False
        )
        self.last_request_time = 0
        self.min_delay = 0.5

    def fetch_subreddit_data_verbose(self, subreddit_name: str, sort_by: str = "hot",
                                     limit: int = 200, time_filter: str = "month",
                                     log_container=None) -> pd.DataFrame:
        """
        Fetch Reddit data with verbose logging

        Args:
            subreddit_name: Name of subreddit to scrape
            sort_by: Sort method (hot, new, top, rising)
            limit: Number of posts to fetch (optimized for 200+ items)
            time_filter: Time filter for top posts
            log_container: Streamlit container for logging output

        Returns:
            DataFrame with Reddit posts data
        """
        def stream_post(post_data, stream_container):
            """Display a post as it's collected"""
            if stream_container:
                timestamp = datetime.now().strftime("%H:%M:%S")
                with stream_container.container():
                    with st.expander(f"📝 {post_data['title'][:80]}...", expanded=False):
                        col1, col2, col3, col4 = st.columns(4)
                        with col1:
                            st.metric("Score", post_data['score'])
                        with col2:
                            st.metric("Comments", post_data['num_comments'])
                        with col3:
                            st.text(f"u/{post_data['author']}")
                        with col4:
                            st.text(timestamp)

        def update_stats(stats_container, total, authors, comments):
            """Update collection statistics"""
            if stats_container:
                stats_container.empty()
                with stats_container:
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("📊 Posts", total)
                    with col2:
                        st.metric("đŸ‘Ĩ Authors", authors)
                    with col3:
                        st.metric("đŸ’Ŧ Comments", f"{comments:,}")

        # Initialize streaming containers
        stats_container = None
        stream_container = None
        if log_container:
            # Check if log_container is a tuple of (stats, stream)
            if isinstance(log_container, tuple):
                stats_container, stream_container = log_container
            else:
                stats_container = log_container
                stream_container = log_container

        data = []
        try:
            subreddit = self.reddit.subreddit(subreddit_name)

            # Choose appropriate method based on sort_by
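            # PRAW detail: only top() accepts a time_filter ("hour", "day",
            # "week", "month", "year", "all"); hot(), new() and rising() take
            # just a limit, so the filter is applied in the "top" branch only.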
            if sort_by == "top":
                submissions = subreddit.top(limit=limit, time_filter=time_filter)
            elif sort_by == "new":
                submissions = subreddit.new(limit=limit)
            elif sort_by == "rising":
                submissions = subreddit.rising(limit=limit)
            else:
                submissions = subreddit.hot(limit=limit)

            # Batch processing with rate limiting
            batch_size = 25
            batch = []
            batch_num = 1
            post_count = 0
            total_comments = 0

            try:
                # Convert to list to handle iterator exhaustion gracefully
                submissions_list = []
                try:
                    for submission in submissions:
                        try:
                            # Force PRAW to load the submission by accessing an attribute
                            _ = submission.id
                            submissions_list.append(submission)
                            if len(submissions_list) >= limit:
                                break
                        except Exception as sub_error:
                            # Skip submissions that fail to load
                            continue
                except StopIteration:
                    pass  # Iterator exhausted naturally
                except Exception as fetch_error:
                    error_msg = str(fetch_error)
                    if "Ran out of input" in error_msg or "prawcore" in error_msg.lower():
                        # PRAW iterator exhausted - not an error, just end of data
                        pass
                    else:
                        if log_container:
                            st.warning(f"âš ī¸ Stopped early: {error_msg}")
                        if not submissions_list:
                            if log_container:
                                st.error(f"No data could be fetched: {error_msg}")
                            raise

                for i, submission in enumerate(submissions_list):
                    try:
                        # Rate limiting before fetching submission data
                        current_time = time.time()
                        if current_time - self.last_request_time < self.min_delay:
                            time.sleep(self.min_delay - (current_time - self.last_request_time))
                        self.last_request_time = time.time()

                        batch.append(submission)
                        post_count += 1

                        if len(batch) >= batch_size or post_count >= limit:
                            # Process batch
                            for idx, sub in enumerate(batch):
                                try:
                                    # Safely extract all attributes with error handling
                                    try:
                                        post_id = sub.id
                                        post_title = sub.title
                                        post_author = str(sub.author) if sub.author else '[deleted]'
                                        post_created = datetime.fromtimestamp(sub.created_utc, tz=pytz.UTC)
                                        post_score = sub.score
                                        post_comments = sub.num_comments
                                        post_ratio = sub.upvote_ratio
                                        post_text = sub.selftext[:500] if sub.selftext else ''
                                        post_url = sub.url
                                        post_flair = sub.link_flair_text or 'No Flair'
                                        post_video = sub.is_video
                                        post_self = sub.is_self
                                        post_permalink = f"https://reddit.com{sub.permalink}"
                                    except AttributeError as attr_error:
                                        # Missing attribute - skip this post
                                        continue
                                    except Exception as access_error:
                                        # Any other error accessing attributes - skip
                                        continue

                                    post_data = {
                                        'id': post_id,
                                        'title': post_title,
                                        'author': post_author,
                                        'created_utc': post_created,
                                        'score': post_score,
                                        'num_comments': post_comments,
                                        'upvote_ratio': post_ratio,
                                        'selftext': post_text,
                                        'url': post_url,
                                        'subreddit': subreddit_name,
                                        'flair': post_flair,
                                        'is_video': post_video,
                                        'is_self': post_self,
                                        'permalink': post_permalink
                                    }
                                    data.append(post_data)
                                    total_comments += post_data['num_comments']

                                    # Stream the post to UI
                                    stream_post(post_data, stream_container)
                                except Exception as post_error:
                                    # Skip posts that cause any error
                                    continue

                            # Update stats
                            if log_container:
                                unique_authors = len(set(d['author'] for d in data))
                                update_stats(stats_container, len(data), unique_authors, total_comments)

                            batch = []
                            batch_num += 1

                            # Update progress
                            if st.session_state.get('progress_bar'):
                                progress = min(post_count / limit, 1.0)
                                st.session_state.progress_bar.progress(progress)

                        # Stop if we've reached the limit
                        if post_count >= limit:
                            break
                    except StopIteration:
                        break
                    except Exception as iter_error:
                        continue
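                # The tail of the listing can leave `batch` holding fewer than
                # batch_size submissions; the block below flushes it with the
                # same attribute-extraction logic used for full batches.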
                # Process any remaining items in batch
                if batch:
                    for idx, sub in enumerate(batch):
                        try:
                            # Safely extract all attributes
                            try:
                                post_id = sub.id
                                post_title = sub.title
                                post_author = str(sub.author) if sub.author else '[deleted]'
                                post_created = datetime.fromtimestamp(sub.created_utc, tz=pytz.UTC)
                                post_score = sub.score
                                post_comments = sub.num_comments
                                post_ratio = sub.upvote_ratio
                                post_text = sub.selftext[:500] if sub.selftext else ''
                                post_url = sub.url
                                post_flair = sub.link_flair_text or 'No Flair'
                                post_video = sub.is_video
                                post_self = sub.is_self
                                post_permalink = f"https://reddit.com{sub.permalink}"
                            except Exception:
                                # Skip posts that fail attribute access
                                continue

                            post_data = {
                                'id': post_id,
                                'title': post_title,
                                'author': post_author,
                                'created_utc': post_created,
                                'score': post_score,
                                'num_comments': post_comments,
                                'upvote_ratio': post_ratio,
                                'selftext': post_text,
                                'url': post_url,
                                'subreddit': subreddit_name,
                                'flair': post_flair,
                                'is_video': post_video,
                                'is_self': post_self,
                                'permalink': post_permalink
                            }
                            data.append(post_data)
                            total_comments += post_data['num_comments']
                            stream_post(post_data, stream_container)
                        except Exception:
                            # Skip any problematic posts
                            continue
            except StopIteration:
                pass

            # Final stats update
            if log_container:
                unique_authors = len(set(d['author'] for d in data))
                update_stats(stats_container, len(data), unique_authors, total_comments)

        except Exception as e:
            error_msg = str(e)
            # Don't show scary errors for common PRAW issues
            if "Ran out of input" in error_msg or "prawcore" in error_msg.lower():
                if log_container and len(data) == 0:
                    st.warning("âš ī¸ No posts could be fetched. The subreddit may be empty or private.")
            else:
                if log_container:
                    st.error(f"❌ Error: {error_msg}")
                if len(data) == 0:
                    # Only raise if we got no data at all
                    raise

        # Return whatever data we managed to collect
        if len(data) == 0 and log_container:
            st.info("â„šī¸ No posts were collected. Try adjusting your filters or selecting a different subreddit.")

        return pd.DataFrame(data)

    def fetch_subreddit_data(self, subreddit_name: str, sort_by: str = "hot",
                             limit: int = 200, time_filter: str = "month") -> pd.DataFrame:
        """
        Fetch data with manual session-based caching
        """
        # Create cache key
        cache_key = f"{subreddit_name}_{sort_by}_{limit}_{time_filter}"

        # Check if data exists in session state cache
        if 'data_cache' not in st.session_state:
            st.session_state.data_cache = {}

        if cache_key in st.session_state.data_cache:
            cache_entry = st.session_state.data_cache[cache_key]
            # Check if cache is still valid (1 hour TTL)
            if (datetime.now() - cache_entry['timestamp']).total_seconds() < 3600:
                return cache_entry['data']

        # Fetch new data
        df = self.fetch_subreddit_data_verbose(subreddit_name, sort_by, limit, time_filter, None)

        # Store in cache
        st.session_state.data_cache[cache_key] = {
            'data': df,
            'timestamp': datetime.now()
        }
        return df

    def fetch_multiple_subreddits(self, subreddits: List[str], limit_per: int = 100,
                                  sort_by: str = "hot") -> pd.DataFrame:
        """
        Fetch data from multiple subreddits with manual caching

        Args:
            subreddits: List of subreddit names
            limit_per: Posts per subreddit
            sort_by: Sort method

        Returns:
            Combined DataFrame
        """
        # Create cache key
        cache_key = f"multi_{'_'.join(sorted(subreddits))}_{sort_by}_{limit_per}"

        # Check cache
        if 'data_cache' not in st.session_state:
            st.session_state.data_cache = {}

        if cache_key in st.session_state.data_cache:
            cache_entry = st.session_state.data_cache[cache_key]
            # Check if cache is still valid (30 min TTL)
            if (datetime.now() - cache_entry['timestamp']).total_seconds() < 1800:
                return cache_entry['data']

        # Fetch new data
        all_data = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
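            # Caution: a single praw.Reddit instance is shared across these
            # worker threads, and PRAW does not guarantee thread safety, so
            # max_workers is kept small; per-thread Reddit instances would be
            # the safer (if heavier) alternative.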
            future_to_sub = {
                executor.submit(self.fetch_subreddit_data, sub, sort_by, limit_per): sub
                for sub in subreddits
            }
            for future in concurrent.futures.as_completed(future_to_sub):
                sub = future_to_sub[future]
                try:
                    data = future.result()
                    all_data.append(data)
                except Exception as e:
                    st.error(f"Error fetching r/{sub}: {e}")

        if all_data:
            df = pd.concat(all_data, ignore_index=True)
        else:
            df = pd.DataFrame()

        # Store in cache
        st.session_state.data_cache[cache_key] = {
            'data': df,
            'timestamp': datetime.now()
        }
        return df


def create_temporal_visualizations(df: pd.DataFrame) -> Dict[str, go.Figure]:
    """
    Create comprehensive temporal analytics visualizations

    Args:
        df: DataFrame with Reddit data

    Returns:
        Dictionary of Plotly figures
    """
    figures = {}

    # Ensure datetime column
    if 'created_utc' in df.columns:
        df['created_utc'] = pd.to_datetime(df['created_utc'])
        df = df.sort_values('created_utc')

    # Get actual date range of collected data with padding
    date_min = df['created_utc'].min()
    date_max = df['created_utc'].max()
    date_range = (date_max - date_min).days

    # Add 2% padding to prevent edge clipping
    padding = pd.Timedelta(days=max(1, int(date_range * 0.02)))
    date_min_padded = date_min - padding
    date_max_padded = date_max + padding

    # 1. Hourly activity heatmap
    df['hour'] = df['created_utc'].dt.hour
    df['day_of_week'] = df['created_utc'].dt.day_name()

    heatmap_data = df.groupby(['day_of_week', 'hour']).size().reset_index(name='count')
    pivot_data = heatmap_data.pivot(index='day_of_week', columns='hour', values='count').fillna(0)

    # Reorder days
    days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    pivot_data = pivot_data.reindex(days_order)
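    # pivot_data is now a 7 x 24 grid (day-of-week rows, hour-of-day columns)
    # of post counts; fillna(0) above guarantees every cell exists, so the
    # heatmap renders a complete grid even for sparse data.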
    fig_heatmap = go.Figure(data=go.Heatmap(
        z=pivot_data.values,
        x=pivot_data.columns,
        y=pivot_data.index,
        colorscale='RdYlBu_r',
        text=pivot_data.values.astype(int),
        texttemplate='%{text}',
        textfont={"size": 8},
        hovertemplate='%{y}<br>%{x}:00<br>Posts: %{z}'
    ))
    fig_heatmap.update_layout(
        title='Activity Heatmap by Hour and Day',
        xaxis_title='Hour of Day',
        yaxis_title='Day of Week',
        height=400
    )
    figures['heatmap'] = fig_heatmap

    # 2. Time series with rolling average - only include days with actual data
    daily_stats = df.set_index('created_utc').resample('D').agg({
        'id': 'count',
        'score': 'mean',
        'num_comments': 'mean'
    }).rename(columns={'id': 'post_count'})

    # Filter out days with no posts to prevent misleading gaps
    daily_stats = daily_stats[daily_stats['post_count'] > 0]

    # Calculate rolling averages
    daily_stats['post_count_ma7'] = daily_stats['post_count'].rolling(window=7, min_periods=1).mean()
    daily_stats['score_ma7'] = daily_stats['score'].rolling(window=7, min_periods=1).mean()

    fig_timeline = make_subplots(
        rows=2, cols=1,
        subplot_titles=('Daily Post Activity', 'Average Engagement Metrics'),
        vertical_spacing=0.1
    )

    # Post count
    fig_timeline.add_trace(
        go.Scatter(x=daily_stats.index, y=daily_stats['post_count'],
                   mode='markers', name='Daily Posts', opacity=0.5,
                   marker=dict(size=5, color='lightblue')),
        row=1, col=1
    )
    fig_timeline.add_trace(
        go.Scatter(x=daily_stats.index, y=daily_stats['post_count_ma7'],
                   mode='lines', name='7-Day Average',
                   line=dict(color='blue', width=2), connectgaps=False),
        row=1, col=1
    )

    # Engagement metrics
    fig_timeline.add_trace(
        go.Scatter(x=daily_stats.index, y=daily_stats['score_ma7'],
                   mode='lines', name='Avg Score (7-day)',
                   line=dict(color='orange'), connectgaps=False),
        row=2, col=1
    )
    fig_timeline.add_trace(
        go.Scatter(x=daily_stats.index,
                   y=daily_stats['num_comments'].rolling(window=7, min_periods=1).mean(),
                   mode='lines', name='Avg Comments (7-day)',
                   line=dict(color='green'), connectgaps=False),
        row=2, col=1
    )

    fig_timeline.update_layout(
        height=600,
        showlegend=True,
        title=f'Activity Timeline ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})',
        xaxis=dict(type='date', autorange=True),
        xaxis2=dict(type='date', autorange=True)
    )
    fig_timeline.update_xaxes(title_text="Date", row=2, col=1)
    fig_timeline.update_yaxes(title_text="Count", row=1, col=1)
    fig_timeline.update_yaxes(title_text="Value", row=2, col=1)
    figures['timeline'] = fig_timeline
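    # Note: the 'M' (month-end) and 'W' resample aliases below are the classic
    # pandas spellings; pandas 2.2+ prefers 'ME' for month-end and emits a
    # FutureWarning on 'M', so swap it in if you target newer pandas.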
    # 3. Monthly trend analysis (only show if data spans at least 30 days)
    if date_range >= 30:
        monthly_data = df.set_index('created_utc').resample('M').agg({
            'id': 'count',
            'score': ['mean', 'sum'],
            'num_comments': ['mean', 'sum']
        })
        # Filter out months with no posts
        monthly_data = monthly_data[monthly_data[('id', 'count')] > 0]

        fig_monthly = go.Figure()
        fig_monthly.add_trace(go.Bar(
            x=monthly_data.index,
            y=monthly_data[('id', 'count')],
            name='Monthly Posts',
            marker_color='lightblue'
        ))
        fig_monthly.add_trace(go.Scatter(
            x=monthly_data.index,
            y=monthly_data[('score', 'mean')],
            name='Avg Score',
            yaxis='y2',
            line=dict(color='red', width=2),
            connectgaps=False
        ))
        fig_monthly.update_layout(
            title=f'Monthly Posting Trends ({date_min.strftime("%Y-%m")} to {date_max.strftime("%Y-%m")})',
            xaxis_title='Month',
            xaxis=dict(type='date', autorange=True),
            yaxis=dict(title='Post Count', side='left'),
            yaxis2=dict(title='Average Score', side='right', overlaying='y'),
            height=400,
            hovermode='x unified'
        )
        figures['monthly'] = fig_monthly
    else:
        # For shorter periods, show weekly trends instead
        weekly_data = df.set_index('created_utc').resample('W').agg({
            'id': 'count',
            'score': ['mean', 'sum'],
            'num_comments': ['mean', 'sum']
        })
        # Filter out weeks with no posts
        weekly_data = weekly_data[weekly_data[('id', 'count')] > 0]

        fig_weekly = go.Figure()
        fig_weekly.add_trace(go.Bar(
            x=weekly_data.index,
            y=weekly_data[('id', 'count')],
            name='Weekly Posts',
            marker_color='lightblue'
        ))
        fig_weekly.add_trace(go.Scatter(
            x=weekly_data.index,
            y=weekly_data[('score', 'mean')],
            name='Avg Score',
            yaxis='y2',
            line=dict(color='red', width=2),
            connectgaps=False
        ))
        fig_weekly.update_layout(
            title=f'Weekly Posting Trends ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})',
            xaxis_title='Week',
            xaxis=dict(type='date', autorange=True),
            yaxis=dict(title='Post Count', side='left'),
            yaxis2=dict(title='Average Score', side='right', overlaying='y'),
            height=400,
            hovermode='x unified'
        )
        figures['monthly'] = fig_weekly  # Use same key for consistency

    # 4. Posting patterns by flair
    if 'flair' in df.columns:
        flair_time = df.groupby([pd.Grouper(key='created_utc', freq='W'), 'flair']).size().reset_index(name='count')
        top_flairs = df['flair'].value_counts().head(10).index
        flair_time_filtered = flair_time[flair_time['flair'].isin(top_flairs)]

        fig_flair = px.line(
            flair_time_filtered, x='created_utc', y='count', color='flair',
            title=f'Weekly Posting Patterns by Flair ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})',
            labels={'count': 'Number of Posts', 'created_utc': 'Week'}
        )
        fig_flair.update_layout(
            height=400,
            xaxis=dict(type='date', autorange=True)
        )
        figures['flair'] = fig_flair

    return figures


def create_engagement_analytics(df: pd.DataFrame) -> Dict[str, go.Figure]:
    """
    Create engagement and interaction analytics

    Args:
        df: DataFrame with Reddit data

    Returns:
        Dictionary of engagement figures
    """
    figures = {}

    # 1. Score distribution
    fig_score_dist = go.Figure()
    fig_score_dist.add_trace(go.Histogram(
        x=df['score'],
        nbinsx=50,
        name='Score Distribution',
        marker_color='orange'
    ))
    fig_score_dist.update_layout(
        title='Post Score Distribution',
        xaxis_title='Score',
        yaxis_title='Frequency',
        height=350
    )
    figures['score_dist'] = fig_score_dist
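    # In the scatter below, marker size is driven by upvote_ratio (a value in
    # [0, 1], so size differences are subtle) and color by the boolean is_self
    # column, separating self posts from link posts.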
    # 2. Engagement correlation
    fig_correlation = px.scatter(
        df, x='score', y='num_comments',
        size='upvote_ratio', color='is_self',
        title='Score vs Comments Correlation',
        labels={'score': 'Post Score', 'num_comments': 'Number of Comments',
                'is_self': 'Post Type', 'upvote_ratio': 'Upvote Ratio'},
        hover_data=['title']
    )
    fig_correlation.update_layout(height=400)
    figures['correlation'] = fig_correlation

    # 3. Top performing posts
    top_posts = df.nlargest(10, 'score')[['title', 'score', 'num_comments', 'author']]
    fig_top = go.Figure(data=[
        go.Bar(name='Score', x=top_posts['title'].str[:30] + '...', y=top_posts['score']),
        go.Bar(name='Comments', x=top_posts['title'].str[:30] + '...', y=top_posts['num_comments'])
    ])
    fig_top.update_layout(
        title='Top 10 Posts by Engagement',
        barmode='group',
        height=400,
        xaxis_tickangle=-45
    )
    figures['top_posts'] = fig_top

    return figures


def main():
    """Main application function"""
    # Initialize session state
    if 'scraper' not in st.session_state:
        st.session_state.scraper = None
    if 'advanced_scraper' not in st.session_state:
        st.session_state.advanced_scraper = None
    if 'data' not in st.session_state:
        st.session_state.data = pd.DataFrame()
    if 'last_fetch' not in st.session_state:
        st.session_state.last_fetch = None
    if 'comment_hierarchies' not in st.session_state:
        st.session_state.comment_hierarchies = {}

    # Header

    st.markdown('<h1>📊 Reddit Research Dashboard</h1>', unsafe_allow_html=True)
    st.markdown("Optimized for high-volume data collection and temporal analytics")

    # Load .env file if it exists
    env_vars = load_env_file(".env")
    if not env_vars:
        # Try parent directory
        env_vars = load_env_file("../.env")

    # Sidebar configuration
    with st.sidebar:
        st.header("âš™ī¸ Configuration")

        # Show .env detection status
        if env_vars:
            st.success("✅ .env file detected and loaded")

        # API Credentials - prioritize .env, then environment variables
        with st.expander("🔑 Reddit API Credentials", expanded=not bool(env_vars)):
            default_client_id = env_vars.get("REDDIT_CLIENT_ID", os.environ.get("REDDIT_CLIENT_ID", ""))
            default_client_secret = env_vars.get("REDDIT_CLIENT_SECRET", os.environ.get("REDDIT_CLIENT_SECRET", ""))
            default_user_agent = env_vars.get("REDDIT_USER_AGENT", os.environ.get("REDDIT_USER_AGENT", "RedditResearch/1.0"))

            client_id = st.text_input(
                "Client ID",
                value=default_client_id,
                type="password",
                help="Your Reddit API client ID (auto-populated from .env if available)"
            )
            client_secret = st.text_input(
                "Client Secret",
                value=default_client_secret,
                type="password",
                help="Your Reddit API client secret (auto-populated from .env if available)"
            )
            user_agent = st.text_input(
                "User Agent",
                value=default_user_agent,
                help="User agent string for API requests (auto-populated from .env if available)"
            )

            if st.button("Initialize Scraper", type="primary"):
                if client_id and client_secret:
                    try:
                        st.session_state.scraper = OptimizedRedditScraper(
                            client_id, client_secret, user_agent
                        )
                        if ADVANCED_FEATURES:
                            st.session_state.advanced_scraper = AdvancedRedditScraper(
                                client_id, client_secret, user_agent
                            )
                            st.success("✅ Scrapers initialized successfully (with advanced features)!")
                        else:
                            st.success("✅ Scraper initialized successfully!")
                    except Exception as e:
                        st.error(f"❌ Failed to initialize: {e}")
                else:
                    st.warning("âš ī¸ Please provide API credentials")

        # Data Collection Settings
        st.header("đŸ“Ĩ Data Collection")

        # Show advanced mode only if features are available
        if ADVANCED_FEATURES:
            collection_mode = st.radio(
                "Collection Mode",
                ["Single Subreddit", "Multiple Subreddits", "Advanced with Hierarchy"]
            )
        else:
            collection_mode = st.radio(
                "Collection Mode",
                ["Single Subreddit", "Multiple Subreddits"]
            )

        if collection_mode == "Single Subreddit":
            subreddit_name = st.text_input("Subreddit Name", value="CUNY")
            subreddits = [subreddit_name]
        elif collection_mode == "Multiple Subreddits":
            subreddit_input = st.text_area(
                "Subreddits (one per line)",
                value="CUNY\nBaruch\nHunterCollege",
                height=100
            )
            subreddits = [s.strip() for s in subreddit_input.split('\n') if s.strip()]
        else:
            # Advanced with Hierarchy (only if ADVANCED_FEATURES is True)
            subreddit_name = st.text_input("Subreddit Name", value="CUNY")
            subreddits = [subreddit_name]
            use_checkpoint = st.checkbox("Enable checkpoint/resume", value=True)
            if use_checkpoint:
                checkpoint_name = st.text_input("Checkpoint name", value=f"{subreddit_name}_checkpoint")

        # Advanced settings
        with st.expander("âš™ī¸ Advanced Settings"):
            sort_by = st.selectbox(
                "Sort By",
                ["hot", "new", "top", "rising"],
                help="How to sort posts"
            )
            limit = st.slider(
                "Posts per Subreddit",
                min_value=50,
                max_value=500,
                value=200,
                step=50,
                help="Number of posts to fetch (optimized for 200+)"
            )
            if sort_by == "top":
                time_filter = st.selectbox(
                    "Time Filter",
                    ["hour", "day", "week", "month", "year", "all"],
                    index=3
                )
            else:
                time_filter = "month"

            batch_size = st.number_input(
                "Batch Size",
                min_value=10,
                max_value=50,
                value=25,
                help="Posts processed per batch"
            )
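            # NOTE: the batch_size above and cache_ttl below are surfaced in
            # the UI but not wired through to the scraper, which currently
            # hard-codes a batch size of 25 and TTLs of 60 min (single) /
            # 30 min (multi).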
            cache_ttl = st.number_input(
                "Cache Duration (minutes)",
                min_value=5,
                max_value=120,
                value=60,
                help="How long to cache results"
            )

    # Main content area with tabs
    if st.session_state.scraper:
        # Create main tabs
        main_tab1, main_tab2 = st.tabs(["đŸ“Ĩ Live Collection", "📊 Analytics & Metrics"])

        with main_tab1:
            st.header("Live Data Collection")

            col1, col2, col3 = st.columns(3)

            with col1:
                if st.button("🚀 Start Collection", type="primary", width="stretch"):
                    # Initialize/clear stream posts
                    st.session_state.stream_posts = []

                    # Create display containers
                    status_text = st.empty()
                    progress_bar = st.progress(0)
                    st.session_state.progress_bar = progress_bar

                    # Fixed position containers for stats and stream
                    stats_placeholder = st.empty()
                    stream_placeholder = st.empty()

                    status_text.info("🚀 Starting collection...")

                    try:
                        if collection_mode == "Advanced with Hierarchy":
                            # Advanced scraping with comment hierarchies
                            status_text.info(f"Advanced scraping r/{subreddits[0]}...")
                            checkpoint = checkpoint_name if use_checkpoint else None
                            results = st.session_state.advanced_scraper.scrape_with_hierarchy(
                                subreddits[0],
                                limit=limit,
                                checkpoint_name=checkpoint
                            )
                            st.session_state.comment_hierarchies = results['hierarchies']

                            # Convert to DataFrame
                            df = pd.DataFrame(results['submissions'])
                            if df.empty:
                                df = pd.DataFrame()
                            else:
                                df['created_utc'] = pd.to_datetime(df['created_utc'])
                            st.session_state.data = df
                            status_text.success(f"✅ Scraped {len(results['submissions'])} posts with {len(results['comments'])} comments!")

                        elif len(subreddits) == 1:
                            # Standard single subreddit with streaming
                            status_text.info(f"Collecting from r/{subreddits[0]}...")

                            # Show header for stats
                            with stats_placeholder:
                                st.subheader("📊 Live Collection Progress")

                            # Pass the placeholders to the scraper
                            df = st.session_state.scraper.fetch_subreddit_data_verbose(
                                subreddits[0], sort_by, limit, time_filter,
                                (stats_placeholder, stream_placeholder)
                            )
                            st.session_state.data = df if not df.empty else pd.DataFrame()
                            if len(df) > 0:
                                status_text.success(f"✅ Collected {len(df)} posts!")
                            else:
                                status_text.warning("âš ī¸ No posts collected")

                        else:
                            # Multiple subreddits with streaming
                            status_text.info(f"Collecting from {len(subreddits)} subreddits...")

                            with stats_placeholder:
                                st.subheader("📊 Live Collection Progress")

                            all_data = []
                            for idx, sub in enumerate(subreddits):
                                status_text.info(f"Collecting {idx+1}/{len(subreddits)}: r/{sub}...")
                                df = st.session_state.scraper.fetch_subreddit_data_verbose(
                                    sub, sort_by, limit, time_filter,
                                    (stats_placeholder, stream_placeholder)
                                )
                                all_data.append(df)

                            if all_data:
                                df = pd.concat(all_data, ignore_index=True)
                            else:
                                df = pd.DataFrame()
                            st.session_state.data = df
                            status_text.success(f"✅ Collected {len(df)} total posts!")

                        st.session_state.last_fetch = datetime.now()

                    except Exception as e:
                        error_msg = str(e)
                        # Don't show PRAW iterator exhaustion errors
                        if "Ran out of input" not in error_msg and "prawcore" not in error_msg.lower():
                            status_text.error(f"❌ Error: {error_msg}")
                        elif st.session_state.data.empty:
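                            # Benign PRAW iterator exhaustion but nothing was
                            # collected: show a gentle hint instead of an error.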
                            status_text.warning("âš ī¸ No posts could be fetched. Try adjusting your filters.")

            with col2:
                if not st.session_state.data.empty:
                    st.download_button(
                        "đŸ“Ĩ Download CSV",
                        st.session_state.data.to_csv(index=False),
                        file_name=f"reddit_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                        mime="text/csv",
                        width="stretch"
                    )

            with col3:
                if st.session_state.last_fetch:
                    st.info(f"Last: {st.session_state.last_fetch.strftime('%H:%M:%S')}")

        # Analytics & Metrics Tab
        with main_tab2:
            if not st.session_state.data.empty:
                df = st.session_state.data

                # Summary metrics at top
                st.header("📈 Summary Metrics")
                col1, col2, col3, col4, col5 = st.columns(5)
                with col1:
                    st.metric("Total Posts", f"{len(df):,}")
                with col2:
                    st.metric("Unique Authors", f"{df['author'].nunique():,}")
                with col3:
                    st.metric("Avg Score", f"{df['score'].mean():.1f}")
                with col4:
                    st.metric("Avg Comments", f"{df['num_comments'].mean():.1f}")
                with col5:
                    st.metric("Subreddits", len(df['subreddit'].unique()))

                # Tabbed interface for different analyses
                tab1, tab2, tab3, tab4, tab5 = st.tabs([
                    "📊 Temporal Analytics",
                    "đŸ’Ŧ Engagement Analysis",
                    "📋 Raw Data",
                    "🔍 Search & Filter",
                    "đŸŒŗ Comment Hierarchies"
                ])

                with tab1:
                    st.header("Temporal Analytics")

                    # Generate temporal visualizations
                    temporal_figs = create_temporal_visualizations(df)

                    # Activity heatmap
                    st.plotly_chart(temporal_figs.get('heatmap'), use_container_width=True)

                    # Time series
                    st.plotly_chart(temporal_figs.get('timeline'), use_container_width=True)

                    # Monthly trends
                    col1, col2 = st.columns(2)
                    with col1:
                        st.plotly_chart(temporal_figs.get('monthly'), use_container_width=True)
                    with col2:
                        if 'flair' in temporal_figs:
                            st.plotly_chart(temporal_figs.get('flair'), use_container_width=True)

                with tab2:
                    st.header("Engagement Analysis")

                    engagement_figs = create_engagement_analytics(df)

                    col1, col2 = st.columns(2)
                    with col1:
                        st.plotly_chart(engagement_figs['score_dist'], use_container_width=True)
                    with col2:
                        st.plotly_chart(engagement_figs['correlation'], use_container_width=True)

                    st.plotly_chart(engagement_figs['top_posts'], use_container_width=True)

                with tab3:
                    st.header("Raw Data View")

                    # Data filtering options
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        min_score = st.number_input("Min Score", value=0)
                    with col2:
                        min_comments = st.number_input("Min Comments", value=0)
                    with col3:
                        author_filter = st.text_input("Author Filter")

                    # Apply filters
                    filtered_df = df[
                        (df['score'] >= min_score) &
                        (df['num_comments'] >= min_comments)
                    ]
                    if author_filter:
                        filtered_df = filtered_df[
                            filtered_df['author'].str.contains(author_filter, case=False, na=False)
                        ]

                    st.dataframe(
                        filtered_df[['title', 'author', 'score', 'num_comments',
                                     'created_utc', 'subreddit', 'flair']],
                        width="stretch",
                        height=500
                    )
                    st.info(f"Showing {len(filtered_df)} of {len(df)} posts")

                with tab4:
                    st.header("Search & Filter")

                    search_query = st.text_input("Search in titles and text", placeholder="Enter keywords...")

                    if search_query:
                        mask = (
                            df['title'].str.contains(search_query, case=False, na=False) |
                            df['selftext'].str.contains(search_query, case=False, na=False)
                        )
                        search_results = df[mask]

                        st.info(f"Found {len(search_results)} posts matching '{search_query}'")

                        if not search_results.empty:
                            for idx, row in search_results.head(10).iterrows():
                                with st.expander(f"📝 {row['title'][:100]}..."):
                                    col1, col2, col3 = st.columns(3)
                                    with col1:
                                        st.metric("Score", row['score'])
                                    with col2:
                                        st.metric("Comments", row['num_comments'])
                                    with col3:
                                        st.metric("Date", row['created_utc'].strftime('%Y-%m-%d'))
                                    st.write(f"**Author:** u/{row['author']}")
                                    st.write(f"**Subreddit:** r/{row['subreddit']}")
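                                    # selftext was already truncated to 500
                                    # chars at collection time; slicing again
                                    # below is just a display guard.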
st.write(f"**Text:** {row['selftext'][:500]}...") st.write(f"[View on Reddit]({row['permalink']})") with tab5: st.header("Comment Hierarchies") if not ADVANCED_FEATURES: st.info("âš ī¸ Comment hierarchy analysis requires additional dependencies. This feature is optional and not needed for basic data collection.") elif st.session_state.comment_hierarchies: # Select submission to view submission_ids = list(st.session_state.comment_hierarchies.keys()) selected_sub = st.selectbox("Select Submission", submission_ids) if selected_sub: hierarchy = st.session_state.comment_hierarchies[selected_sub] # Display submission info if hierarchy['submission']: st.subheader(f"📝 {hierarchy['submission'].get('title', 'No Title')}") col1, col2, col3 = st.columns(3) with col1: st.metric("Score", hierarchy['submission'].get('score', 0)) with col2: st.metric("Comments", len(hierarchy.get('comments', {}))) with col3: st.metric("Author", hierarchy['submission'].get('author', '[deleted]')) # Visualize comment tree def display_comment_tree(comments, level=0): for comment_id, comment in comments.items(): indent = " " * level with st.expander(f"{indent}đŸ’Ŧ {comment.get('author', '[deleted]')} - Score: {comment.get('score', 0)}"): st.write(comment.get('body', '')[:500]) if 'replies' in comment and comment['replies']: st.write("**Replies:**") display_comment_tree(comment['replies'], level + 1) st.subheader("Comment Thread Structure") if hierarchy.get('hierarchy'): display_comment_tree(hierarchy['hierarchy']) else: st.info("No comments found for this submission") # Orphan statistics if st.session_state.get('advanced_scraper'): orphan_stats = st.session_state.advanced_scraper.hierarchy_tracker.get_orphan_statistics() if orphan_stats['orphaned_count'] > 0: st.warning(f"âš ī¸ {orphan_stats['orphaned_count']} orphaned comments detected ({orphan_stats['orphan_rate']:.1%} orphan rate)") else: st.info("Use 'Advanced with Hierarchy' collection mode to analyze comment structures") else: # Empty state - no data collected yet st.info("👆 Configure your settings in the sidebar and click 'Start Collection' to begin") # Quick start guide with st.expander("🚀 Quick Start Guide"): st.markdown(""" ### Getting Started 1. **Set up API Credentials** - Get your Reddit API credentials from [reddit.com/prefs/apps](https://www.reddit.com/prefs/apps) - Enter them in the sidebar - Click "Initialize Scraper" 2. **Choose Collection Mode** - **Single Subreddit**: Analyze one community in depth - **Multiple Subreddits**: Collect from multiple communities 3. **Configure Settings** - Adjust the number of posts (200+ recommended) - Choose sort method (hot, new, top, rising) - Set time filter for top posts 4. **Fetch & Analyze** - Click "Fetch Data" to start collection - Explore temporal patterns, engagement metrics - Export results as CSV for further analysis ### Features - **Batch Processing**: Efficiently handles 200+ posts - **Caching**: Reduces API calls with smart caching - **Temporal Analytics**: Hour/day/month patterns - **Engagement Metrics**: Score, comments, correlations """) else: st.warning("âš ī¸ Please initialize the scraper with your Reddit API credentials in the sidebar") # API setup instructions with st.expander("📖 How to get Reddit API credentials"): st.markdown(""" ### Setting up Reddit API Access 1. **Create a Reddit Account** (if you don't have one) - Go to [reddit.com](https://www.reddit.com) and sign up 2. 
            2. **Create an App**
               - Visit [reddit.com/prefs/apps](https://www.reddit.com/prefs/apps)
               - Click "Create App" or "Create Another App"
               - Fill in the form:
                 - **Name**: Your app name (e.g., "Research Dashboard")
                 - **App Type**: Select "script"
                 - **Description**: Optional
                 - **About URL**: Leave blank
                 - **Redirect URI**: http://localhost:8000
               - Click "Create app"

            3. **Get Your Credentials**
               - **Client ID**: The string under "personal use script"
               - **Client Secret**: The secret key shown
               - **User Agent**: Format: "Platform:AppName:Version (by /u/YourUsername)"

            4. **Enter in Sidebar**
               - Copy your credentials to the sidebar fields
               - Click "Initialize Scraper"
            """)


if __name__ == "__main__":
    main()
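
# A minimal way to launch the dashboard (assuming this file is saved as
# app.py; adjust the name to match your checkout):
#
#   streamlit run app.py
#
# Credentials can be typed into the sidebar or supplied via a .env file
# defining REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET and REDDIT_USER_AGENT in
# the working directory or its parent.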