Spaces:

rottg
/

telegram-analytics

Running

App Files Files Community

telegram-analytics / analyzer.py

rottg

Upload folder using huggingface_hub

4a21e7e about 1 month ago

raw

history blame contribute delete

34 kB

	#!/usr/bin/env python3
	"""
	Telegram Chat Analytics (Enhanced with Course Algorithms)

	Features:
	- LCS-based similar message detection
	- Heap-based Top-K (O(n log k) instead of O(n log n))
	- Selection algorithm for O(n) median/percentiles
	- Rank Tree for order statistics queries
	- Bucket Sort for time-based histograms

	Usage:
	python analyzer.py --db telegram.db [options]
	python analyzer.py --stats
	python analyzer.py --top-users
	python analyzer.py --similar # NEW: Find similar messages
	python analyzer.py --percentiles # NEW: Message length percentiles
	python analyzer.py --user-rank USER # NEW: Get user's rank
	"""

	import sqlite3
	import argparse
	import json
	from collections import Counter
	from datetime import datetime
	from typing import Optional
	import re

	# Import course algorithms
	from algorithms import (
	# LCS
	lcs_similarity, find_similar_messages,
	# Top-K
	TopK, top_k_frequent, top_k_by_field,
	# Selection
	find_median, find_percentile,
	# Rank Tree
	RankTree,
	# Bucket Sort
	bucket_sort_by_time, time_histogram, hourly_distribution,
	# Combined
	RankedTimeIndex
	)


	class TelegramAnalyzer:
	    """
	    Analytics interface for indexed Telegram messages.

	    Reads from a SQLite database that is expected to contain a ``messages``
	    table and an ``entities`` table (the column names used are visible in the
	    queries below). Heavy lifting is delegated to the helpers imported from
	    the ``algorithms`` module:

	    - Top-K queries via ``TopK`` / ``top_k_frequent``
	    - Percentiles via ``find_median`` / ``find_percentile``
	    - Rank queries via ``RankTree``
	    - Similar messages via ``find_similar_messages`` / ``lcs_similarity``
	    - Time histograms via ``time_histogram`` / ``hourly_distribution``

	    The instance owns its connection and can be used as a context manager.
	    """

	    def __init__(self, db_path: str = 'telegram.db'):
	        """Open ``db_path`` and configure rows to be addressable by column name."""
	        self.db_path = db_path
	        self.conn = sqlite3.connect(db_path)
	        self.conn.row_factory = sqlite3.Row

	        # Lazy-loaded data structures (annotations quoted: project types).
	        self._rank_tree: Optional["RankTree"] = None
	        self._time_index: Optional["RankedTimeIndex"] = None

	    def close(self):
	        """Close the underlying database connection."""
	        self.conn.close()

	    def __enter__(self):
	        return self

	    def __exit__(self, *args):
	        self.close()

	    # ==========================================
	    # ORIGINAL METHODS (kept for compatibility)
	    # ==========================================

	    def get_stats(self) -> dict:
	        """
	        Get general statistics about the indexed data.

	        Returns:
	            Dict with message/user totals, first/last message timestamps
	            (ISO strings in local time, present only when timestamps exist),
	            per-flag message counts, entity counts by type and, when any
	            non-empty text exists, median and 90th-percentile text length.
	        """
	        conn = self.conn
	        stats = {}

	        stats['total_messages'] = conn.execute(
	            'SELECT COUNT(*) FROM messages'
	        ).fetchone()[0]
	        stats['total_users'] = conn.execute(
	            'SELECT COUNT(DISTINCT from_id) FROM messages'
	        ).fetchone()[0]

	        row = conn.execute('''
	            SELECT MIN(date_unixtime), MAX(date_unixtime) FROM messages
	            WHERE date_unixtime IS NOT NULL
	        ''').fetchone()
	        if row[0] and row[1]:
	            stats['first_message'] = datetime.fromtimestamp(row[0]).isoformat()
	            stats['last_message'] = datetime.fromtimestamp(row[1]).isoformat()
	            stats['days_span'] = (row[1] - row[0]) // 86400

	        # Simple single-value counters; the order fixes the key order of the result.
	        counters = (
	            ('messages_with_media',
	             'SELECT COUNT(*) FROM messages WHERE has_media = 1'),
	            ('messages_with_links',
	             'SELECT COUNT(*) FROM messages WHERE has_links = 1'),
	            ('messages_with_mentions',
	             'SELECT COUNT(*) FROM messages WHERE has_mentions = 1'),
	            ('forwarded_messages',
	             'SELECT COUNT(*) FROM messages WHERE forwarded_from IS NOT NULL'),
	            ('reply_messages',
	             'SELECT COUNT(*) FROM messages WHERE reply_to_message_id IS NOT NULL'),
	            ('edited_messages',
	             'SELECT COUNT(*) FROM messages WHERE is_edited = 1'),
	        )
	        for key, sql in counters:
	            stats[key] = conn.execute(sql).fetchone()[0]

	        cursor = conn.execute('SELECT type, COUNT(*) FROM entities GROUP BY type')
	        stats['entities'] = {row[0]: row[1] for row in cursor.fetchall()}

	        # Percentile stats are only meaningful when at least one text exists.
	        lengths = self._get_message_lengths()
	        if lengths:
	            stats['median_message_length'] = find_median(lengths)
	            stats['p90_message_length'] = find_percentile(lengths, 90)

	        return stats

	    def _get_message_lengths(self) -> list[int]:
	        """Return the length of every non-NULL, non-empty ``text_plain``."""
	        cursor = self.conn.execute(
	            'SELECT length(text_plain) FROM messages WHERE text_plain IS NOT NULL'
	        )
	        # Zero-length texts are dropped by the truthiness test.
	        return [row[0] for row in cursor.fetchall() if row[0]]

	    # ==========================================
	    # ENHANCED TOP-K METHODS (using Heap)
	    # ==========================================

	    def get_top_users(self, limit: int = 20) -> list[dict]:
	        """
	        Get most active users by message count.

	        Per-user aggregates are computed in SQL; the ``limit`` best rows are
	        then picked with ``TopK`` keyed on ``message_count``.

	        Returns:
	            Whatever ``TopK.get_top()`` yields for the pushed row dicts
	            (presumably sorted by descending message count -- confirm in
	            ``algorithms``).
	        """
	        cursor = self.conn.execute('''
	            SELECT
	                from_id,
	                from_name,
	                COUNT(*) as message_count,
	                SUM(has_links) as links_shared,
	                SUM(has_media) as media_shared,
	                MIN(date_unixtime) as first_message,
	                MAX(date_unixtime) as last_message
	            FROM messages
	            WHERE from_id IS NOT NULL AND from_id != ''
	            GROUP BY from_id
	        ''')

	        top = TopK(limit, key=lambda x: x['message_count'])
	        for row in cursor.fetchall():
	            top.push(dict(row))

	        return top.get_top()

	    def get_top_words_heap(self, limit: int = 50, min_length: int = 3) -> list[tuple[str, int]]:
	        """
	        Get most frequent words using ``top_k_frequent``.

	        Words are runs of Hebrew (U+0590-U+05FF) or ASCII letters taken from
	        the lower-cased message text; words shorter than ``min_length`` are
	        ignored.
	        """
	        cursor = self.conn.execute('SELECT text_plain FROM messages WHERE text_plain IS NOT NULL')

	        word_pattern = re.compile(r'[\u0590-\u05FFa-zA-Z]+')
	        words = [
	            word
	            for row in cursor.fetchall()
	            for word in word_pattern.findall(row[0].lower())
	            if len(word) >= min_length
	        ]

	        return top_k_frequent(words, limit)

	    def get_top_domains_heap(self, limit: int = 20) -> list[tuple[str, int]]:
	        """
	        Get most shared domains using ``top_k_frequent``.

	        Only ``link`` entities that start with ``http://`` or ``https://`` are
	        counted; a leading ``www.`` is stripped from the host.
	        """
	        cursor = self.conn.execute("SELECT value FROM entities WHERE type = 'link'")

	        domain_pattern = re.compile(r'https?://(?:www\.)?([^/]+)')
	        domains = []

	        for row in cursor.fetchall():
	            value = row[0]
	            if not value:
	                # NULL/empty entity values cannot be matched; skip instead of raising.
	                continue
	            match = domain_pattern.match(value)
	            if match:
	                domains.append(match.group(1))

	        return top_k_frequent(domains, limit)

	    # ==========================================
	    # LCS-BASED SIMILAR MESSAGE DETECTION
	    # ==========================================

	    def find_similar_messages(
	        self,
	        threshold: float = 0.7,
	        min_length: int = 30,
	        limit: int = 100,
	        sample_size: int = 1000
	    ) -> list[tuple[int, int, float, str, str]]:
	        """
	        Find similar/duplicate messages using the LCS helper.

	        A random sample of at most ``sample_size`` messages whose text is at
	        least ``min_length`` characters long is compared pairwise by the
	        module-level ``find_similar_messages`` helper.

	        Args:
	            threshold: Minimum similarity (0-1)
	            min_length: Minimum message length to consider
	            limit: Maximum pairs to return
	            sample_size: Sample size for large datasets

	        Returns:
	            List of (id1, id2, similarity, text1, text2) tuples, where each
	            text is the first 100 characters of the message with the
	            matching ID.
	        """
	        cursor = self.conn.execute('''
	            SELECT id, text_plain FROM messages
	            WHERE text_plain IS NOT NULL AND length(text_plain) >= ?
	            ORDER BY RANDOM()
	            LIMIT ?
	        ''', (min_length, sample_size))

	        messages = [(row[0], row[1]) for row in cursor.fetchall()]

	        # Inside a method the bare name resolves to the imported helper,
	        # not to this method.
	        similar_pairs = find_similar_messages(messages, threshold, min_length)

	        # Look texts up by ID in the sample instead of re-querying with
	        # ``id IN (?, ?)``: that query has no ORDER BY, so its rows could
	        # come back in either order and attach a preview to the wrong ID.
	        # Assumes the helper only returns IDs taken from ``messages``.
	        text_by_id = dict(messages)

	        results = []
	        for id1, id2, sim in similar_pairs[:limit]:
	            text1 = text_by_id.get(id1)
	            text2 = text_by_id.get(id2)
	            if text1 is None or text2 is None:
	                continue
	            results.append((id1, id2, sim, text1[:100], text2[:100]))

	        return results

	    def find_reposts(self, threshold: float = 0.9) -> list[dict]:
	        """
	        Find potential reposts (very similar messages from different users).

	        Only the 500 most recent messages with at least 50 characters of text
	        are compared, pairwise, with ``lcs_similarity``.

	        Returns:
	            List of dicts describing each matching pair, sorted by
	            descending similarity.
	        """
	        cursor = self.conn.execute('''
	            SELECT id, from_id, text_plain FROM messages
	            WHERE text_plain IS NOT NULL AND length(text_plain) >= 50
	            ORDER BY date_unixtime DESC
	            LIMIT 500
	        ''')

	        messages = [(row[0], row[1], row[2]) for row in cursor.fetchall()]
	        reposts = []

	        for i, (id1, user1, text1) in enumerate(messages):
	            for id2, user2, text2 in messages[i + 1:]:
	                # Only consider different users
	                if user1 == user2:
	                    continue

	                sim = lcs_similarity(text1, text2)
	                if sim >= threshold:
	                    reposts.append({
	                        'message_id_1': id1,
	                        'message_id_2': id2,
	                        'user_1': user1,
	                        'user_2': user2,
	                        'similarity': sim,
	                        'text_preview': text1[:80]
	                    })

	        return sorted(reposts, key=lambda x: x['similarity'], reverse=True)

	    # ==========================================
	    # SELECTION ALGORITHM (PERCENTILES)
	    # ==========================================

	    def get_message_length_stats(self) -> dict:
	        """
	        Get message length statistics via the selection helpers.

	        Returns:
	            Empty dict when there are no non-empty texts; otherwise count,
	            min, max, median and the 25/75/90/95/99th percentiles.
	        """
	        lengths = self._get_message_lengths()

	        if not lengths:
	            return {}

	        return {
	            'count': len(lengths),
	            'min': min(lengths),
	            'max': max(lengths),
	            'median': find_median(lengths),
	            'p25': find_percentile(lengths, 25),
	            'p75': find_percentile(lengths, 75),
	            'p90': find_percentile(lengths, 90),
	            'p95': find_percentile(lengths, 95),
	            'p99': find_percentile(lengths, 99),
	        }

	    def get_response_time_percentiles(self) -> dict:
	        """
	        Calculate response time percentiles for replies.

	        A response time is the timestamp difference between a reply and the
	        message it replies to; only strictly positive differences count.

	        Returns:
	            Empty dict when no reply qualifies; otherwise count, median and
	            the 75/90/95th percentiles, all in seconds.
	        """
	        cursor = self.conn.execute('''
	            SELECT
	                m1.date_unixtime - m2.date_unixtime as response_time
	            FROM messages m1
	            JOIN messages m2 ON m1.reply_to_message_id = m2.id
	            WHERE m1.date_unixtime > m2.date_unixtime
	        ''')

	        times = [row[0] for row in cursor.fetchall() if row[0] and row[0] > 0]

	        if not times:
	            return {}

	        return {
	            'count': len(times),
	            'median_seconds': find_median(times),
	            'p75_seconds': find_percentile(times, 75),
	            'p90_seconds': find_percentile(times, 90),
	            'p95_seconds': find_percentile(times, 95),
	        }

	    # ==========================================
	    # RANK TREE (ORDER STATISTICS)
	    # ==========================================

	    def _build_user_rank_tree(self) -> "RankTree":
	        """
	        Build (once) and return the rank tree of users keyed by message count.

	        The tree is cached on the instance, so later database changes are not
	        reflected by subsequent calls.
	        """
	        if self._rank_tree is not None:
	            return self._rank_tree

	        # Populate a local tree first so a failing query cannot leave an
	        # empty tree cached on the instance.
	        tree = RankTree()

	        cursor = self.conn.execute('''
	            SELECT from_id, from_name, COUNT(*) as msg_count
	            FROM messages
	            WHERE from_id IS NOT NULL AND from_id != ''
	            GROUP BY from_id
	        ''')

	        for row in cursor.fetchall():
	            tree.insert(
	                row['msg_count'],
	                {'user_id': row['from_id'], 'name': row['from_name'], 'count': row['msg_count']}
	            )

	        self._rank_tree = tree
	        return tree

	    def get_user_rank(self, user_id: str) -> dict:
	        """
	        Get a user's rank among all users by message count.

	        Returns:
	            ``{'error': 'User not found'}`` when the user has no messages;
	            otherwise the user's message count, rank (1 = most active),
	            total number of ranked users and a percentile figure.

	        NOTE(review): the rank arithmetic assumes ``RankTree.rank`` is a
	        1-based ascending position -- confirm against ``algorithms``.
	        """
	        tree = self._build_user_rank_tree()

	        # Get user's message count
	        cursor = self.conn.execute(
	            'SELECT COUNT(*) FROM messages WHERE from_id = ?',
	            (user_id,)
	        )
	        count = cursor.fetchone()[0]

	        if count == 0:
	            return {'error': 'User not found'}

	        rank = tree.rank(count)
	        total_users = len(tree)

	        return {
	            'user_id': user_id,
	            'message_count': count,
	            'rank': total_users - rank + 1,  # Reverse for "top" ranking
	            'total_users': total_users,
	            'percentile': ((total_users - rank) / total_users) * 100
	        }

	    def get_user_by_rank(self, rank: int) -> Optional[dict]:
	        """
	        Get the user at a specific rank (1 = most active).

	        Returns:
	            The value ``RankTree.select`` yields for that position, or
	            ``None`` when ``rank`` is outside ``1..len(tree)``.
	        """
	        tree = self._build_user_rank_tree()
	        total = len(tree)

	        if rank < 1 or rank > total:
	            return None

	        # Convert to tree rank (reverse order for "top")
	        tree_rank = total - rank + 1
	        return tree.select(tree_rank)

	    # ==========================================
	    # BUCKET SORT (TIME-BASED HISTOGRAMS)
	    # ==========================================

	    def get_activity_histogram(
	        self,
	        bucket_size: int = 86400,  # 1 day default
	        start_time: Optional[int] = None,
	        end_time: Optional[int] = None
	    ) -> list[tuple[str, int]]:
	        """
	        Get activity histogram using ``time_histogram``.

	        Args:
	            bucket_size: Bucket size in seconds (default: 1 day)
	            start_time: Inclusive lower timestamp bound (default: no bound)
	            end_time: Inclusive upper timestamp bound (default: no bound)

	        Returns:
	            List of (date_string, count) tuples; the date string is the
	            bucket timestamp formatted in local time as ``YYYY-MM-DD HH:MM``.
	            Empty list when no message falls into the requested window.
	        """
	        sql = 'SELECT id, date_unixtime FROM messages WHERE date_unixtime IS NOT NULL'
	        params = []
	        # Honour the documented window; bounds are passed as parameters.
	        if start_time is not None:
	            sql += ' AND date_unixtime >= ?'
	            params.append(start_time)
	        if end_time is not None:
	            sql += ' AND date_unixtime <= ?'
	            params.append(end_time)

	        cursor = self.conn.execute(sql, params)
	        records = [{'id': row[0], 'date_unixtime': row[1]} for row in cursor.fetchall()]

	        if not records:
	            return []

	        hist = time_histogram(records, 'date_unixtime', bucket_size)

	        # Format timestamps as dates
	        return [
	            (datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M'), count)
	            for ts, count in hist
	        ]

	    def get_hourly_distribution(self) -> dict[int, int]:
	        """
	        Get message distribution by hour of day via ``hourly_distribution``.
	        """
	        cursor = self.conn.execute(
	            'SELECT id, date_unixtime FROM messages WHERE date_unixtime IS NOT NULL'
	        )
	        records = [{'id': row[0], 'date_unixtime': row[1]} for row in cursor.fetchall()]

	        return hourly_distribution(records, 'date_unixtime')

	    # ==========================================
	    # ORIGINAL METHODS (kept for compatibility)
	    # ==========================================

	    def get_hourly_activity(self) -> dict[int, int]:
	        """Get message count by hour of day (0-23, UTC as computed by SQLite)."""
	        sql = '''
	            SELECT
	                CAST(strftime('%H', datetime(date_unixtime, 'unixepoch')) AS INTEGER) as hour,
	                COUNT(*) as count
	            FROM messages
	            WHERE date_unixtime IS NOT NULL
	            GROUP BY hour
	            ORDER BY hour
	        '''
	        cursor = self.conn.execute(sql)
	        return {row[0]: row[1] for row in cursor.fetchall()}

	    def get_daily_activity(self) -> dict[str, int]:
	        """Get message count by day of week, keyed by English day name."""
	        # Index matches SQLite's ``%w`` (0 = Sunday).
	        days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
	        sql = '''
	            SELECT
	                CAST(strftime('%w', datetime(date_unixtime, 'unixepoch')) AS INTEGER) as day,
	                COUNT(*) as count
	            FROM messages
	            WHERE date_unixtime IS NOT NULL
	            GROUP BY day
	            ORDER BY day
	        '''
	        cursor = self.conn.execute(sql)
	        return {days[row[0]]: row[1] for row in cursor.fetchall()}

	    def get_monthly_activity(self) -> dict[str, int]:
	        """Get message count by month, keyed by ``YYYY-MM``."""
	        sql = '''
	            SELECT
	                strftime('%Y-%m', datetime(date_unixtime, 'unixepoch')) as month,
	                COUNT(*) as count
	            FROM messages
	            WHERE date_unixtime IS NOT NULL
	            GROUP BY month
	            ORDER BY month
	        '''
	        cursor = self.conn.execute(sql)
	        return {row[0]: row[1] for row in cursor.fetchall()}

	    def get_top_domains(self, limit: int = 20) -> list[tuple[str, int]]:
	        """Get most shared domains from links (alias of ``get_top_domains_heap``)."""
	        return self.get_top_domains_heap(limit)

	    def get_top_mentioned(self, limit: int = 20) -> list[tuple[str, int]]:
	        """Get the ``limit`` most mentioned users/channels as (value, count)."""
	        sql = '''
	            SELECT value, COUNT(*) as count
	            FROM entities
	            WHERE type = 'mention'
	            GROUP BY value
	            ORDER BY count DESC
	            LIMIT ?
	        '''
	        cursor = self.conn.execute(sql, (limit,))
	        return [(row[0], row[1]) for row in cursor.fetchall()]

	    def get_forwarded_sources(self, limit: int = 20) -> list[dict]:
	        """Get top sources of forwarded messages, grouped by ``forwarded_from_id``."""
	        sql = '''
	            SELECT
	                forwarded_from,
	                forwarded_from_id,
	                COUNT(*) as count
	            FROM messages
	            WHERE forwarded_from IS NOT NULL
	            GROUP BY forwarded_from_id
	            ORDER BY count DESC
	            LIMIT ?
	        '''
	        cursor = self.conn.execute(sql, (limit,))
	        return [dict(row) for row in cursor.fetchall()]

	    def get_word_frequency(self, limit: int = 50, min_length: int = 3) -> list[tuple[str, int]]:
	        """Get most frequent words (alias of ``get_top_words_heap``)."""
	        return self.get_top_words_heap(limit, min_length)

	    def get_reply_network(self, limit: int = 100) -> list[dict]:
	        """Get the ``limit`` most frequent (replier, replied-to) user pairs."""
	        sql = '''
	            SELECT
	                m1.from_id as replier_id,
	                m1.from_name as replier_name,
	                m2.from_id as replied_to_id,
	                m2.from_name as replied_to_name,
	                COUNT(*) as reply_count
	            FROM messages m1
	            JOIN messages m2 ON m1.reply_to_message_id = m2.id
	            WHERE m1.reply_to_message_id IS NOT NULL
	            GROUP BY m1.from_id, m2.from_id
	            ORDER BY reply_count DESC
	            LIMIT ?
	        '''
	        cursor = self.conn.execute(sql, (limit,))
	        return [dict(row) for row in cursor.fetchall()]

	    def get_user_stats(self, user_id: str) -> dict:
	        """
	        Get detailed statistics for a specific user.

	        Returns:
	            Dict with the user's aggregate message counters, replies
	            received/sent, and ``rank``/``percentile`` (both ``None`` when
	            the user has no messages).
	        """
	        stats = {}

	        cursor = self.conn.execute('''
	            SELECT
	                COUNT(*) as total,
	                SUM(has_links) as links,
	                SUM(has_media) as media,
	                SUM(has_mentions) as mentions,
	                SUM(is_edited) as edited,
	                MIN(date_unixtime) as first_msg,
	                MAX(date_unixtime) as last_msg
	            FROM messages WHERE from_id = ?
	        ''', (user_id,))
	        row = cursor.fetchone()
	        stats.update(dict(row))

	        cursor = self.conn.execute('''
	            SELECT COUNT(*) FROM messages m1
	            JOIN messages m2 ON m1.reply_to_message_id = m2.id
	            WHERE m2.from_id = ?
	        ''', (user_id,))
	        stats['replies_received'] = cursor.fetchone()[0]

	        cursor = self.conn.execute('''
	            SELECT COUNT(*) FROM messages
	            WHERE from_id = ? AND reply_to_message_id IS NOT NULL
	        ''', (user_id,))
	        stats['replies_sent'] = cursor.fetchone()[0]

	        # Rank info comes from the cached rank tree; an unknown user yields
	        # an error dict, for which both lookups below return None.
	        rank_info = self.get_user_rank(user_id)
	        stats['rank'] = rank_info.get('rank')
	        stats['percentile'] = rank_info.get('percentile')

	        return stats


	def print_bar(value: int, max_value: int, width: int = 40) -> str:
	    """
	    Render ``value`` relative to ``max_value`` as a text bar.

	    Args:
	        value: Quantity to draw.
	        max_value: Quantity that corresponds to a completely filled bar.
	        width: Total number of cells in the bar.

	    Returns:
	        Filled cells followed by empty cells, or an empty string when
	        ``max_value`` is 0 (there is nothing to scale against).
	    """
	    if max_value == 0:
	        return ''
	    # Truncate rather than round so a bar is only full at value == max_value.
	    filled = int((value / max_value) * width)
	    empty = width - filled
	    return ''.join(('█' * filled, '░' * empty))


	def main():
	    """
	    Command-line entry point.

	    Parses the arguments, opens the database given by ``--db`` and runs the
	    first requested report, printing it either as text or, with ``--json``,
	    as a JSON document. In JSON mode nothing but the JSON document is
	    printed, so the output can be piped into other tools. Without any
	    report flag the help text is shown.
	    """
	    parser = argparse.ArgumentParser(description='Analyze indexed Telegram messages (Enhanced)')
	    parser.add_argument('--db', default='telegram.db', help='Database path')

	    # Original options
	    parser.add_argument('--stats', action='store_true', help='Show general statistics')
	    parser.add_argument('--top-users', action='store_true', help='Show top users')
	    parser.add_argument('--hourly', action='store_true', help='Show hourly activity')
	    parser.add_argument('--daily', action='store_true', help='Show daily activity')
	    parser.add_argument('--monthly', action='store_true', help='Show monthly activity')
	    parser.add_argument('--domains', action='store_true', help='Show top shared domains')
	    parser.add_argument('--mentions', action='store_true', help='Show top mentions')
	    parser.add_argument('--words', action='store_true', help='Show word frequency')
	    parser.add_argument('--sources', action='store_true', help='Show forwarded message sources')
	    parser.add_argument('--replies', action='store_true', help='Show reply network')
	    parser.add_argument('--user', help='Show stats for specific user ID')

	    # NEW: Enhanced options
	    parser.add_argument('--similar', action='store_true', help='Find similar messages (LCS)')
	    parser.add_argument('--reposts', action='store_true', help='Find potential reposts')
	    parser.add_argument('--percentiles', action='store_true', help='Show message length percentiles')
	    parser.add_argument('--response-times', action='store_true', help='Show response time percentiles')
	    parser.add_argument('--user-rank', help='Get rank of specific user')
	    parser.add_argument('--rank', type=int, help='Get user at specific rank')
	    parser.add_argument('--histogram', action='store_true', help='Show activity histogram')
	    parser.add_argument('--bucket-size', type=int, default=86400, help='Histogram bucket size in seconds')

	    parser.add_argument('--limit', type=int, default=20, help='Limit results')
	    parser.add_argument('--json', action='store_true', help='Output as JSON')
	    parser.add_argument('--threshold', type=float, default=0.7, help='Similarity threshold')

	    args = parser.parse_args()

	    with TelegramAnalyzer(args.db) as analyzer:
	        # === ORIGINAL OPTIONS ===
	        if args.stats:
	            stats = analyzer.get_stats()
	            if args.json:
	                print(json.dumps(stats, indent=2, ensure_ascii=False))
	            else:
	                print("=== General Statistics ===\n")
	                print(f"Total messages: {stats['total_messages']:,}")
	                print(f"Total users: {stats['total_users']:,}")
	                print(f"First message: {stats.get('first_message', 'N/A')}")
	                print(f"Last message: {stats.get('last_message', 'N/A')}")
	                print(f"Days span: {stats.get('days_span', 'N/A')}")
	                print(f"Messages with media: {stats['messages_with_media']:,}")
	                print(f"Messages with links: {stats['messages_with_links']:,}")
	                print(f"Forwarded messages: {stats['forwarded_messages']:,}")
	                print(f"Reply messages: {stats['reply_messages']:,}")
	                if 'median_message_length' in stats:
	                    print(f"\nMedian msg length: {stats['median_message_length']:.0f} chars")
	                    print(f"90th percentile: {stats['p90_message_length']:.0f} chars")
	                print(f"\nEntities: {stats.get('entities', {})}")
	            return

	        if args.top_users:
	            users = analyzer.get_top_users(args.limit)
	            if args.json:
	                print(json.dumps(users, indent=2, ensure_ascii=False))
	            else:
	                print("=== Top Users by Message Count (Heap-based Top-K) ===\n")
	                max_count = users[0]['message_count'] if users else 0
	                for i, user in enumerate(users, 1):
	                    bar = print_bar(user['message_count'], max_count, 30)
	                    # from_name may be NULL in the database (e.g. deleted accounts).
	                    name = user['from_name'] or 'Unknown'
	                    print(f"{i:2}. {name[:20]:20} {bar} {user['message_count']:,}")
	            return

	        if args.hourly:
	            hourly = analyzer.get_hourly_activity()
	            if args.json:
	                print(json.dumps(hourly, indent=2))
	            else:
	                print("=== Hourly Activity ===\n")
	                max_count = max(hourly.values()) if hourly else 0
	                # Show all 24 hours, including those without any message.
	                for hour in range(24):
	                    count = hourly.get(hour, 0)
	                    bar = print_bar(count, max_count, 40)
	                    print(f"{hour:02}:00 {bar} {count:,}")
	            return

	        if args.daily:
	            daily = analyzer.get_daily_activity()
	            if args.json:
	                print(json.dumps(daily, indent=2))
	            else:
	                print("=== Daily Activity ===\n")
	                max_count = max(daily.values()) if daily else 0
	                for day, count in daily.items():
	                    bar = print_bar(count, max_count, 40)
	                    print(f"{day:10} {bar} {count:,}")
	            return

	        if args.monthly:
	            monthly = analyzer.get_monthly_activity()
	            if args.json:
	                print(json.dumps(monthly, indent=2))
	            else:
	                print("=== Monthly Activity ===\n")
	                max_count = max(monthly.values()) if monthly else 0
	                for month, count in monthly.items():
	                    bar = print_bar(count, max_count, 40)
	                    print(f"{month} {bar} {count:,}")
	            return

	        if args.domains:
	            domains = analyzer.get_top_domains(args.limit)
	            if args.json:
	                print(json.dumps(dict(domains), indent=2))
	            else:
	                print("=== Top Shared Domains (Heap-based Top-K) ===\n")
	                max_count = domains[0][1] if domains else 0
	                for domain, count in domains:
	                    bar = print_bar(count, max_count, 30)
	                    print(f"{domain[:30]:30} {bar} {count:,}")
	            return

	        if args.mentions:
	            mentions = analyzer.get_top_mentioned(args.limit)
	            if args.json:
	                print(json.dumps(dict(mentions), indent=2))
	            else:
	                print("=== Top Mentioned Users ===\n")
	                max_count = mentions[0][1] if mentions else 0
	                for mention, count in mentions:
	                    bar = print_bar(count, max_count, 30)
	                    print(f"{mention:20} {bar} {count:,}")
	            return

	        if args.words:
	            words = analyzer.get_word_frequency(args.limit)
	            if args.json:
	                print(json.dumps(dict(words), indent=2, ensure_ascii=False))
	            else:
	                print("=== Top Words (Heap-based Top-K) ===\n")
	                max_count = words[0][1] if words else 0
	                for word, count in words:
	                    bar = print_bar(count, max_count, 30)
	                    print(f"{word:20} {bar} {count:,}")
	            return

	        if args.sources:
	            sources = analyzer.get_forwarded_sources(args.limit)
	            if args.json:
	                print(json.dumps(sources, indent=2, ensure_ascii=False))
	            else:
	                print("=== Top Forwarded Sources ===\n")
	                max_count = sources[0]['count'] if sources else 0
	                for src in sources:
	                    bar = print_bar(src['count'], max_count, 30)
	                    name = src['forwarded_from'] or 'Unknown'
	                    print(f"{name[:30]:30} {bar} {src['count']:,}")
	            return

	        if args.replies:
	            replies = analyzer.get_reply_network(args.limit)
	            if args.json:
	                print(json.dumps(replies, indent=2, ensure_ascii=False))
	            else:
	                print("=== Reply Network ===\n")
	                for r in replies:
	                    print(f"{r['replier_name']} → {r['replied_to_name']}: {r['reply_count']} replies")
	            return

	        if args.user:
	            user_stats = analyzer.get_user_stats(args.user)
	            if args.json:
	                print(json.dumps(user_stats, indent=2))
	            else:
	                print(f"=== Stats for {args.user} ===\n")
	                for key, value in user_stats.items():
	                    print(f"{key}: {value}")
	            return

	        # === NEW ENHANCED OPTIONS ===
	        # Banners are printed only in text mode so that --json output stays
	        # a single valid JSON document.

	        if args.similar:
	            similar = analyzer.find_similar_messages(
	                threshold=args.threshold,
	                limit=args.limit
	            )
	            if args.json:
	                print(json.dumps(similar, indent=2, ensure_ascii=False))
	            else:
	                print(f"=== Similar Messages (LCS, threshold={args.threshold}) ===\n")
	                for id1, id2, sim, text1, text2 in similar:
	                    print(f"Similarity: {sim:.1%}")
	                    print(f" [{id1}] {text1}...")
	                    print(f" [{id2}] {text2}...")
	                    print()
	            return

	        if args.reposts:
	            reposts = analyzer.find_reposts(threshold=args.threshold)
	            if args.json:
	                print(json.dumps(reposts, indent=2, ensure_ascii=False))
	            else:
	                print("=== Potential Reposts (LCS-based) ===\n")
	                for r in reposts[:args.limit]:
	                    print(f"Similarity: {r['similarity']:.1%}")
	                    print(f" User 1: {r['user_1']}")
	                    print(f" User 2: {r['user_2']}")
	                    print(f" Text: {r['text_preview']}...")
	                    print()
	            return

	        if args.percentiles:
	            stats = analyzer.get_message_length_stats()
	            if args.json:
	                print(json.dumps(stats, indent=2))
	            else:
	                print("=== Message Length Percentiles (Selection Algorithm) ===\n")
	                for key, value in stats.items():
	                    print(f"{key:15}: {value:,.0f}")
	            return

	        if args.response_times:
	            stats = analyzer.get_response_time_percentiles()
	            if args.json:
	                print(json.dumps(stats, indent=2))
	            else:
	                print("=== Response Time Percentiles (Selection Algorithm) ===\n")
	                for key, value in stats.items():
	                    if 'seconds' in key:
	                        print(f"{key:15}: {value:,.0f}s ({value/60:.1f}m)")
	                    else:
	                        print(f"{key:15}: {value:,}")
	            return

	        if args.user_rank:
	            rank_info = analyzer.get_user_rank(args.user_rank)
	            if args.json:
	                print(json.dumps(rank_info, indent=2))
	            else:
	                print(f"=== User Rank (Rank Tree O(log n)) ===\n")
	                if 'error' in rank_info:
	                    # Unknown user: the numeric fields are absent, so
	                    # report the error instead of formatting None.
	                    print(f"{rank_info['error']}: {args.user_rank}")
	                else:
	                    print(f"User ID: {rank_info.get('user_id')}")
	                    print(f"Message count: {rank_info.get('message_count'):,}")
	                    print(f"Rank: #{rank_info.get('rank')} of {rank_info.get('total_users')}")
	                    print(f"Percentile: Top {rank_info.get('percentile'):.1f}%")
	            return

	        # Compare against None so that "--rank 0" is reported as an
	        # invalid rank instead of silently falling through to the help.
	        if args.rank is not None:
	            user = analyzer.get_user_by_rank(args.rank)
	            if args.json:
	                print(json.dumps(user, indent=2, ensure_ascii=False))
	            else:
	                print(f"=== User at Rank #{args.rank} (Rank Tree O(log n)) ===\n")
	                if user:
	                    print(f"Name: {user.get('name')}")
	                    print(f"User ID: {user.get('user_id')}")
	                    print(f"Message count: {user.get('count'):,}")
	                else:
	                    print(f"No user at rank {args.rank}")
	            return

	        if args.histogram:
	            hist = analyzer.get_activity_histogram(bucket_size=args.bucket_size)
	            if args.json:
	                print(json.dumps(hist, indent=2))
	            else:
	                print(f"=== Activity Histogram (Bucket Sort, bucket={args.bucket_size}s) ===\n")
	                # Bars are scaled against the global maximum even though
	                # only the last --limit buckets are shown.
	                max_count = max(c for _, c in hist) if hist else 0
	                for date_str, count in hist[-args.limit:]:
	                    bar = print_bar(count, max_count, 40)
	                    print(f"{date_str} {bar} {count:,}")
	            return

	        # Default: show help
	        parser.print_help()


	if __name__ == '__main__':
	    main()