"""
src/nodes/socialAgentNode.py
MODULAR - Social Agent Node with Subgraph Architecture
Monitors trending topics, events, people, and social intelligence across geographic scopes.
Updated: Uses Tool Factory pattern for parallel execution safety.
Each agent instance gets its own private set of tools.
Updated: Now loads user-defined keywords and profiles from intel config.
"""
import json
import uuid
import os
from typing import Dict, Any, List
from datetime import datetime
from src.states.socialAgentState import SocialAgentState
from src.utils.tool_factory import create_tool_set
from src.llms.groqllm import GroqLLM
def load_intel_config() -> dict:
"""Load intel config from JSON file (same as main.py)."""
config_path = os.path.join(
os.path.dirname(__file__), "..", "..", "data", "intel_config.json"
)
default_config = {
"user_profiles": {"twitter": [], "facebook": [], "linkedin": []},
"user_keywords": [],
"user_products": [],
}
try:
if os.path.exists(config_path):
with open(config_path, "r", encoding="utf-8") as f:
return json.load(f)
except Exception:
pass
return default_config
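
# Example layout of data/intel_config.json (illustrative values only; the keys
# mirror default_config above and the fields read in SocialAgentNode.__init__):
# {
#     "user_profiles": {"twitter": ["@example_handle"], "facebook": [], "linkedin": []},
#     "user_keywords": ["tourism", "fuel prices"],
#     "user_products": ["Example Product"]
# }
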
class SocialAgentNode:
"""
Modular Social Agent - Geographic social intelligence collection.
Module 1: Trending Topics (Sri Lanka specific trends)
Module 2: Social Media (Sri Lanka, Asia, World scopes)
Module 3: Feed Generation (Categorize, Summarize, Format)
Module 4: User-Defined Keywords & Profiles (from frontend config)
Thread Safety:
Each SocialAgentNode instance creates its own private ToolSet,
enabling safe parallel execution with other agents.
"""
def __init__(self, llm=None):
"""Initialize with Groq LLM and private tool set"""
# Create PRIVATE tool instances for this agent
# This enables parallel execution without shared state conflicts
self.tools = create_tool_set()
if llm is None:
groq = GroqLLM()
self.llm = groq.get_llm()
else:
self.llm = llm
# Load user-defined intel config (keywords, profiles, products)
self.intel_config = load_intel_config()
self.user_keywords = self.intel_config.get("user_keywords", [])
self.user_profiles = self.intel_config.get("user_profiles", {})
self.user_products = self.intel_config.get("user_products", [])
print(
f"[SocialAgent] Loaded {len(self.user_keywords)} user keywords, "
f"{sum(len(v) for v in self.user_profiles.values())} profiles"
)
# Geographic scopes
self.geographic_scopes = {
"sri_lanka": ["sri lanka", "colombo", "srilanka"],
"asia": [
"india",
"pakistan",
"bangladesh",
"maldives",
"singapore",
"malaysia",
"thailand",
],
"world": ["global", "international", "breaking news", "world events"],
}
# Trending categories
self.trending_categories = [
"events",
"people",
"viral",
"breaking",
"technology",
"culture",
]
# ============================================
# MODULE 1: TRENDING TOPICS COLLECTION
# ============================================
def collect_sri_lanka_trends(self, state: SocialAgentState) -> Dict[str, Any]:
"""
Module 1: Collect Sri Lankan trending topics
"""
print("[MODULE 1] Collecting Sri Lankan Trending Topics")
trending_results = []
# Twitter - Sri Lanka Trends
try:
twitter_tool = self.tools.get("scrape_twitter")
if twitter_tool:
twitter_data = twitter_tool.invoke(
{"query": "sri lanka trending viral", "max_items": 20}
)
trending_results.append(
{
"source_tool": "scrape_twitter",
"raw_content": str(twitter_data),
"category": "trending",
"scope": "sri_lanka",
"platform": "twitter",
"timestamp": datetime.utcnow().isoformat(),
}
)
print(" ✓ Twitter Sri Lanka Trends")
except Exception as e:
print(f" ⚠️ Twitter error: {e}")
# Reddit - Sri Lanka
try:
reddit_tool = self.tools.get("scrape_reddit")
if reddit_tool:
reddit_data = reddit_tool.invoke(
{
"keywords": [
"sri lanka trending",
"sri lanka viral",
"sri lanka news",
],
"limit": 20,
"subreddit": "srilanka",
}
)
trending_results.append(
{
"source_tool": "scrape_reddit",
"raw_content": str(reddit_data),
"category": "trending",
"scope": "sri_lanka",
"platform": "reddit",
"timestamp": datetime.utcnow().isoformat(),
}
)
print(" ✓ Reddit Sri Lanka Trends")
except Exception as e:
print(f" ⚠️ Reddit error: {e}")
return {
"worker_results": trending_results,
"latest_worker_results": trending_results,
}
# ============================================
# MODULE 2: SOCIAL MEDIA COLLECTION
# ============================================
def collect_sri_lanka_social_media(self, state: SocialAgentState) -> Dict[str, Any]:
"""
Module 2A: Collect Sri Lankan social media across all platforms
"""
print("[MODULE 2A] Collecting Sri Lankan Social Media")
social_results = []
# Twitter - Sri Lanka Events & People
try:
twitter_tool = self.tools.get("scrape_twitter")
if twitter_tool:
twitter_data = twitter_tool.invoke(
{"query": "sri lanka events people celebrities", "max_items": 15}
)
social_results.append(
{
"source_tool": "scrape_twitter",
"raw_content": str(twitter_data),
"category": "social",
"scope": "sri_lanka",
"platform": "twitter",
"timestamp": datetime.utcnow().isoformat(),
}
)
print(" ✓ Twitter Sri Lanka Social")
except Exception as e:
print(f" ⚠️ Twitter error: {e}")
# Facebook - Sri Lanka
try:
facebook_tool = self.tools.get("scrape_facebook")
if facebook_tool:
facebook_data = facebook_tool.invoke(
{
"keywords": ["sri lanka events", "sri lanka trending"],
"max_items": 10,
}
)
social_results.append(
{
"source_tool": "scrape_facebook",
"raw_content": str(facebook_data),
"category": "social",
"scope": "sri_lanka",
"platform": "facebook",
"timestamp": datetime.utcnow().isoformat(),
}
)
print(" ✓ Facebook Sri Lanka Social")
except Exception as e:
print(f" ⚠️ Facebook error: {e}")
# LinkedIn - Sri Lanka Professional
try:
linkedin_tool = self.tools.get("scrape_linkedin")
if linkedin_tool:
linkedin_data = linkedin_tool.invoke(
{
"keywords": ["sri lanka events", "sri lanka people"],
"max_items": 5,
}
)
social_results.append(
{
"source_tool": "scrape_linkedin",
"raw_content": str(linkedin_data),
"category": "social",
"scope": "sri_lanka",
"platform": "linkedin",
"timestamp": datetime.utcnow().isoformat(),
}
)
print(" ✓ LinkedIn Sri Lanka Professional")
except Exception as e:
print(f" ⚠️ LinkedIn error: {e}")
# Instagram - Sri Lanka
try:
instagram_tool = self.tools.get("scrape_instagram")
if instagram_tool:
instagram_data = instagram_tool.invoke(
{"keywords": ["srilankaevents", "srilankatrending"], "max_items": 5}
)
social_results.append(
{
"source_tool": "scrape_instagram",
"raw_content": str(instagram_data),
"category": "social",
"scope": "sri_lanka",
"platform": "instagram",
"timestamp": datetime.utcnow().isoformat(),
}
)
print(" ✓ Instagram Sri Lanka")
except Exception as e:
print(f" ⚠️ Instagram error: {e}")
return {
"worker_results": social_results,
"social_media_results": social_results,
}
def collect_asia_social_media(self, state: SocialAgentState) -> Dict[str, Any]:
"""
Module 2B: Collect Asian regional social media
"""
print("[MODULE 2B] Collecting Asian Regional Social Media")
asia_results = []
# Twitter - Asian Events
try:
twitter_tool = self.tools.get("scrape_twitter")
if twitter_tool:
twitter_data = twitter_tool.invoke(
{
"query": "asia trending india pakistan bangladesh",
"max_items": 15,
}
)
asia_results.append(
{
"source_tool": "scrape_twitter",
"raw_content": str(twitter_data),
"category": "social",
"scope": "asia",
"platform": "twitter",
"timestamp": datetime.utcnow().isoformat(),
}
)
print(" ✓ Twitter Asia Trends")
except Exception as e:
print(f" ⚠️ Twitter error: {e}")
# Facebook - Asia
try:
facebook_tool = self.tools.get("scrape_facebook")
if facebook_tool:
facebook_data = facebook_tool.invoke(
{"keywords": ["asia trending", "india events"], "max_items": 10}
)
asia_results.append(
{
"source_tool": "scrape_facebook",
"raw_content": str(facebook_data),
"category": "social",
"scope": "asia",
"platform": "facebook",
"timestamp": datetime.utcnow().isoformat(),
}
)
print(" ✓ Facebook Asia")
except Exception as e:
print(f" ⚠️ Facebook error: {e}")
# Reddit - Asian subreddits
try:
reddit_tool = self.tools.get("scrape_reddit")
if reddit_tool:
reddit_data = reddit_tool.invoke(
{
"keywords": ["asia trending", "india", "pakistan"],
"limit": 10,
"subreddit": "asia",
}
)
asia_results.append(
{
"source_tool": "scrape_reddit",
"raw_content": str(reddit_data),
"category": "social",
"scope": "asia",
"platform": "reddit",
"timestamp": datetime.utcnow().isoformat(),
}
)
print(" ✓ Reddit Asia")
except Exception as e:
print(f" ⚠️ Reddit error: {e}")
return {"worker_results": asia_results, "social_media_results": asia_results}
def collect_world_social_media(self, state: SocialAgentState) -> Dict[str, Any]:
"""
Module 2C: Collect world/global trending topics
"""
print("[MODULE 2C] Collecting World Trending Topics")
world_results = []
# Twitter - World Trends
try:
twitter_tool = self.tools.get("scrape_twitter")
if twitter_tool:
twitter_data = twitter_tool.invoke(
{"query": "world trending global breaking news", "max_items": 15}
)
world_results.append(
{
"source_tool": "scrape_twitter",
"raw_content": str(twitter_data),
"category": "social",
"scope": "world",
"platform": "twitter",
"timestamp": datetime.utcnow().isoformat(),
}
)
print(" ✓ Twitter World Trends")
except Exception as e:
print(f" ⚠️ Twitter error: {e}")
# Reddit - World News
try:
reddit_tool = self.tools.get("scrape_reddit")
if reddit_tool:
reddit_data = reddit_tool.invoke(
{
"keywords": ["breaking", "trending", "viral"],
"limit": 15,
"subreddit": "worldnews",
}
)
world_results.append(
{
"source_tool": "scrape_reddit",
"raw_content": str(reddit_data),
"category": "social",
"scope": "world",
"platform": "reddit",
"timestamp": datetime.utcnow().isoformat(),
}
)
print(" ✓ Reddit World News")
except Exception as e:
print(f" ⚠️ Reddit error: {e}")
return {"worker_results": world_results, "social_media_results": world_results}
def collect_user_defined_targets(self, state: SocialAgentState) -> Dict[str, Any]:
"""
Module 2D: Collect data for USER-DEFINED keywords and profiles.
These are configured via the frontend Intelligence Settings UI.
"""
print("[MODULE 2D] Collecting User-Defined Targets")
user_results = []
# Reload config to get latest user settings
self.intel_config = load_intel_config()
self.user_keywords = self.intel_config.get("user_keywords", [])
self.user_profiles = self.intel_config.get("user_profiles", {})
self.user_products = self.intel_config.get("user_products", [])
# Skip if no user config
if not self.user_keywords and not any(self.user_profiles.values()):
print(" ⏭️ No user-defined targets configured")
return {"worker_results": [], "user_target_results": []}
# ============================================
# Scrape USER KEYWORDS across Twitter
# ============================================
if self.user_keywords:
print(f" 📝 Scraping {len(self.user_keywords)} user keywords...")
twitter_tool = self.tools.get("scrape_twitter")
for keyword in self.user_keywords[:10]: # Limit to 10 keywords
try:
if twitter_tool:
twitter_data = twitter_tool.invoke(
{"query": keyword, "max_items": 5}
)
user_results.append(
{
"source_tool": "scrape_twitter",
"raw_content": str(twitter_data),
"category": "user_keyword",
"scope": "sri_lanka",
"platform": "twitter",
"keyword": keyword,
"timestamp": datetime.utcnow().isoformat(),
}
)
print(f" ✓ Keyword: '{keyword}'")
except Exception as e:
print(f" ⚠️ Keyword '{keyword}' error: {e}")
# ============================================
# Scrape USER PRODUCTS
# ============================================
if self.user_products:
print(f" 📦 Scraping {len(self.user_products)} user products...")
twitter_tool = self.tools.get("scrape_twitter")
for product in self.user_products[:5]: # Limit to 5 products
try:
if twitter_tool:
twitter_data = twitter_tool.invoke(
{
"query": f"{product} review OR {product} Sri Lanka",
"max_items": 3,
}
)
user_results.append(
{
"source_tool": "scrape_twitter",
"raw_content": str(twitter_data),
"category": "user_product",
"scope": "sri_lanka",
"platform": "twitter",
"product": product,
"timestamp": datetime.utcnow().isoformat(),
}
)
print(f" ✓ Product: '{product}'")
except Exception as e:
print(f" ⚠️ Product '{product}' error: {e}")
# ============================================
# Scrape USER TWITTER PROFILES
# ============================================
twitter_profiles = self.user_profiles.get("twitter", [])
if twitter_profiles:
print(f" 👤 Scraping {len(twitter_profiles)} Twitter profiles...")
twitter_tool = self.tools.get("scrape_twitter")
for profile in twitter_profiles[:10]: # Limit to 10 profiles
try:
# Clean profile handle
handle = profile.replace("@", "").strip()
if twitter_tool:
                        # Tweets posted by this profile or mentioning it
twitter_data = twitter_tool.invoke(
{"query": f"from:{handle} OR @{handle}", "max_items": 5}
)
user_results.append(
{
"source_tool": "scrape_twitter",
"raw_content": str(twitter_data),
"category": "user_profile",
"scope": "sri_lanka",
"platform": "twitter",
"profile": f"@{handle}",
"timestamp": datetime.utcnow().isoformat(),
}
)
print(f" ✓ Profile: @{handle}")
except Exception as e:
print(f" ⚠️ Profile @{profile} error: {e}")
print(f" ✅ User targets: {len(user_results)} results collected")
return {"worker_results": user_results, "user_target_results": user_results}
# ============================================
# MODULE 3: FEED GENERATION
# ============================================
def categorize_by_geography(self, state: SocialAgentState) -> Dict[str, Any]:
"""
Module 3A: Categorize all collected results by geographic scope
"""
print("[MODULE 3A] Categorizing Results by Geography")
all_results = state.get("worker_results", []) or []
# Initialize categories
sri_lanka_data = []
asia_data = []
world_data = []
geographic_data = {"sri_lanka": [], "asia": [], "world": []}
for r in all_results:
scope = r.get("scope", "unknown")
content = r.get("raw_content", "")
# Parse content
try:
data = json.loads(content)
if isinstance(data, dict) and "error" in data:
continue
if isinstance(data, str):
data = json.loads(data)
posts = []
if isinstance(data, list):
posts = data
elif isinstance(data, dict):
posts = data.get("results", []) or data.get("data", [])
if not posts:
posts = [data]
# Categorize
if scope == "sri_lanka":
sri_lanka_data.extend(posts[:10])
geographic_data["sri_lanka"].extend(posts[:10])
elif scope == "asia":
asia_data.extend(posts[:10])
geographic_data["asia"].extend(posts[:10])
elif scope == "world":
world_data.extend(posts[:10])
geographic_data["world"].extend(posts[:10])
except Exception:
continue
# Create structured feeds
structured_feeds = {
"sri lanka": sri_lanka_data,
"asia": asia_data,
"world": world_data,
}
print(
f" ✓ Categorized: {len(sri_lanka_data)} Sri Lanka, {len(asia_data)} Asia, {len(world_data)} World"
)
return {
"structured_output": structured_feeds,
"geographic_feeds": geographic_data,
"sri_lanka_feed": sri_lanka_data,
"asia_feed": asia_data,
"world_feed": world_data,
}
def generate_llm_summary(self, state: SocialAgentState) -> Dict[str, Any]:
"""
Module 3B: Use Groq LLM to generate executive summary AND structured insights
"""
print("[MODULE 3B] Generating LLM Summary + Structured Insights")
structured_feeds = state.get("structured_output", {})
llm_summary = "AI summary currently unavailable."
llm_insights = []
try:
# Collect sample posts for analysis
all_posts = []
for region, posts in structured_feeds.items():
for p in posts[:5]: # Top 5 per region
text = p.get("text", "") or p.get("title", "")
if text and len(text) > 20:
all_posts.append(f"[{region.upper()}] {text[:200]}")
if not all_posts:
return {"llm_summary": llm_summary, "llm_insights": []}
posts_text = "\n".join(all_posts[:15])
# Generate summary AND structured insights
analysis_prompt = f"""Analyze these social media posts from Sri Lanka and the region. Generate:
1. A 3-sentence executive summary of key trends
2. Up to 5 unique intelligence insights
Posts:
{posts_text}
Respond in this exact JSON format:
{{
"executive_summary": "Brief 3-sentence summary of key social trends and developments",
"insights": [
{{"summary": "Unique insight #1 (not copying post text)", "severity": "low/medium/high", "impact_type": "risk/opportunity"}},
{{"summary": "Unique insight #2", "severity": "low/medium/high", "impact_type": "risk/opportunity"}}
]
}}
Rules:
- Generate NEW insights, don't just copy post text
- Identify patterns and emerging trends
- Classify severity based on potential impact
- Mark positive developments as "opportunity", concerning ones as "risk"
JSON only, no explanation:"""
llm_response = self.llm.invoke(analysis_prompt)
content = (
llm_response.content
if hasattr(llm_response, "content")
else str(llm_response)
)
# Parse JSON response
import re
content = content.strip()
if content.startswith("```"):
content = re.sub(r"^```\w*\n?", "", content)
content = re.sub(r"\n?```$", "", content)
result = json.loads(content)
llm_summary = result.get("executive_summary", llm_summary)
llm_insights = result.get("insights", [])
print(f" ✓ LLM generated {len(llm_insights)} unique insights")
except json.JSONDecodeError as e:
print(f" ⚠️ JSON parse error: {e}")
# Fallback to simple summary
try:
fallback_prompt = f"Summarize these social media trends in 3 sentences:\n{posts_text[:1500]}"
response = self.llm.invoke(fallback_prompt)
llm_summary = (
response.content if hasattr(response, "content") else str(response)
)
except Exception as fallback_error:
print(f" ⚠️ LLM fallback also failed: {fallback_error}")
except Exception as e:
print(f" ⚠️ LLM Error: {e}")
return {"llm_summary": llm_summary, "llm_insights": llm_insights}
def format_final_output(self, state: SocialAgentState) -> Dict[str, Any]:
"""
Module 3C: Format final feed output with LLM-enhanced insights
"""
print("[MODULE 3C] Formatting Final Output")
llm_summary = state.get("llm_summary", "No summary available")
llm_insights = state.get("llm_insights", []) # NEW: Get LLM-generated insights
structured_feeds = state.get("structured_output", {})
trending_count = len(
[
r
for r in state.get("worker_results", [])
if r.get("category") == "trending"
]
)
social_count = len(
[
r
for r in state.get("worker_results", [])
if r.get("category") == "social"
]
)
sri_lanka_items = len(structured_feeds.get("sri lanka", []))
asia_items = len(structured_feeds.get("asia", []))
world_items = len(structured_feeds.get("world", []))
bulletin = f"""🌏 COMPREHENSIVE SOCIAL INTELLIGENCE FEED
{datetime.utcnow().strftime("%d %b %Y • %H:%M UTC")}
📊 EXECUTIVE SUMMARY (AI-Generated)
{llm_summary}
📈 DATA COLLECTION STATS
• Trending Topics: {trending_count} items
• Social Media Posts: {social_count} items
• Geographic Coverage: Sri Lanka, Asia, World
🔍 GEOGRAPHIC BREAKDOWN
• Sri Lanka: {sri_lanka_items} trending items
• Asia: {asia_items} regional items
• World: {world_items} global items
🌐 COVERAGE CATEGORIES
• Events: Public gatherings, launches, announcements
• People: Influencers, celebrities, public figures
• Viral Content: Trending posts, hashtags, memes
• Breaking: Real-time developments
🎯 INTELLIGENCE FOCUS
Monitoring social sentiment, trending topics, events, and people across:
- Sri Lanka (local intelligence)
- Asia (regional context: India, Pakistan, Bangladesh, ASEAN)
- World (global trends affecting local sentiment)
Source: Multi-platform aggregation (Twitter, Facebook, LinkedIn, Instagram, Reddit)
"""
# Create list for domain_insights (FRONTEND COMPATIBLE)
domain_insights = []
timestamp = datetime.utcnow().isoformat()
# PRIORITY 1: Add LLM-generated unique insights (these are curated and unique)
for insight in llm_insights:
if isinstance(insight, dict) and insight.get("summary"):
domain_insights.append(
{
"source_event_id": str(uuid.uuid4()),
"domain": "social",
"summary": f"🔍 {insight.get('summary', '')}", # Mark as AI-analyzed
"severity": insight.get("severity", "medium"),
"impact_type": insight.get("impact_type", "risk"),
"timestamp": timestamp,
"is_llm_generated": True, # Flag for frontend
}
)
print(f" ✓ Added {len(llm_insights)} LLM-generated insights")
# PRIORITY 2: Add top raw posts only if we need more (fallback)
# Only add raw posts if LLM didn't generate enough insights
if len(domain_insights) < 5:
# Sri Lankan districts for geographic tagging
districts = [
"colombo",
"gampaha",
"kalutara",
"kandy",
"matale",
"nuwara eliya",
"galle",
"matara",
"hambantota",
"jaffna",
"kilinochchi",
"mannar",
"mullaitivu",
"vavuniya",
"puttalam",
"kurunegala",
"anuradhapura",
"polonnaruwa",
"badulla",
"monaragala",
"ratnapura",
"kegalle",
"ampara",
"batticaloa",
"trincomalee",
]
# Add Sri Lanka posts as fallback
sri_lanka_data = structured_feeds.get("sri lanka", [])
for post in sri_lanka_data[:5]:
post_text = post.get("text", "") or post.get("title", "")
if not post_text or len(post_text) < 20:
continue
# Detect district
detected_district = "Sri Lanka"
for district in districts:
if district.lower() in post_text.lower():
detected_district = district.title()
break
# Determine severity
severity = "low"
if any(
kw in post_text.lower()
for kw in ["protest", "riot", "emergency", "violence", "crisis"]
):
severity = "high"
elif any(
kw in post_text.lower()
for kw in ["trending", "viral", "breaking", "update"]
):
severity = "medium"
domain_insights.append(
{
"source_event_id": str(uuid.uuid4()),
"domain": "social",
"summary": f"{detected_district}: {post_text[:200]}",
"severity": severity,
"impact_type": (
"risk" if severity in ["high", "medium"] else "opportunity"
),
"timestamp": timestamp,
"is_llm_generated": False,
}
)
# Add executive summary insight
domain_insights.append(
{
"source_event_id": str(uuid.uuid4()),
"structured_data": structured_feeds,
"domain": "social",
"summary": f"📊 Social Intelligence Summary: {llm_summary[:300]}",
"severity": "medium",
"impact_type": "risk",
"is_llm_generated": True,
}
)
print(f" ✓ Created {len(domain_insights)} total social intelligence insights")
return {
"final_feed": bulletin,
"feed_history": [bulletin],
"domain_insights": domain_insights,
}
# ============================================
# MODULE 4: FEED AGGREGATOR & STORAGE
# ============================================
def aggregate_and_store_feeds(self, state: SocialAgentState) -> Dict[str, Any]:
"""
Module 4: Aggregate, deduplicate, and store feeds
- Check uniqueness using Neo4j (URL + content hash)
- Store unique posts in Neo4j
- Store unique posts in ChromaDB for RAG
- Append to CSV dataset for ML training
"""
print("[MODULE 4] Aggregating and Storing Feeds")
from src.utils.db_manager import (
Neo4jManager,
ChromaDBManager,
extract_post_data,
)
        import csv
# Initialize database managers
neo4j_manager = Neo4jManager()
chroma_manager = ChromaDBManager()
# Get all worker results from state
all_worker_results = state.get("worker_results", [])
# Statistics
total_posts = 0
unique_posts = 0
duplicate_posts = 0
stored_neo4j = 0
stored_chroma = 0
stored_csv = 0
# Setup CSV dataset
dataset_dir = os.getenv("DATASET_PATH", "./datasets/social_feeds")
os.makedirs(dataset_dir, exist_ok=True)
csv_filename = f"social_feeds_{datetime.now().strftime('%Y%m')}.csv"
csv_path = os.path.join(dataset_dir, csv_filename)
# CSV headers
csv_headers = [
"post_id",
"timestamp",
"platform",
"category",
"scope",
"poster",
"post_url",
"title",
"text",
"content_hash",
"engagement_score",
"engagement_likes",
"engagement_shares",
"engagement_comments",
"source_tool",
]
# Check if CSV exists to determine if we need to write headers
file_exists = os.path.exists(csv_path)
try:
# Open CSV file in append mode
with open(csv_path, "a", newline="", encoding="utf-8") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
# Write headers if new file
if not file_exists:
writer.writeheader()
print(f" ✓ Created new CSV dataset: {csv_path}")
else:
print(f" ✓ Appending to existing CSV: {csv_path}")
# Process each worker result
for worker_result in all_worker_results:
category = worker_result.get("category", "unknown")
platform = worker_result.get("platform", "unknown")
source_tool = worker_result.get("source_tool", "")
scope = worker_result.get("scope", "")
# Parse raw content
raw_content = worker_result.get("raw_content", "")
if not raw_content:
continue
try:
# Try to parse JSON content
if isinstance(raw_content, str):
data = json.loads(raw_content)
else:
data = raw_content
# Handle different data structures
posts = []
if isinstance(data, list):
posts = data
elif isinstance(data, dict):
# Check for common result keys
posts = (
data.get("results")
or data.get("data")
or data.get("posts")
or data.get("items")
or []
)
# If still empty, treat the dict itself as a post
if not posts and (data.get("title") or data.get("text")):
posts = [data]
# Process each post
for raw_post in posts:
total_posts += 1
# Skip if error object
if isinstance(raw_post, dict) and "error" in raw_post:
continue
# Extract normalized post data
post_data = extract_post_data(
raw_post=raw_post,
category=category,
platform=platform,
source_tool=source_tool,
)
if not post_data:
continue
# Check uniqueness with Neo4j
is_dup = neo4j_manager.is_duplicate(
post_url=post_data["post_url"],
content_hash=post_data["content_hash"],
)
if is_dup:
duplicate_posts += 1
continue
# Unique post - store it
unique_posts += 1
# Store in Neo4j
if neo4j_manager.store_post(post_data):
stored_neo4j += 1
# Store in ChromaDB
if chroma_manager.add_document(post_data):
stored_chroma += 1
# Store in CSV
try:
csv_row = {
"post_id": post_data["post_id"],
"timestamp": post_data["timestamp"],
"platform": post_data["platform"],
"category": post_data["category"],
"scope": scope,
"poster": post_data["poster"],
"post_url": post_data["post_url"],
"title": post_data["title"],
"text": post_data["text"],
"content_hash": post_data["content_hash"],
"engagement_score": post_data["engagement"].get(
"score", 0
),
"engagement_likes": post_data["engagement"].get(
"likes", 0
),
"engagement_shares": post_data["engagement"].get(
"shares", 0
),
"engagement_comments": post_data["engagement"].get(
"comments", 0
),
"source_tool": post_data["source_tool"],
}
writer.writerow(csv_row)
stored_csv += 1
except Exception as e:
print(f" ⚠️ CSV write error: {e}")
except Exception as e:
print(f" ⚠️ Error processing worker result: {e}")
continue
except Exception as e:
print(f" ⚠️ CSV file error: {e}")
        # Get database totals BEFORE closing connections
        neo4j_total = neo4j_manager.get_post_count() if neo4j_manager.driver else 0
        chroma_total = (
            chroma_manager.get_document_count() if chroma_manager.collection else 0
        )
        # Close database connections now that all reads and writes are done
        neo4j_manager.close()
        # Print statistics
        print("\n 📊 AGGREGATION STATISTICS")
        print(f" Total Posts Processed: {total_posts}")
        print(f" Unique Posts: {unique_posts}")
        print(f" Duplicate Posts: {duplicate_posts}")
        print(f" Stored in Neo4j: {stored_neo4j}")
        print(f" Stored in ChromaDB: {stored_chroma}")
        print(f" Stored in CSV: {stored_csv}")
        print(f" Dataset Path: {csv_path}")
print("\n 💾 DATABASE TOTALS")
print(f" Neo4j Total Posts: {neo4j_total}")
print(f" ChromaDB Total Docs: {chroma_total}")
return {
"aggregator_stats": {
"total_processed": total_posts,
"unique_posts": unique_posts,
"duplicate_posts": duplicate_posts,
"stored_neo4j": stored_neo4j,
"stored_chroma": stored_chroma,
"stored_csv": stored_csv,
"neo4j_total": neo4j_total,
"chroma_total": chroma_total,
},
"dataset_path": csv_path,
}
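

if __name__ == "__main__":
    # Minimal local smoke-test sketch (illustrative only, not the production
    # entry point). Assumptions: SocialAgentState tolerates a plain dict at
    # runtime, scraper tools and database credentials are configured via the
    # environment, and a sequential run outside the LangGraph subgraph is
    # acceptable for a dry run.
    agent = SocialAgentNode()
    state: Dict[str, Any] = {"worker_results": []}
    for step in (
        agent.collect_sri_lanka_trends,
        agent.collect_sri_lanka_social_media,
        agent.collect_user_defined_targets,
    ):
        update = step(state)
        # Mimic an additive merge for the worker_results channel.
        state["worker_results"] = state.get("worker_results", []) + update.get(
            "worker_results", []
        )
    state.update(agent.categorize_by_geography(state))
    state.update(agent.generate_llm_summary(state))
    state.update(agent.format_final_output(state))
    print(state.get("final_feed", ""))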