| | import sys |
| | import os |
| |
|
| | |
| | ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) |
| | sys.path.insert(0, ROOT_DIR) |
| |
|
| | from src.language_detection import detect_language |
| | from src.preprocessing import clean_text |
| | from src.predict import predict |
| | from src.feature_builder import build_features |
| | from src.anchor_similarity import compute_similarity |
| | from src.embeddings import embedder |
| | from src.sarcasm import sarcasm_score |
| | from src.sentiment import sentiment_scores |
| | from src.translation import translate_to_english |
| | from src.context_llm import get_context_probs |
| |
|
| | |
# ISO 639-1 codes of languages the pipeline claims to support.
# NOTE(review): not referenced anywhere in this file — presumably consumed by a
# caller or intended to gate detect_language() output; confirm before removing.
SUPPORTED_LANGS = {"en", "hi", "ta", "ur", "bn", "te", "ml", "gu", "kn", "mr"}

# Output class names. Order is significant: classify() indexes this list with
# the label index returned by predict(), so it must match the model's classes.
LABELS = [
    "Pro-India",
    "Anti-India",
    "Pro-Government",
    "Anti-Government",
    "Neutral"
]
| |
|
def init_anchors():
    """
    Load anchor texts from data/anchors/, encode them, and register the
    resulting embedding matrices with the anchor_similarity module.

    Missing or empty anchor files are skipped with a warning rather than
    raising, so a partial anchor set still initializes.
    """
    print("[INIT] Loading anchor embeddings...")
    anchor_dir = os.path.join(ROOT_DIR, "data", "anchors")

    stance_keys = (
        "pro_india",
        "anti_india",
        "pro_government",
        "anti_government",
        "neutral",
    )
    anchors = {}

    for key in stance_keys:
        path = os.path.join(anchor_dir, f"{key}.txt")
        if not os.path.exists(path):
            print(f"[WARNING] Anchor file missing: {path}")
            continue

        with open(path, "r", encoding="utf-8") as f:
            examples = [stripped for stripped in (line.strip() for line in f) if stripped]

        if not examples:
            print(f"[WARNING] Anchor file empty: {key}")
            continue

        # NOTE(review): anchors are encoded WITHOUT normalize_embeddings=True,
        # while classify() normalizes the query embedding — confirm that
        # compute_similarity() handles (or doesn't need) normalization.
        anchors[key] = embedder.encode(examples)
        print(f" - Loaded {key}: {len(examples)} examples")

    # Imported here (module is already partially imported at top of file) to
    # inject the loaded matrices into the similarity module's state.
    from src.anchor_similarity import load_anchor_embeddings
    load_anchor_embeddings(anchors)
    print("[INIT] Anchor embeddings initialized.\n")
| |
|
def classify(text: str):
    """
    Run the full stance-classification pipeline on one Reddit post.

    Steps: clean -> language-detect -> translate (non-English) -> embed ->
    anchor similarity -> sentiment/sarcasm -> LLM context probs -> feature
    vector -> model prediction.

    Returns a result dict with label, confidence, language, sarcasm and
    sentiment scores, or {"error": ...} for effectively-empty input.
    """
    text = clean_text(text)

    # Guard clause: nothing meaningful left after cleaning.
    if not text.strip():
        return {"error": "Empty input text"}

    lang, prob = detect_language(text)
    print(f"[DEBUG] Detected language: {lang}, confidence: {round(prob, 3)}")

    # All downstream models operate on English text, so translate if needed.
    english_text = text
    if lang != 'en':
        print(f"[INFO] Translating {lang} to en...")
        translated = translate_to_english(text, source=lang)
        print(f" -> {translated}")
        english_text = translated

    text_embedding = embedder.encode(english_text, normalize_embeddings=True)

    # anchor_embeddings=None: compute_similarity falls back to the module-level
    # anchors registered by init_anchors().
    similarity_scores = compute_similarity(
        text_embedding=text_embedding,
        anchor_embeddings=None
    )

    sentiment = sentiment_scores(english_text)
    sarcasm = sarcasm_score(english_text)
    context_probs = get_context_probs(english_text)

    features = build_features(
        similarity=similarity_scores,
        sentiment=sentiment,
        sarcasm=sarcasm,
        context_probs=context_probs
    )

    label_idx, confidence = predict(features)

    # sentiment is assumed to be a 3-vector ordered (neg, neu, pos) —
    # matches the indexing below; confirm against src.sentiment.
    return {
        "text": text,
        "label": LABELS[label_idx],
        "confidence": round(confidence, 3),
        "language": lang,
        "sarcasm_score": round(sarcasm, 3),
        "sentiment": {
            "negative": round(sentiment[0], 3),
            "neutral": round(sentiment[1], 3),
            "positive": round(sentiment[2], 3),
        }
    }
| | |
| | |
if __name__ == "__main__":
    init_anchors()

    # Batch mode: classify every non-empty line of test.txt if it exists.
    # NOTE(review): path is resolved against the CWD, not ROOT_DIR — confirm
    # this is intended (init_anchors anchors its paths to ROOT_DIR).
    if os.path.exists("test.txt"):
        print("Processing test.txt...")
        # Fix: explicit UTF-8, consistent with the anchor-file reads above;
        # without it, platform default codecs can raise UnicodeDecodeError
        # on non-ASCII posts.
        with open("test.txt", "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    result = classify(line)
                    print(result)
                    print("-" * 50)

    # Interactive mode.
    print("\n🔍 Reddit Political Stance Classifier")
    print("Type 'exit' to quit\n")

    while True:
        try:
            text = input("Enter Reddit post: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Fix: exit cleanly on Ctrl-D / closed stdin / Ctrl-C instead of
            # crashing with a traceback.
            break

        if text.lower() == "exit":
            break

        result = classify(text)
        print("\nResult:")
        print(result)
        print("-" * 50)
|