Agrannya committed on
Commit
7a41f2a
·
verified ·
1 Parent(s): 960d86e

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. deploy.py +739 -96
deploy.py CHANGED
@@ -1,43 +1,687 @@
1
 
2
- #file is saved as deploy.py
3
 
4
- import io # Import io for handling image bytes
5
  import gradio as gr
 
 
 
 
 
 
 
 
 
6
 
7
- # Global variables to maintain state across Gradio calls
8
- # Assuming cl and explore_reels_list are already defined and populated by login/fetch steps
9
- # from previous cells in a real execution environment.
10
- # For this cell's context, we ensure they are declared global.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  global cl
12
  global explore_reels_list
13
  global sentiment_analyzer_instance
14
  global content_classifier_pipeline
15
 
16
- # Initialize sentiment analyzer if not already done (can be done here or lazily in analyze_reels_gradio)
17
- # Doing it here ensures the model is loaded when this cell runs, potentially reducing latency on first analyze click.
18
- try:
19
- sentiment_analyzer_instance = ReelSentimentAnalyzer()
20
- print("Sentiment Analyzer initialized.")
21
- # Optional: Train Hindi model if needed and data is available
22
- # sample_train_data = [...] # Define your training data
23
- # sentiment_analyzer_instance.train_hindi_model(sample_train_data)
24
- except Exception as e:
25
- print(f"Error initializing Sentiment Analyzer globally: {e}")
26
- sentiment_analyzer_instance = None
27
-
28
-
29
- # Initialize content classifier pipeline if not already done (can be done here or lazily)
30
- try:
31
- print("Initializing Content Classifier Pipeline globally...")
32
- content_classifier_pipeline = pipeline(
33
- "zero-shot-classification",
34
- model="facebook/bart-large-mnli",
35
- device=0 if torch.cuda.is_available() else -1 # Use GPU if available
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  )
37
- print("Content Classifier Pipeline Initialized.")
38
- except Exception as e:
39
- print(f"Error initializing Content Classifier globally: {e}")
40
- content_classifier_pipeline = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
 
43
  def analyze_reels_gradio(max_to_analyze):
@@ -58,11 +702,28 @@ def analyze_reels_gradio(max_to_analyze):
58
  return "Error: No reels available to analyze.", None, None
59
 
60
 
61
- # Check if analyzers are initialized
62
  if sentiment_analyzer_instance is None:
63
- return "Error: Sentiment Analyzer not initialized.", None, None
 
 
 
 
 
 
 
 
64
  if content_classifier_pipeline is None:
65
- return "Error: Content Classifier not initialized.", None, None
 
 
 
 
 
 
 
 
 
66
 
67
 
68
  analysis_status_messages = []
@@ -115,28 +776,34 @@ def analyze_reels_gradio(max_to_analyze):
115
  return final_status_message, sentiment_plot_figure, content_plot_figure
116
 
117
 
118
- # Re-define plot functions to return bytes (if not already done in a previous cell)
119
- # Assuming they were defined in the previous subtask's code block.
120
- # If not, they would need to be included here.
121
-
122
  # --- Gradio Blocks Interface ---
123
  with gr.Blocks() as demo:
124
  gr.Markdown("# Instagram Reels Analysis")
 
 
125
  with gr.Row():
126
- username_input = gr.Textbox(label="Instagram Username")
127
- login_button = gr.Button("Login")
128
  login_status_output = gr.Label(label="Login Status")
129
 
 
 
 
 
 
 
 
130
  with gr.Row():
131
  fetch_button = gr.Button("Fetch Reels")
132
  fetch_status_output = gr.Label(label="Fetch Status")
133
 
 
134
  with gr.Row():
135
  max_reels_input = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of Reels to Analyze")
136
  analyze_button = gr.Button("Analyze Reels")
137
 
138
  analyze_status_output = gr.Label(label="Analysis Status")
139
 
 
140
  with gr.Row():
141
  # Sentiment Analysis Outputs
142
  with gr.Column():
@@ -149,74 +816,50 @@ with gr.Blocks() as demo:
149
  content_plot_output = gr.Plot(label="Content Distribution")
150
 
151
 
152
- # Link login and fetch buttons (assuming login_gradio and fetch_reels_gradio are defined)
153
- # Redefine login_gradio and fetch_reels_gradio here within the Blocks context
154
- # to ensure they are linked correctly, even though they were defined above.
155
- # This is a common pattern in Gradio Blocks.
156
-
157
- def login_gradio_blocks(username):
158
- """Gradio-compatible login function for Blocks."""
159
- global cl
160
- try:
161
- PASSWORD = userdata.get('password')
162
- except Exception as e:
163
- return f"Error accessing password secret: {e}"
164
-
165
-
166
- if not PASSWORD:
167
- return "Error: Instagram password not found in Colab secrets."
168
-
169
- cl = Client()
170
-
171
- try:
172
- cl.login(username, PASSWORD)
173
- return f"Successfully logged in as {username}"
174
- except Exception as e:
175
- cl = None # Ensure cl is None on failure
176
- return f"Error during login: {e}"
177
-
178
- def fetch_reels_gradio_blocks():
179
- """Gradio-compatible function to fetch explore reels for Blocks."""
180
- global cl
181
- global explore_reels_list
182
-
183
- if cl is None:
184
- explore_reels_list = [] # Ensure list is empty on failure
185
- return "Error: Not logged in. Please log in first."
186
-
187
- try:
188
- # Fetch a limited number of reels for demonstration purposes
189
- # You might want to make this number configurable later
190
- fetched_reels = cl.explore_reels()[:100] # Fetch up to 100 for analysis
191
- explore_reels_list = fetched_reels
192
- if explore_reels_list:
193
- return f"Successfully fetched {len(explore_reels_list)} explore reels."
194
- else:
195
- explore_reels_list = [] # Ensure it's an empty list
196
- return "Fetched 0 explore reels."
197
- except Exception as e:
198
- explore_reels_list = [] # Ensure it's an empty list on error
199
- return f"Error fetching explore reels: {e}"
200
-
201
 
202
- login_button.click(
203
- fn=login_gradio_blocks,
204
- inputs=username_input,
205
- outputs=login_status_output
206
  )
207
 
208
  fetch_button.click(
209
- fn=fetch_reels_gradio_blocks,
210
  inputs=None, # No direct inputs needed for fetching
211
  outputs=fetch_status_output
212
  )
213
 
214
- # Link the Analyze button to the analysis function
215
  analyze_button.click(
216
  fn=analyze_reels_gradio,
217
  inputs=max_reels_input, # Input is the slider value
218
  outputs=[analyze_status_output, sentiment_plot_output, content_plot_output] # Outputs are status and the two plots
219
  )
220
 
221
- # The demo is now fully defined. It can be launched in the next step.
222
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
+ # This deploy.py file contains the complete code for the Instagram Reels Analysis Gradio App.
3
 
4
+ # --- Imports ---
5
  import gradio as gr
6
+ import time
7
+ import random
8
+ import matplotlib.pyplot as plt
9
+ import pandas as pd
10
+ import torch
11
+ import emoji
12
+ import re
13
+ import numpy as np
14
+ import io # Import io for handling image bytes
15
 
16
+ from google.colab import userdata
17
+ from instagrapi import Client
18
+ from transformers import (
19
+ pipeline,
20
+ AutoTokenizer,
21
+ AutoModelForSequenceClassification,
22
+ Trainer,
23
+ TrainingArguments,
24
+ RobertaForSequenceClassification,
25
+ AlbertForSequenceClassification
26
+ )
27
+ from datasets import Dataset, Features, Value
28
+ from collections import Counter
29
+ from sklearn.metrics import accuracy_score, f1_score
30
+
31
+ # --- Configuration ---
32
+ CONFIG = {
33
+ "max_length": 128,
34
+ "batch_size": 16,
35
+ "learning_rate": 2e-5,
36
+ "num_train_epochs": 3,
37
+ "few_shot_examples": 5, # per class
38
+ "confidence_threshold": 0.7,
39
+ "neutral_reanalysis_threshold": 0.33
40
+ }
41
+
42
+ # --- Global Variables for State Management ---
43
  global cl
44
  global explore_reels_list
45
  global sentiment_analyzer_instance
46
  global content_classifier_pipeline
47
 
48
+ cl = None
49
+ explore_reels_list = []
50
+ sentiment_analyzer_instance = None
51
+ content_classifier_pipeline = None
52
+
53
+
54
+ # --- Sentiment Analysis Class ---
55
+ class ReelSentimentAnalyzer:
56
+ def __init__(self):
57
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
58
+ self._initialize_models()
59
+
60
+ def _initialize_models(self):
61
+ """Initialize and configure all models"""
62
+ print("\nInitializing Sentiment Analysis Models...")
63
+ # English models
64
+ print("Loading English Emotion Model...")
65
+ self.emotion_tokenizer = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-emotion-analysis")
66
+ self.emotion_model = AutoModelForSequenceClassification.from_pretrained(
67
+ "finiteautomata/bertweet-base-emotion-analysis"
68
+ ).to(self.device)
69
+ print("Loading English Sentiment Model...")
70
+ self.sentiment_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
71
+ self.sentiment_model = RobertaForSequenceClassification.from_pretrained(
72
+ "cardiffnlp/twitter-roberta-base-sentiment-latest",
73
+ ignore_mismatched_sizes=True
74
+ ).to(self.device)
75
+
76
+ # Hindi/English model (we'll fine-tune this)
77
+ print("Loading Indic-BERT Model for Hindi/Hinglish...")
78
+ self.hindi_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
79
+ self.hindi_model = AlbertForSequenceClassification.from_pretrained(
80
+ "ai4bharat/indic-bert",
81
+ num_labels=3,
82
+ id2label={0: "negative", 1: "neutral", 2: "positive"},
83
+ label2id={"negative": 0, "neutral": 1, "positive": 2}
84
+ ).to(self.device)
85
+ # Store label2id mapping for easy access
86
+ self.hindi_label2id = self.hindi_model.config.label2id
87
+ print("Models Initialized.")
88
+
89
+ # Emotion to sentiment mapping
90
+ self.emotion_map = {
91
+ "joy": "positive", "love": "positive", "happy": "positive",
92
+ "anger": "negative", "sadness": "negative", "fear": "negative",
93
+ "surprise": "neutral", "neutral": "neutral", "disgust": "negative", "shame": "negative"
94
+ }
95
+
96
+ # Neutral keywords
97
+ self.neutral_keywords = {
98
+ "ad", "sponsored", "promo", "sale", "discount", "offer", "giveaway",
99
+ "buy", "shop", "link in bio",
100
+ "विज्ञापन", "प्रचार", "ऑफर", "डिस्काउंट", "बिक्री", "लिंक ब���यो में"
101
+ }
102
+
103
+ def train_hindi_model(self, train_data, eval_data=None):
104
+ """
105
+ Fine-tune the Hindi/English model on labeled data
106
+ Args:
107
+ train_data: List of dicts [{"text": "...", "label": "positive/negative/neutral"}]
108
+ eval_data: Optional evaluation data
109
+ """
110
+ print("\nStarting Hindi model training...")
111
+ # Convert to dataset
112
+ train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
113
+
114
+ # Map string labels to integer IDs
115
+ def map_labels_to_ids(examples):
116
+ # Ensure label exists and is in expected range
117
+ labels = []
118
+ for label_str in examples["label"]:
119
+ if label_str in self.hindi_label2id:
120
+ labels.append(self.hindi_label2id[label_str])
121
+ else:
122
+ # Handle unexpected labels, maybe map to neutral or skip
123
+ print(f"Warning: Unexpected label '{label_str}'. Mapping to neutral.")
124
+ labels.append(self.hindi_label2id["neutral"]) # Map unknown to neutral
125
+ examples["label"] = labels
126
+ return examples
127
+
128
+
129
+ train_dataset = train_dataset.map(map_labels_to_ids, batched=True)
130
+
131
+ # Explicitly set the label column to integer type
132
+ train_dataset = train_dataset.cast_column("label", Value("int64"))
133
+
134
+
135
+ def tokenize_function(examples):
136
+ return self.hindi_tokenizer(
137
+ examples["text"],
138
+ padding="max_length",
139
+ truncation=True,
140
+ max_length=CONFIG["max_length"]
141
+ )
142
+
143
+ tokenized_train = train_dataset.map(tokenize_function, batched=True)
144
+
145
+ # Training arguments - using eval_strategy instead of evaluation_strategy
146
+ training_args = TrainingArguments(
147
+ output_dir="./results",
148
+ eval_strategy="epoch" if eval_data else "no",
149
+ per_device_train_batch_size=CONFIG["batch_size"],
150
+ per_device_eval_batch_size=CONFIG["batch_size"],
151
+ learning_rate=CONFIG["learning_rate"],
152
+ num_train_epochs=CONFIG["num_train_epochs"],
153
+ weight_decay=0.01,
154
+ save_strategy="no", # Don't save checkpoints during training
155
+ logging_dir='./logs',
156
+ logging_steps=10,
157
+ report_to="none" # Don't report to external services
158
+ )
159
+
160
+ # Compute metrics function
161
+ def compute_metrics(p):
162
+ predictions, labels = p
163
+ predictions = np.argmax(predictions, axis=1)
164
+ return {
165
+ "accuracy": accuracy_score(labels, predictions),
166
+ "f1": f1_score(labels, predictions, average="weighted")
167
+ }
168
+
169
+ # Trainer
170
+ eval_dataset_processed = None
171
+ if eval_data:
172
+ eval_dataset = Dataset.from_pandas(pd.DataFrame(eval_data))
173
+ eval_dataset = eval_dataset.map(map_labels_to_ids, batched=True)
174
+ eval_dataset_processed = eval_dataset.cast_column("label", Value("int64")).map(tokenize_function, batched=True)
175
+
176
+
177
+ trainer = Trainer(
178
+ model=self.hindi_model,
179
+ args=training_args,
180
+ train_dataset=tokenized_train,
181
+ eval_dataset=eval_dataset_processed,
182
+ compute_metrics=compute_metrics if eval_data else None,
183
+ )
184
+
185
+ # Train
186
+ trainer.train()
187
+
188
+ # Save the fine-tuned model
189
+ print("Saving fine-tuned Hindi model...")
190
+ self.hindi_model.save_pretrained("./fine_tuned_hindi_sentiment")
191
+ self.hindi_tokenizer.save_pretrained("./fine_tuned_hindi_sentiment")
192
+ print("Hindi model training complete.")
193
+
194
+ def preprocess_text(self, text):
195
+ """Enhanced text cleaning with multilingual support"""
196
+ if not text:
197
+ return ""
198
+
199
+ # Convert emojis to text
200
+ text = emoji.demojize(text, delimiters=(" ", " "))
201
+
202
+ # Remove URLs and mentions
203
+ text = re.sub(r"http\S+|@\w+", "", text)
204
+
205
+ # Expand common abbreviations (can be extended)
206
+ abbrevs = {
207
+ r"\bomg\b": "oh my god",
208
+ r"\btbh\b": "to be honest",
209
+ r"\bky\b": "kyun", # Hindi 'why'
210
+ r"\bkb\b": "kab", # Hindi 'when'
211
+ r"\bkya\b": "kya", # Hindi 'what'
212
+ r"\bkahan\b": "kahan", # Hindi 'where'
213
+ r"\bkaisa\b": "kaisa" # Hindi 'how'
214
+ }
215
+ for pattern, replacement in abbrevs.items():
216
+ text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
217
+
218
+ # Remove extra whitespace
219
+ text = re.sub(r"\s+", " ", text).strip()
220
+
221
+ return text
222
+
223
+ def detect_language(self, text):
224
+ """Improved language detection"""
225
+ if re.search(r"[\u0900-\u097F]", text): # Devanagari script (Hindi, Marathi etc.)
226
+ return "hi"
227
+ # Simple check for common Hindi/Hinglish words (can be expanded)
228
+ hinglish_keywords = ["hai", "kyun", "nahi", "kya", "acha", "bas", "yaar", "main"]
229
+ if any(re.search(rf"\b{kw}\b", text.lower()) for kw in hinglish_keywords):
230
+ return "hi-latin"
231
+ # Fallback to English if no strong Hindi/Hinglish indicators
232
+ return "en"
233
+
234
+
235
+ def analyze_content(self, text):
236
+ """Main analysis function with improved confidence handling"""
237
+ processed = self.preprocess_text(text)
238
+
239
+ if not processed:
240
+ return "neutral", 0.5, {"reason": "empty_text"}
241
+
242
+ lang = self.detect_language(processed)
243
+
244
+ # Check for neutral keywords first with higher confidence
245
+ if any(re.search(rf"\b{re.escape(kw)}\b", processed.lower()) for kw in self.neutral_keywords):
246
+ return "neutral", 0.9, {"reason": "neutral_keyword"}
247
+
248
+ try:
249
+ if lang in ("hi", "hi-latin"):
250
+ # Use Hindi model for Hindi/Hinglish
251
+ return self._analyze_hindi_content(processed)
252
+ else:
253
+ # Use ensemble for English
254
+ return self._analyze_english_content(processed)
255
+ except Exception as e:
256
+ print(f"Analysis error for text '{processed[:50]}...': {e}")
257
+ return "neutral", 0.5, {"error": str(e), "original_text": text[:50]}
258
+
259
+ def _analyze_hindi_content(self, text):
260
+ """Analyze Hindi content with fine-tuned model"""
261
+ inputs = self.hindi_tokenizer(
262
+ text,
263
+ return_tensors="pt",
264
+ truncation=True,
265
+ padding=True,
266
+ max_length=CONFIG["max_length"]
267
+ ).to(self.device)
268
+
269
+ with torch.no_grad():
270
+ outputs = self.hindi_model(**inputs)
271
+
272
+ probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
273
+ pred_idx = torch.argmax(probs).item()
274
+ confidence = probs[0][pred_idx].item()
275
+
276
+ label = self.hindi_model.config.id2label[pred_idx]
277
+ return label, confidence, {"model": "fine-tuned-indic-bert", "lang": "hi"}
278
+
279
+ def _analyze_english_content(self, text):
280
+ """Analyze English content with ensemble approach"""
281
+ # Emotion analysis
282
+ emotion_inputs = self.emotion_tokenizer(
283
+ text,
284
+ return_tensors="pt",
285
+ truncation=True,
286
+ max_length=CONFIG["max_length"]
287
+ ).to(self.device)
288
+
289
+ with torch.no_grad():
290
+ emotion_outputs = self.emotion_model(**emotion_inputs)
291
+
292
+ emotion_probs = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)
293
+ emotion_pred = torch.argmax(emotion_probs).item()
294
+ emotion_label = self.emotion_model.config.id2label[emotion_pred]
295
+ emotion_score = emotion_probs[0][emotion_pred].item()
296
+
297
+ # Sentiment analysis
298
+ sentiment_inputs = self.sentiment_tokenizer(
299
+ text,
300
+ return_tensors="pt",
301
+ truncation=True,
302
+ max_length=CONFIG["max_length"]
303
+ ).to(self.device)
304
+
305
+ with torch.no_grad():
306
+ sentiment_outputs = self.sentiment_model(**sentiment_inputs)
307
+
308
+ sentiment_probs = torch.nn.functional.softmax(sentiment_outputs.logits, dim=-1)
309
+ sentiment_pred = torch.argmax(sentiment_probs).item()
310
+ # sentiment_label comes as 'LABEL_0', 'LABEL_1', 'LABEL_2'
311
+ # Need to map these to 'negative', 'neutral', 'positive'
312
+ # The roberta-base-sentiment-latest model has mapping: 0: Negative, 1: Neutral, 2: Positive
313
+ sentiment_label_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}
314
+ sentiment_label = sentiment_label_mapping.get(sentiment_pred, 'neutral') # Default to neutral if mapping fails
315
+ sentiment_score = sentiment_probs[0][sentiment_pred].item()
316
+
317
+ # Combine results
318
+ mapped_emotion = self.emotion_map.get(emotion_label, "neutral")
319
+
320
+ # Prioritize high-confidence sentiment
321
+ if sentiment_score > CONFIG["confidence_threshold"]:
322
+ final_label = sentiment_label
323
+ final_confidence = sentiment_score
324
+ reason = "high_sentiment_confidence"
325
+ # Then prioritize high-confidence emotion if not neutral
326
+ elif emotion_score > CONFIG["confidence_threshold"] and mapped_emotion != "neutral":
327
+ final_label = mapped_emotion
328
+ final_confidence = emotion_score
329
+ reason = "high_emotion_confidence"
330
+ else:
331
+ # Fallback mechanism for lower confidence or conflicting results
332
+ # A simple weighted sum or voting could be used,
333
+ # but let's use a clearer logic:
334
+ # If both are low confidence or neutral, and their results align, use that.
335
+ # Otherwise, default to neutral or pick the one with slightly higher confidence
336
+ # if it's not neutral.
337
+
338
+ if sentiment_label == mapped_emotion and sentiment_label != "neutral":
339
+ final_label = sentiment_label
340
+ final_confidence = (sentiment_score + emotion_score) / 2
341
+ reason = "emotion_sentiment_agreement"
342
+ elif sentiment_label != "neutral" and sentiment_score > emotion_score and sentiment_score > 0.4: # Use sentiment if somewhat confident
343
+ final_label = sentiment_label
344
+ final_confidence = sentiment_score * 0.9 # Slightly reduce confidence
345
+ reason = "sentiment_slightly_higher"
346
+ elif mapped_emotion != "neutral" and emotion_score > sentiment_score and emotion_score > 0.4: # Use emotion if somewhat confident
347
+ final_label = mapped_emotion
348
+ final_confidence = emotion_score * 0.9 # Slightly reduce confidence
349
+ reason = "emotion_slightly_higher"
350
+ else: # Default to neutral if no strong signal
351
+ final_label = "neutral"
352
+ final_confidence = 0.6 # Assign a baseline neutral confidence
353
+ reason = "fallback_to_neutral"
354
+
355
+
356
+ return final_label, final_confidence, {
357
+ "emotion_label": emotion_label,
358
+ "emotion_score": emotion_score,
359
+ "sentiment_label": sentiment_label,
360
+ "sentiment_score": sentiment_score,
361
+ "mapped_emotion": mapped_emotion,
362
+ "model": "ensemble",
363
+ "lang": "en",
364
+ "reason": reason
365
+ }
366
+
367
+ def analyze_reels(self, reels, max_to_analyze=100):
368
+ """Batch analysis with improved neutral handling"""
369
+ print(f"\n--- Starting Sentiment Analysis ({max_to_analyze} reels) ---")
370
+ results = Counter()
371
+ detailed_results = []
372
+
373
+ for i, reel in enumerate(reels[:max_to_analyze], 1):
374
+ caption = getattr(reel, 'caption_text', '') or getattr(reel, 'caption', '') or ''
375
+ print(f"Analyzing sentiment for reel {i}/{max_to_analyze} (ID: {reel.id})...")
376
+ label, confidence, details = self.analyze_content(caption)
377
+ results[label] += 1
378
+ detailed_results.append({
379
+ "reel_id": reel.id, # Add reel ID
380
+ "text": caption,
381
+ "label": label,
382
+ "confidence": confidence,
383
+ "details": details
384
+ })
385
+
386
+ print("\nInitial Sentiment Distribution:", dict(results))
387
+
388
+ # Post-analysis neutral reduction if a significant portion is neutral
389
+ total_analyzed = sum(results.values())
390
+ if total_analyzed > 0 and results["neutral"] / total_analyzed > CONFIG["neutral_reanalysis_threshold"]:
391
+ print(f"High neutral count ({results['neutral']}). Attempting to re-analyze...")
392
+ self._reduce_neutrals(results, detailed_results)
393
+ print("Sentiment distribution after re-analysis:", dict(results))
394
+
395
+ print("Sentiment Analysis Complete.")
396
+ return results, detailed_results
397
+
398
+ def _reduce_neutrals(self, results, detailed_results):
399
+ """Apply additional techniques to reduce neutral classifications"""
400
+ neutrals_to_recheck = [item for item in detailed_results if item["label"] == "neutral" and item["confidence"] < 0.8]
401
+
402
+ print(f"Re-checking {len(neutrals_to_recheck)} neutral reels...")
403
+
404
+ for item in neutrals_to_recheck:
405
+ original_text = item["text"]
406
+ processed_text = self.preprocess_text(original_text)
407
+ text_lower = processed_text.lower()
408
+
409
+ # Try keyword analysis for strong positive/negative signals
410
+ pos_keywords_strong = {"amazing", "love", "best", "fantastic", "awesome", "superb", "great",
411
+ "अद्भुत", "शानदार", "बहुत अच्छा", "मज़ेदार"}
412
+ neg_keywords_strong = {"hate", "worst", "bad", "terrible", "awful", "disappointed", "horrible", "cringe",
413
+ "खराब", "बेकार", "बहुत बुरा", "घटिया"}
414
+
415
+ is_strong_pos = any(re.search(rf"\b{re.escape(kw)}\b", text_lower) for kw in pos_keywords_strong)
416
+ is_strong_neg = any(re.search(rf"\b{re.escape(kw)}\b", text_lower) for kw in neg_keywords_strong)
417
+
418
+ if is_strong_pos and not is_strong_neg:
419
+ # Reclassify as positive if strong positive keywords found and no strong negative ones
420
+ results["neutral"] -= 1
421
+ results["positive"] += 1
422
+ item.update({
423
+ "label": "positive",
424
+ "confidence": min(0.95, item["confidence"] + 0.3), # Increase confidence
425
+ "reanalyzed": True,
426
+ "reanalysis_reason": "strong_pos_keywords"
427
+ })
428
+ # print(f" Reclassified reel {item['reel_id']} to Positive (Keywords)")
429
+ elif is_strong_neg and not is_strong_pos:
430
+ # Reclassify as negative if strong negative keywords found and no strong positive ones
431
+ results["neutral"] -= 1
432
+ results["negative"] += 1
433
+ item.update({
434
+ "label": "negative",
435
+ "confidence": min(0.95, item["confidence"] + 0.3), # Increase confidence
436
+ "reanalyzed": True,
437
+ "reanalysis_reason": "strong_neg_keywords"
438
+ })
439
+ # print(f" Reclassified reel {item['reel_id']} to Negative (Keywords)")
440
+ # Add other potential re-analysis rules here if needed
441
+ # e.g., checking for question marks (might indicate neutral query),
442
+ # or checking length (very short captions often neutral)
443
+ # For now, we stick to keyword-based re-analysis for simplicity
444
+
445
+
446
def plot_sentiment_pie(results, title="Reels Sentiment Analysis"):
    """
    Create a pie chart from sentiment analysis results.

    Args:
        results: Counter object or dict with 'positive', 'neutral', 'negative' keys
        title: Chart title

    Returns:
        Matplotlib Figure object, or None if the three counts sum to zero.
    """
    labels = ['Positive', 'Neutral', 'Negative']
    sizes = [results.get('positive', 0), results.get('neutral', 0), results.get('negative', 0)]

    if sum(sizes) == 0:
        return None

    colors = ['#4CAF50', '#FFC107', '#F44336']
    # Positive and negative wedges are pulled out slightly for emphasis
    explode_map = {'Positive': 0.05, 'Neutral': 0, 'Negative': 0.05}

    # Drop empty categories so the chart has no zero-width wedges/labels
    visible = [
        (label, size, color)
        for label, size, color in zip(labels, sizes, colors)
        if size > 0
    ]
    filtered_labels = [label for label, _, _ in visible]
    filtered_sizes = [size for _, size, _ in visible]
    filtered_colors = [color for _, _, color in visible]
    filtered_explode = [explode_map.get(label, 0) for label in filtered_labels]

    fig, ax = plt.subplots(figsize=(8, 6))

    ax.pie(filtered_sizes, explode=filtered_explode, labels=filtered_labels, colors=filtered_colors,
           autopct='%1.1f%%', shadow=True, startangle=140,
           textprops={'fontsize': 12, 'color': 'black'})

    ax.axis('equal')
    # Use the figure/axes created above rather than pyplot's implicit
    # "current figure", which may be a different figure when several plots
    # are being built in the same process.
    ax.set_title(title, fontsize=16, pad=20)
    fig.tight_layout()

    return fig
484
+
485
+ # --- Content Analysis Logic ---
486
+ # Content categories
487
+ content_categories = [
488
+ "news", "meme", "sports", "science", "music", "movie",
489
+ "gym", "comedy", "food", "technology", "travel", "fashion", "art", "business"
490
+ ]
491
+
492
+ category_keywords = {
493
+ "news": {"news", "update", "breaking", "reported", "headlines"},
494
+ "meme": {"meme", "funny", "lol", "haha", "relatable"},
495
+ "sports": {"sports", "cricket", "football", "match", "game", "team", "score"},
496
+ "science": {"science", "research", "discovery", "experiment", "facts", "theory"},
497
+ "music": {"music", "song", "album", "release", "artist", "beats"},
498
+ "movie": {"movie", "film", "bollywood", "trailer", "series", "actor"},
499
+ "gym": {"gym", "workout", "fitness", "exercise", "training", "bodybuilding"},
500
+ "comedy": {"comedy", "joke", "humor", "standup", "skit", "laugh"},
501
+ "food": {"food", "recipe", "cooking", "eat", "delicious", "restaurant", "kitchen"},
502
+ "technology": {"tech", "phone", "computer", "ai", "gadget", "software", "innovation"},
503
+ "travel": {"travel", "trip", "vacation", "explore", "destination", "adventure"},
504
+ "fashion": {"fashion", "style", "ootd", "outfit", "trends", "clothing"},
505
+ "art": {"art", "artist", "painting", "drawing", "creative", "design"},
506
+ "business": {"business", "startup", "marketing", "money", "finance", "entrepreneur"}
507
+ }
508
+
509
def preprocess_text_cat(text):
    """Basic text cleaning for categorization.

    Removes URLs, @mentions and #hashtags, lower-cases the text and collapses
    whitespace. Returns "" for empty/None input.
    """
    if not text:
        return ""
    # ``\w`` does not match Devanagari vowel signs / virama, so ``#\w+`` would
    # strip only part of a Hindi hashtag and leave orphaned marks behind.
    # The Devanagari block (minus danda punctuation) is added explicitly.
    tag_char = r"[\w\u0900-\u0963\u0966-\u097F]"
    text = re.sub(rf"http\S+|@\w+|#{tag_char}+", "", text).lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text
516
+
517
def classify_reel_content(text):
    """Assign a content category to a caption.

    Order of checks: captions with fewer than two words after cleaning are
    "other"; then the first category whose keyword set matches wins; otherwise
    the zero-shot pipeline decides, and its top label is accepted only when
    its score exceeds 0.5.

    Returns:
        (category, details) where details always carries a "reason" key.
    """
    cleaned = preprocess_text_cat(text)

    if not cleaned or len(cleaned.split()) < 2:
        return "other", {"reason": "short_text"}

    # Keyword pass: cheap and deterministic, so it runs before the model
    for category, keywords in category_keywords.items():
        for keyword in keywords:
            if re.search(rf"\b{re.escape(keyword)}\b", cleaned):
                return category, {"reason": "keyword_match"}

    # Only the first 256 characters are handed to the model
    snippet = cleaned[:256]

    classifier = content_classifier_pipeline
    if classifier is None:
        # Should not happen if initialized in analyze_reels_gradio or globally
        print("Content classifier pipeline not initialized in classify_reel_content.")
        return "other", {"reason": "classifier_not_initialized"}

    try:
        prediction = classifier(snippet, content_categories, multi_label=False)
        best_label = prediction['labels'][0]
        best_score = prediction['scores'][0]

        if best_score > 0.5:
            return best_label, {"reason": "model_prediction", "score": best_score}
        return "other", {"reason": "low_model_confidence", "score": best_score}

    except Exception as e:
        print(f"Error during zero-shot classification for text '{snippet}...': {e}")
        return "other", {"reason": "classification_error"}
550
+
551
+
552
def plot_category_distribution(counter, title="Reels Content Distribution"):
    """
    Generate a pie chart from category counts.

    Categories below 2% of the total, and the literal "other" category, are
    merged into a single "Other" wedge placed last.

    Args:
        counter: Counter object with category counts.
        title: Chart title.

    Returns:
        Matplotlib Figure object, or None if no data.
    """
    total = sum(counter.values())
    if total == 0:
        return None

    threshold = total * 0.02
    labels = []
    sizes = []
    other_count = 0

    # most_common() keeps the largest wedges first
    for category, count in counter.most_common():
        if category != "other" and count >= threshold:
            labels.append(category.replace('_', ' ').title())
            sizes.append(count)
        else:
            other_count += count

    if other_count > 0:
        labels.append("Other")
        sizes.append(other_count)

    if not sizes:
        return None

    fig, ax = plt.subplots(figsize=(10, 8))
    colors = plt.cm.viridis(np.linspace(0, 1, len(sizes)))

    ax.pie(
        sizes,
        labels=labels,
        autopct='%1.1f%%',
        startangle=140,
        colors=colors,
        wedgeprops={'edgecolor': 'white', 'linewidth': 1},
        textprops={'fontsize': 11, 'color': 'black'}
    )

    # Operate on the figure/axes created above instead of pyplot's implicit
    # "current figure", which may belong to another plot in the same process.
    ax.set_title(title, pad=20, fontsize=15)
    ax.axis('equal')
    fig.tight_layout()

    return fig
610
+
611
+
612
+ # --- Gradio-Compatible Functions ---
613
+ # Preset Instagram username. NOTE: it is hard-coded below, not read from Colab secrets;
614
+ # only the password is fetched from Colab secrets (key 'password') at login time.
615
+ USERNAME = "jattman1993" # Replace with your preset username or fetch from secrets if needed
616
+
617
def login_gradio_auto():
    """Gradio-compatible function for automatic login.

    Logs in as USERNAME with the password stored under the Colab secret
    'password'.

    Returns:
        (status message, gr.update controlling visibility of the OTP input).
        The OTP input is shown only when two-factor authentication is needed.
    """
    global cl
    try:
        # Fetch password securely from Colab secrets
        PASSWORD = userdata.get('password')
    except Exception as e:
        return f"Error accessing password secret: {e}", gr.update(visible=False)  # Hide OTP input on error

    if not PASSWORD:
        return "Error: Instagram password not found in Colab secrets. Please add it to Colab secrets with the key 'password'.", gr.update(visible=False)  # Hide OTP input

    cl = Client()

    try:
        cl.login(USERNAME, PASSWORD)
    except Exception as e:
        error_message = str(e)
        # Detect 2FA by exception class name as well as by message text, so
        # detection does not depend on the exact wording of the error.
        needs_two_factor = (
            type(e).__name__ == "TwoFactorRequired"
            or "Two factor challenged" in error_message
            or "challenge_required" in error_message
        )
        if needs_two_factor:
            # Keep the client alive: submit_otp_gradio refuses to run when
            # cl is None, so clearing it here would make the OTP step
            # impossible to complete.
            return "Login failed: Two-factor authentication required. Please enter the code below.", gr.update(visible=True)

        # For other errors, drop the client, hide OTP input and show the error
        cl = None
        return f"Error during login: {error_message}", gr.update(visible=False)

    # Login succeeded: no OTP needed
    return f"Successfully logged in as {USERNAME}", gr.update(visible=False)
644
+
645
+ # Function to handle OTP submission (if 2FA was required)
646
def submit_otp_gradio(otp_code):
    """Gradio-compatible function to submit OTP.

    Args:
        otp_code: The verification code typed by the user.

    Returns:
        A triple (status message, new value for the OTP textbox,
        gr.update for the OTP row). The textbox is always cleared; the row
        stays visible while a retry makes sense.
    """
    global cl
    if cl is None:
        return "Error: Not logged in or client not initialized.", "", gr.update(visible=False)  # Hide OTP input

    # Reject blank input locally instead of sending a pointless request.
    code = str(otp_code or "").strip()
    if not code:
        return "OTP submission failed: no code entered. Please try again.", "", gr.update(visible=True)

    try:
        # The two-factor step is completed by repeating the login call with
        # the verification code supplied, so the password is needed again.
        # NOTE(review): this relies on the client's documented
        # `login(username, password, verification_code=...)` signature; a
        # "challenge_required" (non-2FA) challenge may need a different
        # flow - confirm against the client library docs.
        password = userdata.get('password')
        cl.login(USERNAME, password, verification_code=code)
    except Exception as e:
        # If OTP fails, keep the OTP input visible so the user can retry
        return f"OTP submission failed: {e}. Please try again.", "", gr.update(visible=True)

    # If OTP is successful, clear the OTP input and hide the field
    return f"OTP successful. Successfully logged in as {USERNAME}.", "", gr.update(visible=False)
661
+
662
+
663
def fetch_reels_gradio(max_reels=100):
    """Gradio-compatible function to fetch explore reels.

    Args:
        max_reels: Upper bound on how many fetched reels are kept for
            analysis. Defaults to 100, the previously hard-coded cap.
            This only truncates the result; how many reels the client
            returns in the first place is up to ``cl.explore_reels()``.

    Returns:
        A human-readable status string for the Gradio status label.

    Side effects:
        Replaces the global ``explore_reels_list`` with the fetched reels,
        or with an empty list on any failure or empty result.
    """
    global cl
    global explore_reels_list

    # Reset up front so every early-return path leaves an empty list behind.
    explore_reels_list = []

    if cl is None:
        return "Error: Not logged in. Please log in first."

    try:
        limit = max(0, int(max_reels))
        fetched_reels = cl.explore_reels()[:limit]
    except Exception as e:
        return f"Error fetching explore reels: {e}"

    if not fetched_reels:
        return "Fetched 0 explore reels."

    explore_reels_list = fetched_reels
    return f"Successfully fetched {len(explore_reels_list)} explore reels."
685
 
686
 
687
  def analyze_reels_gradio(max_to_analyze):
 
702
  return "Error: No reels available to analyze.", None, None
703
 
704
 
705
+ # Initialize sentiment analyzer if not already done
706
  if sentiment_analyzer_instance is None:
707
+ try:
708
+ sentiment_analyzer_instance = ReelSentimentAnalyzer()
709
+ # Optional: Train Hindi model if needed and data is available
710
+ # sample_train_data = [...] # Define your training data
711
+ # sentiment_analyzer_instance.train_hindi_model(sample_train_data)
712
+ except Exception as e:
713
+ return f"Error initializing Sentiment Analyzer: {e}", None, None
714
+
715
+ # Initialize content classifier pipeline if not already done
716
  if content_classifier_pipeline is None:
717
+ try:
718
+ print("Initializing Content Classifier Pipeline...")
719
+ content_classifier_pipeline = pipeline(
720
+ "zero-shot-classification",
721
+ model="facebook/bart-large-mnli",
722
+ device=0 if torch.cuda.is_available() else -1 # Use GPU if available
723
+ )
724
+ print("Content Classifier Pipeline Initialized.")
725
+ except Exception as e:
726
+ return f"Error initializing Content Classifier: {e}", None, None
727
 
728
 
729
  analysis_status_messages = []
 
776
  return final_status_message, sentiment_plot_figure, content_plot_figure
777
 
778
 
 
 
 
 
779
  # --- Gradio Blocks Interface ---
780
  with gr.Blocks() as demo:
781
  gr.Markdown("# Instagram Reels Analysis")
782
+
783
+ # Login Section
784
  with gr.Row():
785
+ connect_button = gr.Button("Connect Instagram")
 
786
  login_status_output = gr.Label(label="Login Status")
787
 
788
+ # OTP Input (initially hidden)
789
+ with gr.Row(visible=False) as otp_row:
790
+ otp_input = gr.Textbox(label="Enter OTP Code")
791
+ otp_submit_button = gr.Button("Submit OTP")
792
+
793
+
794
+ # Fetch Reels Section
795
  with gr.Row():
796
  fetch_button = gr.Button("Fetch Reels")
797
  fetch_status_output = gr.Label(label="Fetch Status")
798
 
799
+ # Analysis Section
800
  with gr.Row():
801
  max_reels_input = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of Reels to Analyze")
802
  analyze_button = gr.Button("Analyze Reels")
803
 
804
  analyze_status_output = gr.Label(label="Analysis Status")
805
 
806
+ # Results Section
807
  with gr.Row():
808
  # Sentiment Analysis Outputs
809
  with gr.Column():
 
816
  content_plot_output = gr.Plot(label="Content Distribution")
817
 
818
 
819
+ # Link buttons to functions
820
+ connect_button.click(
821
+ fn=login_gradio_auto,
822
+ inputs=None, # No direct inputs, username is preset
823
+ outputs=[login_status_output, otp_row]
824
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
825
 
826
+ otp_submit_button.click(
827
+ fn=submit_otp_gradio,
828
+ inputs=otp_input,
829
+ outputs=[login_status_output, otp_input, otp_row]
830
  )
831
 
832
  fetch_button.click(
833
+ fn=fetch_reels_gradio,
834
  inputs=None, # No direct inputs needed for fetching
835
  outputs=fetch_status_output
836
  )
837
 
 
838
  analyze_button.click(
839
  fn=analyze_reels_gradio,
840
  inputs=max_reels_input, # Input is the slider value
841
  outputs=[analyze_status_output, sentiment_plot_output, content_plot_output] # Outputs are status and the two plots
842
  )
843
 
844
+ # --- Launch the Gradio app ---
845
if __name__ == "__main__":
    # Start the app only when this file is executed directly
    # (e.g. `python deploy.py`). Importing the module, or merely writing the
    # file from a Colab cell, does not run this branch, so no server is
    # started as a side effect.
    demo.launch()

# For testing from a Colab notebook, import this module (or run the cells
# that define `demo`) and call `demo.launch(share=True)` yourself.
# NOTE(review): whether a given hosting setup launches `demo` on its own or
# executes this file as a script should be confirmed for the deployment
# method in use; the guarded launch above is assumed to cover the script case.