Files changed (1) hide show
  1. app.py +136 -22
app.py CHANGED
@@ -155,13 +155,76 @@ class HistoryManager:
155
 
156
  # Core Analysis Engine
157
  class SentimentEngine:
158
- """Streamlined sentiment analysis"""
159
  def __init__(self):
160
  self.model_manager = ModelManager()
161
 
162
- @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  def analyze_single(self, text: str) -> Dict:
164
- """Analyze single text"""
165
  if not text.strip():
166
  raise ValueError("Empty text")
167
 
@@ -175,16 +238,21 @@ class SentimentEngine:
175
  probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
176
 
177
  sentiment = "Positive" if probs[1] > probs[0] else "Negative"
 
 
 
 
178
  return {
179
  'sentiment': sentiment,
180
  'confidence': float(probs.max()),
181
  'pos_prob': float(probs[1]),
182
- 'neg_prob': float(probs[0])
 
183
  }
184
 
185
  @handle_errors(default_return=[])
186
  def analyze_batch(self, texts: List[str], progress_callback=None) -> List[Dict]:
187
- """Optimized batch processing"""
188
  if len(texts) > config.BATCH_SIZE_LIMIT:
189
  texts = texts[:config.BATCH_SIZE_LIMIT]
190
 
@@ -208,13 +276,17 @@ class SentimentEngine:
208
 
209
  for text, prob in zip(batch, probs):
210
  sentiment = "Positive" if prob[1] > prob[0] else "Negative"
 
 
 
211
  results.append({
212
  'text': text[:50] + '...' if len(text) > 50 else text,
213
  'full_text': text,
214
  'sentiment': sentiment,
215
  'confidence': float(prob.max()),
216
  'pos_prob': float(prob[1]),
217
- 'neg_prob': float(prob[0])
 
218
  })
219
 
220
  return results
@@ -275,6 +347,40 @@ class PlotFactory:
275
  fig.tight_layout()
276
  return fig
277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  @staticmethod
279
  @handle_errors(default_return=None)
280
  def create_wordcloud(text: str, sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
@@ -346,7 +452,7 @@ class DataHandler:
346
 
347
  if format_type == 'csv':
348
  writer = csv.writer(temp_file)
349
- writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Pos_Prob', 'Neg_Prob'])
350
  for entry in data:
351
  writer.writerow([
352
  entry.get('timestamp', ''),
@@ -354,7 +460,8 @@ class DataHandler:
354
  entry.get('sentiment', ''),
355
  f"{entry.get('confidence', 0):.4f}",
356
  f"{entry.get('pos_prob', 0):.4f}",
357
- f"{entry.get('neg_prob', 0):.4f}"
 
358
  ])
359
  elif format_type == 'json':
360
  json.dump(data, temp_file, indent=2, ensure_ascii=False)
@@ -394,18 +501,18 @@ class SentimentApp:
394
 
395
  # Example data
396
  self.examples = [
397
- ["While the film's visual effects were undeniably impressive, the story lacked emotional weight, and the pacing felt inconsistent throughout."],
398
- ["An extraordinary achievement in filmmaking — the direction was masterful, the script was sharp, and every performance added depth and realism."],
399
- ["Despite a promising start, the film quickly devolved into a series of clichés, with weak character development and an ending that felt rushed and unearned."],
400
- ["A beautifully crafted story with heartfelt moments and a soundtrack that perfectly captured the emotional tone of each scene."],
401
- ["The movie was far too long, with unnecessary subplots and dull dialogue that made it difficult to stay engaged until the end."]
402
  ]
403
 
404
- @handle_errors(default_return=("Please enter text", None, None, None))
405
  def analyze_single(self, text: str, theme: str = 'default'):
406
- """Single text analysis"""
407
  if not text.strip():
408
- return "Please enter text", None, None, None
409
 
410
  result = self.engine.analyze_single(text)
411
 
@@ -423,9 +530,14 @@ class SentimentApp:
423
  prob_plot = PlotFactory.create_sentiment_bars(probs, theme_ctx)
424
  gauge_plot = PlotFactory.create_confidence_gauge(result['confidence'], result['sentiment'], theme_ctx)
425
  cloud_plot = PlotFactory.create_wordcloud(text, result['sentiment'], theme_ctx)
 
426
 
427
- result_text = f"Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.3f})"
428
- return result_text, prob_plot, gauge_plot, cloud_plot
 
 
 
 
429
 
430
  @handle_errors(default_return=None)
431
  def analyze_batch(self, reviews: str, progress=None):
@@ -492,7 +604,7 @@ def create_interface():
492
 
493
  with gr.Blocks(theme=gr.themes.Soft(), title="Movie Sentiment Analyzer") as demo:
494
  gr.Markdown("# 🎬 AI Movie Sentiment Analyzer")
495
- gr.Markdown("Optimized sentiment analysis with advanced visualizations")
496
 
497
  with gr.Tab("Single Analysis"):
498
  with gr.Row():
@@ -516,13 +628,15 @@ def create_interface():
516
  )
517
 
518
  with gr.Column():
519
- result_output = gr.Textbox(label="Result", lines=2)
520
 
521
  with gr.Row():
522
  prob_plot = gr.Plot(label="Probabilities")
523
  gauge_plot = gr.Plot(label="Confidence")
524
 
525
- wordcloud_plot = gr.Plot(label="Word Cloud")
 
 
526
 
527
  with gr.Tab("Batch Analysis"):
528
  with gr.Row():
@@ -558,7 +672,7 @@ def create_interface():
558
  analyze_btn.click(
559
  app.analyze_single,
560
  inputs=[text_input, theme_selector],
561
- outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot]
562
  )
563
 
564
  load_btn.click(app.data_handler.process_file, inputs=file_upload, outputs=batch_input)
 
155
 
156
  # Core Analysis Engine
157
  class SentimentEngine:
158
+ """Streamlined sentiment analysis with attention-based keyword extraction"""
159
  def __init__(self):
160
  self.model_manager = ModelManager()
161
 
162
def extract_key_words(self, text: str, top_k: int = 10) -> List[Tuple[str, float]]:
    """Rank the words of *text* by the attention the model pays to them.

    The text is tokenized and run through the model with
    ``output_attentions=True``. The last layer's attention is averaged
    over heads, and the row at sequence index 0 (treated here as the
    [CLS] position) supplies one score per token. WordPiece continuation
    tokens (``##...``) are glued back onto the preceding token; a merged
    word keeps the highest score among its pieces.

    Args:
        text: Raw input text to analyze.
        top_k: Maximum number of ``(word, score)`` pairs to return.

    Returns:
        Up to ``top_k`` ``(word, score)`` pairs sorted by descending
        score. Words are lower-cased, at least ``config.MIN_WORD_LENGTH``
        characters long and not in ``config.STOP_WORDS``. Scores are
        built-in ``float`` values so the result can be passed to
        ``json.dump``. An empty list is returned (and the error logged)
        if any step fails.
    """
    try:
        inputs = self.model_manager.tokenizer(
            text, return_tensors="pt", padding=True,
            truncation=True, max_length=config.MAX_TEXT_LENGTH
        ).to(self.model_manager.device)

        # Inference only: no gradients are needed for attention inspection.
        with torch.no_grad():
            outputs = self.model_manager.model(**inputs, output_attentions=True)

        # Last layer, mean over dim 1 (heads), then batch item 0 / query
        # position 0 -> one attention score per token of the sequence.
        cls_attention = outputs.attentions[-1].mean(dim=1)[0, 0, :]

        tokens = self.model_manager.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        attention_scores = cls_attention.cpu().numpy()

        word_scores = {}

        def remember(word, score):
            """Record a finished word, keeping its best score across repeats."""
            if not word or len(word) < config.MIN_WORD_LENGTH:
                return
            key = word.lower()
            # float() turns the numpy scalar into a JSON-serializable value.
            value = float(score)
            if key not in word_scores or value > word_scores[key]:
                word_scores[key] = value

        current_word = ""
        current_score = 0.0
        for token, score in zip(tokens, attention_scores):
            if token in ('[CLS]', '[SEP]', '[PAD]'):
                continue

            if token.startswith('##'):
                # Continuation piece: extend the word, keep the max attention.
                current_word += token[2:]
                current_score = max(current_score, score)
            else:
                # A new word starts here, so the previous one is complete.
                remember(current_word, current_score)
                current_word = token
                current_score = score

        # The loop only flushes on the *next* word, so flush the last one.
        remember(current_word, current_score)

        ranked = sorted(
            ((word, score) for word, score in word_scores.items()
             if word not in config.STOP_WORDS),
            key=lambda item: item[1],
            reverse=True,
        )
        return ranked[:top_k]

    except Exception as e:
        # Best-effort feature: a failure here must not break sentiment analysis.
        logger.error(f"Key word extraction failed: {e}")
        return []
224
+
225
+ @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0, 'key_words': []})
226
  def analyze_single(self, text: str) -> Dict:
227
+ """Analyze single text with key word extraction"""
228
  if not text.strip():
229
  raise ValueError("Empty text")
230
 
 
238
  probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
239
 
240
  sentiment = "Positive" if probs[1] > probs[0] else "Negative"
241
+
242
+ # Extract key contributing words
243
+ key_words = self.extract_key_words(text)
244
+
245
  return {
246
  'sentiment': sentiment,
247
  'confidence': float(probs.max()),
248
  'pos_prob': float(probs[1]),
249
+ 'neg_prob': float(probs[0]),
250
+ 'key_words': key_words
251
  }
252
 
253
  @handle_errors(default_return=[])
254
  def analyze_batch(self, texts: List[str], progress_callback=None) -> List[Dict]:
255
+ """Optimized batch processing with key words"""
256
  if len(texts) > config.BATCH_SIZE_LIMIT:
257
  texts = texts[:config.BATCH_SIZE_LIMIT]
258
 
 
276
 
277
  for text, prob in zip(batch, probs):
278
  sentiment = "Positive" if prob[1] > prob[0] else "Negative"
279
+ # Extract key words for each text in batch
280
+ key_words = self.extract_key_words(text, top_k=5) # Fewer for batch processing
281
+
282
  results.append({
283
  'text': text[:50] + '...' if len(text) > 50 else text,
284
  'full_text': text,
285
  'sentiment': sentiment,
286
  'confidence': float(prob.max()),
287
  'pos_prob': float(prob[1]),
288
+ 'neg_prob': float(prob[0]),
289
+ 'key_words': key_words
290
  })
291
 
292
  return results
 
347
  fig.tight_layout()
348
  return fig
349
 
350
@staticmethod
@handle_errors(default_return=None)
def create_keyword_chart(key_words: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
    """Draw the key words as a horizontal bar chart of attention weights.

    Args:
        key_words: ``(word, score)`` pairs, in the order they should be
            listed from top to bottom.
        sentiment: Sentiment label; ``'Positive'`` selects the theme's
            ``'pos'`` colour, any other value selects ``'neg'``.
        theme: Theme object whose ``colors`` mapping supplies bar colours.

    Returns:
        The figure, or ``None`` when ``key_words`` is empty.
    """
    if not key_words:
        return None

    with managed_figure(figsize=config.FIGURE_SIZE_SINGLE) as fig:
        axes = fig.add_subplot(111)

        labels = []
        weights = []
        for label, weight in key_words:
            labels.append(label)
            weights.append(weight)

        # One colour for the whole chart, picked by the predicted sentiment.
        bar_color = theme.colors['pos' if sentiment == 'Positive' else 'neg']

        positions = range(len(labels))
        bars = axes.barh(positions, weights, color=bar_color, alpha=0.7)
        axes.set_yticks(positions)
        axes.set_yticklabels(labels)
        axes.set_xlabel('Attention Weight')
        axes.set_title(f'Top Contributing Words ({sentiment})', fontweight='bold')

        # Print each weight just past the end of its bar.
        for bar, weight in zip(bars, weights):
            label_x = bar.get_width() + 0.001
            label_y = bar.get_y() + bar.get_height()/2.
            axes.text(label_x, label_y, f'{weight:.3f}',
                      ha='left', va='center', fontsize=9)

        # barh puts index 0 at the bottom; flip so the first word is on top.
        axes.invert_yaxis()
        axes.grid(axis='x', alpha=0.3)
        fig.tight_layout()
        return fig
383
+
384
  @staticmethod
385
  @handle_errors(default_return=None)
386
  def create_wordcloud(text: str, sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
 
452
 
453
  if format_type == 'csv':
454
  writer = csv.writer(temp_file)
455
+ writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Pos_Prob', 'Neg_Prob', 'Key_Words'])
456
  for entry in data:
457
  writer.writerow([
458
  entry.get('timestamp', ''),
 
460
  entry.get('sentiment', ''),
461
  f"{entry.get('confidence', 0):.4f}",
462
  f"{entry.get('pos_prob', 0):.4f}",
463
+ f"{entry.get('neg_prob', 0):.4f}",
464
+ "|".join([f"{word}:{score:.3f}" for word, score in entry.get('key_words', [])])
465
  ])
466
  elif format_type == 'json':
467
  json.dump(data, temp_file, indent=2, ensure_ascii=False)
 
501
 
502
  # Example data
503
  self.examples = [
504
+ ["The cinematography was stunning but the plot was predictable and lacked depth."],
505
+ ["A masterpiece! Powerful performances and unforgettable scenes throughout."],
506
+ ["Boring from start to finish with terrible acting and weak plot development."],
507
+ ["Impressive effects but the story was confusing and difficult to follow."],
508
+ ["Absolutely incredible ending - one of the best films in recent years!"]
509
  ]
510
 
511
+ @handle_errors(default_return=("Please enter text", None, None, None, None))
512
  def analyze_single(self, text: str, theme: str = 'default'):
513
+ """Single text analysis with key words"""
514
  if not text.strip():
515
+ return "Please enter text", None, None, None, None
516
 
517
  result = self.engine.analyze_single(text)
518
 
 
530
  prob_plot = PlotFactory.create_sentiment_bars(probs, theme_ctx)
531
  gauge_plot = PlotFactory.create_confidence_gauge(result['confidence'], result['sentiment'], theme_ctx)
532
  cloud_plot = PlotFactory.create_wordcloud(text, result['sentiment'], theme_ctx)
533
+ keyword_plot = PlotFactory.create_keyword_chart(result['key_words'], result['sentiment'], theme_ctx)
534
 
535
+ # Format result text with key words
536
+ key_words_str = ", ".join([f"{word}({score:.3f})" for word, score in result['key_words'][:5]])
537
+ result_text = (f"Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.3f})\n"
538
+ f"Key Words: {key_words_str}")
539
+
540
+ return result_text, prob_plot, gauge_plot, cloud_plot, keyword_plot
541
 
542
  @handle_errors(default_return=None)
543
  def analyze_batch(self, reviews: str, progress=None):
 
604
 
605
  with gr.Blocks(theme=gr.themes.Soft(), title="Movie Sentiment Analyzer") as demo:
606
  gr.Markdown("# 🎬 AI Movie Sentiment Analyzer")
607
+ gr.Markdown("Optimized sentiment analysis with advanced visualizations and key word extraction")
608
 
609
  with gr.Tab("Single Analysis"):
610
  with gr.Row():
 
628
  )
629
 
630
  with gr.Column():
631
+ result_output = gr.Textbox(label="Result", lines=3)
632
 
633
  with gr.Row():
634
  prob_plot = gr.Plot(label="Probabilities")
635
  gauge_plot = gr.Plot(label="Confidence")
636
 
637
+ with gr.Row():
638
+ wordcloud_plot = gr.Plot(label="Word Cloud")
639
+ keyword_plot = gr.Plot(label="Key Contributing Words")
640
 
641
  with gr.Tab("Batch Analysis"):
642
  with gr.Row():
 
672
  analyze_btn.click(
673
  app.analyze_single,
674
  inputs=[text_input, theme_selector],
675
+ outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot, keyword_plot]
676
  )
677
 
678
  load_btn.click(app.data_handler.process_file, inputs=file_upload, outputs=batch_input)