Update app.py
#1 by nonnan - opened

app.py CHANGED
@@ -155,13 +155,76 @@ class HistoryManager:
 
 # Core Analysis Engine
 class SentimentEngine:
-    """Streamlined sentiment analysis"""
+    """Streamlined sentiment analysis with attention-based keyword extraction"""
     def __init__(self):
         self.model_manager = ModelManager()
 
-
+    def extract_key_words(self, text: str, top_k: int = 10) -> List[Tuple[str, float]]:
+        """Extract contributing words using BERT attention weights"""
+        try:
+            inputs = self.model_manager.tokenizer(
+                text, return_tensors="pt", padding=True,
+                truncation=True, max_length=config.MAX_TEXT_LENGTH
+            ).to(self.model_manager.device)
+
+            # Get model outputs with attention weights
+            with torch.no_grad():
+                outputs = self.model_manager.model(**inputs, output_attentions=True)
+                attention = outputs.attentions  # Tuple of attention tensors for each layer
+
+            # Use the last layer's attention, average over all heads
+            last_attention = attention[-1]  # Shape: [batch_size, num_heads, seq_len, seq_len]
+            avg_attention = last_attention.mean(dim=1)  # Average over heads: [batch_size, seq_len, seq_len]
+
+            # Focus on attention to [CLS] token (index 0) as it represents the whole sequence
+            cls_attention = avg_attention[0, 0, :]  # Attention from CLS to all tokens
+
+            # Get tokens and their attention scores
+            tokens = self.model_manager.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
+            attention_scores = cls_attention.cpu().numpy()
+
+            # Filter out special tokens and combine subword tokens
+            word_scores = {}
+            current_word = ""
+            current_score = 0.0
+
+            for i, (token, score) in enumerate(zip(tokens, attention_scores)):
+                if token in ['[CLS]', '[SEP]', '[PAD]']:
+                    continue
+
+                if token.startswith('##'):
+                    # Subword token, add to current word
+                    current_word += token[2:]
+                    current_score = max(current_score, score)  # Take max attention
+                else:
+                    # New word, save previous if exists
+                    if current_word and len(current_word) >= config.MIN_WORD_LENGTH:
+                        word_scores[current_word.lower()] = current_score
+
+                    current_word = token
+                    current_score = score
+
+            # Don't forget the last word
+            if current_word and len(current_word) >= config.MIN_WORD_LENGTH:
+                word_scores[current_word.lower()] = current_score
+
+            # Filter out stop words and sort by attention score
+            filtered_words = {
+                word: score for word, score in word_scores.items()
+                if word not in config.STOP_WORDS and len(word) >= config.MIN_WORD_LENGTH
+            }
+
+            # Sort by attention score and return top_k
+            sorted_words = sorted(filtered_words.items(), key=lambda x: x[1], reverse=True)
+            return sorted_words[:top_k]
+
+        except Exception as e:
+            logger.error(f"Key word extraction failed: {e}")
+            return []
+
+    @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0, 'key_words': []})
     def analyze_single(self, text: str) -> Dict:
-        """Analyze single text"""
+        """Analyze single text with key word extraction"""
         if not text.strip():
             raise ValueError("Empty text")
 
@@ -175,16 +238,21 @@ class SentimentEngine:
         probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
 
         sentiment = "Positive" if probs[1] > probs[0] else "Negative"
+
+        # Extract key contributing words
+        key_words = self.extract_key_words(text)
+
         return {
             'sentiment': sentiment,
             'confidence': float(probs.max()),
             'pos_prob': float(probs[1]),
-            'neg_prob': float(probs[0])
+            'neg_prob': float(probs[0]),
+            'key_words': key_words
         }
 
     @handle_errors(default_return=[])
     def analyze_batch(self, texts: List[str], progress_callback=None) -> List[Dict]:
-        """Optimized batch processing"""
+        """Optimized batch processing with key words"""
         if len(texts) > config.BATCH_SIZE_LIMIT:
             texts = texts[:config.BATCH_SIZE_LIMIT]
 
@@ -208,13 +276,17 @@ class SentimentEngine:
 
             for text, prob in zip(batch, probs):
                 sentiment = "Positive" if prob[1] > prob[0] else "Negative"
+                # Extract key words for each text in batch
+                key_words = self.extract_key_words(text, top_k=5)  # Fewer for batch processing
+
                 results.append({
                     'text': text[:50] + '...' if len(text) > 50 else text,
                     'full_text': text,
                     'sentiment': sentiment,
                     'confidence': float(prob.max()),
                     'pos_prob': float(prob[1]),
-                    'neg_prob': float(prob[0])
+                    'neg_prob': float(prob[0]),
+                    'key_words': key_words
                 })
 
         return results
@@ -275,6 +347,40 @@ class PlotFactory:
         fig.tight_layout()
         return fig
 
+    @staticmethod
+    @handle_errors(default_return=None)
+    def create_keyword_chart(key_words: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
+        """Create horizontal bar chart for key contributing words"""
+        if not key_words:
+            return None
+
+        with managed_figure(figsize=config.FIGURE_SIZE_SINGLE) as fig:
+            ax = fig.add_subplot(111)
+
+            words = [word for word, score in key_words]
+            scores = [score for word, score in key_words]
+
+            # Choose color based on sentiment
+            color = theme.colors['pos'] if sentiment == 'Positive' else theme.colors['neg']
+
+            # Create horizontal bar chart
+            bars = ax.barh(range(len(words)), scores, color=color, alpha=0.7)
+            ax.set_yticks(range(len(words)))
+            ax.set_yticklabels(words)
+            ax.set_xlabel('Attention Weight')
+            ax.set_title(f'Top Contributing Words ({sentiment})', fontweight='bold')
+
+            # Add value labels on bars
+            for i, (bar, score) in enumerate(zip(bars, scores)):
+                ax.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2.,
+                        f'{score:.3f}', ha='left', va='center', fontsize=9)
+
+            # Invert y-axis to show highest scoring word at top
+            ax.invert_yaxis()
+            ax.grid(axis='x', alpha=0.3)
+            fig.tight_layout()
+            return fig
+
     @staticmethod
     @handle_errors(default_return=None)
     def create_wordcloud(text: str, sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
@@ -346,7 +452,7 @@ class DataHandler:
 
         if format_type == 'csv':
             writer = csv.writer(temp_file)
-            writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Pos_Prob', 'Neg_Prob'])
+            writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Pos_Prob', 'Neg_Prob', 'Key_Words'])
             for entry in data:
                 writer.writerow([
                     entry.get('timestamp', ''),
@@ -354,7 +460,8 @@ class DataHandler:
                     entry.get('sentiment', ''),
                     f"{entry.get('confidence', 0):.4f}",
                     f"{entry.get('pos_prob', 0):.4f}",
-                    f"{entry.get('neg_prob', 0):.4f}"
+                    f"{entry.get('neg_prob', 0):.4f}",
+                    "|".join([f"{word}:{score:.3f}" for word, score in entry.get('key_words', [])])
                 ])
         elif format_type == 'json':
             json.dump(data, temp_file, indent=2, ensure_ascii=False)
@@ -394,18 +501,18 @@ class SentimentApp:
 
         # Example data
        self.examples = [
-            ["
-            ["
-            ["
-            ["
-            ["
+            ["The cinematography was stunning but the plot was predictable and lacked depth."],
+            ["A masterpiece! Powerful performances and unforgettable scenes throughout."],
+            ["Boring from start to finish with terrible acting and weak plot development."],
+            ["Impressive effects but the story was confusing and difficult to follow."],
+            ["Absolutely incredible ending - one of the best films in recent years!"]
         ]
 
-    @handle_errors(default_return=("Please enter text", None, None, None))
+    @handle_errors(default_return=("Please enter text", None, None, None, None))
     def analyze_single(self, text: str, theme: str = 'default'):
-        """Single text analysis"""
+        """Single text analysis with key words"""
         if not text.strip():
-            return "Please enter text", None, None, None
+            return "Please enter text", None, None, None, None
 
         result = self.engine.analyze_single(text)
 
@@ -423,9 +530,14 @@ class SentimentApp:
         prob_plot = PlotFactory.create_sentiment_bars(probs, theme_ctx)
         gauge_plot = PlotFactory.create_confidence_gauge(result['confidence'], result['sentiment'], theme_ctx)
         cloud_plot = PlotFactory.create_wordcloud(text, result['sentiment'], theme_ctx)
+        keyword_plot = PlotFactory.create_keyword_chart(result['key_words'], result['sentiment'], theme_ctx)
 
-
-
+        # Format result text with key words
+        key_words_str = ", ".join([f"{word}({score:.3f})" for word, score in result['key_words'][:5]])
+        result_text = (f"Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.3f})\n"
+                       f"Key Words: {key_words_str}")
+
+        return result_text, prob_plot, gauge_plot, cloud_plot, keyword_plot
 
     @handle_errors(default_return=None)
     def analyze_batch(self, reviews: str, progress=None):
@@ -492,7 +604,7 @@ def create_interface():
 
     with gr.Blocks(theme=gr.themes.Soft(), title="Movie Sentiment Analyzer") as demo:
         gr.Markdown("# 🎬 AI Movie Sentiment Analyzer")
-        gr.Markdown("Optimized sentiment analysis with advanced visualizations")
+        gr.Markdown("Optimized sentiment analysis with advanced visualizations and key word extraction")
 
         with gr.Tab("Single Analysis"):
             with gr.Row():
@@ -516,13 +628,15 @@ def create_interface():
                     )
 
                 with gr.Column():
-                    result_output = gr.Textbox(label="Result", lines=
+                    result_output = gr.Textbox(label="Result", lines=3)
 
                     with gr.Row():
                         prob_plot = gr.Plot(label="Probabilities")
                         gauge_plot = gr.Plot(label="Confidence")
 
-
+                    with gr.Row():
+                        wordcloud_plot = gr.Plot(label="Word Cloud")
+                        keyword_plot = gr.Plot(label="Key Contributing Words")
 
         with gr.Tab("Batch Analysis"):
             with gr.Row():
@@ -558,7 +672,7 @@ def create_interface():
         analyze_btn.click(
             app.analyze_single,
             inputs=[text_input, theme_selector],
-            outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot]
+            outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot, keyword_plot]
         )
 
        load_btn.click(app.data_handler.process_file, inputs=file_upload, outputs=batch_input)
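A note for review: the keyword heuristic in extract_key_words ranks words by the last layer's [CLS]-row attention, averaged over heads. The snippet below is a minimal standalone sketch of that same idea against a stock Hugging Face checkpoint, handy for sanity-checking the scores outside the Space. The checkpoint name and the helper function here are illustrative assumptions, not the ModelManager configuration this app actually uses.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Illustrative checkpoint only; the Space's ModelManager may load a different model.
MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()

def keywords_by_cls_attention(text: str, top_k: int = 5):
    """Rank tokens by the attention the [CLS] token pays to them in the last layer."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)
    # outputs.attentions: one tensor per layer, each [batch, heads, seq_len, seq_len]
    last_layer = outputs.attentions[-1].mean(dim=1)   # average over heads -> [batch, seq, seq]
    cls_row = last_layer[0, 0]                        # attention from [CLS] to every token
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    scored = [
        (tok, float(score))
        for tok, score in zip(tokens, cls_row)
        if tok not in ("[CLS]", "[SEP]", "[PAD]") and not tok.startswith("##")
    ]
    return sorted(scored, key=lambda item: item[1], reverse=True)[:top_k]

print(keywords_by_cls_attention("The acting was wonderful but the pacing dragged."))

Worth keeping in mind: raw attention weights are only a rough saliency proxy and do not always agree with gradient-based attributions, and in BERT-family models the [CLS] row tends to concentrate on [SEP] and punctuation, so the special-token and stop-word filtering in extract_key_words is doing real work.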