Update app.py
#1 by nonnan - opened

app.py CHANGED
@@ -155,13 +155,76 @@ class HistoryManager:
 
 # Core Analysis Engine
 class SentimentEngine:
-    """Streamlined sentiment analysis"""
+    """Streamlined sentiment analysis with attention-based keyword extraction"""
     def __init__(self):
         self.model_manager = ModelManager()
 
-
+    def extract_key_words(self, text: str, top_k: int = 10) -> List[Tuple[str, float]]:
+        """Extract contributing words using BERT attention weights"""
+        try:
+            inputs = self.model_manager.tokenizer(
+                text, return_tensors="pt", padding=True,
+                truncation=True, max_length=config.MAX_TEXT_LENGTH
+            ).to(self.model_manager.device)
+
+            # Get model outputs with attention weights
+            with torch.no_grad():
+                outputs = self.model_manager.model(**inputs, output_attentions=True)
+                attention = outputs.attentions  # Tuple of attention tensors for each layer
+
+            # Use the last layer's attention, average over all heads
+            last_attention = attention[-1]  # Shape: [batch_size, num_heads, seq_len, seq_len]
+            avg_attention = last_attention.mean(dim=1)  # Average over heads: [batch_size, seq_len, seq_len]
+
+            # Focus on attention to [CLS] token (index 0) as it represents the whole sequence
+            cls_attention = avg_attention[0, 0, :]  # Attention from CLS to all tokens
+
+            # Get tokens and their attention scores
+            tokens = self.model_manager.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
+            attention_scores = cls_attention.cpu().numpy()
+
+            # Filter out special tokens and combine subword tokens
+            word_scores = {}
+            current_word = ""
+            current_score = 0.0
+
+            for i, (token, score) in enumerate(zip(tokens, attention_scores)):
+                if token in ['[CLS]', '[SEP]', '[PAD]']:
+                    continue
+
+                if token.startswith('##'):
+                    # Subword token, add to current word
+                    current_word += token[2:]
+                    current_score = max(current_score, score)  # Take max attention
+                else:
+                    # New word, save previous if exists
+                    if current_word and len(current_word) >= config.MIN_WORD_LENGTH:
+                        word_scores[current_word.lower()] = current_score
+
+                    current_word = token
+                    current_score = score
+
+            # Don't forget the last word
+            if current_word and len(current_word) >= config.MIN_WORD_LENGTH:
+                word_scores[current_word.lower()] = current_score
+
+            # Filter out stop words and sort by attention score
+            filtered_words = {
+                word: score for word, score in word_scores.items()
+                if word not in config.STOP_WORDS and len(word) >= config.MIN_WORD_LENGTH
+            }
+
+            # Sort by attention score and return top_k
+            sorted_words = sorted(filtered_words.items(), key=lambda x: x[1], reverse=True)
+            return sorted_words[:top_k]
+
+        except Exception as e:
+            logger.error(f"Key word extraction failed: {e}")
+            return []
+
+    @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0, 'key_words': []})
     def analyze_single(self, text: str) -> Dict:
-        """Analyze single text"""
+        """Analyze single text with key word extraction"""
         if not text.strip():
             raise ValueError("Empty text")
 
@@ -175,16 +238,21 @@ class SentimentEngine:
         probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
 
         sentiment = "Positive" if probs[1] > probs[0] else "Negative"
+
+        # Extract key contributing words
+        key_words = self.extract_key_words(text)
+
         return {
             'sentiment': sentiment,
             'confidence': float(probs.max()),
             'pos_prob': float(probs[1]),
-            'neg_prob': float(probs[0])
+            'neg_prob': float(probs[0]),
+            'key_words': key_words
         }
 
     @handle_errors(default_return=[])
     def analyze_batch(self, texts: List[str], progress_callback=None) -> List[Dict]:
-        """Optimized batch processing"""
+        """Optimized batch processing with key words"""
         if len(texts) > config.BATCH_SIZE_LIMIT:
             texts = texts[:config.BATCH_SIZE_LIMIT]
 
@@ -208,13 +276,17 @@ class SentimentEngine:
 
             for text, prob in zip(batch, probs):
                 sentiment = "Positive" if prob[1] > prob[0] else "Negative"
+                # Extract key words for each text in batch
+                key_words = self.extract_key_words(text, top_k=5)  # Fewer for batch processing
+
                 results.append({
                     'text': text[:50] + '...' if len(text) > 50 else text,
                     'full_text': text,
                     'sentiment': sentiment,
                     'confidence': float(prob.max()),
                     'pos_prob': float(prob[1]),
-                    'neg_prob': float(prob[0])
+                    'neg_prob': float(prob[0]),
+                    'key_words': key_words
                 })
 
         return results
@@ -275,6 +347,40 @@ class PlotFactory:
         fig.tight_layout()
         return fig
 
+    @staticmethod
+    @handle_errors(default_return=None)
+    def create_keyword_chart(key_words: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
+        """Create horizontal bar chart for key contributing words"""
+        if not key_words:
+            return None
+
+        with managed_figure(figsize=config.FIGURE_SIZE_SINGLE) as fig:
+            ax = fig.add_subplot(111)
+
+            words = [word for word, score in key_words]
+            scores = [score for word, score in key_words]
+
+            # Choose color based on sentiment
+            color = theme.colors['pos'] if sentiment == 'Positive' else theme.colors['neg']
+
+            # Create horizontal bar chart
+            bars = ax.barh(range(len(words)), scores, color=color, alpha=0.7)
+            ax.set_yticks(range(len(words)))
+            ax.set_yticklabels(words)
+            ax.set_xlabel('Attention Weight')
+            ax.set_title(f'Top Contributing Words ({sentiment})', fontweight='bold')
+
+            # Add value labels on bars
+            for i, (bar, score) in enumerate(zip(bars, scores)):
+                ax.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2.,
+                        f'{score:.3f}', ha='left', va='center', fontsize=9)
+
+            # Invert y-axis to show highest scoring word at top
+            ax.invert_yaxis()
+            ax.grid(axis='x', alpha=0.3)
+            fig.tight_layout()
+            return fig
+
     @staticmethod
     @handle_errors(default_return=None)
     def create_wordcloud(text: str, sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
@@ -346,7 +452,7 @@ class DataHandler:
 
         if format_type == 'csv':
             writer = csv.writer(temp_file)
-            writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Pos_Prob', 'Neg_Prob'])
+            writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Pos_Prob', 'Neg_Prob', 'Key_Words'])
             for entry in data:
                 writer.writerow([
                     entry.get('timestamp', ''),
@@ -354,7 +460,8 @@ class DataHandler:
                     entry.get('sentiment', ''),
                     f"{entry.get('confidence', 0):.4f}",
                     f"{entry.get('pos_prob', 0):.4f}",
-                    f"{entry.get('neg_prob', 0):.4f}"
+                    f"{entry.get('neg_prob', 0):.4f}",
+                    "|".join([f"{word}:{score:.3f}" for word, score in entry.get('key_words', [])])
                 ])
         elif format_type == 'json':
             json.dump(data, temp_file, indent=2, ensure_ascii=False)
@@ -394,18 +501,18 @@ class SentimentApp:
 
         # Example data
        self.examples = [
-            ["
-            ["
-            ["
-            ["
-            ["
+            ["The cinematography was stunning but the plot was predictable and lacked depth."],
+            ["A masterpiece! Powerful performances and unforgettable scenes throughout."],
+            ["Boring from start to finish with terrible acting and weak plot development."],
+            ["Impressive effects but the story was confusing and difficult to follow."],
+            ["Absolutely incredible ending - one of the best films in recent years!"]
         ]
 
-    @handle_errors(default_return=("Please enter text", None, None, None))
+    @handle_errors(default_return=("Please enter text", None, None, None, None))
     def analyze_single(self, text: str, theme: str = 'default'):
-        """Single text analysis"""
+        """Single text analysis with key words"""
         if not text.strip():
-            return "Please enter text", None, None, None
+            return "Please enter text", None, None, None, None
 
         result = self.engine.analyze_single(text)
 
@@ -423,9 +530,14 @@ class SentimentApp:
         prob_plot = PlotFactory.create_sentiment_bars(probs, theme_ctx)
         gauge_plot = PlotFactory.create_confidence_gauge(result['confidence'], result['sentiment'], theme_ctx)
         cloud_plot = PlotFactory.create_wordcloud(text, result['sentiment'], theme_ctx)
+        keyword_plot = PlotFactory.create_keyword_chart(result['key_words'], result['sentiment'], theme_ctx)
 
-
-
+        # Format result text with key words
+        key_words_str = ", ".join([f"{word}({score:.3f})" for word, score in result['key_words'][:5]])
+        result_text = (f"Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.3f})\n"
+                       f"Key Words: {key_words_str}")
+
+        return result_text, prob_plot, gauge_plot, cloud_plot, keyword_plot
 
     @handle_errors(default_return=None)
     def analyze_batch(self, reviews: str, progress=None):
@@ -492,7 +604,7 @@ def create_interface():
 
     with gr.Blocks(theme=gr.themes.Soft(), title="Movie Sentiment Analyzer") as demo:
         gr.Markdown("# 🎬 AI Movie Sentiment Analyzer")
-        gr.Markdown("Optimized sentiment analysis with advanced visualizations")
+        gr.Markdown("Optimized sentiment analysis with advanced visualizations and key word extraction")
 
         with gr.Tab("Single Analysis"):
             with gr.Row():
@@ -516,13 +628,15 @@ def create_interface():
                     )
 
                 with gr.Column():
-                    result_output = gr.Textbox(label="Result", lines=
+                    result_output = gr.Textbox(label="Result", lines=3)
 
                     with gr.Row():
                         prob_plot = gr.Plot(label="Probabilities")
                         gauge_plot = gr.Plot(label="Confidence")
 
-
+                    with gr.Row():
+                        wordcloud_plot = gr.Plot(label="Word Cloud")
+                        keyword_plot = gr.Plot(label="Key Contributing Words")
 
         with gr.Tab("Batch Analysis"):
             with gr.Row():
@@ -558,7 +672,7 @@ def create_interface():
         analyze_btn.click(
             app.analyze_single,
             inputs=[text_input, theme_selector],
-            outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot]
+            outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot, keyword_plot]
         )
 
        load_btn.click(app.data_handler.process_file, inputs=file_upload, outputs=batch_input)
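A note for review: the keyword heuristic in extract_key_words ranks words by the last layer's [CLS]-row attention, averaged over heads. The snippet below is a minimal standalone sketch of that same idea against a stock Hugging Face checkpoint, handy for sanity-checking the scores outside the Space. The checkpoint name and the helper function here are illustrative assumptions, not the ModelManager configuration this app actually uses.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Illustrative checkpoint only; the Space's ModelManager may load a different model.
MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()

def keywords_by_cls_attention(text: str, top_k: int = 5):
    """Rank tokens by the attention the [CLS] token pays to them in the last layer."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)
    # outputs.attentions: one tensor per layer, each [batch, heads, seq_len, seq_len]
    last_layer = outputs.attentions[-1].mean(dim=1)   # average over heads -> [batch, seq, seq]
    cls_row = last_layer[0, 0]                        # attention from [CLS] to every token
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    scored = [
        (tok, float(score))
        for tok, score in zip(tokens, cls_row)
        if tok not in ("[CLS]", "[SEP]", "[PAD]") and not tok.startswith("##")
    ]
    return sorted(scored, key=lambda item: item[1], reverse=True)[:top_k]

print(keywords_by_cls_attention("The acting was wonderful but the pacing dragged."))

Worth keeping in mind: raw attention weights are only a rough saliency proxy and do not always agree with gradient-based attributions, and in BERT-family models the [CLS] row tends to concentrate on [SEP] and punctuation, so the special-token and stop-word filtering in extract_key_words is doing real work.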