waka committed on
Commit 157d43d · verified · 1 Parent(s): 7da1b05

Upload 2 files

Files changed (2)
  1. app.py +634 -0
  2. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,634 @@
import gradio as gr
from gradio import SelectData
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import pandas as pd
from wordcloud import WordCloud
import io
import base64
from PIL import Image
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import nltk
from nltk.corpus import stopwords
from langdetect import detect
import langdetect
import re
from collections import Counter
from nltk.util import ngrams
from googletrans import Translator
import asyncio

# Download NLTK stopword data
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

# Supported languages
SUPPORTED_LANGUAGES = ['english', 'spanish', 'french', 'german', 'italian', 'portuguese', 'russian', 'arabic', 'japanese']

# Build a stopword set for each supported language
LANGUAGE_STOPWORDS = {}
for lang in SUPPORTED_LANGUAGES:
    if lang in stopwords.fileids():
        LANGUAGE_STOPWORDS[lang] = set(stopwords.words(lang))

# langdetect code -> NLTK stopword list name
LANG_CODE_MAP = {
    'en': 'english',
    'es': 'spanish',
    'fr': 'french',
    'de': 'german',
    'it': 'italian',
    'pt': 'portuguese',
    'ru': 'russian',
    'ar': 'arabic',
    'ja': 'japanese'
}

def get_stopwords(text):
    """Return the stopword set for the detected language of the text"""
    try:
        lang_code = detect(text)
        lang = LANG_CODE_MAP.get(lang_code, 'english')
        return LANGUAGE_STOPWORDS.get(lang, LANGUAGE_STOPWORDS['english'])
    except langdetect.LangDetectException:
        return LANGUAGE_STOPWORDS['english']

# Initialize the model and tokenizer
MODEL = "sohan-ai/sentiment-analysis-model-amazon-reviews"
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(MODEL)
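# NOTE (added comment): analyze_text below assumes this checkpoint emits
# logits ordered [negative, positive], i.e. index 1 is read as the positive class.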

# Global state
current_bigram_samples = []
FULL_BIGRAM_DF = pd.DataFrame()  # full, unfiltered bigram statistics
last_selected_reviews = []  # reviews for the most recently selected bigram
translator = Translator()  # translator instance

def filter_bigrams(search_text):
    """Filter the bigram table by a search string"""
    global FULL_BIGRAM_DF
    if FULL_BIGRAM_DF.empty or not search_text.strip():
        return FULL_BIGRAM_DF
    # case-insensitive substring match
    mask = FULL_BIGRAM_DF["词组"].str.contains(search_text, case=False, na=False)
    return FULL_BIGRAM_DF[mask]

def analyze_text(text):
    """Run sentiment analysis on a single text"""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs)
    scores = torch.nn.functional.softmax(outputs.logits, dim=1)
    scores = scores.detach().numpy()[0]

    return {
        "积极情感概率": float(scores[1]),
        "消极情感概率": float(scores[0]),
        "整体情感": "积极" if scores[1] > scores[0] else "消极"
    }
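
# Illustrative (added) example of the return shape; the values are made up:
#   analyze_text("Great quality, fast shipping")
#   -> {"积极情感概率": 0.97, "消极情感概率": 0.03, "整体情感": "积极"}
# (keys: positive probability, negative probability, overall sentiment)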

def preprocess_text(text):
    """Preprocess a text"""
    # lowercase
    text = text.lower()

    # strip special characters, keeping only letters and spaces
    text = re.sub(r'[^a-z\s]', ' ', text)

    # collapse repeated whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def extract_bigrams(texts, min_freq=2, max_freq_ratio=0.9):
    """Extract key two-word phrases (bigrams)"""
    # preprocess every text
    processed_texts = [preprocess_text(text) for text in texts]

    # collect every bigram and the texts it appears in
    all_bigrams = []
    bigram_texts = {}  # bigram -> list of original texts

    for text, processed in zip(texts, processed_texts):
        words = processed.split()
        text_bigrams = list(ngrams(words, 2))
        text_bigram_strs = [' '.join(bigram) for bigram in text_bigrams]
        all_bigrams.extend(text_bigram_strs)

        # record the original text for each bigram
        for bigram in text_bigram_strs:
            if bigram not in bigram_texts:
                bigram_texts[bigram] = []
            bigram_texts[bigram].append(text)

    # count bigram frequencies
    bigram_freq = Counter(all_bigrams)
    total_docs = len(texts)  # total number of reviews

    # keep only bigrams within the frequency bounds
    filtered_bigrams = {
        bigram: freq for bigram, freq in bigram_freq.items()
        if min_freq <= freq <= total_docs * max_freq_ratio
    }

    # rows for the bigram statistics DataFrame
    bigram_stats = []

    # samples for the Dataset component
    dataset_samples = []

    for bigram, freq in sorted(filtered_bigrams.items(), key=lambda x: x[1], reverse=True):
        # frequency as a percentage of the total review count
        percentage = freq / total_docs * 100
        # all texts containing this bigram
        related_texts = bigram_texts[bigram]

        # statistics row
        bigram_stats.append({
            "词组": bigram,
            "出现次数": freq,
            "占比": f"{percentage:.2f}%"  # share of the total review count
        })

        # Dataset sample
        formatted_texts = "\n\n".join(f"{i+1}. {text}" for i, text in enumerate(related_texts))
        dataset_samples.append([bigram, [formatted_texts]])

    return pd.DataFrame(bigram_stats), dataset_samples
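
# (added) Shape note: the DataFrame columns are 词组 (bigram), 出现次数 (count),
# 占比 (share); each dataset_samples entry is [bigram, [one string of the
# numbered source reviews, joined by blank lines]].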

def perform_lda_analysis(texts, n_topics=15):
    """Run LDA topic analysis"""
    # language-aware stopwords
    stop_words = list(get_stopwords(' '.join(texts)))

    # build the TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        max_df=0.9,             # ignore terms in more than 90% of documents
        min_df=2,               # ignore terms in fewer than 2 documents
        stop_words=stop_words,  # language-aware stopwords
        ngram_range=(2, 2)      # two-word phrases (bigrams) only
    )
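    # (added) scikit-learn's LDA is normally fit on raw term counts
    # (CountVectorizer); TF-IDF input still runs, but counts are the
    # textbook formulation.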

    # preprocess the texts
    processed_texts = [preprocess_text(text) for text in texts]

    # vectorize the texts
    try:
        tfidf = vectorizer.fit_transform(processed_texts)

        # create and fit the LDA model
        lda_model = LDA(
            n_components=n_topics,
            random_state=0
        )
        lda_output = lda_model.fit_transform(tfidf)

        # feature (term) names
        feature_names = vectorizer.get_feature_names_out()

        # collect the top terms for each topic
        topics = []
        for topic_idx, topic in enumerate(lda_model.components_):
            top_words_idx = topic.argsort()[:-16:-1]  # the 15 highest-weighted bigrams
            top_words = [feature_names[i] for i in top_words_idx]
            topics.append({
                "主题": f"主题 {topic_idx + 1}",
                "关键词": ", ".join(top_words)
            })

        # dominant topic for each document
        doc_topics = []
        for doc_idx, doc_topics_dist in enumerate(lda_output):
            dominant_topic = doc_topics_dist.argmax()
            doc_topics.append({
                "文本": texts[doc_idx],  # full text
                "主导主题": f"主题 {dominant_topic + 1}",
                "主题概率": f"{doc_topics_dist[dominant_topic]:.2%}"
            })

        return pd.DataFrame(topics), pd.DataFrame(doc_topics)
    except ValueError:
        # not enough phrases to analyze; return empty DataFrames
        empty_topics = pd.DataFrame(columns=["主题", "关键词"])
        empty_docs = pd.DataFrame(columns=["文本", "主导主题", "主题概率"])
        return empty_topics, empty_docs

def create_pie_chart(positive_count, negative_count):
    """Create the sentiment distribution pie chart"""
    fig = go.Figure(data=[go.Pie(
        labels=['积极评价', '消极评价'],
        values=[positive_count, negative_count],
        hole=.3,
        marker_colors=['#2ecc71', '#e74c3c']
    )])

    fig.update_layout(
        title="情感分布",
        showlegend=True,
        width=400,
        height=400
    )

    return fig

def create_score_histogram(df):
    """Create the sentiment score histogram"""
    fig = go.Figure()

    fig.add_trace(go.Histogram(
        x=df["积极情感概率"],
        name="积极情感",
        nbinsx=20,
        marker_color='#2ecc71'
    ))

    fig.add_trace(go.Histogram(
        x=df["消极情感概率"],
        name="消极情感",
        nbinsx=20,
        marker_color='#e74c3c'
    ))

    fig.update_layout(
        title="情感得分分布",
        xaxis_title="情感得分",
        yaxis_title="评论数量",
        barmode='overlay',
        width=600,
        height=400
    )

    return fig

def analyze_file(file, progress=gr.Progress()):
    """Analyze every review in an uploaded file"""
    global current_bigram_samples, FULL_BIGRAM_DF
    results = []

    try:
        # make sure a file was uploaded
        if file is None:
            return "请上传文件", None, None, None, None, None, None, None, None, "", None

        # read the uploaded file's contents
        text_content = file.name
        with open(text_content, 'r', encoding='utf-8') as f:
            content = f.readlines()

        progress(0, desc="正在预处理文本...")
        # each non-empty line is one review
        texts = []  # all texts, kept for LDA analysis
        total_lines = len([line for line in content if line.strip()])

        # detect the language of the whole corpus
        all_text = ' '.join([line.strip() for line in content if line.strip()])
        try:
            lang_code = detect(all_text)
            detected_lang = LANG_CODE_MAP.get(lang_code, 'english')
            lang_info = f"检测到语言:{detected_lang},将使用对应的停用词列表"
        except Exception:
            detected_lang = 'english'
            lang_info = "语言检测失败,将使用英语停用词列表"

        progress(0.1, desc="正在进行情感分析...")
        processed = 0
        for line in content:
            if line.strip():
                result = analyze_text(line.strip())
                results.append({
                    "文本": line.strip(),
                    **result
                })
                texts.append(line.strip())
                processed += 1
                progress(processed / total_lines * 0.3)  # sentiment analysis covers 30% of the bar

        # build the results DataFrame
        df = pd.DataFrame(results)

        # summary statistics
        total = len(df)
        if total == 0:
            return "没有找到有效的评论文本", None, None, None, None, None, None, None, None, "", None

        positive = len(df[df["整体情感"] == "积极"])
        negative = len(df[df["整体情感"] == "消极"])

        # summary text shown in the UI
        analysis_info = (
            f"{lang_info}\n"
            f"分析完成!共分析{total}条文本\n"
            f"积极:{positive}条 ({positive/total*100:.1f}%)\n"
            f"消极:{negative}条 ({negative/total*100:.1f}%)"
        )

        progress(0.4, desc="正在生成词云...")
        # word clouds for positive and negative reviews
        positive_text = " ".join(df[df["整体情感"] == "积极"]["文本"])
        negative_text = " ".join(df[df["整体情感"] == "消极"]["文本"])

        pos_wordcloud = None
        neg_wordcloud = None

        if positive_text:
            pos_wordcloud = WordCloud(width=400, height=200, background_color='white', font_path="msyh.ttc").generate(positive_text)
            pos_wordcloud = pos_wordcloud.to_image()

        if negative_text:
            neg_wordcloud = WordCloud(width=400, height=200, background_color='white', font_path="msyh.ttc").generate(negative_text)
            neg_wordcloud = neg_wordcloud.to_image()

        progress(0.5, desc="正在生成可视化图表...")
        # charts
        pie_chart = create_pie_chart(positive, negative)
        score_hist = create_score_histogram(df)

        progress(0.6, desc="正在提取关键词组...")
        # key bigrams
        bigrams_df, bigram_samples = extract_bigrams(texts)
        current_bigram_samples = bigram_samples  # update global state
        FULL_BIGRAM_DF = bigrams_df.copy()  # keep the unfiltered bigram table

        progress(0.7, desc="正在进行主题分析...")
        # LDA topic analysis
        topics_df, doc_topics_df = perform_lda_analysis(texts)

        progress(0.9, desc="正在保存结果...")
        # DataFrame formatted for display
        display_df = df.copy()
        display_df["积极情感概率"] = display_df["积极情感概率"].apply(lambda x: f"{x:.2%}")
        display_df["消极情感概率"] = display_df["消极情感概率"].apply(lambda x: f"{x:.2%}")

        # save everything to a multi-sheet Excel file
        excel_path = "sentiment_analysis_results.xlsx"
        with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
            # sentiment results
            df.to_excel(writer, sheet_name='情感分析结果', index=False)

            # LDA topic keywords
            topics_df.to_excel(writer, sheet_name='主题关键词', index=False)

            # per-document topic distribution
            doc_topics_df.to_excel(writer, sheet_name='文档主题分布', index=False)

            # bigram statistics
            bigrams_df.to_excel(writer, sheet_name='关键词组统计', index=False)

        progress(1.0, desc="分析完成!")
        return (
            analysis_info,
            pos_wordcloud,
            neg_wordcloud,
            display_df,
            pie_chart,
            score_hist,
            topics_df,
            doc_topics_df,
            bigrams_df,
            '<div style="color: #666; padding: 10px;">请点击左侧词组查看相关评论</div>',  # initial HTML hint
            excel_path
        )
    except Exception as e:
        import traceback
        error_msg = f"处理文件时出错:{str(e)}\n{traceback.format_exc()}"
        return error_msg, None, None, None, None, None, None, None, None, "", None

def single_text_interface(text):
    """Handler for the single-text analysis tab"""
    if not text.strip():
        return "请输入要分析的文本"

    result = analyze_text(text)
    return (
        f"积极情感概率:{result['积极情感概率']:.2%}\n"
        f"消极情感概率:{result['消极情感概率']:.2%}\n"
        f"整体情感:{result['整体情感']}"
    )

def highlight_keyword(text, keyword):
    """Wrap case-insensitive matches of keyword in a <mark> highlight"""
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    return pattern.sub(r'<mark style="background-color: #ffd700; padding: 0 2px; border-radius: 2px;">\g<0></mark>', text)

def show_bigram_reviews(evt: gr.SelectData, df):
    """Show the reviews that contain the selected bigram"""
    global current_bigram_samples, last_selected_reviews
    selected_bigram = df.iloc[evt.index[0]]["词组"]  # bigram in the selected row

    # reset the previous selection
    last_selected_reviews = []

    for sample in current_bigram_samples:
        if sample[0] == selected_bigram:
            # render the reviews as HTML
            reviews = sample[1][0].split("\n\n")
            highlighted_reviews = []

            for i, review in enumerate(reviews, start=1):
                # keep the original review (with its index prefix) for translation
                last_selected_reviews.append(review)

                # strip the numeric index prefix
                review_content = review.split(". ", 1)[1] if ". " in review else review
                # highlight the bigram
                highlighted_review = highlight_keyword(review_content, selected_bigram)
                # add an index badge and styling
                highlighted_reviews.append(
                    f'<div style="margin-bottom: 10px; padding: 10px; background-color: #f5f5f5; border-radius: 5px;">'
                    f'<span style="font-weight: bold; color: #666;">#{i}</span> {highlighted_review}'
                    f'</div>'
                )

            # assemble the full HTML
            html_content = (
                '<div style="max-height: 500px; overflow-y: auto; padding: 10px;">'
                f'<div style="margin-bottom: 10px; color: #333;">找到 {len(reviews)} 条包含 "<b>{selected_bigram}</b>" 的评论:</div>'
                f'{"".join(highlighted_reviews)}'
                '</div>'
            )
            return html_content

    return '<div style="color: #666; padding: 10px;">未找到相关评论</div>'

def translate_single_comment(comment_index):
    """Translate a single review into Chinese"""
    global last_selected_reviews
    if not last_selected_reviews:
        return "请先选择一个词组查看相关评论。"

    try:
        comment_index = int(comment_index)
    except (TypeError, ValueError):
        return "请输入有效的评论序号(数字)"

    if comment_index < 1 or comment_index > len(last_selected_reviews):
        return f"评论序号超出范围!可选范围: 1~{len(last_selected_reviews)}"

    # fetch the original review and strip its index prefix
    original_text = last_selected_reviews[comment_index - 1]
    parts = original_text.split(". ", 1)
    if len(parts) == 2:
        original_text = parts[1]
    else:
        original_text = parts[0]

    try:
        # create a dedicated asyncio event loop
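        # (added) Gradio runs this handler in a worker thread with no running
        # event loop, so a private loop hosts the async googletrans call.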
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        async def translate_async():
            async with Translator() as translator:
                result = await translator.translate(original_text, dest='zh-cn')
                return result

        # run the async translation
        result = loop.run_until_complete(translate_async())
        loop.close()

        return f"原文:\n{original_text}\n\n中文翻译:\n{result.text}"
    except Exception as e:
        # surface network errors clearly
        if "HTTPSConnectionPool" in str(e):
            return "网络连接错误,请检查网络连接并重试"
        return f"翻译出错: {str(e)}"

# Build the Gradio UI
with gr.Blocks(title="亚马逊评论文本情感分析系统", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 亚马逊评论文本情感分析系统")

    with gr.Tabs():
        with gr.TabItem("单文本分析"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="输入文本",
                        lines=3,
                        placeholder="请输入要分析的文本...",
                        value=""
                    )
                    analyze_btn = gr.Button("分析", variant="primary")
                with gr.Column():
                    text_output = gr.Textbox(label="分析结果", lines=3)

            analyze_btn.click(
                single_text_interface,
                inputs=[text_input],
                outputs=[text_output]
            )

        with gr.TabItem("批量文件分析"):
            with gr.Row():
                file_input = gr.File(
                    label="上传文本文件(UTF-8编码的txt文件,每行一条评论)",
                    file_types=[".txt"]
                )

            analyze_file_btn = gr.Button("开始分析", variant="primary")

            with gr.Row():
                file_output = gr.Textbox(label="分析统计", lines=4)

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 评论情感分布")
                    pie_chart = gr.Plot()
                with gr.Column():
                    gr.Markdown("### 情感得分分布")
                    score_hist = gr.Plot()

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 积极评论词云")
                    pos_wordcloud = gr.Image()
                with gr.Column():
                    gr.Markdown("### 消极评论词云")
                    neg_wordcloud = gr.Image()

            gr.Markdown("### 关键词组统计")
            with gr.Row():
                with gr.Column(scale=1):
                    # search box for filtering bigrams
                    search_box = gr.Textbox(
                        label="搜索关键词组",
                        placeholder="输入关键词以过滤词组...",
                        show_label=True
                    )
                    bigrams_df = gr.Dataframe(
                        headers=["词组", "出现次数", "占比"],
                        datatype=["str", "number", "str"],
                        wrap=True,
                        interactive=True
                    )
                    # re-filter the table whenever the search text changes
                    search_box.change(
                        fn=filter_bigrams,
                        inputs=[search_box],
                        outputs=[bigrams_df]
                    )
                with gr.Column(scale=1):
                    gr.Markdown("#### 选中词组的相关评论")
                    bigram_reviews = gr.HTML()

            # translation widgets
            with gr.Row():
                comment_index = gr.Number(
                    label="要翻译的评论序号",
                    value=1,
                    precision=0
                )
                translate_btn = gr.Button("翻译")
                translate_output = gr.Textbox(
                    label="翻译结果",
                    lines=6
                )

            # show related reviews when a table row is selected
            bigrams_df.select(
                fn=show_bigram_reviews,
                inputs=[bigrams_df],
                outputs=bigram_reviews
            )

            # translate button handler
            translate_btn.click(
                fn=translate_single_comment,
                inputs=[comment_index],
                outputs=[translate_output]
            )

            gr.Markdown("### 主题分析结果")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("#### 主题关键词(越靠前,主题越重要,提到次数越多)")
                    topics_df = gr.Dataframe(
                        headers=["主题", "关键词"],
                        datatype=["str", "str"],
                        wrap=True
                    )
                with gr.Column():
                    gr.Markdown("#### 文档-主题分布")
                    doc_topics_df = gr.Dataframe(
                        headers=["文本", "主导主题", "主题概率"],
                        datatype=["str", "str", "str"],
                        wrap=True
                    )

            gr.Markdown("### 详细分析结果")
            results_df = gr.Dataframe(
                headers=["文本", "积极情感概率", "消极情感概率", "整体情感"],
                datatype=["str", "str", "str", "str"],
                wrap=True
            )

            file_download = gr.File(label="下载完整分析结果(Excel)")

            analyze_file_btn.click(
                analyze_file,
                inputs=[file_input],
                outputs=[file_output, pos_wordcloud, neg_wordcloud, results_df, pie_chart, score_hist, topics_df, doc_topics_df, bigrams_df, bigram_reviews, file_download]
            )
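            # NOTE (added): the outputs list above must match, in order, the
            # 11-element tuple returned by analyze_file.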

if __name__ == "__main__":
    demo.launch(share=False)
requirements.txt ADDED
@@ -0,0 +1,13 @@
gradio
torch
transformers
pandas
wordcloud
Pillow
numpy
plotly
scikit-learn
nltk
langdetect
openpyxl
googletrans