import gradio as gr
from gradio import SelectData
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import pandas as pd
from wordcloud import WordCloud
import io
import base64
from PIL import Image
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import nltk
from nltk.corpus import stopwords
from langdetect import detect
import langdetect
import re
from collections import Counter
from nltk.util import ngrams
from googletrans import Translator
import asyncio

# Download NLTK stopword and tokenizer data
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

# Supported languages
SUPPORTED_LANGUAGES = ['english', 'spanish', 'french', 'german', 'italian',
                       'portuguese', 'russian', 'arabic', 'japanese']

# Build a stopword set for each supported language
LANGUAGE_STOPWORDS = {}
for lang in SUPPORTED_LANGUAGES:
    if lang in stopwords.fileids():
        LANGUAGE_STOPWORDS[lang] = set(stopwords.words(lang))

# Map ISO 639-1 codes to NLTK language names
LANG_CODE_MAP = {
    'en': 'english',
    'es': 'spanish',
    'fr': 'french',
    'de': 'german',
    'it': 'italian',
    'pt': 'portuguese',
    'ru': 'russian',
    'ar': 'arabic',
    'ja': 'japanese'
}


def get_stopwords(text):
    """Return the stopword set for the detected language of the text."""
    try:
        lang_code = detect(text)
        lang = LANG_CODE_MAP.get(lang_code, 'english')
        return LANGUAGE_STOPWORDS.get(lang, LANGUAGE_STOPWORDS['english'])
    except langdetect.LangDetectException:
        return LANGUAGE_STOPWORDS['english']


# Initialize the model and tokenizer
MODEL = "sohan-ai/sentiment-analysis-model-amazon-reviews"
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(MODEL)

# Global state
current_bigram_samples = []
FULL_BIGRAM_DF = pd.DataFrame()  # full bigram statistics table
last_selected_reviews = []       # reviews from the most recent selection
translator = Translator()        # translator instance


def filter_bigrams(search_text):
    """Filter the bigram table by a search string."""
    global FULL_BIGRAM_DF
    if not search_text.strip():
        return FULL_BIGRAM_DF
    # Case-insensitive substring match
    mask = FULL_BIGRAM_DF["词组"].str.contains(search_text, case=False, na=False)
    return FULL_BIGRAM_DF[mask]


def analyze_text(text):
    """Run sentiment analysis on a single text."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs)
    scores = torch.nn.functional.softmax(outputs.logits, dim=1)
    scores = scores.detach().numpy()[0]
    return {
        "积极情感概率": float(scores[1]),
        "消极情感概率": float(scores[0]),
        "整体情感": "积极" if scores[1] > scores[0] else "消极"
    }
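
# Usage sketch (values illustrative, not from a real run): the logits are
# indexed as [negative, positive] above, so a clearly positive review should
# come back as something like
#
#   analyze_text("Great product, arrived quickly and works as advertised.")
#   # -> {"积极情感概率": 0.97, "消极情感概率": 0.03, "整体情感": "积极"}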
def preprocess_text(text):
    """Preprocess a text for n-gram extraction."""
    # Lowercase
    text = text.lower()
    # Strip special characters, keeping only letters and spaces
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Collapse repeated whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def extract_bigrams(texts, min_freq=2, max_freq_ratio=0.9):
    """Extract two-word key phrases (bigrams)."""
    # Preprocess all texts
    processed_texts = [preprocess_text(text) for text in texts]

    # Collect every bigram and the original texts it appears in
    all_bigrams = []
    bigram_texts = {}  # maps each bigram to its source texts
    for text, processed in zip(texts, processed_texts):
        words = processed.split()
        text_bigrams = list(ngrams(words, 2))
        text_bigram_strs = [' '.join(bigram) for bigram in text_bigrams]
        all_bigrams.extend(text_bigram_strs)
        # Record the original text for each bigram
        for bigram in text_bigram_strs:
            if bigram not in bigram_texts:
                bigram_texts[bigram] = []
            bigram_texts[bigram].append(text)

    # Count bigram frequencies
    bigram_freq = Counter(all_bigrams)
    total_docs = len(texts)  # total number of reviews

    # Keep only bigrams whose frequency falls within the allowed range
    filtered_bigrams = {
        bigram: freq for bigram, freq in bigram_freq.items()
        if min_freq <= freq <= total_docs * max_freq_ratio
    }

    # Build the statistics DataFrame and the Dataset samples
    bigram_stats = []
    dataset_samples = []
    for bigram, freq in sorted(filtered_bigrams.items(), key=lambda x: x[1], reverse=True):
        # Share of all reviews containing this bigram
        percentage = freq / total_docs * 100
        # All texts containing this bigram
        related_texts = bigram_texts[bigram]
        # Row for the statistics DataFrame
        bigram_stats.append({
            "词组": bigram,
            "出现次数": freq,
            "占比": f"{percentage:.2f}%"  # percentage of all reviews
        })
        # Row for the Dataset component
        formatted_texts = "\n\n".join(f"{i+1}. {text}" for i, text in enumerate(related_texts))
        dataset_samples.append([bigram, [formatted_texts]])

    return pd.DataFrame(bigram_stats), dataset_samples
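
# Sketch of the frequency filter above (toy corpus, illustrative output):
# a bigram survives only if min_freq <= freq <= total_docs * max_freq_ratio.
# With the defaults (min_freq=2, max_freq_ratio=0.9) and three reviews
#
#   texts = ["battery life is great",
#            "battery life could be better",
#            "fast shipping and good price"]
#
# only "battery life" is kept (freq 2 lies within [2, 2.7]), yielding the row
# 词组="battery life", 出现次数=2, 占比="66.67%"; every other bigram occurs
# once and is filtered out.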
def perform_lda_analysis(texts, n_topics=15):
    """Run LDA topic analysis."""
    # Stopwords for the detected language
    stop_words = list(get_stopwords(' '.join(texts)))

    # Create the TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        max_df=0.9,             # ignore terms appearing in more than 90% of documents
        min_df=2,               # ignore terms appearing in fewer than 2 documents
        stop_words=stop_words,  # language-specific stopwords
        ngram_range=(2, 2)      # use bigrams
    )

    # Preprocess and vectorize the texts
    processed_texts = [preprocess_text(text) for text in texts]
    try:
        tfidf = vectorizer.fit_transform(processed_texts)

        # Create and fit the LDA model
        lda_model = LDA(
            n_components=n_topics,
            random_state=0
        )
        lda_output = lda_model.fit_transform(tfidf)

        # Get the feature terms
        feature_names = vectorizer.get_feature_names_out()

        # Collect the top terms per topic
        topics = []
        for topic_idx, topic in enumerate(lda_model.components_):
            top_words_idx = topic.argsort()[:-16:-1]  # take the 15 highest-weighted bigrams
            top_words = [feature_names[i] for i in top_words_idx]
            topics.append({
                "主题": f"主题 {topic_idx + 1}",
                "关键词": ", ".join(top_words)
            })

        # Get the dominant topic for each document
        doc_topics = []
        for doc_idx, doc_topics_dist in enumerate(lda_output):
            dominant_topic = doc_topics_dist.argmax()
            doc_topics.append({
                "文本": texts[doc_idx],  # show the full text
                "主导主题": f"主题 {dominant_topic + 1}",
                "主题概率": f"{doc_topics_dist[dominant_topic]:.2%}"
            })

        return pd.DataFrame(topics), pd.DataFrame(doc_topics)
    except ValueError:
        # Not enough bigrams to analyze; return empty DataFrames
        empty_topics = pd.DataFrame(columns=["主题", "关键词"])
        empty_docs = pd.DataFrame(columns=["文本", "主导主题", "主题概率"])
        return empty_topics, empty_docs


def create_pie_chart(positive_count, negative_count):
    """Create the sentiment distribution pie chart."""
    fig = go.Figure(data=[go.Pie(
        labels=['积极评价', '消极评价'],
        values=[positive_count, negative_count],
        hole=.3,
        marker_colors=['#2ecc71', '#e74c3c']
    )])
    fig.update_layout(
        title="情感分布",
        showlegend=True,
        width=400,
        height=400
    )
    return fig


def create_score_histogram(df):
    """Create the sentiment score histogram."""
    fig = go.Figure()
    fig.add_trace(go.Histogram(
        x=df["积极情感概率"],
        name="积极情感",
        nbinsx=20,
        marker_color='#2ecc71'
    ))
    fig.add_trace(go.Histogram(
        x=df["消极情感概率"],
        name="消极情感",
        nbinsx=20,
        marker_color='#e74c3c'
    ))
    fig.update_layout(
        title="情感得分分布",
        xaxis_title="情感得分",
        yaxis_title="评论数量",
        barmode='overlay',
        width=600,
        height=400
    )
    return fig


def analyze_file(file, progress=gr.Progress()):
    """Analyze every text in an uploaded file."""
    global current_bigram_samples, FULL_BIGRAM_DF
    results = []
    try:
        # Validate the upload
        if file is None:
            return "请上传文件", None, None, None, None, None, None, None, None, "", None

        # Read the uploaded file
        file_path = file.name
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.readlines()

        progress(0, desc="正在预处理文本...")

        # Process each review line
        texts = []  # all texts, for LDA analysis
        total_lines = len([line for line in content if line.strip()])

        # Detect the language of the whole file
        all_text = ' '.join([line.strip() for line in content if line.strip()])
        try:
            lang_code = detect(all_text)
            detected_lang = LANG_CODE_MAP.get(lang_code, 'english')
            lang_info = f"检测到语言:{detected_lang},将使用对应的停用词列表"
        except langdetect.LangDetectException:
            detected_lang = 'english'
            lang_info = "语言检测失败,将使用英语停用词列表"

        progress(0.1, desc="正在进行情感分析...")
        processed_count = 0  # count only non-empty lines so the fraction stays within [0, 0.3]
        for line in content:
            if line.strip():
                result = analyze_text(line.strip())
                results.append({
                    "文本": line.strip(),
                    **result
                })
                texts.append(line.strip())
                processed_count += 1
                progress(processed_count / total_lines * 0.3)  # sentiment step is 30% of the bar

        # Create the results DataFrame
        df = pd.DataFrame(results)

        # Summary statistics
        total = len(df)
        if total == 0:
            return "没有找到有效的评论文本", None, None, None, None, None, None, None, None, "", None

        positive = len(df[df["整体情感"] == "积极"])
        negative = len(df[df["整体情感"] == "消极"])

        # Build the summary message
        analysis_info = (
            f"{lang_info}\n"
            f"分析完成!共分析{total}条文本\n"
            f"积极:{positive}条 ({positive/total*100:.1f}%)\n"
            f"消极:{negative}条 ({negative/total*100:.1f}%)"
        )

        progress(0.4, desc="正在生成词云...")
        # Generate word clouds for each sentiment class
        positive_text = " ".join(df[df["整体情感"] == "积极"]["文本"])
        negative_text = " ".join(df[df["整体情感"] == "消极"]["文本"])

        pos_wordcloud = None
        neg_wordcloud = None
        if positive_text:
            pos_wordcloud = WordCloud(width=400, height=200, background_color='white').generate(positive_text)
            pos_wordcloud = pos_wordcloud.to_image()
        if negative_text:
            neg_wordcloud = WordCloud(width=400, height=200, background_color='white').generate(negative_text)
            neg_wordcloud = neg_wordcloud.to_image()

        progress(0.5, desc="正在生成可视化图表...")
        # Build the charts
        pie_chart = create_pie_chart(positive, negative)
        score_hist = create_score_histogram(df)

        progress(0.6, desc="正在提取关键词组...")
        # Extract key bigrams
        bigrams_df, bigram_samples = extract_bigrams(texts)
        current_bigram_samples = bigram_samples  # update global state
        FULL_BIGRAM_DF = bigrams_df.copy()       # keep the full bigram table

        progress(0.7, desc="正在进行主题分析...")
        # Run LDA topic analysis
        topics_df, doc_topics_df = perform_lda_analysis(texts)

        progress(0.9, desc="正在保存结果...")
        # Prepare the DataFrame for display
        display_df = df.copy()
        display_df["积极情感概率"] = display_df["积极情感概率"].apply(lambda x: f"{x:.2%}")
        display_df["消极情感概率"] = display_df["消极情感概率"].apply(lambda x: f"{x:.2%}")

        # Save the results to a multi-sheet Excel file
        excel_path = "sentiment_analysis_results.xlsx"
        with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
            # Sentiment analysis results
            df.to_excel(writer, sheet_name='情感分析结果', index=False)
            # LDA topic keywords
            topics_df.to_excel(writer, sheet_name='主题关键词', index=False)
            # Per-document topic distribution
            doc_topics_df.to_excel(writer, sheet_name='文档主题分布', index=False)
            # Bigram statistics
            bigrams_df.to_excel(writer, sheet_name='关键词组统计', index=False)

        progress(1.0, desc="分析完成!")
        return (analysis_info, pos_wordcloud, neg_wordcloud, display_df,
                pie_chart, score_hist, topics_df, doc_topics_df, bigrams_df, '