from transformers import pipeline
import matplotlib.pyplot as plt
import streamlit as st
import langid
import pandas as pd
from difflib import SequenceMatcher
import random


def calculate_similarity(a, b):
    """Return the similarity ratio (0.0 .. 1.0) between strings *a* and *b*."""
    return SequenceMatcher(None, a, b).ratio()


def filter_similar_items(items, similarity_threshold):
    """Return *items* with near-duplicates removed.

    An item is dropped when its similarity to any already-kept item exceeds
    ``similarity_threshold``.  O(n^2) pairwise comparison — fine for the small
    samples used here.  (Parameter renamed from ``list``, which shadowed the
    builtin.)
    """
    filtered_data = []
    for item in items:
        # keep the item only if it is not a near-duplicate of one already kept
        if not any(calculate_similarity(item, kept) > similarity_threshold
                   for kept in filtered_data):
            filtered_data.append(item)
    return filtered_data


def process_data(input_data, columnname='text', num_data=100):
    """Sample up to ``num_data`` rows from ``input_data[columnname]`` and de-duplicate.

    Uses a fixed seed so the sample is reproducible across reruns.
    Returns the de-duplicated list of texts.
    """
    random.seed(20979738)  # fixed seed for reproducible sampling
    processed_data = list(input_data[columnname])
    # random.sample raises ValueError if the sample size exceeds the population
    num_data = min(num_data, len(processed_data))
    random_selection = random.sample(processed_data, num_data)
    filtered_data = filter_similar_items(random_selection, similarity_threshold=0.5)
    st.write('Number of data input: ', len(random_selection))
    st.write('After removing duplicates: ', len(filtered_data))
    return filtered_data


def chi2eng(filtered_data):
    """Translate Chinese reviews to English; pass English reviews through.

    Language is detected from the first item only (the sample is assumed to be
    monolingual).  Returns the (possibly translated) list of texts; empty when
    the input is empty or the language is unsupported.
    """
    translated_data = []
    if not filtered_data:
        # nothing to translate — avoid IndexError on filtered_data[0]
        return translated_data
    language_classification = langid.classify(filtered_data[0])[0]
    if language_classification == "zh":
        st.write("Your input is Chinese, translating to English")
        st.write('▶️ Translation model start downloading, loading model may take time, please wait...')
        trans_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
        st.write('⏺️ Translation model successfully loaded')
        for text in filtered_data:
            # call the (expensive) model once per item, not twice
            translated = trans_pipe(text)[0]['translation_text']
            st.write(translated)
            translated_data.append(translated)
    elif language_classification == 'en':
        st.write("Your input is English, moving to next stage...")
        translated_data = list(filtered_data)
    else:
        st.write('The language you input is: ', language_classification,
                 'the program cannot process')
    return translated_data


# Text Classification: Negative/Neutral/Positive
def emotion_classification(translated_data):
    """Classify each review's sentiment, plot the distribution as a pie chart,
    and return the top (up to 10, highest-score first) negative reviews
    concatenated into one period-separated string for summarization.
    """
    st.write('▶️ Classification model start downloading, loading model may take time, please wait...')
    emo_pipe = pipeline(
        "text-classification",
        model="deeplearningwithpython5240/twitter_roberta_base_sentiment_fintune_with_app_reviews")
    st.write('⏺️ Classification model successfully loaded')
    negative_count, neutral_count, positive_count = 0, 0, 0
    negative_dict = {}
    for text in translated_data:
        # single model call per review (the score is reused for negatives)
        result = emo_pipe(text)[0]
        label = result['label']
        st.write('Text: ', text)
        st.write('Label: ', label)
        st.write(' ')
        if label == 'negative':
            negative_dict[text] = result['score']
            negative_count += 1
        elif label == 'neutral':
            neutral_count += 1
        elif label == 'positive':
            positive_count += 1

    sizes = [negative_count, neutral_count, positive_count]
    labels = ['negative_review', 'neutral_review', 'positive_review']
    # Pie chart of the sentiment distribution
    st.write('Number of Positive Reviews: ', positive_count)
    st.write('Number of Neutral Reviews: ', neutral_count)
    st.write('Number of Negative Reviews: ', negative_count)
    fig = plt.figure(figsize=(5, 5))  # chart size
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
    # plt.show() returns None, so st.pyplot(plt.show()) relied on a deprecated
    # global-figure fallback; pass the figure object explicitly instead
    st.pyplot(fig)

    # sort negatives by confidence score, highest first
    negative_sorted = sorted(negative_dict.items(), key=lambda x: x[1], reverse=True)
    top10_negative_str = ""
    if len(negative_sorted) < 10:
        st.write("Total Number of Negative Comments: ", len(negative_sorted))
        for comment, _score in negative_sorted:
            st.write(comment)
            top10_negative_str += f"{comment}."
    else:
        st.write("Top 10 Negative Comments")
        for comment, _score in negative_sorted[:10]:
            st.write(comment)
            top10_negative_str += f"{comment}."
    return top10_negative_str


# Summarization
def summarization(top10_negative_str):
    """Summarize the concatenated negative reviews with the fine-tuned T5 model.

    Returns the raw pipeline output (a list of generation dicts).
    """
    st.write('▶️ Summarization model start downloading, loading model may take time, please wait...')
    summarize_pipe = pipeline(
        "text2text-generation",
        model="deeplearningwithpython5240/summarisation-t5-finetuned-model",
        max_new_tokens=512)
    st.write('⏺️ Summarization model successfully loaded')
    return summarize_pipe(top10_negative_str)


def main():
    """Streamlit entry point: upload CSV -> sample/dedupe -> translate ->
    sentiment classification -> summarize the worst negative reviews."""
    st.set_option('deprecation.showPyplotGlobalUse', False)
    st.set_page_config(
        page_title="Review Sentiment Analysis and Improvement Summarisation Report for Business Product",
        page_icon="🦜")
    st.header("Review Sentiment Analysis and Improvement Summarisation Report for Business Product")
    try:
        uploaded_file = st.file_uploader("🔶 Upload CSV file for analysis 🔶", type={"csv"})
        if uploaded_file is not None:
            columnname = st.text_input("🔶 Please enter the column name in CSV file you want to analyze 🔶")
            num_data = st.number_input("🔶 Please enter the number of rows you want to process 🔶", step=1)
            input_data = pd.read_csv(uploaded_file)
            st.dataframe(input_data)
            st.text('️️ ')
            # stage 1: process data
            st.text('🔶 Processing Data 🔶')
            processed_data = process_data(input_data, columnname, int(num_data))
            st.write(processed_data)
            st.text('️️🟢 Processing Data Finished 🟢')
            st.text('️️ ')
            # stage 2: translate
            st.text('🔶 Checking Translation is Needed or Not 🔶')
            translated_data = chi2eng(processed_data)
            st.write(translated_data)
            st.text('️️🟢 Translation Finished 🟢')
            st.text('️️ ')
            # stage 3: emotion classification
            st.text('️️🔶 Processing Emotion Classification 🔶')
            top10_negative_str = emotion_classification(translated_data)
            st.text('️️🟢 Emotion Classification Finished 🟢')
            st.text('️️ ')
            # stage 4: summarization
            st.text('🔶 Processing Summarization 🔶')
            summarized_text = summarization(top10_negative_str)
            st.write(summarized_text)
            st.text('️️🟢 Summarization Finished 🟢')
    except Exception as e:
        # surface the failure instead of silently swallowing every error
        # (was: bare `except:` followed by st.write(""))
        st.write(f"Error: {e}")


if __name__ == "__main__":
    main()