# Streamlit app: review sentiment analysis and improvement summarisation report.
from transformers import pipeline
import matplotlib.pyplot as plt
import streamlit as st
import langid
import pandas as pd
from difflib import SequenceMatcher
import random


def calculate_similarity(a, b):
    """Return a 0-1 similarity ratio between two strings."""
    return SequenceMatcher(None, a, b).ratio()


def filter_similar_items(items, similarity_threshold):
    """Keep only items that are not near-duplicates of an already kept item."""
    filtered_data = []
    for item in items:
        is_similar = False
        for saved_item in filtered_data:
            similarity = calculate_similarity(item, saved_item)
            if similarity > similarity_threshold:
                is_similar = True
                break
        if not is_similar:
            filtered_data.append(item)
    return filtered_data


def process_data(input_data, columnname='text', num_data=100):
    # Fixed seed so the same rows are sampled on every rerun.
    random.seed(20979738)
    processed_data = [i for i in input_data[columnname]]
    # Guard against requesting more rows than the file contains.
    random_selection = random.sample(processed_data, min(num_data, len(processed_data)))
    filtered_data = filter_similar_items(random_selection, similarity_threshold=0.75)
    st.write('Number of reviews selected: ', len(random_selection))
    st.write('After removing duplicates: ', len(filtered_data))
    return filtered_data


def chi2eng(filtered_data):
    """Translate Chinese reviews to English; pass English reviews through unchanged."""
    translated_data = []
    language_classification = langid.classify(filtered_data[0])[0]
    if language_classification == "zh":
        st.write("Your input is Chinese, translating to English")
        st.write('▶️ Loading Translation Model...')
        trans_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
        st.write('⏺️ Translation Model Successfully Loaded')
        st.write('▶️ Start Translating...')
        translation_progress_count = 0
        translation_bar = st.progress(0)
        for i in filtered_data:
            translated_data.append(trans_pipe(i)[0]['translation_text'])
            translation_progress_count += 1 / len(filtered_data)
            # Clamp to 1.0 so floating-point accumulation never exceeds the progress bar range.
            translation_bar.progress(min(translation_progress_count, 1.0))
    elif language_classification == 'en':
        st.write("Your input is English, moving to next stage...")
        translated_data = [i for i in filtered_data]
    else:
        st.write('The language you entered is: ', language_classification, ', which the program cannot process')
    return translated_data


# Text classification: negative / neutral / positive
def emotion_classification(translated_data):
    st.write('▶️ Loading Classification Model...')
    emo_pipe = pipeline("text-classification",
                        model="deeplearningwithpython5240/twitter_roberta_base_sentiment_fintune_with_app_reviews")
    st.write('⏺️ Classification Model Successfully Loaded')
    negative_count, neutral_count, positive_count = 0, 0, 0
    negative_dict = {}
    emotion_progress_count = 0
    st.write('▶️ Data Processing ...')
    emotion_bar = st.progress(0)
    for i in translated_data:
        # Run the classifier once per review and reuse the result for both label and score.
        result = emo_pipe(i)[0]
        labelled_result = result['label']
        # st.write('Text: ', i)
        # st.write('Label: ', labelled_result)
        # st.write(' ')
        if labelled_result == 'negative':
            negative_dict[i] = result['score']
            negative_count += 1
        if labelled_result == 'neutral':
            neutral_count += 1
        if labelled_result == 'positive':
            positive_count += 1
        emotion_progress_count += 1 / len(translated_data)
        emotion_bar.progress(min(emotion_progress_count, 1.0))

    sizes = [negative_count, neutral_count, positive_count]
    labels = ['Negative_Reviews', 'Neutral_Reviews', 'Positive_Reviews']
    # Create the pie chart
    st.write('Number of Positive Reviews: ', positive_count)
    st.write('Number of Neutral Reviews: ', neutral_count)
    st.write('Number of Negative Reviews: ', negative_count)
    plt.figure(figsize=(5, 5))  # Set the figure size
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
    # Display the chart
    st.pyplot(plt.gcf())

    # Sort negative reviews by classifier confidence, highest first.
    negative_dict_sorted = dict(sorted(negative_dict.items(), key=lambda x: x[1], reverse=True))
    top10_negative_str = ""
    if len(negative_dict_sorted) < 10:
        st.write("Total Number of Negative Comments: ", len(negative_dict_sorted))
        for k, v in negative_dict_sorted.items():
            st.write(k)
            top10_negative_str += f"{k}."
    else:
        st.write("Top 10 Negative Comments")
        count = 0
        for k, v in negative_dict_sorted.items():
            if count >= 10:
                break
            st.write(k)
            top10_negative_str += f"{k}."
            count += 1
    return top10_negative_str


# Summarization
def summarization(top10_negative_str):
    st.write('▶️ Loading Summarization Model...')
    summarize_pipe = pipeline("text2text-generation",
                              model="deeplearningwithpython5240/summarisation-t5-finetuned-model",
                              max_new_tokens=512)
    st.write('⏺️ Summarization Model Successfully Loaded')
    st.write('▶️ Summarizing...')
    summarized_text = summarize_pipe(top10_negative_str)
    return summarized_text


def main():
    st.set_option('deprecation.showPyplotGlobalUse', False)
    st.set_page_config(page_title="Review Sentiment Analysis and Improvement Summarisation Report for Business Product",
                       page_icon="🦜")
    st.header("Review Sentiment Analysis and Improvement Summarisation Report for Business Product")
    st.write("Welcome to the user guide for our product feedback analysis application. "
             "Our application is designed to help companies review their product feedback and summarise areas for improvement. "
             "Here are the steps to get started:\n"
             "1. Upload reviews in CSV file format\n"
             "2. Input the column name and the number of rows to be analysed\n"
             "3. Duplicate reviews are removed automatically\n"
             "4. Data is translated into English (if the dataset is Chinese)\n"
             "5. Review sentiment is analysed\n"
             "6. Top negative reviews are summarised")
    try:
        uploaded_file = st.file_uploader("🔶 Upload CSV file for analysis 🔶", type=["csv"])
        if uploaded_file is not None:
            columnname = st.text_input("🔶 Please enter the column name in the CSV file you want to analyze 🔶")
            num_data = st.number_input("🔶 Please enter the number of rows you want to process 🔶", min_value=2, step=1)
            input_data = pd.read_csv(uploaded_file)
            st.dataframe(input_data)
            st.text(' ')

            # Stage 1: process data
            st.text('🔶 Processing Data 🔶')
            processed_data = process_data(input_data, columnname, int(num_data))
            st.write(processed_data)
            st.text('🟢 Processing Data Finished 🟢')
            st.text(' ')

            # Stage 2: translate
            st.text('🔶 Checking Whether Translation Is Needed 🔶')
            translated_data = chi2eng(processed_data)
            st.write(translated_data)
            st.text('🟢 Translation Finished 🟢')
            st.text(' ')

            # Stage 3: emotion classification
            st.text('🔶 Processing Emotion Classification 🔶')
            top10_negative_str = emotion_classification(translated_data)
            st.text('🟢 Emotion Classification Finished 🟢')
            st.text(' ')

            # Stage 4: summarization
            st.text('🔶 Processing Summarization 🔶')
            if len(top10_negative_str) == 0:
                st.write("No Negative Reviews Detected")
            else:
                summarized_text = summarization(top10_negative_str)
                st.write('Summarized Negative Comments:')
                st.write(summarized_text[0]["generated_text"])
            st.text('🟢 Summarization Finished 🟢')
    except Exception:
        # Inputs may be incomplete while the user is still filling in the form
        # (e.g. an empty column name), so errors are suppressed rather than shown.
        st.write("")


if __name__ == "__main__":
    main()
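# Usage note (a minimal sketch, assuming this script is saved as app.py):
#   streamlit run app.py
# The imports above imply these packages: streamlit, transformers, matplotlib,
# langid, pandas. The Hugging Face pipelines additionally need a deep learning
# backend such as torch, and the Helsinki-NLP translation model typically needs
# sentencepiece (assumptions about the environment, not pinned by this file).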