Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import pipeline
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import streamlit as st
|
4 |
+
import langid
|
5 |
+
import pandas as pd
|
6 |
+
from difflib import SequenceMatcher
|
7 |
+
import random
|
8 |
+
|
9 |
+
def calculate_similarity(a, b):
    """Return the difflib similarity ratio between strings *a* and *b* (0.0-1.0)."""
    return SequenceMatcher(None, a, b).ratio()

def filter_similar_items(items, similarity_threshold):
    """Greedily drop near-duplicate strings.

    Walks *items* in order and keeps an item only if its similarity to every
    already-kept item is <= *similarity_threshold*.

    Args:
        items: iterable of strings to de-duplicate.  (Renamed from ``list``,
            which shadowed the builtin.)
        similarity_threshold: ratio above which two items count as duplicates.

    Returns:
        A new list of the retained items, original order preserved.
    """
    filtered_data = []
    for item in items:
        # O(n^2) pairwise scan — acceptable for the ~100-item samples used here.
        is_similar = any(
            calculate_similarity(item, saved_item) > similarity_threshold
            for saved_item in filtered_data
        )
        if not is_similar:
            filtered_data.append(item)
    return filtered_data
|
24 |
+
|
25 |
+
def process_data(input_data, columnname='text', num_data=100):
    """Sample reviews from one DataFrame column and remove near-duplicates.

    Args:
        input_data: pandas DataFrame holding the uploaded CSV.
        columnname: column to analyse (default ``'text'``).
        num_data: number of rows to sample (default 100).

    Returns:
        List of sampled strings with near-duplicates (ratio > 0.5) removed.
    """
    random.seed(20979738)  # fixed seed so the sample is reproducible across reruns
    processed_data = [i for i in input_data[columnname]]
    # Clamp the sample size: random.sample raises ValueError when asked for
    # more items than the population contains.
    sample_size = min(num_data, len(processed_data))
    random_selection = random.sample(processed_data, sample_size)
    filtered_data = filter_similar_items(random_selection, similarity_threshold=0.5)
    st.write('The Number of Data You Input: ', len(random_selection))
    st.write('After Removing Duplicates: ', len(filtered_data))
    return filtered_data
|
33 |
+
|
34 |
+
def chi2eng(filtered_data):
    """Translate Chinese reviews to English; pass English reviews through.

    Language is detected from the FIRST item only and assumed uniform for the
    whole batch.  Unsupported languages produce an empty result list.

    Args:
        filtered_data: non-empty list of review strings.

    Returns:
        List of English strings (translated or passed through); empty list for
        unsupported languages.
    """
    translated_data = []
    language_classification = langid.classify(filtered_data[0])[0]
    if language_classification == "zh":
        # Load the translation model only when it is actually needed —
        # the original constructed it unconditionally, even for English input.
        trans_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
        st.write("Your input is Chinese, Translating to English")
        for i in filtered_data:
            # Translate once and reuse; the original ran the pipeline twice per item.
            translated = trans_pipe(i)[0]['translation_text']
            st.write(translated)
            translated_data.append(translated)
    elif language_classification == 'en':
        st.write("Your input is English, Moving to Next Stage")
        translated_data = list(filtered_data)
    else:
        # Fixed typo in the user-facing message ("anguage" -> "language").
        st.write('The language you input is: ', language_classification, 'the program cannot process')
    return translated_data
|
49 |
+
|
50 |
+
# Text Classification:Negative/Neutral/Positive
|
51 |
+
def emotion_classification(translated_data):
    """Classify review sentiment, plot a pie chart, and collect top negatives.

    Runs a fine-tuned RoBERTa sentiment model over each review, renders a
    negative/neutral/positive pie chart via Streamlit, then lists the most
    confidently negative reviews (all of them if fewer than 10, else the top 10).

    Args:
        translated_data: list of English review strings.

    Returns:
        A single string of the selected negative reviews joined with '.'
        separators, fed to the summarisation stage.
    """
    emo_pipe = pipeline("text-classification", model="deeplearningwithpython5240/twitter_roberta_base_sentiment_fintune_with_app_reviews")
    negative_count, neutral_count, positive_count = 0, 0, 0
    negative_dict = {}
    for i in translated_data:
        # Classify once and reuse label and score; the original invoked the
        # pipeline a second time just to fetch the score.
        result = emo_pipe(i)[0]
        label = result['label']
        if label == 'negative':
            negative_dict[i] = result['score']
            negative_count += 1
        elif label == 'neutral':
            neutral_count += 1
        elif label == 'positive':
            positive_count += 1
    sizes = [negative_count, neutral_count, positive_count]
    labels = ['negative_review', 'neutral_review', 'positive_review']
    # Build the pie chart (comment translated from Chinese)
    st.write('Number of Positive Reviews: ', positive_count)
    st.write('Number of Neutral Reviews: ', neutral_count)
    st.write('Number of Negative Reviews: ', negative_count)
    plt.figure(figsize=(5, 5))  # chart size
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
    # Pass the current figure explicitly: plt.show() returns None, so the
    # original relied on the deprecated "global pyplot use" fallback.
    st.pyplot(plt.gcf())
    negative_dict_sorted = dict(sorted(negative_dict.items(), key=lambda x: x[1], reverse=True))
    top10_negative_str = ""
    if len(negative_dict_sorted) < 10:
        # Fixed typo in the user-facing message ("Totall" -> "Total").
        st.write("Total Number of Negative Comments: ", len(negative_dict_sorted))
        for k in negative_dict_sorted:
            st.write(k)
            top10_negative_str += f"{k}."
    else:
        st.write("Top 10 Negative Comments")
        for k in list(negative_dict_sorted)[:10]:
            st.write(k)
            top10_negative_str += f"{k}."
    return top10_negative_str
|
91 |
+
|
92 |
+
# Summarization
|
93 |
+
def summarization(top10_negative_str):
|
94 |
+
summarize_pipe = pipeline("text2text-generation", model="deeplearningwithpython5240/summarisation-t5-finetuned-model", max_new_tokens =512)
|
95 |
+
summarized_text = summarize_pipe(top10_negative_str)
|
96 |
+
return summarized_text
|
97 |
+
|
98 |
+
def main():
    """Streamlit entry point: upload CSV -> clean -> translate -> classify -> summarise."""
    st.set_option('deprecation.showPyplotGlobalUse', False)
    st.set_page_config(page_title="Review Sentiment Analysis and Improvement Summarisation Report for Business Product", page_icon="🦜")
    st.header("Review Sentiment Analysis and Improvement Summarisation Report for Business Product")
    uploaded_file = st.file_uploader("🔶 Upload CSV file for analysis 🔶", type={"csv"})
    columnname = st.text_input("🔶 Please enter the column name in CSV file you want to analyze 🔶")
    num_data = st.number_input("🔶 Please enter the number of rows you want to process 🔶", step=1)
    if uploaded_file is not None:
        # Read the CSV only once a file exists.  The original called
        # pd.read_csv(uploaded_file) unconditionally before this check,
        # crashing the app on startup when no file was uploaded yet.
        input_data = pd.read_csv(uploaded_file)
        st.dataframe(input_data)
        st.text('️️ ')
        uploaded_file.seek(0)

        # stage 1: process data
        st.text('🔶 Processing Data 🔶')
        processed_data = process_data(input_data, columnname, int(num_data))
        st.write(processed_data)
        # Repaired mojibake in the closing marker (was "��").
        st.text('️️🟢 Processing Data Finished 🟢')
        st.text('️️ ')

        # stage 2: translate
        st.text('🔶 Checking Translation is Needed or Not 🔶')
        translated_data = chi2eng(processed_data)
        st.write(translated_data)
        st.text('️️🟢 Translation Finished 🟢')
        st.text('️️ ')

        # stage 3: emotion classification
        # Fixed "Calssification" typos in the user-facing stage messages.
        st.text('️️🔶 Processing Emotion Classification 🔶')
        top10_negative_str = emotion_classification(translated_data)
        st.text('️️🟢 Emotion Classification Finished 🟢')
        st.text('️️ ')

        # stage 4: summarization
        st.text('🔶 Processing Summarization 🔶')
        summarized_text = summarization(top10_negative_str)
        st.write(summarized_text)
        st.text('️️🟢 Summarization Finished 🟢')

if __name__ == "__main__":
    main()
|