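# Streamlit app: upload a CSV of product reviews, deduplicate a random sample,
# translate Chinese reviews to English if needed, classify review sentiment,
# and summarise the top negative reviews into an improvement report.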
from transformers import pipeline
import matplotlib.pyplot as plt
import streamlit as st
import langid
import pandas as pd
from difflib import SequenceMatcher
import random

def calculate_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()
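# SequenceMatcher.ratio() returns a float in [0.0, 1.0]: 1.0 for identical
# strings, approaching 0.0 as the strings share fewer matching blocks.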

def filter_similar_items(items, similarity_threshold):
    # Keep an item only if it is not too similar to anything already kept.
    filtered_data = []
    for item in items:
        is_similar = False
        for saved_item in filtered_data:
            similarity = calculate_similarity(item, saved_item)
            if similarity > similarity_threshold:
                is_similar = True
                break
        if not is_similar:
            filtered_data.append(item)
    return filtered_data
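# Illustrative behaviour (ratios approximate): near-duplicates collapse to the
# first occurrence while dissimilar items survive.
#   filter_similar_items(["good app", "good app!", "crashes a lot"], 0.5)
#   # -> ["good app", "crashes a lot"]  ("good app" vs "good app!" ratio ≈ 0.94)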

def process_data(input_data, columnname='text', num_data=100):
    # Fixed seed so the random sample is reproducible across reruns.
    random.seed(20979738)
    processed_data = [i for i in input_data[columnname]]
    # Guard against requesting more rows than the column contains.
    random_selection = random.sample(processed_data, min(num_data, len(processed_data)))
    filtered_data = filter_similar_items(random_selection, similarity_threshold=0.5)
    st.write('Number of rows sampled: ', len(random_selection))
    st.write('After removing near-duplicates: ', len(filtered_data))
    return filtered_data
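# Illustrative call (the CSV name and column are hypothetical):
#   df = pd.read_csv("reviews.csv")
#   sample = process_data(df, columnname="text", num_data=100)
# The fixed seed above makes the same sample come back on every rerun.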

def chi2eng(filtered_data):
    translated_data = []
    # Detect the language from the first item; the whole batch is assumed
    # to be in a single language.
    language_classification = langid.classify(filtered_data[0])[0]
    if language_classification == "zh":
        st.write("Your input is Chinese, translating to English")
        st.write('▶️ Downloading translation model; loading may take a while, please wait...')
        trans_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
        st.write('⏺️ Translation model successfully loaded')
        for i in filtered_data:
            # Translate once and reuse the result rather than calling the pipeline twice.
            translated = trans_pipe(i)[0]['translation_text']
            st.write(translated)
            translated_data.append(translated)
    elif language_classification == 'en':
        st.write("Your input is English, moving to the next stage...")
        translated_data = [i for i in filtered_data]
    else:
        st.write('The language you input is: ', language_classification, '; the program cannot process it')
    return translated_data
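# Sketch of the translation pipeline's output shape (per the transformers
# translation pipeline API; the example sentence and output are illustrative):
#   trans_pipe("你好世界")  # -> [{'translation_text': 'Hello world'}]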

# Text classification: negative / neutral / positive
def emotion_classification(translated_data):
    st.write('▶️ Downloading classification model; loading may take a while, please wait...')
    emo_pipe = pipeline("text-classification", model="deeplearningwithpython5240/twitter_roberta_base_sentiment_fintune_with_app_reviews")
    st.write('⏺️ Classification model successfully loaded')
    negative_count, neutral_count, positive_count = 0, 0, 0
    negative_dict = {}
    for i in translated_data:
        # Classify once and reuse the result rather than re-running the pipeline.
        result = emo_pipe(i)[0]
        st.write('Text: ', i)
        st.write('Label: ', result['label'])
        st.write(' ')
        if result['label'] == 'negative':
            negative_dict[i] = result['score']
            negative_count += 1
        elif result['label'] == 'neutral':
            neutral_count += 1
        elif result['label'] == 'positive':
            positive_count += 1
    sizes = [negative_count, neutral_count, positive_count]
    labels = ['negative_review', 'neutral_review', 'positive_review']
    st.write('Number of Positive Reviews: ', positive_count)
    st.write('Number of Neutral Reviews: ', neutral_count)
    st.write('Number of Negative Reviews: ', negative_count)
    # Create the pie chart
    plt.figure(figsize=(5, 5))  # set the figure size
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
    # Display the chart (pass the current figure; plt.show() returns None)
    st.pyplot(plt.gcf())
    # Sort negative reviews by classifier confidence, highest first.
    negative_dict_sorted = dict(sorted(negative_dict.items(), key=lambda x: x[1], reverse=True))
    top10_negative_str = ""
    if len(negative_dict_sorted) < 10:
        st.write("Total Number of Negative Comments: ", len(negative_dict_sorted))
        for k, v in negative_dict_sorted.items():
            st.write(k)
            top10_negative_str += f"{k}."
    else:
        st.write("Top 10 Negative Comments")
        count = 0
        for k, v in negative_dict_sorted.items():
            if count >= 10:
                break
            st.write(k)
            top10_negative_str += f"{k}."
            count += 1
    return top10_negative_str
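# Note: the 'negative'/'neutral'/'positive' comparisons above assume the
# fine-tuned model maps its output ids to those lowercase label names; a
# checkpoint emitting e.g. 'LABEL_0'..'LABEL_2' would silently count nothing.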

# Summarization
def summarization(top10_negative_str):
    st.write('▶️ Downloading summarization model; loading may take a while, please wait...')
    summarize_pipe = pipeline("text2text-generation", model="deeplearningwithpython5240/summarisation-t5-finetuned-model", max_new_tokens=512)
    st.write('⏺️ Summarization model successfully loaded')
    # The pipeline returns a list of dicts; extract the plain summary string.
    summarized_text = summarize_pipe(top10_negative_str)[0]['generated_text']
    return summarized_text
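# max_new_tokens=512 caps the length of the generated summary; adjusting it
# trades summary detail against generation time.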

def main():
    st.set_option('deprecation.showPyplotGlobalUse', False)
    st.set_page_config(page_title="Review Sentiment Analysis and Improvement Summarisation Report for Business Product", page_icon="🦜")
    st.header("Review Sentiment Analysis and Improvement Summarisation Report for Business Product")
    try:
      uploaded_file = st.file_uploader("🔶 Upload CSV file for analysis 🔶", type=["csv"])
      if uploaded_file is not None:
        columnname = st.text_input("🔶 Please enter the column name in the CSV file you want to analyze 🔶")
        num_data = st.number_input("🔶 Please enter the number of rows you want to process 🔶", min_value=1, step=1)
        input_data = pd.read_csv(uploaded_file)
        st.dataframe(input_data)
        st.text(' ')

        # Stage 1: process data
        st.text('🔶 Processing Data 🔶')
        processed_data = process_data(input_data, columnname, int(num_data))
        st.write(processed_data)
        st.text('🟢 Processing Data Finished 🟢')
        st.text(' ')

        # Stage 2: translate
        st.text('🔶 Checking Whether Translation is Needed 🔶')
        translated_data = chi2eng(processed_data)
        st.write(translated_data)
        st.text('🟢 Translation Finished 🟢')
        st.text(' ')

        # Stage 3: emotion classification
        st.text('🔶 Processing Emotion Classification 🔶')
        top10_negative_str = emotion_classification(translated_data)
        st.text('🟢 Emotion Classification Finished 🟢')
        st.text(' ')

        # Stage 4: summarization
        st.text('🔶 Processing Summarization 🔶')
        summarized_text = summarization(top10_negative_str)
        st.write(summarized_text)
        st.text('🟢 Summarization Finished 🟢')
    except Exception as e:
        # Report the failure instead of silently swallowing every exception.
        st.error(f"Something went wrong: {e}")
        
if __name__ == "__main__":
    main()
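
# To run locally (standard Streamlit invocation; the filename is assumed,
# use whatever this file is saved as):
#   streamlit run app.py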