|
from transformers import pipeline |
|
import matplotlib.pyplot as plt |
|
import streamlit as st |
|
import langid |
|
import pandas as pd |
|
from difflib import SequenceMatcher |
|
import random |
|
|
|
def calculate_similarity(a, b):
    """Return the similarity ratio between sequences *a* and *b* (0.0-1.0)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
|
|
|
def filter_similar_items(items, similarity_threshold):
    """Greedily drop items that are too similar to an already-kept item.

    Args:
        items: iterable of strings to deduplicate (original order preserved).
        similarity_threshold: an item is kept only if its SequenceMatcher
            ratio against every previously kept item is <= this value.

    Returns:
        list: the retained items, in their original order.
    """
    # NOTE(review): the parameter was named `list`, shadowing the builtin;
    # renamed (the only caller in this file passes it positionally).
    filtered_data = []
    for item in items:
        # O(n^2) pairwise comparison -- acceptable for the small samples
        # this app processes. all() short-circuits like the original break.
        if all(calculate_similarity(item, kept) <= similarity_threshold
               for kept in filtered_data):
            filtered_data.append(item)
    return filtered_data
|
|
|
def process_data(input_data, columnname='text', num_data=100):
    """Randomly sample reviews from one CSV column and drop near-duplicates.

    Args:
        input_data: pandas DataFrame holding the uploaded CSV.
        columnname: column containing the review text.
        num_data: number of rows to sample (clamped to the available rows).

    Returns:
        list: the sampled, deduplicated review strings.
    """
    # Fixed seed so the same upload always yields the same sample.
    random.seed(20979738)
    processed_data = list(input_data[columnname])
    # Clamp the request: random.sample raises ValueError when asked for
    # more items than the population holds (the original crashed on small
    # files with the default num_data=100).
    sample_size = min(num_data, len(processed_data))
    random_selection = random.sample(processed_data, sample_size)
    filtered_data = filter_similar_items(random_selection, similarity_threshold=0.5)
    st.write('The Number of Data You Input: ', len(random_selection))
    st.write('After Removing Duplicates: ', len(filtered_data))
    return filtered_data
|
|
|
def chi2eng(filtered_data):
    """Translate Chinese reviews to English, or pass English input through.

    Language is detected from the first item only -- assumes the batch is
    monolingual (TODO confirm with upstream data).

    Args:
        filtered_data: list of review strings.

    Returns:
        list: English review strings; empty when the input is empty or the
        detected language is neither Chinese nor English.
    """
    translated_data = []
    # Guard: the original indexed filtered_data[0] and crashed on [].
    if not filtered_data:
        return translated_data
    language_classification = langid.classify(filtered_data[0])[0]
    if language_classification == "zh":
        st.write("Your input is Chinese, Translating to English")
        # Load the translation model only when it is actually needed.
        trans_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
        for text in filtered_data:
            # Run the (expensive) model once per item; the original invoked
            # the pipeline twice per review -- once to display, once to store.
            translation = trans_pipe(text)[0]['translation_text']
            st.write(translation)
            translated_data.append(translation)
    elif language_classification == 'en':
        st.write("Your input is English, Moving to Next Stage")
        translated_data = list(filtered_data)
    else:
        # Typo fixed in the user-facing message ("anguage" -> "language");
        # reuse the already-computed classification instead of re-detecting.
        st.write('The language you input is: ', language_classification, 'the program cannot process')
    return translated_data
|
|
|
|
|
def emotion_classification(translated_data):
    """Classify review sentiment, render counts and a pie chart, and return
    the most-negative comments concatenated for summarisation.

    Args:
        translated_data: list of English review strings.

    Returns:
        str: up to 10 highest-score negative comments joined with '.'
        (empty string when there are no negative reviews).
    """
    emo_pipe = pipeline("text-classification", model="deeplearningwithpython5240/twitter_roberta_base_sentiment_fintune_with_app_reviews")
    negative_count, neutral_count, positive_count = 0, 0, 0
    negative_dict = {}
    for text in translated_data:
        # Run the model ONCE per review and reuse the result -- the original
        # invoked the pipeline a second time just to read the score.
        result = emo_pipe(text)[0]
        label = result['label']
        if label == 'negative':
            negative_dict[text] = result['score']
            negative_count += 1
        elif label == 'neutral':
            neutral_count += 1
        elif label == 'positive':
            positive_count += 1

    sizes = [negative_count, neutral_count, positive_count]
    labels = ['negative_review', 'neutral_review', 'positive_review']

    st.write('Number of Positive Reviews: ', positive_count)
    st.write('Number of Neutral Reviews: ', neutral_count)
    st.write('Number of Negative Reviews: ', negative_count)
    fig = plt.figure(figsize=(5, 5))
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
    # Pass the figure explicitly: plt.show() returns None, so the original
    # relied on Streamlit's deprecated global-figure fallback.
    st.pyplot(fig)

    # Sort negative comments by confidence score, highest first.
    negative_sorted = sorted(negative_dict.items(), key=lambda kv: kv[1], reverse=True)
    if len(negative_sorted) < 10:
        # Typo fixed in the user-facing message ("Totall" -> "Total").
        st.write("Total Number of Negative Comments: ", len(negative_sorted))
        selected = negative_sorted
    else:
        st.write("Top 10 Negative Comments")
        selected = negative_sorted[:10]

    top10_negative_str = ""
    for comment, _score in selected:
        st.write(comment)
        top10_negative_str += f"{comment}."
    return top10_negative_str
|
|
|
|
|
def summarization(top10_negative_str):
    """Run the fine-tuned T5 model over the concatenated negative comments.

    Args:
        top10_negative_str: negative comments joined into a single string.

    Returns:
        The raw pipeline output (a list of generation dicts).
    """
    summarizer = pipeline(
        "text2text-generation",
        model="deeplearningwithpython5240/summarisation-t5-finetuned-model",
        max_new_tokens=512,
    )
    return summarizer(top10_negative_str)
|
|
|
def main():
    """Streamlit entry point: upload CSV -> sample/dedupe -> translate ->
    sentiment classification -> summarise negative feedback."""
    # set_page_config must be the FIRST Streamlit command on the page;
    # the original called st.set_option before it, which Streamlit rejects.
    st.set_page_config(page_title="Review Sentiment Analysis and Improvement Summarisation Report for Business Product", page_icon="🦜")
    st.set_option('deprecation.showPyplotGlobalUse', False)
    st.header("Review Sentiment Analysis and Improvement Summarisation Report for Business Product")
    uploaded_file = st.file_uploader("🔶 Upload CSV file for analysis 🔶", type={"csv"})
    columnname = st.text_input("🔶 Please enter the column name in CSV file you want to analyze 🔶")
    num_data = st.number_input("🔶 Please enter the number of rows you want to process 🔶", step=1)

    # Nothing to do until a file is uploaded. The original called
    # pd.read_csv(uploaded_file) unconditionally and crashed on the first
    # render, before any file existed; its `is not None` guard only
    # covered the seek(0), not the pipeline below.
    if uploaded_file is None:
        return

    input_data = pd.read_csv(uploaded_file)
    st.dataframe(input_data)
    st.text('️️ ')

    st.text('🔶 Processing Data 🔶')
    processed_data = process_data(input_data, columnname, int(num_data))
    st.write(processed_data)
    st.text('️️🟢 Processing Data Finished 🟢')
    st.text('️️ ')

    st.text('🔶 Checking Translation is Needed or Not 🔶')
    translated_data = chi2eng(processed_data)
    st.write(translated_data)
    st.text('️️🟢 Translation Finished 🟢')
    st.text('️️ ')

    # Typo fixed in the status text ("Calssification" -> "Classification").
    st.text('️️🔶 Processing Emotion Classification 🔶')
    top10_negative_str = emotion_classification(translated_data)
    st.text('️️🟢 Emotion Classification Finished 🟢')
    st.text('️️ ')

    st.text('🔶 Processing Summarization 🔶')
    summarized_text = summarization(top10_negative_str)
    st.write(summarized_text)
    st.text('️️🟢 Summarization Finished 🟢')


if __name__ == "__main__":
    main()