# Streamlit app: review sentiment analysis and improvement summarisation report.
from transformers import pipeline
import matplotlib.pyplot as plt
import streamlit as st
import langid
import pandas as pd
from difflib import SequenceMatcher
import random


def calculate_similarity(a, b):
    """Return a 0-1 similarity ratio between two strings."""
    return SequenceMatcher(None, a, b).ratio()


def filter_similar_items(items, similarity_threshold):
    """Keep only items that are not near-duplicates of an already kept item."""
    filtered_data = []
    for item in items:
        is_similar = False
        for saved_item in filtered_data:
            similarity = calculate_similarity(item, saved_item)
            if similarity > similarity_threshold:
                is_similar = True
                break
        if not is_similar:
            filtered_data.append(item)
    return filtered_data


def process_data(input_data, columnname='text', num_data=100):
    # Fixed seed so the same rows are sampled on every rerun.
    random.seed(20979738)
    processed_data = [i for i in input_data[columnname]]
    # Guard against requesting more rows than the file contains.
    random_selection = random.sample(processed_data, min(num_data, len(processed_data)))
    filtered_data = filter_similar_items(random_selection, similarity_threshold=0.75)
    st.write('Number of reviews selected: ', len(random_selection))
    st.write('After removing duplicates: ', len(filtered_data))
    return filtered_data


def chi2eng(filtered_data):
    """Translate Chinese reviews to English; pass English reviews through unchanged."""
    translated_data = []
    language_classification = langid.classify(filtered_data[0])[0]
    if language_classification == "zh":
        st.write("Your input is Chinese, translating to English")
        st.write('▶️ Loading Translation Model...')
        trans_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
        st.write('⏺️ Translation Model Successfully Loaded')
        st.write('▶️ Start Translating...')
        translation_progress_count = 0
        translation_bar = st.progress(0)
        for i in filtered_data:
            translated_data.append(trans_pipe(i)[0]['translation_text'])
            translation_progress_count += 1 / len(filtered_data)
            # Clamp to 1.0 so floating-point accumulation never exceeds the progress bar range.
            translation_bar.progress(min(translation_progress_count, 1.0))
    elif language_classification == 'en':
        st.write("Your input is English, moving to next stage...")
        translated_data = [i for i in filtered_data]
    else:
        st.write('The language you entered is: ', language_classification, ', which the program cannot process')
    return translated_data


# Text classification: negative / neutral / positive
def emotion_classification(translated_data):
    st.write('▶️ Loading Classification Model...')
    emo_pipe = pipeline("text-classification",
                        model="deeplearningwithpython5240/twitter_roberta_base_sentiment_fintune_with_app_reviews")
    st.write('⏺️ Classification Model Successfully Loaded')
    negative_count, neutral_count, positive_count = 0, 0, 0
    negative_dict = {}
    emotion_progress_count = 0
    st.write('▶️ Data Processing ...')
    emotion_bar = st.progress(0)
    for i in translated_data:
        # Run the classifier once per review and reuse the result for both label and score.
        result = emo_pipe(i)[0]
        labelled_result = result['label']
        # st.write('Text: ', i)
        # st.write('Label: ', labelled_result)
        # st.write(' ')
        if labelled_result == 'negative':
            negative_dict[i] = result['score']
            negative_count += 1
        if labelled_result == 'neutral':
            neutral_count += 1
        if labelled_result == 'positive':
            positive_count += 1
        emotion_progress_count += 1 / len(translated_data)
        emotion_bar.progress(min(emotion_progress_count, 1.0))

    sizes = [negative_count, neutral_count, positive_count]
    labels = ['Negative_Reviews', 'Neutral_Reviews', 'Positive_Reviews']
    # Create the pie chart
    st.write('Number of Positive Reviews: ', positive_count)
    st.write('Number of Neutral Reviews: ', neutral_count)
    st.write('Number of Negative Reviews: ', negative_count)
    plt.figure(figsize=(5, 5))  # Set the figure size
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
    # Display the chart
    st.pyplot(plt.gcf())

    # Sort negative reviews by classifier confidence, highest first.
    negative_dict_sorted = dict(sorted(negative_dict.items(), key=lambda x: x[1], reverse=True))
    top10_negative_str = ""
    if len(negative_dict_sorted) < 10:
        st.write("Total Number of Negative Comments: ", len(negative_dict_sorted))
        for k, v in negative_dict_sorted.items():
            st.write(k)
            top10_negative_str += f"{k}."
    else:
        st.write("Top 10 Negative Comments")
        count = 0
        for k, v in negative_dict_sorted.items():
            if count >= 10:
                break
            st.write(k)
            top10_negative_str += f"{k}."
            count += 1
    return top10_negative_str


# Summarization
def summarization(top10_negative_str):
    st.write('▶️ Loading Summarization Model...')
    summarize_pipe = pipeline("text2text-generation",
                              model="deeplearningwithpython5240/summarisation-t5-finetuned-model",
                              max_new_tokens=512)
    st.write('⏺️ Summarization Model Successfully Loaded')
    st.write('▶️ Summarizing...')
    summarized_text = summarize_pipe(top10_negative_str)
    return summarized_text


def main():
    st.set_option('deprecation.showPyplotGlobalUse', False)
    st.set_page_config(page_title="Review Sentiment Analysis and Improvement Summarisation Report for Business Product",
                       page_icon="🦜")
    st.header("Review Sentiment Analysis and Improvement Summarisation Report for Business Product")
    st.write("Welcome to the user guide for our product feedback analysis application. "
             "Our application is designed to help companies review their product feedback and summarise areas for improvement. "
             "Here are the steps to get started:\n"
             "1. Upload reviews in CSV file format\n"
             "2. Input the column name and the number of rows to be analysed\n"
             "3. Duplicate reviews are removed automatically\n"
             "4. Data is translated into English (if the dataset is Chinese)\n"
             "5. Review sentiment is analysed\n"
             "6. Top negative reviews are summarised")
    try:
        uploaded_file = st.file_uploader("🔶 Upload CSV file for analysis 🔶", type=["csv"])
        if uploaded_file is not None:
            columnname = st.text_input("🔶 Please enter the column name in the CSV file you want to analyze 🔶")
            num_data = st.number_input("🔶 Please enter the number of rows you want to process 🔶", min_value=2, step=1)
            input_data = pd.read_csv(uploaded_file)
            st.dataframe(input_data)
            st.text(' ')

            # Stage 1: process data
            st.text('🔶 Processing Data 🔶')
            processed_data = process_data(input_data, columnname, int(num_data))
            st.write(processed_data)
            st.text('🟢 Processing Data Finished 🟢')
            st.text(' ')

            # Stage 2: translate
            st.text('🔶 Checking Whether Translation Is Needed 🔶')
            translated_data = chi2eng(processed_data)
            st.write(translated_data)
            st.text('🟢 Translation Finished 🟢')
            st.text(' ')

            # Stage 3: emotion classification
            st.text('🔶 Processing Emotion Classification 🔶')
            top10_negative_str = emotion_classification(translated_data)
            st.text('🟢 Emotion Classification Finished 🟢')
            st.text(' ')

            # Stage 4: summarization
            st.text('🔶 Processing Summarization 🔶')
            if len(top10_negative_str) == 0:
                st.write("No Negative Reviews Detected")
            else:
                summarized_text = summarization(top10_negative_str)
                st.write('Summarized Negative Comments:')
                st.write(summarized_text[0]["generated_text"])
            st.text('🟢 Summarization Finished 🟢')
    except Exception:
        # Inputs may be incomplete while the user is still filling in the form
        # (e.g. an empty column name), so errors are suppressed rather than shown.
        st.write("")


if __name__ == "__main__":
    main()
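# Usage note (a minimal sketch, assuming this script is saved as app.py):
#   streamlit run app.py
# The imports above imply these packages: streamlit, transformers, matplotlib,
# langid, pandas. The Hugging Face pipelines additionally need a deep learning
# backend such as torch, and the Helsinki-NLP translation model typically needs
# sentencepiece (assumptions about the environment, not pinned by this file).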