import re
import jieba
import jieba.analyse
import accelerate
import numpy as np
import pandas as pd 
import streamlit as st
import matplotlib.pyplot as plt
import torch.nn.functional as F

from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
from transformers import AutoModelForSequenceClassification, BertTokenizerFast, pipeline

# Module-level Accelerator, created as a side effect of importing this module.
# NOTE(review): cpu=True presumably forces CPU execution; `accelerator` is not
# referenced anywhere else in the visible part of this file - confirm it is
# still needed.
accelerator = accelerate.Accelerator(cpu=True)

class LoadException(Exception):
    """Signal that a required component has not been loaded."""

class LoadModelException(Exception):
    """Signal that a model is unavailable (not loaded or failed to load)."""

class LoadTokenizerException(Exception):
    """Signal that a tokenizer is unavailable (not loaded or failed to load)."""

class DIR:
    """Namespace of resource locations (dictionaries and model parameters).

    All paths are relative, so they resolve against the working directory of
    the running process.
    """

    # Dictionary, stop-word, punctuation and font files.
    DICT_DIR = Path("pages") / "docs" / "dict"

    # Root folder holding one sub-folder per model.
    MODEL_DIR = Path("pages") / "docs" / "model_param"
    CLASSIFIER_MODEL_DIR = MODEL_DIR / "board_classification_model"
    SENTIMENT_MODEL_DIR = MODEL_DIR / "sentiment_analysis_model"
    SUMMARIZATION_MODEL_DIR = MODEL_DIR / "summarization_model"

class Bert_Classify_Model:
    """BERT sequence classifier that scores a text against four board labels.

    Call :meth:`load_model` before :meth:`predict`. The tokenizer and the
    model are both read from ``DIR.CLASSIFIER_MODEL_DIR`` with
    ``local_files_only=True``.
    """

    def __init__(self):
        # Flipped to True by load_model(); predict() requires both.
        self.tokenizer_loaded = False
        self.model_loaded = False

    def load_model(self):
        """Load the tokenizer and the classification model from local files.

        Raises:
            LoadTokenizerException: If loading the tokenizer fails; the
                underlying error is chained as ``__cause__``.
            LoadModelException: If loading the model fails; the underlying
                error is chained as ``__cause__``.
        """
        try:
            self.tokenizer = BertTokenizerFast.from_pretrained(
                pretrained_model_name_or_path=DIR.CLASSIFIER_MODEL_DIR,
                local_files_only=True,
            )
        except Exception as err:
            # Wrap whatever the library raised in the project's own type.
            raise LoadTokenizerException("Tokenizer not loaded.") from err
        self.tokenizer_loaded = True

        try:
            self.model = AutoModelForSequenceClassification.from_pretrained(
                pretrained_model_name_or_path=DIR.CLASSIFIER_MODEL_DIR,
                local_files_only=True,
                num_labels=4,
            )
        except Exception as err:
            raise LoadModelException("Model not loaded.") from err
        self.model_loaded = True

    @staticmethod
    def __make_output(outputs):
        """Turn raw model outputs into a probability table.

        Args:
            outputs: Object exposing a ``logits`` tensor; only row 0 is used.

        Returns:
            A DataFrame with the columns "版面" (board) and "機率"
            (probability), sorted by probability in descending order.
        """
        # NOTE(review): "HatePolotics" looks like a typo of "HatePolitics";
        # left unchanged because it is a runtime label - confirm before fixing.
        id2label = {
            "0": "C_Chat",
            "1": "Gossiping",
            "2": "HatePolotics",
            "3": "Marginalman",
        }

        # Normalise over the class dimension explicitly; calling softmax
        # without `dim` is deprecated.
        pred_prob = F.softmax(outputs.logits, dim=-1)
        pred_prob_df = pd.DataFrame({
            "版面": list(id2label.values()),
            "機率": pred_prob[0, :].detach().numpy(),
        }).sort_values(by="機率", ascending=False)
        return pred_prob_df

    def predict(self, text):
        """Classify ``text`` and return the per-board probabilities.

        Args:
            text: Input handed to the tokenizer (padded, truncated, returned
                as PyTorch tensors).

        Returns:
            The DataFrame built by ``__make_output`` for the first row of the
            model's logits.

        Raises:
            LoadException: If the tokenizer or the model has not been loaded.
        """
        # Either missing component makes prediction impossible.
        if not self.tokenizer_loaded or not self.model_loaded:
            raise LoadException("Not loaded.")

        token_text = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            return_tensors='pt',
        )

        outputs = self.model(**token_text)
        return self.__make_output(outputs)


class Sentiment_Model:
    """Wrapper around a "sentiment-analysis" pipeline stored locally.

    Call :meth:`load_model` before :meth:`run_sentiment`.
    """

    def __init__(self):
        # Set to True once load_model() has built the pipeline.
        self.model_loaded = False

    def load_model(self):
        """Build the pipeline from ``DIR.SENTIMENT_MODEL_DIR``.

        Raises:
            LoadModelException: If the pipeline cannot be created; the
                underlying error is chained as ``__cause__``.
        """
        try:
            self.model = pipeline(
                "sentiment-analysis",
                DIR.SENTIMENT_MODEL_DIR,
            )
        except Exception as err:
            # Wrap whatever the library raised in the project's own type.
            raise LoadModelException("Model not loaded.") from err
        self.model_loaded = True

    def run_sentiment(self, text):
        """Run the pipeline on ``text`` and return its raw output.

        Raises:
            LoadModelException: If load_model() has not completed.
        """
        if not self.model_loaded:
            raise LoadModelException("model not loaded.")
        return self.model(text)

class Summarization_Model:
    """Wrapper around a "summarization" pipeline stored locally.

    Call :meth:`load_model` before :meth:`run_summarize`.
    """

    def __init__(self):
        # Set to True once load_model() has built the pipeline.
        self.model_loaded = False

    def load_model(self):
        """Build the pipeline from ``DIR.SUMMARIZATION_MODEL_DIR``.

        Raises:
            LoadModelException: If the pipeline cannot be created; the
                underlying error is chained as ``__cause__``.
        """
        try:
            self.model = pipeline(
                "summarization",
                DIR.SUMMARIZATION_MODEL_DIR,
            )
        except Exception as err:
            # Wrap whatever the library raised in the project's own type.
            raise LoadModelException("Model not loaded.") from err
        self.model_loaded = True

    @staticmethod
    def __make_output(outputs):
        """Return the ``summary_text`` of the first pipeline result."""
        return outputs[0]["summary_text"]

    def run_summarize(self, text):
        """Summarize ``text`` (pipeline called with ``max_length=1024``).

        Returns:
            The summary text of the first result.

        Raises:
            LoadModelException: If load_model() has not completed.
        """
        if not self.model_loaded:
            raise LoadModelException("model not loaded.")
        outputs = self.model(text, max_length=1024)
        return self.__make_output(outputs)


class WordCloudDrawer:
    """Build a word cloud from the keywords jieba extracts out of a text.

    Creating an instance configures jieba (main dictionary and stop-word
    list) and reads the punctuation list; all files live in ``DIR.DICT_DIR``.
    """

    def __init__(self):
        jieba.set_dictionary(f'{DIR.DICT_DIR}/dict.txt')  # Traditional Chinese dictionary
        jieba.analyse.set_stop_words(f'{DIR.DICT_DIR}/stopdict.txt')  # stop-word dictionary
        # One entry per line; 'utf-8-sig' strips a leading BOM if present.
        with open(f'{DIR.DICT_DIR}/punctuations.txt', 'r', encoding='utf-8-sig') as f2:
            self.punctuation_list = [line.strip() for line in f2]

    def __filter(self, word):
        """Return True when ``word`` is not a listed punctuation symbol."""
        return word not in self.punctuation_list

    def __preprocess(self, text):
        """Extract keywords from ``text`` (stop words removed) and drop punctuation."""
        keywords = jieba.analyse.extract_tags(text, topK=None, withWeight=False, allowPOS=())
        return list(filter(self.__filter, keywords))

    def word_cloud(self, text, num_words):
        """Create a word cloud of the highest-scoring keywords in ``text``.

        Args:
            text: Raw text to analyse.
            num_words: Maximum number of keywords placed in the cloud.

        Returns:
            The ``WordCloud`` object returned by ``generate_from_frequencies``.

        Raises:
            ValueError: If no keyword survives preprocessing.
        """
        processed_text = self.__preprocess(text)
        if not processed_text:
            # Fail early with a clear message instead of an error from deep
            # inside the vectorizer.
            raise ValueError("No keywords could be extracted from the text.")

        # TF-IDF: every extracted keyword is handed to the vectorizer as its
        # own document.
        # NOTE(review): with one keyword per document the summed scores are
        # likely near-uniform - confirm this weighting is intended.
        vectorizer = TfidfVectorizer(smooth_idf=True)
        tfidf = vectorizer.fit_transform(processed_text)

        word_score = pd.DataFrame({
            'word': vectorizer.get_feature_names_out(),
            'tfidf': tfidf.toarray().sum(axis=0).tolist(),
        })

        # A single descending sort is enough to pick the top entries.
        top_words = word_score.sort_values(by='tfidf', ascending=False)[:num_words]
        frequencies = dict(zip(top_words['word'].to_list(), top_words['tfidf'].to_list()))

        wc = WordCloud(
            background_color='white',
            collocations=False,
            font_path=f'{DIR.DICT_DIR}/SimHei.ttf',
            max_font_size=48,
        )
        return wc.generate_from_frequencies(frequencies)