|
import re |
|
import jieba |
|
import jieba.analyse |
|
import accelerate |
|
import numpy as np |
|
import pandas as pd |
|
import streamlit as st |
|
import matplotlib.pyplot as plt |
|
import torch.nn.functional as F |
|
|
|
from pathlib import Path |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from wordcloud import WordCloud |
|
from transformers import AutoModelForSequenceClassification, BertTokenizerFast, pipeline |
|
|
|
# Module-level Accelerate handle, created once at import time.
# ``cpu=True`` asks Accelerate to run on CPU even if a GPU is available.
accelerator = accelerate.Accelerator(cpu=True)
|
|
|
class LoadException(Exception):
    """Base error for a resource (model or tokenizer) that is not loaded.

    Catching this class also catches the more specific load errors below.
    """


class LoadModelException(LoadException):
    """Raised when a model fails to load or is used before being loaded."""


class LoadTokenizerException(LoadException):
    """Raised when a tokenizer fails to load."""
|
|
|
class DIR:
    """Filesystem locations of the dictionaries and model parameters.

    Every path is relative, so it resolves against the current working
    directory of the running process.
    """

    # Dictionary, stop-word, punctuation and font files used for word clouds.
    DICT_DIR = Path("pages/docs/dict")

    # Parent directory of the model checkpoints listed below.
    MODEL_DIR = Path("pages/docs/model_param")

    # Join with ``/`` instead of formatting the Path into a string and
    # parsing it back.
    CLASSIFIER_MODEL_DIR = MODEL_DIR / "board_classification_model"
    SENTIMENT_MODEL_DIR = MODEL_DIR / "sentiment_analysis_model"
    SUMMARIZATION_MODEL_DIR = MODEL_DIR / "summarization_model"
|
|
|
class Bert_Classify_Model:
    """BERT sequence classifier that predicts which board a text belongs to.

    Call :meth:`load_model` once before calling :meth:`predict`.
    """

    def __init__(self):
        # Flipped to True by load_model(); predict() refuses to run until
        # both the tokenizer and the model are available.
        self.tokenizer_loaded = False
        self.model_loaded = False

    def load_model(self):
        """Load the tokenizer and classifier from ``DIR.CLASSIFIER_MODEL_DIR``.

        Both are loaded with ``local_files_only=True``.

        Raises:
            LoadTokenizerException: If the tokenizer cannot be loaded.
            LoadModelException: If the model cannot be loaded.
        """
        try:
            self.tokenizer = BertTokenizerFast.from_pretrained(
                pretrained_model_name_or_path=DIR.CLASSIFIER_MODEL_DIR,
                local_files_only=True,
            )
        except Exception as err:
            # Wrap whatever the loader raised so callers only have to handle
            # the Load* exceptions; the cause stays attached via chaining.
            raise LoadTokenizerException("Tokenizer not loaded.") from err
        self.tokenizer_loaded = True

        try:
            self.model = AutoModelForSequenceClassification.from_pretrained(
                pretrained_model_name_or_path=DIR.CLASSIFIER_MODEL_DIR,
                local_files_only=True,
                num_labels=4,
            )
        except Exception as err:
            raise LoadModelException("Model not loaded.") from err
        self.model_loaded = True

    @staticmethod
    def __make_output(outputs):
        """Turn raw model outputs into a per-board probability table.

        Args:
            outputs: Model output exposing a 2-D ``logits`` tensor. Only the
                first row is reported.

        Returns:
            A DataFrame with the columns ``版面`` (board name) and ``機率``
            (probability), sorted by probability in descending order.
        """
        # NOTE(review): "HatePolotics" looks like a typo of "HatePolitics".
        # It is part of the returned data, so the spelling is kept as-is;
        # confirm with consumers before changing it.
        id2label = {
            "0": "C_Chat",
            "1": "Gossiping",
            "2": "HatePolotics",
            "3": "Marginalman",
        }

        # Name the class axis explicitly: calling softmax without ``dim`` is
        # deprecated and emits a warning.
        pred_prob = F.softmax(outputs.logits, dim=-1)
        pred_prob_df = pd.DataFrame({
            "版面": list(id2label.values()),
            "機率": pred_prob[0, :].detach().numpy(),
        })
        return pred_prob_df.sort_values(by="機率", ascending=False)

    def predict(self, text):
        """Classify ``text`` and return the board probabilities.

        Args:
            text: Input forwarded to the tokenizer.

        Returns:
            The DataFrame of board names and probabilities for the first
            input, sorted by probability in descending order.

        Raises:
            LoadException: If the tokenizer or the model is not loaded.
        """
        # Both pieces are required; with only one loaded the call below
        # would fail on a missing attribute instead.
        if not (self.tokenizer_loaded and self.model_loaded):
            raise LoadException("Not loaded.")

        token_text = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            return_tensors='pt',
        )
        outputs = self.model(**token_text)
        return self.__make_output(outputs)
|
|
|
|
|
class Sentiment_Model:
    """Thin wrapper around a ``sentiment-analysis`` pipeline.

    Call :meth:`load_model` once before calling :meth:`run_sentiment`.
    """

    def __init__(self):
        # Set to True by load_model() once the pipeline has been built.
        self.model_loaded = False

    def load_model(self):
        """Build the pipeline from ``DIR.SENTIMENT_MODEL_DIR``.

        Raises:
            LoadModelException: If the pipeline cannot be created.
        """
        try:
            self.model = pipeline(
                "sentiment-analysis",
                DIR.SENTIMENT_MODEL_DIR,
            )
        except Exception as err:
            # Wrap whatever the pipeline factory raised so callers only have
            # to handle LoadModelException; the cause stays chained.
            raise LoadModelException("Model not loaded.") from err
        self.model_loaded = True

    def run_sentiment(self, text):
        """Run the pipeline on ``text`` and return its raw output.

        Args:
            text: Input forwarded to the pipeline.

        Raises:
            LoadModelException: If :meth:`load_model` has not succeeded yet.
        """
        if not self.model_loaded:
            raise LoadModelException("model not loaded.")
        return self.model(text)
|
|
|
class Summarization_Model:
    """Thin wrapper around a ``summarization`` pipeline.

    Call :meth:`load_model` once before calling :meth:`run_summarize`.
    """

    def __init__(self):
        # Set to True by load_model() once the pipeline has been built.
        self.model_loaded = False

    def load_model(self):
        """Build the pipeline from ``DIR.SUMMARIZATION_MODEL_DIR``.

        Raises:
            LoadModelException: If the pipeline cannot be created.
        """
        try:
            self.model = pipeline(
                "summarization",
                DIR.SUMMARIZATION_MODEL_DIR,
            )
        except Exception as err:
            # Wrap whatever the pipeline factory raised so callers only have
            # to handle LoadModelException; the cause stays chained.
            raise LoadModelException("Model not loaded.") from err
        self.model_loaded = True

    @staticmethod
    def __make_output(outputs):
        """Return the ``summary_text`` of the first pipeline result."""
        return outputs[0]["summary_text"]

    def run_summarize(self, text, max_length=1024):
        """Summarize ``text`` and return the summary string.

        Args:
            text: Input forwarded to the pipeline.
            max_length: Forwarded to the pipeline as ``max_length``;
                defaults to 1024.

        Raises:
            LoadModelException: If :meth:`load_model` has not succeeded yet.
        """
        if not self.model_loaded:
            raise LoadModelException("model not loaded.")
        outputs = self.model(text, max_length=max_length)
        return self.__make_output(outputs)
|
|
|
|
|
class WordCloudDrawer:
    """Build word clouds from text using jieba keyword extraction."""

    def __init__(self):
        """Configure jieba and read the punctuation list from ``DIR.DICT_DIR``."""
        jieba.set_dictionary(f'{DIR.DICT_DIR}/dict.txt')
        jieba.analyse.set_stop_words(f'{DIR.DICT_DIR}/stopdict.txt')
        # One entry per line; ``utf-8-sig`` drops a leading BOM if present.
        with open(f'{DIR.DICT_DIR}/punctuations.txt', 'r', encoding='utf-8-sig') as f2:
            self.punctuation_list = [line.strip() for line in f2]

    def __filter(self, word):
        """Return True when ``word`` is not in the punctuation list."""
        return word not in self.punctuation_list

    def __preprocess(self, text):
        """Extract keywords from ``text`` and drop punctuation entries."""
        tags = jieba.analyse.extract_tags(text, topK=None, withWeight=False, allowPOS=())
        return [tag for tag in tags if self.__filter(tag)]

    def word_cloud(self, text, num_words):
        """Generate a word cloud of the top-scoring keywords in ``text``.

        Args:
            text: Raw text to extract keywords from.
            num_words: Maximum number of keywords to draw.

        Returns:
            The ``WordCloud`` generated from the selected keywords.

        Raises:
            ValueError: If no keyword can be extracted from ``text``.
        """
        processed_text = self.__preprocess(text)
        if not processed_text:
            # Fail early with a clear message rather than an
            # "empty vocabulary" error from the vectorizer.
            raise ValueError("No keywords could be extracted from the text.")

        # TF-IDF. Each extracted keyword is handed to the vectorizer as a
        # separate document.
        # NOTE(review): with one keyword per document the summed scores are
        # probably near-uniform; confirm this weighting is intended.
        vectorizer = TfidfVectorizer(smooth_idf=True)
        tfidf = vectorizer.fit_transform(processed_text)

        word_score = pd.DataFrame({
            'word': vectorizer.get_feature_names_out(),
            'tfidf': tfidf.toarray().sum(axis=0).tolist(),
        }).sort_values(by='tfidf', ascending=False)

        # Already sorted by score, so the leading rows are the top keywords.
        top_words = word_score[:num_words]
        frequencies = dict(zip(top_words['word'].to_list(), top_words['tfidf'].to_list()))

        wc = WordCloud(
            background_color='white',
            collocations=False,
            font_path=f'{DIR.DICT_DIR}/SimHei.ttf',
            max_font_size=48,
        )
        return wc.generate_from_frequencies(frequencies)
|
|
|
|
|
|
|
|
|
|
|
|
|
|