import re
import jieba
import jieba.analyse
import accelerate
import numpy as np
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
import torch.nn.functional as F
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
from transformers import AutoModelForSequenceClassification, BertTokenizerFast, pipeline

# Force CPU execution so the app also runs on machines without a GPU.
accelerator = accelerate.Accelerator(cpu=True)
class LoadException(Exception):
    """Raised when the tokenizer and/or model have not been loaded yet."""

class LoadModelException(Exception):
    """Raised when a model fails to load."""

class LoadTokenizerException(Exception):
    """Raised when a tokenizer fails to load."""
class DIR:
    DICT_DIR = Path("pages/docs/dict")
    MODEL_DIR = Path("pages/docs/model_param")
    CLASSIFIER_MODEL_DIR = MODEL_DIR / "board_classification_model"
    SENTIMENT_MODEL_DIR = MODEL_DIR / "sentiment_analysis_model"
    SUMMARIZATION_MODEL_DIR = MODEL_DIR / "summarization_model"
class Bert_Classify_Model:
    def __init__(self):
        self.tokenizer_loaded = False
        self.model_loaded = False

    def load_model(self):
        try:
            self.tokenizer = BertTokenizerFast.from_pretrained(
                pretrained_model_name_or_path=DIR.CLASSIFIER_MODEL_DIR,
                local_files_only=True
            )
            self.tokenizer_loaded = True
        except Exception as e:
            # from_pretrained raises its own errors (e.g. OSError); re-raise
            # as our exception type instead of raising a bare string.
            raise LoadTokenizerException("Tokenizer not loaded.") from e
        try:
            self.model = AutoModelForSequenceClassification.from_pretrained(
                pretrained_model_name_or_path=DIR.CLASSIFIER_MODEL_DIR,
                local_files_only=True,
                num_labels=4
            )
            self.model_loaded = True
        except Exception as e:
            raise LoadModelException("Model not loaded.") from e
    def __make_output(self, outputs):
        id2label = {
            "0": "C_Chat",
            "1": "Gossiping",
            "2": "HatePolitics",
            "3": "Marginalman"
        }
        # Convert logits to probabilities; dim=-1 softmaxes over the label axis.
        pred_prob = F.softmax(outputs.logits, dim=-1)
        # Column names: 版面 = board, 機率 = probability.
        pred_prob_df = (
            pd.DataFrame({
                "版面": id2label.values(),
                "機率": pred_prob[0, :].detach().numpy()
            })
            .sort_values(by="機率", ascending=False)
        )
        return pred_prob_df
    def predict(self, text):
        # Both the tokenizer and the model must be loaded before predicting.
        if (not self.tokenizer_loaded) or (not self.model_loaded):
            raise LoadException("Not loaded.")
        token_text = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            return_tensors='pt'
        )
        outputs = self.model(**token_text)
        result = self.__make_output(outputs)
        return result
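
# Minimal usage sketch (not part of the original page): load the classifier
# once, then rank the four PTT boards for a post. The sample text is hypothetical.
# classifier = Bert_Classify_Model()
# classifier.load_model()
# board_probs = classifier.predict("今天有什麼新八卦嗎?")
# print(board_probs)  # DataFrame with 版面 (board) / 機率 (probability) columns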
class Sentiment_Model:
    def __init__(self):
        self.model_loaded = False

    def load_model(self):
        try:
            self.model = pipeline(
                "sentiment-analysis",
                model=str(DIR.SENTIMENT_MODEL_DIR),
            )
            self.model_loaded = True
        except Exception as e:
            raise LoadModelException("Model not loaded.") from e

    def run_sentiment(self, text):
        if not self.model_loaded:
            raise LoadModelException("Model not loaded.")
        outputs = self.model(text)
        return outputs
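
# Usage sketch (assumed, sample text hypothetical): a sentiment pipeline
# returns a list of {"label": ..., "score": ...} dicts, one per input text.
# sentiment = Sentiment_Model()
# sentiment.load_model()
# print(sentiment.run_sentiment("這部動畫真的超好看"))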
class Summarization_Model:
    def __init__(self):
        self.model_loaded = False

    def load_model(self):
        try:
            self.model = pipeline(
                "summarization",
                model=str(DIR.SUMMARIZATION_MODEL_DIR)
            )
            self.model_loaded = True
        except Exception as e:
            raise LoadModelException("Model not loaded.") from e

    def __make_output(self, outputs):
        return outputs[0]["summary_text"]

    def run_summarize(self, text):
        if not self.model_loaded:
            raise LoadModelException("Model not loaded.")
        outputs = self.model(text, max_length=1024)
        result = self.__make_output(outputs)
        return result
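
# Usage sketch (assumed): condense a long post into a single summary string.
# `long_post_text` is a hypothetical variable holding the post body.
# summarizer = Summarization_Model()
# summarizer.load_model()
# summary = summarizer.run_summarize(long_post_text)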
class WordCloudDrawer:
    def __init__(self):
        jieba.set_dictionary(f'{DIR.DICT_DIR}/dict.txt')  # Traditional Chinese dictionary
        jieba.analyse.set_stop_words(f'{DIR.DICT_DIR}/stopdict.txt')  # stop-word list
        self.punctuation_list = []
        with open(f'{DIR.DICT_DIR}/punctuations.txt', 'r', encoding='utf-8-sig') as f2:
            for data in f2.readlines():
                self.punctuation_list.append(data.strip())

    def __filter(self, word):  # drop punctuation tokens
        return word not in self.punctuation_list

    def __preprocess(self, text):  # segment the text and strip stop words
        return list(filter(self.__filter, jieba.analyse.extract_tags(text, topK=None, withWeight=False, allowPOS=())))
    def word_cloud(self, text, num_words):
        processed_text = self.__preprocess(text)
        # TF-IDF: each segmented token is treated as its own document, and the
        # per-word scores are summed over the matrix.
        vectorizer = TfidfVectorizer(smooth_idf=True)
        tfidf = vectorizer.fit_transform(processed_text)
        data = {
            'word': vectorizer.get_feature_names_out(),
            'tfidf': tfidf.toarray().sum(axis=0).tolist()
        }
        word_score = pd.DataFrame(data).sort_values(by='tfidf', ascending=False)
        top_words = word_score.head(num_words)  # already sorted; take the top scores
        d = dict(zip(top_words['word'].to_list(), top_words['tfidf'].to_list()))
        wc = WordCloud(
            background_color='white',
            collocations=False,
            font_path=f'{DIR.DICT_DIR}/SimHei.ttf',
            max_font_size=48
        )
        wc = wc.generate_from_frequencies(d)
        return wc
# To render the cloud inside the Streamlit page:
# fig, ax = plt.subplots(figsize=(12, 8))
# ax.imshow(wc)
# ax.axis("off")
# st.pyplot(fig)
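
# Minimal end-to-end sketch (assumed, not part of the Streamlit page): build a
# word cloud from a hypothetical sample post and show it with matplotlib.
# Assumes the dict/, stopdict, punctuation, and font files under DIR.DICT_DIR exist.
if __name__ == "__main__":
    drawer = WordCloudDrawer()
    sample_post = "今天大家都在討論新番動畫、政治新聞和股票行情。"  # hypothetical text
    cloud = drawer.word_cloud(sample_post, num_words=50)
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.imshow(cloud)
    ax.axis("off")
    plt.show()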