|
import streamlit as st |
|
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer |
|
import tensorflow as tf |
|
import numpy as np |
|
|
|
def convert_label_to_title(label): |
|
convert_dict = { |
|
0: "SỨC KHỎE", |
|
1: "GIÁO DỤC", |
|
2: "THỂ THAO", |
|
3: "PHÁP LUẬT", |
|
4: "KHOA HỌC", |
|
5: "DU LỊCH", |
|
6: "GIẢI TRÍ", |
|
7: "KINH DOANH" |
|
} |
|
return convert_dict[label] |
|
|
|
def predict_sentence(model, tokenizer, sentence): |
|
input_data = tokenizer(sentence, return_tensors='tf', padding=True, truncation=True) |
|
logits = model(input_data['input_ids'], attention_mask=input_data['attention_mask']).logits |
|
probabilities = tf.nn.softmax(logits, axis=1) |
|
predicted_class = tf.argmax(logits, axis=1).numpy()[0] |
|
highest_probability = probabilities.numpy()[0, predicted_class] |
|
title = convert_label_to_title(predicted_class) |
|
return title, probabilities.numpy(), highest_probability |
|
|
|
@st.cache_resource |
|
def load_model(checkpoint, num_class): |
|
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_class) |
|
tokenizer = AutoTokenizer.from_pretrained(checkpoint) |
|
return model, tokenizer |
|
|
|
checkpoint = 'distilbert-base-multilingual-cased' |
|
model, tokenizer = load_model(checkpoint, 8) |
|
model.load_weights('best_model_weights.h5') |
|
|
|
text = st.text_area('Nhập tiêu đề vào đây') |
|
|
|
if text: |
|
title, probabilities, highest = predict_sentence(model, tokenizer, text) |
|
out = { |
|
'title': title, |
|
'prob': highest |
|
} |
|
st.json(out) |
|
|
|
|
|
|