|
from transformers import PreTrainedModel, PretrainedConfig |
|
from tensorflow.keras.models import load_model |
|
from tensorflow.keras.preprocessing.text import tokenizer_from_json |
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
import numpy as np |
|
import json |
|
|
|
class NewsClassifierConfig(PretrainedConfig):
    """Configuration for the news-source classifier.

    Args:
        max_length: Sequence length that ``NewsClassifier.forward`` passes to
            ``pad_sequences`` as ``maxlen``.
        vocab_size: Stored on the config; not read by the code in this file
            (presumably describes the Keras model's embedding vocabulary).
        embedding_dim: Stored on the config; not read by the code in this file.
        hidden_size: Stored on the config; not read by the code in this file.
        num_labels: Number of output classes, forwarded to ``PretrainedConfig``.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "custom"

    def __init__(
        self,
        max_length=41,
        vocab_size=74934,
        embedding_dim=128,
        hidden_size=64,
        num_labels=2,
        **kwargs
    ):
        # The base initializer assigns its own defaults to attributes it
        # manages when they are missing from kwargs (``num_labels`` -> 2 and,
        # in transformers 4.x, the legacy generation field ``max_length`` ->
        # 20). Run it first and hand it ``num_labels`` explicitly so the
        # values requested here are not silently overwritten.
        super().__init__(num_labels=num_labels, **kwargs)

        # Assigned after the base initializer so these values are the ones
        # that stick.
        # NOTE(review): ``max_length`` shares its name with a legacy
        # generation parameter on PretrainedConfig; some transformers releases
        # warn about non-default generation values when saving — confirm
        # against the pinned version.
        self.max_length = max_length
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
|
|
|
class NewsClassifier(PreTrainedModel):
    """Expose a saved Keras text classifier through ``PreTrainedModel``.

    The Keras model and its tokenizer are not loaded in ``__init__``; they are
    read from disk by ``post_init`` and ``forward`` triggers that load on first
    use if it has not happened yet.
    """

    config_class = NewsClassifierConfig
    base_model_prefix = "custom"

    def __init__(self, config):
        """Create the wrapper; the Keras model and tokenizer start unloaded."""
        super().__init__(config)
        self.model = None
        self.tokenizer = None

    def post_init(self, model_path='custom.h5', tokenizer_path='tokenizer.json'):
        """Load the Keras model and the tokenizer from disk.

        NOTE(review): this overrides ``PreTrainedModel.post_init`` and does not
        call the base implementation — confirm that is intended.

        Args:
            model_path: Path of the saved Keras model. The default is resolved
                relative to the current working directory.
            tokenizer_path: Path of the tokenizer JSON file. The default is
                resolved relative to the current working directory.
        """
        self.model = load_model(model_path)

        with open(tokenizer_path, 'r', encoding='utf-8') as f:
            tokenizer_data = json.load(f)

        # ``tokenizer_from_json`` parses a JSON *string*. Files written as
        # ``json.dumps(tokenizer.to_json())`` decode to that string directly;
        # files holding the raw JSON object decode to a dict, which must be
        # re-serialized before it can be handed over.
        if not isinstance(tokenizer_data, str):
            tokenizer_data = json.dumps(tokenizer_data)

        self.tokenizer = tokenizer_from_json(tokenizer_data)

    @staticmethod
    def _to_result(pred):
        """Turn one prediction row into a ``{"label", "score"}`` dict."""
        # Assumes the model emits at least two values per sample and that
        # index 1 is the "foxnews" probability — TODO confirm against the
        # saved model's output layer.
        fox_score = float(pred[1])
        if fox_score > 0.5:
            return {"label": "foxnews", "score": fox_score}
        # Report the score of the chosen label, i.e. the complement.
        return {"label": "nbc", "score": 1 - fox_score}

    def forward(self, text_input):
        """Classify one text or a collection of texts.

        Args:
            text_input: A single string or an iterable of strings.

        Returns:
            A ``{"label": ..., "score": ...}`` dict when exactly one text was
            supplied (either a bare string or a one-element collection),
            otherwise a list of such dicts in input order. An empty collection
            yields an empty list.
        """
        if self.model is None or self.tokenizer is None:
            self.post_init()

        if isinstance(text_input, str):
            text_input = [text_input]
        else:
            # Materialize so generators and other one-shot iterables can be
            # both tokenized and counted.
            text_input = list(text_input)

        if not text_input:
            # Nothing to classify; skip the Keras predict call entirely.
            return []

        sequences = self.tokenizer.texts_to_sequences(text_input)
        padded = pad_sequences(sequences, maxlen=self.config.max_length)
        predictions = self.model.predict(padded, verbose=0)

        results = [self._to_result(pred) for pred in predictions]

        # Kept for backward compatibility: a single input returns the bare
        # dict rather than a one-element list.
        return results[0] if len(text_input) == 1 else results