import joblib
import numpy as np
import torch
from sklearn.linear_model import LogisticRegression
from transformers import AutoModel, AutoTokenizer

# Load the RuBERT encoder and its tokenizer.
rubert_model_name = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(rubert_model_name)
model = AutoModel.from_pretrained(rubert_model_name)

# Load the pre-trained Logistic Regression classifier that maps RuBERT
# CLS embeddings to class labels.
logreg_model_path = "model_data/logreg_model_v2.joblib"
logreg_model = joblib.load(logreg_model_path)


def embed_bert_cls(text, model, tokenizer):
    """Return the L2-normalized [CLS] embedding(s) for `text` (a string or list of strings)."""
    inputs = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # The hidden state of the [CLS] token (position 0) serves as the sentence embedding.
    embeddings = outputs.last_hidden_state[:, 0, :]
    # L2-normalize so the downstream linear classifier sees unit-length vectors.
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings.cpu().numpy()
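

# NOTE: The joblib artifact loaded above was presumably produced by fitting a
# Logistic Regression on these CLS embeddings. The function below is a minimal,
# hypothetical sketch of that training step -- `texts`, `labels`, the batch
# size, and the output path are illustrative assumptions, not the actual
# pipeline behind logreg_model_v2.joblib.
def train_toxicity_classifier(texts, labels, out_path="model_data/logreg_model_v2.joblib"):
    """Fit a Logistic Regression on RuBERT CLS embeddings and persist it with joblib."""
    # Embed in small batches; the tokenizer accepts a list of strings directly.
    batches = [texts[i:i + 32] for i in range(0, len(texts), 32)]
    X = np.vstack([embed_bert_cls(batch, model, tokenizer) for batch in batches])
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X, labels)  # labels expected as integers: 0 (Good), 1 (Neutral), 2 (Bad)
    joblib.dump(clf, out_path)
    return clf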


# Human-readable labels for the classifier's integer outputs.
CLASS_LABELS = {0: "Good", 1: "Neutral", 2: "Bad"}


def classify_text(text, model=model, tokenizer=tokenizer, classifier=logreg_model):
    """Classify `text` as Good, Neutral, or Bad using RuBERT embeddings and Logistic Regression."""
    embeddings = embed_bert_cls(text, model, tokenizer)
    prediction = classifier.predict(embeddings)
    return CLASS_LABELS[prediction[0]]
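

# Illustrative usage; the sample strings below are placeholders, not real data.
if __name__ == "__main__":
    samples = [
        "Спасибо, очень полезно!",  # "Thanks, very helpful!"
        "Ну такое...",              # "Meh..."
        "Ты что, совсем дурак?",    # "Are you a complete fool?"
    ]
    for sample in samples:
        print(f"{sample!r} -> {classify_text(sample)}")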