File size: 1,334 Bytes
a15e210 48ac93e a15e210 4a76dec a15e210 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
import joblib
# Load the pretrained encoder and its matching tokenizer by model name.
# These statements run at import time, so importing this module triggers the loads.
rubert_model_name = "cointegrated/rubert-tiny2" # Example model name, adjust as needed
tokenizer = AutoTokenizer.from_pretrained(rubert_model_name)
model = AutoModel.from_pretrained(rubert_model_name)
# Load the serialized classifier that is applied on top of the text embeddings.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# loading; only point this at a file from a trusted source.
logreg_model_path = "model_data/logreg_model_v2.joblib"
logreg_model = joblib.load(logreg_model_path)
def embed_bert_cls(text, model, tokenizer):
    """Return unit-length first-token embeddings for ``text``.

    The text is tokenized to exactly 128 positions (longer input is
    truncated, shorter input is padded), passed through ``model`` with
    gradient tracking disabled, and the hidden state at sequence position 0
    is taken for every row of the batch. For BERT-style tokenizers that
    position presumably holds the [CLS] token, as the function name suggests.

    Args:
        text: Input forwarded unchanged to ``tokenizer``.
        model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object exposing ``last_hidden_state``.
        tokenizer: Callable that produces PyTorch tensors for ``model``.

    Returns:
        A 2-D NumPy array with one L2-normalized vector per batch row.
    """
    encoded = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Keep only the vector at sequence position 0 of each batch row.
    first_token_vectors = hidden_states[:, 0, :]
    # Scale each row to unit Euclidean length (p=2 along dim=1 are the defaults).
    unit_vectors = torch.nn.functional.normalize(first_token_vectors, p=2.0, dim=1)
    return unit_vectors.cpu().numpy()
def classify_text(text, model = model, tokenizer = tokenizer, classifier = logreg_model):
    """Embed ``text`` and return the label name for the classifier's prediction.

    Args:
        text: Input forwarded to ``embed_bert_cls``.
        model: Encoder used for the embedding; defaults to the module-level model.
        tokenizer: Tokenizer paired with ``model``; defaults to the module-level one.
        classifier: Object with a ``predict`` method that takes the embedding
            array; defaults to the module-level logistic regression model.

    Returns:
        ``'Good'``, ``'Neutral'`` or ``'Bad'`` for predicted class 0, 1 or 2.
        Only the first element of the prediction is used.

    Raises:
        KeyError: If the first predicted class is not 0, 1 or 2.
    """
    label_names = {0: 'Good', 1: 'Neutral', 2: 'Bad'}
    features = embed_bert_cls(text, model, tokenizer)
    predicted_classes = classifier.predict(features)
    return label_names[predicted_classes[0]]