File size: 1,334 Bytes
a15e210
 
 
 
 
 
 
 
 
 
 
 
 
 
48ac93e
a15e210
 
 
 
4a76dec
a15e210
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
import joblib



# Module-level setup: everything below runs once, at import time.
# The tokenizer and encoder are both loaded from the same checkpoint name and
# are used as the default arguments of classify_text().
rubert_model_name = "cointegrated/rubert-tiny2"  # checkpoint identifier passed to from_pretrained; change to use another encoder
tokenizer = AutoTokenizer.from_pretrained(rubert_model_name)
model = AutoModel.from_pretrained(rubert_model_name)

# Load the serialized classifier that consumes the encoder's embeddings.
# The path is relative, so it is resolved against the current working
# directory of the process, not against this file's location.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code
# while loading -- only point this at a trusted file.
# NOTE(review): presumably a fitted sklearn LogisticRegression trained on
# embeddings from the checkpoint above -- confirm before swapping either one.
logreg_model_path = "model_data/logreg_model_v2.joblib"
logreg_model = joblib.load(logreg_model_path)

def embed_bert_cls(text, model, tokenizer, max_length=128):
    """Return L2-normalized first-token embeddings of *text* as a NumPy array.

    The text is tokenized (padded/truncated to ``max_length`` tokens), run
    through ``model`` without gradient tracking, and the hidden state at
    sequence position 0 (the ``[CLS]`` slot for BERT-style tokenizers) is
    taken as the embedding of each sequence.

    Args:
        text: Input passed straight to ``tokenizer`` (a string, or whatever
            batch form the tokenizer accepts).
        model: Callable encoder accepting the tokenizer output as keyword
            arguments and returning an object with a 3-D
            ``last_hidden_state`` tensor. If it exposes a ``device``
            attribute (as Hugging Face models do), inputs are moved there
            before the forward pass.
        tokenizer: Callable returning a mapping of tensors when invoked with
            ``return_tensors="pt"``.
        max_length: Token length every sequence is padded/truncated to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        A 2-D ``numpy.ndarray`` with one unit-norm row per input sequence.
    """
    inputs = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # Tokenizers produce CPU tensors; feeding them to a model that lives on
    # another device (e.g. CUDA) raises a device-mismatch error. Models
    # without a ``device`` attribute are called with the inputs unchanged.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {
            name: value.to(device) if hasattr(value, "to") else value
            for name, value in inputs.items()
        }

    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 along the sequence axis holds the first (CLS) token.
    cls_vectors = outputs.last_hidden_state[:, 0, :]
    # Normalize each row independently (dim=1 is also the library default).
    cls_vectors = torch.nn.functional.normalize(cls_vectors, p=2, dim=1)
    return cls_vectors.cpu().numpy()

def classify_text(text, model=model, tokenizer=tokenizer, classifier=logreg_model):
    """Embed *text* and return the label predicted for it.

    The text is converted to an embedding with ``embed_bert_cls`` and passed
    to ``classifier.predict``; the first predicted class index is translated
    into one of the strings ``'Good'``, ``'Neutral'`` or ``'Bad'``.

    Args:
        text: Input forwarded to ``embed_bert_cls``.
        model: Encoder used for embedding; defaults to the module-level model.
        tokenizer: Tokenizer paired with ``model``; defaults to the
            module-level tokenizer.
        classifier: Object with a ``predict`` method; defaults to the
            module-level logistic-regression model.

    Returns:
        The label string for the first prediction only.

    Raises:
        KeyError: If the predicted class index is not 0, 1 or 2.
    """
    # NOTE(review): the index -> label mapping is hard-coded here; presumably
    # it mirrors the label encoding used when the classifier was trained.
    label_by_class = {0: 'Good', 1: 'Neutral', 2: 'Bad'}

    features = embed_bert_cls(text, model, tokenizer)
    first_prediction = classifier.predict(features)[0]
    return label_by_class[first_prediction]