Spaces:

DmitriyMineev
/

cpu

Sleeping

cpu

File size: 4,257 Bytes

5ba7acb

import streamlit as st
import joblib
import pandas as pd
from time import time
import transformers
import torch
from torch import nn


# Page header and the free-text input the user types a review into.
st.title("Классификация отзыва по кино")
user_input = st.text_area("Введите ваш отзыв на фильм:", height=100)


@st.cache_resource
def _load_log_reg():
    """Load the logistic-regression model once per server process.

    Streamlit re-executes the whole script on every interaction;
    ``st.cache_resource`` keeps the loaded object alive between reruns so the
    file is not read from disk each time.

    Returns:
        The object stored in ``models/model_log_reg.pth``. It is later called
        as ``model.predict([text])`` — presumably a scikit-learn pipeline
        (TF-IDF + LogisticRegression); verify against the training code.
    """
    # NOTE(security): weights_only=False performs a full unpickle, which can
    # execute arbitrary code. Only load files from a trusted source.
    return torch.load("models/model_log_reg.pth", map_location="cpu", weights_only=False)


@st.cache_resource
def _load_bert_backbone():
    """Load the pretrained ``cointegrated/rubert-tiny2`` encoder once per process."""
    return transformers.BertModel.from_pretrained("cointegrated/rubert-tiny2")


model1 = _load_log_reg()

bert_model = _load_bert_backbone()

class LSTMClassifier(nn.Module):
    """Embedding -> single LSTM -> linear head text classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output scores produced per sequence.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=64, output_dim=3):
        super().__init__()
        # Index 0 is the padding index, so its embedding is not updated by gradients.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=0,
        )
        # batch_first=True: inputs are laid out as (batch, sequence, feature).
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return raw class scores for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices, one row per sequence.

        Returns:
            Tensor with one row of ``output_dim`` scores per sequence.
        """
        embedded = self.embedding(x)
        # nn.LSTM returns (output, (h_n, c_n)); only the final hidden state is used.
        final_hidden = self.lstm(embedded)[1][0]
        # h_n is stacked per layer; the last entry belongs to the top layer.
        return self.fc(final_hidden[-1])

@st.cache_resource
def _load_lstm_classifier():
    """Build the LSTM classifier and load its weights once per server process.

    Streamlit re-runs the script on every interaction; caching avoids
    re-allocating the embedding table and re-reading the checkpoint each time.

    Returns:
        An ``LSTMClassifier`` in eval mode with weights from
        ``models/lstm_model.pth``.
    """
    # vocab_size must match the embedding shape stored in the checkpoint,
    # otherwise load_state_dict raises a size-mismatch error.
    classifier = LSTMClassifier(vocab_size=159014)
    classifier.load_state_dict(torch.load('models/lstm_model.pth', map_location="cpu"))
    classifier.eval()
    return classifier


model2 = _load_lstm_classifier()
class MyTinyBERT(nn.Module):
    """Frozen BERT encoder with a small trainable 3-class head on top.

    Args:
        bert: Encoder module to wrap. It is called as
            ``bert(input_ids=..., attention_mask=...)`` and must return an
            object with a ``last_hidden_state`` tensor whose last dimension
            is 312. When omitted, the module-level ``bert_model`` is used.
    """

    def __init__(self, bert=None):
        super().__init__()
        # Default to the Russian BERT loaded at module level.
        self.bert = bert_model if bert is None else bert
        # Freeze the encoder so only the classification head has gradients.
        for param in self.bert.parameters():
            param.requires_grad = False
        # Classification head. The layer positions must not change: saved
        # checkpoints address the weights as "linear.0.*" and "linear.2.*".
        self.linear = nn.Sequential(
            nn.Linear(312, 256),
            # Explicit dim=1 is what the deprecated implicit choice resolved
            # to for the 2-D (batch, features) input used here; spelling it
            # out removes the deprecation warning without changing results.
            nn.Softmax(dim=1),
            # Output over 3 classes.
            nn.Linear(256, 3),
        )

    def forward(self, input_ids, attention_mask):
        """Return raw 3-class scores for a batch of tokenized texts.

        Args:
            input_ids: Token ids passed through to the encoder.
            attention_mask: Attention mask passed through to the encoder.

        Returns:
            Tensor with one row of 3 scores per input sequence.
        """
        # Run the encoder.
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Take the hidden state at position 0 of every sequence and
        # L2-normalize it (normalize() defaults: p=2, dim=1).
        normed_bert_out = nn.functional.normalize(bert_out.last_hidden_state[:, 0, :])
        # Classification head.
        return self.linear(normed_bert_out)

@st.cache_resource
def _load_tokenizer():
    """Load the ``cointegrated/rubert-tiny2`` tokenizer once per server process."""
    return transformers.BertTokenizer.from_pretrained("cointegrated/rubert-tiny2")


@st.cache_resource
def _load_bert_classifier():
    """Build the BERT-based classifier and load its weights once per process.

    Streamlit re-runs the script on every interaction; caching avoids
    re-reading the checkpoint from disk each time.

    Returns:
        A ``MyTinyBERT`` in eval mode with weights from ``models/model111.pth``.
    """
    classifier = MyTinyBERT()
    classifier.load_state_dict(torch.load('models/model111.pth', map_location="cpu"))
    classifier.eval()
    return classifier


tokenizer = _load_tokenizer()


model3 = _load_bert_classifier()


# Human-readable names for class indices 0, 1 and 2 (shown to the user as-is).
labels = ['Плохой отзыв', 'Нейтральный отзыв', 'Прекрасный отзыв']

# Функция для предобработки отзыва
def preprocess_review(review_text, vocab):
    """Turn a raw review into a list of vocabulary indices.

    The text is lower-cased and split on whitespace, then every word is
    looked up in ``vocab``. No emoji, punctuation or stop-word removal is
    performed here, so a word with punctuation attached is looked up as-is.

    Args:
        review_text: The review as a string.
        vocab: Mapping from word to integer index (anything with ``.get``).

    Returns:
        A list with one index per whitespace-separated word; words missing
        from ``vocab`` are mapped to 0. Empty for blank input.
    """
    lowered_words = review_text.lower().split()
    # Out-of-vocabulary words fall back to index 0.
    return [vocab.get(word, 0) for word in lowered_words]

@st.cache_resource
def _load_vocab():
    """Load the word -> index vocabulary for the LSTM once per server process."""
    return joblib.load('models/vocab.pkl')


vocab = _load_vocab()

# Skip blank / whitespace-only input: it produces no tokens, and an empty
# sequence cannot be fed to the LSTM model.
if user_input and user_input.strip():
    st.subheader("🔍 Предсказания моделей")

    # Each timed section below covers preprocessing + inference for one model
    # and stops the clock before anything is rendered.

    # TF-IDF + Logistic Regression
    start = time()
    pred = model1.predict([user_input])[0]
    end = time()
    st.write(labels[pred], ' : LogisticRegression')
    st.write(round(end - start, 3), "сек")

    # BERT-based classifier
    start = time()
    # Input is truncated / padded to exactly 18 tokens.
    encoding = tokenizer(
        user_input,
        truncation=True,
        padding='max_length',
        max_length=18,
        return_tensors='pt',
    )
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model3(encoding['input_ids'], encoding['attention_mask'])
    # Batch of one -> plain Python int usable as a list index.
    bert_label = outputs.argmax(1).item()
    end = time()
    st.write(labels[bert_label], ' : BertModel')
    st.write(round(end - start, 3), "сек")

    # LSTM classifier
    start = time()
    token_ids = preprocess_review(user_input, vocab)
    # Explicit integer dtype: the embedding layer requires integer indices.
    input_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
    with torch.no_grad():
        output = model2(input_tensor)
    label = output.argmax(dim=1).item()
    end = time()
    st.write(labels[label], ' : LSTM')
    st.write(round(end - start, 3), "сек")