Spaces:

DmitriyMineev
/

cpu

Sleeping

cpu / app.py

5ba7acb 28 days ago

4.26 kB

	import streamlit as st
	import joblib
	import pandas as pd
	from time import time
	import transformers
	import torch
	from torch import nn


	st.title("Классификация отзыва по кино")
	user_input = st.text_area("Введите ваш отзыв на фильм:", height=100)


	@st.cache_resource
	def _load_log_reg_model():
	    """Load the pickled classifier stored in ``models/model_log_reg.pth``.

	    Streamlit re-executes the whole script on every widget interaction;
	    ``st.cache_resource`` keeps a single loaded instance per server process
	    so the file is not re-read and unpickled on each rerun.
	    """
	    # SECURITY NOTE: weights_only=False performs a full unpickle, which can
	    # execute arbitrary code. Acceptable only because the file ships with
	    # the app; never point this at user-supplied files.
	    return torch.load("models/model_log_reg.pth", map_location="cpu", weights_only=False)


	@st.cache_resource
	def _load_bert_backbone():
	    """Load the pretrained ``cointegrated/rubert-tiny2`` encoder once and reuse it."""
	    return transformers.BertModel.from_pretrained("cointegrated/rubert-tiny2")


	# Presumably a TF-IDF + logistic-regression pipeline (judging by the file
	# name); it is later called with ``.predict([text])``.
	model1 = _load_log_reg_model()

	# Shared encoder instance picked up by MyTinyBERT.__init__.
	bert_model = _load_bert_backbone()

	class LSTMClassifier(nn.Module):
	    """Embedding -> single LSTM -> linear head text classifier.

	    Args:
	        vocab_size: number of rows in the embedding table.
	        embed_dim: size of each token embedding.
	        hidden_dim: size of the LSTM hidden state.
	        output_dim: number of output logits.
	    """

	    def __init__(self, vocab_size, embed_dim=128, hidden_dim=64, output_dim=3):
	        super().__init__()
	        # Index 0 is the padding token; its embedding is kept at zero.
	        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
	        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
	        self.fc = nn.Linear(hidden_dim, output_dim)

	    def forward(self, x):
	        """Return logits computed from the final hidden state of the LSTM.

	        ``x`` holds token indices with the batch dimension first
	        (the LSTM is built with ``batch_first=True``).
	        """
	        embedded = self.embedding(x)
	        # Only the final hidden state is needed; per-step outputs and the
	        # cell state are discarded.
	        _outputs, (hidden_state, _cell_state) = self.lstm(embedded)
	        last_layer_hidden = hidden_state[-1]
	        return self.fc(last_layer_hidden)

	@st.cache_resource
	def _load_lstm_model():
	    """Build the LSTM classifier and load its weights once per server process.

	    Streamlit re-runs the script on every interaction; caching avoids
	    rebuilding the large embedding table and re-reading the checkpoint
	    each time.
	    """
	    # vocab_size must match the embedding table stored in the checkpoint.
	    model = LSTMClassifier(vocab_size=159014)
	    model.load_state_dict(torch.load('models/lstm_model.pth', map_location="cpu"))
	    # Inference only: switch off any training-time behaviour.
	    model.eval()
	    return model


	model2 = _load_lstm_model()
	class MyTinyBERT(nn.Module):
	    """Frozen BERT encoder followed by a small trainable classification head.

	    The encoder is the module-level ``bert_model``; its parameters are
	    frozen so only ``self.linear`` would receive gradients.
	    """

	    def __init__(self):
	        super().__init__()
	        # Reuse the pretrained Russian BERT encoder loaded at module level.
	        self.bert = bert_model
	        # Freeze the encoder parameters.
	        for param in self.bert.parameters():
	            param.requires_grad = False
	        # Custom classification head on top of the 312-dim encoder output.
	        self.linear = nn.Sequential(
	            nn.Linear(312, 256),
	            # dim=1 is what the deprecated implicit-dim rule selects for the
	            # 2-D (batch, features) input used here, so results are
	            # unchanged while the per-call deprecation warning goes away.
	            nn.Softmax(dim=1),
	            # Three output classes.
	            nn.Linear(256, 3),
	        )

	    def forward(self, input_ids, attention_mask):
	        """Return class logits for a batch of tokenised texts.

	        Args:
	            input_ids: token ids passed to the encoder.
	            attention_mask: attention mask passed to the encoder.
	        """
	        # Run the encoder.
	        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
	        # Take the hidden state at sequence position 0 and normalise it
	        # (``normalize`` defaults) before classification.
	        normed_bert_out = nn.functional.normalize(bert_out.last_hidden_state[:, 0, :])
	        # Classification head.
	        out = self.linear(normed_bert_out)
	        return out

	@st.cache_resource
	def _load_tokenizer():
	    """Load the ``cointegrated/rubert-tiny2`` tokenizer once per server process."""
	    return transformers.BertTokenizer.from_pretrained("cointegrated/rubert-tiny2")


	@st.cache_resource
	def _load_bert_classifier():
	    """Build MyTinyBERT and load its weights once per server process.

	    Streamlit re-runs the script on every interaction; caching avoids
	    re-reading the checkpoint on each rerun.
	    """
	    model = MyTinyBERT()
	    model.load_state_dict(torch.load('models/model111.pth', map_location="cpu"))
	    # Inference only: switch off any training-time behaviour.
	    model.eval()
	    return model


	tokenizer = _load_tokenizer()

	model3 = _load_bert_classifier()

	# Human-readable class names, indexed by the predicted class id.
	labels = ['Плохой отзыв', 'Нейтральный отзыв', 'Прекрасный отзыв']

	# Turn a raw review string into a list of vocabulary indices.
	def preprocess_review(review_text, vocab):
	    """Map a review to vocabulary indices.

	    The text is lower-cased and split on whitespace; nothing else
	    (punctuation, emoji, stop-words) is removed.

	    Args:
	        review_text: raw review text.
	        vocab: mapping from word to integer index.

	    Returns:
	        A list with one index per whitespace-separated word; words missing
	        from ``vocab`` are mapped to 0.
	    """
	    lowered = review_text.lower()
	    indices = []
	    for word in lowered.split():
	        # Out-of-vocabulary words fall back to index 0.
	        indices.append(vocab.get(word, 0))
	    return indices

	@st.cache_resource
	def _load_vocab():
	    """Load the word-to-index vocabulary once per server process."""
	    return joblib.load('models/vocab.pkl')


	vocab = _load_vocab()

	# A whitespace-only review yields no tokens, which would crash the LSTM's
	# embedding lookup, so it is treated the same as empty input.
	if user_input.strip():
	    st.subheader("🔍 Предсказания моделей")

	    # --- Logistic regression (model1 receives the raw text) ---
	    start = time()
	    pred = model1.predict([user_input])[0]
	    end = time()
	    st.write(labels[pred], ' : LogisticRegression')
	    st.write(round(end - start, 3), "сек")

	    # --- BERT classifier ---
	    # Tokenisation is timed together with inference so that every model's
	    # figure covers its own preprocessing.
	    start = time()
	    encoding = tokenizer(user_input, truncation=True, padding='max_length', max_length=18, return_tensors='pt')
	    # No gradients are needed for inference; skip building the autograd graph.
	    with torch.no_grad():
	        outputs = model3(encoding['input_ids'], encoding['attention_mask'])
	    end = time()
	    # .item() converts the single-element tensor to a plain int index.
	    st.write(labels[outputs.argmax(1).item()], ' : BertModel')
	    st.write(round(end - start, 3), "сек")

	    # --- LSTM classifier ---
	    start = time()
	    lod = preprocess_review(user_input, vocab)
	    # Explicit integer dtype: nn.Embedding requires integer indices.
	    input_tensor = torch.tensor(lod, dtype=torch.long).unsqueeze(0)
	    with torch.no_grad():
	        output = model2(input_tensor)
	    label = output.argmax(dim=1).item()
	    # Stop the clock before rendering so UI time is not counted.
	    end = time()
	    st.write(labels[label], ' : LSTM')
	    st.write(round(end - start, 3), "сек")