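# Page-header residue kept as a comment (apparently author / commit message / commit hash):
# romnatall / ребаланс (rebalance) / 63bff46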
import random
import streamlit as st
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
# Movie catalogue. Columns read elsewhere in this file: 'description', 'name',
# 'actors', 'poster', 'genres', 'country', 'age', 'year', 'imdb', 'kp', 'link'.
movies = pd.read_csv('data/data.csv')
# Labeling mode is hard-coded off; the sidebar checkbox that used to drive it
# is disabled.
toggle_state = False  # was: st.sidebar.checkbox("labeling mode")
# Free-text query; pre-filled with a sample Russian query.
input_search = st.text_input('Search', value='собака очень преданно ждала хозяина на вокзале')
# Sidebar weights (each 0.0-1.0) used by predict_rating to mix the similarity signals.
tfidf_slider = st.sidebar.slider("tf_idf_description", 0.0, 1.0, 0.9)
# Default weight for the title similarity is 0.66/100 = 0.0066.
tf_idf_name = st.sidebar.slider("tf_idf_name", 0.0, 1.0, 0.66/100)
tf_idf_actors = st.sidebar.slider("tf_idf_actors", 0.0, 1.0, 0.9)
bert_weight = st.sidebar.slider("bert_weight", 0.0, 1.0, 0.5)
# Number of result cards to show (1-100, default 10).
show_num = st.sidebar.slider("show_num", 1, 100, 10)
# Precomputed embedding matrix compared against the query embedding in
# predict_rating. Presumably one row per row of `movies`, in the same order
# -- TODO confirm.
data = np.load('data/embeddings_bert.npy')
def top_indices(array, n, upsc=False):
    """Return the indices of the ``n`` best entries of ``array``.

    Args:
        array: 1-D array of scores.
        n: how many indices to return.
        upsc: when truthy, rank ascending (smallest first); otherwise
            descending (largest first).

    Returns:
        A NumPy array with at most ``n`` indices into ``array``.

    Side effects:
        Stores ``array`` in ``st.session_state["pred"]``.
    """
    st.session_state["pred"] = array
    order = np.argsort(array)
    if not upsc:
        # argsort is ascending; flip it to get the highest scores first.
        order = order[::-1]
    return order[:n]
@st.cache_resource
def get_embeddings():
    """Load the ``cointegrated/rubert-tiny2`` encoder and its tokenizer.

    Decorated with ``st.cache_resource``, so the pair is created once and reused.

    Returns:
        A ``(model, tokenizer)`` tuple -- note the order.
    """
    checkpoint = "cointegrated/rubert-tiny2"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)
    # The model stays on its default device; no .cuda() call is made here.
    return model, tokenizer
@st.cache_data
def embed_bert_cls(text):
    """Embed ``text`` with the cached encoder and return its normalized CLS vector.

    Args:
        text: the string to embed.

    Returns:
        A 1-D NumPy array: the first-token (CLS) hidden state of the first
        sequence in the batch, normalized with ``torch.nn.functional.normalize``.
    """
    model, tokenizer = get_embeddings()
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Move every tokenizer output onto whatever device the model lives on.
    on_device = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    with torch.no_grad():
        output = model(**on_device)
    cls_vectors = output.last_hidden_state[:, 0, :]
    unit_vectors = torch.nn.functional.normalize(cls_vectors)
    return unit_vectors[0].cpu().numpy()
@st.cache_resource
def getmodels():
    """Load the pickled models and precompute the TF-IDF matrices of the catalogue.

    Returns:
        A 6-tuple ``(logreg, vectorizer, vectorizer_actors, tfidf_matrix,
        tfidf_matrix2, tfidf_actors)`` where the three matrices are the
        transforms of the ``description``, ``name`` and ``actors`` columns of
        the module-level ``movies`` frame respectively.
    """
    def _unpickle(path):
        # Pickle files are assumed to be trusted, locally shipped artifacts.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    logreg = _unpickle('data/logreg.pkl')
    vectorizer = _unpickle('data/tf_idf_vectorizer.pkl')
    vectorizer_actors = _unpickle('data/vectorizer_actors.pkl')

    description_matrix = vectorizer.transform(movies['description'])
    # Titles go through the same vectorizer as the descriptions.
    name_matrix = vectorizer.transform(movies['name'])
    # Missing actor lists are treated as empty strings.
    actors_matrix = vectorizer_actors.transform(movies['actors'].fillna(''))
    return (
        logreg,
        vectorizer,
        vectorizer_actors,
        description_matrix,
        name_matrix,
        actors_matrix,
    )
@st.cache_data
def _combined_scores(input_search, tfidf_slider, tf_idf_name, tf_idf_actors, bert_weight):
    """Return the weighted relevance score of every movie for a query.

    Only this pure computation is cached: the result depends on nothing but
    the arguments (plus the module-level models and embeddings).

    Args:
        input_search: the user's query text.
        tfidf_slider: weight of the description TF-IDF cosine similarity.
        tf_idf_name: weight of the title TF-IDF cosine similarity.
        tf_idf_actors: weight of the actors TF-IDF cosine similarity.
        bert_weight: weight of the embedding cosine similarity.

    Returns:
        1-D NumPy array with one score per row of ``data``.
    """
    # The pickled logistic regression is loaded by getmodels() but its
    # prediction was never used in the final score, so it is not evaluated.
    _, vectorizer, vectorizer_actors, desc_matrix, name_matrix, actors_matrix = getmodels()

    query_emb = embed_bert_cls(input_search)
    query_tfidf = vectorizer.transform([input_search])
    query_actors = vectorizer_actors.transform([input_search])

    sim_desc = cosine_similarity(query_tfidf, desc_matrix).reshape(-1)
    sim_name = cosine_similarity(query_tfidf, name_matrix).reshape(-1)
    sim_actors = cosine_similarity(query_actors, actors_matrix).reshape(-1)
    sim_emb = cosine_similarity(data, query_emb.reshape(1, -1)).reshape(-1)

    return (
        sim_desc * tfidf_slider
        + sim_name * tf_idf_name
        + sim_emb * bert_weight
        + sim_actors * tf_idf_actors
    ).reshape(-1)


def predict_rating(input_search, tfidf_slider, tf_idf_name, tf_idf_actors, bert_weight):
    """Rank the catalogue for a query and return the indices of the best matches.

    The entry point itself is deliberately NOT cached. ``st.cache_data`` keys
    only on the arguments, so caching here would (a) ignore changes of the
    module-level ``show_num`` slider and (b) skip the ``st.session_state``
    write on a cache hit, leaving stale scores on the cards.

    Args:
        input_search: the user's query text.
        tfidf_slider: weight of the description TF-IDF similarity.
        tf_idf_name: weight of the title TF-IDF similarity.
        tf_idf_actors: weight of the actors TF-IDF similarity.
        bert_weight: weight of the embedding similarity.

    Returns:
        NumPy array with the indices of the ``show_num`` highest-scoring movies,
        best first.

    Side effects:
        Stores the full score vector in ``st.session_state["pred"]``.
    """
    y = _combined_scores(input_search, tfidf_slider, tf_idf_name, tf_idf_actors, bert_weight)
    st.session_state["pred"] = y
    return top_indices(y, show_num, upsc=False)
def saverank(index, new_X, new_y):
    """Append one labelled example to the on-disk training set.

    The feature rows live in ``X.npy`` and the labels in ``y.npy`` (both in the
    current working directory). When neither file exists yet, the data set is
    started from scratch instead of failing on the very first label.

    Args:
        index: movie index the label belongs to; accepted for call-site
            compatibility but not used.
        new_X: feature vector of the example; flattened into a single row.
        new_y: the label (rating) of the example.

    Raises:
        FileNotFoundError: if exactly one of the two files is missing, because
            appending then would silently misalign features and labels.
    """
    row = np.asarray(new_X).reshape(1, -1)
    label = np.array([new_y])

    def _load_or_none(path):
        # A missing file means "no data yet", not an error.
        try:
            return np.load(path)
        except FileNotFoundError:
            return None

    dx = _load_or_none('X.npy')
    dy = _load_or_none('y.npy')
    if (dx is None) != (dy is None):
        raise FileNotFoundError("X.npy and y.npy must either both exist or both be absent")
    if dx is None:
        # Zero-length slices keep the dtype and row width of the new example.
        dx, dy = row[:0], label[:0]

    np.save('X.npy', np.concatenate((dx, row)))
    np.save('y.npy', np.concatenate((dy, label)))
def ask_rating(movie, index):
    """Render five rating buttons (1-5) in a row and persist a clicked rating.

    Args:
        movie: the movie record being rated (not used by the body).
        index: movie index; used to build unique widget keys and to pick the
            feature row from ``st.session_state["X"]``.

    Side effects:
        When a button was pressed in this run, calls ``saverank`` with
        ``st.session_state["X"][index]`` and the chosen rating.
    """
    rating = 0
    # One column per rating value; the button key is "<value><index>".
    for value, column in enumerate(st.columns(5), start=1):
        with column:
            pressed = st.button(str(value), key=str(value) + str(index))
        if pressed:
            rating = value
    if rating > 0:
        saverank(index, st.session_state["X"][index], rating)
def display_rating(rating):
    """Render a 0-10 rating as a string of moon emoji on a five-symbol scale.

    Args:
        rating: numeric rating on a 0-10 scale.

    Returns:
        One full moon per whole two rating points, followed by a half moon
        when the remainder (``rating % 2``) is at least 0.5.
    """
    full_moons = int(rating / 2)
    leftover = rating % 2
    half_moon = '🌗' if leftover >= 0.5 else ''
    return '🌕' * full_moons + half_moon
def display_movie_card(df, index):
    """Render one movie as a two-column card.

    The left column has the poster, genre, country, age rating and, when
    available, the relevance score from ``st.session_state["pred"]``. The right
    column has the title, a description that can be expanded, actors, ratings
    and a link. A separator line is written after the card.

    Args:
        df: DataFrame with the movie catalogue.
        index: positional row index of the movie inside ``df``; also used for
            widget keys and to look up the score.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost -- confirm which widgets belong to which column.
    """
    import html  # local import: only needed to escape the title below

    movie = df.iloc[index]
    col1, col2 = st.columns([1, 3])

    with col1:
        st.image(movie['poster'], use_column_width=True)
        st.write(f"Жанр: {movie['genres']}")
        st.write(f"Страна: {movie['country']}")
        st.write(f"рейтинг: {movie['age']}")
        if "pred" in st.session_state:
            st.write(st.session_state["pred"][index])

    with col2:
        year = str(int(movie['year'])) if not np.isnan(movie['year']) else ""
        # The title is data, not markup: escape it before embedding it in raw
        # HTML so characters such as '<' or '&' cannot break or inject tags.
        safe_name = html.escape(str(movie['name']))
        st.markdown(
            f"<h2 style='text-align: left;'>{safe_name} ({year})</h2>",
            unsafe_allow_html=True,
        )

        full_text = movie['description']
        if len(full_text) > 200:
            # Cut at 200 characters, then drop the last (possibly partial) word.
            description = ' '.join(full_text[:200].split(" ")[:-1]) + '...'
        else:
            description = full_text

        # Placeholder created first so the text renders above the button.
        e = st.empty()
        b = toggle_state
        if full_text != description and not toggle_state:
            b = st.button("раскрыть описание", key=index)
        with e:
            if b:
                st.write(full_text)
            else:
                st.write(description)

        if toggle_state:
            ask_rating(movie, index)
            # Named so that it does not shadow the builtin ``input``.
            query_text = st.text_input(' ', key="search" + str(index))
            if query_text:
                emb = embed_bert_cls(query_text)
                # First 312 features of the stored row + the new query embedding.
                fullemb = np.concatenate((st.session_state["X"][index, :312], emb))
                saverank(index, fullemb, 5)

        st.write(f"Актеры: {movie['actors']}")
        imdb, kp = st.columns([1, 2])
        with imdb:
            st.write(f"IMDB: {display_rating(movie['imdb'])}" if not np.isnan(movie['imdb']) else "")
        with kp:
            st.write(f"Кинопоиск: { display_rating(movie['kp'])}" if not np.isnan(movie['kp']) else "")
        st.write(f"[смотреть]({movie['link']})")

    st.write("----------------------")
# Per-session memo of random index lists, keyed by request text.
if "reqs" in st.session_state:
    reqs = st.session_state["reqs"]
else:
    reqs = {}


@st.cache_data
def getnums(df, size=0, text=''):
    """Return a list of random row indices for ``text``, reusing earlier draws.

    Args:
        df: DataFrame whose length bounds the random indices.
        size: how many indices to draw on a first request.
        text: key under which the draw is remembered in ``reqs``.

    Returns:
        The list stored in ``reqs[text]``; it is drawn with
        ``np.random.randint(len(df), size=size)`` when ``text`` is new.

    Side effects:
        On a new ``text``, updates ``reqs`` and writes it to
        ``st.session_state["reqs"]``.
    """
    if text not in reqs:
        reqs[text] = list(np.random.randint(len(df), size=size))
        st.session_state["reqs"] = reqs
    return reqs[text]
# Main page body: rank the catalogue for a non-empty query and render one card
# per result, best match first.
if input_search:
    ranked_indices = predict_rating(
        input_search, tfidf_slider, tf_idf_name, tf_idf_actors, bert_weight
    )
    for movie_idx in ranked_indices:
        display_movie_card(movies, movie_idx)
def ask_rating(movie):
    """Render five rating buttons (1-5) in a row and return the clicked rating.

    NOTE(review): this rebinds the two-argument ``ask_rating(movie, index)``
    defined earlier in the file, and the buttons carry no ``key`` -- confirm
    this definition is still wanted.

    Args:
        movie: the movie record being rated (not used by the body).

    Returns:
        The rating of the pressed button (the highest one if several report
        pressed), or 0 when no button was pressed in this run.
    """
    rating = 0
    # One column per rating value, laid out left to right.
    for value, column in enumerate(st.columns(5), start=1):
        with column:
            if st.button(str(value)):
                rating = value
    return rating