import random import streamlit as st import pandas as pd import numpy as np import torch from transformers import AutoTokenizer, AutoModel import numpy as np from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import pairwise_distances import faiss from sklearn.feature_extraction.text import TfidfVectorizer import pickle movies = pd.read_csv('data/data.csv') toggle_state = False#st.sidebar.checkbox("режим разметки") input_search = st.text_input('Search', value='собака очень преданно ждала хозяина на вокзале') tfidf_slider = st.sidebar.slider("tf_idf_description", 0.0, 1.0, 0.9) tf_idf_name = st.sidebar.slider("tf_idf_name", 0.0, 1.0, 0.66/100) tf_idf_actors = st.sidebar.slider("tf_idf_actors", 0.0, 1.0, 0.9) bert_weight = st.sidebar.slider("bert_weight", 0.0, 1.0, 0.5) show_num = st.sidebar.slider("show_num", 1, 100, 10) data = np.load('data/embeddings_bert.npy') def top_indices(array, n,upsc=False): # Получаем индексы элементов, отсортированных по убыванию st.session_state["pred"] = array sorted_indices = np.argsort(array)[::1 if upsc else -1] # Выбираем первые n индексов top_n_indices = sorted_indices[:n] return top_n_indices @st.cache_resource def get_embeddings(): tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2") model = AutoModel.from_pretrained("cointegrated/rubert-tiny2") # model.cuda() return model, tokenizer @st.cache_data def embed_bert_cls(text, ): model, tokenizer = get_embeddings() t = tokenizer(text, padding=True, truncation=True, return_tensors='pt') with torch.no_grad(): model_output = model(**{k: v.to(model.device) for k, v in t.items()}) embeddings = model_output.last_hidden_state[:, 0, :] embeddings = torch.nn.functional.normalize(embeddings) return embeddings[0].cpu().numpy() @st.cache_resource def getmodels(): with open('data/logreg.pkl', 'rb') as f: logreg = pickle.load(f) with open('data/tf_idf_vectorizer.pkl', 'rb') as f: vectorizer = pickle.load(f) with open('data/vectorizer_actors.pkl', 'rb') as f: vectorizer_actors = pickle.load(f) tfidf_matrix = vectorizer.transform(movies['description']) tfidf_matrix2 = vectorizer.transform(movies['name']) tfidf_actors = vectorizer_actors.transform(movies['actors'].fillna('')) return logreg, vectorizer,vectorizer_actors ,tfidf_matrix,tfidf_matrix2,tfidf_actors @st.cache_data def predict_rating(input_search,tfidf_slider,tf_idf_name,tf_idf_actors,bert_weight): logreg, vectorizer,vectorizer_actors,tfidf_matrix,tfidf_matrix2,tfidf_actors=getmodels() emb = embed_bert_cls(input_search) X=np.column_stack((data, np.tile(emb, (data.shape[0], 1)))) user_tfidf = vectorizer.transform([input_search]) user_actors = vectorizer_actors.transform([input_search]) similarity_actors=cosine_similarity(user_actors, tfidf_actors).reshape(-1) similarity_scores_desc = cosine_similarity(user_tfidf, tfidf_matrix) similarity_scores_name = cosine_similarity(user_tfidf, tfidf_matrix2) y_log = logreg.predict(X) y_emb = cosine_similarity(data, emb.reshape(1, -1)).reshape(-1) y=(similarity_scores_desc*tfidf_slider +similarity_scores_name*tf_idf_name +y_emb*bert_weight +similarity_actors*tf_idf_actors ).reshape(-1) st.session_state["pred"]=y return top_indices(y, show_num,upsc=False) def saverank(index, new_X,new_y): dx=np.load('X.npy') dy=np.load('y.npy') dx=np.concatenate((dx, new_X.reshape(1,-1))) dy=np.concatenate((dy,np.array([new_y]))) np.save('X.npy',dx) np.save('y.npy',dy) def ask_rating(movie,index): # Создаем переменную для хранения оценки rating = 0 # Создаем горизонтальный столбец col1, col2, col3, col4, col5 = st.columns(5) # В каждом столбце выводим кнопку оценки with col1: b1 = st.button("1",key="1"+str(index)) with col2: b2 = st.button("2" ,key="2"+str(index)) with col3: b3 = st.button("3",key="3"+str(index)) with col4: b4 = st.button("4",key="4"+str(index)) with col5: b5 = st.button("5",key="5"+str(index)) if b1: rating = 1 if b2: rating = 2 if b3: rating = 3 if b4: rating = 4 if b5: rating = 5 if rating>0: saverank(index,st.session_state["X"][index],rating) def display_rating(rating): stars = int(rating / 2) # Переводим рейтинг из 0-10 в 0-5 и округляем до целого remainder = rating % 2 # Доля рейтинга, которая не переводится в целое количество звезд star_str = '🌕' * stars if remainder >= 0.5: star_str += '🌗' # Добавляем половину звезды в виде половины луны, если есть доля больше или равная 0.5 return star_str def display_movie_card(df, index): movie = df.iloc[index] col1, col2 = st.columns([1, 3]) with col1: st.image(movie['poster'], use_column_width=True) st.write(f"Жанр: {movie['genres']}") st.write(f"Страна: {movie['country']}") st.write(f"рейтинг: {movie['age']}") if "pred" in st.session_state: st.write(st.session_state["pred"][index]) with col2: year = str(int(movie['year'])) if not np.isnan(movie['year']) else "" st.markdown(f"