Spaces:

RMakushkin
/

test_2_2

Sleeping

App Files Files Community

RMakushkin committed on Dec 15, 2023

Commit

a18e62f

•

1 Parent(s): 56252f7

Upload 13 files

Browse files

Files changed (14) hide show

.gitattributes +1 -0
images/.DS_Store +0 -0
images/1.jpeg +0 -0
images/2.jpeg +0 -0
images/3.jpeg +0 -0
images/4.jpeg +0 -0
images/5.jpeg +0 -0
images/mem.jpg +0 -0
images/ser2.png +3 -0
pages/.DS_Store +0 -0
pages/01_🎥_Serials.py +105 -0
pages/02_🔥_Results.py +34 -0
pages/__init__.py +0 -0
pages/__pycache__/__init__.cpython-310.pyc +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+images/ser2.png filter=lfs diff=lfs merge=lfs -text

images/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

images/1.jpeg ADDED Viewed

images/2.jpeg ADDED Viewed

images/3.jpeg ADDED Viewed

images/4.jpeg ADDED Viewed

images/5.jpeg ADDED Viewed

images/mem.jpg ADDED Viewed

images/ser2.png ADDED Viewed

Git LFS Details

SHA256: 035a31442decd33706b2ffff57c59ef4a2363970e4c2c9e91d6a5efef4dd9191
Pointer size: 132 Bytes
Size of remote file: 1.34 MB

pages/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

pages/01_🎥_Serials.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import ast
+import random
+import torch
+import time
+from joblib import load
+from transformers import BertTokenizer, BertModel
+from sklearn.metrics.pairwise import cosine_similarity
+# import faiss  (NOTE(review): left disabled; ranking below uses sklearn's brute-force cosine_similarity)
+"""
+## Сервис умного поиска сериалов 📽️
+"""  # bare string expression; presumably rendered as the page heading by Streamlit "magic" — confirm
+# Read the series embedding vectors (assumes row i corresponds to row i of df below — TODO confirm)
+embeddings = np.loadtxt('data/embs.txt')
+# Paths to the saved model and tokenizer
+model_path = "model"
+tokenizer_path = "tokenizer"
+# Load the model. NOTE(review): Streamlit reruns this script on each interaction, so model/tokenizer/data are reloaded every time — consider st.cache_resource / st.cache_data
+loaded_model = BertModel.from_pretrained(model_path)
+# Load the tokenizer
+loaded_tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
+df = pd.read_csv('data/data.csv')
+df['ganres'] = df['ganres'].apply(lambda x: ast.literal_eval(x))  # parse the stored literal; iterated below as a collection of genre names
+df['description'] = df['description'].astype(str)
+st.write(f'<p style="font-family: Arial, sans-serif; font-size: 24px; ">Наш сервис насчитывает \
+         {len(df)} лучших сериалов</p>', unsafe_allow_html=True)
+st.image('images/ser2.png')
+ganres_lst = sorted(['драма', 'документальный', 'биография', 'комедия', 'фэнтези', 'приключения', 'для детей', 'мультсериалы',
+              'мелодрама', 'боевик', 'детектив', 'фантастика', 'триллер', 'семейный', 'криминал', 'исторический', 'музыкальные',
+              'мистика', 'аниме', 'ужасы', 'спорт', 'скетч-шоу', 'военный', 'для взрослых', 'вестерн'])
+st.sidebar.header('Панель инструментов :gear:')
+choice_g = st.sidebar.multiselect("Выберите жанры", options=ganres_lst)
+n = st.sidebar.selectbox("Количество отображаемых элементов на странице", options=[5, 10, 15, 20, 30])
+st.sidebar.info("📚 Для наилучшего соответствия, запрос должен быть максимально развернутым")
+text = st.text_input('Введите описание для рекомендации')
+# Vectorize the query. NOTE(review): this runs on every rerun, before the `text and button` check below, so the model is invoked even for an empty query
+loaded_model.eval()
+tokens = loaded_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+start_time = time.time()  # NOTE(review): start_time is never read in the visible code
+tokens = {key: value.to(loaded_model.device) for key, value in tokens.items()}
+# Pass the tokens to the model to obtain embeddings
+with torch.no_grad():
+    output = loaded_model(**tokens)
+# The query embedding is the mean of the last hidden state over dim=1, converted to a NumPy vector
+user_embedding = output.last_hidden_state.mean(dim=1).squeeze().cpu().detach().numpy()
+cosine_similarities = cosine_similarity(embeddings, user_embedding.reshape(1, -1))  # one similarity per row of embeddings (single query column)
+button = st.button('Отправить запрос', type="primary")
+if text and button:
+    if len(choice_g) == 0:
+        choice_g = ganres_lst  # no genre selected: treat every genre as selected
+    # random = random.sample(range(len(df)), 50)
+    top_ind = np.unravel_index(np.argsort(cosine_similarities, axis=None)[-30:][::-1], cosine_similarities.shape)  # (row, col) indices of the 30 highest similarities, best first
+    confidence = cosine_similarities[top_ind]  # similarity scores for those indices, in the same order
+    top_ind = list(top_ind[0])  # keep only the row indices
+    conf_dict = {}  # row index -> similarity score
+    for value, conf in zip(top_ind, confidence):
+        conf_dict[int(value)] = conf
+    # st.write(conf_dict)
+    output_dict = {}  # row index -> genres, for candidates sharing at least one genre with choice_g
+    for i in top_ind:
+        for ganre in df['ganres'][i]:
+            if ganre in choice_g:
+                output_dict[i] = df['ganres'][i]
+    # st.write('output_dict')
+    sorted_lst = sorted(output_dict.items(), key=lambda x: len(set(x[1]) & set(choice_g)), reverse=True)  # most shared genres first; stable sort keeps similarity order among ties
+    n_lst = [i[0] for i in sorted_lst[:n]]  # row indices of the first n results to display
+    st.write(f'<p style="font-family: Arial, sans-serif; font-size: 18px; text-align: center;"><strong>Всего подобранных \
+         рекомендаций {len(sorted_lst)}</strong></p>', unsafe_allow_html=True)
+    st.write('\n')
+    # Display the images and titles
+    for i in n_lst:
+        col1, col2 = st.columns([2, 5])
+        with col1:
+            st.image(df['poster'][i], width=200)
+        with col2:
+            st.write(f"***Название:*** {df['title'][i]}")
+            st.write(f"***Жанр:*** {', '.join(df['ganres'][i])}")
+            st.write(f"***Описание:*** {df['description'][i]}")
+            # similarity = float(confidence)
+            # st.write(f"***Cosine Similarity : {round(similarity, 3)}***")
+            st.markdown(f"[***ссылка на сериал***]({df['url'][i]})")
+            st.write(f"")
+            end_time = time.time()  # NOTE(review): end_time is never read in the visible code
+            st.write(f"<small>*Степень соответствия по косинусному сходству: {conf_dict[i]:.4f}*</small>", unsafe_allow_html=True)
+        st.markdown(  # horizontal rule emitted after each result (still inside the for loop)
+        "<hr style='border: 2px solid #000; margin-top: 10px; margin-bottom: 10px;'>",
+        unsafe_allow_html=True
+    )

pages/02_🔥_Results.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import streamlit as st
+from PIL import Image, ImageOps
+import matplotlib.pyplot as plt
+st.write("""
+ ## 📝 Итоги проекта Рекомендательные системы.
+""")  # page heading, written as Markdown
+"""
+###### 1. Парсинг профильных сайтов, итоговый с kino.mail.ru.
+"""  # bare string expression; presumably rendered as Markdown by Streamlit "magic" (same for the bare strings below) — confirm
+st.image('images/mem.jpg', width=400)
+"""
+###### 2. Сбор и анализ информации с киносервисов. Формирование датасета. Итоговый размер - 14939 объектов.
+"""
+col1, col2 = st.columns(2)  # two columns so the next two images sit side by side
+with col1:
+    st.image('images/1.jpeg')
+with col2:
+    st.image('images/2.jpeg')
+# st.image('images/1.png')
+"""
+###### 3. Предобработка данных от лишных символов и пропусков.
+"""
+st.image('images/3.jpeg')
+st.image('images/4.jpeg')
+"""
+###### 4. Векторизация с использованием модели RuBERT (Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters)
+"""

pages/__init__.py ADDED Viewed

File without changes

pages/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (203 Bytes). View file