veidlink committed on
Commit
5bdc726
1 Parent(s): fe6fba6
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ clean_mail_movie.csv filter=lfs diff=lfs merge=lfs -text
+ mail_embeddings.joblib filter=lfs diff=lfs merge=lfs -text
+ mail_faiss_index.index filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,10 @@
  ---
- title: Find My Movie Hf
- emoji: 🌍
+ title: Find My Movie
+ emoji: 🪄
  colorFrom: pink
- colorTo: red
+ colorTo: indigo
  sdk: streamlit
  sdk_version: 1.26.0
- app_file: app.py
+ app_file: main.py
  pinned: false
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
bert_movie.ipynb ADDED
@@ -0,0 +1,178 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import torch\n",
+ "from transformers import AutoTokenizer, AutoModel\n",
+ "import re\n",
+ "import string\n",
+ "import numpy as np\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "import streamlit as st\n",
+ "import faiss\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "url = '/clean_mail_movie.csv'\n",
+ "\n",
+ "df = pd.read_csv(url)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = df['concat2embedding'].tolist()  # the concatenated text column\n",
+ "titles = df['movie_title'].tolist()\n",
+ "images = df['image_url'].tolist()\n",
+ "descr = df['description'].tolist()\n",
+ "links = df['page_url'].tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clean(text):\n",
+ "    text = text.lower()  # lowercase\n",
+ "    # text = re.sub(r'\\d+', ' ', text)  # remove digits\n",
+ "    # text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation\n",
+ "    text = re.sub(r'\\s+', ' ', text)  # collapse repeated whitespace\n",
+ "    text = text.strip()  # strip leading and trailing whitespace\n",
+ "    # text = re.sub(r'\\b\\w{1,2}\\b', '', text)  # remove words shorter than 3 characters\n",
+ "    # Additional steps that may be useful in this context:\n",
+ "    # text = re.sub(r'\\b\\w+\\b', '', text)  # remove standalone words (without digits and punctuation)\n",
+ "    # text = ' '.join([word for word in text.split() if word not in stop_words])  # remove stopwords\n",
+ "    return text\n",
+ "\n",
+ "\n",
+ "cleaned_text = []\n",
+ "\n",
+ "for text in dataset:\n",
+ "    cleaned_text.append(clean(text))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# pip install transformers sentencepiece\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
+ "model = AutoModel.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
+ "# model.cuda() # uncomment it if you have a GPU"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Default embedding function shipped with the model\n",
+ "\n",
+ "def embed_bert_cls(text, model, tokenizer):\n",
+ "    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=1024)  # the tokenizer builds the padding and attention mask itself\n",
+ "    with torch.no_grad():\n",
+ "        model_output = model(**{k: v.to(model.device) for k, v in t.items()})\n",
+ "    embeddings = model_output.last_hidden_state[:, 0, :]\n",
+ "    embeddings = torch.nn.functional.normalize(embeddings)\n",
+ "    return embeddings[0].cpu().numpy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Embed all cleaned texts\n",
+ "text_embeddings = np.array([embed_bert_cls(text, model, tokenizer) for text in cleaned_text])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build the FAISS index once text_embeddings is defined\n",
+ "dimension = text_embeddings.shape[1]\n",
+ "index = faiss.IndexFlatL2(dimension)\n",
+ "index.add(text_embeddings.astype('float32'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['mail_embeddings.joblib']"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from joblib import dump, load\n",
+ "\n",
+ "# Save the embeddings\n",
+ "dump(text_embeddings, 'mail_embeddings.joblib')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the index\n",
+ "faiss.write_index(index, \"mail_faiss_index.index\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "pytorch_env",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.4"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
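
Because `embed_bert_cls` L2-normalizes its output, the squared L2 distances that `IndexFlatL2.search` returns map directly onto cosine similarity (for unit vectors, d = 2 − 2·cos). A minimal sketch, not part of this commit, of querying the saved index; it assumes the notebook's `embed_bert_cls`, `model`, `tokenizer`, and `titles` are still in scope, and the query string is purely illustrative:

```python
import faiss

# Reload the index artifact produced by the notebook above
index = faiss.read_index("mail_faiss_index.index")

query = embed_bert_cls("космическая одиссея", model, tokenizer)  # illustrative query
distances, ids = index.search(query.astype("float32").reshape(1, -1), 5)

# IndexFlatL2 returns squared L2 distances; for L2-normalized vectors
# d = 2 - 2*cos, so cosine similarity is recoverable as 1 - d/2.
cosine = 1.0 - distances[0] / 2.0
for i, c in zip(ids[0], cosine):
    print(titles[i], round(float(c), 3))
```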
bert_movie_edited.ipynb ADDED
@@ -0,0 +1,310 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "S52EVP7k-rl7"
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import torch\n",
+ "import re\n",
+ "import string\n",
+ "import numpy as np\n",
+ "import streamlit as st\n",
+ "import faiss  # index storage\n",
+ "from tqdm import tqdm\n",
+ "from transformers import AutoTokenizer, AutoModel\n",
+ "from joblib import dump, load  # for saving/loading the embeddings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "12BEEwcF-rl9"
+ },
+ "outputs": [],
+ "source": [
+ "path = '/content/movies_filtered.csv'  # CHANGE THE PATH HERE!\n",
+ "\n",
+ "df = pd.read_csv(path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "df5lg8-m-rl-"
+ },
+ "outputs": [],
+ "source": [
+ "def clean(text):\n",
+ "    text = text.lower()  # lowercase\n",
+ "    text = re.sub(r'\\d+', ' ', text)  # remove digits\n",
+ "    # text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation\n",
+ "    text = re.sub(r'\\s+', ' ', text)  # collapse repeated whitespace\n",
+ "    text = text.strip()  # strip leading and trailing whitespace\n",
+ "    text = re.sub(r'\\s+|\\n', ' ', text)  # removes \\n and \\xa0\n",
+ "    # text = re.sub(r'\\b\\w{1,2}\\b', '', text)  # remove words shorter than 3 characters\n",
+ "    # Additional steps that may be useful in this context:\n",
+ "    # text = re.sub(r'\\b\\w+\\b', '', text)  # remove standalone words (without digits and punctuation)\n",
+ "    # text = ' '.join([word for word in text.split() if word not in stop_words])  # remove stopwords\n",
+ "    return text\n",
+ "\n",
+ "for i, row in df.iterrows():\n",
+ "    df.at[i, 'description'] = clean(row['description'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "0huKeMs4-rl_",
+ "outputId": "8659997c-9b8a-45bb-e2d7-fcc05422b92a"
+ },
+ "outputs": [],
+ "source": [
+ "# pip install transformers sentencepiece\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
+ "model = AutoModel.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
+ "model = model.cuda()  # the inference loop below moves inputs to the GPU, so the model must live there too"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "id": "Xsxq-Ohx-rmA"
+ },
+ "outputs": [],
+ "source": [
+ "# apply the tokenizer:\n",
+ "# -> add_special_tokens = add the special tokens (CLS=101, SEP=102)\n",
+ "# -> truncation = truncate to the maximum length\n",
+ "# -> max_length = maximum sequence length\n",
+ "tokenized = df['description'].apply(lambda x: tokenizer.encode(x,\n",
+ "        add_special_tokens=True,\n",
+ "        truncation=True,\n",
+ "        max_length=1024))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {
+ "id": "OuaXqHNj-rmB"
+ },
+ "outputs": [],
+ "source": [
+ "max_len = 1024\n",
+ "# Pad every sequence up to max_len\n",
+ "padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])\n",
+ "# and build an attention mask so self-attention ignores the padding\n",
+ "attention_mask = np.where(padded != 0, 1, 0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "id": "h3bfQh2o-rmC"
+ },
+ "outputs": [],
+ "source": [
+ "# Dataset wrapping the padded ids and masks\n",
+ "class BertInputs(torch.utils.data.Dataset):\n",
+ "    def __init__(self, tokenized_inputs, attention_masks):\n",
+ "        super().__init__()\n",
+ "        self.tokenized_inputs = tokenized_inputs\n",
+ "        self.attention_masks = attention_masks\n",
+ "\n",
+ "    def __len__(self):\n",
+ "        return self.tokenized_inputs.shape[0]\n",
+ "\n",
+ "    def __getitem__(self, idx):\n",
+ "        ids = self.tokenized_inputs[idx]\n",
+ "        ams = self.attention_masks[idx]\n",
+ "\n",
+ "        return ids, ams\n",
+ "\n",
+ "dataset = BertInputs(padded, attention_mask)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Q7yYgEP3-rmC",
+ "outputId": "76047d40-f793-4cef-fc02-b98b232661f8"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "torch.Size([100, 1024]) torch.Size([100, 1024])\n"
+ ]
+ }
+ ],
+ "source": [
+ "# DataLoader to feed batches into the inference loop.\n",
+ "# shuffle must stay False so the embedding order matches the dataframe rows\n",
+ "loader = torch.utils.data.DataLoader(dataset, batch_size=100, shuffle=False)\n",
+ "sample_ids, sample_ams = next(iter(loader))\n",
+ "print(sample_ids.shape, sample_ams.shape)\n",
+ "\n",
+ "# shape BATCH_SIZE x MAX_LEN - what goes into BERT"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "r1h0BNy1-rmD",
+ "outputId": "adea19c9-a0f2-418c-9a21-ebe8daa00077"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 94/94 [01:13<00:00,  1.28it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 1min 10s, sys: 145 ms, total: 1min 10s\n",
+ "Wall time: 1min 13s\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "vectors_in_batch = []\n",
+ "\n",
+ "# Iterate over all batches\n",
+ "for inputs, attention_masks in tqdm(loader):\n",
+ "    vectors_in_mini_batch = []  # Store vectors of the current mini-batch\n",
+ "    with torch.no_grad():\n",
+ "        last_hidden_states = model(inputs.cuda(), attention_mask=attention_masks.cuda())\n",
+ "        vector = last_hidden_states[0][:, 0, :].detach().cpu().numpy()\n",
+ "        vectors_in_mini_batch.append(vector)\n",
+ "\n",
+ "    vectors_in_batch.extend(vectors_in_mini_batch)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import itertools\n",
+ "\n",
+ "# Open the file and load the nested list\n",
+ "vectors_in_batch = load('vectors_in_batch.joblib')\n",
+ "\n",
+ "# Convert the nested list to an unnested list\n",
+ "text_embeddings = list(itertools.chain.from_iterable(vectors_in_batch))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the embeddings\n",
+ "dump(vectors_in_batch, 'vectors_in_batch.joblib')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "94"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(vectors_in_batch)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "9366"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(text_embeddings)"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.4"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
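
The manual padding in this notebook pads every description to a fixed max_len of 1024 even when a batch's longest sequence is far shorter, which wastes compute. A hedged alternative sketch, not part of this commit, that lets the tokenizer pad per batch instead; it assumes the cleaned `df` from the notebook, mirrors the batch size of 100, and keeps everything on CPU for brevity:

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2").eval()

texts = df["description"].tolist()  # assumes the cleaned dataframe from above
batch_size = 100                    # same batch size as the DataLoader above
vectors = []
with torch.no_grad():
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # padding="longest" pads only to the longest sequence in this batch,
        # and the tokenizer builds the attention mask itself
        enc = tokenizer(batch, padding="longest", truncation=True,
                        max_length=1024, return_tensors="pt")
        out = model(**enc)
        vectors.append(out.last_hidden_state[:, 0, :].cpu().numpy())  # CLS embeddings
```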
clean_mail_movie.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:057369f23a3dd85ab0cc93d9e24b3669067e1023346f40ae7d0d6dc846613d86
+ size 46078303
mail_embeddings.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7275e4c9f962ec2e50e02f876716f0de3f75c2548d7615a59dfc14a883fe2f2e
+ size 15097281
mail_faiss_index.index ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2f1ae5c60728b9d5d7f610dc02c8978a5802b5456ab93e55cb28da8f4cb0bc56
+ size 15097101
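
These three ADDED files are Git LFS pointers rather than the blobs themselves: `oid` is the SHA-256 of the real file and `size` is its byte count. A small illustrative check, not part of the commit, that a downloaded blob matches its pointer:

```python
import hashlib

def verify_lfs_blob(blob_path: str, expected_oid: str, expected_size: int) -> bool:
    """Compare a downloaded file against the oid/size fields of its LFS pointer."""
    h = hashlib.sha256()
    size = 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
            h.update(chunk)
            size += len(chunk)
    return h.hexdigest() == expected_oid and size == expected_size

# e.g. for the CSV pointer above:
print(verify_lfs_blob("clean_mail_movie.csv",
                      "057369f23a3dd85ab0cc93d9e24b3669067e1023346f40ae7d0d6dc846613d86",
                      46078303))
```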
main.py ADDED
@@ -0,0 +1,61 @@
+ import faiss
+ import streamlit as st
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ import joblib
+ import pandas as pd
+
+ # Load the saved embeddings and the FAISS index
+ text_embeddings = joblib.load('mail_embeddings.joblib')
+ index = faiss.read_index('mail_faiss_index.index')
+
+ # Dataset
+ df = pd.read_csv('clean_mail_movie.csv')
+ titles = df['movie_title'].tolist()
+ images = df['image_url'].tolist()
+ descr = df['description'].tolist()
+ links = df['page_url'].tolist()
+
+ # Load the model and tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
+ model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
+
+ # Embedding function for a single text
+ def embed_bert_cls(text, model, tokenizer):
+     t = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=1024)
+     with torch.no_grad():
+         model_output = model(**{k: v.to(model.device) for k, v in t.items()})
+     embeddings = model_output.last_hidden_state[:, 0, :]
+     embeddings = torch.nn.functional.normalize(embeddings)
+     return embeddings[0].cpu().numpy()
+
+
+
+ # Streamlit interface
+ st.title("Умный поиск фильмов")
+
+ user_input = st.text_area("Введите описание фильма:")
+ num_recs = st.selectbox("Количество рекомендаций:", [1, 3, 5, 10])
+
+ if st.button("Найти"):
+     if user_input:
+         user_embedding = embed_bert_cls(user_input, model, tokenizer).astype('float32').reshape(1, -1)
+         distances, top_indices = index.search(user_embedding, num_recs)  # keep the distances for the confidence score
+
+         st.write(f"Рекомендованные фильмы (Топ-{num_recs}):")
+
+         for i, idx in enumerate(top_indices[0]):  # loop variable renamed from `index` so it no longer shadows the FAISS index
+             col1, col2, col3 = st.columns([1, 4, 1])  # extra column for the confidence score
+
+             with col1:
+                 try:
+                     st.image(images[idx])  # movie poster
+                 except Exception as e:
+                     st.write(f"Could not display image at index {idx}. Error: {e}")  # in case the poster is missing
+
+             with col2:
+                 st.markdown(f"[{titles[idx]}]({links[idx]})")  # the title is a clickable link to the movie page
+                 st.write(descr[idx])  # movie description
+
+             with col3:
+                 st.write(f"Уверенность: {1 / (1 + distances[0][i]):.2f}")  # confidence score derived from the distance
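
One caveat: Streamlit re-runs the whole script on every widget interaction, so main.py as written reloads the model, the index, and the CSV on each click. A hedged sketch of the usual remedy with `st.cache_resource` (available in the pinned streamlit==1.26.0); the helper names are illustrative, not part of the commit:

```python
import faiss
import joblib
import pandas as pd
import streamlit as st
from transformers import AutoTokenizer, AutoModel

@st.cache_resource  # executed once per process, then served from cache
def load_model():
    tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
    model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
    return tokenizer, model

@st.cache_resource
def load_search_artifacts():
    index = faiss.read_index("mail_faiss_index.index")
    embeddings = joblib.load("mail_embeddings.joblib")
    df = pd.read_csv("clean_mail_movie.csv")
    return index, embeddings, df

tokenizer, model = load_model()
index, text_embeddings, df = load_search_artifacts()
```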
requirements.txt ADDED
@@ -0,0 +1,76 @@
+ altair==5.1.1
+ attrs==23.1.0
+ blinker==1.6.2
+ cachetools==5.3.1
+ certifi==2023.7.22
+ charset-normalizer==3.2.0
+ click==8.1.7
+ cmake==3.27.2
+ faiss-gpu==1.7.2
+ filelock==3.12.3
+ fsspec==2023.6.0
+ gitdb==4.0.10
+ GitPython==3.1.33
+ huggingface-hub==0.16.4
+ idna==3.4
+ importlib-metadata==6.8.0
+ Jinja2==3.1.2
+ joblib==1.3.2
+ jsonschema==4.19.0
+ jsonschema-specifications==2023.7.1
+ lit==16.0.6
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.3
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.1
+ numpy==1.25.2
+ nvidia-cublas-cu11==11.10.3.66
+ nvidia-cuda-cupti-cu11==11.7.101
+ nvidia-cuda-nvrtc-cu11==11.7.99
+ nvidia-cuda-runtime-cu11==11.7.99
+ nvidia-cudnn-cu11==8.5.0.96
+ nvidia-cufft-cu11==10.9.0.58
+ nvidia-curand-cu11==10.2.10.91
+ nvidia-cusolver-cu11==11.4.0.1
+ nvidia-cusparse-cu11==11.7.4.91
+ nvidia-nccl-cu11==2.14.3
+ nvidia-nvtx-cu11==11.7.91
+ packaging==23.1
+ pandas==2.1.0
+ Pillow==9.5.0
+ protobuf==4.24.2
+ pyarrow==13.0.0
+ pydeck==0.8.0
+ Pygments==2.16.1
+ Pympler==1.0.1
+ python-dateutil==2.8.2
+ pytz==2023.3
+ pytz-deprecation-shim==0.1.0.post0
+ PyYAML==6.0.1
+ referencing==0.30.2
+ regex==2023.8.8
+ requests==2.31.0
+ rich==13.5.2
+ rpds-py==0.10.0
+ safetensors==0.3.3
+ six==1.16.0
+ smmap==5.0.0
+ streamlit==1.26.0
+ sympy==1.12
+ tenacity==8.2.3
+ tokenizers==0.13.3
+ toml==0.10.2
+ toolz==0.12.0
+ torch==2.0.1
+ tornado==6.3.3
+ tqdm==4.66.1
+ transformers==4.32.1
+ triton==2.0.0
+ typing_extensions==4.7.1
+ tzdata==2023.3
+ tzlocal==4.3.1
+ urllib3==2.0.4
+ validators==0.21.2
+ watchdog==3.0.0
+ zipp==3.16.2