Norgan97 committed on
Commit
998169b
1 Parent(s): 1902e4f
Dataset/embeddingsbooks.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0d215389841d91e403e0d2052998369eefc5546e5597dbcb2b85f126679054c
3
+ size 26199019
Dataset/embeddingsrecipes.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a5202b41888fd390fe421bdfcac1b57867260d58426834cbd71900f2d385cba
3
+ size 98568532
Dataset/faissbooks.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbeed94e0f2dbbb393b7f019d0174e2dc7861f8f2a2a3091a549b31f8bff88d7
3
+ size 8580045
Dataset/faissrecipes.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16751ffdb3319faf7cb5b01b726af9612598354d1e6783263e49f66429df0454
3
+ size 32326989
Dataset/parcedbooks.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d3abf12900ffd5ac0b3c8f503075930830c430fc9039416ce8d7c09589f900a
3
+ size 10833072
Dataset/recipesdataset.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b13aa75d0ad9b9e9d168fce0f36d67cd5734ffd090ca09a6f5c8643f71caa95
3
+ size 14171628
app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import torch
4
+ from PIL import Image
5
+ from io import BytesIO
6
+ import requests
7
+ import faiss
8
+
9
+
10
+ from transformers import AutoTokenizer, AutoModel
11
+ import numpy as np
12
st.set_page_config(layout="wide")


@st.cache_resource()
def load_model(model_name="cointegrated/rubert-tiny2"):
    """Load the encoder model and its tokenizer from one checkpoint.

    The result is cached by ``st.cache_resource``.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained``
            calls. Using a single parameter guarantees that the model and
            tokenizer always come from the same checkpoint.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


model, tokenizer = load_model()
21
+
22
@st.cache_data()
def load_data():
    """Load the book table, the stored embeddings and the FAISS index.

    Returns:
        A ``(df, embeddings_list, index)`` tuple:
        ``df`` is the DataFrame read from ``Dataset/parcedbooks.csv``;
        ``embeddings_list`` holds one list of floats per line of
        ``Dataset/embeddingsbooks.txt``;
        ``index`` is the FAISS index read from ``Dataset/faissbooks.index``.
    """
    df = pd.read_csv('Dataset/parcedbooks.csv')
    # Iterate the file object directly so lines are parsed as they are read
    # instead of first materialising the whole file with readlines().
    # The encoding is explicit so parsing does not depend on the host locale.
    with open('Dataset/embeddingsbooks.txt', 'r', encoding='utf-8') as file:
        embeddings_list = [[float(value) for value in line.split()] for line in file]
    index = faiss.read_index('Dataset/faissbooks.index')
    return df, embeddings_list, index


df, embeddings_list, index = load_data()
31
+
32
def embed_bert_cls(text, model, tokenizer):
    """Encode ``text`` and return a unit-length embedding as a NumPy array.

    The text is tokenized with padding and truncation, run through ``model``
    without gradient tracking, and the hidden state at token position 0 is
    L2-normalised. Only the first row of the batch is returned.

    Args:
        text: Input passed straight to ``tokenizer``.
        model: Callable model exposing ``.device`` and returning an object
            with a ``last_hidden_state`` tensor.
        tokenizer: Callable returning a mapping of tensors.

    Returns:
        A 1-D NumPy array holding the normalised embedding.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    target_device = model.device
    inputs = {name: tensor.to(target_device) for name, tensor in encoded.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Position 0 along the sequence axis is the first token of each sequence.
    first_token_state = outputs.last_hidden_state[:, 0, :]
    unit_vectors = torch.nn.functional.normalize(first_token_state)
    return unit_vectors[0].cpu().numpy()
39
+
40
+
41
# Query widgets: a wide text box next to a narrow result-count selector.
col3, col4 = st.columns([5, 1])

with col3:
    text = st.text_input('Введите ваше предпочтение для рекомендации')
with col4:
    # min_value=1 stops the user from selecting 0 or a negative count,
    # which is not a valid number of neighbours to search for.
    num = st.number_input('Количество книг', min_value=1, step=1, value=1)
    # NOTE(review): the original indentation was lost in this diff view;
    # placing the button inside col4 is an assumption — confirm the layout.
    button = st.button('Отправить запрос')
48
+
49
+
50
if text and button:
    # Get the embedding vector for the entered text.
    query_vector = embed_bert_cls(text, model, tokenizer)
    query_matrix = np.ascontiguousarray(query_vector.reshape(1, -1), dtype=np.float32)

    # Clamp k to [1, number of indexed vectors]: a non-positive k is invalid,
    # and asking for more neighbours than exist makes FAISS pad the result
    # with label -1.
    k = max(1, min(int(num), index.ntotal))
    D, I = index.search(query_matrix, k)

    # Drop the -1 padding labels: df.iloc[-1] would silently show the last
    # row of the table as if it were a real match.
    hits = [(int(row_id), float(distance)) for row_id, distance in zip(I[0], D[0]) if row_id >= 0]

    # Display images and titles.
    for row_id, distance in hits:
        row = df.iloc[row_id]
        # Convert the distance to cosine similarity. This holds when the
        # index returns squared L2 distances between unit-length vectors
        # (the query is normalised in embed_bert_cls) — presumably how the
        # index was built; confirm.
        similarity = 1 - distance / 2

        col1, col2 = st.columns([3, 4])
        with col1:
            # A timeout and error handling keep one slow or broken image URL
            # from hanging or crashing the whole page.
            try:
                response = requests.get(row['image_url'], timeout=10)
                response.raise_for_status()
                st.image(Image.open(BytesIO(response.content)), width=300)
            except (requests.RequestException, OSError):
                st.warning('Не удалось загрузить изображение')
        with col2:
            # NOTE(review): indentation was lost in this diff view; all text
            # output is assumed to belong to the right-hand column.
            st.write(f"***Автор:*** {row['author']}")
            st.write(f"***Название:*** {row['title']}")
            st.write(f"***Аннотация:*** {row['annotation']}")
            st.write(f"***Cosine Similarity : {round(similarity, 3)}***")
            st.write(f"***Ссылка на книгу : {row['page_url']}***")

        # NOTE(review): the separator is assumed to follow every result
        # (i.e. to sit inside the loop) — confirm against the real file.
        st.markdown(
            "<hr style='border: 2px solid #000; margin-top: 10px; margin-bottom: 10px;'>",
            unsafe_allow_html=True
        )
81
+
82
+
83
+
84
+
85
+
pages/recipes.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import torch
4
+ from PIL import Image
5
+ from io import BytesIO
6
+ import requests
7
+ import faiss
8
+
9
+
10
+ from transformers import AutoTokenizer, AutoModel
11
+ import numpy as np
12
st.set_page_config(layout="wide")


@st.cache_resource()
def load_model(model_name="cointegrated/rubert-tiny2"):
    """Load the encoder model and its tokenizer from one checkpoint.

    The result is cached by ``st.cache_resource``.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained``
            calls. Using a single parameter guarantees that the model and
            tokenizer always come from the same checkpoint.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


model, tokenizer = load_model()
21
+
22
@st.cache_data()
def load_data():
    """Load the recipe table, the stored embeddings and the FAISS index.

    Returns:
        A ``(df, embeddings_list, index)`` tuple:
        ``df`` is the DataFrame read from ``Dataset/recipesdataset.csv``;
        ``embeddings_list`` holds one list of floats per line of
        ``Dataset/embeddingsrecipes.txt``;
        ``index`` is the FAISS index read from ``Dataset/faissrecipes.index``.
    """
    df = pd.read_csv('Dataset/recipesdataset.csv')
    # Iterate the file object directly so lines are parsed as they are read
    # instead of first materialising the whole file with readlines().
    # The encoding is explicit so parsing does not depend on the host locale.
    with open('Dataset/embeddingsrecipes.txt', 'r', encoding='utf-8') as file:
        embeddings_list = [[float(value) for value in line.split()] for line in file]
    index = faiss.read_index('Dataset/faissrecipes.index')
    return df, embeddings_list, index


df, embeddings_list, index = load_data()
31
+
32
def embed_bert_cls(text, model, tokenizer):
    """Encode ``text`` and return a unit-length embedding as a NumPy array.

    The text is tokenized with padding and truncation, run through ``model``
    without gradient tracking, and the hidden state at token position 0 is
    L2-normalised. Only the first row of the batch is returned.

    Args:
        text: Input passed straight to ``tokenizer``.
        model: Callable model exposing ``.device`` and returning an object
            with a ``last_hidden_state`` tensor.
        tokenizer: Callable returning a mapping of tensors.

    Returns:
        A 1-D NumPy array holding the normalised embedding.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    target_device = model.device
    inputs = {name: tensor.to(target_device) for name, tensor in encoded.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Position 0 along the sequence axis is the first token of each sequence.
    first_token_state = outputs.last_hidden_state[:, 0, :]
    unit_vectors = torch.nn.functional.normalize(first_token_state)
    return unit_vectors[0].cpu().numpy()
39
+
40
# Query widgets: a wide text box next to a narrow result-count selector.
col3, col4 = st.columns([5, 1])

with col3:
    text = st.text_input('Введите ваше предпочтение для рекомендации')
with col4:
    # min_value=1 stops the user from selecting 0 or a negative count,
    # which is not a valid number of neighbours to search for.
    num = st.number_input('Количество блюд', min_value=1, step=1, value=1)
    # NOTE(review): the original indentation was lost in this diff view;
    # placing the button inside col4 is an assumption — confirm the layout.
    button = st.button('Отправить запрос')
47
+
48
+
49
if text and button:
    # Get the embedding vector for the entered text.
    query_vector = embed_bert_cls(text, model, tokenizer)
    query_matrix = np.ascontiguousarray(query_vector.reshape(1, -1), dtype=np.float32)

    # Clamp k to [1, number of indexed vectors]: a non-positive k is invalid,
    # and asking for more neighbours than exist makes FAISS pad the result
    # with label -1.
    k = max(1, min(int(num), index.ntotal))
    D, I = index.search(query_matrix, k)

    # Drop the -1 padding labels: df.iloc[-1] would silently show the last
    # row of the table as if it were a real match.
    hits = [(int(row_id), float(distance)) for row_id, distance in zip(I[0], D[0]) if row_id >= 0]

    # Display images and titles.
    for row_id, distance in hits:
        row = df.iloc[row_id]
        # Convert the distance to cosine similarity. This holds when the
        # index returns squared L2 distances between unit-length vectors
        # (the query is normalised in embed_bert_cls) — presumably how the
        # index was built; confirm.
        similarity = 1 - distance / 2

        col1, col2 = st.columns([3, 4])
        with col1:
            # A timeout and error handling keep one slow or broken image URL
            # from hanging or crashing the whole page.
            try:
                response = requests.get(row['image_url'], timeout=10)
                response.raise_for_status()
                st.image(Image.open(BytesIO(response.content)), width=300)
            except (requests.RequestException, OSError):
                st.warning('Не удалось загрузить изображение')
        with col2:
            # NOTE(review): indentation was lost in this diff view; all text
            # output is assumed to belong to the right-hand column.
            st.write(f"***Название:*** {row['title']}")
            st.write(f"***Описание:*** {row['annotation']}")
            st.write(f"***Cosine Similarity : {round(similarity, 3)}***")
            st.write(f"***Ссылка на блюдо : {row['page_url']}***")

        # NOTE(review): the separator is assumed to follow every result
        # (i.e. to sit inside the loop) — confirm against the real file.
        st.markdown(
            "<hr style='border: 2px solid #000; margin-top: 10px; margin-bottom: 10px;'>",
            unsafe_allow_html=True
        )
requirements.txt ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.1.2
2
+ attrs==23.1.0
3
+ beautifulsoup4==4.12.2
4
+ blinker==1.7.0
5
+ bs4==0.0.1
6
+ cachetools==5.3.2
7
+ certifi==2023.7.22
8
+ charset-normalizer==3.3.2
9
+ click==8.1.7
10
+ faiss-cpu==1.7.2
11
+ filelock==3.13.1
12
+ fsspec==2023.10.0
13
+ gitdb==4.0.11
14
+ GitPython==3.1.40
15
+ huggingface-hub==0.17.3
16
+ idna==3.4
17
+ importlib-metadata==6.8.0
18
+ Jinja2==3.1.2
19
+ jsonschema==4.19.2
20
+ jsonschema-specifications==2023.7.1
21
+ markdown-it-py==3.0.0
22
+ MarkupSafe==2.1.3
23
+ mdurl==0.1.2
24
+ mpmath==1.3.0
25
+ networkx==3.2.1
26
+ numpy==1.26.1
27
+ nvidia-cublas-cu12==12.1.3.1
28
+ nvidia-cuda-cupti-cu12==12.1.105
29
+ nvidia-cuda-nvrtc-cu12==12.1.105
30
+ nvidia-cuda-runtime-cu12==12.1.105
31
+ nvidia-cudnn-cu12==8.9.2.26
32
+ nvidia-cufft-cu12==11.0.2.54
33
+ nvidia-curand-cu12==10.3.2.106
34
+ nvidia-cusolver-cu12==11.4.5.107
35
+ nvidia-cusparse-cu12==12.1.0.106
36
+ nvidia-nccl-cu12==2.18.1
37
+ nvidia-nvjitlink-cu12==12.3.52
38
+ nvidia-nvtx-cu12==12.1.105
39
+ packaging==23.2
40
+ pandas==2.1.2
41
+ Pillow==10.1.0
42
+ protobuf==4.25.0
43
+ pyarrow==14.0.1
44
+ pydeck==0.8.1b0
45
+ Pygments==2.16.1
46
+ python-dateutil==2.8.2
47
+ pytz==2023.3.post1
48
+ PyYAML==6.0.1
49
+ referencing==0.30.2
50
+ regex==2023.10.3
51
+ requests==2.31.0
52
+ rich==13.6.0
53
+ rpds-py==0.12.0
54
+ safetensors==0.4.0
55
+ six==1.16.0
56
+ smmap==5.0.1
57
+ soupsieve==2.5
58
+ streamlit==1.28.1
59
+ sympy==1.12
60
+ tenacity==8.2.3
61
+ tokenizers==0.14.1
62
+ toml==0.10.2
63
+ toolz==0.12.0
64
+ torch==2.1.0
65
+ torchaudio==2.1.0
66
+ torchvision==0.16.0
67
+ tornado==6.3.3
68
+ tqdm==4.66.1
69
+ transformers==4.35.0
70
+ triton==2.1.0
71
+ typing_extensions==4.8.0
72
+ tzdata==2023.3
73
+ tzlocal==5.2
74
+ urllib3==2.0.7
75
+ validators==0.22.0
76
+ watchdog==3.0.0
77
+ zipp==3.17.0