Commit
•
5596798
1
Parent(s):
2284c00
Upload 10 files
Browse files- .gitattributes +1 -0
- main.py +71 -0
- pages/MiniLM/MiniLM_embeddings.pkl +3 -0
- pages/MiniLM/MiniLM_index.index +3 -0
- pages/MiniLM/MiniLm_app.py +75 -0
- pages/MiniLM/__pycache__/MiniLm_app.cpython-311.pyc +0 -0
- pages/rubert/__pycache__/app_bert.cpython-311.pyc +0 -0
- pages/rubert/app_bert.py +70 -0
- pages/rubert/embeddings.npz +3 -0
- requirements.txt +11 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
pages/MiniLM/MiniLM_index.index filter=lfs diff=lfs merge=lfs -text
|
main.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from pages.rubert.app_bert import ruBert_page
|
3 |
+
from pages.MiniLM.MiniLm_app import MiniLm_page
|
4 |
+
|
5 |
+
# st.markdown("""
|
6 |
+
# <style>
|
7 |
+
# /* Основной фон страницы */
|
8 |
+
# body {
|
9 |
+
# background-color: #50C878; /* Изумрудный цвет */
|
10 |
+
# }
|
11 |
+
|
12 |
+
# /* Фон основной панели, чтобы убрать несоответствующие цвета */
|
13 |
+
# .stApp {
|
14 |
+
# background-color: #50C878;
|
15 |
+
# }
|
16 |
+
|
17 |
+
# /* Фон и текст боковой панели */
|
18 |
+
# .stSidebar > div:first-child {
|
19 |
+
# background-color: #50C878; /* Изумрудный цвет для боковой панели */
|
20 |
+
# color: #FFFFFF; /* Белый цвет текста для контраста */
|
21 |
+
# }
|
22 |
+
# .stSidebar .sidebar-content {
|
23 |
+
# color: #FFFFFF;
|
24 |
+
# }
|
25 |
+
|
26 |
+
# /* Стиль заголовка боковой панели */
|
27 |
+
# .stSidebar .sidebar-content h1, .stSidebar .sidebar-content h2, .stSidebar .sidebar-content h3 {
|
28 |
+
# color: #FFFFFF;
|
29 |
+
# }
|
30 |
+
|
31 |
+
# /* Стиль кнопок */
|
32 |
+
# .stButton > button {
|
33 |
+
# background-color: #006400; /* Темно-изумрудный цвет кнопок */
|
34 |
+
# color: #FFFFFF; /* Белый текст на кнопках */
|
35 |
+
# }
|
36 |
+
|
37 |
+
# /* Стили текстового поля */
|
38 |
+
# .stTextInput input {
|
39 |
+
# background-color: #FFFFFF; /* Белый фон текстового поля */
|
40 |
+
# color: #006400; /* Темно-изумрудный текст */
|
41 |
+
# }
|
42 |
+
|
43 |
+
# /* Стили активной кнопки радио и чекбокса */
|
44 |
+
# .stRadio > label > div:first-of-type > div, .stCheckbox > label > div:first-of-type > div {
|
45 |
+
# background-color: #006400; /* Темно-изумрудный фон для активных кнопок радио и чекбоксов */
|
46 |
+
# border-color: #006400;
|
47 |
+
# }
|
48 |
+
|
49 |
+
# /* Стили слайдера */
|
50 |
+
# .stSlider > div > div > div > div {
|
51 |
+
# background-color: #006400; /* Темно-изумрудный цвет слайдера */
|
52 |
+
# }
|
53 |
+
# </style>
|
54 |
+
# """, unsafe_allow_html=True)
|
55 |
+
|
56 |
+
def app_description_page():
    """Render the "About Project" page, which currently is just a title."""
    welcome_title = "Welcome to My App!"
    st.title(welcome_title)
|
58 |
+
|
59 |
+
def main():
    """Draw the sidebar page selector and render the page the user picked."""
    st.sidebar.title("Book app")

    # Radio label -> page renderer. Dict insertion order is the order in
    # which the options are offered in the sidebar.
    renderers = {
        "About Project": app_description_page,
        "📚 Book search": ruBert_page,
        "🔍 Book search (faiss)": MiniLm_page,
    }
    page = st.sidebar.radio("Select page:", list(renderers))

    render = renderers.get(page)
    if render is not None:
        render()
|
68 |
+
|
69 |
+
|
70 |
+
if __name__ == "__main__":
|
71 |
+
main()
|
pages/MiniLM/MiniLM_embeddings.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b8b48f9594e9a78afbdd85f8acc801df16e30f90641e5e9d2c6b1b4dc66c65bf
|
3 |
+
size 7578787
|
pages/MiniLM/MiniLM_index.index
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a85188579f9f15ea1809ecfd0478c026c449b019775ffee3b8202d85ce787de8
|
3 |
+
size 7578669
|
pages/MiniLM/MiniLm_app.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
from sentence_transformers import SentenceTransformer
|
4 |
+
import faiss
|
5 |
+
import pickle
|
6 |
+
import time
|
7 |
+
|
8 |
+
# Загрузка данных
|
9 |
+
@st.cache_resource
def load_data():
    """Load the book table, the FAISS index and the sentence encoder.

    The function is cached with ``st.cache_resource`` rather than
    ``st.cache_data``: the FAISS index and the SentenceTransformer model are
    global resources, not plain data that should be serialized and copied on
    every rerun. Because the cached objects are shared between reruns,
    callers should treat the returned DataFrame as read-only.

    The pickled embedding matrix that used to be loaded here was never used
    (search goes through the FAISS index), so it is no longer read.

    Returns:
        A ``(data, index, embedder)`` tuple: the DataFrame read from
        ``Data/books_data2.csv``, the FAISS index read from
        ``pages/MiniLM/MiniLM_index.index`` and the ``all-MiniLM-L6-v2``
        SentenceTransformer.
    """
    data = pd.read_csv('Data/books_data2.csv')
    index = faiss.read_index('pages/MiniLM/MiniLM_index.index')
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    return data, index, embedder
|
18 |
+
|
19 |
+
# Функция поиска
|
20 |
+
def search_books(embedder, index, data, query, n_results):
    """Return the books whose indexed vectors are closest to *query*.

    Args:
        embedder: Object with ``encode(list_of_str)`` returning the query
            vectors to search with.
        index: Object with ``search(vectors, k)`` returning a
            ``(distances, indices)`` pair, each with one row per query.
        data: DataFrame whose row positions match the ids stored in *index*
            and which has the columns ``page_url``, ``image_url``, ``genre``,
            ``title``, ``annotation`` and ``author``.
        query: Text to search for.
        n_results: Number of neighbours to request from the index.

    Returns:
        A list of dicts (keys ``Link``, ``Image``, ``Genre``, ``Title``,
        ``Description``, ``Author``, ``Similarity``) in the order returned by
        the index. ``Similarity`` is ``1 / (1 + distance)``. The list can be
        shorter than *n_results* when the index has fewer entries.
    """
    query_embedding = embedder.encode([query])
    distances, indices = index.search(query_embedding, n_results)

    result_books = []
    # Only one query was encoded, so row 0 holds all the hits.
    for distance, book_index in zip(distances[0], indices[0]):
        # FAISS fills missing neighbours with id -1; data.iloc[-1] would
        # silently return the last book instead, so skip those slots.
        if book_index < 0:
            continue
        book_info = data.iloc[book_index]
        result_books.append({
            'Link': book_info['page_url'],
            'Image': book_info['image_url'],
            'Genre': book_info['genre'],
            'Title': book_info['title'],
            'Description': book_info['annotation'],
            'Author': book_info['author'],
            'Similarity': 1 / (1 + distance),
        })

    return result_books
|
47 |
+
|
48 |
+
# Streamlit
|
49 |
+
def MiniLm_page():
    """Render the FAISS-backed book search page.

    Shows a query box and a result-count slider. When the search button is
    pressed with a non-empty query, runs ``search_books`` and lists every hit
    with its cover, metadata, a description preview, the similarity score,
    the elapsed search time and a link to the book page.
    """
    st.title('Поиск книг')
    data, index, embedder = load_data()
    query = st.text_input('Введите цитату или автора:')
    n_results = st.slider("Количество результатов", min_value=1, max_value=20, value=5)

    # The button is always evaluated first so that it is always drawn;
    # nothing else happens until it is pressed with a non-empty query.
    if not (st.button("Искать") and query):
        return

    start_time = time.time()
    result_books = search_books(embedder, index, data, query, n_results)
    search_time = time.time() - start_time
    st.write("Результаты поиска:")

    for book in result_books:
        description = book['Description']
        if isinstance(description, str):
            # Shorten by word count: the preview is cut at 50 words, so the
            # ellipsis is only added when words were actually dropped.
            words = description.split()
            if len(words) > 50:
                description = ' '.join(words[:50]) + '...'
        else:
            # A missing annotation is not a string (e.g. NaN from the CSV);
            # show an empty description instead of failing on it.
            description = ''

        st.write('---')
        st.image(book['Image'], width=250)
        st.write(f"**Название:** {book['Title']}")
        st.write(f"**Автор:** {book['Author']}")
        st.write(f"**Жанр:** {book['Genre']}")
        st.write(f"**Описание:** {description}")
        st.write(f"**Сходство:** {book['Similarity']:.2f}")
        st.write(f'**Время поиска:** {search_time:.4f} секунд')
        st.write(f"[Читать подробнее]({book['Link']})")
        st.text("")
|
pages/MiniLM/__pycache__/MiniLm_app.cpython-311.pyc
ADDED
Binary file (4.78 kB). View file
|
|
pages/rubert/__pycache__/app_bert.cpython-311.pyc
ADDED
Binary file (5.82 kB). View file
|
|
pages/rubert/app_bert.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
from transformers import AutoTokenizer, AutoModel
|
5 |
+
from scipy.spatial.distance import cosine
|
6 |
+
import pandas as pd
|
7 |
+
import time
|
8 |
+
|
9 |
+
@st.cache_resource
def load_data():
    """Load the stored book embeddings, the ruBERT model and the book table.

    The function is cached with ``st.cache_resource`` rather than
    ``st.cache_data``: the tokenizer, the model (possibly living on a GPU)
    and the torch device are global resources, not plain data that should be
    serialized and copied on every rerun. Because the cached objects are
    shared between reruns, callers should treat the returned DataFrame and
    embedding array as read-only.

    Returns:
        A ``(all_embeddings, device, tokenizer, model, data)`` tuple: the
        ``embeddings`` array from ``pages/rubert/embeddings.npz``, the torch
        device (CUDA when available, otherwise CPU), the
        ``cointegrated/rubert-tiny2`` tokenizer and model (moved to that
        device) and the DataFrame read from ``Data/books_data2.csv``.
    """
    # np.load returns an NpzFile that keeps the archive open; the context
    # manager closes it once the array has been read into memory.
    with np.load('pages/rubert/embeddings.npz') as book_embeddings:
        all_embeddings = book_embeddings['embeddings']

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
    model = AutoModel.from_pretrained("cointegrated/rubert-tiny2").to(device)
    data = pd.read_csv('Data/books_data2.csv')
    return all_embeddings, device, tokenizer, model, data
|
20 |
+
|
21 |
+
# Функция для получения эмбеддинга
|
22 |
+
|
23 |
+
def embed_text(text, tokenizer, model, device):
    """Encode *text* with the model and return one embedding vector.

    Args:
        text: Input passed straight to *tokenizer*.
        tokenizer: Callable producing the model inputs; its result must
            support ``.to(device)`` and ``**`` unpacking.
        model: Callable whose output exposes ``last_hidden_state``.
        device: Device the encoded inputs are moved to before the forward
            pass.

    Returns:
        A NumPy vector: the hidden state at sequence position 0 (presumably
        the [CLS] token for this tokenizer), averaged over the batch axis.
    """
    batch = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
    ).to(device)

    # Inference only, so no gradients are tracked.
    with torch.no_grad():
        hidden_states = model(**batch).last_hidden_state

    first_position = hidden_states[:, 0, :].cpu().numpy()
    return first_position.mean(axis=0)
|
29 |
+
|
30 |
+
# Функция поиска
|
31 |
+
def search_books(user_query, all_embeddings, device, tokenizer, model, data, n_results):
    """Rank the stored book embeddings by cosine similarity to the query.

    Args:
        user_query: Text to search for.
        all_embeddings: Book embeddings, one row per book (assumed to be a
            2-D ``(n_books, dim)`` array with the same ``dim`` as the query
            embedding).
        device: Torch device forwarded to ``embed_text``.
        tokenizer: Tokenizer forwarded to ``embed_text``.
        model: Model forwarded to ``embed_text``.
        data: Unused here; kept so existing callers keep working.
        n_results: Maximum number of hits to return.

    Returns:
        A ``(top_results_indices, top_similarities)`` pair of NumPy arrays:
        row positions of the best matches, most similar first, and their
        cosine similarities in the same order.
    """
    query_embedding = embed_text(user_query, tokenizer, model, device)

    # Cosine similarity for all books at once instead of one scipy call per
    # row.
    book_matrix = np.asarray(all_embeddings)
    dot_products = book_matrix @ query_embedding
    norm_products = np.linalg.norm(book_matrix, axis=1) * np.linalg.norm(query_embedding)

    # Guard the division: a zero-norm vector gets similarity 0.0 instead of
    # an undefined value that could end up at the top of the ranking.
    has_norm = norm_products > 0
    similarities = np.where(
        has_norm,
        dot_products / np.where(has_norm, norm_products, 1.0),
        0.0,
    )

    # One sort only: the scores are read back through the ranked indices, so
    # indices and similarities are guaranteed to line up.
    top_results_indices = np.argsort(similarities)[::-1][:n_results]
    top_similarities = similarities[top_results_indices]

    return top_results_indices, top_similarities
|
39 |
+
|
40 |
+
# Streamlit
|
41 |
+
def ruBert_page():
    """Render the ruBERT-backed book search page.

    Shows a query box and a result-count slider. When the search button is
    pressed with a non-empty query, runs ``search_books`` and lists every hit
    with its cover, metadata, a description preview, the cosine similarity,
    the elapsed search time and a link to the book page.
    """
    st.title("Поиск книг")
    all_embeddings, device, tokenizer, model, data = load_data()

    user_query = st.text_input("Введите цитату или автора:")
    n_results = st.slider("Количество результатов", min_value=1, max_value=20, value=5)

    # The button is always evaluated first so that it is always drawn. An
    # empty query is not searched, matching the FAISS search page.
    if not (st.button("Искать") and user_query):
        return

    start_time = time.time()
    top_books_indices, top_similarities = search_books(
        user_query, all_embeddings, device, tokenizer, model, data, n_results
    )
    search_time = time.time() - start_time
    st.write("Результаты поиска:")

    for idx, similarity in zip(top_books_indices, top_similarities):
        # The indices come from argsort over the embedding rows, i.e. they
        # are positions, so look the row up by position rather than by label.
        book = data.iloc[idx]

        annotation = book['annotation']
        if isinstance(annotation, str):
            # Shorten by word count: the preview is cut at 50 words, so the
            # ellipsis is only added when words were actually dropped.
            words = annotation.split()
            if len(words) > 50:
                annotation = ' '.join(words[:50]) + '...'
        else:
            # A missing annotation is not a string (e.g. NaN from the CSV);
            # show an empty description instead of failing on it.
            annotation = ''

        st.write('---')
        st.image(book['image_url'], width=250)
        st.write(f"**Название:** {book['title']}")
        st.write(f"**Автор:** {book['author']}")
        st.write(f"**Жанр:** {book['genre']}")
        st.write(f"**Описание:** {annotation}")
        st.write(f"**Косинусное сходство:** {similarity:.3f}")
        st.write(f'**Время поиска:** {search_time:.4f} секунд')
        st.markdown(f"[Читать подробнее]({book['page_url']})")
|
68 |
+
|
69 |
+
|
70 |
+
|
pages/rubert/embeddings.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:567061998a07f76e3290d80b1cebfd6945baf186a0d1c920bd1071f40e0d1e64
|
3 |
+
size 3544594
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
huggingface-hub
|
2 |
+
numpy
|
3 |
+
pandas
|
4 |
+
streamlit
|
5 |
+
tokenizers
|
6 |
+
torch
|
7 |
+
transformers
|
8 |
+
faiss-cpu
|
9 |
+
sentence_transformers
|
10 |
+
scipy
|
11 |
+
|