Veronika1101 committed on
Commit
5596798
1 Parent(s): 2284c00

Upload 10 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ pages/MiniLM/MiniLM_index.index filter=lfs diff=lfs merge=lfs -text
main.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pages.rubert.app_bert import ruBert_page
3
+ from pages.MiniLM.MiniLm_app import MiniLm_page
4
+
5
+ # st.markdown("""
6
+ # <style>
7
+ # /* Основной фон страницы */
8
+ # body {
9
+ # background-color: #50C878; /* Изумрудный цвет */
10
+ # }
11
+
12
+ # /* Фон основной панели, чтобы убрать несоответствующие цвета */
13
+ # .stApp {
14
+ # background-color: #50C878;
15
+ # }
16
+
17
+ # /* Фон и текст боковой панели */
18
+ # .stSidebar > div:first-child {
19
+ # background-color: #50C878; /* Изумрудный цвет для боковой панели */
20
+ # color: #FFFFFF; /* Белый цвет текста для контраста */
21
+ # }
22
+ # .stSidebar .sidebar-content {
23
+ # color: #FFFFFF;
24
+ # }
25
+
26
+ # /* Стиль заголовка боковой панели */
27
+ # .stSidebar .sidebar-content h1, .stSidebar .sidebar-content h2, .stSidebar .sidebar-content h3 {
28
+ # color: #FFFFFF;
29
+ # }
30
+
31
+ # /* Стиль кнопок */
32
+ # .stButton > button {
33
+ # background-color: #006400; /* Темно-изумрудный цвет кнопок */
34
+ # color: #FFFFFF; /* Белый текст на кнопках */
35
+ # }
36
+
37
+ # /* Стили текстового поля */
38
+ # .stTextInput input {
39
+ # background-color: #FFFFFF; /* Белый фон текстового поля */
40
+ # color: #006400; /* Темно-изумрудный текст */
41
+ # }
42
+
43
+ # /* Стили активной кнопки радио и чекбокса */
44
+ # .stRadio > label > div:first-of-type > div, .stCheckbox > label > div:first-of-type > div {
45
+ # background-color: #006400; /* Темно-изумрудный фон для активных кнопок радио и чекбоксов */
46
+ # border-color: #006400;
47
+ # }
48
+
49
+ # /* Стили слайдера */
50
+ # .stSlider > div > div > div > div {
51
+ # background-color: #006400; /* Темно-изумрудный цвет слайдера */
52
+ # }
53
+ # </style>
54
+ # """, unsafe_allow_html=True)
55
+
def app_description_page():
    """Render the "About Project" landing page (currently only a title)."""
    st.title("Welcome to My App!")
58
+
def main():
    """Draw the sidebar navigation and render the page the user selected."""
    st.sidebar.title("Book app")

    # Sidebar label -> function that renders the page; insertion order is
    # the order in which the radio options are displayed.
    renderers = {
        "About Project": app_description_page,
        "📚 Book search": ruBert_page,
        "🔍 Book search (faiss)": MiniLm_page,
    }
    page = st.sidebar.radio("Select page:", list(renderers))

    render = renderers.get(page)
    if render is not None:
        render()


if __name__ == "__main__":
    main()
pages/MiniLM/MiniLM_embeddings.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8b48f9594e9a78afbdd85f8acc801df16e30f90641e5e9d2c6b1b4dc66c65bf
3
+ size 7578787
pages/MiniLM/MiniLM_index.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a85188579f9f15ea1809ecfd0478c026c449b019775ffee3b8202d85ce787de8
3
+ size 7578669
pages/MiniLM/MiniLm_app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from sentence_transformers import SentenceTransformer
4
+ import faiss
5
+ import pickle
6
+ import time
7
+
# Load the data
@st.cache_resource
def load_data():
    """Load the book table, the FAISS index and the sentence encoder once.

    ``st.cache_resource`` is used instead of ``st.cache_data`` because the
    FAISS index and the SentenceTransformer model are heavyweight resources:
    ``cache_data`` serialises the return value and hands out a fresh copy on
    every call, which is the wrong contract for models and indexes.  The
    cached objects are shared between reruns and sessions, so callers must
    treat them as read-only.

    ``pages/MiniLM/MiniLM_embeddings.pkl`` is intentionally not read here:
    searching only needs the prebuilt FAISS index, and nothing consumed the
    unpickled embeddings.

    Returns:
        tuple: ``(data, index, embedder)`` where ``data`` is the DataFrame
        read from ``Data/books_data2.csv``, ``index`` is the FAISS index read
        from ``pages/MiniLM/MiniLM_index.index`` and ``embedder`` is the
        ``all-MiniLM-L6-v2`` SentenceTransformer.
    """
    data = pd.read_csv('Data/books_data2.csv')
    index = faiss.read_index('pages/MiniLM/MiniLM_index.index')
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    return data, index, embedder
18
+
# Search function
def search_books(embedder, index, data, query, n_results):
    """Return the books whose indexed vectors are nearest to ``query``.

    Args:
        embedder: Object with ``encode(list_of_str)`` returning a 2-D array
            of query vectors (a SentenceTransformer in this app).
        index: Object with ``search(vectors, k)`` returning
            ``(distances, labels)`` arrays of shape ``(1, k)`` (a FAISS
            index in this app).
        data: DataFrame whose row positions match the index labels and which
            has the columns ``page_url``, ``image_url``, ``genre``,
            ``title``, ``annotation`` and ``author``.
        query: Free-text search string.
        n_results: Number of neighbours to request from the index.

    Returns:
        list[dict]: One dict per hit, in the order returned by the index,
        with the keys ``Link``, ``Image``, ``Genre``, ``Title``,
        ``Description``, ``Author`` and ``Similarity``.  The list may be
        shorter than ``n_results`` when the index holds fewer vectors.
    """
    query_embedding = embedder.encode([query])
    distances, labels = index.search(query_embedding, n_results)

    result_books = []
    for distance, book_index in zip(distances[0], labels[0]):
        # FAISS pads the result with label -1 when it cannot find enough
        # neighbours; data.iloc[-1] would silently return the last row as a
        # bogus hit, so padded slots are skipped.
        if book_index < 0:
            continue

        book_info = data.iloc[book_index]
        result_books.append({
            'Link': book_info['page_url'],
            'Image': book_info['image_url'],
            'Genre': book_info['genre'],
            'Title': book_info['title'],
            'Description': book_info['annotation'],
            'Author': book_info['author'],
            # Map a distance in [0, inf) onto a score in (0, 1].
            'Similarity': 1 / (1 + distance),
        })

    return result_books
47
+
# Streamlit page for the FAISS-backed search
def _shorten_description(text, max_words=50):
    """Return ``text`` cut to ``max_words`` words, adding '...' only if cut."""
    if not isinstance(text, str):
        # An empty CSV cell is read by pandas as NaN (a float), not a string.
        return ''
    words = text.split()
    if len(words) <= max_words:
        return text
    return ' '.join(words[:max_words]) + '...'


def MiniLm_page():
    """Render the MiniLM/FAISS book-search page.

    Shows a query box and a result-count slider; when the search button is
    pressed with a non-empty query, runs ``search_books`` and renders one
    card per hit.  The elapsed search time is shown once, above the results.
    """
    st.title('Поиск книг')
    data, index, embedder = load_data()
    query = st.text_input('Введите цитату или автора:')
    n_results = st.slider("Количество результатов", min_value=1, max_value=20, value=5)

    if not st.button("Искать"):
        return
    if not query.strip():
        # Tell the user why nothing happened instead of silently ignoring
        # the click.
        st.warning('Введите запрос для поиска.')
        return

    started = time.perf_counter()
    result_books = search_books(embedder, index, data, query, n_results)
    search_time = time.perf_counter() - started

    st.write("Результаты поиска:")
    # The timing describes the whole query, so it is reported once rather
    # than repeated under every book.
    st.write(f'**Время поиска:** {search_time:.4f} секунд')

    for book in result_books:
        st.write('---')
        st.image(book['Image'], width=250)
        st.write(f"**Название:** {book['Title']}")
        st.write(f"**Автор:** {book['Author']}")
        st.write(f"**Жанр:** {book['Genre']}")
        st.write(f"**Описание:** {_shorten_description(book['Description'])}")
        st.write(f"**Сходство:** {book['Similarity']:.2f}")
        st.write(f"[Читать подробнее]({book['Link']})")
        st.text("")
pages/MiniLM/__pycache__/MiniLm_app.cpython-311.pyc ADDED
Binary file (4.78 kB). View file
 
pages/rubert/__pycache__/app_bert.cpython-311.pyc ADDED
Binary file (5.82 kB). View file
 
pages/rubert/app_bert.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import torch
4
+ from transformers import AutoTokenizer, AutoModel
5
+ from scipy.spatial.distance import cosine
6
+ import pandas as pd
7
+ import time
8
+
@st.cache_resource
def load_data():
    """Load the book embeddings, the ruBERT model and the book table once.

    ``st.cache_resource`` is used instead of ``st.cache_data`` because the
    tokenizer and the torch model are heavyweight resources: ``cache_data``
    serialises the return value and hands out a fresh copy on every call.
    The cached objects are shared between reruns and sessions, so callers
    must treat them as read-only.

    Returns:
        tuple: ``(all_embeddings, device, tokenizer, model, data)`` where
        ``all_embeddings`` is the ``embeddings`` array stored in
        ``pages/rubert/embeddings.npz``, ``device`` is CUDA when available
        and CPU otherwise, ``tokenizer``/``model`` are the
        ``cointegrated/rubert-tiny2`` checkpoint (model moved to ``device``)
        and ``data`` is the DataFrame read from ``Data/books_data2.csv``.
    """
    # np.load on an .npz returns a lazily-read archive that holds an open
    # file handle; read the array inside a context manager so it is closed.
    with np.load('pages/rubert/embeddings.npz') as archive:
        all_embeddings = archive['embeddings']

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
    model = AutoModel.from_pretrained("cointegrated/rubert-tiny2").to(device)
    data = pd.read_csv('Data/books_data2.csv')
    return all_embeddings, device, tokenizer, model, data
20
+
# Compute the embedding of a piece of text
def embed_text(text, tokenizer, model, device):
    """Encode ``text`` with ``model`` and return a single 1-D embedding.

    The text is tokenised (padded, truncated to 128 tokens), run through the
    model without gradient tracking, and the hidden state at sequence
    position 0 of every batch item is taken (the [CLS] slot for BERT-style
    tokenizers).  Those vectors are averaged over the batch axis.

    Args:
        text: String (or batch of strings) accepted by ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``.
        model: Callable returning an object with ``last_hidden_state``.
        device: Torch device the model lives on.

    Returns:
        numpy.ndarray: The batch-averaged position-0 hidden state.
    """
    batch = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
    ).to(device)

    with torch.no_grad():
        outputs = model(**batch)

    first_token_states = outputs.last_hidden_state[:, 0, :]
    return np.mean(first_token_states.cpu().numpy(), axis=0)
29
+
# Search function
def search_books(user_query, all_embeddings, device, tokenizer, model, data, n_results):
    """Rank all books by cosine similarity to ``user_query``.

    Args:
        user_query: Free-text search string.
        all_embeddings: 2-D array with one book embedding per row.
        device, tokenizer, model: Forwarded to ``embed_text``.
        data: Unused here; kept so existing callers keep working.
        n_results: Maximum number of hits to return.

    Returns:
        tuple: ``(top_results_indices, top_similarities)`` - row positions of
        the best-matching books, most similar first, and their cosine
        similarities in the same order.
    """
    query_embedding = embed_text(user_query, tokenizer, model, device)

    # One matrix-vector product replaces a Python loop that called scipy's
    # cosine() once per book.
    book_matrix = np.asarray(all_embeddings)
    dots = book_matrix @ query_embedding
    norms = np.linalg.norm(book_matrix, axis=1) * np.linalg.norm(query_embedding)

    # A zero-norm vector has no direction; score it 0 instead of NaN, which
    # would otherwise sort to the top of the reversed ranking.
    similarities = np.divide(
        dots,
        norms,
        out=np.zeros(dots.shape, dtype=float),
        where=norms > 0,
    )

    # Sort once and reuse the ordering for both return values.
    top_results_indices = np.argsort(similarities)[::-1][:n_results]
    top_similarities = similarities[top_results_indices]

    return top_results_indices, top_similarities
39
+
# Streamlit page for the ruBERT-backed search
def _shorten_annotation(text, max_words=50):
    """Return ``text`` cut to ``max_words`` words, adding '...' only if cut."""
    if not isinstance(text, str):
        # An empty CSV cell is read by pandas as NaN (a float), not a string.
        return ''
    words = text.split()
    if len(words) <= max_words:
        return text
    return ' '.join(words[:max_words]) + '...'


def ruBert_page():
    """Render the ruBERT book-search page.

    Shows a query box and a result-count slider; when the search button is
    pressed with a non-empty query, runs ``search_books`` and renders one
    card per hit.  The elapsed search time is shown once, above the results.
    """
    st.title("Поиск книг")
    all_embeddings, device, tokenizer, model, data = load_data()

    user_query = st.text_input("Введите цитату или автора:")
    n_results = st.slider("Количество результатов", min_value=1, max_value=20, value=5)

    if not st.button("Искать"):
        return
    if not user_query.strip():
        # Embedding an empty string would produce a meaningless ranking.
        st.warning("Введите запрос для поиска.")
        return

    started = time.perf_counter()
    top_books_indices, top_similarities = search_books(
        user_query, all_embeddings, device, tokenizer, model, data, n_results
    )
    search_time = time.perf_counter() - started

    st.write("Результаты поиска:")
    # The timing describes the whole query, so it is reported once rather
    # than repeated under every book.
    st.write(f'**Время поиска:** {search_time:.4f} секунд')

    for idx, similarity in zip(top_books_indices, top_similarities):
        # search_books returns row positions in the embedding matrix, so the
        # lookup must be positional (.iloc), not label-based (.loc).
        book = data.iloc[idx]
        st.write('---')
        st.image(book['image_url'], width=250)
        st.write(f"**Название:** {book['title']}")
        st.write(f"**Автор:** {book['author']}")
        st.write(f"**Жанр:** {book['genre']}")
        st.write(f"**Описание:** {_shorten_annotation(book['annotation'])}")
        st.write(f"**Косинусное сходство:** {similarity:.3f}")
        st.markdown(f"[Читать подробнее]({book['page_url']})")
68
+
69
+
70
+
pages/rubert/embeddings.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:567061998a07f76e3290d80b1cebfd6945baf186a0d1c920bd1071f40e0d1e64
3
+ size 3544594
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ huggingface-hub
2
+ numpy
3
+ pandas
4
+ streamlit
5
+ tokenizers
6
+ torch
7
+ transformers
8
+ faiss-cpu
9
+ sentence_transformers
10
+ scipy
11
+