Kamil Pytlak committed on
Commit
2c077c2
β€’
1 Parent(s): 4690597

Initial commit

Browse files
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import streamlit as st

from utils import get_similar_questions_with_score

# Logo shown at the top of the page.
LOGO_PATH = 'img/logo.jpg'

st.image(LOGO_PATH, width=200)

st.title('SF Seeker')

st.markdown("""
Sci-Fi Stack Exchange Seeker (aka SF Seeker) is an AI assistant that helps you write better questions and search for
semantically similar questions on Sci-Fi Stack Exchange (https://scifi.stackexchange.com/). An all-MiniLM-L6-v2
language model (transformer) was used.

**Features**
- 🔎 Based on a database of 71,013 questions, it searches for the most semantically similar questions to the one entered
by the user. This supports the process of finding the same/similar questions already asked and prevents the creation of
duplicate threads.
- 👨‍⚕️ [IN PROGRESS] Indicates words in a question that have a negative and positive effect on the chance of
getting an answer. It supports the process of arranging more precise questions. A model based on gradient
reinforcement learned using TF-IDF features was used.
""")

# User inputs: the free-text question and how many similar questions to fetch.
question_input = st.text_area('Question')
k_similar_questions = st.number_input('k similar questions', min_value=1, max_value=100, value=5, step=1)

if st.button('Submit'):
    if not question_input:
        st.warning('⚠️ No question inputted!')
    else:
        # Look up the k most similar corpus questions and show them as a table.
        question_score_results = get_similar_questions_with_score(question_input, k_similar_questions)
        question_score_results_df = pd.DataFrame(question_score_results)
        question_score_results_df.columns = ['Question', 'Similarity score (in %)']

        st.dataframe(question_score_results_df)
data/embeddings/embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8ba0ea61f6a0bb40b3b0c4060315da6daf7140a440705af2a33f74963b22ac7
3
+ size 109076096
data/processed/question_data.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b58266ad0d4798f220c11b7fa43b5088c83e4c0ae0b6dc549c2be480b91c4f3
3
+ size 54420125
img/logo.jpg ADDED
utils.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path
from typing import Any, Dict, List

import numpy as np
import streamlit as st
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search

# On-disk artefacts produced offline, plus the Hugging Face model identifier.
QUESTION_DATA_PATH = Path('data/processed/question_data.npy')
EMBEDDINGS_PATH = Path('data/embeddings/embeddings.npy')
MODEL_PATH = 'sentence-transformers/all-MiniLM-L6-v2'
12
+
13
+
14
@st.cache_resource
def load_model():
    """Return the SentenceTransformer identified by MODEL_PATH.

    Decorated with ``st.cache_resource`` so the expensive model download/load
    happens only once per Streamlit process.

    Returns
    -------
    SentenceTransformer
        The loaded sentence-embedding model.
    """
    return SentenceTransformer(MODEL_PATH)
26
+
27
+
28
@st.cache_data
def load_embeddings():
    """Return the pre-computed question embeddings stored at EMBEDDINGS_PATH.

    Cached with ``st.cache_data`` so the .npy file is only read from disk once.

    Returns
    -------
    numpy.ndarray
        The embedding matrix for the question corpus.
    """
    return np.load(EMBEDDINGS_PATH)
40
+
41
+
42
@st.cache_data
def load_question_data():
    """Return the question corpus stored at QUESTION_DATA_PATH.

    ``allow_pickle=True`` is needed because the array holds Python objects
    (question texts) rather than plain numeric data. Cached with
    ``st.cache_data`` so the file is read only once.

    Returns
    -------
    numpy.ndarray
        The array of question data.
    """
    return np.load(QUESTION_DATA_PATH, allow_pickle=True)
54
+
55
+
56
def find_similar_questions(text_input: str, k: int) -> List[List[Dict[str, Any]]]:
    """
    Find the ``k`` corpus questions most semantically similar to a text input.

    Parameters:
    ----------
    text_input : str
        The input text for which similar questions are to be found.
    k : int
        The number of similar questions to retrieve.

    Returns:
    -------
    List[List[Dict[str, Any]]]
        The raw ``semantic_search`` result: one inner list per query (a single
        query here), sorted by decreasing similarity. Each dictionary has the
        following keys:
        - 'corpus_id': int
            The index of the matched question in the embeddings/corpus array.
        - 'score': float
            The cosine-similarity score between the input text and the match.
    """
    model = load_model()
    embeddings = load_embeddings()
    # Encode the query into the same vector space as the corpus embeddings.
    text_input_vectorized = model.encode(text_input)
    similar_questions = semantic_search(text_input_vectorized, embeddings, top_k=k)
    return similar_questions
82
+
83
+
84
def get_similar_questions_with_score(text_input: str, k: int = 5) -> List[Dict[str, Any]]:
    """
    Retrieve similar questions to a given text input along with their similarity scores.

    Parameters:
    ----------
    text_input : str
        The input text for which similar questions are to be retrieved.
    k : int, optional (default=5)
        The number of similar questions to retrieve.

    Returns:
    -------
    List[Dict[str, Any]]
        A list of dictionaries representing similar questions and their similarity scores.
        Each dictionary has the following keys:
        - 'question': str
            The text of the similar question.
        - 'similarity_score': float
            The similarity score expressed as a percentage, rounded to one
            decimal place.

    Notes:
    -----
    Delegates the nearest-neighbour lookup to ``find_similar_questions`` and
    maps the returned ``corpus_id`` indices back onto the question texts
    loaded by ``load_question_data``.
    """
    # Single query -> take the first (only) result list.
    hits = find_similar_questions(text_input, k)[0]

    question_data = load_question_data()

    corpus_ids = [hit['corpus_id'] for hit in hits]
    # Convert the raw similarity to a percentage with one decimal place.
    similarity_scores = [round(hit['score'] * 100, 1) for hit in hits]

    # Fancy-index the corpus array with the matched ids.
    similar_question_data = question_data[corpus_ids]

    results = [{'question': question, 'similarity_score': score}
               for question, score in zip(similar_question_data, similarity_scores)]

    return results