Spaces:
Sleeping
Sleeping
Kamil Pytlak
committed on
Commit
β’
2c077c2
1
Parent(s):
4690597
Initial commit
Browse files- app.py +37 -0
- data/embeddings/embeddings.npy +3 -0
- data/processed/question_data.npy +3 -0
- img/logo.jpg +0 -0
- utils.py +122 -0
app.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
from utils import get_similar_questions_with_score
|
5 |
+
|
6 |
+
LOGO_PATH = 'img/logo.jpg'
|
7 |
+
|
8 |
+
st.image(LOGO_PATH, width=200)
|
9 |
+
|
10 |
+
st.title('SF Seeker')
|
11 |
+
|
12 |
+
st.markdown("""
|
13 |
+
Sci-Fi Stack Exchange Seeker (aka SF Seeker) is an AI assistant that helps you write better questions and search for
|
14 |
+
semantically similar questions on Sci-Fi Stack Exchange (https://scifi.stackexchange.com/). An all-MiniLM-L6-v2
|
15 |
+
language model (transformer) was used.
|
16 |
+
|
17 |
+
**Features**
|
18 |
+
- π Based on a database of 71,013 questions, it searches for the most semantically similar questions to the one entered
|
19 |
+
by the user. This supports the process of fiding the same/similar questions already asked and prevents the creation of
|
20 |
+
duplicate threads.
|
21 |
+
- π¨ββοΈ [IN PROGRESS] Indicates words in a question that have a negative and positive effect on the chance of
|
22 |
+
getting an answer. It supports the process of arranging more precise questions. A model based on gradient
|
23 |
+
reinforcement learned using TF-IDF features was used.
|
24 |
+
""")
|
25 |
+
|
26 |
+
question_input = st.text_area('Question')
|
27 |
+
k_similar_questions = st.number_input('k similar questions', min_value=1, max_value=100, value=5, step=1)
|
28 |
+
|
29 |
+
if st.button('Submit'):
|
30 |
+
if not question_input:
|
31 |
+
st.warning('β οΈ No question inputted!')
|
32 |
+
else:
|
33 |
+
question_score_results = get_similar_questions_with_score(question_input, k_similar_questions)
|
34 |
+
question_score_results_df = pd.DataFrame(question_score_results)
|
35 |
+
question_score_results_df.columns = ['Question', 'Similarity score (in %)']
|
36 |
+
|
37 |
+
st.dataframe(question_score_results_df)
|
data/embeddings/embeddings.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f8ba0ea61f6a0bb40b3b0c4060315da6daf7140a440705af2a33f74963b22ac7
|
3 |
+
size 109076096
|
data/processed/question_data.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8b58266ad0d4798f220c11b7fa43b5088c83e4c0ae0b6dc549c2be480b91c4f3
|
3 |
+
size 54420125
|
img/logo.jpg
ADDED
utils.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from typing import List, Dict, Any
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
from sentence_transformers import SentenceTransformer
|
6 |
+
from sentence_transformers.util import semantic_search
|
7 |
+
import streamlit as st
|
8 |
+
|
9 |
+
QUESTION_DATA_PATH = Path('data/processed/question_data.npy')
|
10 |
+
EMBEDDINGS_PATH = Path('data/embeddings/embeddings.npy')
|
11 |
+
MODEL_PATH = 'sentence-transformers/all-MiniLM-L6-v2'
|
12 |
+
|
13 |
+
|
14 |
+
@st.cache_resource
|
15 |
+
def load_model():
|
16 |
+
"""
|
17 |
+
Load a pre-trained SentenceTransformer model.
|
18 |
+
|
19 |
+
Returns:
|
20 |
+
-------
|
21 |
+
SentenceTransformer
|
22 |
+
A pre-trained SentenceTransformer model loaded from the specified MODEL_PATH.
|
23 |
+
"""
|
24 |
+
model = SentenceTransformer(MODEL_PATH)
|
25 |
+
return model
|
26 |
+
|
27 |
+
|
28 |
+
@st.cache_data
|
29 |
+
def load_embeddings():
|
30 |
+
"""
|
31 |
+
Load pre-computed embeddings from a file.
|
32 |
+
|
33 |
+
Returns:
|
34 |
+
-------
|
35 |
+
numpy.ndarray
|
36 |
+
A NumPy array containing pre-computed embeddings loaded from the specified EMBEDDINGS_PATH.
|
37 |
+
"""
|
38 |
+
embeddings = np.load(EMBEDDINGS_PATH)
|
39 |
+
return embeddings
|
40 |
+
|
41 |
+
|
42 |
+
@st.cache_data
|
43 |
+
def load_question_data():
|
44 |
+
"""
|
45 |
+
Load question data from a file.
|
46 |
+
|
47 |
+
Returns:
|
48 |
+
-------
|
49 |
+
numpy.ndarray
|
50 |
+
A NumPy array containing question data loaded from the specified QUESTION_DATA_PATH.
|
51 |
+
"""
|
52 |
+
question_data = np.load(QUESTION_DATA_PATH, allow_pickle=True)
|
53 |
+
return question_data
|
54 |
+
|
55 |
+
|
56 |
+
def find_similar_questions(text_input: str, k: int) -> List[List[Dict[str, Any]]]:
|
57 |
+
"""
|
58 |
+
Find similar questions to a given text input using pre-trained embeddings and a semantic search model.
|
59 |
+
|
60 |
+
Parameters:
|
61 |
+
----------
|
62 |
+
text_input : str
|
63 |
+
The input text for which similar questions are to be found.
|
64 |
+
k : int
|
65 |
+
The number of similar questions to retrieve.
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
-------
|
69 |
+
List[List[Dict[str, Any]]]
|
70 |
+
A list of lists, where each inner list contains dictionaries representing similar questions.
|
71 |
+
Each dictionary has the following keys:
|
72 |
+
- 'question': str
|
73 |
+
The text of the similar question.
|
74 |
+
- 'score': float
|
75 |
+
The similarity score between the input text and the similar question.
|
76 |
+
"""
|
77 |
+
model = load_model()
|
78 |
+
embeddings = load_embeddings()
|
79 |
+
text_input_vectorized = model.encode(text_input)
|
80 |
+
similar_questions = semantic_search(text_input_vectorized, embeddings, top_k=k)
|
81 |
+
return similar_questions
|
82 |
+
|
83 |
+
|
84 |
+
def get_similar_questions_with_score(text_input: str, k=5) -> List[Dict[str, Any]]:
|
85 |
+
"""
|
86 |
+
Retrieve similar questions to a given text input along with their similarity scores.
|
87 |
+
|
88 |
+
Parameters:
|
89 |
+
----------
|
90 |
+
text_input : str
|
91 |
+
The input text for which similar questions are to be retrieved.
|
92 |
+
k : int, optional (default=5)
|
93 |
+
The number of similar questions to retrieve. Default is 5.
|
94 |
+
|
95 |
+
Returns:
|
96 |
+
-------
|
97 |
+
List[Dict[str, Any]]
|
98 |
+
A list of dictionaries representing similar questions and their similarity scores.
|
99 |
+
Each dictionary has the following keys:
|
100 |
+
- 'question': str
|
101 |
+
The text of the similar question.
|
102 |
+
- 'similarity_score': float
|
103 |
+
The similarity score between the input text and the similar question, rounded to one decimal place.
|
104 |
+
|
105 |
+
Notes:
|
106 |
+
-----
|
107 |
+
This function uses the `find_similar_questions` function to retrieve similar questions to the input text.
|
108 |
+
It also retrieves the corresponding similarity scores and returns the results as a list of dictionaries.
|
109 |
+
"""
|
110 |
+
similar_questions = find_similar_questions(text_input, k)
|
111 |
+
|
112 |
+
question_data = load_question_data()
|
113 |
+
|
114 |
+
corpus_ids = [item['corpus_id'] for item in similar_questions[0]]
|
115 |
+
similarity_scores = [round(item['score'] * 100, 1) for item in similar_questions[0]]
|
116 |
+
|
117 |
+
similar_question_data = question_data[corpus_ids]
|
118 |
+
|
119 |
+
results = [{'question': question, 'similarity_score': score}
|
120 |
+
for question, score in zip(similar_question_data, similarity_scores)]
|
121 |
+
|
122 |
+
return results
|