Kamil Pytlak committed on
Commit
2c077c2
β€’
1 Parent(s): 4690597

Initial commit

Browse files
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import streamlit as st

from utils import get_similar_questions_with_score

# Logo shown at the top of the page.
LOGO_PATH = 'img/logo.jpg'

st.image(LOGO_PATH, width=200)

st.title('SF Seeker')

st.markdown("""
Sci-Fi Stack Exchange Seeker (aka SF Seeker) is an AI assistant that helps you write better questions and search for
semantically similar questions on Sci-Fi Stack Exchange (https://scifi.stackexchange.com/). An all-MiniLM-L6-v2
language model (transformer) was used.

**Features**
- 🔎 Based on a database of 71,013 questions, it searches for the most semantically similar questions to the one entered
by the user. This supports the process of finding the same/similar questions already asked and prevents the creation of
duplicate threads.
- 👨‍⚕️ [IN PROGRESS] Indicates words in a question that have a negative and positive effect on the chance of
getting an answer. It supports the process of arranging more precise questions. A model based on gradient
reinforcement learned using TF-IDF features was used.
""")

# User inputs: the free-text question and how many similar questions to fetch.
question_input = st.text_area('Question')
k_similar_questions = st.number_input('k similar questions', min_value=1, max_value=100, value=5, step=1)

if st.button('Submit'):
    if not question_input:
        st.warning('⚠️ No question inputted!')
    else:
        # Look up the k most similar corpus questions and show them as a table.
        question_score_results = get_similar_questions_with_score(question_input, k_similar_questions)
        question_score_results_df = pd.DataFrame(question_score_results)
        question_score_results_df.columns = ['Question', 'Similarity score (in %)']

        st.dataframe(question_score_results_df)
data/embeddings/embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8ba0ea61f6a0bb40b3b0c4060315da6daf7140a440705af2a33f74963b22ac7
3
+ size 109076096
data/processed/question_data.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b58266ad0d4798f220c11b7fa43b5088c83e4c0ae0b6dc549c2be480b91c4f3
3
+ size 54420125
img/logo.jpg ADDED
utils.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path
from typing import Any, Dict, List

import numpy as np
import streamlit as st
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search

# On-disk artefacts produced offline, plus the Hugging Face model identifier.
QUESTION_DATA_PATH = Path('data/processed/question_data.npy')
EMBEDDINGS_PATH = Path('data/embeddings/embeddings.npy')
MODEL_PATH = 'sentence-transformers/all-MiniLM-L6-v2'
12
+
13
+
14
@st.cache_resource
def load_model():
    """Return the SentenceTransformer identified by MODEL_PATH.

    Decorated with ``st.cache_resource`` so the expensive model download/load
    happens only once per Streamlit process.

    Returns
    -------
    SentenceTransformer
        The loaded sentence-embedding model.
    """
    return SentenceTransformer(MODEL_PATH)
26
+
27
+
28
@st.cache_data
def load_embeddings():
    """Return the pre-computed question embeddings stored at EMBEDDINGS_PATH.

    Cached with ``st.cache_data`` so the .npy file is only read from disk once.

    Returns
    -------
    numpy.ndarray
        The embedding matrix for the question corpus.
    """
    return np.load(EMBEDDINGS_PATH)
40
+
41
+
42
@st.cache_data
def load_question_data():
    """Return the question corpus stored at QUESTION_DATA_PATH.

    ``allow_pickle=True`` is needed because the array holds Python objects
    (question texts) rather than plain numeric data. Cached with
    ``st.cache_data`` so the file is read only once.

    Returns
    -------
    numpy.ndarray
        The array of question data.
    """
    return np.load(QUESTION_DATA_PATH, allow_pickle=True)
54
+
55
+
56
def find_similar_questions(text_input: str, k: int) -> List[List[Dict[str, Any]]]:
    """
    Find the ``k`` corpus questions most semantically similar to a text input.

    Parameters:
    ----------
    text_input : str
        The input text for which similar questions are to be found.
    k : int
        The number of similar questions to retrieve.

    Returns:
    -------
    List[List[Dict[str, Any]]]
        The raw ``semantic_search`` result: one inner list per query (a single
        query here), sorted by decreasing similarity. Each dictionary has the
        following keys:
        - 'corpus_id': int
            The index of the matched question in the embeddings/corpus array.
        - 'score': float
            The cosine-similarity score between the input text and the match.
    """
    model = load_model()
    embeddings = load_embeddings()
    # Encode the query into the same vector space as the corpus embeddings.
    text_input_vectorized = model.encode(text_input)
    similar_questions = semantic_search(text_input_vectorized, embeddings, top_k=k)
    return similar_questions
82
+
83
+
84
def get_similar_questions_with_score(text_input: str, k: int = 5) -> List[Dict[str, Any]]:
    """
    Retrieve similar questions to a given text input along with their similarity scores.

    Parameters:
    ----------
    text_input : str
        The input text for which similar questions are to be retrieved.
    k : int, optional (default=5)
        The number of similar questions to retrieve.

    Returns:
    -------
    List[Dict[str, Any]]
        A list of dictionaries representing similar questions and their similarity scores.
        Each dictionary has the following keys:
        - 'question': str
            The text of the similar question.
        - 'similarity_score': float
            The similarity score expressed as a percentage, rounded to one
            decimal place.

    Notes:
    -----
    Delegates the nearest-neighbour lookup to ``find_similar_questions`` and
    maps the returned ``corpus_id`` indices back onto the question texts
    loaded by ``load_question_data``.
    """
    # Single query -> take the first (only) result list.
    hits = find_similar_questions(text_input, k)[0]

    question_data = load_question_data()

    corpus_ids = [hit['corpus_id'] for hit in hits]
    # Convert the raw similarity to a percentage with one decimal place.
    similarity_scores = [round(hit['score'] * 100, 1) for hit in hits]

    # Fancy-index the corpus array with the matched ids.
    similar_question_data = question_data[corpus_ids]

    results = [{'question': question, 'similarity_score': score}
               for question, score in zip(similar_question_data, similarity_scores)]

    return results