Spaces:
Sleeping
Sleeping
from functools import lru_cache
from pathlib import Path
from typing import List, Dict, Any

import numpy as np
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search
import streamlit as st
# Locations of the pre-computed artifacts. Both paths are relative, so they
# resolve against the working directory of the running process.

# Array of question records, read by load_question_data().
QUESTION_DATA_PATH = Path('data/processed/question_data.npy')

# Pre-computed embedding array, read by load_embeddings().
EMBEDDINGS_PATH = Path('data/embeddings/embeddings.npy')

# Model identifier handed to SentenceTransformer in load_model().
MODEL_PATH = 'sentence-transformers/all-MiniLM-L6-v2'
@lru_cache(maxsize=1)
def load_model():
    """
    Load the pre-trained SentenceTransformer model, reusing it across calls.

    The model is built from ``MODEL_PATH`` on the first call only. Later calls
    return the same instance, so callers that invoke this once per query
    (as ``find_similar_questions`` does) no longer rebuild the model each time.

    Returns
    -------
    SentenceTransformer
        The model instance constructed from ``MODEL_PATH``. The instance is
        shared between all callers within the process.
    """
    return SentenceTransformer(MODEL_PATH)
def load_embeddings():
    """
    Read the pre-computed embeddings from disk.

    Returns
    -------
    numpy.ndarray
        Whatever ``np.load`` yields for the file at ``EMBEDDINGS_PATH``.
        The file is re-read on every call.
    """
    return np.load(EMBEDDINGS_PATH)
def load_question_data():
    """
    Read the question data from disk.

    Returns
    -------
    numpy.ndarray
        The array stored at ``QUESTION_DATA_PATH``. The file is re-read on
        every call.

    Notes
    -----
    ``allow_pickle=True`` lets ``np.load`` unpickle object arrays, which is
    presumably needed because the questions are stored as Python objects
    (TODO confirm against the script that writes the file). Unpickling can
    execute arbitrary code, so this file must come from a trusted source.
    """
    return np.load(QUESTION_DATA_PATH, allow_pickle=True)
def find_similar_questions(text_input: str, k: int) -> List[List[Dict[str, Any]]]:
    """
    Run a semantic search for a text input against the pre-computed embeddings.

    The input is encoded with the model from ``load_model`` and compared with
    the embeddings from ``load_embeddings`` via ``semantic_search``.

    Parameters
    ----------
    text_input : str
        The input text for which similar questions are to be found.
    k : int
        Number of hits to request; forwarded to ``semantic_search`` as ``top_k``.

    Returns
    -------
    List[List[Dict[str, Any]]]
        The unmodified ``semantic_search`` result: a list of hit lists.
        Callers in this file read the hits for ``text_input`` from index 0.
        Each hit is a dictionary with the following keys:
        - 'corpus_id': int
            Index of the matching entry in the embeddings array. This is an
            index, not the question text; callers use it to look the question
            up in the question data.
        - 'score': float
            The similarity score reported by ``semantic_search``.
    """
    model = load_model()
    corpus_embeddings = load_embeddings()
    query_embedding = model.encode(text_input)
    return semantic_search(query_embedding, corpus_embeddings, top_k=k)
def get_similar_questions_with_score(text_input: str, k=5) -> List[Dict[str, Any]]:
    """
    Return the questions most similar to a text input, with their scores.

    Parameters
    ----------
    text_input : str
        The input text for which similar questions are to be retrieved.
    k : int, optional (default=5)
        The number of similar questions to request.

    Returns
    -------
    List[Dict[str, Any]]
        One dictionary per hit, in the order returned by
        ``find_similar_questions``. Each dictionary has the following keys:
        - 'question':
            The entry of the question data selected by the hit's 'corpus_id'.
        - 'similarity_score': float
            The hit's score multiplied by 100 and rounded to one decimal place.

    Notes
    -----
    Only the first hit list returned by ``find_similar_questions`` is used.
    The question data is indexed with the list of 'corpus_id' values, so it
    is assumed to be aligned entry-for-entry with the embeddings array
    (TODO confirm against the code that produces both files).
    """
    search_results = find_similar_questions(text_input, k)
    all_questions = load_question_data()

    top_hits = search_results[0]
    # Index with the full id list at once to select every matched entry.
    matched_questions = all_questions[[hit['corpus_id'] for hit in top_hits]]

    results = []
    for question, hit in zip(matched_questions, top_hits):
        results.append({
            'question': question,
            'similarity_score': round(hit['score'] * 100, 1),
        })
    return results