from pathlib import Path from typing import List, Dict, Any import numpy as np from sentence_transformers import SentenceTransformer from sentence_transformers.util import semantic_search import streamlit as st QUESTION_DATA_PATH = Path('data/processed/question_data.npy') EMBEDDINGS_PATH = Path('data/embeddings/embeddings.npy') MODEL_PATH = 'sentence-transformers/all-MiniLM-L6-v2' @st.cache_resource def load_model(): """ Load a pre-trained SentenceTransformer model. Returns: ------- SentenceTransformer A pre-trained SentenceTransformer model loaded from the specified MODEL_PATH. """ model = SentenceTransformer(MODEL_PATH) return model @st.cache_data def load_embeddings(): """ Load pre-computed embeddings from a file. Returns: ------- numpy.ndarray A NumPy array containing pre-computed embeddings loaded from the specified EMBEDDINGS_PATH. """ embeddings = np.load(EMBEDDINGS_PATH) return embeddings @st.cache_data def load_question_data(): """ Load question data from a file. Returns: ------- numpy.ndarray A NumPy array containing question data loaded from the specified QUESTION_DATA_PATH. """ question_data = np.load(QUESTION_DATA_PATH, allow_pickle=True) return question_data def find_similar_questions(text_input: str, k: int) -> List[List[Dict[str, Any]]]: """ Find similar questions to a given text input using pre-trained embeddings and a semantic search model. Parameters: ---------- text_input : str The input text for which similar questions are to be found. k : int The number of similar questions to retrieve. Returns: ------- List[List[Dict[str, Any]]] A list of lists, where each inner list contains dictionaries representing similar questions. Each dictionary has the following keys: - 'question': str The text of the similar question. - 'score': float The similarity score between the input text and the similar question. """ model = load_model() embeddings = load_embeddings() text_input_vectorized = model.encode(text_input) similar_questions = semantic_search(text_input_vectorized, embeddings, top_k=k) return similar_questions def get_similar_questions_with_score(text_input: str, k=5) -> List[Dict[str, Any]]: """ Retrieve similar questions to a given text input along with their similarity scores. Parameters: ---------- text_input : str The input text for which similar questions are to be retrieved. k : int, optional (default=5) The number of similar questions to retrieve. Default is 5. Returns: ------- List[Dict[str, Any]] A list of dictionaries representing similar questions and their similarity scores. Each dictionary has the following keys: - 'question': str The text of the similar question. - 'similarity_score': float The similarity score between the input text and the similar question, rounded to one decimal place. Notes: ----- This function uses the `find_similar_questions` function to retrieve similar questions to the input text. It also retrieves the corresponding similarity scores and returns the results as a list of dictionaries. """ similar_questions = find_similar_questions(text_input, k) question_data = load_question_data() corpus_ids = [item['corpus_id'] for item in similar_questions[0]] similarity_scores = [round(item['score'] * 100, 1) for item in similar_questions[0]] similar_question_data = question_data[corpus_ids] results = [{'question': question, 'similarity_score': score} for question, score in zip(similar_question_data, similarity_scores)] return results