Spaces:

kamil-pytlak
/

SFSeeker

Sleeping

SFSeeker / utils.py

Kamil Pytlak

Initial commit

2c077c2 over 1 year ago

3.89 kB

	from pathlib import Path
	from typing import List, Dict, Any

	import numpy as np
	from sentence_transformers import SentenceTransformer
	from sentence_transformers.util import semantic_search
	import streamlit as st

	# Location of the question array read by load_question_data() via np.load.
	# The path is relative, so it is resolved against the current working directory.
	QUESTION_DATA_PATH = Path('data/processed/question_data.npy')
	# Location of the pre-computed embedding array read by load_embeddings();
	# also a relative path.
	EMBEDDINGS_PATH = Path('data/embeddings/embeddings.npy')
	# Identifier passed to SentenceTransformer() in load_model().
	# NOTE(review): presumably a Hugging Face Hub model id rather than a local
	# directory - confirm.
	MODEL_PATH = 'sentence-transformers/all-MiniLM-L6-v2'


	@st.cache_resource
	def load_model():
	    """
	    Build the SentenceTransformer used to encode query text.

	    The function is wrapped in ``st.cache_resource`` so that Streamlit can
	    reuse the constructed model rather than building it again on each call.

	    Returns:
	    -------
	    SentenceTransformer
	        The model instantiated from ``MODEL_PATH``.
	    """
	    return SentenceTransformer(MODEL_PATH)


	@st.cache_data
	def load_embeddings():
	    """
	    Read the pre-computed embeddings from disk.

	    The function is wrapped in ``st.cache_data`` so that Streamlit caches
	    the loaded array.

	    Returns:
	    -------
	    numpy.ndarray
	        The array stored in the file at ``EMBEDDINGS_PATH``.
	    """
	    return np.load(EMBEDDINGS_PATH)


	@st.cache_data
	def load_question_data():
	    """
	    Read the question data from disk.

	    The function is wrapped in ``st.cache_data`` so that Streamlit caches
	    the loaded array.

	    Returns:
	    -------
	    numpy.ndarray
	        The array stored in the file at ``QUESTION_DATA_PATH``.
	    """
	    # allow_pickle=True lets NumPy unpickle object arrays; unpickling can run
	    # arbitrary code, so this file must come from a trusted source.
	    return np.load(QUESTION_DATA_PATH, allow_pickle=True)


	def find_similar_questions(text_input: str, k: int) -> List[List[Dict[str, Any]]]:
	    """
	    Run a semantic search for the stored questions closest to a query text.

	    The query is encoded with the model from ``load_model`` and compared
	    against the embeddings from ``load_embeddings`` using
	    ``sentence_transformers.util.semantic_search``.

	    Parameters:
	    ----------
	    text_input : str
	        The query text to encode and search with.
	    k : int
	        Passed to ``semantic_search`` as ``top_k``.

	    Returns:
	    -------
	    List[List[Dict[str, Any]]]
	        The unmodified result of ``semantic_search``: a list of hit lists.
	        ``get_similar_questions_with_score`` reads only the first inner list
	        and these keys from each hit dictionary:
	        - 'corpus_id'
	            Used there as an index into the question data.
	        - 'score'
	            The similarity score of the hit.
	    """
	    encoder = load_model()
	    corpus_embeddings = load_embeddings()
	    return semantic_search(encoder.encode(text_input), corpus_embeddings, top_k=k)


	def get_similar_questions_with_score(text_input: str, k=5) -> List[Dict[str, Any]]:
	    """
	    Return the questions most similar to a query text, each with a score.

	    Parameters:
	    ----------
	    text_input : str
	        The query text for which similar questions are looked up.
	    k : int, optional (default=5)
	        The number of hits requested from ``find_similar_questions``.

	    Returns:
	    -------
	    List[Dict[str, Any]]
	        One dictionary per hit, in the order the search returned them.
	        Each dictionary has the following keys:
	        - 'question'
	            The entry of the question data selected by the hit's 'corpus_id'.
	        - 'similarity_score': float
	            The hit's 'score' multiplied by 100 and rounded to one decimal place.

	    Notes:
	    -----
	    Only the first inner list returned by ``find_similar_questions`` is used.
	    """
	    search_results = find_similar_questions(text_input, k)
	    all_questions = load_question_data()
	    hits = search_results[0]

	    # Gather the row indices and the percentage-scaled scores of the hits.
	    row_ids = []
	    percentages = []
	    for hit in hits:
	        row_ids.append(hit['corpus_id'])
	    for hit in hits:
	        percentages.append(round(hit['score'] * 100, 1))

	    # Select every matching row with a single indexing operation.
	    matched_questions = all_questions[row_ids]

	    results = []
	    for question, score in zip(matched_questions, percentages):
	        results.append({'question': question, 'similarity_score': score})
	    return results