Spaces:

lazarr19
/

prompt-engine

Sleeping

prompt-engine / src /search_engine.py

Lazar Radojevic

update README

3556e6f 12 months ago

1.93 kB

	from typing import List, Sequence, Tuple

	from sentence_transformers import SentenceTransformer

	from src.similarity_scorer import SimilarityScorer
	from src.vectorizer import Vectorizer


	class PromptSearchEngine:
	    """Similarity search over a fixed corpus of prompts.

	    The corpus is embedded once at construction time; each query is
	    embedded on demand and ranked against the stored corpus embeddings.
	    """

	    def __init__(self, prompts: Sequence[str]) -> None:
	        """Initialize the search engine by vectorizing the prompt corpus.

	        The vectorized corpus is later used to find the top n prompts most
	        similar to a user's input prompt.

	        Args:
	            prompts: The sequence of raw prompts from the dataset.
	        """
	        self.vectorizer = Vectorizer(SentenceTransformer("all-MiniLM-L6-v2"))
	        self.scorer = SimilarityScorer()

	        self.prompts = prompts
	        self.embeddings = self.vectorizer.transform(prompts)

	    def most_similar(self, query: str, n: int = 5) -> List[Tuple[float, str]]:
	        """Return the top n most similar prompts from the corpus.

	        The query is vectorized with the engine's vectorizer and scored
	        against the stored corpus embeddings with the scorer's
	        ``cosine_similarity``; the n best-scoring prompts are returned.

	        Args:
	            query: The raw query prompt input from the user.
	            n: The number of similar prompts to return. Values of zero or
	                less yield an empty list; values larger than the corpus
	                yield the whole corpus.

	        Returns:
	            A list of ``(similarity, prompt)`` pairs ordered from most to
	            least similar. Prompts are returned verbatim.
	        """
	        # Guard first: ``[-n:]`` with n == 0 is ``[0:]`` (the *whole* array),
	        # and a negative n would slice off an arbitrary tail, so neither
	        # case may reach the slicing below.
	        if n <= 0:
	            return []

	        query_embedding = self.vectorizer.transform(query)

	        # NOTE(review): presumably a 1-D array-like with one score per
	        # corpus prompt -- confirm against SimilarityScorer.
	        similarities = self.scorer.cosine_similarity(query_embedding, self.embeddings)

	        # argsort is ascending, so take the last n indices and reverse them
	        # to get the best match first.
	        top_n_indices = similarities.argsort()[-n:][::-1]

	        # Pair every selected prompt with its similarity score.
	        return [(similarities[i], self.prompts[i]) for i in top_n_indices]