search_engine / ir.py
InkarSaleshovaSDU's picture
validated ir.py
7224dc5
raw
history blame
1.24 kB
from gensim.models import KeyedVectors
from gensim.utils import simple_preprocess
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
class SearchEngine:
    """Nearest-prompt lookup over averaged word embeddings.

    Every stored prompt is represented by the mean of the embedding vectors
    of its in-vocabulary tokens.  A query is embedded the same way and
    matched to the stored prompt with the highest cosine similarity.
    """

    def __init__(self, model_api_path='glove-wiki-gigaword-100'):
        """Load the embedding model and start with an empty index.

        Args:
            model_api_path: Model name handed to ``gensim.downloader.load``.
        """
        self.model = api.load(model_api_path)
        # Parallel lists: self.vectors[i] is the embedding of self.prompts[i].
        self.prompts = []
        self.vectors = []

    def vectorize(self, prompt):
        """Return the mean embedding of the prompt's in-vocabulary tokens.

        Args:
            prompt: Raw text; it is tokenised with ``simple_preprocess``.

        Returns:
            The element-wise mean of the token vectors found in the model.

        Raises:
            ValueError: If none of the prompt's tokens is in the model.
                Averaging an empty list would otherwise produce a scalar NaN
                that corrupts the index and every later similarity lookup.
        """
        tokens = simple_preprocess(prompt)
        known = [self.model[token] for token in tokens if token in self.model]
        if not known:
            raise ValueError(
                f"no in-vocabulary words found in prompt: {prompt!r}"
            )
        return np.mean(known, axis=0)

    def add(self, prompt):
        """Embed ``prompt`` and append it to the index.

        Raises:
            ValueError: Propagated from ``vectorize`` when the prompt has no
                in-vocabulary tokens; the index is left unchanged.
        """
        # Vectorize before touching either list so that a failing prompt
        # cannot leave prompts and vectors out of step.
        vector = self.vectorize(prompt)
        self.prompts.append(prompt)
        self.vectors.append(vector)

    def search(self, input_prompt):
        """Return the stored prompt most similar to ``input_prompt``.

        Args:
            input_prompt: Query text.

        Returns:
            The stored prompt whose embedding has the highest cosine
            similarity to the query embedding (first one wins on ties).

        Raises:
            ValueError: If the index is empty, or if the query contains no
                in-vocabulary tokens.
        """
        # Guard first: there is nothing to compare against, and the pairwise
        # similarity call would fail with an unrelated-looking shape error.
        if not self.prompts:
            raise ValueError("cannot search an empty index; call add() first")
        query_vector = self.vectorize(input_prompt)
        scores = cosine_similarity([query_vector], self.vectors)[0]
        return self.prompts[int(np.argmax(scores))]
if __name__ == "__main__":
    # Small demo: index a few sentences, then look up the closest one.
    engine = SearchEngine()
    for sample in (
        "I love programming",
        "You need to graduate",
        "Library is open",
    ):
        engine.add(sample)

    query = "I enjoy coding"
    best_match = engine.search(query)

    print(f"Input Prompt: {query}")
    print(f"Most Similar Prompt: {best_match}")