from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load embeddings DataFrame
df = pd.read_parquet('embeddings.parquet')
# Limit to the first 5 rows (presumably for quick testing; the top-5
# search below will then rank all of them)
df = df.head(5)

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


def search_embeddings(query_text):
    # Tokenize the query text
    inputs = tokenizer(query_text, return_tensors="pt", padding=True,
                       truncation=True, max_length=512)

    # Generate an embedding for the query by mean-pooling the last hidden state
    with torch.no_grad():
        outputs = model(**inputs)
    query_vector = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    # Assuming the 'embedding' column in df contains embeddings as lists or
    # arrays, stack them into a single matrix for the similarity calculation
    embedding_matrix = np.stack(df['embedding'].values)

    # Compute cosine similarity between the query and every stored embedding
    similarities = cosine_similarity([query_vector], embedding_matrix)

    # Get the top 5 most similar entries
    top_indices = np.argsort(similarities[0])[::-1][:5]
    top_scores = similarities[0][top_indices]

    results = ""
    # Collect the top matches with their scores
    for index, score in zip(top_indices, top_scores):
        # print(f"Index: {index}, Score: {score}, Data: {df.iloc[index]}")
        data = df.iloc[index]
        results += f"Question: {data['text']} Answer: {data['answer']} "

    return results


# query_text = "Paul's First Epistle to the Corinthians"
# print(search_embeddings(query_text))
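
# The script above assumes 'embeddings.parquet' holds 'text', 'answer', and
# 'embedding' columns, with each stored embedding produced by the same
# mean-pooling scheme used for the query (otherwise the cosine scores are
# not comparable). A minimal sketch of how such a file might be built, kept
# commented out like the usage example above; the 'qa_pairs' data and the
# 'embed_text' helper are illustrative assumptions, not part of the original:
#
# def embed_text(text):
#     # Same tokenization and mean-pooling as in search_embeddings
#     inputs = tokenizer(text, return_tensors="pt", padding=True,
#                        truncation=True, max_length=512)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
#
# qa_pairs = [
#     {"text": "Who wrote the First Epistle to the Corinthians?",
#      "answer": "The Apostle Paul"},
# ]
# corpus = pd.DataFrame(qa_pairs)
# corpus['embedding'] = corpus['text'].apply(embed_text)
# corpus.to_parquet('embeddings.parquet')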