# --- Hugging Face Hub page residue (not Python; commented out so the file parses) ---
# kitopang's picture
# Upload 2 files
# 7fe9c2f verified
# raw
# history blame contribute delete
# No virus
# 1.63 kB
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Load embeddings DataFrame
# Expects an 'embeddings.parquet' file whose rows carry (at least) an
# 'embedding' vector plus 'text' and 'answer' columns, as read by
# search_embeddings below.
df = pd.read_parquet('embeddings.parquet')
# NOTE(review): this truncates the corpus to its first 5 rows — looks like a
# debugging leftover; confirm intent, since searches can only ever return
# matches from these 5 entries.
df = df.head(5)
# Initialize tokenizer and model
# Downloads/loads pretrained BERT weights at import time (network/disk I/O
# on first run).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
def search_embeddings(query, top_k=5, frame=None, tok=None, mdl=None):
    """Return the top matches for *query* from the embeddings DataFrame.

    Encodes the query with BERT (mean-pooled last hidden state), ranks the
    DataFrame rows by cosine similarity against their precomputed
    'embedding' vectors, and concatenates the best matches into one string.

    Parameters
    ----------
    query : str
        Free-text query to embed and search with.
    top_k : int, optional
        Number of matches to include (default 5, the original hard-coded value).
    frame : pandas.DataFrame, optional
        Corpus with 'embedding', 'text' and 'answer' columns; defaults to
        the module-level ``df``.
    tok, mdl : optional
        Tokenizer/model overrides; default to the module-level ``tokenizer``
        and ``model``. These hooks exist so the function can be unit-tested
        without downloading BERT.

    Returns
    -------
    str
        Concatenated "Question: ... Answer: ... " segments, best match first.
    """
    # Fall back to the module-level globals when no explicit deps are given.
    frame = df if frame is None else frame
    tok = tokenizer if tok is None else tok
    mdl = model if mdl is None else mdl

    # BUG FIX: the original body referenced an undefined name `query_text`
    # instead of the `query` parameter, so every call raised NameError.
    inputs = tok(query, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Embed the query: mean-pool the last hidden state over the sequence dim.
    with torch.no_grad():
        outputs = mdl(**inputs)
    query_vector = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    # Assumes the 'embedding' column holds equal-length list/array vectors.
    embedding_matrix = np.stack(frame['embedding'].values)

    # NumPy equivalent of sklearn's cosine_similarity([v], M)[0]; keeps the
    # function runnable/testable without scikit-learn.
    similarities = _cosine_similarities(query_vector, embedding_matrix)

    # Row indices of the highest-similarity entries, best first.
    top_indices = np.argsort(similarities)[::-1][:top_k]

    results = ""
    for index in top_indices:
        data = frame.iloc[index]
        results += (f"Question: {data['text']} Answer: {data['answer']} ")
    return results


def _cosine_similarities(vec, mat):
    """Cosine similarity of 1-D `vec` against each row of 2-D `mat`.

    Matches sklearn.metrics.pairwise.cosine_similarity: zero-norm vectors
    are left un-normalized (norm treated as 1), yielding similarity 0.
    """
    v_norm = np.linalg.norm(vec)
    unit_vec = vec / (v_norm if v_norm else 1.0)
    row_norms = np.linalg.norm(mat, axis=1)
    row_norms[row_norms == 0] = 1.0
    return (mat / row_norms[:, None]) @ unit_vec
# query_text = "Paul's First Epistle to the Corinthians"
# print(search_embeddings(query_text))