"""compareVec2VecWithAda.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1jPaNXdO0_oW6VczlWfm5RPUVpMtVQD9c
"""

import pandas as pd
import numpy as np
import openai
import tensorflow as tf  # required by cosine_similarity_loss below
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import load_model
from transformers import AutoTokenizer, AutoModel

# Load the MPNet sentence encoder. Note: on the Hugging Face Hub this
# checkpoint lives under the sentence-transformers namespace; the bare
# name 'all-mpnet-base-v2' does not resolve.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')


def cosine_similarity_loss(y_true, y_pred):
    # Negative cosine similarity between L2-normalized vectors (reduce_mean
    # over the last axis only rescales it by 1/dim). Required as a custom
    # object so Keras can deserialize the saved model below.
    y_true = tf.nn.l2_normalize(y_true, axis=-1)
    y_pred = tf.nn.l2_normalize(y_pred, axis=-1)
    return -tf.reduce_mean(y_true * y_pred, axis=-1)


def mean_pooling(model_output, attention_mask):
    # Average the token embeddings, ignoring padding positions via the attention mask.
    token_embeddings = model_output[0]  # last hidden state
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Trained vec2vec translation model: maps MPNet embeddings into the
# ada-002 embedding space.
loaded_model = load_model('mpnet2adaE75V4.h5', custom_objects={'cosine_similarity_loss': cosine_similarity_loss})

openai.api_key = "insert API key here"
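
# A safer alternative to hard-coding the key (assumes it is exported as
# the OPENAI_API_KEY environment variable):
# import os
# openai.api_key = os.environ["OPENAI_API_KEY"]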


# Corpus with precomputed ada-002 embeddings stored as stringified lists.
df2 = pd.read_csv('Actual_Embeddings.csv')

# Parse each stringified list back into a NumPy array.
df2['Actual_Embeddings'] = df2['Actual_Embeddings'].apply(eval).apply(np.array)
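
# Note: eval() executes arbitrary code from the CSV. If the file is not
# fully trusted, ast.literal_eval is a safer drop-in:
# import ast
# df2['Actual_Embeddings'] = df2['Actual_Embeddings'].apply(ast.literal_eval).apply(np.array)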


def get_top_5_texts(query):
    # Embed the query with MPNet: tokenize, mean-pool, L2-normalize.
    encoded_input = tokenizer(query, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        model_output = model(**encoded_input)

    mpnetEmbeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    mpnetEmbeddings = F.normalize(mpnetEmbeddings, p=2, dim=1)
    mpnetEmbeddings = mpnetEmbeddings.detach().cpu().numpy()
    mpnetEmbeddings = np.reshape(mpnetEmbeddings, (1, -1))

    # Translate the MPNet embedding into the ada-002 embedding space.
    query_embedding = loaded_model.predict(mpnetEmbeddings)

    similarities = [cosine_similarity(query_embedding.reshape(1, -1), emb.reshape(1, -1))[0][0] for emb in df2['Actual_Embeddings']]
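
    # Equivalent vectorized form (a sketch, assuming all stored embeddings
    # share one dimensionality), which avoids the per-row Python loop:
    # emb_matrix = np.stack(df2['Actual_Embeddings'].to_numpy())
    # similarities = cosine_similarity(query_embedding.reshape(1, -1), emb_matrix)[0]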

    print("Converted MPNet Embedding Results:")
    top_5_idx = np.argsort(similarities)[-5:][::-1]
    for i, idx in enumerate(top_5_idx, 1):
        print(f'Text {i}')
        print(df2['combined'].iloc[idx])
        print("\n")

    # Embed the same query directly with ada-002 for comparison
    # (pre-1.0 openai SDK interface).
    response = openai.Embedding.create(input=query, model="text-embedding-ada-002")
|
query_embedding = np.array(response['data'][0]['embedding']) |
|
similarities2 = [cosine_similarity(query_embedding.reshape(1, -1), emb.reshape(1, -1))[0][0] for emb in df2['Actual_Embeddings']] |
|
|
|
print("OpenAI Embedding Results:") |
|
top_5_idx2 = np.argsort(similarities2)[-5:][::-1] |
|
for i, idx in enumerate(top_5_idx2, 1): |
|
print(f'Text {i}') |
|
print(df2['combined'].iloc[idx]) |
|
print("\n") |
|
|


while True:
    query = input("Enter your query: ")
    get_top_5_texts(query)
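
# A minimal sketch of a friendlier loop (assumption: a blank line should end
# the session; the original loops forever):
# while True:
#     query = input("Enter your query (blank to quit): ")
#     if not query.strip():
#         break
#     get_top_5_texts(query)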