brightly-ai / add_mappings_to_embeddings.py
beweinreich's picture
include mappings in dictionary embeddings
a1c159e
raw
history blame
No virus
1.73 kB
import pickle
import os
import pandas as pd
import psycopg2
from psycopg2.extras import DictCursor
from utils import generate_embedding
from db.db_utils import get_connection
# Pickle files that run_mappings_to_embeddings() loads, updates with the
# mapping entries, and writes back to the same path.
pickle_file_paths = ['./embeddings/fast/sentence-transformers-all-mpnet-base-v2.pkl']
def load_pickle(file_path):
    """Return the object unpickled from ``file_path``.

    Raises:
        FileNotFoundError: if nothing exists at ``file_path``.
    """
    # Guard clause: fail early with an explicit message when the file is absent.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No pickle file found at {file_path}")

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)
def save_pickle(data, file_path):
    """Pickle ``data`` to ``file_path`` without risking the existing file.

    The data is first written to a sibling ``<file_path>.tmp`` file and then
    moved over ``file_path`` with ``os.replace``. Opening ``file_path`` directly
    in ``'wb'`` mode would truncate it immediately, so a failure during
    ``pickle.dump`` (e.g. an unpicklable object) would destroy the previous
    contents.

    Args:
        data: Any picklable object.
        file_path: Destination path (``str`` or path-like).

    Raises:
        Whatever ``open``, ``pickle.dump`` or ``os.replace`` raise; in that
        case ``file_path`` is left as it was before the call.
    """
    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(data, f)
        # Swap the fully written file into place in a single step.
        os.replace(tmp_path, file_path)
    finally:
        # After a successful replace the temp path no longer exists; this only
        # removes a partial temp file left behind by a failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def update_data(data, new_data):
    """Merge ``new_data`` into ``data`` in place and return ``data``.

    No copy is made: the caller's ``data`` object is mutated through its
    ``update`` method and the same object is returned.
    Presumably both arguments are dicts (the only visible caller passes a
    dict for ``new_data``) -- confirm for other callers.
    """
    data.update(new_data)
    return data
def run_mappings_to_embeddings(model):
    """Add reviewed, unflagged mappings to every embeddings pickle file.

    Reads ``(input_word, dictionary_word)`` rows from the ``mappings`` table,
    builds one embedding entry per input word, merges those entries into each
    file listed in ``pickle_file_paths`` and saves the file back.

    Args:
        model: Passed unchanged to ``generate_embedding`` as its first argument.

    Returns:
        dict: ``{input_word: {'v': <embedding>, 'd': dictionary_word}}`` for
        the fetched mappings (empty when no rows match).
    """
    db_conn = get_connection()
    try:
        db_cursor = db_conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        try:
            # Select all mappings that have been reviewed and are not flagged.
            db_cursor.execute("SELECT input_word, dictionary_word FROM mappings WHERE reviewed = true and flagged = false")
            results = db_cursor.fetchall()
        finally:
            db_cursor.close()
    finally:
        # The rows are fully fetched, so release the connection before the
        # (potentially slow) embedding generation starts.
        # NOTE(review): assumes get_connection() hands out a connection owned
        # by the caller rather than a shared one -- confirm.
        db_conn.close()

    # The entries depend only on the rows and the model, not on the pickle
    # file, so build them once instead of once per file. This also keeps
    # ``new_entries`` defined when ``pickle_file_paths`` is empty.
    new_entries = {
        input_word: {
            'v': generate_embedding(model, input_word),
            'd': dictionary_word,
        }
        for input_word, dictionary_word in results
    }

    for pickle_file_path in pickle_file_paths:
        data = load_pickle(pickle_file_path)
        updated_data = update_data(data, new_entries)
        print("Updated Data")

        # Save the updated data back to the pickle file
        print("Saving data to pickle file...")
        save_pickle(updated_data, pickle_file_path)
        print(f"Data saved to {pickle_file_path}")

    return new_entries