"""Refresh cached embedding pickles with reviewed word mappings from the DB.

For each embedding pickle file, every reviewed (input_word -> dictionary_word)
mapping is re-embedded with the fast similarity model and merged into the
cached data, which is then written back to the same file.
"""

import pickle
import os

import pandas as pd
import psycopg2
from psycopg2.extras import DictCursor

from similarity_fast import SimilarityFast
from utils import generate_embedding
from db.db_utils import get_connection


def load_pickle(file_path):
    """Load and return the object stored in *file_path*.

    Raises:
        FileNotFoundError: if no file exists at *file_path*.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No pickle file found at {file_path}")
    with open(file_path, 'rb') as f:
        return pickle.load(f)


def save_pickle(data, file_path):
    """Serialize *data* to *file_path*, overwriting any existing file."""
    with open(file_path, 'wb') as f:
        pickle.dump(data, f)


def update_data(data, new_data):
    """Merge *new_data* into *data* in place and return *data*."""
    data.update(new_data)
    return data


pickle_file_paths = [
    './embeddings/fast/sentence-transformers-all-mpnet-base-v2.pkl',
    './embeddings/slow/sentence-transformers-all-mpnet-base-v2.pkl',
]

# Fetch the mappings, then release the DB resources before the (slow)
# embedding work begins.  NOTE: the query selects mappings that HAVE been
# reviewed (reviewed = 1) — only vetted mappings go into the caches; the
# previous comment claimed the opposite.
db_conn = get_connection()
try:
    with db_conn.cursor(cursor_factory=DictCursor) as db_cursor:
        db_cursor.execute(
            "SELECT input_word, dictionary_word FROM mappings WHERE reviewed = 1"
        )
        results = db_cursor.fetchall()
finally:
    db_conn.close()

# Load the embedding model once; it is expensive and identical for every file.
algo_fast = SimilarityFast(None)

for pickle_file_path in pickle_file_paths:
    data = load_pickle(pickle_file_path)

    # Re-embed each reviewed input word and pair it with its dictionary word.
    new_entries = {
        input_word: {
            'v': generate_embedding(algo_fast.model, input_word),
            'd': dictionary_word,
        }
        for input_word, dictionary_word in results
    }

    updated_data = update_data(data, new_entries)
    print("Updated Data")

    # Save the updated data back to the pickle file.
    print("Saving data to pickle file...")
    save_pickle(updated_data, pickle_file_path)
    print(f"Data saved to {pickle_file_path}")