Spaces:
Paused
Paused
import pickle | |
import os | |
import pandas as pd | |
import psycopg2 | |
from psycopg2.extras import DictCursor | |
from utils import generate_embedding | |
from db.db_utils import get_connection | |
# Pickle files that hold the embedding dictionaries to be refreshed.
pickle_file_paths = [
    './embeddings/fast/sentence-transformers-all-mpnet-base-v2.pkl',
]
def load_pickle(file_path):
    """Load and return the object pickled at ``file_path``.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
    """
    # Guard clause: report a missing path with an explicit message.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No pickle file found at {file_path}")

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(file_path, 'rb') as handle:
        return pickle.Unpickler(handle).load()
def save_pickle(data, file_path):
    """Pickle ``data`` to ``file_path``, overwriting any existing file.

    Args:
        data: Object to serialize.
        file_path: Destination path; opened in binary write mode.
    """
    with open(file_path, 'wb') as handle:
        pickle.Pickler(handle).dump(data)
def update_data(data, new_data):
    """Merge ``new_data`` into ``data`` in place and return ``data``.

    Args:
        data: Mapping to be updated; it is mutated, not copied.
        new_data: Entries to add; existing keys are overwritten.

    Returns:
        The same ``data`` object, after the update.
    """
    merged = data
    merged.update(new_data)
    return merged
def run_mappings_to_embeddings(model):
    """Add embeddings for reviewed, unflagged mappings to every pickle file.

    Reads ``(input_word, dictionary_word)`` pairs from the ``mappings``
    table where ``reviewed = true and flagged = false``, builds an entry
    ``{'v': <embedding of input_word>, 'd': dictionary_word}`` for each
    input word, merges those entries into the data stored in each path of
    ``pickle_file_paths`` and writes the result back to the same path.

    Args:
        model: Passed unchanged to ``generate_embedding`` as its first
            argument.

    Returns:
        dict: The new entries, keyed by ``input_word``. Returned (possibly
        empty) even when ``pickle_file_paths`` is empty.

    Raises:
        FileNotFoundError: Propagated from ``load_pickle`` when one of the
            configured pickle files does not exist.
    """
    db_conn = get_connection()
    try:
        # The cursor is closed on leaving the ``with`` block.
        with db_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as db_cursor:
            # Select all mappings that have been reviewed and are not flagged.
            db_cursor.execute("SELECT input_word, dictionary_word FROM mappings WHERE reviewed = true and flagged = false")
            results = db_cursor.fetchall()
    finally:
        # Release the connection instead of leaking it.
        # NOTE(review): assumes get_connection() returns a connection owned
        # by this call, not a shared/pooled handle -- confirm in db_utils.
        db_conn.close()

    # The entries do not depend on the pickle path, so build them once
    # rather than re-embedding every word for each file. Column 0 is
    # input_word and column 1 is dictionary_word, per the SELECT above.
    new_entries = {
        row[0]: {
            'v': generate_embedding(model, row[0]),
            'd': row[1],
        }
        for row in results
    }

    for pickle_file_path in pickle_file_paths:
        data = load_pickle(pickle_file_path)
        updated_data = update_data(data, new_entries)
        print("Updated Data")

        # Save the updated data back to the pickle file
        print("Saving data to pickle file...")
        save_pickle(updated_data, pickle_file_path)
        print(f"Data saved to {pickle_file_path}")

    return new_entries