# brightly-ai / add_mappings_to_embeddings.py
# beweinreich's picture
# added dry matter content to viz tool
# c92bac3
# raw
# history blame
# No virus
# 1.7 kB
import pickle
import os
import pandas as pd
import psycopg2
from psycopg2.extras import DictCursor
from similarity_fast import SimilarityFast
from utils import generate_embedding
from db.db_utils import get_connection
def load_pickle(file_path):
    """Load and return the object stored in the pickle file at ``file_path``.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The object produced by ``pickle.load``.

    Raises:
        FileNotFoundError: If no file exists at ``file_path``.
    """
    # Open directly instead of checking os.path.exists() first: the file could
    # disappear between the check and the open.
    try:
        f = open(file_path, 'rb')
    except (FileNotFoundError, NotADirectoryError):
        raise FileNotFoundError(f"No pickle file found at {file_path}") from None
    # NOTE(review): unpickling can execute arbitrary code; only use this on
    # files the project itself produced.
    with f:
        return pickle.load(f)
def save_pickle(data, file_path):
    """Pickle ``data`` to ``file_path``, replacing any existing file.

    The data is first written to a sibling ``<file_path>.tmp`` file and then
    moved over ``file_path`` with ``os.replace``. If pickling fails, the
    previous contents of ``file_path`` are left untouched instead of being
    truncated.

    Args:
        data: Object to serialize with ``pickle.dump``.
        file_path: Destination path of the pickle file.

    Raises:
        Whatever ``open``, ``pickle.dump`` or ``os.replace`` raise; the
        temporary file is removed before the exception propagates.
    """
    tmp_path = f"{os.fspath(file_path)}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(data, f)
        os.replace(tmp_path, file_path)
    except BaseException:
        # Do not leave a partial temp file behind on failure.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
def update_data(data, new_data):
    """Merge ``new_data`` into ``data`` in place and return ``data``.

    Args:
        data: Mapping to update; it is mutated via its ``update`` method.
        new_data: Entries to add. Keys already present in ``data`` are
            overwritten.

    Returns:
        The same ``data`` object that was passed in, after the update.
    """
    data.update(new_data)
    return data
# Pickle files that receive the new mapping entries.
pickle_file_paths = [
    './embeddings/fast/sentence-transformers-all-mpnet-base-v2.pkl',
    './embeddings/slow/sentence-transformers-all-mpnet-base-v2.pkl',
]

db_conn = get_connection()
try:
    db_cursor = db_conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    try:
        # Select all mappings flagged reviewed = 1.
        # NOTE(review): the previous comment said "not been reviewed", which
        # contradicts the query; confirm that reviewed = 1 is the intended filter.
        db_cursor.execute("SELECT input_word, dictionary_word FROM mappings WHERE reviewed = 1")
        results = db_cursor.fetchall()
    finally:
        db_cursor.close()
finally:
    # No further queries are issued below, so release the connection before
    # the embedding work starts.
    db_conn.close()

# The model and the embeddings do not depend on which pickle file is being
# updated, so build the new entries once and reuse them for every file.
algo_fast = SimilarityFast(None)
new_entries = {
    input_word: {
        'v': generate_embedding(algo_fast.model, input_word),
        'd': dictionary_word,
    }
    for input_word, dictionary_word in results
}

for pickle_file_path in pickle_file_paths:
    data = load_pickle(pickle_file_path)
    updated_data = update_data(data, new_entries)
    print("Updated Data")

    # Save the updated data back to the pickle file
    print("Saving data to pickle file...")
    save_pickle(updated_data, pickle_file_path)
    print(f"Data saved to {pickle_file_path}")