# brightly-ai / update_pickle.py
# beweinreich's picture
# renamed usda category
# 54daf6f
# raw
# history blame
# No virus
# 1.8 kB
import pickle
import os
from similarity_fast import SimilarityFast
import pandas as pd
from utils import generate_embedded_dictionary
# Function to load data from the pickle file
def load_pickle(file_path):
    """Read and return the object stored in the pickle file at ``file_path``.

    Args:
        file_path: Location of the pickle file to read.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.

    Note:
        Unpickling can execute arbitrary code, so this must only be pointed
        at files from a trusted source.
    """
    # Guard first so a missing file produces the descriptive message below.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No pickle file found at {file_path}")

    with open(file_path, 'rb') as handle:
        return pickle.load(handle)
# Function to save data to the pickle file
def save_pickle(data, file_path):
    """Serialize ``data`` with pickle and write it to ``file_path``.

    The file is opened in ``'wb'`` mode, so any existing content at
    ``file_path`` is replaced.

    Args:
        data: Object to serialize.
        file_path: Destination path of the pickle file.
    """
    with open(file_path, 'wb') as sink:
        pickle.dump(data, sink)
# Function to update the data
def update_data(data, new_data):
    """Merge ``new_data`` into ``data`` in place and return ``data``.

    Args:
        data: Mapping to be updated; it is mutated via its ``update`` method.
        new_data: Entries to merge in; on key collisions these values win.

    Returns:
        The very same ``data`` object that was passed in (not a copy).
    """
    merged = data  # alias only: the caller's object is what gets mutated
    merged.update(new_data)
    return merged
def preprocess_dictionary_word(self, text):
    """Normalize a dictionary description string.

    Steps, in order:
      1. Trim surrounding whitespace and lowercase the text.
      2. Remove every occurrence of ", raw", " raw", ", nfs" and " nfs".
      3. If commas remain, split on them, trim each piece, and join the
         pieces in reverse order separated by single spaces.

    Args:
        self: Not read by the body; kept so the signature stays unchanged.
        text: The description string to normalize.

    Returns:
        The normalized string.
    """
    cleaned = text.strip().lower()

    # Order matters: the comma-prefixed form is removed before the bare one.
    for marker in (", raw", " raw", ", nfs", " nfs"):
        cleaned = cleaned.replace(marker, "")

    if ',' not in cleaned:
        return cleaned

    # "a, b, c" -> "c b a"
    return ' '.join(piece.strip() for piece in reversed(cleaned.split(',')))
# Pickle files whose stored data is loaded, extended, and written back in place.
pickle_file_paths = ['./embeddings/fast/sentence-transformers-all-mpnet-base-v2.pkl']

for pickle_file_path in pickle_file_paths:
    # Existing contents; load_pickle raises FileNotFoundError if the file is absent.
    data = load_pickle(pickle_file_path)

    algo_fast = SimilarityFast(None)

    # Every value in the CSV's 'description' column, coerced to str, is a new word.
    csv_file_path = './dictionary/additions.csv'
    df_dictionary = pd.read_csv(csv_file_path)
    dictionary = df_dictionary['description'].astype(str).tolist()

    # Build entries for the new words using algo_fast's model and preprocessing.
    new_entries = generate_embedded_dictionary(
        dictionary,
        algo_fast.model,
        algo_fast.preprocess_dictionary_word,
    )

    # Merge into the loaded data (update_data mutates and returns `data`).
    updated_data = update_data(data, new_entries)
    print("Updated Data")

    # Write the merged result back over the file it was loaded from.
    print("Saving data to pickle file...")
    save_pickle(updated_data, pickle_file_path)
    print(f"Data saved to {pickle_file_path}")