Spaces:

madebybread
/

brightly-ai

Paused

App Files Files Community

brightly-ai / update_pickle.py

beweinreich

renamed usda category

54daf6f 20 days ago

raw

history blame

No virus

1.8 kB

	import pickle
	import os
	from similarity_fast import SimilarityFast
	import pandas as pd
	from utils import generate_embedded_dictionary


	# Function to load data from the pickle file
	def load_pickle(file_path):
	    """Load and return the object stored in the pickle file at *file_path*.

	    Args:
	        file_path: Path of the pickle file to read.

	    Returns:
	        Whatever object was pickled into the file.

	    Raises:
	        FileNotFoundError: If no file exists at *file_path*.
	    """
	    # Open first and handle the failure instead of checking os.path.exists()
	    # beforehand: the file could disappear between the check and the open.
	    try:
	        f = open(file_path, 'rb')
	    except FileNotFoundError:
	        raise FileNotFoundError(f"No pickle file found at {file_path}") from None
	    with f:
	        # NOTE: unpickling can execute arbitrary code; only load files that
	        # come from a trusted source.
	        return pickle.load(f)

	# Function to save data to the pickle file
	def save_pickle(data, file_path):
	    """Pickle *data* to *file_path* without risking the existing file.

	    The data is first written to a sibling ``<file_path>.tmp`` file and then
	    moved over the destination with ``os.replace``. If pickling fails part
	    way through, the previous contents of *file_path* are left untouched
	    (opening the destination directly with ``'wb'`` would truncate it first).

	    Args:
	        data: Any picklable object.
	        file_path: Destination path (``str``, ``bytes`` or ``os.PathLike``).

	    Raises:
	        OSError: If the temporary file cannot be created or moved into place.
	        Exception: Whatever ``pickle.dump`` raises for unpicklable data.
	    """
	    target = os.fspath(file_path)
	    tmp_path = target + (b".tmp" if isinstance(target, bytes) else ".tmp")
	    try:
	        with open(tmp_path, 'wb') as f:
	            pickle.dump(data, f)
	        # Swap the finished file into place in a single step.
	        os.replace(tmp_path, target)
	    except BaseException:
	        # Do not leave a half-written temporary file behind.
	        try:
	            os.remove(tmp_path)
	        except OSError:
	            pass
	        raise

	# Function to update the data
	def update_data(data, new_data):
	    """Merge *new_data* into *data* in place and return *data*.

	    Entries in *new_data* overwrite entries in *data* that share a key. The
	    object returned is the same object that was passed in as *data*.
	    """
	    data.update(new_data)
	    return data

	def preprocess_dictionary_word(self, text):
	    """Normalize a dictionary description string.

	    The text is stripped and lower-cased, every occurrence of ", raw",
	    " raw", ", nfs" and " nfs" is removed, and, if commas remain, the
	    comma-separated parts are trimmed and joined with spaces in reverse
	    order (e.g. "Beef, ground, raw" becomes "ground beef").

	    Args:
	        self: Not used by this function.
	        text: The description to normalize.

	    Returns:
	        The normalized string.
	    """
	    cleaned = text.strip().lower()
	    # Removal order matters: the comma-prefixed form goes before the bare one.
	    for token in (", raw", " raw", ", nfs", " nfs"):
	        cleaned = cleaned.replace(token, "")
	    if ',' not in cleaned:
	        return cleaned
	    pieces = cleaned.split(',')
	    return ' '.join(piece.strip() for piece in reversed(pieces))

	# Load the existing data from the pickle file
	pickle_file_paths = ['./embeddings/fast/sentence-transformers-all-mpnet-base-v2.pkl']

	# For every embeddings pickle: load it, embed the descriptions listed in
	# ./dictionary/additions.csv, merge the new entries in, and write it back.
	# NOTE(review): the nesting of this loop is assumed to cover the whole
	# update (load, embed, merge, save) for each path; confirm against the
	# original file.
	for pickle_file_path in pickle_file_paths:
	    data = load_pickle(pickle_file_path)

	    algo_fast = SimilarityFast(None)

	    csv_file_path = './dictionary/additions.csv'
	    df_dictionary = pd.read_csv(csv_file_path)
	    # Drop missing descriptions before converting to str; otherwise empty
	    # cells become the literal string "nan" and would be embedded as if
	    # "nan" were a real dictionary entry.
	    dictionary = df_dictionary['description'].dropna().astype(str).tolist()
	    # presumably returns a mapping compatible with dict.update(); verify
	    # against utils.generate_embedded_dictionary
	    new_entries = generate_embedded_dictionary(dictionary, algo_fast.model, algo_fast.preprocess_dictionary_word)

	    updated_data = update_data(data, new_entries)
	    print("Updated Data")

	    # Save the updated data back to the pickle file
	    print("Saving data to pickle file...")
	    save_pickle(updated_data, pickle_file_path)
	    print(f"Data saved to {pickle_file_path}")