Spaces:
Paused
Paused
import pickle | |
import os | |
from similarity_fast import SimilarityFast | |
import pandas as pd | |
from utils import generate_embedded_dictionary | |
# Function to load data from the pickle file
def load_pickle(file_path):
    """Load and return the object stored in the pickle file at *file_path*.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The object produced by ``pickle.load`` for that file.

    Raises:
        FileNotFoundError: If no file exists at *file_path*.
    """
    # Open directly rather than testing os.path.exists() first: a separate
    # existence check can race with the file being removed before open().
    try:
        pickle_file = open(file_path, 'rb')
    except FileNotFoundError as err:
        raise FileNotFoundError(f"No pickle file found at {file_path}") from err
    with pickle_file:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        return pickle.load(pickle_file)
# Function to save data to the pickle file
def save_pickle(data, file_path):
    """Pickle *data* and write it to *file_path*.

    The file is opened in binary write mode, so any existing content at
    *file_path* is replaced.

    Args:
        data: Object to serialize with ``pickle.dump``.
        file_path: Destination path of the pickle file.
    """
    with open(file_path, mode='wb') as pickle_file:
        pickle.dump(data, pickle_file)
# Function to update the data
def update_data(data, new_data):
    """Merge *new_data* into *data* in place and return *data*.

    Args:
        data: Mapping to update; it is mutated via its ``update`` method.
        new_data: Entries to add; on key collisions these values win.

    Returns:
        The same *data* object that was passed in, after the update.
    """
    # In-place merge: callers receive back the very object they supplied.
    data.update(new_data)
    return data
def preprocess_dictionary_word(self, text):
    """Normalize a dictionary description string.

    The text is stripped and lower-cased, the markers "raw" and "nfs"
    (with or without a leading comma) are removed, and, if any comma
    remains, the comma-separated parts are stripped, reversed and joined
    with single spaces.

    Args:
        self: Unused by this function.
        text: Description string to normalize.

    Returns:
        The normalized string.
    """
    cleaned = text.strip().lower()

    # Order matters: the comma-prefixed form of each marker is removed
    # before the bare space-prefixed form.
    for marker in (", raw", " raw", ", nfs", " nfs"):
        cleaned = cleaned.replace(marker, "")

    if ',' not in cleaned:
        return cleaned

    # "apples, fuji" -> "fuji apples": most specific qualifier first.
    pieces = cleaned.split(',')
    return ' '.join(piece.strip() for piece in reversed(pieces))
# Pickle files to update and the CSV holding the new dictionary entries.
pickle_file_paths = ['./embeddings/fast/sentence-transformers-all-mpnet-base-v2.pkl']
csv_file_path = './dictionary/additions.csv'

# None of the following depends on which pickle file is being processed, so
# it is done once up front instead of once per file.
algo_fast = SimilarityFast(None)
df_dictionary = pd.read_csv(csv_file_path)
dictionary = df_dictionary['description'].astype(str).tolist()
# NOTE(review): new_entries is reused for every pickle file; this assumes
# generate_embedded_dictionary returns a re-iterable mapping (e.g. a dict),
# not a one-shot iterator -- confirm against utils.
new_entries = generate_embedded_dictionary(dictionary, algo_fast.model, algo_fast.preprocess_dictionary_word)

for pickle_file_path in pickle_file_paths:
    # Load the existing data from the pickle file
    data = load_pickle(pickle_file_path)

    # Merge the new entries into the loaded data (in place).
    updated_data = update_data(data, new_entries)
    print("Updated Data")

    # Save the updated data back to the pickle file
    print("Saving data to pickle file...")
    save_pickle(updated_data, pickle_file_path)
    print(f"Data saved to {pickle_file_path}")