import os
import pickle

import pandas as pd

from similarity_fast import SimilarityFast
from utils import generate_embedded_dictionary


# Load data from a pickle file, raising if the file does not exist
def load_pickle(file_path):
    if os.path.exists(file_path):
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
        return data
    else:
        raise FileNotFoundError(f"No pickle file found at {file_path}")


# Save data to a pickle file
def save_pickle(data, file_path):
    with open(file_path, 'wb') as f:
        pickle.dump(data, f)


# Merge new entries into the existing data (existing keys are overwritten)
def update_data(data, new_data):
    data.update(new_data)
    return data
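
# Illustrative example (hypothetical values, not part of the original
# script): update_data merges in place and overwrites duplicate keys, so
# re-running the script refreshes any previously embedded descriptions:
# update_data({"apple": [0.1]}, {"apple": [0.2]}) -> {"apple": [0.2]}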


# Normalize a dictionary description before embedding: drop "raw"/"nfs"
# qualifiers and reverse comma-separated parts so "beans, black" becomes
# "black beans" (appears to mirror SimilarityFast.preprocess_dictionary_word)
def preprocess_dictionary_word(text):
    text = text.strip().lower()
    text = text.replace(", raw", "").replace(" raw", "")
    text = text.replace(", nfs", "").replace(" nfs", "")
    if ',' in text:
        parts = [part.strip() for part in text.split(',')]
        text = ' '.join(reversed(parts))
    return text
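
# Illustrative sanity checks (an addition, not part of the original script)
# showing the normalization in action:
assert preprocess_dictionary_word("Tomatoes, raw") == "tomatoes"
assert preprocess_dictionary_word("beans, black, nfs") == "black beans"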

# Update both the fast and slow embedding pickles with the new entries
pickle_file_paths = [
    './embeddings/fast/sentence-transformers-all-mpnet-base-v2.pkl',
    './embeddings/slow/sentence-transformers-all-mpnet-base-v2.pkl',
]

for pickle_file_path in pickle_file_paths:
    # Load the existing embeddings from the pickle file
    data = load_pickle(pickle_file_path)

    algo_fast = SimilarityFast(None)

    # Read the descriptions to embed from the additions CSV
    csv_file_path = './dictionary/additions.csv'
    df_dictionary = pd.read_csv(csv_file_path)
    dictionary = df_dictionary['description'].astype(str).tolist()

    # Embed the new descriptions and merge them into the existing data
    new_entries = generate_embedded_dictionary(dictionary, algo_fast.model, algo_fast.preprocess_dictionary_word)
    updated_data = update_data(data, new_entries)
    print("Updated data")

    # Save the updated data back to the pickle file
    print("Saving data to pickle file...")
    save_pickle(updated_data, pickle_file_path)
    print(f"Data saved to {pickle_file_path}")