import os
import pickle

import pandas as pd

from similarity_fast import SimilarityFast
from utils import generate_embedded_dictionary


# Load data from a pickle file, raising if the file does not exist
def load_pickle(file_path):
    if os.path.exists(file_path):
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
        return data
    else:
        raise FileNotFoundError(f"No pickle file found at {file_path}")


# Save data to a pickle file
def save_pickle(data, file_path):
    with open(file_path, 'wb') as f:
        pickle.dump(data, f)


# Merge new entries into the existing data (existing keys are overwritten)
def update_data(data, new_data):
    data.update(new_data)
    return data
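
# Illustrative example (hypothetical values, not part of the original
# script): update_data merges in place and overwrites duplicate keys, so
# re-running the script refreshes any previously embedded descriptions:
# update_data({"apple": [0.1]}, {"apple": [0.2]}) -> {"apple": [0.2]}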


# Normalize a dictionary description before embedding: drop "raw"/"nfs"
# qualifiers and reverse comma-separated parts so "beans, black" becomes
# "black beans" (appears to mirror SimilarityFast.preprocess_dictionary_word)
def preprocess_dictionary_word(text):
    text = text.strip().lower()
    text = text.replace(", raw", "").replace(" raw", "")
    text = text.replace(", nfs", "").replace(" nfs", "")
    if ',' in text:
        parts = [part.strip() for part in text.split(',')]
        text = ' '.join(reversed(parts))
    return text
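
# Illustrative sanity checks (an addition, not part of the original script)
# showing the normalization in action:
assert preprocess_dictionary_word("Tomatoes, raw") == "tomatoes"
assert preprocess_dictionary_word("beans, black, nfs") == "black beans"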

# Update both the fast and slow embedding pickles with the new entries
pickle_file_paths = [
    './embeddings/fast/sentence-transformers-all-mpnet-base-v2.pkl',
    './embeddings/slow/sentence-transformers-all-mpnet-base-v2.pkl',
]

for pickle_file_path in pickle_file_paths:
    # Load the existing embeddings from the pickle file
    data = load_pickle(pickle_file_path)

    algo_fast = SimilarityFast(None)

    # Read the descriptions to embed from the additions CSV
    csv_file_path = './dictionary/additions.csv'
    df_dictionary = pd.read_csv(csv_file_path)
    dictionary = df_dictionary['description'].astype(str).tolist()

    # Embed the new descriptions and merge them into the existing data
    new_entries = generate_embedded_dictionary(dictionary, algo_fast.model, algo_fast.preprocess_dictionary_word)
    updated_data = update_data(data, new_entries)
    print("Updated data")

    # Save the updated data back to the pickle file
    print("Saving data to pickle file...")
    save_pickle(updated_data, pickle_file_path)
    print(f"Data saved to {pickle_file_path}")