from sentence_transformers import SentenceTransformer, util
import pandas as pd
from tqdm import tqdm
import os
import pickle

# Load pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_st_embedding(sentence):
    return model.encode(sentence, convert_to_tensor=True)

def cosine_similarity_st(embedding1, embedding2):
    return util.pytorch_cos_sim(embedding1, embedding2).item()

# Load the dictionary
csv_file_path = './dictionary/dictionary.csv'
df_dictionary = pd.read_csv(csv_file_path)
dictionary = df_dictionary['description'].tolist()

# Load the input words
input_file_path = 'raw/test.csv'
df_input = pd.read_csv(input_file_path)
input_words = df_input['description'].tolist()

print("Everything loaded...")

# Check if the embeddings pickle file exists
pickle_file_path = './sbert_dictionary_embeddings.pkl'
if os.path.exists(pickle_file_path):
    with open(pickle_file_path, 'rb') as f:
        dictionary_embeddings = pickle.load(f)
else:
    # Generate embeddings for dictionary words
    dictionary_embeddings = {}
    for desc in tqdm(dictionary, desc="Generating embeddings for dictionary words"):
        dictionary_embeddings[desc] = generate_st_embedding(desc)
    # Save the embeddings to a pickle file
    with open(pickle_file_path, 'wb') as f:
        pickle.dump(dictionary_embeddings, f)

# Find the most similar word in the dictionary for each input word
results = []
for input_word in tqdm(input_words, desc="Processing input words"):
    input_embedding = generate_st_embedding(input_word)
    similarities = [(desc, cosine_similarity_st(input_embedding, dict_embedding))
                    for desc, dict_embedding in dictionary_embeddings.items()]
    most_similar_word, highest_score = max(similarities, key=lambda x: x[1])
    results.append((input_word, most_similar_word, highest_score))

# Print the results
for input_word, most_similar_word, score in results:
    print(f"Input word: {input_word}")
    print(f"Most similar word: {most_similar_word}")
    print(f"Similarity score: {score}\n")
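
# Optional optimization (a sketch, not part of the original flow, assuming the
# same `model`, `dictionary`, and `input_words` defined above): instead of
# scoring each input word against every dictionary entry one pair at a time in
# Python, both sides can be batch-encoded and compared in a single call to
# util.semantic_search. The function below is provided for reference only and
# is not called by the script.
def batched_lookup():
    # Encode both corpora in batches on the model's device
    dict_embeddings = model.encode(dictionary, convert_to_tensor=True, show_progress_bar=True)
    input_embeddings = model.encode(input_words, convert_to_tensor=True, show_progress_bar=True)
    # top_k=1 keeps only the single best dictionary match per input word
    hits = util.semantic_search(input_embeddings, dict_embeddings, top_k=1)
    return [(word, dictionary[hit[0]['corpus_id']], hit[0]['score'])
            for word, hit in zip(input_words, hits)]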