import re
import os
import pickle

from tqdm import tqdm
from sentence_transformers import util


def generate_embedding(model, sentence):
    """Encode *sentence* with the given sentence-transformer model.

    Returns the embedding as a torch tensor (``convert_to_tensor=True``).
    """
    return model.encode(sentence, convert_to_tensor=True)


def clean_word(input_word):
    """Normalize a dictionary word for matching.

    Drops any parenthesized text, lowercases, strips the trailing
    ``", raw"`` / ``" raw"`` / ``", nfs"`` / ``" nfs"`` markers, and
    collapses runs of whitespace into a single space.
    """
    cleaned_word = re.sub(r'\(.*?\)', '', input_word)
    cleaned_word = (
        cleaned_word.strip().lower()
        .replace(", raw", "").replace(" raw", "")
        .replace(", nfs", "").replace(" nfs", "")
    )
    # Removals above can leave double spaces behind; collapse them.
    cleaned_word = re.sub(r'\s+', ' ', cleaned_word)
    return cleaned_word


def calculate_confidence_and_similar_words_str(similarities, highest_score,
                                               tolerance=0.05, top_k=5):
    """Derive a confidence flag and a display string of near-best matches.

    Parameters
    ----------
    similarities : iterable of (word, dict_word, score) triples.
    highest_score : the best similarity score observed.
    tolerance : scores within ``tolerance`` of ``highest_score`` count as
        near-best (default 0.05, the original hard-coded band).
    top_k : how many near-best dictionary words to include in the string
        (default 5, the original hard-coded cutoff).

    Returns
    -------
    (confidence_score, similar_words_str) where ``confidence_score`` is 1
    when at most one candidate sits in the tolerance band (unambiguous
    match) and 0 otherwise, and ``similar_words_str`` joins the top-k
    near-best dictionary words with ``' | '``.
    """
    high_similarities = sorted(
        [(word, dict_word, score)
         for word, dict_word, score in similarities
         if abs(score - highest_score) <= tolerance],
        key=lambda x: x[2],
        reverse=True,
    )
    # Confident only when the best match has no close competitors.
    confidence_score = 1 if len(high_similarities) <= 1 else 0
    # Select the top-k highest-scoring near-best items for display.
    similar_words = [dict_word for _, dict_word, _ in high_similarities[:top_k]]
    similar_words_str = ' | '.join(similar_words)
    return confidence_score, similar_words_str


def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embeddings as a float."""
    return util.pytorch_cos_sim(embedding1, embedding2).item()


def generate_embedded_dictionary(dictionary, model, preprocessor):
    """Embed every dictionary word, keyed by its preprocessed form.

    Parameters
    ----------
    dictionary : iterable of raw dictionary words.
    model : sentence-transformer model passed to ``generate_embedding``.
    preprocessor : callable mapping a raw word to its matchable form
        (e.g. ``clean_word``).

    Returns
    -------
    dict mapping each preprocessed word to ``{'v': embedding tensor,
    'd': original dictionary word}``.

    NOTE(review): if two raw words preprocess to the same key, the later
    one silently overwrites the earlier entry — confirm this is intended.
    """
    dictionary_embeddings = {}
    for dictionary_word in tqdm(dictionary, desc="Generating embeddings for dictionary words"):
        preprocessed_word = preprocessor(dictionary_word)
        dictionary_embeddings[preprocessed_word] = {  # matchable word
            'v': generate_embedding(model, preprocessed_word),  # value embedded
            'd': dictionary_word  # dictionary word
        }
    return dictionary_embeddings