import re
import os
import pickle

import pandas as pd
from tqdm import tqdm
from sentence_transformers import util


def generate_embedding(model, sentence):
    """Encode *sentence* with the given sentence-transformer model, returning a tensor."""
    return model.encode(sentence, convert_to_tensor=True)


def clean_word(input_word):
    """Normalize a food-dictionary word for matching.

    Strips parenthesized qualifiers, lowercases, drops raw/NFS markers,
    and collapses runs of whitespace to single spaces.
    """
    # Drop any "(...)" qualifier, then canonicalize case/edges.
    word = re.sub(r'\(.*?\)', '', input_word)
    word = word.strip().lower()
    # Marker order matters: the comma-prefixed forms must be removed first,
    # otherwise ", raw" would leave a dangling comma behind.
    for marker in (", raw", " raw", ", nfs", " nfs"):
        word = word.replace(marker, "")
    # Collapse multiple spaces left behind by the removals above.
    word = re.sub(r'\s+', ' ', word)
    return word.strip()


def is_empty_word(input_word):
    """Return True when *input_word* carries no usable text (non-str, NaN, '', 'nan', 'none')."""
    if not isinstance(input_word, str):
        return True
    if pd.isna(input_word):
        return True
    lowered = input_word.lower()
    return input_word == "" or lowered == "nan" or lowered == "none"


def calculate_confidence_and_similar_words_str(similarities, highest_score):
    """Derive a confidence flag and a near-miss summary from similarity tuples.

    *similarities* is an iterable of (word, dict_word, score) triples. Entries
    within 0.1 of *highest_score* count as close contenders. Confidence is 1
    only when the top match has no such contender; otherwise 0. Returns
    (confidence, ' | '-joined string of up to 5 runner-up dictionary words).
    """
    close_matches = sorted(
        (entry for entry in similarities if abs(entry[2] - highest_score) <= 0.1),
        key=lambda entry: entry[2],
        reverse=True,
    )
    confidence_score = 1 if len(close_matches) <= 1 else 0
    # Index 0 is the winning match itself; keep at most the next five.
    runners_up = close_matches[1:6]
    similar_words_str = ' | '.join(entry[1] for entry in runners_up)
    return confidence_score, similar_words_str


def cosine_similarity(embedding1, embedding2):
    """Cosine similarity between two embeddings as a Python float."""
    return util.pytorch_cos_sim(embedding1, embedding2).item()


def generate_embedded_dictionary(dictionary, model, preprocessor):
    """Embed every dictionary word, keyed by its preprocessed (matchable) form.

    Each value maps 'v' to the embedding of the preprocessed word and 'd' to
    the original dictionary word, so matches can be reported verbatim.
    """
    embedded = {}
    for original_word in tqdm(dictionary, desc="Embedding words"):
        matchable = preprocessor(original_word)
        embedded[matchable] = {
            'v': generate_embedding(model, matchable),  # value embedded
            'd': original_word,                         # dictionary word
        }
    return embedded