Spaces:
Paused
Paused
import re | |
import os | |
import pickle | |
import pandas as pd | |
from tqdm import tqdm | |
from sentence_transformers import util | |
def generate_embedding(model, sentence):
    """Encode ``sentence`` with ``model`` and return the result as a tensor.

    Args:
        model: Any object exposing ``encode(sentence, convert_to_tensor=...)``
            (presumably a ``SentenceTransformer`` -- confirm against caller).
        sentence: The text to embed.

    Returns:
        Whatever ``model.encode`` returns when asked for a tensor.
    """
    embedding = model.encode(sentence, convert_to_tensor=True)
    return embedding
def clean_word(input_word):
    """Normalize a word or phrase into the form used for matching.

    The steps are applied in this order:

    1. remove every parenthesized fragment (non-greedy, e.g. ``"(baby)"``);
    2. trim surrounding whitespace and lowercase;
    3. remove the qualifiers ``raw`` and ``nfs`` when they appear as whole
       words preceded by a space (and an optional comma), e.g. ``", raw"``;
    4. collapse every whitespace run into a single space;
    5. remove periods and trim again.

    Args:
        input_word: The string to normalize.

    Returns:
        The normalized string.
    """
    without_parens = re.sub(r'\(.*?\)', '', input_word)
    lowered = without_parens.strip().lower()
    # The trailing \b stops the qualifier removal from eating the start of a
    # longer word: "dog rawhide" must not become "doghide".
    without_qualifiers = re.sub(r',? (?:raw|nfs)\b', '', lowered)
    # Collapse runs of whitespace left behind by the removals above.
    single_spaced = re.sub(r'\s+', ' ', without_qualifiers)
    # Drop periods, then trim any space exposed at either end.
    return single_spaced.replace(".", "").strip()
def is_empty_word(input_word):
    """Tell whether ``input_word`` carries no usable text.

    A value counts as empty when it is not a ``str``, when pandas reports it
    as missing, when it is the empty string, or when it spells ``"nan"`` or
    ``"none"`` in any letter case.

    Args:
        input_word: The value to inspect (any type).

    Returns:
        ``True`` if the value is considered empty, ``False`` otherwise.
    """
    # Anything that is not a string (None, float NaN, numbers, ...) is empty.
    if not isinstance(input_word, str):
        return True
    if pd.isna(input_word):
        return True
    if input_word == "":
        return True
    # Textual placeholders for missing values, compared case-insensitively.
    return input_word.lower() in ("nan", "none")
def calculate_confidence_and_similar_words_str(similarities, highest_score, *, threshold=0.1, max_similar_words=5):
    """Derive a confidence flag and a string of close runner-up words.

    Args:
        similarities: Iterable of ``(word, dict_word, score)`` triples.
        highest_score: The score of the best match; entries are compared
            against it.
        threshold: Maximum absolute distance from ``highest_score`` for an
            entry to count as a close match. Defaults to ``0.1``.
        max_similar_words: Maximum number of runner-up dictionary words to
            include in the returned string. Defaults to ``5``.

    Returns:
        A ``(confidence_score, similar_words_str)`` tuple. The confidence is
        ``1`` when at most one entry lies within ``threshold`` of
        ``highest_score`` (no close competitor) and ``0`` otherwise. The
        string joins the runner-up dictionary words with ``' | '``, best
        score first, and is empty when there are none.
    """
    close_matches = [
        (word, dict_word, score)
        for word, dict_word, score in similarities
        if abs(score - highest_score) <= threshold
    ]
    # Stable sort: entries with equal scores keep their input order.
    close_matches.sort(key=lambda entry: entry[2], reverse=True)

    confidence_score = 1 if len(close_matches) <= 1 else 0

    # Skip the top entry because that's the matching word itself (this
    # assumes the best match is present in ``similarities``), then keep at
    # most ``max_similar_words`` runner-ups.
    runner_ups = close_matches[1:][:max_similar_words]
    similar_words_str = ' | '.join(dict_word for _, dict_word, _ in runner_ups)
    return confidence_score, similar_words_str
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a plain number.

    Args:
        embedding1: First embedding, passed to ``util.pytorch_cos_sim``.
        embedding2: Second embedding, passed to ``util.pytorch_cos_sim``.

    Returns:
        The result of ``util.pytorch_cos_sim`` converted with ``.item()``.
    """
    # NOTE(review): ``.item()`` presumably requires a single-element result,
    # i.e. each argument is one embedding vector -- confirm against callers.
    similarity = util.pytorch_cos_sim(embedding1, embedding2)
    return similarity.item()
def generate_embedded_dictionary(dictionary, model, preprocessor):
    """Build a lookup of preprocessed dictionary words to their embeddings.

    Args:
        dictionary: Iterable of dictionary words.
        model: Embedding model forwarded to ``generate_embedding``.
        preprocessor: Callable mapping a dictionary word to its matchable
            form; that form is used as the key and is what gets embedded.

    Returns:
        A dict keyed by the preprocessed word. Each value is a dict with
        ``'v'`` (the embedding of the preprocessed word) and ``'d'`` (the
        original dictionary word). When several words preprocess to the same
        key, the last one processed wins.
    """
    embedded = {}
    progress = tqdm(dictionary, desc="Embedding words")
    for original_word in progress:
        matchable = preprocessor(original_word)
        vector = generate_embedding(model, matchable)
        embedded[matchable] = {'v': vector, 'd': original_word}
    return embedded