# brightly-ai / utils.py
# beweinreich's picture
# remove periods from the cleaned word
# 97adb95
# raw
# history blame
# No virus
# 2.14 kB
import re
import os
import pickle
import pandas as pd
from tqdm import tqdm
from sentence_transformers import util
def generate_embedding(model, sentence):
    """Encode *sentence* with *model* and return the resulting embedding.

    Args:
        model: Object exposing ``encode(sentence, convert_to_tensor=...)``
            (presumably a ``SentenceTransformer`` -- confirm against callers).
        sentence: Text to embed.

    Returns:
        Whatever ``model.encode`` returns when asked for a tensor
        (``convert_to_tensor=True``).
    """
    embedding = model.encode(sentence, convert_to_tensor=True)
    return embedding
def clean_word(input_word):
    """Normalise a raw word/phrase into the form used for matching.

    Steps, in order:
      1. drop parenthesised qualifiers, e.g. ``"milk (whole)"``;
      2. trim and lower-case;
      3. drop the trailing-style qualifiers ``raw`` and ``nfs`` (optionally
         preceded by a comma), e.g. ``"apples, raw"`` -> ``"apples"``;
      4. collapse runs of whitespace to a single space;
      5. remove periods;
      6. trim again.

    Args:
        input_word: The string to clean. Must be a ``str``; callers are
            expected to filter out missing values beforehand.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    # Non-greedy so that each "(...)" group is removed independently.
    cleaned_word = re.sub(r'\(.*?\)', '', input_word)
    cleaned_word = cleaned_word.strip().lower()
    # Only strip "raw"/"nfs" when they are whole words: a plain substring
    # replace of " raw" would turn "beef rawhide" into "beefhide".
    cleaned_word = re.sub(r',? (?:raw|nfs)\b', '', cleaned_word)
    # Remove double or more empty spaces
    cleaned_word = re.sub(r'\s+', ' ', cleaned_word)
    # remove periods
    cleaned_word = cleaned_word.replace(".", "")
    return cleaned_word.strip()
def is_empty_word(input_word):
    """Return True when *input_word* carries no usable text.

    A value is considered empty when it is not a ``str`` at all (this covers
    ``None``, float NaN and other missing-value markers, none of which are
    strings), or when, ignoring surrounding whitespace and letter case, it is
    ``""``, ``"nan"`` or ``"none"``.

    Args:
        input_word: Any value, typically a cell read from a table.

    Returns:
        bool: True if the value should be treated as empty, else False.
    """
    if not isinstance(input_word, str):
        return True
    # Strip first so whitespace-only cells and padded placeholders such as
    # " nan " are not mistaken for real words.
    return input_word.strip().lower() in ("", "nan", "none")
def calculate_confidence_and_similar_words_str(similarities, highest_score, *, score_tolerance=0.1, max_similar_words=5):
    """Derive a confidence flag and a list of near-tie alternatives.

    Args:
        similarities: Iterable of ``(word, dict_word, score)`` triples.
        highest_score: The score of the best match; candidates are compared
            against this value.
        score_tolerance: Maximum absolute distance from ``highest_score`` for
            a candidate to count as a near tie. Defaults to 0.1.
        max_similar_words: Maximum number of alternative dictionary words to
            report. Defaults to 5.

    Returns:
        tuple: ``(confidence_score, similar_words_str)`` where
        ``confidence_score`` is 1 when at most one candidate lies within the
        tolerance (i.e. the best match has no close rival) and 0 otherwise,
        and ``similar_words_str`` is the ``' | '``-joined dictionary words of
        the runner-up candidates, best first.
    """
    high_similarities = sorted(
        [
            (word, dict_word, score)
            for word, dict_word, score in similarities
            if abs(score - highest_score) <= score_tolerance
        ],
        key=lambda item: item[2],
        reverse=True,
    )

    confidence_score = 1 if len(high_similarities) <= 1 else 0

    # The first entry is the top-scoring candidate, i.e. the match itself,
    # so only the entries after it are reported as alternatives.
    runners_up = high_similarities[1:1 + max_similar_words]
    similar_words_str = ' | '.join(dict_word for _, dict_word, _ in runners_up)

    return confidence_score, similar_words_str
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a plain Python number.

    Args:
        embedding1: First embedding, passed to ``util.pytorch_cos_sim``.
        embedding2: Second embedding, passed to ``util.pytorch_cos_sim``.

    Returns:
        The scalar obtained by calling ``.item()`` on the similarity result.
    """
    similarity = util.pytorch_cos_sim(embedding1, embedding2)
    # NOTE(review): .item() presumably requires a single-element result, i.e.
    # each argument is one embedding vector rather than a batch -- confirm.
    return similarity.item()
def generate_embedded_dictionary(dictionary, model, preprocessor):
    """Build a lookup of preprocessed dictionary words to their embeddings.

    Args:
        dictionary: Iterable of dictionary words.
        model: Embedding model forwarded to ``generate_embedding``.
        preprocessor: Callable applied to each dictionary word to obtain the
            matchable form used as the lookup key.

    Returns:
        dict: Maps each preprocessed word to ``{'v': embedding, 'd': original
        dictionary word}``. When several dictionary words preprocess to the
        same key, the entry for the last one wins.
    """
    embedded = {}
    progress = tqdm(dictionary, desc="Embedding words")
    for original_word in progress:
        matchable_word = preprocessor(original_word)
        entry = {
            'v': generate_embedding(model, matchable_word),  # embedded value
            'd': original_word,  # dictionary word as supplied
        }
        embedded[matchable_word] = entry
    return embedded