brightly-ai / utils.py
beweinreich's picture
audited
05bb441
raw
history blame
No virus
1.7 kB
import re
import os
from tqdm import tqdm
import pickle
from sentence_transformers import util
def generate_embedding(model, sentence):
    """Encode *sentence* with *model* and return the result as a tensor.

    Args:
        model: Any object exposing ``encode(sentence, convert_to_tensor=...)``
            (presumably a ``SentenceTransformer`` -- confirm against callers).
        sentence: The text passed straight through to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns when asked for a tensor.
    """
    embedding = model.encode(sentence, convert_to_tensor=True)
    return embedding
def clean_word(input_word):
    """Normalise a food description into a bare, lower-case matchable word.

    The steps are:
      1. drop every parenthesised qualifier, e.g. ``"(peeled)"``;
      2. trim and lower-case the text;
      3. remove the trailing-style qualifiers ``raw`` and ``nfs`` (with an
         optional preceding comma) when they appear as whole words;
      4. collapse runs of whitespace and trim the result.

    Args:
        input_word: The raw description string.

    Returns:
        The cleaned string with no leading or trailing whitespace.
    """
    # Non-greedy so that each "(...)" group is removed separately.
    cleaned_word = re.sub(r'\(.*?\)', '', input_word)
    cleaned_word = cleaned_word.strip().lower()
    # Whole-word match only: a plain substring replace of " raw" would also
    # cut into longer words such as "rawhide".
    cleaned_word = re.sub(r',? (?:raw|nfs)\b', '', cleaned_word)
    # Collapse double (or longer) runs of whitespace into a single space.
    cleaned_word = re.sub(r'\s+', ' ', cleaned_word)
    # Removing a qualifier can expose whitespace at the edges (for example
    # "carrots (peeled), raw" -> "carrots "), so trim once more at the end.
    return cleaned_word.strip()
def calculate_confidence_and_similar_words_str(similarities, highest_score, *, tolerance=0.05, max_words=5):
    """Score how unambiguous the best match is and list the near-best words.

    Args:
        similarities: Iterable of ``(word, dict_word, score)`` triples.
        highest_score: The best score, used as the reference point.
        tolerance: Maximum absolute distance from ``highest_score`` for a
            triple to count as a near-best candidate. Defaults to 0.05.
        max_words: Maximum number of dictionary words included in the
            returned string. Defaults to 5.

    Returns:
        A ``(confidence_score, similar_words_str)`` tuple.
        ``confidence_score`` is 1 when at most one candidate lies within
        ``tolerance`` of ``highest_score`` and 0 otherwise.
        ``similar_words_str`` is the dictionary words of the top candidates
        (highest score first) joined with ``' | '``.
    """
    # Keep only candidates close to the best score, best first. ``sorted`` is
    # stable, so equally scored candidates keep their input order.
    high_similarities = sorted(
        [
            (word, dict_word, score)
            for word, dict_word, score in similarities
            if abs(score - highest_score) <= tolerance
        ],
        key=lambda candidate: candidate[2],
        reverse=True,
    )

    # More than one near-best candidate means the match is ambiguous.
    confidence_score = 1 if len(high_similarities) <= 1 else 0

    similar_words = [dict_word for _, dict_word, _ in high_similarities[:max_words]]
    similar_words_str = ' | '.join(similar_words)

    return confidence_score, similar_words_str
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a Python scalar.

    Delegates to ``util.pytorch_cos_sim`` and unwraps the result with
    ``.item()``.

    NOTE(review): ``.item()`` presumably requires the result to hold exactly
    one value, i.e. each argument is a single embedding -- confirm.
    """
    similarity = util.pytorch_cos_sim(embedding1, embedding2)
    return similarity.item()
def generate_embedded_dictionary(dictionary, model, preprocessor):
    """Build a lookup of preprocessed dictionary words to their embeddings.

    Args:
        dictionary: Iterable of dictionary words.
        model: Embedding model forwarded to ``generate_embedding``.
        preprocessor: Callable applied to each dictionary word to produce the
            matchable key.

    Returns:
        A dict keyed by the preprocessed word. Each value is a dict with
        ``'v'`` (the embedding of the preprocessed word) and ``'d'`` (the
        original dictionary word). If several words preprocess to the same
        key, the last one seen replaces the earlier entries.
    """
    embedded = {}
    progress = tqdm(dictionary, desc="Generating embeddings for dictionary words")
    for original_word in progress:
        matchable_word = preprocessor(original_word)
        embedding = generate_embedding(model, matchable_word)
        embedded[matchable_word] = {
            'v': embedding,      # value embedded
            'd': original_word,  # dictionary word
        }
    return embedded