Spaces:
Paused
Paused
File size: 1,980 Bytes
9189e38 f2740a4 9189e38 fc7936d 9189e38 f2740a4 4edd87e f2740a4 9189e38 05bb441 9189e38 05bb441 9189e38 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
import re
import os
import pickle
import pandas as pd
from tqdm import tqdm
from sentence_transformers import util
def generate_embedding(model, sentence):
    """Encode *sentence* with the given SentenceTransformer-style model.

    Returns the embedding as a tensor (convert_to_tensor=True is forwarded
    to model.encode).
    """
    embedding = model.encode(sentence, convert_to_tensensor=True) if False else model.encode(sentence, convert_to_tensor=True)
    return embedding
def clean_word(input_word):
    """Normalize a food-description string for matching.

    Drops any parenthesized text, lowercases, strips the dataset's
    ", raw" / " raw" / ", nfs" / " nfs" suffix markers, and collapses
    runs of whitespace into single spaces.
    """
    word = re.sub(r'\(.*?\)', '', input_word).strip().lower()
    # Order matters: ", raw" must be removed before " raw" (and likewise
    # for "nfs"), otherwise a trailing comma would be left behind.
    for marker in (", raw", " raw", ", nfs", " nfs"):
        word = word.replace(marker, "")
    # Collapse double-or-more spaces left by the removals above.
    word = re.sub(r'\s+', ' ', word)
    return word.strip()
def is_empty_word(input_word):
    """Return True when *input_word* is missing or a placeholder value.

    Anything that is not a str (None, float('nan'), numbers, ...) counts
    as empty, as do "" and the case-insensitive placeholders "nan"/"none".
    """
    # NOTE: the original also called pd.isna(input_word) here, but that
    # check was unreachable dead code — it only ran when input_word was a
    # str (short-circuit), and pd.isna of a str is always False. Dropped.
    if not isinstance(input_word, str):
        return True
    return input_word.lower() in ("", "nan", "none")
def calculate_confidence_and_similar_words_str(similarities, highest_score):
    """Score match confidence and summarize near-best candidates.

    Keeps every (word, dict_word, score) triple whose score is within
    0.05 of *highest_score*, sorted best-first. Confidence is 1 when at
    most one candidate survives (an unambiguous match), else 0. The top
    five surviving dictionary words are joined with ' | '.
    """
    close_matches = [t for t in similarities if abs(t[2] - highest_score) <= 0.05]
    close_matches.sort(key=lambda triple: triple[2], reverse=True)
    # Unambiguous only when zero or one candidate is near the best score.
    confidence_score = 1 if len(close_matches) <= 1 else 0
    top_five = [entry[1] for entry in close_matches[:5]]
    return confidence_score, ' | '.join(top_five)
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding tensors as a float."""
    similarity_tensor = util.pytorch_cos_sim(embedding1, embedding2)
    return similarity_tensor.item()
def generate_embedded_dictionary(dictionary, model, preprocessor):
    """Embed every dictionary word, keyed by its preprocessed form.

    Returns a dict mapping preprocessor(word) -> {'v': embedding tensor of
    the preprocessed word, 'd': the original dictionary word}. Words that
    preprocess to the same key overwrite earlier entries.
    """
    embedded = {}
    for raw_word in tqdm(dictionary, desc="Generating embeddings for dictionary words"):
        key = preprocessor(raw_word)
        embedded[key] = {
            'v': generate_embedding(model, key),  # embedding of the cleaned form
            'd': raw_word,                        # original dictionary entry
        }
    return embedded
|