File size: 2,089 Bytes
9189e38
 
 
f2740a4
 
9189e38
 
 
 
 
 
 
 
 
fc7936d
9189e38
 
f2740a4
4edd87e
f2740a4
 
 
 
9189e38
 
ea58a70
9189e38
 
 
 
 
 
ea58a70
 
 
9189e38
 
 
 
 
 
 
 
 
05bb441
9189e38
 
05bb441
9189e38
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import re
import os
import pickle
import pandas as pd
from tqdm import tqdm
from sentence_transformers import util

def generate_embedding(model, sentence):
    """Encode *sentence* with *model*, returning the embedding as a tensor."""
    embedding = model.encode(sentence, convert_to_tensor=True)
    return embedding

def clean_word(input_word):
    """Normalize a word for dictionary matching.

    Drops any parenthesized text, lowercases, removes the ", raw" / " raw"
    and ", nfs" / " nfs" markers, and collapses whitespace runs to a single
    space before returning the stripped result.
    """
    word = re.sub(r'\(.*?\)', '', input_word)
    word = word.strip().lower()
    # Order matters: the comma-prefixed variants must be removed first.
    for marker in (", raw", " raw", ", nfs", " nfs"):
        word = word.replace(marker, "")
    word = re.sub(r'\s+', ' ', word)  # collapse runs of whitespace
    return word.strip()

def is_empty_word(input_word):
    """Return True when *input_word* is not a usable word.

    Non-strings (including None and float NaN) are empty, as are the empty
    string and the case-insensitive placeholder strings "nan" and "none".

    Note: the original also called pd.isna(input_word), but that branch was
    dead code — it was only reached when input_word was already a str, and
    pd.isna is always False for strings — so it has been removed.
    """
    if not isinstance(input_word, str):
        return True
    return input_word.lower() in {"", "nan", "none"}

def calculate_confidence_and_similar_words_str(similarities, highest_score):
    """Score match confidence and list close runner-up dictionary words.

    From (word, dict_word, score) triples, keeps those whose score lies
    within 0.1 of *highest_score*, sorted by score descending. Confidence
    is 1 when the best match has no close competitors, else 0. The top
    entry (assumed to be the match itself) is dropped and up to five
    runner-up dictionary words are joined with ' | '.
    """
    close_matches = [triple for triple in similarities
                     if abs(triple[2] - highest_score) <= 0.1]
    close_matches.sort(key=lambda triple: triple[2], reverse=True)

    confidence_score = 1 if len(close_matches) <= 1 else 0

    # Skip index 0 (the matching word itself) and take the next five.
    runner_ups = close_matches[1:6]
    similar_words_str = ' | '.join(dict_word for _, dict_word, _ in runner_ups)

    return confidence_score, similar_words_str

def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embedding tensors as a Python float."""
    similarity_tensor = util.pytorch_cos_sim(embedding1, embedding2)
    return similarity_tensor.item()

def generate_embedded_dictionary(dictionary, model, preprocessor):
    """Embed every dictionary word, keyed by its preprocessed form.

    Each key maps to a dict with 'v' (the embedding tensor of the
    preprocessed word) and 'd' (the original dictionary word). If two
    words preprocess to the same key, the later one wins.
    """
    embedded = {}
    progress = tqdm(dictionary, desc="Generating embeddings for dictionary words")
    for raw_word in progress:
        matchable = preprocessor(raw_word)
        embedded[matchable] = {
            'v': generate_embedding(model, matchable),  # value embedded
            'd': raw_word,                              # dictionary word
        }
    return embedded