import time
import os
import pickle
import re
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from db.db_utils import get_connection, initialize_db, get_mapping_from_db, store_mapping_to_db
from utils import generate_embedding, cosine_similarity, clean_word, calculate_confidence_and_similar_words_str
# model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model_name = 'sentence-transformers/all-mpnet-base-v2'
# Sanitize the model name for use as a filename and key the embedding cache
# by model, so caches from different models are never mixed up.
filename = model_name.replace('/', '-')
pickle_file_path = f'./embeddings/slow/{filename}.pkl'
class SimilaritySlow:
    """Match free-text food descriptions to dictionary entries via embeddings.

    On construction, loads every ``description`` from the ``dictionary`` table
    and obtains (from a pickle cache, or by computing and caching) sentence
    embeddings for several preprocessed variants of each description. Lookups
    score a cleaned input word against all cached embeddings by cosine
    similarity.
    """

    def __init__(self, db_cursor, db_conn):
        self.db_cursor = db_cursor
        self.db_conn = db_conn
        self.model = SentenceTransformer(model_name)
        # Fetch every dictionary description once; embeddings derive from these.
        self.db_cursor.execute("SELECT description FROM dictionary")
        dictionary = [row[0] for row in self.db_cursor.fetchall()]
        self.dictionary_embeddings = self.load_dictionary_embeddings(dictionary)

    def preprocess_dictionary_word(self, text):
        """Return four comma/word-order variants of ``text``.

        Strips the ", raw"/" raw"/", nfs"/" nfs" suffixes, then yields:
        word-order-reversed without commas, comma-segment-reversed, the
        cleaned text itself, and the cleaned text with commas removed —
        so an input can match regardless of phrase ordering.
        """
        text = (
            text.strip().lower()
            .replace(", raw", "").replace(" raw", "")
            .replace(", nfs", "").replace(" nfs", "")
        )
        words = text.split()
        return [
            ' '.join(reversed(words)).replace(',', ''),
            ', '.join(reversed(text.split(', '))),
            text,
            ' '.join(words).replace(',', ''),
        ]

    def load_dictionary_embeddings(self, dictionary):
        """Load cached embeddings from disk, or build and cache them.

        Returns a dict mapping each preprocessed variant to
        ``{'v': <embedding>, 'd': <original dictionary word>}``.
        NOTE(review): a variant shared by two dictionary words collides and
        the later word wins — confirm that is acceptable.
        """
        if os.path.exists(pickle_file_path):
            # NOTE(review): pickle.load is only safe on this locally written
            # cache; never point pickle_file_path at untrusted data.
            with open(pickle_file_path, 'rb') as f:
                return pickle.load(f)
        dictionary_embeddings = {}
        for dictionary_word in tqdm(dictionary, desc="Generating embeddings for dictionary words"):
            for variant in self.preprocess_dictionary_word(dictionary_word):
                dictionary_embeddings[variant] = {
                    'v': generate_embedding(self.model, variant),
                    'd': dictionary_word,
                }
        # Ensure the cache directory exists before the first write.
        os.makedirs(os.path.dirname(pickle_file_path), exist_ok=True)
        with open(pickle_file_path, 'wb') as f:
            pickle.dump(dictionary_embeddings, f)
        return dictionary_embeddings

    def calculate_similarity_score(self, input_word_clean):
        """Score ``input_word_clean`` against every cached dictionary embedding.

        Returns a 5-tuple: (best-matching variant, its dictionary word,
        highest cosine similarity, confidence score, similar-words string).
        """
        input_embedding = generate_embedding(self.model, input_word_clean)
        similarities = [
            (variant, entry['d'], cosine_similarity(input_embedding, entry['v']))
            for variant, entry in self.dictionary_embeddings.items()
        ]
        most_similar_word, dictionary_word, highest_score = max(similarities, key=lambda x: x[2])
        confidence_score, similar_words_str = calculate_confidence_and_similar_words_str(similarities, highest_score)
        return most_similar_word, dictionary_word, highest_score, confidence_score, similar_words_str

    def find_most_similar_word(self, input_word):
        """Map ``input_word`` to its best dictionary match.

        Returns a mapping dict (input/cleaned word, matched dictionary word,
        similarity/confidence scores, similar-words string), or ``None`` for
        empty or non-string input.
        """
        if not isinstance(input_word, str) or not input_word:
            return None
        input_word_clean = clean_word(input_word)
        most_similar_word, dictionary_word, highest_score, confidence_score, similar_words_str = \
            self.calculate_similarity_score(input_word_clean)
        return {
            'input_word': input_word,
            'cleaned_word': input_word_clean,
            'dictionary_word': dictionary_word,
            'similarity_score': highest_score,
            'confidence_score': confidence_score,
            'similar_words': similar_words_str,
        }