# NOTE: stray paste artifact ("Spaces: / Paused / Paused") removed — not part of the program.
import os
import pickle
import re
import time

import pandas as pd
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

from db.db_utils import get_connection, initialize_db, get_mapping_from_db, store_mapping_to_db
from utils import generate_embedding, cosine_similarity, clean_word, calculate_confidence_and_similar_words_str
# Alternative, smaller/faster model kept for reference:
# model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model_name = 'sentence-transformers/all-mpnet-base-v2'

# Filesystem-safe variant of the model name ('/' is not a valid path character).
filename = model_name.replace('/', '-')

# BUG FIX: the path previously contained the literal string '(unknown)' (a broken
# template substitution) and never used `filename`, so every model shared one wrong
# cache file. Key the embedding cache by the sanitized model name instead.
pickle_file_path = f'./embeddings/slow/{filename}.pkl'
class SimilaritySlow:
    """Brute-force semantic matcher: maps a free-text word to the closest
    dictionary description by cosine similarity over sentence-transformer
    embeddings of every preprocessed dictionary variant.

    "Slow" because each lookup scans all cached embeddings linearly.
    """

    def __init__(self, db_cursor, db_conn):
        """Load the model, fetch all dictionary descriptions from the DB,
        and build (or load from cache) their embeddings."""
        self.db_cursor = db_cursor
        self.db_conn = db_conn
        self.model = SentenceTransformer(model_name)
        self.db_cursor.execute("SELECT description FROM dictionary")
        dictionary = [row[0] for row in self.db_cursor.fetchall()]
        self.dictionary_embeddings = self.load_dictionary_embeddings(dictionary)

    def preprocess_dictionary_word(self, text):
        """Return four textual variants of a dictionary entry so that word-order
        and comma differences in user input can still match.

        Strips ", raw"/" raw"/", nfs"/" nfs" qualifiers first.
        """
        text = text.strip().lower().replace(", raw", "").replace(" raw", "").replace(", nfs", "").replace(" nfs", "")
        words = text.split()
        return [
            ' '.join(reversed(words)).replace(',', ''),  # word order reversed, commas stripped
            ', '.join(reversed(text.split(', '))),       # comma-separated segments reversed
            text,                                        # cleaned original
            ' '.join(words).replace(',', '')             # original order, commas stripped
        ]

    def load_dictionary_embeddings(self, dictionary):
        """Return {variant: {'v': embedding, 'd': original description}}.

        Uses a pickle cache on disk; generates and caches embeddings on a miss.
        """
        if os.path.exists(pickle_file_path):
            # NOTE: pickle is only acceptable here because the cache file is
            # produced locally by this same process — never load untrusted pickles.
            with open(pickle_file_path, 'rb') as f:
                return pickle.load(f)
        dictionary_embeddings = {}
        for dictionary_word in tqdm(dictionary, desc="Generating embeddings for dictionary words"):
            for preprocessed_word in self.preprocess_dictionary_word(dictionary_word):
                dictionary_embeddings[preprocessed_word] = {
                    'v': generate_embedding(self.model, preprocessed_word),
                    'd': dictionary_word
                }
        # BUG FIX: ensure the cache directory exists before writing — otherwise
        # open(..., 'wb') raises FileNotFoundError on a fresh checkout.
        os.makedirs(os.path.dirname(pickle_file_path), exist_ok=True)
        with open(pickle_file_path, 'wb') as f:
            pickle.dump(dictionary_embeddings, f)
        return dictionary_embeddings

    def calculate_similarity_score(self, input_word_clean):
        """Scan every cached variant embedding and return the best match as
        (variant, dictionary_word, score, confidence, similar_words_str).

        Raises ValueError (from max) if the embedding cache is empty.
        """
        input_embedding = generate_embedding(self.model, input_word_clean)
        similarities = [
            (key, val['d'], cosine_similarity(input_embedding, val['v']))
            for key, val in self.dictionary_embeddings.items()
        ]
        most_similar_word, dictionary_word, highest_score = max(similarities, key=lambda x: x[2])
        confidence_score, similar_words_str = calculate_confidence_and_similar_words_str(similarities, highest_score)
        return most_similar_word, dictionary_word, highest_score, confidence_score, similar_words_str

    def find_most_similar_word(self, input_word):
        """Return a mapping dict describing the closest dictionary match for
        `input_word`, or None when the input is not a non-empty string
        (e.g. NaN floats coming out of pandas)."""
        if not isinstance(input_word, str) or not input_word:
            return None
        input_word_clean = clean_word(input_word)
        most_similar_word, dictionary_word, highest_score, confidence_score, similar_words_str = \
            self.calculate_similarity_score(input_word_clean)
        return {
            'input_word': input_word,
            'cleaned_word': input_word_clean,
            'matching_word': most_similar_word,
            'dictionary_word': dictionary_word,
            'similarity_score': highest_score,
            'confidence_score': confidence_score,
            'similar_words': similar_words_str,
        }