Spaces:

madebybread
/

brightly-ai

Paused

App Files Files Community

brightly-ai / utils.py

beweinreich

audited

05bb441 about 1 month ago

raw

history blame

No virus

1.7 kB

	import re
	import os
	from tqdm import tqdm
	import pickle
	from sentence_transformers import util

	def generate_embedding(model, sentence):
	    """Encode ``sentence`` with ``model`` and return the result.

	    Args:
	        model: Any object exposing ``encode(sentence, convert_to_tensor=...)``.
	        sentence: The input handed to ``model.encode`` unchanged.

	    Returns:
	        Whatever ``model.encode`` returns when called with
	        ``convert_to_tensor=True`` (presumably a tensor -- confirm against
	        the model library in use).
	    """
	    embedding = model.encode(sentence, convert_to_tensor=True)
	    return embedding

	def clean_word(input_word):
	    """Normalize a word or phrase into a lower-case, matchable form.

	    Steps, in order:
	      1. Remove every parenthesised segment, e.g. ``"apple (red)"``.
	      2. Trim surrounding whitespace and lower-case the text.
	      3. Remove the markers ``", raw"``, ``" raw"``, ``", nfs"`` and ``" nfs"``.
	      4. Collapse runs of whitespace into a single space.
	      5. Trim again, because steps 1 and 3 can leave whitespace at the edges
	         (``"Apple (red), raw"`` would otherwise come back as ``"apple "``).

	    Args:
	        input_word: The raw string to normalize.

	    Returns:
	        The normalized string, with no leading or trailing whitespace.
	    """
	    # Non-greedy match so that each "(...)" group is removed separately.
	    cleaned_word = re.sub(r'\(.*?\)', '', input_word)
	    cleaned_word = cleaned_word.strip().lower()

	    # Order matters: the comma form must be removed before the bare form,
	    # otherwise "x, raw" would become "x," instead of "x".
	    for marker in (", raw", " raw", ", nfs", " nfs"):
	        cleaned_word = cleaned_word.replace(marker, "")

	    cleaned_word = re.sub(r'\s+', ' ', cleaned_word)  # Collapse repeated whitespace

	    # Final trim: marker/parenthesis removal may expose whitespace at the edges.
	    return cleaned_word.strip()

	def calculate_confidence_and_similar_words_str(similarities, highest_score, *, tolerance=0.05, max_similar=5):
	    """Score how unambiguous the best match is and list its close rivals.

	    Args:
	        similarities: Iterable of ``(word, dict_word, score)`` triples.
	        highest_score: The reference score that candidates are compared to.
	        tolerance: A candidate counts as "close" when its score differs from
	            ``highest_score`` by at most this amount. Defaults to ``0.05``.
	        max_similar: Maximum number of close candidates to list.
	            Defaults to ``5``.

	    Returns:
	        A ``(confidence_score, similar_words_str)`` tuple where
	        ``confidence_score`` is ``1`` when at most one candidate is close to
	        ``highest_score`` and ``0`` otherwise, and ``similar_words_str`` is the
	        ``dict_word`` of the best ``max_similar`` close candidates (highest
	        score first, ties in input order) joined by the separator below.
	    """
	    close_matches = sorted(
	        [
	            (dict_word, score)
	            for _, dict_word, score in similarities
	            if abs(score - highest_score) <= tolerance
	        ],
	        key=lambda pair: pair[1],
	        reverse=True,
	    )

	    confidence_score = 1 if len(close_matches) <= 1 else 0

	    # Keep only the top-ranked candidates for the human-readable summary.
	    similar_words = [dict_word for dict_word, _ in close_matches[:max_similar]]

	    # Raw string: the previous literal ' \| ' relied on the invalid escape
	    # sequence "\|", which emits a SyntaxWarning on recent Python versions.
	    # The runtime value (space, backslash, pipe, space) is unchanged.
	    # NOTE(review): the backslash ends up in the output -- confirm whether a
	    # plain ' | ' separator was intended.
	    similar_words_str = r' \| '.join(similar_words)

	    return confidence_score, similar_words_str

	def cosine_similarity(embedding1, embedding2):
	    """Return the cosine similarity of two embeddings as a plain Python number.

	    Delegates to ``util.pytorch_cos_sim`` and unwraps the result with
	    ``.item()``.

	    NOTE(review): ``.item()`` presumably requires the result to hold exactly
	    one value, i.e. each argument is a single embedding -- confirm with callers.
	    """
	    similarity = util.pytorch_cos_sim(embedding1, embedding2)
	    return similarity.item()

	def generate_embedded_dictionary(dictionary, model, preprocessor):
	    """Build a lookup of preprocessed dictionary words to their embeddings.

	    Args:
	        dictionary: Iterable of dictionary words.
	        model: Model passed through to ``generate_embedding``.
	        preprocessor: Callable applied to each dictionary word to produce the
	            key used for matching.

	    Returns:
	        A dict keyed by the preprocessed word. Each value is a dict with
	        ``'v'`` (the embedding of the preprocessed word) and ``'d'`` (the
	        original dictionary word). If two words preprocess to the same key,
	        the later one replaces the earlier one.
	    """
	    embedded = {}
	    progress = tqdm(dictionary, desc="Generating embeddings for dictionary words")
	    for original in progress:
	        key = preprocessor(original)  # matchable word
	        vector = generate_embedding(model, key)
	        embedded[key] = {
	            'v': vector,  # value embedded
	            'd': original,  # dictionary word
	        }
	    return embedded