Spaces:

madebybread
/

brightly-ai

Paused

App Files Files Community

brightly-ai / utils.py

beweinreich

remove periods from the cleaned word

97adb95 18 days ago

raw

history blame

No virus

2.14 kB

	import re
	import os
	import pickle
	import pandas as pd
	from tqdm import tqdm
	from sentence_transformers import util

	def generate_embedding(model, sentence):
	    """Encode ``sentence`` with ``model`` and return the result.

	    Args:
	        model: Object exposing ``encode(sentence, convert_to_tensor=...)``.
	        sentence: The text handed to ``model.encode``.

	    Returns:
	        Whatever ``model.encode`` returns when called with
	        ``convert_to_tensor=True``.
	    """
	    embedding = model.encode(sentence, convert_to_tensor=True)
	    return embedding

	def clean_word(input_word):
	    """Normalize a word or phrase into its matchable form.

	    Steps, in order: drop parenthesised text, trim and lower-case, remove
	    the qualifiers ", raw", " raw", ", nfs" and " nfs", delete periods,
	    collapse whitespace runs into single spaces, and trim again.

	    Args:
	        input_word: The string to normalize. Non-string values are not
	            handled here.

	    Returns:
	        The normalized string, which may be empty.
	    """
	    # Non-greedy so each "(...)" group is removed on its own.
	    cleaned_word = re.sub(r'\(.*?\)', '', input_word)
	    cleaned_word = cleaned_word.strip().lower()

	    # NOTE(review): these are plain substring removals, so " raw" would also
	    # match the beginning of a longer word that follows a space.
	    for qualifier in (", raw", " raw", ", nfs", " nfs"):
	        cleaned_word = cleaned_word.replace(qualifier, "")

	    # Remove periods BEFORE collapsing whitespace: deleting a stand-alone
	    # "." (as in "st . john") would otherwise leave a double space behind.
	    cleaned_word = cleaned_word.replace(".", "")

	    # Collapse any run of whitespace into a single space.
	    cleaned_word = re.sub(r'\s+', ' ', cleaned_word)
	    return cleaned_word.strip()

	def is_empty_word(input_word):
	    """Report whether ``input_word`` should be treated as missing.

	    Returns True for any non-string value, for values ``pd.isna`` flags,
	    for the empty string, and for the strings "nan" / "none" in any
	    letter case. Returns False for every other string.
	    """
	    # Anything that is not a string counts as empty.
	    if not isinstance(input_word, str):
	        return True
	    if pd.isna(input_word):
	        return True

	    lowered = input_word.lower()
	    return input_word == "" or lowered in ("nan", "none")

	def calculate_confidence_and_similar_words_str(similarities, highest_score):
	    """Score match confidence and list the closest runner-up words.

	    Args:
	        similarities: Iterable of ``(word, dict_word, score)`` triples.
	        highest_score: Score of the best match. Entries whose score lies
	            within 0.1 of it are treated as close candidates.

	    Returns:
	        A ``(confidence_score, similar_words_str)`` tuple.
	        ``confidence_score`` is 1 when at most one entry is a close
	        candidate and 0 otherwise. ``similar_words_str`` joins the
	        ``dict_word`` of up to five runner-up candidates (highest score
	        first) with a backslash-escaped pipe separator; it is empty when
	        there are no runners-up.
	    """
	    close_matches = sorted(
	        [
	            (word, dict_word, score)
	            for word, dict_word, score in similarities
	            if abs(score - highest_score) <= 0.1
	        ],
	        key=lambda match: match[2],
	        reverse=True,
	    )

	    # More than one candidate near the top score means the match is ambiguous.
	    confidence_score = 1 if len(close_matches) <= 1 else 0

	    # Skip the first entry (the top score, i.e. the matching word itself)
	    # and keep at most the next five.
	    runners_up = close_matches[1:6]

	    # The separator is a literal backslash followed by a pipe. It is written
	    # with an escaped backslash because the bare `\|` form is an invalid
	    # escape sequence that newer Python versions warn about.
	    # NOTE(review): presumably the backslash escapes the pipe for Markdown
	    # table output -- confirm, and drop it if a plain pipe was intended.
	    similar_words_str = ' \\| '.join(dict_word for _, dict_word, _ in runners_up)

	    return confidence_score, similar_words_str

	def cosine_similarity(embedding1, embedding2):
	    """Return the cosine similarity between two embeddings as a scalar.

	    Delegates to ``util.pytorch_cos_sim`` and unwraps the result with
	    ``.item()``.

	    NOTE(review): ``.item()`` presumably requires the result to hold
	    exactly one value, i.e. a single embedding per argument -- confirm
	    against callers.
	    """
	    similarity = util.pytorch_cos_sim(embedding1, embedding2)
	    return similarity.item()

	def generate_embedded_dictionary(dictionary, model, preprocessor):
	    """Build a lookup of preprocessed dictionary words to their embeddings.

	    Args:
	        dictionary: Iterable of dictionary words.
	        model: Model passed through to ``generate_embedding``.
	        preprocessor: Callable applied to each word to obtain its
	            matchable form, which becomes the key in the result.

	    Returns:
	        A dict keyed by the preprocessed word. Each value is a dict with
	        ``'v'`` (the embedding of the preprocessed word) and ``'d'`` (the
	        original dictionary word). Words that preprocess to the same key
	        overwrite earlier entries.
	    """
	    embedded = {}
	    progress = tqdm(dictionary, desc="Embedding words")
	    for original_word in progress:
	        key = preprocessor(original_word)  # matchable word
	        vector = generate_embedding(model, key)  # value embedded
	        embedded[key] = {'v': vector, 'd': original_word}

	    return embedded