# brightly-ai / algo.py
import time
from tqdm import tqdm
import pandas as pd
from pluralizer import Pluralizer
from similarity_fast import SimilarityFast
from food_nonfood import classify_as_food_nonfood, pessimistic_food_nonfood_score
from utils import clean_word
from db.db_utils import store_mapping_to_db, get_mapping_from_db, get_dictionary_data_from_db
from ask_gpt import query_gpt
from multi_food_item_detector import extract_items
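
# Minimum similarity score for accepting a fast-similarity match; below this
# (and when the cleaned word is not a substring of the match), perform_mapping
# falls back to asking GPT for a better candidate.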
similarity_threshold = 0.75


class Algo:
    def __init__(self, db_conn, enable_csv=False):
        self.db_conn = db_conn
        self.enable_csv = enable_csv
        self.db_cursor = db_conn.cursor()
        self.similarity_fast = SimilarityFast(self.db_cursor)
        # self.similarity_slow = SimilaritySlow(self.db_cursor, self.db_conn)
        self.pluralizer = Pluralizer()

    def save_to_csv(self, results):
        if not self.enable_csv:
            return

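        # NOTE: assumes the ./results/ directory already exists;
        # pandas.DataFrame.to_csv will not create missing directories.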
        output_file_path = f'./results/{int(time.time())}.csv'
        df_results = pd.DataFrame(results, columns=[
            'input_word', 'cleaned_word', 'matching_word',
            'dictionary_word', 'similarity_score', 'confidence_score',
            'similar_words', 'is_food', 'food_nonfood_score'
        ])
        df_results.to_csv(output_file_path, index=False)

    def perform_mapping(self, input_word, attempts=0):
        mapping = self.similarity_fast.find_most_similar_word(input_word)

        # skip slow mapping for now
        # if mapping['similarity_score'] < similarity_threshold:
        #     print("Attempting slow mapping")
        #     slow_mapping = self.similarity_slow.find_most_similar_word(input_word)
        #     print(f" - Slow: {slow_mapping}")
        #     if slow_mapping['similarity_score'] > mapping['similarity_score']:
        #         mapping = slow_mapping

        # if mapping['similarity_score'] < similarity_threshold and len(input_word.split(' ')) > 1:
        #     print(" - Attempting reverse mapping")
        #     reversed_input_word = ' '.join(input_word.split(' ')[::-1])
        #     reversed_mapping = self.similarity_fast.find_most_similar_word(reversed_input_word)
        #     if reversed_mapping['similarity_score'] > mapping['similarity_score']:
        #         reversed_mapping.update(
        #             {
        #                 'input_word': input_word,
        #                 'cleaned_word': mapping['cleaned_word']
        #             }
        #         )
        #         mapping = reversed_mapping

        # check if the cleaned_word is a substring of the matching_word
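        # e.g. cleaned_word "beans" inside matching_word "black beans"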
        is_substring = mapping['cleaned_word'] in mapping['matching_word']

        if mapping['similarity_score'] < similarity_threshold and not is_substring:
            print(" - Attempting GPT mapping")
            try:
                gpt_recommended_word = query_gpt(input_word)
                if gpt_recommended_word:
                    if gpt_recommended_word == 'Non-Food Item':
                        mapping.update(
                            {
                                'similarity_score': 1.0,
                                'confidence_score': 1.0,
                                'is_food': False,
                                'food_nonfood_score': 1.0
                            }
                        )
                        return mapping
                    elif gpt_recommended_word == 'Mixed Food Items':
                        mapping.update(
                            {
                                'matching_word': 'Mixed Food Items',
                                'dictionary_word': 'Mixed Food Items',
                                'similarity_score': 1.0,
                                'confidence_score': 1.0
                            }
                        )
                        return mapping
                    else:
                        gpt_mapping = self.similarity_fast.find_most_similar_word(gpt_recommended_word)
                        if gpt_mapping['similarity_score'] > mapping['similarity_score']:
                            gpt_mapping.update(
                                {
                                    'input_word': input_word,
                                    'cleaned_word': mapping['cleaned_word']
                                }
                            )
                            mapping = gpt_mapping
            except Exception as e:
                print(f" - Error querying GPT: {e}")

        return mapping

    def handle_multi_item(self, input_word):
        # The input word has a comma or a slash in it.
        # If it has more commas, it's comma-delimited;
        # if it has more slashes, it's slash-delimited;
        # if it has an equal number of commas and slashes, we go with slashes.
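        # Illustrative examples (assuming extract_items implements the rules above):
        #   "rice, beans, oil" -> ["rice", "beans", "oil"]   (more commas)
        #   "rice/beans"       -> ["rice", "beans"]          (more slashes)
        #   "rice, beans/oil"  -> ["rice, beans", "oil"]     (tie -> split on slashes)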
        input_word_parts = extract_items(input_word)
        print(f" - Extracted items: {input_word_parts}")

        mappings = []
        for part in input_word_parts:
            mapping = self.handle_single_item(part)
            mappings.append(mapping)

        # Look up the dictionary values for each mapping and find the wweia category.
        # If all mappings share the same wweia category the item is homogeneous,
        # otherwise it is heterogeneous.
        # If is_food is False for any mapping, return "Non-Food Item" as the dictionary word.
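        # Illustrative (hypothetical dictionary values):
        #   "chicken, rice" -> both food, neither maps to "Heterogeneous Mixture"
        #                   -> return the item with the lowest dry matter content
        #   "chicken, stew" -> if "stew" maps to "Heterogeneous Mixture"
        #                   -> return the default mixture values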
        for mapping in mappings:
            if mapping['is_food'] == False:
                return {
                    'input_word': input_word,
                    'cleaned_word': mapping['cleaned_word'],
                    'matching_word': 'Non-Food Item',
                    'dictionary_word': 'Non-Food Item',
                    'similarity_score': None,
                    'confidence_score': None,
                    'similar_words': None,
                    'is_food': False,
                    'food_nonfood_score': 1.0
                }

        dictionary_words = [mapping['dictionary_word'] for mapping in mappings]
        if len(set(dictionary_words)) == 0:
            return {
                'input_word': input_word,
                'cleaned_word': None,
                'matching_word': None,
                'dictionary_word': None,
                'similarity_score': None,
                'confidence_score': None,
                'similar_words': None,
                'is_food': None,
                'food_nonfood_score': None
            }

# check if "heterogeneous" is in the wweia category of any of the mappings
# otherwise we find the mapping with the lowest DMC value, and return that as the dictionary word, dmc, wc, and leakage values
heterogeneous_exists = False
most_conservative_mapping = None
for mapping in mappings:
if mapping['wweia_category'] == "Heterogeneous Mixture":
heterogeneous_exists = True
break
else:
if most_conservative_mapping is None or mapping['dry_matter_content'] < most_conservative_mapping['dry_matter_content']:
most_conservative_mapping = mapping
mixture_data = {}
if heterogeneous_exists:
mixture_data = {
'matching_word': 'Heterogeneous Mixture',
'dictionary_word': 'Heterogeneous Mixture',
'wweia_category': 'Heterogeneous Mixture',
'dry_matter_content': 0.27,
'water_content': 0.73,
'leakage': 0.1
}
dictionary_word = 'Hetereogenous Mixture'
if most_conservative_mapping is not None:
dictionary_word = f"{most_conservative_mapping['dictionary_word']} (Lowest DMC)"
return {
'input_word': input_word,
'cleaned_word': None,
'matching_word': dictionary_word,
'dictionary_word': dictionary_word,
'similarity_score': None,
'confidence_score': None,
'similar_words': None,
'is_food': True,
'food_nonfood_score': 1.0,
'wweia_category': most_conservative_mapping['wweia_category'],
'water_content': most_conservative_mapping['water_content'],
'dry_matter_content': most_conservative_mapping['dry_matter_content'],
'leakage': most_conservative_mapping['leakage']
}

    def handle_single_item(self, input_word):
        input_word_clean = clean_word(input_word)

        # try the singular form of the word
        singular = self.pluralizer.pluralize(input_word_clean, 1)
        mapping = get_mapping_from_db(self.db_cursor, singular)
        if mapping:
            print(f" - Found mapping in db: {mapping}")
            return self.wrap_mapping_with_dictionary_data(mapping)

        # try the plural form of the word
        plural = self.pluralizer.pluralize(input_word_clean, 2)
        mapping = get_mapping_from_db(self.db_cursor, plural)
        if mapping:
            print(f" - Found mapping in db: {mapping}")
            return self.wrap_mapping_with_dictionary_data(mapping)

        food_nonfood = classify_as_food_nonfood(input_word)

        # if we're very confident that the word is non-food, don't even try to map it
        if food_nonfood[1] > 0.9 and food_nonfood[0] == False:
            mapping = {
                'input_word': input_word,
                'cleaned_word': input_word_clean,
                'matching_word': 'Non-Food Item',
                'dictionary_word': 'Non-Food Item',
                'similarity_score': None,
                'confidence_score': None,
                'similar_words': None,
                'is_food': False,
                'food_nonfood_score': food_nonfood[1]
            }
            store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
            return self.wrap_mapping_with_dictionary_data(mapping)

        mapping = self.perform_mapping(input_word)
        food_nonfood_pessimistic = pessimistic_food_nonfood_score(food_nonfood, mapping['similarity_score'])
        mapping.update({
            'is_food': food_nonfood_pessimistic[0],
            'food_nonfood_score': food_nonfood_pessimistic[1]
        })

        print(f" - Storing new mapping to db: {mapping}")
        store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
        return self.wrap_mapping_with_dictionary_data(mapping)

    def wrap_mapping_with_dictionary_data(self, mapping):
        if not mapping:
            return None

        dictionary_result = get_dictionary_data_from_db(self.db_cursor, mapping['dictionary_word'])
        mapping.update({
            'wweia_category': dictionary_result['wweia_category'] if dictionary_result else None,
            'water_content': dictionary_result['water_content'] if dictionary_result else None,
            'dry_matter_content': dictionary_result['dry_matter_content'] if dictionary_result else None,
            'leakage': dictionary_result['leakage'] if dictionary_result else None
        })
        return mapping

    def match_words(self, input_words, stream_results=False):
        results = []
        for input_word in tqdm(input_words, desc="Processing input words"):
            # skip empty / NaN inputs
            if not isinstance(input_word, str) or pd.isna(input_word) or input_word == "" or input_word.lower() == "nan":
                continue

            print()
            print(f"Processing: {input_word}")

            # multi-item inputs are delimited by commas or slashes
            if ',' in input_word or '/' in input_word:
                mapping = self.handle_multi_item(input_word)
            else:
                mapping = self.handle_single_item(input_word)

            if mapping:
                results.append(mapping)
                if stream_results:
                    return mapping

        self.save_to_csv(results)
        return results
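

# Minimal usage sketch (not part of the original module); assumes a PostgreSQL
# database reachable via psycopg2 and that the db/ helpers accept a standard
# DB-API connection. The DSN and input words below are placeholders.
if __name__ == "__main__":
    import psycopg2

    conn = psycopg2.connect("dbname=brightly user=postgres")  # placeholder DSN
    try:
        algo = Algo(conn, enable_csv=False)
        results = algo.match_words(["green beans", "rice, beans/oil", "paper towels"])
        for result in results:
            print(result['input_word'], '->', result['dictionary_word'])
    finally:
        conn.close()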