import time

from tqdm import tqdm
import pandas as pd
from pluralizer import Pluralizer

from similarity_fast import SimilarityFast
from food_nonfood import classify_as_food_nonfood, pessimistic_food_nonfood_score
from utils import clean_word
from db.db_utils import store_mapping_to_db, get_mapping_from_db, get_dictionary_data_from_db
from ask_gpt import query_gpt
from multi_food_item_detector import extract_items

similarity_threshold = 0.75


class Algo:
    def __init__(self, db_conn, enable_csv=False):
        self.db_conn = db_conn
        self.enable_csv = enable_csv
        self.db_cursor = db_conn.cursor()
        self.similarity_fast = SimilarityFast(self.db_cursor)
        # self.similarity_slow = SimilaritySlow(self.db_cursor, self.db_conn)
        self.pluralizer = Pluralizer()

    def save_to_csv(self, results):
        if not self.enable_csv:
            return

        output_file_path = f'./results/{int(time.time())}.csv'
        df_results = pd.DataFrame(results, columns=[
            'input_word', 'cleaned_word', 'matching_word', 'dictionary_word',
            'similarity_score', 'confidence_score', 'similar_words',
            'is_food', 'food_nonfood_score'
        ])
        df_results.to_csv(output_file_path, index=False)

    def perform_mapping(self, input_word, attempts=0):
        mapping = self.similarity_fast.find_most_similar_word(input_word)

        # skip slow mapping for now
        # if mapping['similarity_score'] < similarity_threshold:
        #     print("Attempting slow mapping")
        #     slow_mapping = self.similarity_slow.find_most_similar_word(input_word)
        #     print(f" - Slow: {slow_mapping}")
        #     if slow_mapping['similarity_score'] > mapping['similarity_score']:
        #         mapping = slow_mapping

        # if mapping['similarity_score'] < similarity_threshold and len(input_word.split(' ')) > 1:
        #     print(" - Attempting reverse mapping")
        #     reversed_input_word = ' '.join(input_word.split(' ')[::-1])
        #     reversed_mapping = self.similarity_fast.find_most_similar_word(reversed_input_word)
        #     if reversed_mapping['similarity_score'] > mapping['similarity_score']:
        #         reversed_mapping.update(
        #             {
        #                 'input_word': input_word,
        #                 'cleaned_word': mapping['cleaned_word']
        #             }
        #         )
        #         mapping = reversed_mapping

        # check if the cleaned_word is a substring of the matching_word
        is_substring = mapping['cleaned_word'] in mapping['matching_word']

        if mapping['similarity_score'] < similarity_threshold and not is_substring:
            print(" - Attempting GPT mapping")
            try:
                gpt_recommended_word = query_gpt(input_word)
                if gpt_recommended_word:
                    if gpt_recommended_word == 'Non-Food Item':
                        mapping.update({
                            'similarity_score': 1.0,
                            'confidence_score': 1.0,
                            'is_food': False,
                            'food_nonfood_score': 1.0
                        })
                        return mapping
                    elif gpt_recommended_word == 'Mixed Food Items':
                        mapping.update({
                            'matching_word': 'Mixed Food Items',
                            'dictionary_word': 'Mixed Food Items',
                            'similarity_score': 1.0,
                            'confidence_score': 1.0
                        })
                        return mapping
                    else:
                        gpt_mapping = self.similarity_fast.find_most_similar_word(gpt_recommended_word)
                        if gpt_mapping['similarity_score'] > mapping['similarity_score']:
                            gpt_mapping.update({
                                'input_word': input_word,
                                'cleaned_word': mapping['cleaned_word']
                            })
                            mapping = gpt_mapping
            except Exception as e:
                print(f" - Error querying GPT: {e}")

        return mapping

    def handle_multi_item(self, input_word):
        # The input word has a comma or a slash in it.
        # If it has more commas, it's comma-delimited; if it has more slashes,
        # it's slash-delimited; with equal counts we go with slashes.
        input_word_parts = extract_items(input_word)
        print(f" - Extracted items: {input_word_parts}")

        mappings = []
        for part in input_word_parts:
            mapping = self.handle_single_item(part)
            mappings.append(mapping)
        # look up the dictionary values for each mapping and check the wweia category;
        # if is_food is False for any mapping, return "Non-Food Item" as the dictionary word
        for mapping in mappings:
            if mapping['is_food'] == False:
                return {
                    'input_word': input_word,
                    'cleaned_word': mapping['cleaned_word'],
                    'matching_word': 'Non-Food Item',
                    'dictionary_word': 'Non-Food Item',
                    'similarity_score': None,
                    'confidence_score': None,
                    'similar_words': None,
                    'is_food': False,
                    'food_nonfood_score': 1.0
                }

        dictionary_words = [mapping['dictionary_word'] for mapping in mappings]
        if len(set(dictionary_words)) == 0:
            return {
                'input_word': input_word,
                'cleaned_word': None,
                'matching_word': None,
                'dictionary_word': None,
                'similarity_score': None,
                'confidence_score': None,
                'similar_words': None,
                'is_food': None,
                'food_nonfood_score': None
            }

        # check if any mapping falls in the "Heterogeneous Mixture" wweia category;
        # otherwise find the mapping with the lowest DMC value and return that as the
        # dictionary word, along with its dmc, wc, and leakage values
        heterogeneous_exists = False
        most_conservative_mapping = None
        for mapping in mappings:
            if mapping['wweia_category'] == "Heterogeneous Mixture":
                heterogeneous_exists = True
                break
            if most_conservative_mapping is None or mapping['dry_matter_content'] < most_conservative_mapping['dry_matter_content']:
                most_conservative_mapping = mapping

        if heterogeneous_exists:
            mixture_data = {
                'matching_word': 'Heterogeneous Mixture',
                'dictionary_word': 'Heterogeneous Mixture',
                'wweia_category': 'Heterogeneous Mixture',
                'dry_matter_content': 0.27,
                'water_content': 0.73,
                'leakage': 0.1
            }
        else:
            dictionary_word = f"{most_conservative_mapping['dictionary_word']} (Lowest DMC)"
            mixture_data = {
                'matching_word': dictionary_word,
                'dictionary_word': dictionary_word,
                'wweia_category': most_conservative_mapping['wweia_category'],
                'dry_matter_content': most_conservative_mapping['dry_matter_content'],
                'water_content': most_conservative_mapping['water_content'],
                'leakage': most_conservative_mapping['leakage']
            }

        return {
            'input_word': input_word,
            'cleaned_word': None,
            'matching_word': mixture_data['matching_word'],
            'dictionary_word': mixture_data['dictionary_word'],
            'similarity_score': None,
            'confidence_score': None,
            'similar_words': None,
            'is_food': True,
            'food_nonfood_score': 1.0,
            'wweia_category': mixture_data['wweia_category'],
            'water_content': mixture_data['water_content'],
            'dry_matter_content': mixture_data['dry_matter_content'],
            'leakage': mixture_data['leakage']
        }

    def handle_single_item(self, input_word):
        input_word_clean = clean_word(input_word)

        # try the singular form of the word
        singular = self.pluralizer.pluralize(input_word_clean, 1)
        mapping = get_mapping_from_db(self.db_cursor, singular)
        if mapping:
            print(f" - Found mapping in db: {mapping}")
            return self.wrap_mapping_with_dictionary_data(mapping)

        # try the plural form of the word
        plural = self.pluralizer.pluralize(input_word_clean, 2)
        mapping = get_mapping_from_db(self.db_cursor, plural)
        if mapping:
            print(f" - Found mapping in db: {mapping}")
            return self.wrap_mapping_with_dictionary_data(mapping)

        food_nonfood = classify_as_food_nonfood(input_word)

        # if we're very confident the word is non-food, skip the mapping step entirely
        if food_nonfood[1] > 0.9 and food_nonfood[0] == False:
            mapping = {
                'input_word': input_word,
                'cleaned_word': input_word_clean,
                'matching_word': 'Non-Food Item',
                'dictionary_word': 'Non-Food Item',
                'similarity_score': None,
                'confidence_score': None,
                'similar_words': None,
                'is_food': False,
                'food_nonfood_score': food_nonfood[1]
            }
            store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
            return self.wrap_mapping_with_dictionary_data(mapping)
        mapping = self.perform_mapping(input_word)

        food_nonfood_pessimistic = pessimistic_food_nonfood_score(food_nonfood, mapping['similarity_score'])

        mapping.update({
            'is_food': food_nonfood_pessimistic[0],
            'food_nonfood_score': food_nonfood_pessimistic[1]
        })

        print(f" - Storing new mapping to db: {mapping}")
        store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
        return self.wrap_mapping_with_dictionary_data(mapping)

    def wrap_mapping_with_dictionary_data(self, mapping):
        if not mapping:
            return None

        dictionary_result = get_dictionary_data_from_db(self.db_cursor, mapping['dictionary_word'])
        mapping.update({
            'wweia_category': dictionary_result['wweia_category'] if dictionary_result else None,
            'water_content': dictionary_result['water_content'] if dictionary_result else None,
            'dry_matter_content': dictionary_result['dry_matter_content'] if dictionary_result else None,
            'leakage': dictionary_result['leakage'] if dictionary_result else None
        })
        return mapping

    def match_words(self, input_words, stream_results=False):
        results = []
        for input_word in tqdm(input_words, desc="Processing input words"):
            if not isinstance(input_word, str) or pd.isna(input_word) or input_word == "" or input_word.lower() == "nan":
                continue

            print()
            print(f"Processing: {input_word}")

            if ',' in input_word or '/' in input_word:
                mapping = self.handle_multi_item(input_word)
            else:
                mapping = self.handle_single_item(input_word)

            if mapping:
                results.append(mapping)
                if stream_results:
                    return mapping

        self.save_to_csv(results)
        return results
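
# Illustrative usage sketch (not part of the pipeline above). It assumes the
# backing store is a SQLite file named 'food_mappings.db' that already contains
# the tables expected by db.db_utils and similarity_fast; both the backend and
# the filename are assumptions for this example, not something this module defines.
if __name__ == '__main__':
    import sqlite3

    conn = sqlite3.connect('food_mappings.db')
    try:
        algo = Algo(conn, enable_csv=True)
        matches = algo.match_words(['apples', 'canned soup, dry rice'])
        for match in matches:
            print(match['input_word'], '->', match['dictionary_word'])
    finally:
        conn.close()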