Spaces:
Paused
Paused
import os
import re
import time

import pandas as pd
from pluralizer import Pluralizer
from tqdm import tqdm

from ask_gpt import query_gpt
from db.db_utils import store_mapping_to_db, get_mapping_from_db
from food_nonfood import classify_as_food_nonfood, pessimistic_food_nonfood_score
from multi_food_item_detector import extract_food_phrases
from similarity_fast import SimilarityFast
from utils import clean_word
# Similarity score below which a fast match is considered weak; weak matches
# trigger the reverse-word-order and GPT fallbacks in Algo.perform_mapping.
similarity_threshold = 0.75
class Algo:
    """Map free-text item descriptions onto dictionary words.

    Each input goes through a cached DB lookup, a food / non-food
    classification, a fast similarity search and, for weak matches, a
    reverse-word-order retry and a GPT-assisted retry.  New mappings are
    written back to the database.
    """

    # Classifier score above which a non-food verdict is accepted outright,
    # without running any similarity search.
    _NON_FOOD_CUTOFF = 0.9

    # Matches "and" only as a whole word, so that "sandwich", "candy" or
    # "almond" are not mistaken for multi-item phrases.
    _CONJUNCTION_RE = re.compile(r'\band\b', re.IGNORECASE)

    def __init__(self, db_conn, enable_csv=False):
        """Create the matcher.

        Args:
            db_conn: Open database connection; a cursor is created from it
                and shared by all lookups and writes.
            enable_csv: When true, ``match_words`` also dumps its results
                to a timestamped CSV file under ``./results``.
        """
        self.db_conn = db_conn
        self.enable_csv = enable_csv
        self.db_cursor = db_conn.cursor()
        self.similarity_fast = SimilarityFast(self.db_cursor)
        # NOTE: the slow similarity matcher is currently disabled.
        self.pluralizer = Pluralizer()

    def save_to_csv(self, results):
        """Write ``results`` to ``./results/<unix-timestamp>.csv``.

        Does nothing unless CSV output was enabled at construction time.

        Args:
            results: Iterable of mapping dicts as produced by
                ``handle_single_item``.
        """
        if not self.enable_csv:
            return

        output_dir = './results'
        # to_csv() fails if the target directory is missing, so create it.
        os.makedirs(output_dir, exist_ok=True)
        output_file_path = f'{output_dir}/{int(time.time())}.csv'

        df_results = pd.DataFrame(results, columns=[
            'input_word', 'cleaned_word', 'matching_word',
            'dictionary_word', 'similarity_score', 'confidence_score',
            'similar_words', 'is_food', 'food_nonfood_score',
        ])
        df_results.to_csv(output_file_path, index=False)

    def _apply_gpt_recommendation(self, input_word, mapping):
        """Ask GPT about ``input_word`` and return the resulting mapping.

        Returns ``mapping`` (possibly updated in place) or, when GPT's
        suggestion matches the dictionary better, the mapping found for
        that suggestion.  Exceptions propagate to the caller.
        """
        gpt_recommended_word = query_gpt(input_word)
        if not gpt_recommended_word:
            return mapping

        if gpt_recommended_word == 'Non-Food Item':
            # NOTE(review): handle_single_item recomputes 'is_food' and
            # 'food_nonfood_score' after perform_mapping returns, so the two
            # values set here are overwritten on that path -- confirm intent.
            mapping.update({
                'similarity_score': 1.0,
                'confidence_score': 1.0,
                'is_food': False,
                'food_nonfood_score': 1.0,
            })
            return mapping

        if gpt_recommended_word == 'Mixed Food Items':
            mapping.update({
                'matching_word': 'Mixed Food Items',
                'dictionary_word': 'Mixed Food Items',
                'similarity_score': 1.0,
                'confidence_score': 1.0,
            })
            return mapping

        # GPT proposed a concrete word: keep it only if it matches better.
        gpt_mapping = self.similarity_fast.find_most_similar_word(gpt_recommended_word)
        if gpt_mapping['similarity_score'] > mapping['similarity_score']:
            gpt_mapping.update({
                'input_word': input_word,
                'cleaned_word': mapping['cleaned_word'],
            })
            return gpt_mapping
        return mapping

    def perform_mapping(self, input_word, attempts=0):
        """Find the best dictionary match for ``input_word``.

        Starts with the fast similarity search.  If the score is below
        ``similarity_threshold`` and the input has several words, the words
        are retried in reverse order.  If the score is still below the
        threshold and the cleaned word is not contained in the matched
        word, GPT is consulted as a last resort.

        Args:
            input_word: Raw item description.
            attempts: Currently unused; kept for backward compatibility.

        Returns:
            The mapping dict with the highest similarity score found.
        """
        mapping = self.similarity_fast.find_most_similar_word(input_word)

        words = input_word.split(' ')
        if mapping['similarity_score'] < similarity_threshold and len(words) > 1:
            print(" - Attempting reverse mapping")
            reversed_input_word = ' '.join(reversed(words))
            reversed_mapping = self.similarity_fast.find_most_similar_word(reversed_input_word)
            if reversed_mapping['similarity_score'] > mapping['similarity_score']:
                # Report the original input, not the reversed search string.
                reversed_mapping.update({
                    'input_word': input_word,
                    'cleaned_word': mapping['cleaned_word'],
                })
                mapping = reversed_mapping

        # A cleaned word contained in the matched word counts as good enough
        # to skip the GPT round-trip.
        is_substring = mapping['cleaned_word'] in mapping['matching_word']
        if mapping['similarity_score'] < similarity_threshold and not is_substring:
            print(" - Attempting GPT mapping")
            try:
                mapping = self._apply_gpt_recommendation(input_word, mapping)
            except Exception as e:
                # Best effort: a GPT failure must not abort the mapping run.
                print(f" - Error querying GPT: {e}")

        return mapping

    def handle_multi_item(self, input_word):
        """Map every food phrase found in a comma- or slash-separated input.

        The input is split by ``extract_food_phrases`` and each part is run
        through ``handle_single_item`` (which looks up and stores mappings
        in the database).

        Returns:
            Always ``None`` for now, so callers get no combined result.
        """
        input_word_parts = extract_food_phrases(input_word)

        mappings = []
        for part in input_word_parts:
            mappings.append(self.handle_single_item(part))

        # TODO categorize the whole mapping list as homogenous, heterogenous, or non-food item
        return None

    def handle_single_item(self, input_word):
        """Return the mapping for a single item, computing it if needed.

        Lookup order: cached mapping for the singular form, cached mapping
        for the plural form, confident non-food classification, and finally
        ``perform_mapping``.  Newly computed mappings are stored in the
        database.

        Args:
            input_word: Raw item description.

        Returns:
            The mapping dict for ``input_word``.
        """
        input_word_clean = clean_word(input_word)

        # Try the singular form (count 1) first, then the plural (count 2).
        for count in (1, 2):
            candidate = self.pluralizer.pluralize(input_word_clean, count)
            mapping = get_mapping_from_db(self.db_cursor, candidate)
            if mapping:
                print(f" - Found mapping in db: {mapping}")
                return mapping

        food_nonfood = classify_as_food_nonfood(input_word)

        # If we're very confident that the word is non-food, don't map it.
        # Equality (not truthiness) is kept so a non-bool falsy label is not
        # treated as a non-food verdict.
        # NOTE(review): use `not` if the classifier always returns a bool.
        if food_nonfood[1] > self._NON_FOOD_CUTOFF and food_nonfood[0] == False:  # noqa: E712
            mapping = {
                'input_word': input_word,
                'cleaned_word': input_word_clean,
                'matching_word': 'Non-Food Item',
                'dictionary_word': 'Non-Food Item',
                'similarity_score': None,
                'confidence_score': None,
                'similar_words': None,
                'is_food': False,
                'food_nonfood_score': food_nonfood[1],
            }
            store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
            return mapping

        mapping = self.perform_mapping(input_word)

        food_nonfood_pessimistic = pessimistic_food_nonfood_score(
            food_nonfood, mapping['similarity_score']
        )
        mapping.update({
            'is_food': food_nonfood_pessimistic[0],
            'food_nonfood_score': food_nonfood_pessimistic[1],
        })

        print(f" - Storing new mapping to db: {mapping}")
        store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
        return mapping

    @classmethod
    def _is_compound_phrase(cls, input_word):
        """Return True if ``input_word`` joins items with '&' or the word 'and'."""
        return '&' in input_word or cls._CONJUNCTION_RE.search(input_word) is not None

    def match_words(self, input_words, stream_results=False):
        """Map a batch of input words.

        Non-string, empty and "nan" entries are ignored.  Phrases joined
        with "&" or the word "and" are skipped.  Inputs containing "," or
        "/" go through ``handle_multi_item``; everything else goes through
        ``handle_single_item``.

        Args:
            input_words: Iterable of raw item descriptions.
            stream_results: When true, return the first non-empty mapping
                immediately instead of the full list; no CSV is written.
                NOTE(review): only one mapping is ever returned in this
                mode -- confirm whether a generator was intended.

        Returns:
            A list of mapping dicts, or a single mapping dict when
            ``stream_results`` is true and at least one mapping was found.
        """
        results = []

        for input_word in tqdm(input_words, desc="Processing input words"):
            if not isinstance(input_word, str) or not input_word or input_word.lower() == "nan":
                continue

            print()
            print(f"Processing: {input_word}")

            if self._is_compound_phrase(input_word):
                print(" - Skipping multi-item word")
                continue

            # Delimited lists are processed part by part; handle_multi_item
            # currently returns None, so they add nothing to the results.
            if ',' in input_word or '/' in input_word:
                mapping = self.handle_multi_item(input_word)
            else:
                mapping = self.handle_single_item(input_word)

            if mapping:
                results.append(mapping)
                if stream_results:
                    return mapping

        self.save_to_csv(results)
        return results