import re import os import csv import json import time import heapq import pandas as pd from openai import OpenAI from dotenv import load_dotenv from Levenshtein import distance from tqdm import tqdm from db.db_utils import get_connection from ask_gpt import query_gpt from utils import generate_embedding, cosine_similarity from sentence_transformers import SentenceTransformer, util # For any unreviewed mappings, we ask chatgpt to consider: # 1. The similar_words list # 2. Similar words from the dictionary based on small levenstein distance # ChatGPT should confirm that the current mapping is the best one. If not, they should provide the better mapping. # If its a Non-Food Item, we should confirm that # If it's a homogenous or hetergeneous mixture, we should confirm that load_dotenv() api_key = os.getenv("OPENAI_API_KEY") client = OpenAI(api_key=api_key) output_file_path = f'./audits/{int(time.time())}.csv' model_name = 'sentence-transformers/all-mpnet-base-v2' model = SentenceTransformer(model_name) def reverse_string(s): # remove all commas, and reverse the words in the string s = s.replace(',', '') return ' '.join(s.split()[::-1]) def remove_misc_words(text): text = text.replace('raw', '').replace('fresh', '').replace('canned', '').replace('prepared', '').replace('with', '').replace('and', '').replace('or', '').replace('in', '').replace('NFS', '').strip() text = re.sub(r'\(.*?\)', '', text) return text def compare_embeddings(old_dictionary_word, new_dictionary_word): old_dictionary_word = remove_misc_words(old_dictionary_word) new_dictionary_word = remove_misc_words(new_dictionary_word) old_embedding = generate_embedding(model, old_dictionary_word) new_embedding = generate_embedding(model, new_dictionary_word) cosine_similarity_score = cosine_similarity(old_embedding, new_embedding) return cosine_similarity_score def update_csv(results): df_results = pd.DataFrame(results, columns=['input_word', 'original_dictionary_word', 'new_dictionary_word',]) df_results.to_csv(output_file_path, index=False) def find_close_levenshtein_words(input_word, dictionary, threshold=7): # Calculate Levenshtein distances for each word in the dictionary close_words = [word for word in dictionary if distance(input_word, word) <= threshold] return close_words def query_gpt(food_item, dictionary_word, similar_words): line_separated_words = '\n'.join(similar_words) prompt = ( f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping? Generally, you should prefer the mapped word, but if you believe there is a better fit from provided list of similar words, please specify it. This is important: only return a word from the list of words I provide. If it's not a food item or pet food, return 'Non-Food Item'. You should respond in JSON format with an object that has the key `guess`, and the value is the most similar food item. The food item is: "{food_item}" It has been mapped to: "{dictionary_word}" Similar words: {line_separated_words}""" ) completion = client.chat.completions.create( messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt} ], model="gpt-3.5-turbo-1106", response_format={"type": "json_object"}, ) response = completion.choices[0].message.content parsed = parse_response(response) print(f"Q: '{food_item}'") print(f"A: '{parsed}'") return parsed # Define the function to parse the GPT response def parse_response(response): try: result = json.loads(response) return result['guess'] except (json.JSONDecodeError, KeyError) as e: print(f"Error parsing response: {response} - {e}") return None db_conn = get_connection() db_cursor = db_conn.cursor() # Load the dictionary db_cursor.execute("SELECT description FROM dictionary") dictionary = db_cursor.fetchall() dictionary = [item[0] for item in dictionary] # select all mappings that have not been reviewed db_cursor.execute("SELECT input_word, dictionary_word, similar_words, is_food FROM mappings WHERE gpt_reviewed = false and is_food = true and dictionary_word != 'Heterogeneous Mixture'") results = db_cursor.fetchall() # iterate through each row, grab the input_word and ask chatgpt to compare it to the dictionary_word csv_data = [] for row in results: input_word = row[0] dictionary_word = row[1] similar_words = [item.strip() for item in row[2].split('|')] if row[2] else [] is_food = row[3] # find words from the dictionary list based on small levenstein distance between input_word and each word in the dictionary levenshtein_words = find_close_levenshtein_words(input_word, dictionary) print() print(f"Input: {input_word}") print(f" - dictionary_word: {dictionary_word}") print(f" - similar_words: {similar_words}") print(f" - levenshtein_words: {levenshtein_words}") # concatenate the similar_words and levenshtein_words all_words = similar_words + levenshtein_words all_words = list(set(all_words)) # remove duplicates response = query_gpt(input_word, dictionary_word, all_words) if response: new_row = None if response == dictionary_word and response in dictionary: print(f" - Mapping is correct") db_cursor.execute("UPDATE mappings SET gpt_reviewed = true WHERE input_word = %s", (input_word,)) db_conn.commit() else: # We should update the mapping in the database # We should replace dictionary_word with response # We should set reviewed to 1 # first confirm that the response is in the dictionary if response in dictionary: # If the response is similar to the original dictionary word, lets just skip it compare_score1 = compare_embeddings(dictionary_word, response) compare_score2 = compare_embeddings(reverse_string(dictionary_word), reverse_string(response)) if compare_score1 > 0.8 or compare_score2 > 0.8: print(f" - Mapping is already similar to the dictionary word") db_cursor.execute("UPDATE mappings SET gpt_reviewed = true WHERE input_word = %s", (input_word,)) db_conn.commit() continue else: print(f" - Mapping is not similar to the dictionary word") print(f" - Cosine Similarity1: {compare_score1}") print(f" - Cosine Similarity2: {compare_score2}") # prompt the user if we want to make the change or not print("------") print("Updating mapping to the following:") print(f" - input_word: {input_word}") print(f" - original_dictionary_word: {dictionary_word}") print(f" - new_dictionary_word: {response}") print("------") if response == 'Non-Food Item' and is_food == False: # We can skip this one because it's already marked as a Non-Food Item print(" - Mapping is already marked as a Non-Food Item") confirm = 'y' else: confirm = input("Press 'y' to confirm, 'i' to ignore, 'd' to delete, 'm' for mixture, any other key to skip: ") if confirm.lower() == 'y': if response == 'Non-Food Item': sql = "UPDATE mappings SET dictionary_word = %s, is_food = FALSE, description = 'Non-Food Item', gpt_reviewed = true, reviewed = true WHERE input_word = %s" else: sql = "UPDATE mappings SET dictionary_word = %s, gpt_reviewed = true, reviewed = true, is_food = true WHERE input_word = %s" print(f" - Updating mapping with {response}") db_cursor.execute(sql, (response, input_word)) db_conn.commit() new_row = { 'input_word': input_word, 'original_dictionary_word': dictionary_word, 'new_dictionary_word': response } elif confirm.lower() == 'i': print(f" - Ignoring mapping") sql = "UPDATE mappings SET ignore = true, gpt_reviewed = true, reviewed = true WHERE input_word = %s" db_cursor.execute(sql, (input_word,)) db_conn.commit() elif confirm.lower() == 'd': print(f" - Deleting mapping") sql = "DELETE FROM mappings WHERE input_word = %s" db_cursor.execute(sql, (input_word,)) db_conn.commit() elif confirm.lower() == 'm': print(f" - Heterogeneous Mixture") sql = "UPDATE mappings SET gpt_reviewed = true, reviewed = true, dictionary_word = 'Heterogeneous Mixture', is_food = true WHERE input_word = %s" db_cursor.execute(sql, (input_word,)) db_conn.commit() else: db_cursor.execute("UPDATE mappings SET gpt_reviewed = true, reviewed = true WHERE input_word = %s", (input_word,)) db_conn.commit() else: print(f" - Response {response} is not in the dictionary") if new_row: csv_data.append(new_row) update_csv(csv_data) db_conn.close()