import os import csv import json import time import heapq import pandas as pd from openai import OpenAI from dotenv import load_dotenv from Levenshtein import distance from tqdm import tqdm from db.db_utils import get_connection, store_mapping_to_db, get_mapping_from_db from ask_gpt import query_gpt # For any unreviewed mappings, we ask chatgpt to consider: # 1. The similar_words list # 2. Similar words from the dictionary based on small levenstein distance # ChatGPT should confirm that the current mapping is the best one. If not, they should provide the better mapping. # If its a Non-Food Item, we should confirm that # If it's a homogenous or hetergeneous mixture, we should confirm that load_dotenv() api_key = os.getenv("OPENAI_API_KEY") client = OpenAI(api_key=api_key) output_file_path = f'./audits/{int(time.time())}.csv' def update_csv(results): df_results = pd.DataFrame(results, columns=['input_word', 'original_dictionary_word', 'new_dictionary_word',]) df_results.to_csv(output_file_path, index=False) def find_close_levenshtein_words(input_word, dictionary, threshold=7): # Calculate Levenshtein distances for each word in the dictionary close_words = [word for word in dictionary if distance(input_word, word) <= threshold] return close_words def query_gpt(food_item, dictionary_word, similar_words): line_separated_words = '\n'.join(similar_words) prompt = ( f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping? Generally, you should prefer the mapped word, but if you believe there is a better fit, please provide it. I will also provide a list of other similar words that you could be a better fit. This is important: only return a word from the list of words I provide. If it's not a food item or pet food, return 'Non-Food Item'. You should respond in JSON format with an object that has the key `guess`, and the value is the most similar food item. The food item is: "{food_item}" It has been mapped to: "{dictionary_word}" Similar words: {line_separated_words}""" ) completion = client.chat.completions.create( messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt} ], model="gpt-3.5-turbo-1106", response_format={"type": "json_object"}, ) response = completion.choices[0].message.content parsed = parse_response(response) print(f"Q: '{food_item}'") print(f"A: '{parsed}'") return parsed # Define the function to parse the GPT response def parse_response(response): try: result = json.loads(response) return result['guess'] except (json.JSONDecodeError, KeyError) as e: print(f"Error parsing response: {response} - {e}") return None db_conn = get_connection() db_cursor = db_conn.cursor() # Load the dictionary db_cursor.execute("SELECT description FROM dictionary") dictionary = db_cursor.fetchall() dictionary = [item[0] for item in dictionary] # select all mappings that have not been reviewed db_cursor.execute("SELECT input_word, dictionary_word, similar_words FROM mappings WHERE reviewed = false") results = db_cursor.fetchall() # iterate through each row, grab the input_word and ask chatgpt to compare it to the dictionary_word csv_data = [] for row in results: input_word = row[0] dictionary_word = row[1] similar_words = [item.strip() for item in row[2].split('|')] if row[2] else [] # find words from the dictionary list based on small levenstein distance between input_word and each word in the dictionary levenshtein_words = find_close_levenshtein_words(input_word, dictionary) print() print(f"Input: {input_word}") print(f" - dictionary_word: {dictionary_word}") print(f" - similar_words: {similar_words}") print(f" - levenshtein_words: {levenshtein_words}") # concatenate the similar_words and levenshtein_words all_words = similar_words + levenshtein_words all_words = list(set(all_words)) # remove duplicates response = query_gpt(input_word, dictionary_word, all_words) if response: new_row = None if response == dictionary_word and response in dictionary: print(f" - Mapping is correct") db_cursor.execute("UPDATE mappings SET reviewed = true WHERE input_word = %s", (input_word,)) db_conn.commit() else: # We should update the mapping in the database # We should replace dictionary_word with response # We should set reviewed to 1 # first confirm that the response is in the dictionary if response in dictionary: # prompt the user if we want to make the change or not print("------") print("Updating mapping to the following:") print(f" - input_word: {input_word}") print(f" - original_dictionary_word: {dictionary_word}") print(f" - new_dictionary_word: {response}") print("------") confirm = input("Press 'y' to confirm the update, any other key to skip: ") if confirm.lower() == 'y': if response == 'Non-Food Item': sql = "UPDATE mappings SET dictionary_word = %s, is_food = FALSE, reviewed = true WHERE input_word = %s" else: sql = "UPDATE mappings SET dictionary_word = %s, reviewed = true WHERE input_word = %s" print(f" - Updating mapping with {response}") db_cursor.execute(sql, (response, input_word)) db_conn.commit() new_row = { 'input_word': input_word, 'original_dictionary_word': dictionary_word, 'new_dictionary_word': response } else: db_cursor.execute("UPDATE mappings SET reviewed = true WHERE input_word = %s", (input_word,)) db_conn.commit() else: print(f" - Response {response} is not in the dictionary") if new_row: csv_data.append(new_row) update_csv(csv_data) db_conn.close()