import os
import json
import time
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from Levenshtein import distance
from tqdm import tqdm
from db.db_utils import get_connection

# For any unreviewed mappings, we ask ChatGPT to consider:
# 1. The similar_words list
# 2. Similar words from the dictionary with a small Levenshtein distance
# ChatGPT should confirm that the current mapping is the best one. If not, it should provide the better mapping.
# If it's a Non-Food Item, we should confirm that.
# If it's a homogeneous or heterogeneous mixture, we should confirm that.

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

output_file_path = f'./audits/{int(time.time())}.csv'


def update_csv(results):
    # Write the audit trail (original vs. new mapping) to a timestamped CSV
    df_results = pd.DataFrame(results, columns=['input_word', 'original_dictionary_word', 'new_dictionary_word'])
    df_results.to_csv(output_file_path, index=False)


def find_close_levenshtein_words(input_word, dictionary, threshold=3):
    # Collect dictionary words within `threshold` Levenshtein distance of the input word
    return [word for word in dictionary if distance(input_word, word) <= threshold]


def query_gpt(food_item, dictionary_word, similar_words):
    line_separated_words = '\n'.join(similar_words)
    prompt = (
        f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping? Generally, you should prefer the mapped word, but if you believe there is a better fit, please provide it. I will also provide a list of other similar words that could be a better fit. This is important: only return a word from the list of words I provide. If it's not a food item or pet food, return 'Non-Food Item'. You should respond in JSON format with an object that has the key `guess`, and the value is the most similar food item.
The food item is: "{food_item}"
It has been mapped to: "{dictionary_word}"
Similar words:
{line_separated_words}"""
    )

    completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        model="gpt-3.5-turbo-1106",
        response_format={"type": "json_object"},
    )

    response = completion.choices[0].message.content
    parsed = parse_response(response)
    print(f"Q: '{food_item}'")
    print(f"A: '{parsed}'")
    print()
    return parsed


# Parse the GPT response and extract the `guess` value
def parse_response(response):
    try:
        result = json.loads(response)
        return result['guess']
    except (json.JSONDecodeError, KeyError) as e:
        print(f"Error parsing response: {response} - {e}")
        return None


db_conn = get_connection()
db_cursor = db_conn.cursor()

# Load the dictionary
db_cursor.execute("SELECT description FROM dictionary")
dictionary = [item[0] for item in db_cursor.fetchall()]

# Select all mappings that have not been reviewed
db_cursor.execute("SELECT input_word, dictionary_word, similar_words FROM mappings WHERE reviewed = 0")
results = db_cursor.fetchall()

# Sanity check: ensure the dictionary loaded as expected before we start
assert "Soft drink, NFS" in dictionary, "Dictionary did not load correctly"

# Iterate through each unreviewed row and ask ChatGPT to compare the input_word to the dictionary_word
csv_data = []
for row in tqdm(results):
    input_word = row[0]
    dictionary_word = row[1]
    similar_words = [item.strip() for item in row[2].split('|')]

    # Find dictionary words with a small Levenshtein distance from the input_word
    levenshtein_words = find_close_levenshtein_words(input_word, dictionary)

    print(f"Input: {input_word}")
    print(f" - dictionary_word: {dictionary_word}")
    print(f" - similar_words: {similar_words}")
    print(f" - levenshtein_words: {levenshtein_words}")

    # Concatenate the similar_words and levenshtein_words, removing duplicates
    all_words = list(set(similar_words + levenshtein_words))

    response = query_gpt(input_word, dictionary_word, all_words)
    if response:
        csv_data.append({
            'input_word': input_word,
            'original_dictionary_word': dictionary_word,
            'new_dictionary_word': response
        })

        if response == dictionary_word and response in dictionary:
            # The existing mapping is confirmed, so just mark it reviewed
            print(" - Mapping is correct")
            db_cursor.execute("UPDATE mappings SET reviewed = 1 WHERE input_word = ?", (input_word,))
            db_conn.commit()
        elif response in dictionary:
            # Replace dictionary_word with the suggested mapping and mark it reviewed
            print(f" - Updating mapping with {response}")
            db_cursor.execute("UPDATE mappings SET dictionary_word = ?, reviewed = 1 WHERE input_word = ?", (response, input_word))
            db_conn.commit()
        else:
            print(f" - Response {response} is not in the dictionary")

update_csv(csv_data)
db_conn.close()