File size: 5,426 Bytes
9189e38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22ad617
 
 
 
 
9189e38
05bb441
9189e38
 
 
 
05bb441
 
 
 
 
9189e38
05bb441
 
 
 
 
 
 
 
 
 
 
 
64007dc
05bb441
 
64007dc
05bb441
 
 
 
64007dc
05bb441
 
 
 
 
 
 
 
 
64007dc
05bb441
 
 
64007dc
05bb441
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import os
import csv
import json
import time
import heapq
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from Levenshtein import distance
from tqdm import tqdm
from db.db_utils import get_connection, store_mapping_to_db, get_mapping_from_db
from ask_gpt import query_gpt

# For any unreviewed mappings, we ask ChatGPT to consider:
# 1. The similar_words list stored alongside the mapping
# 2. Similar words from the dictionary based on small Levenshtein distance
#
# ChatGPT should confirm that the current mapping is the best one. If not, it
# should provide the better mapping (restricted to words we supply).
# If it's a Non-Food Item, we should confirm that.
# If it's a homogeneous or heterogeneous mixture, we should confirm that.

load_dotenv()  # pull OPENAI_API_KEY (and any other settings) from a local .env file

api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# Each run writes its audit trail to a unique, timestamped CSV under ./audits
output_file_path = f'./audits/{int(time.time())}.csv'

def update_csv(results):
    """Rewrite the audit CSV with every row collected so far.

    Args:
        results: list of dicts with keys 'input_word',
            'original_dictionary_word' and, when the mapping was resolved,
            'new_dictionary_word' (missing keys become NaN in the CSV).

    Called after each processed mapping so progress survives a crash.
    """
    # pandas does not create parent directories; without this, the first
    # write fails with FileNotFoundError when ./audits is absent.
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    df_results = pd.DataFrame(results, columns=['input_word', 'original_dictionary_word', 'new_dictionary_word'])
    df_results.to_csv(output_file_path, index=False)

def find_close_levenshtein_words(input_word, dictionary, threshold=3):
    """Collect dictionary entries within `threshold` edit operations of input_word.

    Args:
        input_word: the word to compare against the dictionary.
        dictionary: iterable of candidate words.
        threshold: maximum Levenshtein distance (inclusive) to accept.

    Returns:
        A list of dictionary words whose distance to input_word is <= threshold.
    """
    nearby = []
    for candidate in dictionary:
        if distance(input_word, candidate) <= threshold:
            nearby.append(candidate)
    return nearby

def query_gpt(food_item, dictionary_word, similar_words):
    """Ask GPT to confirm the current mapping or pick a better word.

    NOTE(review): this definition shadows the `query_gpt` imported from
    `ask_gpt` at the top of the file — confirm which one is intended.

    Args:
        food_item: the raw input word whose mapping is under review.
        dictionary_word: the USDA description it is currently mapped to.
        similar_words: candidate descriptions GPT is allowed to choose from.

    Returns:
        The word GPT selected (the `guess` field of its JSON reply), or
        None if the reply could not be parsed.
    """
    # Candidates are presented one per line inside the prompt.
    line_separated_words = '\n'.join(similar_words)

    prompt = (
      f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping?

      Generally, you should prefer the mapped word, but if you believe there is a better fit, please provide it.

      I will also provide a list of other similar words that you could be a better fit.

      This is important: only return a word from the list of words I provide.

      If it's not a food item or pet food, return 'Non-Food Item'.

      You should respond in JSON format with an object that has the key `guess`, and the value is the most similar food item.

      The food item is: "{food_item}"
      It has been mapped to: "{dictionary_word}"

      Similar words:
      {line_separated_words}"""
    )

    # response_format forces the model to emit a JSON object, which
    # parse_response then decodes.
    completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        model="gpt-3.5-turbo-1106",
        response_format={"type": "json_object"},
    )
    response = completion.choices[0].message.content
    parsed = parse_response(response)
    print(f"Q: '{food_item}'")
    print(f"A: '{parsed}'")
    print()
    return parsed

def parse_response(response):
    """Extract the `guess` field from a GPT JSON reply.

    Args:
        response: JSON string of the form '{"guess": "..."}'. May be None —
            the OpenAI SDK's `message.content` is Optional.

    Returns:
        The guessed word, or None when the response is missing, is not
        valid JSON, or lacks the `guess` key.
    """
    try:
        result = json.loads(response)
        return result['guess']
    # TypeError covers response=None, which json.loads rejects and the
    # original two-exception clause let propagate as a crash.
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        print(f"Error parsing response: {response} - {e}")
        return None


db_conn = get_connection()
db_cursor = db_conn.cursor()

# Load the dictionary: the full set of valid USDA descriptions. GPT's answer
# is only accepted if it appears in this set.
db_cursor.execute("SELECT description FROM dictionary")
dictionary = db_cursor.fetchall()
dictionary = [item[0] for item in dictionary]  # flatten 1-tuples into plain strings

# select all mappings that have not been reviewed
db_cursor.execute("SELECT input_word, dictionary_word, similar_words FROM mappings WHERE reviewed = false")
results = db_cursor.fetchall()

# iterate through each row, grab the input_word and ask chatgpt to compare it to the dictionary_word

csv_data = []
for row in results:
    input_word = row[0]
    dictionary_word = row[1]
    # similar_words is stored as one pipe-delimited string in the DB
    similar_words = [item.strip() for item in row[2].split('|')]
    
    # find words from the dictionary list based on small levenstein distance between input_word and each word in the dictionary
    levenshtein_words = find_close_levenshtein_words(input_word, dictionary)
    print(f"Input: {input_word}")
    print(f" - dictionary_word: {dictionary_word}")
    print(f" - similar_words: {similar_words}")
    print(f" - levenshtein_words: {levenshtein_words}")

    # concatenate the similar_words and levenshtein_words
    all_words = similar_words + levenshtein_words
    all_words = list(set(all_words))  # remove duplicates
    response = query_gpt(input_word, dictionary_word, all_words)
    if response:
        new_row = {
            'input_word': input_word,
            'original_dictionary_word': dictionary_word,
        }
        # NOTE(review): `response in dictionary` is an O(n) list scan per row;
        # a set built once before the loop would be cheaper for big dictionaries.
        if response == dictionary_word and response in dictionary:
            # GPT agreed with the existing mapping: just mark it reviewed.
            print(f" - Mapping is correct")
            db_cursor.execute("UPDATE mappings SET reviewed = true WHERE input_word = %s", (input_word,))
            db_conn.commit()
            new_row['new_dictionary_word'] = response
        else:
            # We should update the mapping in the database
            # We should replace dictionary_word with response
            # We should set reviewed to 1
            # first confirm that the response is in the dictionary
            if response in dictionary:
                print(f" - Updating mapping with {response}")
                db_cursor.execute("UPDATE mappings SET dictionary_word = %s, reviewed = true WHERE input_word = %s", (response, input_word))
                db_conn.commit()
                new_row['new_dictionary_word'] = response
            else:
                # GPT returned a word outside the dictionary; the mapping stays
                # unreviewed so it is retried on the next run. The CSV row is
                # still appended (without 'new_dictionary_word', so it shows
                # blank in the audit file).
                print(f" - Response {response} is not in the dictionary")

        # rewrite the audit CSV after every row so progress survives a crash
        csv_data.append(new_row)
        update_csv(csv_data)

db_conn.close()