Spaces:

madebybread
/

brightly-ai

Paused

App Files Files Community

brightly-ai / chatgpt_audit.py

beweinreich

skip anything non-csv

ecbcfc4 24 days ago

raw

history blame

No virus

7.67 kB

	import os
	import csv
	import json
	import time
	import heapq
	import pandas as pd
	from openai import OpenAI
	from dotenv import load_dotenv
	from Levenshtein import distance
	from tqdm import tqdm
	from db.db_utils import get_connection
	from ask_gpt import query_gpt

	# For any unreviewed mappings, we ask chatgpt to consider:
	# 1. The similar_words list
	# 2. Similar words from the dictionary based on small levenstein distance

	# ChatGPT should confirm that the current mapping is the best one. If not, they should provide the better mapping.
	# If its a Non-Food Item, we should confirm that
	# If it's a homogenous or hetergeneous mixture, we should confirm that

	load_dotenv()

	api_key = os.getenv("OPENAI_API_KEY")
	client = OpenAI(api_key=api_key)

	output_file_path = f'./audits/{int(time.time())}.csv'

	def update_csv(results):
	df_results = pd.DataFrame(results, columns=['input_word', 'original_dictionary_word', 'new_dictionary_word',])
	df_results.to_csv(output_file_path, index=False)

	def find_close_levenshtein_words(input_word, dictionary, threshold=7):
	# Calculate Levenshtein distances for each word in the dictionary
	close_words = [word for word in dictionary if distance(input_word, word) <= threshold]
	return close_words

	def query_gpt(food_item, dictionary_word, similar_words):
	line_separated_words = '\n'.join(similar_words)

	prompt = (
	f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping?

	Generally, you should prefer the mapped word, but if you believe there is a better fit, please provide it.

	I will also provide a list of other similar words that you could be a better fit.

	This is important: only return a word from the list of words I provide.

	If it's not a food item or pet food, return 'Non-Food Item'.

	You should respond in JSON format with an object that has the key `guess`, and the value is the most similar food item.

	The food item is: "{food_item}"
	It has been mapped to: "{dictionary_word}"

	Similar words:
	{line_separated_words}"""
	)

	completion = client.chat.completions.create(
	messages=[
	{"role": "system", "content": "You are a helpful assistant."},
	{"role": "user", "content": prompt}
	],
	model="gpt-3.5-turbo-1106",
	response_format={"type": "json_object"},
	)
	response = completion.choices[0].message.content
	parsed = parse_response(response)
	print(f"Q: '{food_item}'")
	print(f"A: '{parsed}'")
	return parsed

	# Define the function to parse the GPT response
	def parse_response(response):
	try:
	result = json.loads(response)
	return result['guess']
	except (json.JSONDecodeError, KeyError) as e:
	print(f"Error parsing response: {response} - {e}")
	return None


	db_conn = get_connection()
	db_cursor = db_conn.cursor()

	# Load the dictionary
	db_cursor.execute("SELECT description FROM dictionary")
	dictionary = db_cursor.fetchall()
	dictionary = [item[0] for item in dictionary]

	# select all mappings that have not been reviewed
	db_cursor.execute("SELECT input_word, dictionary_word, similar_words, is_food FROM mappings WHERE reviewed = false")
	results = db_cursor.fetchall()

	# iterate through each row, grab the input_word and ask chatgpt to compare it to the dictionary_word

	csv_data = []
	for row in results:
	input_word = row[0]
	dictionary_word = row[1]
	similar_words = [item.strip() for item in row[2].split('\|')] if row[2] else []
	is_food = row[3]

	# find words from the dictionary list based on small levenstein distance between input_word and each word in the dictionary
	levenshtein_words = find_close_levenshtein_words(input_word, dictionary)
	print()
	print(f"Input: {input_word}")
	print(f" - dictionary_word: {dictionary_word}")
	print(f" - similar_words: {similar_words}")
	print(f" - levenshtein_words: {levenshtein_words}")

	# concatenate the similar_words and levenshtein_words
	all_words = similar_words + levenshtein_words
	all_words = list(set(all_words)) # remove duplicates
	response = query_gpt(input_word, dictionary_word, all_words)
	if response:
	new_row = None
	if response == dictionary_word and response in dictionary:
	print(f" - Mapping is correct")
	db_cursor.execute("UPDATE mappings SET reviewed = true WHERE input_word = %s", (input_word,))
	db_conn.commit()
	else:
	# We should update the mapping in the database
	# We should replace dictionary_word with response
	# We should set reviewed to 1
	# first confirm that the response is in the dictionary
	if response in dictionary:
	# prompt the user if we want to make the change or not
	print("------")
	print("Updating mapping to the following:")
	print(f" - input_word: {input_word}")
	print(f" - original_dictionary_word: {dictionary_word}")
	print(f" - new_dictionary_word: {response}")
	print("------")

	if response == 'Non-Food Item' and is_food == False:
	# We can skip this one because it's already marked as a Non-Food Item
	print(" - Mapping is already marked as a Non-Food Item")
	confirm = 'y'
	else:
	confirm = input("Press 'y' to confirm, 'i' to ignore, 'd' to delete, 'm' for mixture, any other key to skip: ")

	if confirm.lower() == 'y':
	if response == 'Non-Food Item':
	sql = "UPDATE mappings SET dictionary_word = %s, is_food = FALSE, reviewed = true WHERE input_word = %s"
	else:
	sql = "UPDATE mappings SET dictionary_word = %s, reviewed = true, is_food = true WHERE input_word = %s"

	print(f" - Updating mapping with {response}")
	db_cursor.execute(sql, (response, input_word))
	db_conn.commit()
	new_row = {
	'input_word': input_word,
	'original_dictionary_word': dictionary_word,
	'new_dictionary_word': response
	}
	elif confirm.lower() == 'i':
	print(f" - Ignoring mapping")
	sql = "UPDATE mappings SET ignore = true, reviewed = true WHERE input_word = %s"
	db_cursor.execute(sql, (input_word,))
	db_conn.commit()
	elif confirm.lower() == 'd':
	print(f" - Deleting mapping")
	sql = "DELETE FROM mappings WHERE input_word = %s"
	db_cursor.execute(sql, (input_word,))
	db_conn.commit()
	elif confirm.lower() == 'm':
	print(f" - Mixed food items")
	sql = "UPDATE mappings SET reviewed = true, dictionary_word = 'Mixed Food Items', is_food = true WHERE input_word = %s"
	db_cursor.execute(sql, (input_word,))
	db_conn.commit()
	else:
	db_cursor.execute("UPDATE mappings SET reviewed = true WHERE input_word = %s", (input_word,))
	db_conn.commit()
	else:
	print(f" - Response {response} is not in the dictionary")

	if new_row:
	csv_data.append(new_row)
	update_csv(csv_data)

	db_conn.close()