Spaces:
Paused
Paused
import os | |
import csv | |
import json | |
import time | |
import heapq | |
import pandas as pd | |
from openai import OpenAI | |
from dotenv import load_dotenv | |
from Levenshtein import distance | |
from tqdm import tqdm | |
from db.db_utils import get_connection, store_mapping_to_db, get_mapping_from_db | |
from ask_gpt import query_gpt | |
# For any unreviewed mappings, we ask chatgpt to consider: | |
# 1. The similar_words list | |
# 2. Similar words from the dictionary based on small levenstein distance | |
# ChatGPT should confirm that the current mapping is the best one. If not, they should provide the better mapping. | |
# If its a Non-Food Item, we should confirm that | |
# If it's a homogenous or hetergeneous mixture, we should confirm that | |
load_dotenv() | |
api_key = os.getenv("OPENAI_API_KEY") | |
client = OpenAI(api_key=api_key) | |
output_file_path = f'./audits/{int(time.time())}.csv' | |
def update_csv(results): | |
df_results = pd.DataFrame(results, columns=['input_word', 'original_dictionary_word', 'new_dictionary_word',]) | |
df_results.to_csv(output_file_path, index=False) | |
def find_close_levenshtein_words(input_word, dictionary, threshold=3): | |
# Calculate Levenshtein distances for each word in the dictionary | |
close_words = [word for word in dictionary if distance(input_word, word) <= threshold] | |
return close_words | |
def query_gpt(food_item, dictionary_word, similar_words): | |
line_separated_words = '\n'.join(similar_words) | |
prompt = ( | |
f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping? | |
Generally, you should prefer the mapped word, but if you believe there is a better fit, please provide it. | |
I will also provide a list of other similar words that you could be a better fit. | |
This is important: only return a word from the list of words I provide. | |
If it's not a food item or pet food, return 'Non-Food Item'. | |
You should respond in JSON format with an object that has the key `guess`, and the value is the most similar food item. | |
The food item is: "{food_item}" | |
It has been mapped to: "{dictionary_word}" | |
Similar words: | |
{line_separated_words}""" | |
) | |
completion = client.chat.completions.create( | |
messages=[ | |
{"role": "system", "content": "You are a helpful assistant."}, | |
{"role": "user", "content": prompt} | |
], | |
model="gpt-3.5-turbo-1106", | |
response_format={"type": "json_object"}, | |
) | |
response = completion.choices[0].message.content | |
parsed = parse_response(response) | |
print(f"Q: '{food_item}'") | |
print(f"A: '{parsed}'") | |
print() | |
return parsed | |
# Define the function to parse the GPT response | |
def parse_response(response): | |
try: | |
result = json.loads(response) | |
return result['guess'] | |
except (json.JSONDecodeError, KeyError) as e: | |
print(f"Error parsing response: {response} - {e}") | |
return None | |
db_conn = get_connection() | |
db_cursor = db_conn.cursor() | |
# Load the dictionary | |
db_cursor.execute("SELECT description FROM dictionary") | |
dictionary = db_cursor.fetchall() | |
dictionary = [item[0] for item in dictionary] | |
# select all mappings that have not been reviewed | |
db_cursor.execute("SELECT input_word, dictionary_word, similar_words FROM mappings WHERE reviewed = false") | |
results = db_cursor.fetchall() | |
# iterate through each row, grab the input_word and ask chatgpt to compare it to the dictionary_word | |
csv_data = [] | |
for row in results: | |
input_word = row[0] | |
dictionary_word = row[1] | |
similar_words = [item.strip() for item in row[2].split('|')] | |
# find words from the dictionary list based on small levenstein distance between input_word and each word in the dictionary | |
levenshtein_words = find_close_levenshtein_words(input_word, dictionary) | |
print(f"Input: {input_word}") | |
print(f" - dictionary_word: {dictionary_word}") | |
print(f" - similar_words: {similar_words}") | |
print(f" - levenshtein_words: {levenshtein_words}") | |
# concatenate the similar_words and levenshtein_words | |
all_words = similar_words + levenshtein_words | |
all_words = list(set(all_words)) # remove duplicates | |
response = query_gpt(input_word, dictionary_word, all_words) | |
if response: | |
new_row = { | |
'input_word': input_word, | |
'original_dictionary_word': dictionary_word, | |
} | |
if response == dictionary_word and response in dictionary: | |
print(f" - Mapping is correct") | |
db_cursor.execute("UPDATE mappings SET reviewed = true WHERE input_word = %s", (input_word,)) | |
db_conn.commit() | |
new_row['new_dictionary_word'] = response | |
else: | |
# We should update the mapping in the database | |
# We should replace dictionary_word with response | |
# We should set reviewed to 1 | |
# first confirm that the response is in the dictionary | |
if response in dictionary: | |
print(f" - Updating mapping with {response}") | |
db_cursor.execute("UPDATE mappings SET dictionary_word = %s, reviewed = true WHERE input_word = %s", (response, input_word)) | |
db_conn.commit() | |
new_row['new_dictionary_word'] = response | |
else: | |
print(f" - Response {response} is not in the dictionary") | |
csv_data.append(new_row) | |
update_csv(csv_data) | |
db_conn.close() |