Spaces:
Paused
Paused
import os | |
import csv | |
import json | |
import psycopg2 | |
import pandas as pd | |
from tqdm import tqdm | |
from openai import OpenAI | |
from dotenv import load_dotenv | |
from psycopg2.extras import DictCursor | |
from db.db_utils import get_connection | |
load_dotenv() | |
api_key = os.getenv("OPENAI_API_KEY") | |
client = OpenAI(api_key=api_key) | |
db_conn = get_connection() | |
db_cursor = db_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) | |
# Load your Excel file | |
file_path = './dictionary/final_corrected_wweia_food_category_complete - final_corrected_wweia_food_category_complete.csv' | |
spreadsheet = pd.read_csv(file_path) | |
def find_best_category(food_item, category, dataframe): | |
filtered_df = dataframe[dataframe['closest_category'] == category] | |
if not filtered_df.empty: | |
descriptions = filtered_df['wweia_food_category_description'].tolist() | |
prompt = ( | |
f"Given the food item '{food_item}' and the category '{category}', choose the most appropriate category from the following options:\n{descriptions}\n\n" | |
f"You should respond in json format with an object that has the key `guess`, and the value is the categoy." | |
) | |
completion = client.chat.completions.create( | |
messages=[ | |
{"role": "system", "content": "You are a helpful assistant."}, | |
{"role": "user", "content": prompt} | |
], | |
model="gpt-3.5-turbo-1106", | |
response_format={"type": "json_object"}, | |
) | |
response = completion.choices[0].message.content | |
parsed = parse_response(response) | |
return parsed | |
# Define the function to parse the GPT response | |
def parse_response(response): | |
try: | |
result = json.loads(response) | |
return result['guess'] | |
except (json.JSONDecodeError, KeyError) as e: | |
print(f"Error parsing response: {response} - {e}") | |
return None | |
# open up the current dictionary csv file | |
db_cursor.execute('SELECT * FROM dictionary where wweia_category is null') | |
rows = db_cursor.fetchall() | |
for row in tqdm(rows, desc="Processing"): | |
print() | |
fdc_id = row['fdc_id'] | |
food_item = row['description'] | |
category = row['food_category'] | |
if pd.notna(row['wweia_category']) and row['wweia_category'] != "" and row['wweia_category'] != "nan" and row is not None: | |
# print(f"Skipping '{food_item}' as it already has a category {row['wweia_category']}") | |
continue | |
else: | |
print(f"Processing '{food_item}'") | |
# Find the best category for the food item | |
best_category = find_best_category(food_item, category, spreadsheet) | |
print(f"Q: '{food_item}'") | |
print(f"A: '{best_category}'") | |
if best_category: | |
db_cursor.execute('UPDATE dictionary SET wweia_category = %s WHERE fdc_id = %s', (best_category, fdc_id)) | |
db_conn.commit() | |
else: | |
print(f"Failed to find a category for '{food_item}'") | |
db_conn.close() |