Spaces:
Paused
Paused
File size: 9,443 Bytes
988ba41 9189e38 b59ded9 9189e38 019a7e5 9189e38 019a7e5 fc0306d 988ba41 fc0306d 019a7e5 fc0306d 019a7e5 9189e38 fc7936d 9189e38 d93f20c 9189e38 22ad617 9189e38 019a7e5 9189e38 05bb441 fc7936d 7c3dcbb 9189e38 05bb441 73fda7b 05bb441 4aa61c6 05bb441 019a7e5 05bb441 019a7e5 eccaeb3 7c3dcbb ecbcfc4 7c3dcbb eccaeb3 019a7e5 eccaeb3 019a7e5 eccaeb3 4edd87e 019a7e5 ecbcfc4 d93f20c 019a7e5 ecbcfc4 4edd87e 73fda7b 019a7e5 eccaeb3 05bb441 4aa61c6 05bb441 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 |
import re
import os
import csv
import json
import time
import heapq
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from Levenshtein import distance
from tqdm import tqdm
from db.db_utils import get_connection
from ask_gpt import query_gpt
from utils import generate_embedding, cosine_similarity
from sentence_transformers import SentenceTransformer, util
# For any unreviewed mappings, we ask chatgpt to consider:
# 1. The similar_words list
# 2. Similar words from the dictionary based on small levenstein distance
# ChatGPT should confirm that the current mapping is the best one. If not, they should provide the better mapping.
# If its a Non-Food Item, we should confirm that
# If it's a homogenous or hetergeneous mixture, we should confirm that
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)
output_file_path = f'./audits/{int(time.time())}.csv'
model_name = 'sentence-transformers/all-mpnet-base-v2'
model = SentenceTransformer(model_name)
def remove_misc_words(text):
text = text.replace('raw', '').replace('fresh', '').replace('canned', '').replace('prepared', '').replace('with', '').replace('and', '').replace('or', '').replace('in', '').replace('NFS', '').strip()
text = re.sub(r'\(.*?\)', '', text)
return text
def compare_embeddings(old_dictionary_word, new_dictionary_word):
old_dictionary_word = remove_misc_words(old_dictionary_word)
new_dictionary_word = remove_misc_words(new_dictionary_word)
old_embedding = generate_embedding(model, old_dictionary_word)
new_embedding = generate_embedding(model, new_dictionary_word)
cosine_similarity_score = cosine_similarity(old_embedding, new_embedding)
return cosine_similarity_score
def update_csv(results):
df_results = pd.DataFrame(results, columns=['input_word', 'original_dictionary_word', 'new_dictionary_word',])
df_results.to_csv(output_file_path, index=False)
def find_close_levenshtein_words(input_word, dictionary, threshold=7):
# Calculate Levenshtein distances for each word in the dictionary
close_words = [word for word in dictionary if distance(input_word, word) <= threshold]
return close_words
def query_gpt(food_item, dictionary_word, similar_words):
line_separated_words = '\n'.join(similar_words)
prompt = (
f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping?
Generally, you should prefer the mapped word, but if you believe there is a better fit from provided list of similar words, please specify it.
This is important: only return a word from the list of words I provide.
If it's not a food item or pet food, return 'Non-Food Item'.
You should respond in JSON format with an object that has the key `guess`, and the value is the most similar food item.
The food item is: "{food_item}"
It has been mapped to: "{dictionary_word}"
Similar words:
{line_separated_words}"""
)
completion = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt}
],
model="gpt-3.5-turbo-1106",
response_format={"type": "json_object"},
)
response = completion.choices[0].message.content
parsed = parse_response(response)
print(f"Q: '{food_item}'")
print(f"A: '{parsed}'")
return parsed
# Define the function to parse the GPT response
def parse_response(response):
try:
result = json.loads(response)
return result['guess']
except (json.JSONDecodeError, KeyError) as e:
print(f"Error parsing response: {response} - {e}")
return None
db_conn = get_connection()
db_cursor = db_conn.cursor()
# Load the dictionary
db_cursor.execute("SELECT description FROM dictionary")
dictionary = db_cursor.fetchall()
dictionary = [item[0] for item in dictionary]
# select all mappings that have not been reviewed
db_cursor.execute("SELECT input_word, dictionary_word, similar_words, is_food FROM mappings WHERE gpt_reviewed = false and is_food = true and dictionary_word != 'Heterogeneous Mixture'")
results = db_cursor.fetchall()
# iterate through each row, grab the input_word and ask chatgpt to compare it to the dictionary_word
csv_data = []
for row in results:
input_word = row[0]
dictionary_word = row[1]
similar_words = [item.strip() for item in row[2].split('|')] if row[2] else []
is_food = row[3]
# find words from the dictionary list based on small levenstein distance between input_word and each word in the dictionary
levenshtein_words = find_close_levenshtein_words(input_word, dictionary)
print()
print(f"Input: {input_word}")
print(f" - dictionary_word: {dictionary_word}")
print(f" - similar_words: {similar_words}")
print(f" - levenshtein_words: {levenshtein_words}")
# concatenate the similar_words and levenshtein_words
all_words = similar_words + levenshtein_words
all_words = list(set(all_words)) # remove duplicates
response = query_gpt(input_word, dictionary_word, all_words)
if response:
new_row = None
if response == dictionary_word and response in dictionary:
print(f" - Mapping is correct")
db_cursor.execute("UPDATE mappings SET gpt_reviewed = true WHERE input_word = %s", (input_word,))
db_conn.commit()
else:
# We should update the mapping in the database
# We should replace dictionary_word with response
# We should set reviewed to 1
# first confirm that the response is in the dictionary
if response in dictionary:
# If the response is similar to the original dictionary word, lets just skip it
compare_score = compare_embeddings(dictionary_word, response)
if compare_score > 0.8:
print(f" - Mapping is already similar to the dictionary word")
db_cursor.execute("UPDATE mappings SET gpt_reviewed = true WHERE input_word = %s", (input_word,))
db_conn.commit()
continue
else:
print(f" - Mapping is not similar to the dictionary word")
print(f" - Cosine Similarity: {compare_score}")
# prompt the user if we want to make the change or not
print("------")
print("Updating mapping to the following:")
print(f" - input_word: {input_word}")
print(f" - original_dictionary_word: {dictionary_word}")
print(f" - new_dictionary_word: {response}")
print("------")
if response == 'Non-Food Item' and is_food == False:
# We can skip this one because it's already marked as a Non-Food Item
print(" - Mapping is already marked as a Non-Food Item")
confirm = 'y'
else:
confirm = input("Press 'y' to confirm, 'i' to ignore, 'd' to delete, 'm' for mixture, any other key to skip: ")
if confirm.lower() == 'y':
if response == 'Non-Food Item':
sql = "UPDATE mappings SET dictionary_word = %s, is_food = FALSE, description = 'Non-Food Item', gpt_reviewed = true, reviewed = true WHERE input_word = %s"
else:
sql = "UPDATE mappings SET dictionary_word = %s, gpt_reviewed = true, reviewed = true, is_food = true WHERE input_word = %s"
print(f" - Updating mapping with {response}")
db_cursor.execute(sql, (response, input_word))
db_conn.commit()
new_row = {
'input_word': input_word,
'original_dictionary_word': dictionary_word,
'new_dictionary_word': response
}
elif confirm.lower() == 'i':
print(f" - Ignoring mapping")
sql = "UPDATE mappings SET ignore = true, gpt_reviewed = true, reviewed = true WHERE input_word = %s"
db_cursor.execute(sql, (input_word,))
db_conn.commit()
elif confirm.lower() == 'd':
print(f" - Deleting mapping")
sql = "DELETE FROM mappings WHERE input_word = %s"
db_cursor.execute(sql, (input_word,))
db_conn.commit()
elif confirm.lower() == 'm':
print(f" - Heterogeneous Mixture")
sql = "UPDATE mappings SET gpt_reviewed = true, reviewed = true, dictionary_word = 'Heterogeneous Mixture', is_food = true WHERE input_word = %s"
db_cursor.execute(sql, (input_word,))
db_conn.commit()
else:
db_cursor.execute("UPDATE mappings SET gpt_reviewed = true, reviewed = true WHERE input_word = %s", (input_word,))
db_conn.commit()
else:
print(f" - Response {response} is not in the dictionary")
if new_row:
csv_data.append(new_row)
update_csv(csv_data)
db_conn.close() |