File size: 9,443 Bytes
988ba41
9189e38
 
 
 
 
 
 
 
 
 
b59ded9
9189e38
019a7e5
 
9189e38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
019a7e5
 
 
fc0306d
988ba41
 
 
fc0306d
019a7e5
fc0306d
 
019a7e5
 
 
 
 
9189e38
 
 
 
fc7936d
9189e38
 
 
 
 
 
 
 
 
 
d93f20c
9189e38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22ad617
 
 
 
 
9189e38
019a7e5
9189e38
 
 
 
05bb441
 
 
 
fc7936d
7c3dcbb
9189e38
05bb441
 
73fda7b
05bb441
 
 
 
 
 
 
 
 
 
4aa61c6
05bb441
 
019a7e5
05bb441
 
 
 
 
 
 
019a7e5
 
 
 
 
 
 
 
 
 
 
 
eccaeb3
 
 
 
 
 
 
7c3dcbb
 
 
 
 
 
ecbcfc4
7c3dcbb
eccaeb3
 
019a7e5
eccaeb3
019a7e5
eccaeb3
 
 
 
 
 
 
 
 
4edd87e
 
019a7e5
ecbcfc4
 
 
 
 
 
 
 
d93f20c
019a7e5
ecbcfc4
4edd87e
73fda7b
019a7e5
eccaeb3
05bb441
 
 
4aa61c6
 
05bb441
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import re
import os
import csv
import json
import time
import heapq
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from Levenshtein import distance
from tqdm import tqdm
from db.db_utils import get_connection
from ask_gpt import query_gpt
from utils import generate_embedding, cosine_similarity
from sentence_transformers import SentenceTransformer, util

# For any unreviewed mappings, we ask chatgpt to consider:
# 1. The similar_words list
# 2. Similar words from the dictionary based on small levenstein distance

# ChatGPT should confirm that the current mapping is the best one. If not, they should provide the better mapping.
# If its a Non-Food Item, we should confirm that
# If it's a homogenous or hetergeneous mixture, we should confirm that

load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

output_file_path = f'./audits/{int(time.time())}.csv'

model_name = 'sentence-transformers/all-mpnet-base-v2'
model = SentenceTransformer(model_name)

def remove_misc_words(text):
    text = text.replace('raw', '').replace('fresh', '').replace('canned', '').replace('prepared', '').replace('with', '').replace('and', '').replace('or', '').replace('in', '').replace('NFS', '').strip()
    text = re.sub(r'\(.*?\)', '', text)
    return text

def compare_embeddings(old_dictionary_word, new_dictionary_word):
    old_dictionary_word = remove_misc_words(old_dictionary_word)
    new_dictionary_word = remove_misc_words(new_dictionary_word)
    old_embedding = generate_embedding(model, old_dictionary_word)
    new_embedding = generate_embedding(model, new_dictionary_word)
    cosine_similarity_score = cosine_similarity(old_embedding, new_embedding)
    return cosine_similarity_score

def update_csv(results):
    df_results = pd.DataFrame(results, columns=['input_word', 'original_dictionary_word', 'new_dictionary_word',])    
    df_results.to_csv(output_file_path, index=False)

def find_close_levenshtein_words(input_word, dictionary, threshold=7):
    # Calculate Levenshtein distances for each word in the dictionary
    close_words = [word for word in dictionary if distance(input_word, word) <= threshold]
    return close_words

def query_gpt(food_item, dictionary_word, similar_words):
    line_separated_words = '\n'.join(similar_words)

    prompt = (
      f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping?

      Generally, you should prefer the mapped word, but if you believe there is a better fit from provided list of similar words, please specify it.

      This is important: only return a word from the list of words I provide.

      If it's not a food item or pet food, return 'Non-Food Item'.

      You should respond in JSON format with an object that has the key `guess`, and the value is the most similar food item.

      The food item is: "{food_item}"
      It has been mapped to: "{dictionary_word}"

      Similar words:
      {line_separated_words}"""
    )

    completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        model="gpt-3.5-turbo-1106",
        response_format={"type": "json_object"},
    )
    response = completion.choices[0].message.content
    parsed = parse_response(response)
    print(f"Q: '{food_item}'")
    print(f"A: '{parsed}'")
    return parsed

# Define the function to parse the GPT response
def parse_response(response):
    try:
        result = json.loads(response)
        return result['guess']
    except (json.JSONDecodeError, KeyError) as e:
        print(f"Error parsing response: {response} - {e}")
        return None


db_conn = get_connection()
db_cursor = db_conn.cursor()

# Load the dictionary
db_cursor.execute("SELECT description FROM dictionary")
dictionary = db_cursor.fetchall()
dictionary = [item[0] for item in dictionary]

# select all mappings that have not been reviewed
db_cursor.execute("SELECT input_word, dictionary_word, similar_words, is_food FROM mappings WHERE gpt_reviewed = false and is_food = true and dictionary_word != 'Heterogeneous Mixture'")
results = db_cursor.fetchall()

# iterate through each row, grab the input_word and ask chatgpt to compare it to the dictionary_word

csv_data = []
for row in results:
    input_word = row[0]
    dictionary_word = row[1]
    similar_words = [item.strip() for item in row[2].split('|')] if row[2] else []
    is_food = row[3]
    
    # find words from the dictionary list based on small levenstein distance between input_word and each word in the dictionary
    levenshtein_words = find_close_levenshtein_words(input_word, dictionary)
    print()
    print(f"Input: {input_word}")
    print(f" - dictionary_word: {dictionary_word}")
    print(f" - similar_words: {similar_words}")
    print(f" - levenshtein_words: {levenshtein_words}")

    # concatenate the similar_words and levenshtein_words
    all_words = similar_words + levenshtein_words
    all_words = list(set(all_words))  # remove duplicates
    response = query_gpt(input_word, dictionary_word, all_words)
    if response:
        new_row = None
        if response == dictionary_word and response in dictionary:
            print(f" - Mapping is correct")
            db_cursor.execute("UPDATE mappings SET gpt_reviewed = true WHERE input_word = %s", (input_word,))
            db_conn.commit()
        else:
            # We should update the mapping in the database
            # We should replace dictionary_word with response
            # We should set reviewed to 1
            # first confirm that the response is in the dictionary
            if response in dictionary:

                # If the response is similar to the original dictionary word, lets just skip it
                compare_score = compare_embeddings(dictionary_word, response)
                if compare_score > 0.8:
                    print(f" - Mapping is already similar to the dictionary word")
                    db_cursor.execute("UPDATE mappings SET gpt_reviewed = true WHERE input_word = %s", (input_word,))
                    db_conn.commit()
                    continue
                else:
                    print(f" - Mapping is not similar to the dictionary word")
                    print(f" - Cosine Similarity: {compare_score}")

                # prompt the user if we want to make the change or not
                print("------")
                print("Updating mapping to the following:")
                print(f" - input_word: {input_word}")
                print(f" - original_dictionary_word: {dictionary_word}")
                print(f" - new_dictionary_word: {response}")
                print("------")

                if response == 'Non-Food Item' and is_food == False:
                    # We can skip this one because it's already marked as a Non-Food Item
                    print(" - Mapping is already marked as a Non-Food Item")
                    confirm = 'y'
                else:
                    confirm = input("Press 'y' to confirm, 'i' to ignore, 'd' to delete, 'm' for mixture, any other key to skip: ")

                if confirm.lower() == 'y':
                    if response == 'Non-Food Item':
                        sql = "UPDATE mappings SET dictionary_word = %s, is_food = FALSE, description = 'Non-Food Item', gpt_reviewed = true, reviewed = true WHERE input_word = %s"
                    else:
                        sql = "UPDATE mappings SET dictionary_word = %s, gpt_reviewed = true, reviewed = true, is_food = true WHERE input_word = %s"
                    
                    print(f" - Updating mapping with {response}")
                    db_cursor.execute(sql, (response, input_word))
                    db_conn.commit()
                    new_row = {
                        'input_word': input_word,
                        'original_dictionary_word': dictionary_word,
                        'new_dictionary_word': response
                    }
                elif confirm.lower() == 'i':
                    print(f" - Ignoring mapping")
                    sql = "UPDATE mappings SET ignore = true, gpt_reviewed = true, reviewed = true WHERE input_word = %s"
                    db_cursor.execute(sql, (input_word,))
                    db_conn.commit()
                elif confirm.lower() == 'd':
                    print(f" - Deleting mapping")
                    sql = "DELETE FROM mappings WHERE input_word = %s"
                    db_cursor.execute(sql, (input_word,))
                    db_conn.commit()
                elif confirm.lower() == 'm':
                    print(f" - Heterogeneous Mixture")
                    sql = "UPDATE mappings SET gpt_reviewed = true, reviewed = true, dictionary_word = 'Heterogeneous Mixture', is_food = true WHERE input_word = %s"
                    db_cursor.execute(sql, (input_word,))
                    db_conn.commit()
                else:
                    db_cursor.execute("UPDATE mappings SET gpt_reviewed = true, reviewed = true WHERE input_word = %s", (input_word,))
                    db_conn.commit()
            else:
                print(f" - Response {response} is not in the dictionary")

        if new_row:
            csv_data.append(new_row)
        update_csv(csv_data)

db_conn.close()