beweinreich committed on
Commit
019a7e5
1 Parent(s): 0ff64fc

improvements

Browse files
audits/1720034413.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
audits/1720034790.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ Tea,"Tea, bubble","Tea, hot, leaf, green"
audits/1720034901.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ Snack cakes,"Snacks, popcorn, cakes","Snack cake, chocolate"
audits/1720035211.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ arrabiata sauce,"Beef and noodles, no sauce",Spaghetti sauce with meat
3
+ Canned Meats,"Meat, NFS","Luncheon meat, pork, canned"
4
+ Chips,Vegetable chips,"Potato chips, NFS"
5
+ graham crumbs,"Cereals ready-to-eat, QUAKER, HONEY GRAHAM OH!S",Graham crackers
6
+ Chedder Popcorn,"Popcorn, NFS","Popcorn, ready-to-eat, cheese flavored"
7
+ Rice Krispies,"Snacks, rice cakes, brown rice, corn","Snacks, KELLOGG, KELLOGG'S RICE KRISPIES TREATS Squares"
8
+ "Beverages, Propel, with electrolytes and sweeteners, fruit-flavored, water","Beverages, Mixed vegetable and fruit juice drink, with added nutrients","Beverages, Water with added vitamins and minerals, bottles, sweetened, assorted fruit flavors"
9
+ Spag. Sause,"Sauce, pasta, spaghetti/marinara, ready-to-serve",Spaghetti sauce
10
+ Baking mix,"Cake or cupcake, white with white icing, bakery",Baked Products
11
+ Cereal,"Cereal, other, plain","Cereal, other, NFS"
12
+ RiceCrisps,"Snacks, rice cakes, brown rice, sesame seed",Rice cake
13
+ Riced Cauliflower,Fried cauliflower,"Cauliflower, cooked, as ingredient"
14
+ Canned Juice,"Carrot juice, canned","Fruit juice, NFS"
15
+ Cliff,"Snacks, granola bars, soft, uncoated, plain",Clif Z bar
16
+ Crackers Bulk,"Cookies, graham crackers, plain or honey (includes cinnamon)","Crackers, wheat"
audits/1720035904.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
chatgpt_audit.py CHANGED
@@ -10,6 +10,8 @@ from Levenshtein import distance
10
  from tqdm import tqdm
11
  from db.db_utils import get_connection
12
  from ask_gpt import query_gpt
 
 
13
 
14
  # For any unreviewed mappings, we ask chatgpt to consider:
15
  # 1. The similar_words list
@@ -26,6 +28,15 @@ client = OpenAI(api_key=api_key)
26
 
27
  output_file_path = f'./audits/{int(time.time())}.csv'
28
 
 
 
 
 
 
 
 
 
 
29
  def update_csv(results):
30
  df_results = pd.DataFrame(results, columns=['input_word', 'original_dictionary_word', 'new_dictionary_word',])
31
  df_results.to_csv(output_file_path, index=False)
@@ -89,7 +100,7 @@ dictionary = db_cursor.fetchall()
89
  dictionary = [item[0] for item in dictionary]
90
 
91
  # select all mappings that have not been reviewed
92
- db_cursor.execute("SELECT input_word, dictionary_word, similar_words, is_food FROM mappings WHERE reviewed = false")
93
  results = db_cursor.fetchall()
94
 
95
  # iterate through each row, grab the input_word and ask chatgpt to compare it to the dictionary_word
@@ -117,7 +128,7 @@ for row in results:
117
  new_row = None
118
  if response == dictionary_word and response in dictionary:
119
  print(f" - Mapping is correct")
120
- db_cursor.execute("UPDATE mappings SET reviewed = true WHERE input_word = %s", (input_word,))
121
  db_conn.commit()
122
  else:
123
  # We should update the mapping in the database
@@ -125,6 +136,18 @@ for row in results:
125
  # We should set reviewed to 1
126
  # first confirm that the response is in the dictionary
127
  if response in dictionary:
 
 
 
 
 
 
 
 
 
 
 
 
128
  # prompt the user if we want to make the change or not
129
  print("------")
130
  print("Updating mapping to the following:")
@@ -142,9 +165,9 @@ for row in results:
142
 
143
  if confirm.lower() == 'y':
144
  if response == 'Non-Food Item':
145
- sql = "UPDATE mappings SET dictionary_word = %s, is_food = FALSE, reviewed = true WHERE input_word = %s"
146
  else:
147
- sql = "UPDATE mappings SET dictionary_word = %s, reviewed = true, is_food = true WHERE input_word = %s"
148
 
149
  print(f" - Updating mapping with {response}")
150
  db_cursor.execute(sql, (response, input_word))
@@ -156,7 +179,7 @@ for row in results:
156
  }
157
  elif confirm.lower() == 'i':
158
  print(f" - Ignoring mapping")
159
- sql = "UPDATE mappings SET ignore = true, reviewed = true WHERE input_word = %s"
160
  db_cursor.execute(sql, (input_word,))
161
  db_conn.commit()
162
  elif confirm.lower() == 'd':
@@ -166,11 +189,11 @@ for row in results:
166
  db_conn.commit()
167
  elif confirm.lower() == 'm':
168
  print(f" - Heterogeneous Mixture")
169
- sql = "UPDATE mappings SET reviewed = true, dictionary_word = 'Heterogeneous Mixture', is_food = true WHERE input_word = %s"
170
  db_cursor.execute(sql, (input_word,))
171
  db_conn.commit()
172
  else:
173
- db_cursor.execute("UPDATE mappings SET reviewed = true WHERE input_word = %s", (input_word,))
174
  db_conn.commit()
175
  else:
176
  print(f" - Response {response} is not in the dictionary")
 
10
  from tqdm import tqdm
11
  from db.db_utils import get_connection
12
  from ask_gpt import query_gpt
13
+ from utils import generate_embedding, cosine_similarity
14
+ from sentence_transformers import SentenceTransformer, util
15
 
16
  # For any unreviewed mappings, we ask chatgpt to consider:
17
  # 1. The similar_words list
 
28
 
29
  output_file_path = f'./audits/{int(time.time())}.csv'
30
 
31
+ model_name = 'sentence-transformers/all-mpnet-base-v2'
32
+ model = SentenceTransformer(model_name)
33
+
34
def compare_embeddings(old_dictionary_word, new_dictionary_word):
    """Score how similar two dictionary words are via their embeddings.

    Each word is embedded with ``generate_embedding`` using the
    module-level ``model``, and the two embeddings are passed to
    ``cosine_similarity``.

    Args:
        old_dictionary_word: The dictionary word currently stored for a mapping.
        new_dictionary_word: The candidate replacement dictionary word.

    Returns:
        Whatever ``cosine_similarity`` returns for the two embeddings
        (presumably a float where higher means more similar -- confirm
        against ``utils.cosine_similarity``).
    """
    # Embed the old word first, then the new one, matching the argument order.
    embeddings = [
        generate_embedding(model, word)
        for word in (old_dictionary_word, new_dictionary_word)
    ]
    return cosine_similarity(*embeddings)
39
+
40
  def update_csv(results):
41
  df_results = pd.DataFrame(results, columns=['input_word', 'original_dictionary_word', 'new_dictionary_word',])
42
  df_results.to_csv(output_file_path, index=False)
 
100
  dictionary = [item[0] for item in dictionary]
101
 
102
  # select all mappings that have not been reviewed
103
+ db_cursor.execute("SELECT input_word, dictionary_word, similar_words, is_food FROM mappings WHERE gpt_reviewed = false and is_food = true and dictionary_word != 'Heterogeneous Mixture'")
104
  results = db_cursor.fetchall()
105
 
106
  # iterate through each row, grab the input_word and ask chatgpt to compare it to the dictionary_word
 
128
  new_row = None
129
  if response == dictionary_word and response in dictionary:
130
  print(f" - Mapping is correct")
131
+ db_cursor.execute("UPDATE mappings SET gpt_reviewed = true WHERE input_word = %s", (input_word,))
132
  db_conn.commit()
133
  else:
134
  # We should update the mapping in the database
 
136
  # We should set reviewed to 1
137
  # first confirm that the response is in the dictionary
138
  if response in dictionary:
139
+
140
+ # If the response is similar to the original dictionary word, let's just skip it
141
+ compare_score = compare_embeddings(dictionary_word, response)
142
+ if compare_score > 0.8:
143
+ print(f" - Mapping is already similar to the dictionary word")
144
+ db_cursor.execute("UPDATE mappings SET gpt_reviewed = true WHERE input_word = %s", (input_word,))
145
+ db_conn.commit()
146
+ continue
147
+ else:
148
+ print(f" - Mapping is not similar to the dictionary word")
149
+ print(f" - Cosine Similarity: {compare_score}")
150
+
151
  # prompt the user if we want to make the change or not
152
  print("------")
153
  print("Updating mapping to the following:")
 
165
 
166
  if confirm.lower() == 'y':
167
  if response == 'Non-Food Item':
168
+ sql = "UPDATE mappings SET dictionary_word = %s, is_food = FALSE, description = 'Non-Food Item', gpt_reviewed = true, reviewed = true WHERE input_word = %s"
169
  else:
170
+ sql = "UPDATE mappings SET dictionary_word = %s, gpt_reviewed = true, reviewed = true, is_food = true WHERE input_word = %s"
171
 
172
  print(f" - Updating mapping with {response}")
173
  db_cursor.execute(sql, (response, input_word))
 
179
  }
180
  elif confirm.lower() == 'i':
181
  print(f" - Ignoring mapping")
182
+ sql = "UPDATE mappings SET ignore = true, gpt_reviewed = true, reviewed = true WHERE input_word = %s"
183
  db_cursor.execute(sql, (input_word,))
184
  db_conn.commit()
185
  elif confirm.lower() == 'd':
 
189
  db_conn.commit()
190
  elif confirm.lower() == 'm':
191
  print(f" - Heterogeneous Mixture")
192
+ sql = "UPDATE mappings SET gpt_reviewed = true, reviewed = true, dictionary_word = 'Heterogeneous Mixture', is_food = true WHERE input_word = %s"
193
  db_cursor.execute(sql, (input_word,))
194
  db_conn.commit()
195
  else:
196
+ db_cursor.execute("UPDATE mappings SET gpt_reviewed = true, reviewed = true WHERE input_word = %s", (input_word,))
197
  db_conn.commit()
198
  else:
199
  print(f" - Response {response} is not in the dictionary")