beweinreich committed on
Commit
ecbcfc4
1 Parent(s): 184aa9e

skip anything non-csv

Browse files
algo.py CHANGED
@@ -13,7 +13,7 @@ from ask_gpt import query_gpt
13
  from multi_food_item_detector import extract_items, has_delimiters
14
  from mapping_template import empty_template
15
 
16
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
17
  similarity_threshold = 0.75
18
 
19
 
 
13
  from multi_food_item_detector import extract_items, has_delimiters
14
  from mapping_template import empty_template
15
 
16
+ logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
17
  similarity_threshold = 0.75
18
 
19
 
audits/1719530507.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
audits/1719530709.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
audits/1719531384.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
audits/1719531486.csv ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ Eyeglasses,"Candies, REESE'S Peanut Butter Cups",Non-Food Item
3
+ Mini Crackers,"Pretzels, NFS","Crackers, NFS"
4
+ Pool Shock,"Sports drink, NFS",Non-Food Item
5
+ Office Paper,Rice paper,Non-Food Item
6
+ Games,"Sports drink, NFS",Non-Food Item
7
+ Comforter,Pig in a blanket,Non-Food Item
8
+ Mulch,Barley,Non-Food Item
9
+ Yoga Mats,"Drumstick pods, raw",Non-Food Item
10
+ Floor Mats,"Drumstick pods, raw",Non-Food Item
11
+ Knives Cultlery Packs,"Snacks, fruit leather, pieces",Non-Food Item
12
+ Bar Stools,"Cereal or granola bar, with coconut, chocolate coated",Non-Food Item
13
+ Grill Cover,"Chicken fillet, grilled",Non-Food Item
14
+ Pet Products,"Hot dog, meat and poultry",Non-Food Item
15
+ Needles,Cough drops,Non-Food Item
16
+ Cart Covers,"Tomatoes, raw",Non-Food Item
17
+ Ear Buds,"Cereal, ready-to-eat, NFS",Non-Food Item
audits/1719531575.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ Assorted Deli Sandwiches Frozen,"Ham, sliced, restaurant",Mixed Food Items
audits/1719531601.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
audits/1719531628.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ All-purpose cleaner,Screwdriver,Non-Food Item
audits/1719531665.csv ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ door locks,Miscellaneous Items,Non-Food Item
3
+ Asst. Vgetables,"Vegetables, mixed, frozen, cooked, boiled, drained, with salt",Assorted Vegetables
4
+ Choc. Candy,Dark chocolate candy,Chocolate candy
5
+ Corn chowder,Potato chowder,"Corn, creamed"
6
+ Frozen bagels,"Croissants, cheese",Bagel
7
+ misc. boxes,"Cereal, chocolate crispy",Non-Food Item
8
+ purex,"Water, bottled, generic",Non-Food Item
9
+ Propel Lemon,"Beverages, Water with added vitamins and minerals, bottles, sweetened, assorted fruit flavors","Beverages, Propel Zero, fruit-flavored, non-carbonated"
10
+ Honey Buns,"Rolls, hamburger or hotdog, plain","Cinnamon buns, frosted (includes honey buns)"
11
+ Powerade Lemonade,"Lemon juice, raw",Sports drink (Powerade)
12
+ Non-Food Office,Assorted Edible Items,Non-Food Item
13
+ Landscaping,Mixed Food Items,Non-Food Item
14
+ Fresh Pineapple,"Pineapple, dried","Pineapple, raw"
15
+ White Cheddar Popcorn,"Popcorn, NFS","Popcorn, ready-to-eat, cheese flavored"
16
+ Signs,Mixed Food Items,Non-Food Item
17
+ Mixed Produce Box,"Vegetables, mixed, canned, drained solids",Mixed Produce
18
+ Vegetablesl,"Classic mixed vegetables, canned, cooked with oil",Mixed Vegetables
19
+ Mixed Food Canned,Non-Food Item,Mixed Food Items
audits/1719531862.csv ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ misc CVS items,"Cholesterol, Cheese, swiss, slices (CA2, NC) - 18c-16-03-Chol",Misc Items
3
+ "Paper, standard, letter size",Rice paper,Non-Food Item
4
+ nacho dip,Chocolate dip,Layer dip
5
+ Syrup Assorted Chololate,"Syrup, NFS",Chocolate syrup
6
+ Tools,Screwdriver,Non-Food Item
7
+ Bottle Caps,"Water, bottled, plain",Non-Food Item
8
+ Can Beans,"Green beans, canned, cooked with oil",Baked beans
9
+ Beef Sticks,"Snacks, popcorn, cakes",Beef jerky
10
+ Peanut butter packets,"Crackers, sandwich, peanut butter filled",Peanut butter
audits/1719532037.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
audits/1719532097.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ LED Bulbs,"Fennel bulb, raw",Non-Food Item
3
+ Powerstrips,"BURGER KING, Chicken Strips",Non-Food Item
4
+ S/P Shakers,"Candies, REESE'S Peanut Butter Cups",Non-Food Item
5
+ Twix Choc Milk,"Milk, NFS","Chocolate milk, NFS"
6
+ vegetales,"Vegetables, mixed, canned, drained solids",Misc Vegetables
7
+ powerade powder,Sports drink (Gatorade G),Sports drink (Powerade)
8
+ Broth & Salsa,Mixed Food Items,Salsa
9
+ Vegetables and Lettuce,Assorted Fruit and Veg,Assorted Vegetables
10
+ King Hawaiian Bread,Ham and cheese loaf or roll,"Rolls, dinner, wheat"
11
+ Fresh Cranberries,"Cranberries, dried","Cranberries, raw"
audits/1719532313.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
audits/1719532660.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
audits/1719532743.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
audits/1719532810.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ cheese and alfredo sauce,"Ravioli, cheese-filled, with tomato sauce",Alfredo sauce
audits/1719532915.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ cleaning & paper,Screwdriver,Non-Food Item
audits/1719532985.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ Condiments and Paper,"Salad dressing, mayonnaise, regular",Non-Food Item
3
+ Pet & Non-Food Items,"Hot dog, meat and poultry",Non-Food Item
audits/1719533030.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ Assorted Quiche & Pot Pies,"Cheese quiche, meatless","Pot pie, beef"
audits/1719533703.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ Shampoo & Conditioner Packets,Mixed Food Items,Non-Food Item
audits/1719533786.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ Biscuits & Fruit,Mixed Food Items,Biscuit with fruit
3
+ sour & tartar sauce,Mixed Food Items,Tartar sauce
audits/1719533844.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
2
+ baking & toppings,"Bread, white, commercially prepared",Mixed Food Items
3
+ snacks & canned vegs.,"Crackers, saltines (includes oyster, soda, soup)",Mixed Food Items
audits/1719533924.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
audits/1719534233.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ input_word,original_dictionary_word,new_dictionary_word
chatgpt_audit.py CHANGED
@@ -140,13 +140,13 @@ for row in results:
140
  print(" - Mapping is already marked as a Non-Food Item")
141
  confirm = 'y'
142
  else:
143
- confirm = input("Press 'y' to confirm the update, 'i' to ignore the item, any other key to skip: ")
144
 
145
  if confirm.lower() == 'y':
146
  if response == 'Non-Food Item':
147
  sql = "UPDATE mappings SET dictionary_word = %s, is_food = FALSE, reviewed = true WHERE input_word = %s"
148
  else:
149
- sql = "UPDATE mappings SET dictionary_word = %s, reviewed = true WHERE input_word = %s"
150
 
151
  print(f" - Updating mapping with {response}")
152
  db_cursor.execute(sql, (response, input_word))
@@ -158,8 +158,18 @@ for row in results:
158
  }
159
  elif confirm.lower() == 'i':
160
  print(f" - Ignoring mapping")
161
- sql = "UPDATE mappings SET ignored = true, reviewed = true WHERE input_word = %s"
162
- db_cursor.execute(sql, (response, input_word))
 
 
 
 
 
 
 
 
 
 
163
  db_conn.commit()
164
  else:
165
  db_cursor.execute("UPDATE mappings SET reviewed = true WHERE input_word = %s", (input_word,))
 
140
  print(" - Mapping is already marked as a Non-Food Item")
141
  confirm = 'y'
142
  else:
143
+ confirm = input("Press 'y' to confirm, 'i' to ignore, 'd' to delete, 'm' for mixture, any other key to skip: ")
144
 
145
  if confirm.lower() == 'y':
146
  if response == 'Non-Food Item':
147
  sql = "UPDATE mappings SET dictionary_word = %s, is_food = FALSE, reviewed = true WHERE input_word = %s"
148
  else:
149
+ sql = "UPDATE mappings SET dictionary_word = %s, reviewed = true, is_food = true WHERE input_word = %s"
150
 
151
  print(f" - Updating mapping with {response}")
152
  db_cursor.execute(sql, (response, input_word))
 
158
  }
159
  elif confirm.lower() == 'i':
160
  print(f" - Ignoring mapping")
161
+ sql = "UPDATE mappings SET ignore = true, reviewed = true WHERE input_word = %s"
162
+ db_cursor.execute(sql, (input_word,))
163
+ db_conn.commit()
164
+ elif confirm.lower() == 'd':
165
+ print(f" - Deleting mapping")
166
+ sql = "DELETE FROM mappings WHERE input_word = %s"
167
+ db_cursor.execute(sql, (input_word,))
168
+ db_conn.commit()
169
+ elif confirm.lower() == 'm':
170
+ print(f" - Mixed food items")
171
+ sql = "UPDATE mappings SET reviewed = true, dictionary_word = 'Mixed Food Items', is_food = true WHERE input_word = %s"
172
+ db_cursor.execute(sql, (input_word,))
173
  db_conn.commit()
174
  else:
175
  db_cursor.execute("UPDATE mappings SET reviewed = true WHERE input_word = %s", (input_word,))
food_nonfood.py CHANGED
@@ -3,7 +3,7 @@ import numpy as np
3
  import torch
4
  import logging
5
  from transformers import pipeline
6
-
7
  # Load a pre-trained SBERT model
8
 
9
  # Set seeds for reproducibility of zero-shot classification
@@ -17,8 +17,11 @@ def set_seed(seed):
17
 
18
  set_seed(1)
19
 
 
 
20
  # Load a pre-trained model and tokenizer
21
  classifier = pipeline("zero-shot-classification", model="roberta-large-mnli")
 
22
 
23
  # Classify item as food or non-food
24
  def classify_as_food_nonfood(item):
@@ -40,6 +43,16 @@ def classify_as_food_nonfood(item):
40
  label = "food"
41
  score = drink_score
42
 
 
 
 
 
 
 
 
 
 
 
43
  # logging.info(f"Item: {item}, Label: {label}, Score: {score}")
44
  return label, score
45
 
 
3
  import torch
4
  import logging
5
  from transformers import pipeline
6
+ from autocorrect import Speller
7
  # Load a pre-trained SBERT model
8
 
9
  # Set seeds for reproducibility of zero-shot classification
 
17
 
18
  set_seed(1)
19
 
20
+
21
+
22
  # Load a pre-trained model and tokenizer
23
  classifier = pipeline("zero-shot-classification", model="roberta-large-mnli")
24
+ spell = Speller()
25
 
26
  # Classify item as food or non-food
27
  def classify_as_food_nonfood(item):
 
43
  label = "food"
44
  score = drink_score
45
 
46
+ # try correcting the spelling
47
+ if label == "non-food":
48
+ spell_fix_item = spell(cleaned_item)
49
+ result = classifier(cleaned_item, candidate_labels=["food", "non-food"])
50
+ food_label = result["labels"][0]
51
+ food_score = result["scores"][0]
52
+ if food_label == "food" and food_score >= 0.7:
53
+ label = "food"
54
+ score = food_score
55
+
56
  # logging.info(f"Item: {item}, Label: {label}, Score: {score}")
57
  return label, score
58
 
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  --extra-index-url https://download.pytorch.org/whl/cu113
 
2
  gradio==4.36.1
3
  mistralai==0.4.0
4
  numpy==1.26.4
 
1
  --extra-index-url https://download.pytorch.org/whl/cu113
2
+ autocorrect==2.6.1
3
  gradio==4.36.1
4
  mistralai==0.4.0
5
  numpy==1.26.4
run.py CHANGED
@@ -14,8 +14,11 @@ if __name__ == "__main__":
14
 
15
  # get all files in the raw folder and iterate through them
16
  raw_files = os.listdir('./raw')
17
- for raw_file_name in raw_files:
18
  # for raw_file_name in ['test.csv']:
 
 
 
 
19
  # chop off the extension for the results run key
20
  # result_file_name = raw_file_name.split('.')[0]
21
  # run_key = f"{result_file_name}-{int(time.time())}"
 
14
 
15
  # get all files in the raw folder and iterate through them
16
  raw_files = os.listdir('./raw')
 
17
  # for raw_file_name in ['test.csv']:
18
+ for raw_file_name in raw_files:
19
+ if not raw_file_name.endswith('.csv'):
20
+ continue
21
+
22
  # chop off the extension for the results run key
23
  # result_file_name = raw_file_name.split('.')[0]
24
  # run_key = f"{result_file_name}-{int(time.time())}"
similarity_fast.py CHANGED
@@ -3,7 +3,6 @@ import re
3
  import pickle
4
  import pandas as pd
5
  import requests
6
- from tqdm import tqdm
7
  from sentence_transformers import SentenceTransformer, util
8
  from db.db_utils import get_connection, initialize_db, store_mapping_to_db
9
  from food_nonfood import classify_as_food_nonfood
 
3
  import pickle
4
  import pandas as pd
5
  import requests
 
6
  from sentence_transformers import SentenceTransformer, util
7
  from db.db_utils import get_connection, initialize_db, store_mapping_to_db
8
  from food_nonfood import classify_as_food_nonfood