Spaces:
Paused
Paused
Commit
•
ecbcfc4
1
Parent(s):
184aa9e
skip anything non-csv
Browse files
- algo.py +1 -1
- audits/1719530507.csv +1 -0
- audits/1719530709.csv +1 -0
- audits/1719531384.csv +1 -0
- audits/1719531486.csv +17 -0
- audits/1719531575.csv +2 -0
- audits/1719531601.csv +1 -0
- audits/1719531628.csv +2 -0
- audits/1719531665.csv +19 -0
- audits/1719531862.csv +10 -0
- audits/1719532037.csv +1 -0
- audits/1719532097.csv +11 -0
- audits/1719532313.csv +1 -0
- audits/1719532660.csv +1 -0
- audits/1719532743.csv +1 -0
- audits/1719532810.csv +2 -0
- audits/1719532915.csv +2 -0
- audits/1719532985.csv +3 -0
- audits/1719533030.csv +2 -0
- audits/1719533703.csv +2 -0
- audits/1719533786.csv +3 -0
- audits/1719533844.csv +3 -0
- audits/1719533924.csv +1 -0
- audits/1719534233.csv +1 -0
- chatgpt_audit.py +14 -4
- food_nonfood.py +14 -1
- requirements.txt +1 -0
- run.py +4 -1
- similarity_fast.py +0 -1
algo.py
CHANGED
@@ -13,7 +13,7 @@ from ask_gpt import query_gpt
|
|
13 |
from multi_food_item_detector import extract_items, has_delimiters
|
14 |
from mapping_template import empty_template
|
15 |
|
16 |
-
logging.basicConfig(level=logging.
|
17 |
similarity_threshold = 0.75
|
18 |
|
19 |
|
|
|
13 |
from multi_food_item_detector import extract_items, has_delimiters
|
14 |
from mapping_template import empty_template
|
15 |
|
16 |
+
logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
|
17 |
similarity_threshold = 0.75
|
18 |
|
19 |
|
audits/1719530507.csv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
audits/1719530709.csv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
audits/1719531384.csv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
audits/1719531486.csv
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
2 |
+
Eyeglasses,"Candies, REESE'S Peanut Butter Cups",Non-Food Item
|
3 |
+
Mini Crackers,"Pretzels, NFS","Crackers, NFS"
|
4 |
+
Pool Shock,"Sports drink, NFS",Non-Food Item
|
5 |
+
Office Paper,Rice paper,Non-Food Item
|
6 |
+
Games,"Sports drink, NFS",Non-Food Item
|
7 |
+
Comforter,Pig in a blanket,Non-Food Item
|
8 |
+
Mulch,Barley,Non-Food Item
|
9 |
+
Yoga Mats,"Drumstick pods, raw",Non-Food Item
|
10 |
+
Floor Mats,"Drumstick pods, raw",Non-Food Item
|
11 |
+
Knives Cultlery Packs,"Snacks, fruit leather, pieces",Non-Food Item
|
12 |
+
Bar Stools,"Cereal or granola bar, with coconut, chocolate coated",Non-Food Item
|
13 |
+
Grill Cover,"Chicken fillet, grilled",Non-Food Item
|
14 |
+
Pet Products,"Hot dog, meat and poultry",Non-Food Item
|
15 |
+
Needles,Cough drops,Non-Food Item
|
16 |
+
Cart Covers,"Tomatoes, raw",Non-Food Item
|
17 |
+
Ear Buds,"Cereal, ready-to-eat, NFS",Non-Food Item
|
audits/1719531575.csv
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
2 |
+
Assorted Deli Sandwiches Frozen,"Ham, sliced, restaurant",Mixed Food Items
|
audits/1719531601.csv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
audits/1719531628.csv
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
2 |
+
All-purpose cleaner,Screwdriver,Non-Food Item
|
audits/1719531665.csv
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
2 |
+
door locks,Miscellaneous Items,Non-Food Item
|
3 |
+
Asst. Vgetables,"Vegetables, mixed, frozen, cooked, boiled, drained, with salt",Assorted Vegetables
|
4 |
+
Choc. Candy,Dark chocolate candy,Chocolate candy
|
5 |
+
Corn chowder,Potato chowder,"Corn, creamed"
|
6 |
+
Frozen bagels,"Croissants, cheese",Bagel
|
7 |
+
misc. boxes,"Cereal, chocolate crispy",Non-Food Item
|
8 |
+
purex,"Water, bottled, generic",Non-Food Item
|
9 |
+
Propel Lemon,"Beverages, Water with added vitamins and minerals, bottles, sweetened, assorted fruit flavors","Beverages, Propel Zero, fruit-flavored, non-carbonated"
|
10 |
+
Honey Buns,"Rolls, hamburger or hotdog, plain","Cinnamon buns, frosted (includes honey buns)"
|
11 |
+
Powerade Lemonade,"Lemon juice, raw",Sports drink (Powerade)
|
12 |
+
Non-Food Office,Assorted Edible Items,Non-Food Item
|
13 |
+
Landscaping,Mixed Food Items,Non-Food Item
|
14 |
+
Fresh Pineapple,"Pineapple, dried","Pineapple, raw"
|
15 |
+
White Cheddar Popcorn,"Popcorn, NFS","Popcorn, ready-to-eat, cheese flavored"
|
16 |
+
Signs,Mixed Food Items,Non-Food Item
|
17 |
+
Mixed Produce Box,"Vegetables, mixed, canned, drained solids",Mixed Produce
|
18 |
+
Vegetablesl,"Classic mixed vegetables, canned, cooked with oil",Mixed Vegetables
|
19 |
+
Mixed Food Canned,Non-Food Item,Mixed Food Items
|
audits/1719531862.csv
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
2 |
+
misc CVS items,"Cholesterol, Cheese, swiss, slices (CA2, NC) - 18c-16-03-Chol",Misc Items
|
3 |
+
"Paper, standard, letter size",Rice paper,Non-Food Item
|
4 |
+
nacho dip,Chocolate dip,Layer dip
|
5 |
+
Syrup Assorted Chololate,"Syrup, NFS",Chocolate syrup
|
6 |
+
Tools,Screwdriver,Non-Food Item
|
7 |
+
Bottle Caps,"Water, bottled, plain",Non-Food Item
|
8 |
+
Can Beans,"Green beans, canned, cooked with oil",Baked beans
|
9 |
+
Beef Sticks,"Snacks, popcorn, cakes",Beef jerky
|
10 |
+
Peanut butter packets,"Crackers, sandwich, peanut butter filled",Peanut butter
|
audits/1719532037.csv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
audits/1719532097.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
2 |
+
LED Bulbs,"Fennel bulb, raw",Non-Food Item
|
3 |
+
Powerstrips,"BURGER KING, Chicken Strips",Non-Food Item
|
4 |
+
S/P Shakers,"Candies, REESE'S Peanut Butter Cups",Non-Food Item
|
5 |
+
Twix Choc Milk,"Milk, NFS","Chocolate milk, NFS"
|
6 |
+
vegetales,"Vegetables, mixed, canned, drained solids",Misc Vegetables
|
7 |
+
powerade powder,Sports drink (Gatorade G),Sports drink (Powerade)
|
8 |
+
Broth & Salsa,Mixed Food Items,Salsa
|
9 |
+
Vegetables and Lettuce,Assorted Fruit and Veg,Assorted Vegetables
|
10 |
+
King Hawaiian Bread,Ham and cheese loaf or roll,"Rolls, dinner, wheat"
|
11 |
+
Fresh Cranberries,"Cranberries, dried","Cranberries, raw"
|
audits/1719532313.csv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
audits/1719532660.csv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
audits/1719532743.csv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
audits/1719532810.csv
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
2 |
+
cheese and alfredo sauce,"Ravioli, cheese-filled, with tomato sauce",Alfredo sauce
|
audits/1719532915.csv
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
2 |
+
cleaning & paper,Screwdriver,Non-Food Item
|
audits/1719532985.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
2 |
+
Condiments and Paper,"Salad dressing, mayonnaise, regular",Non-Food Item
|
3 |
+
Pet & Non-Food Items,"Hot dog, meat and poultry",Non-Food Item
|
audits/1719533030.csv
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
2 |
+
Assorted Quiche & Pot Pies,"Cheese quiche, meatless","Pot pie, beef"
|
audits/1719533703.csv
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
2 |
+
Shampoo & Conditioner Packets,Mixed Food Items,Non-Food Item
|
audits/1719533786.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
2 |
+
Biscuits & Fruit,Mixed Food Items,Biscuit with fruit
|
3 |
+
sour & tartar sauce,Mixed Food Items,Tartar sauce
|
audits/1719533844.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
2 |
+
baking & toppings,"Bread, white, commercially prepared",Mixed Food Items
|
3 |
+
snacks & canned vegs.,"Crackers, saltines (includes oyster, soda, soup)",Mixed Food Items
|
audits/1719533924.csv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
audits/1719534233.csv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
chatgpt_audit.py
CHANGED
@@ -140,13 +140,13 @@ for row in results:
|
|
140 |
print(" - Mapping is already marked as a Non-Food Item")
|
141 |
confirm = 'y'
|
142 |
else:
|
143 |
-
confirm = input("Press 'y' to confirm
|
144 |
|
145 |
if confirm.lower() == 'y':
|
146 |
if response == 'Non-Food Item':
|
147 |
sql = "UPDATE mappings SET dictionary_word = %s, is_food = FALSE, reviewed = true WHERE input_word = %s"
|
148 |
else:
|
149 |
-
sql = "UPDATE mappings SET dictionary_word = %s, reviewed = true WHERE input_word = %s"
|
150 |
|
151 |
print(f" - Updating mapping with {response}")
|
152 |
db_cursor.execute(sql, (response, input_word))
|
@@ -158,8 +158,18 @@ for row in results:
|
|
158 |
}
|
159 |
elif confirm.lower() == 'i':
|
160 |
print(f" - Ignoring mapping")
|
161 |
-
sql = "UPDATE mappings SET
|
162 |
-
db_cursor.execute(sql, (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
db_conn.commit()
|
164 |
else:
|
165 |
db_cursor.execute("UPDATE mappings SET reviewed = true WHERE input_word = %s", (input_word,))
|
|
|
140 |
print(" - Mapping is already marked as a Non-Food Item")
|
141 |
confirm = 'y'
|
142 |
else:
|
143 |
+
confirm = input("Press 'y' to confirm, 'i' to ignore, 'd' to delete, 'm' for mixture, any other key to skip: ")
|
144 |
|
145 |
if confirm.lower() == 'y':
|
146 |
if response == 'Non-Food Item':
|
147 |
sql = "UPDATE mappings SET dictionary_word = %s, is_food = FALSE, reviewed = true WHERE input_word = %s"
|
148 |
else:
|
149 |
+
sql = "UPDATE mappings SET dictionary_word = %s, reviewed = true, is_food = true WHERE input_word = %s"
|
150 |
|
151 |
print(f" - Updating mapping with {response}")
|
152 |
db_cursor.execute(sql, (response, input_word))
|
|
|
158 |
}
|
159 |
elif confirm.lower() == 'i':
|
160 |
print(f" - Ignoring mapping")
|
161 |
+
sql = "UPDATE mappings SET ignore = true, reviewed = true WHERE input_word = %s"
|
162 |
+
db_cursor.execute(sql, (input_word,))
|
163 |
+
db_conn.commit()
|
164 |
+
elif confirm.lower() == 'd':
|
165 |
+
print(f" - Deleting mapping")
|
166 |
+
sql = "DELETE FROM mappings WHERE input_word = %s"
|
167 |
+
db_cursor.execute(sql, (input_word,))
|
168 |
+
db_conn.commit()
|
169 |
+
elif confirm.lower() == 'm':
|
170 |
+
print(f" - Mixed food items")
|
171 |
+
sql = "UPDATE mappings SET reviewed = true, dictionary_word = 'Mixed Food Items', is_food = true WHERE input_word = %s"
|
172 |
+
db_cursor.execute(sql, (input_word,))
|
173 |
db_conn.commit()
|
174 |
else:
|
175 |
db_cursor.execute("UPDATE mappings SET reviewed = true WHERE input_word = %s", (input_word,))
|
food_nonfood.py
CHANGED
@@ -3,7 +3,7 @@ import numpy as np
|
|
3 |
import torch
|
4 |
import logging
|
5 |
from transformers import pipeline
|
6 |
-
|
7 |
# Load a pre-trained SBERT model
|
8 |
|
9 |
# Set seeds for reproducibility of zero-shot classification
|
@@ -17,8 +17,11 @@ def set_seed(seed):
|
|
17 |
|
18 |
set_seed(1)
|
19 |
|
|
|
|
|
20 |
# Load a pre-trained model and tokenizer
|
21 |
classifier = pipeline("zero-shot-classification", model="roberta-large-mnli")
|
|
|
22 |
|
23 |
# Classify item as food or non-food
|
24 |
def classify_as_food_nonfood(item):
|
@@ -40,6 +43,16 @@ def classify_as_food_nonfood(item):
|
|
40 |
label = "food"
|
41 |
score = drink_score
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
# logging.info(f"Item: {item}, Label: {label}, Score: {score}")
|
44 |
return label, score
|
45 |
|
|
|
3 |
import torch
|
4 |
import logging
|
5 |
from transformers import pipeline
|
6 |
+
from autocorrect import Speller
|
7 |
# Load a pre-trained SBERT model
|
8 |
|
9 |
# Set seeds for reproducibility of zero-shot classification
|
|
|
17 |
|
18 |
set_seed(1)
|
19 |
|
20 |
+
|
21 |
+
|
22 |
# Load a pre-trained model and tokenizer
|
23 |
classifier = pipeline("zero-shot-classification", model="roberta-large-mnli")
|
24 |
+
spell = Speller()
|
25 |
|
26 |
# Classify item as food or non-food
|
27 |
def classify_as_food_nonfood(item):
|
|
|
43 |
label = "food"
|
44 |
score = drink_score
|
45 |
|
46 |
+
# try correcting the spelling
|
47 |
+
if label == "non-food":
|
48 |
+
spell_fix_item = spell(cleaned_item)
|
49 |
+
result = classifier(cleaned_item, candidate_labels=["food", "non-food"])
|
50 |
+
food_label = result["labels"][0]
|
51 |
+
food_score = result["scores"][0]
|
52 |
+
if food_label == "food" and food_score >= 0.7:
|
53 |
+
label = "food"
|
54 |
+
score = food_score
|
55 |
+
|
56 |
# logging.info(f"Item: {item}, Label: {label}, Score: {score}")
|
57 |
return label, score
|
58 |
|
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
--extra-index-url https://download.pytorch.org/whl/cu113
|
|
|
2 |
gradio==4.36.1
|
3 |
mistralai==0.4.0
|
4 |
numpy==1.26.4
|
|
|
1 |
--extra-index-url https://download.pytorch.org/whl/cu113
|
2 |
+
autocorrect==2.6.1
|
3 |
gradio==4.36.1
|
4 |
mistralai==0.4.0
|
5 |
numpy==1.26.4
|
run.py
CHANGED
@@ -14,8 +14,11 @@ if __name__ == "__main__":
|
|
14 |
|
15 |
# get all files in the raw folder and iterate through them
|
16 |
raw_files = os.listdir('./raw')
|
17 |
-
for raw_file_name in raw_files:
|
18 |
# for raw_file_name in ['test.csv']:
|
|
|
|
|
|
|
|
|
19 |
# chop off the extension for the results run key
|
20 |
# result_file_name = raw_file_name.split('.')[0]
|
21 |
# run_key = f"{result_file_name}-{int(time.time())}"
|
|
|
14 |
|
15 |
# get all files in the raw folder and iterate through them
|
16 |
raw_files = os.listdir('./raw')
|
|
|
17 |
# for raw_file_name in ['test.csv']:
|
18 |
+
for raw_file_name in raw_files:
|
19 |
+
if not raw_file_name.endswith('.csv'):
|
20 |
+
continue
|
21 |
+
|
22 |
# chop off the extension for the results run key
|
23 |
# result_file_name = raw_file_name.split('.')[0]
|
24 |
# run_key = f"{result_file_name}-{int(time.time())}"
|
similarity_fast.py
CHANGED
@@ -3,7 +3,6 @@ import re
|
|
3 |
import pickle
|
4 |
import pandas as pd
|
5 |
import requests
|
6 |
-
from tqdm import tqdm
|
7 |
from sentence_transformers import SentenceTransformer, util
|
8 |
from db.db_utils import get_connection, initialize_db, store_mapping_to_db
|
9 |
from food_nonfood import classify_as_food_nonfood
|
|
|
3 |
import pickle
|
4 |
import pandas as pd
|
5 |
import requests
|
|
|
6 |
from sentence_transformers import SentenceTransformer, util
|
7 |
from db.db_utils import get_connection, initialize_db, store_mapping_to_db
|
8 |
from food_nonfood import classify_as_food_nonfood
|