Spaces:
Paused
Paused
Commit
•
d93f20c
1
Parent(s):
dc79224
integrate a specificity
Browse files- algo.py +11 -6
- ask_gpt.py +1 -1
- chatgpt_audit.py +3 -5
- chatgpt_audit2.py +3 -3
- db/db_utils.py +5 -3
- item_or_category.py +0 -35
- mapping_template.py +8 -2
- post_import_updates.py +2 -2
- specificity_classifier.py +212 -0
algo.py
CHANGED
@@ -15,7 +15,7 @@ from ask_gpt import query_gpt
|
|
15 |
from multi_food_item_detector import extract_items, has_delimiters
|
16 |
from mapping_template import empty_template, heterogeneous_template, multi_item_template, nonfood_template, usda_template
|
17 |
from tasks import insert_result
|
18 |
-
|
19 |
|
20 |
logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
|
21 |
similarity_threshold = 0.78
|
@@ -81,10 +81,10 @@ class Algo:
|
|
81 |
}
|
82 |
)
|
83 |
return mapping
|
84 |
-
elif gpt_recommended_word == '
|
85 |
mapping.update(
|
86 |
{
|
87 |
-
'dictionary_word': '
|
88 |
'confidence_score': 1.0
|
89 |
}
|
90 |
)
|
@@ -213,6 +213,11 @@ class Algo:
|
|
213 |
|
214 |
mapping = self.perform_mapping(input_word)
|
215 |
|
|
|
|
|
|
|
|
|
|
|
216 |
food_nonfood_pessimistic = pessimistic_food_nonfood_score(food_nonfood, mapping['similarity_score'])
|
217 |
mapping.update({
|
218 |
'is_food': food_nonfood_pessimistic[0],
|
@@ -298,7 +303,7 @@ class Algo:
|
|
298 |
|
299 |
def match_words(self, input_data):
|
300 |
# input_data is a list of tuples, where each tuple is (description, donor)
|
301 |
-
|
302 |
result_batch = []
|
303 |
for input_item in tqdm(input_data, desc="Processing input words"):
|
304 |
input_word = input_item[0]
|
@@ -333,7 +338,7 @@ class Algo:
|
|
333 |
})
|
334 |
result_batch.append(mapping)
|
335 |
# store_result_to_db(self.db_cursor, self.db_conn, self.run_key, mapping)
|
336 |
-
|
337 |
|
338 |
if len(result_batch) >= 100:
|
339 |
insert_result.delay(self.run_key, result_batch)
|
@@ -344,4 +349,4 @@ class Algo:
|
|
344 |
result_batch = []
|
345 |
|
346 |
|
347 |
-
|
|
|
15 |
from multi_food_item_detector import extract_items, has_delimiters
|
16 |
from mapping_template import empty_template, heterogeneous_template, multi_item_template, nonfood_template, usda_template
|
17 |
from tasks import insert_result
|
18 |
+
from specificity_classifier import classify_text_to_specificity
|
19 |
|
20 |
logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
|
21 |
similarity_threshold = 0.78
|
|
|
81 |
}
|
82 |
)
|
83 |
return mapping
|
84 |
+
elif gpt_recommended_word == 'Heterogeneous Mixture':
|
85 |
mapping.update(
|
86 |
{
|
87 |
+
'dictionary_word': 'Heterogeneous Mixture', 'similarity_score': 1.0,
|
88 |
'confidence_score': 1.0
|
89 |
}
|
90 |
)
|
|
|
213 |
|
214 |
mapping = self.perform_mapping(input_word)
|
215 |
|
216 |
+
specificity = classify_text_to_specificity(input_word_clean)
|
217 |
+
mapping.update({
|
218 |
+
'specificity': specificity
|
219 |
+
})
|
220 |
+
|
221 |
food_nonfood_pessimistic = pessimistic_food_nonfood_score(food_nonfood, mapping['similarity_score'])
|
222 |
mapping.update({
|
223 |
'is_food': food_nonfood_pessimistic[0],
|
|
|
303 |
|
304 |
def match_words(self, input_data):
|
305 |
# input_data is a list of tuples, where each tuple is (description, donor)
|
306 |
+
results = []
|
307 |
result_batch = []
|
308 |
for input_item in tqdm(input_data, desc="Processing input words"):
|
309 |
input_word = input_item[0]
|
|
|
338 |
})
|
339 |
result_batch.append(mapping)
|
340 |
# store_result_to_db(self.db_cursor, self.db_conn, self.run_key, mapping)
|
341 |
+
results.append(mapping)
|
342 |
|
343 |
if len(result_batch) >= 100:
|
344 |
insert_result.delay(self.run_key, result_batch)
|
|
|
349 |
result_batch = []
|
350 |
|
351 |
|
352 |
+
return results
|
ask_gpt.py
CHANGED
@@ -20,7 +20,7 @@ def query_gpt(food_item):
|
|
20 |
f"Make sure you're accurate about whether it is cooked, prepared, etc or not.\n\n"
|
21 |
f"But if its an obscure food, you can come up with a extremely similar food item that is similar in DMC.\n\n"
|
22 |
f"If it's not a food item, return 'Non-Food Item'.\n\n"
|
23 |
-
f"If it's a generic term like 'Mixture of foods', just say: '
|
24 |
f"If it's not a food item, but a broad category like 'Various Produce', just say: 'Broad Category'.\n\n"
|
25 |
f"You should respond in json format with an object that has the key `guess`, and the value is the most similar food item.\n\n"
|
26 |
f"The food item is: \"{food_item}\""
|
|
|
20 |
f"Make sure you're accurate about whether it is cooked, prepared, etc or not.\n\n"
|
21 |
f"But if its an obscure food, you can come up with a extremely similar food item that is similar in DMC.\n\n"
|
22 |
f"If it's not a food item, return 'Non-Food Item'.\n\n"
|
23 |
+
f"If it's a generic term like 'Mixture of foods', or 'grocery items' just say: 'Heterogeneous Mixture'.\n\n"
|
24 |
f"If it's not a food item, but a broad category like 'Various Produce', just say: 'Broad Category'.\n\n"
|
25 |
f"You should respond in json format with an object that has the key `guess`, and the value is the most similar food item.\n\n"
|
26 |
f"The food item is: \"{food_item}\""
|
chatgpt_audit.py
CHANGED
@@ -41,9 +41,7 @@ def query_gpt(food_item, dictionary_word, similar_words):
|
|
41 |
prompt = (
|
42 |
f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping?
|
43 |
|
44 |
-
Generally, you should prefer the mapped word, but if you believe there is a better fit, please
|
45 |
-
|
46 |
-
I will also provide a list of other similar words that you could be a better fit.
|
47 |
|
48 |
This is important: only return a word from the list of words I provide.
|
49 |
|
@@ -167,8 +165,8 @@ for row in results:
|
|
167 |
db_cursor.execute(sql, (input_word,))
|
168 |
db_conn.commit()
|
169 |
elif confirm.lower() == 'm':
|
170 |
-
print(f" -
|
171 |
-
sql = "UPDATE mappings SET reviewed = true, dictionary_word = '
|
172 |
db_cursor.execute(sql, (input_word,))
|
173 |
db_conn.commit()
|
174 |
else:
|
|
|
41 |
prompt = (
|
42 |
f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping?
|
43 |
|
44 |
+
Generally, you should prefer the mapped word, but if you believe there is a better fit from provided list of similar words, please specify it.
|
|
|
|
|
45 |
|
46 |
This is important: only return a word from the list of words I provide.
|
47 |
|
|
|
165 |
db_cursor.execute(sql, (input_word,))
|
166 |
db_conn.commit()
|
167 |
elif confirm.lower() == 'm':
|
168 |
+
print(f" - Heterogeneous Mixture")
|
169 |
+
sql = "UPDATE mappings SET reviewed = true, dictionary_word = 'Heterogeneous Mixture', is_food = true WHERE input_word = %s"
|
170 |
db_cursor.execute(sql, (input_word,))
|
171 |
db_conn.commit()
|
172 |
else:
|
chatgpt_audit2.py
CHANGED
@@ -103,7 +103,7 @@ for row in results:
|
|
103 |
else:
|
104 |
similar_words_list = mapping['similar_words'].split('|')
|
105 |
similar_words_list.append('Non-Food Item')
|
106 |
-
similar_words_list.append('
|
107 |
|
108 |
response = query_gpt(input_word, dictionary_word, similar_words_list)
|
109 |
if response:
|
@@ -138,8 +138,8 @@ for row in results:
|
|
138 |
db_cursor.execute(sql, (input_word,))
|
139 |
db_conn.commit()
|
140 |
elif confirm.lower() == 'm':
|
141 |
-
print(f" -
|
142 |
-
sql = "UPDATE mappings SET reviewed = true, dictionary_word = '
|
143 |
db_cursor.execute(sql, (input_word,))
|
144 |
db_conn.commit()
|
145 |
else:
|
|
|
103 |
else:
|
104 |
similar_words_list = mapping['similar_words'].split('|')
|
105 |
similar_words_list.append('Non-Food Item')
|
106 |
+
similar_words_list.append('Heterogeneous Mixture')
|
107 |
|
108 |
response = query_gpt(input_word, dictionary_word, similar_words_list)
|
109 |
if response:
|
|
|
138 |
db_cursor.execute(sql, (input_word,))
|
139 |
db_conn.commit()
|
140 |
elif confirm.lower() == 'm':
|
141 |
+
print(f" - Heterogeneous Mixture")
|
142 |
+
sql = "UPDATE mappings SET reviewed = true, dictionary_word = 'Heterogeneous Mixture', is_food = true WHERE input_word = %s"
|
143 |
db_cursor.execute(sql, (input_word,))
|
144 |
db_conn.commit()
|
145 |
else:
|
db/db_utils.py
CHANGED
@@ -20,6 +20,7 @@ def initialize_db(conn):
|
|
20 |
input_word TEXT PRIMARY KEY,
|
21 |
cleaned_word TEXT,
|
22 |
dictionary_word TEXT,
|
|
|
23 |
similarity_score REAL,
|
24 |
confidence_score REAL,
|
25 |
similar_words TEXT,
|
@@ -155,8 +156,8 @@ def store_mapping_to_db(cursor, conn, mapping):
|
|
155 |
logging.info(f" - Storing new mapping to db: {mapping}")
|
156 |
try:
|
157 |
cursor.execute('''
|
158 |
-
INSERT INTO mappings (input_word, cleaned_word, dictionary_word, similarity_score, confidence_score, similar_words, is_food, food_nonfood_score)
|
159 |
-
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
|
160 |
''', (
|
161 |
mapping['input_word'],
|
162 |
mapping['cleaned_word'],
|
@@ -165,7 +166,8 @@ def store_mapping_to_db(cursor, conn, mapping):
|
|
165 |
mapping['confidence_score'],
|
166 |
mapping['similar_words'],
|
167 |
mapping['is_food'],
|
168 |
-
mapping['food_nonfood_score']
|
|
|
169 |
))
|
170 |
conn.commit()
|
171 |
except Exception as e:
|
|
|
20 |
input_word TEXT PRIMARY KEY,
|
21 |
cleaned_word TEXT,
|
22 |
dictionary_word TEXT,
|
23 |
+
specificity TEXT,
|
24 |
similarity_score REAL,
|
25 |
confidence_score REAL,
|
26 |
similar_words TEXT,
|
|
|
156 |
logging.info(f" - Storing new mapping to db: {mapping}")
|
157 |
try:
|
158 |
cursor.execute('''
|
159 |
+
INSERT INTO mappings (input_word, cleaned_word, dictionary_word, similarity_score, confidence_score, similar_words, is_food, food_nonfood_score, specificity)
|
160 |
+
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
|
161 |
''', (
|
162 |
mapping['input_word'],
|
163 |
mapping['cleaned_word'],
|
|
|
166 |
mapping['confidence_score'],
|
167 |
mapping['similar_words'],
|
168 |
mapping['is_food'],
|
169 |
+
mapping['food_nonfood_score'],
|
170 |
+
mapping['specificity']
|
171 |
))
|
172 |
conn.commit()
|
173 |
except Exception as e:
|
item_or_category.py
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
import random
|
2 |
-
import numpy as np
|
3 |
-
import torch
|
4 |
-
import logging
|
5 |
-
from transformers import pipeline
|
6 |
-
from autocorrect import Speller
|
7 |
-
# Load a pre-trained SBERT model
|
8 |
-
|
9 |
-
# Set seeds for reproducibility of zero-shot classification
|
10 |
-
def set_seed(seed):
|
11 |
-
random.seed(seed)
|
12 |
-
np.random.seed(seed)
|
13 |
-
torch.manual_seed(seed)
|
14 |
-
torch.cuda.manual_seed_all(seed)
|
15 |
-
torch.backends.cudnn.deterministic = True
|
16 |
-
torch.backends.cudnn.benchmark = False
|
17 |
-
|
18 |
-
set_seed(1)
|
19 |
-
|
20 |
-
|
21 |
-
# Load a pre-trained model and tokenizer
|
22 |
-
classifier = pipeline("zero-shot-classification", model="roberta-large-mnli")
|
23 |
-
spell = Speller()
|
24 |
-
|
25 |
-
# Classify item as food or non-food
|
26 |
-
def classify_as_item_or_category(item):
|
27 |
-
cleaned_item = item.strip().lower()
|
28 |
-
spell_fix_item = spell(cleaned_item)
|
29 |
-
result = classifier(spell_fix_item, candidate_labels=["single food item", "food category"])
|
30 |
-
label = result["labels"][0]
|
31 |
-
score = result["scores"][0]
|
32 |
-
|
33 |
-
# logging.info(f"Item: {item}, Label: {label}, Score: {score}")
|
34 |
-
return label, score
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mapping_template.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
from utils import clean_word
|
2 |
|
3 |
-
def generic_template(input_word, cleaned_word=None, similarity_score=None, confidence_score=None, similar_words=None, is_food=None, food_nonfood_score=None, dictionary_word=None, sr_legacy_food_category=None, wweia_category=None, dry_matter_content=None, water_content=None, leakage=None):
|
4 |
if cleaned_word is None:
|
5 |
cleaned_word = clean_word(input_word)
|
6 |
|
@@ -18,6 +18,7 @@ def generic_template(input_word, cleaned_word=None, similarity_score=None, confi
|
|
18 |
'dry_matter_content': dry_matter_content,
|
19 |
'water_content': water_content,
|
20 |
'leakage': leakage,
|
|
|
21 |
}
|
22 |
|
23 |
def empty_template(input_word, cleaned_word=None):
|
@@ -38,6 +39,7 @@ def empty_template(input_word, cleaned_word=None):
|
|
38 |
'dry_matter_content': None,
|
39 |
'water_content': None,
|
40 |
'leakage': None,
|
|
|
41 |
}
|
42 |
|
43 |
def usda_template(input_word, cleaned_word=None):
|
@@ -58,6 +60,7 @@ def usda_template(input_word, cleaned_word=None):
|
|
58 |
'dry_matter_content': None,
|
59 |
'water_content': None,
|
60 |
'leakage': None,
|
|
|
61 |
}
|
62 |
|
63 |
def nonfood_template(input_word, cleaned_word=None, food_nonfood_score=None, similar_words=None):
|
@@ -78,6 +81,7 @@ def nonfood_template(input_word, cleaned_word=None, food_nonfood_score=None, sim
|
|
78 |
'dry_matter_content': 0,
|
79 |
'water_content': 0,
|
80 |
'leakage': 0,
|
|
|
81 |
}
|
82 |
|
83 |
def heterogeneous_template(input_word, cleaned_word=None):
|
@@ -98,6 +102,7 @@ def heterogeneous_template(input_word, cleaned_word=None):
|
|
98 |
'dry_matter_content': 0.27,
|
99 |
'water_content': 0.73,
|
100 |
'leakage': 0.1
|
|
|
101 |
}
|
102 |
|
103 |
def multi_item_template(input_word, cleaned_word=None, conservative_mapping=None):
|
@@ -117,6 +122,7 @@ def multi_item_template(input_word, cleaned_word=None, conservative_mapping=None
|
|
117 |
'sr_legacy_food_category': conservative_mapping['sr_legacy_food_category'],
|
118 |
'dry_matter_content': conservative_mapping['dry_matter_content'],
|
119 |
'water_content': conservative_mapping['water_content'],
|
120 |
-
'leakage': conservative_mapping['leakage']
|
|
|
121 |
}
|
122 |
|
|
|
1 |
from utils import clean_word
|
2 |
|
3 |
+
def generic_template(input_word, cleaned_word=None, similarity_score=None, confidence_score=None, similar_words=None, is_food=None, food_nonfood_score=None, dictionary_word=None, sr_legacy_food_category=None, wweia_category=None, dry_matter_content=None, water_content=None, leakage=None, specificity=None):
|
4 |
if cleaned_word is None:
|
5 |
cleaned_word = clean_word(input_word)
|
6 |
|
|
|
18 |
'dry_matter_content': dry_matter_content,
|
19 |
'water_content': water_content,
|
20 |
'leakage': leakage,
|
21 |
+
'specificity': specificity
|
22 |
}
|
23 |
|
24 |
def empty_template(input_word, cleaned_word=None):
|
|
|
39 |
'dry_matter_content': None,
|
40 |
'water_content': None,
|
41 |
'leakage': None,
|
42 |
+
'specificity': None
|
43 |
}
|
44 |
|
45 |
def usda_template(input_word, cleaned_word=None):
|
|
|
60 |
'dry_matter_content': None,
|
61 |
'water_content': None,
|
62 |
'leakage': None,
|
63 |
+
'specificity': None
|
64 |
}
|
65 |
|
66 |
def nonfood_template(input_word, cleaned_word=None, food_nonfood_score=None, similar_words=None):
|
|
|
81 |
'dry_matter_content': 0,
|
82 |
'water_content': 0,
|
83 |
'leakage': 0,
|
84 |
+
'specificity': None
|
85 |
}
|
86 |
|
87 |
def heterogeneous_template(input_word, cleaned_word=None):
|
|
|
102 |
'dry_matter_content': 0.27,
|
103 |
'water_content': 0.73,
|
104 |
'leakage': 0.1
|
105 |
+
'specificity': 'Heterogeneous Mixture'
|
106 |
}
|
107 |
|
108 |
def multi_item_template(input_word, cleaned_word=None, conservative_mapping=None):
|
|
|
122 |
'sr_legacy_food_category': conservative_mapping['sr_legacy_food_category'],
|
123 |
'dry_matter_content': conservative_mapping['dry_matter_content'],
|
124 |
'water_content': conservative_mapping['water_content'],
|
125 |
+
'leakage': conservative_mapping['leakage'],
|
126 |
+
'specificity': conservative_mapping['specificity']
|
127 |
}
|
128 |
|
post_import_updates.py
CHANGED
@@ -118,13 +118,13 @@ for item in categories:
|
|
118 |
input_word = f"{qualifier} {category}"
|
119 |
print(f"Storing {input_word}")
|
120 |
cleaned_word = clean_word(input_word)
|
121 |
-
mapping = generic_template(input_word, cleaned_word, 1, 1, None, True, 1, category, category, dry_matter_content, water_content, leakage)
|
122 |
store_mapping_to_db(db_cursor, db_conn, mapping)
|
123 |
|
124 |
input_word = f"{category} {qualifier}"
|
125 |
print(f"Storing {input_word}")
|
126 |
cleaned_word = clean_word(input_word)
|
127 |
-
mapping = generic_template(input_word, cleaned_word, 1, 1, None, True, 1, category, category, dry_matter_content, water_content, leakage)
|
128 |
store_mapping_to_db(db_cursor, db_conn, mapping)
|
129 |
|
130 |
|
|
|
118 |
input_word = f"{qualifier} {category}"
|
119 |
print(f"Storing {input_word}")
|
120 |
cleaned_word = clean_word(input_word)
|
121 |
+
# Pass by keyword: generic_template takes wweia_category between
# sr_legacy_food_category and dry_matter_content, so the positional form
# shifted every later value one slot early and "Category" ended up in
# `leakage` instead of `specificity`.
mapping = generic_template(
    input_word,
    cleaned_word,
    similarity_score=1,
    confidence_score=1,
    similar_words=None,
    is_food=True,
    food_nonfood_score=1,
    dictionary_word=category,
    sr_legacy_food_category=category,
    dry_matter_content=dry_matter_content,
    water_content=water_content,
    leakage=leakage,
    specificity="Category",
)
|
122 |
store_mapping_to_db(db_cursor, db_conn, mapping)
|
123 |
|
124 |
input_word = f"{category} {qualifier}"
|
125 |
print(f"Storing {input_word}")
|
126 |
cleaned_word = clean_word(input_word)
|
127 |
+
# Pass by keyword: generic_template takes wweia_category between
# sr_legacy_food_category and dry_matter_content, so the positional form
# shifted every later value one slot early and "Category" ended up in
# `leakage` instead of `specificity`.
mapping = generic_template(
    input_word,
    cleaned_word,
    similarity_score=1,
    confidence_score=1,
    similar_words=None,
    is_food=True,
    food_nonfood_score=1,
    dictionary_word=category,
    sr_legacy_food_category=category,
    dry_matter_content=dry_matter_content,
    water_content=water_content,
    leakage=leakage,
    specificity="Category",
)
|
128 |
store_mapping_to_db(db_cursor, db_conn, mapping)
|
129 |
|
130 |
|
specificity_classifier.py
ADDED
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn.ensemble import RandomForestClassifier
|
2 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
3 |
+
from sklearn.pipeline import make_pipeline
|
4 |
+
|
5 |
+
training_data = [
|
6 |
+
("Bananas", "Specific"),
|
7 |
+
("misc groceries", "Heterogeneous Mixture"),
|
8 |
+
("Produce", "Category"),
|
9 |
+
("Xoconostle", "Specific"),
|
10 |
+
("Banana, hot chocolate & chips", "Specific"),
|
11 |
+
("assorted apples", "Specific"),
|
12 |
+
("breakfast foods", "Heterogeneous Mixture"),
|
13 |
+
("General Groceries", "Heterogeneous Mixture"),
|
14 |
+
("Grocery Assortment", "Heterogeneous Mixture"),
|
15 |
+
("Assorted Grocery Items", "Heterogeneous Mixture"),
|
16 |
+
("Assorted Heterogeneous Mixture", "Heterogeneous Mixture"),
|
17 |
+
("Assorted Non-Perishables", "Heterogeneous Mixture"),
|
18 |
+
("Assorted Packaged Foods", "Heterogeneous Mixture"),
|
19 |
+
("Assorted Perishables", "Heterogeneous Mixture"),
|
20 |
+
("Box Heterogeneous Mixture", "Heterogeneous Mixture"),
|
21 |
+
("Bundle Heterogeneous Mixture", "Heterogeneous Mixture"),
|
22 |
+
("Collection Heterogeneous Mixture", "Heterogeneous Mixture"),
|
23 |
+
("Combo Heterogeneous Mixture", "Heterogeneous Mixture"),
|
24 |
+
("Food Item Collection", "Heterogeneous Mixture"),
|
25 |
+
("Food Item Mix", "Heterogeneous Mixture"),
|
26 |
+
("Food Variety Pack", "Heterogeneous Mixture"),
|
27 |
+
("General Groceries", "Heterogeneous Mixture"),
|
28 |
+
("Grocery Assortment", "Heterogeneous Mixture"),
|
29 |
+
("Grocery Combo Pack", "Heterogeneous Mixture"),
|
30 |
+
("Grocery Mix", "Heterogeneous Mixture"),
|
31 |
+
("Grocery Selection", "Heterogeneous Mixture"),
|
32 |
+
("Grocery Variety Box","Heterogeneous Mixture"),
|
33 |
+
("Various Items", "Heterogeneous Mixture"),
|
34 |
+
('almond', 'Specific'),
|
35 |
+
('Almond - Fresh Almond', 'Specific'),
|
36 |
+
('Apple ', 'Specific'),
|
37 |
+
('Apple Other', 'Specific'),
|
38 |
+
('Apple - Granny Smith Apple', 'Specific'),
|
39 |
+
('Apricot', 'Specific'),
|
40 |
+
('Artichoke', 'Specific'),
|
41 |
+
('asparagus', 'Specific'),
|
42 |
+
('Assorted Beans and Lentils', 'Specific'),
|
43 |
+
('Assorted Condiments and Sauces', 'Specific'),
|
44 |
+
('Avocado', 'Specific'),
|
45 |
+
('Baby Corn (10 lbs )', 'Specific'),
|
46 |
+
('Banana', 'Specific'),
|
47 |
+
('Banana - Burro Banana', 'Specific'),
|
48 |
+
('Banana - Plantain', 'Specific'),
|
49 |
+
('Banana - Thai Banana', 'Specific'),
|
50 |
+
('Banana leaf', 'Specific'),
|
51 |
+
('Basil', 'Specific'),
|
52 |
+
('Basil - Thai Basil', 'Specific'),
|
53 |
+
('Bean - Fava Bean', 'Specific'),
|
54 |
+
('Bean - Garbanzo Bean', 'Specific'),
|
55 |
+
('Bean - Green Bean', 'Specific'),
|
56 |
+
('Bean Green Bean', 'Specific'),
|
57 |
+
('Bean green beann', 'Specific'),
|
58 |
+
('Bean Romano Bean', 'Specific'),
|
59 |
+
('Bean- French Bean (10 Lbs )', 'Specific'),
|
60 |
+
('Beet - Red Beet', 'Specific'),
|
61 |
+
('Beet - Yellow Beet', 'Specific'),
|
62 |
+
('beet Warter melon beet', 'Specific'),
|
63 |
+
('bell peppers', 'Specific'),
|
64 |
+
('Berry - Blackberry', 'Specific'),
|
65 |
+
('Berry - Blueberry', 'Specific'),
|
66 |
+
('Onion - White Onion', 'Specific'),
|
67 |
+
('Onion Green onion iceless', 'Specific'),
|
68 |
+
('Onion-Mexican Green Onion', 'Specific'),
|
69 |
+
('Orange ', 'Specific'),
|
70 |
+
('Orange - Mandarin Orange', 'Specific'),
|
71 |
+
('Orange - Minneola Orange', 'Specific'),
|
72 |
+
('Orange Mandarine (10 Lbs )', 'Specific'),
|
73 |
+
('Organic Ginger', 'Specific'),
|
74 |
+
('Organic Mango', 'Specific'),
|
75 |
+
('Organic Onion - Brown Onion', 'Specific'),
|
76 |
+
('Beef Products Combo', 'Category'),
|
77 |
+
('Collection Beef Products', 'Category'),
|
78 |
+
('Beef Products Collection', 'Category'),
|
79 |
+
('Selection Beef Products', 'Category'),
|
80 |
+
('Beef Products Selection', 'Category'),
|
81 |
+
('Bundle Beef Products', 'Category'),
|
82 |
+
('Beef Products Bundle', 'Category'),
|
83 |
+
('Pack Beef Products', 'Category'),
|
84 |
+
('Beef Products Pack', 'Category'),
|
85 |
+
('Box Beef Products', 'Category'),
|
86 |
+
('Beef Products Box', 'Category'),
|
87 |
+
('Various Beef Products', 'Category'),
|
88 |
+
('Beef Products Various', 'Category'),
|
89 |
+
('Miscellaneous Beef Products', 'Category'),
|
90 |
+
('Assorted Beverages', 'Category'),
|
91 |
+
('Beverages Assorted', 'Category'),
|
92 |
+
('Mixed Beverages', 'Category'),
|
93 |
+
('Beverages Mixed', 'Category'),
|
94 |
+
('Variety Beverages', 'Category'),
|
95 |
+
('Beverages Variety', 'Category'),
|
96 |
+
('Combo Beverages', 'Category'),
|
97 |
+
('Beverages Combo', 'Category'),
|
98 |
+
('Collection Beverages', 'Category'),
|
99 |
+
('Beverages Collection', 'Category'),
|
100 |
+
('Selection Beverages', 'Category'),
|
101 |
+
('Beverages Selection', 'Category'),
|
102 |
+
('Bundle Beverages', 'Category'),
|
103 |
+
('Beverages Bundle', 'Category'),
|
104 |
+
('Pack Beverages', 'Category'),
|
105 |
+
('Beverages Pack', 'Category'),
|
106 |
+
('Box Beverages', 'Category'),
|
107 |
+
('Beverages Box', 'Category'),
|
108 |
+
('Various Beverages', 'Category'),
|
109 |
+
('Beverages Various', 'Category'),
|
110 |
+
('Miscellaneous Beverages', 'Category'),
|
111 |
+
('Beverages Miscellaneous', 'Category'),
|
112 |
+
('Misc Beverages', 'Category'),
|
113 |
+
('Beverages Misc', 'Category'),
|
114 |
+
('Mixture Beverages', 'Category'),
|
115 |
+
('Beverages Mixture', 'Category'),
|
116 |
+
('Bundle Breakfast Cereals', 'Category'),
|
117 |
+
('Breakfast Cereals Bundle', 'Category'),
|
118 |
+
('Pack Breakfast Cereals', 'Category'),
|
119 |
+
('Breakfast Cereals Pack', 'Category'),
|
120 |
+
('Box Breakfast Cereals', 'Category'),
|
121 |
+
('Breakfast Cereals Box', 'Category'),
|
122 |
+
('Various Breakfast Cereals', 'Category'),
|
123 |
+
('Breakfast Cereals Various', 'Category'),
|
124 |
+
('Assorted Breakfast Cereals', 'Category'),
|
125 |
+
('Breakfast Cereals Assorted', 'Category'),
|
126 |
+
('Miscellaneous Breakfast Cereals', 'Category'),
|
127 |
+
('Breakfast Cereals Miscellaneous', 'Category'),
|
128 |
+
('Misc Breakfast Cereals', 'Category'),
|
129 |
+
('Breakfast Cereals Misc', 'Category'),
|
130 |
+
('Mixture Breakfast Cereals', 'Category'),
|
131 |
+
('Breakfast Cereals Mixture', 'Category'),
|
132 |
+
('Mixed Breakfast Cereals', 'Category'),
|
133 |
+
('Breakfast Cereals Mixed', 'Category'),
|
134 |
+
('Variety Breakfast Cereals', 'Category'),
|
135 |
+
('Pack Fats and Oils', 'Category'),
|
136 |
+
('Fats and Oils Pack', 'Category'),
|
137 |
+
('Box Fats and Oils', 'Category'),
|
138 |
+
('Fats and Oils Box', 'Category'),
|
139 |
+
('Various Fats and Oils', 'Category'),
|
140 |
+
('Fats and Oils Various', 'Category'),
|
141 |
+
('Miscellaneous Fats and Oils', 'Category'),
|
142 |
+
('Meals, Entrees, and Side Dishes Various', 'Category'),
|
143 |
+
('Miscellaneous Meals, Entrees, and Side Dishes', 'Category'),
|
144 |
+
('Meals, Entrees, and Side Dishes Miscellaneous', 'Category'),
|
145 |
+
('Misc Meals, Entrees, and Side Dishes', 'Category'),
|
146 |
+
('Meals, Entrees, and Side Dishes Misc', 'Category'),
|
147 |
+
('Mixture Meals, Entrees, and Side Dishes', 'Category'),
|
148 |
+
('Meals, Entrees, and Side Dishes Mixture', 'Category'),
|
149 |
+
('Misc Non-Food Item', 'Category'),
|
150 |
+
('Nut and Seed Products Selection', 'Category'),
|
151 |
+
('Bundle Nut and Seed Products', 'Category'),
|
152 |
+
('Nut and Seed Products Bundle', 'Category'),
|
153 |
+
('Pack Nut and Seed Products', 'Category'),
|
154 |
+
('Nut and Seed Products Pack', 'Category'),
|
155 |
+
('Box Nut and Seed Products', 'Category'),
|
156 |
+
('Nut and Seed Products Box', 'Category'),
|
157 |
+
('Various Nut and Seed Products', 'Category'),
|
158 |
+
('Poultry Products Mixed', 'Category'),
|
159 |
+
('Variety Poultry Products', 'Category'),
|
160 |
+
('Poultry Products Variety', 'Category'),
|
161 |
+
('Combo Poultry Products', 'Category'),
|
162 |
+
('Poultry Products Combo', 'Category'),
|
163 |
+
('Sausages and Luncheon Meats Various', 'Category'),
|
164 |
+
('Miscellaneous Sausages and Luncheon Meats', 'Category'),
|
165 |
+
('Sausages and Luncheon Meats Miscellaneous', 'Category'),
|
166 |
+
('Misc Sausages and Luncheon Meats', 'Category'),
|
167 |
+
('Sausages and Luncheon Meats Misc', 'Category'),
|
168 |
+
('Selection Snacks', 'Category'),
|
169 |
+
('Snacks Selection', 'Category'),
|
170 |
+
('Bundle Snacks', 'Category'),
|
171 |
+
('Snacks Bundle', 'Category'),
|
172 |
+
('Pack Snacks', 'Category'),
|
173 |
+
('Snacks Pack', 'Category'),
|
174 |
+
('Box Snacks', 'Category'),
|
175 |
+
('Pack Sweets', 'Category'),
|
176 |
+
('Sweets Pack', 'Category'),
|
177 |
+
('Box Sweets', 'Category'),
|
178 |
+
('Sweets Box', 'Category'),
|
179 |
+
('Various Sweets', 'Category'),
|
180 |
+
('Sweets Various', 'Category'),
|
181 |
+
('Miscellaneous Vegetables and Vegetable Products', 'Category'),
|
182 |
+
('Vegetables and Vegetable Products Miscellaneous', 'Category'),
|
183 |
+
('Misc Vegetables and Vegetable Products', 'Category'),
|
184 |
+
('Vegetables and Vegetable Products Misc', 'Category'),
|
185 |
+
]
|
186 |
+
# Split the (text, label) pairs into two parallel sequences for fitting.
texts, labels = zip(*training_data)

# TF-IDF features feeding a random forest. The forest's seed is pinned so
# that repeated runs on the same training data build the same model instead
# of a differently randomised one per process.
pipeline = make_pipeline(
    TfidfVectorizer(),
    RandomForestClassifier(random_state=1),
)

# Train the model on the in-file examples when the module is executed.
pipeline.fit(texts, labels)
|
193 |
+
|
194 |
+
# Function to classify text using the trained model
|
195 |
+
def classify_text_to_type(text):
    """Return the label the trained pipeline predicts for *text*.

    The label is one of the values used in ``training_data``
    ("Specific", "Category" or "Heterogeneous Mixture").  ``text`` is
    wrapped in a one-element list because ``pipeline.predict`` takes a
    batch; the single prediction is unwrapped before returning.
    """
    return pipeline.predict([text])[0]


# algo.py imports this function under the name
# ``classify_text_to_specificity``; expose that name as well so the import
# resolves while existing callers of ``classify_text_to_type`` keep working.
classify_text_to_specificity = classify_text_to_type
|
197 |
+
|
198 |
+
|
199 |
+
from db.db_utils import get_connection

# Backfill: label every stored mapping that has no specificity yet.
# NOTE(review): this runs whenever the module is executed, which includes
# being imported (algo.py imports it) — consider moving it behind an
# ``if __name__ == "__main__":`` guard or into a separate script.
db_conn = get_connection()
try:
    db_cursor = db_conn.cursor()

    # Only rows that have never been classified are selected.
    db_cursor.execute("SELECT input_word FROM mappings WHERE specificity IS NULL")
    results = db_cursor.fetchall()

    for row in results:
        input_word = row[0]
        specificity = classify_text_to_type(input_word)
        db_cursor.execute("UPDATE mappings SET specificity = %s WHERE input_word = %s", (specificity, input_word))
        # Commit after each row so work done before a failure is kept.
        db_conn.commit()
finally:
    # Release the connection even if a query or a prediction raises.
    db_conn.close()
|