beweinreich commited on
Commit
d93f20c
1 Parent(s): dc79224

integrate a specificity

Browse files
algo.py CHANGED
@@ -15,7 +15,7 @@ from ask_gpt import query_gpt
15
  from multi_food_item_detector import extract_items, has_delimiters
16
  from mapping_template import empty_template, heterogeneous_template, multi_item_template, nonfood_template, usda_template
17
  from tasks import insert_result
18
-
19
 
20
  logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
21
  similarity_threshold = 0.78
@@ -81,10 +81,10 @@ class Algo:
81
  }
82
  )
83
  return mapping
84
- elif gpt_recommended_word == 'Mixed Food Items':
85
  mapping.update(
86
  {
87
- 'dictionary_word': 'Mixed Food Items', 'similarity_score': 1.0,
88
  'confidence_score': 1.0
89
  }
90
  )
@@ -213,6 +213,11 @@ class Algo:
213
 
214
  mapping = self.perform_mapping(input_word)
215
 
 
 
 
 
 
216
  food_nonfood_pessimistic = pessimistic_food_nonfood_score(food_nonfood, mapping['similarity_score'])
217
  mapping.update({
218
  'is_food': food_nonfood_pessimistic[0],
@@ -298,7 +303,7 @@ class Algo:
298
 
299
  def match_words(self, input_data):
300
  # input_data is a list of tuples, where each tuple is (description, donor)
301
- # results = []
302
  result_batch = []
303
  for input_item in tqdm(input_data, desc="Processing input words"):
304
  input_word = input_item[0]
@@ -333,7 +338,7 @@ class Algo:
333
  })
334
  result_batch.append(mapping)
335
  # store_result_to_db(self.db_cursor, self.db_conn, self.run_key, mapping)
336
- # results.append(mapping)
337
 
338
  if len(result_batch) >= 100:
339
  insert_result.delay(self.run_key, result_batch)
@@ -344,4 +349,4 @@ class Algo:
344
  result_batch = []
345
 
346
 
347
- # return results
 
15
  from multi_food_item_detector import extract_items, has_delimiters
16
  from mapping_template import empty_template, heterogeneous_template, multi_item_template, nonfood_template, usda_template
17
  from tasks import insert_result
18
+ from specificity_classifier import classify_text_to_specificity
19
 
20
  logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
21
  similarity_threshold = 0.78
 
81
  }
82
  )
83
  return mapping
84
+ elif gpt_recommended_word == 'Heterogeneous Mixture':
85
  mapping.update(
86
  {
87
+ 'dictionary_word': 'Heterogeneous Mixture', 'similarity_score': 1.0,
88
  'confidence_score': 1.0
89
  }
90
  )
 
213
 
214
  mapping = self.perform_mapping(input_word)
215
 
216
+ specificity = classify_text_to_specificity(input_word_clean)
217
+ mapping.update({
218
+ 'specificity': specificity
219
+ })
220
+
221
  food_nonfood_pessimistic = pessimistic_food_nonfood_score(food_nonfood, mapping['similarity_score'])
222
  mapping.update({
223
  'is_food': food_nonfood_pessimistic[0],
 
303
 
304
  def match_words(self, input_data):
305
  # input_data is a list of tuples, where each tuple is (description, donor)
306
+ results = []
307
  result_batch = []
308
  for input_item in tqdm(input_data, desc="Processing input words"):
309
  input_word = input_item[0]
 
338
  })
339
  result_batch.append(mapping)
340
  # store_result_to_db(self.db_cursor, self.db_conn, self.run_key, mapping)
341
+ results.append(mapping)
342
 
343
  if len(result_batch) >= 100:
344
  insert_result.delay(self.run_key, result_batch)
 
349
  result_batch = []
350
 
351
 
352
+ return results
ask_gpt.py CHANGED
@@ -20,7 +20,7 @@ def query_gpt(food_item):
20
  f"Make sure you're accurate about whether it is cooked, prepared, etc or not.\n\n"
21
  f"But if its an obscure food, you can come up with a extremely similar food item that is similar in DMC.\n\n"
22
  f"If it's not a food item, return 'Non-Food Item'.\n\n"
23
- f"If it's a generic term like 'Mixture of foods', just say: 'Mixed Food Items'.\n\n"
24
  f"If it's not a food item, but a broad category like 'Various Produce', just say: 'Broad Category'.\n\n"
25
  f"You should respond in json format with an object that has the key `guess`, and the value is the most similar food item.\n\n"
26
  f"The food item is: \"{food_item}\""
 
20
  f"Make sure you're accurate about whether it is cooked, prepared, etc or not.\n\n"
21
  f"But if its an obscure food, you can come up with a extremely similar food item that is similar in DMC.\n\n"
22
  f"If it's not a food item, return 'Non-Food Item'.\n\n"
23
+ f"If it's a generic term like 'Mixture of foods' or 'grocery items', just say: 'Heterogeneous Mixture'.\n\n"
24
  f"If it's not a food item, but a broad category like 'Various Produce', just say: 'Broad Category'.\n\n"
25
  f"You should respond in json format with an object that has the key `guess`, and the value is the most similar food item.\n\n"
26
  f"The food item is: \"{food_item}\""
chatgpt_audit.py CHANGED
@@ -41,9 +41,7 @@ def query_gpt(food_item, dictionary_word, similar_words):
41
  prompt = (
42
  f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping?
43
 
44
- Generally, you should prefer the mapped word, but if you believe there is a better fit, please provide it.
45
-
46
- I will also provide a list of other similar words that you could be a better fit.
47
 
48
  This is important: only return a word from the list of words I provide.
49
 
@@ -167,8 +165,8 @@ for row in results:
167
  db_cursor.execute(sql, (input_word,))
168
  db_conn.commit()
169
  elif confirm.lower() == 'm':
170
- print(f" - Mixed food items")
171
- sql = "UPDATE mappings SET reviewed = true, dictionary_word = 'Mixed Food Items', is_food = true WHERE input_word = %s"
172
  db_cursor.execute(sql, (input_word,))
173
  db_conn.commit()
174
  else:
 
41
  prompt = (
42
  f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping?
43
 
44
+ Generally, you should prefer the mapped word, but if you believe there is a better fit from the provided list of similar words, please specify it.
 
 
45
 
46
  This is important: only return a word from the list of words I provide.
47
 
 
165
  db_cursor.execute(sql, (input_word,))
166
  db_conn.commit()
167
  elif confirm.lower() == 'm':
168
+ print(f" - Heterogeneous Mixture")
169
+ sql = "UPDATE mappings SET reviewed = true, dictionary_word = 'Heterogeneous Mixture', is_food = true WHERE input_word = %s"
170
  db_cursor.execute(sql, (input_word,))
171
  db_conn.commit()
172
  else:
chatgpt_audit2.py CHANGED
@@ -103,7 +103,7 @@ for row in results:
103
  else:
104
  similar_words_list = mapping['similar_words'].split('|')
105
  similar_words_list.append('Non-Food Item')
106
- similar_words_list.append('Mixed Food Items')
107
 
108
  response = query_gpt(input_word, dictionary_word, similar_words_list)
109
  if response:
@@ -138,8 +138,8 @@ for row in results:
138
  db_cursor.execute(sql, (input_word,))
139
  db_conn.commit()
140
  elif confirm.lower() == 'm':
141
- print(f" - Mixed food items")
142
- sql = "UPDATE mappings SET reviewed = true, dictionary_word = 'Mixed Food Items', is_food = true WHERE input_word = %s"
143
  db_cursor.execute(sql, (input_word,))
144
  db_conn.commit()
145
  else:
 
103
  else:
104
  similar_words_list = mapping['similar_words'].split('|')
105
  similar_words_list.append('Non-Food Item')
106
+ similar_words_list.append('Heterogeneous Mixture')
107
 
108
  response = query_gpt(input_word, dictionary_word, similar_words_list)
109
  if response:
 
138
  db_cursor.execute(sql, (input_word,))
139
  db_conn.commit()
140
  elif confirm.lower() == 'm':
141
+ print(f" - Heterogeneous Mixture")
142
+ sql = "UPDATE mappings SET reviewed = true, dictionary_word = 'Heterogeneous Mixture', is_food = true WHERE input_word = %s"
143
  db_cursor.execute(sql, (input_word,))
144
  db_conn.commit()
145
  else:
db/db_utils.py CHANGED
@@ -20,6 +20,7 @@ def initialize_db(conn):
20
  input_word TEXT PRIMARY KEY,
21
  cleaned_word TEXT,
22
  dictionary_word TEXT,
 
23
  similarity_score REAL,
24
  confidence_score REAL,
25
  similar_words TEXT,
@@ -155,8 +156,8 @@ def store_mapping_to_db(cursor, conn, mapping):
155
  logging.info(f" - Storing new mapping to db: {mapping}")
156
  try:
157
  cursor.execute('''
158
- INSERT INTO mappings (input_word, cleaned_word, dictionary_word, similarity_score, confidence_score, similar_words, is_food, food_nonfood_score)
159
- VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
160
  ''', (
161
  mapping['input_word'],
162
  mapping['cleaned_word'],
@@ -165,7 +166,8 @@ def store_mapping_to_db(cursor, conn, mapping):
165
  mapping['confidence_score'],
166
  mapping['similar_words'],
167
  mapping['is_food'],
168
- mapping['food_nonfood_score']
 
169
  ))
170
  conn.commit()
171
  except Exception as e:
 
20
  input_word TEXT PRIMARY KEY,
21
  cleaned_word TEXT,
22
  dictionary_word TEXT,
23
+ specificity TEXT,
24
  similarity_score REAL,
25
  confidence_score REAL,
26
  similar_words TEXT,
 
156
  logging.info(f" - Storing new mapping to db: {mapping}")
157
  try:
158
  cursor.execute('''
159
+ INSERT INTO mappings (input_word, cleaned_word, dictionary_word, similarity_score, confidence_score, similar_words, is_food, food_nonfood_score, specificity)
160
+ VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
161
  ''', (
162
  mapping['input_word'],
163
  mapping['cleaned_word'],
 
166
  mapping['confidence_score'],
167
  mapping['similar_words'],
168
  mapping['is_food'],
169
+ mapping['food_nonfood_score'],
170
+ mapping['specificity']
171
  ))
172
  conn.commit()
173
  except Exception as e:
item_or_category.py DELETED
@@ -1,35 +0,0 @@
1
- import random
2
- import numpy as np
3
- import torch
4
- import logging
5
- from transformers import pipeline
6
- from autocorrect import Speller
7
- # Load a pre-trained SBERT model
8
-
9
- # Set seeds for reproducibility of zero-shot classification
10
- def set_seed(seed):
11
- random.seed(seed)
12
- np.random.seed(seed)
13
- torch.manual_seed(seed)
14
- torch.cuda.manual_seed_all(seed)
15
- torch.backends.cudnn.deterministic = True
16
- torch.backends.cudnn.benchmark = False
17
-
18
- set_seed(1)
19
-
20
-
21
- # Load a pre-trained model and tokenizer
22
- classifier = pipeline("zero-shot-classification", model="roberta-large-mnli")
23
- spell = Speller()
24
-
25
- # Classify item as food or non-food
26
- def classify_as_item_or_category(item):
27
- cleaned_item = item.strip().lower()
28
- spell_fix_item = spell(cleaned_item)
29
- result = classifier(spell_fix_item, candidate_labels=["single food item", "food category"])
30
- label = result["labels"][0]
31
- score = result["scores"][0]
32
-
33
- # logging.info(f"Item: {item}, Label: {label}, Score: {score}")
34
- return label, score
35
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mapping_template.py CHANGED
@@ -1,6 +1,6 @@
1
  from utils import clean_word
2
 
3
- def generic_template(input_word, cleaned_word=None, similarity_score=None, confidence_score=None, similar_words=None, is_food=None, food_nonfood_score=None, dictionary_word=None, sr_legacy_food_category=None, wweia_category=None, dry_matter_content=None, water_content=None, leakage=None):
4
  if cleaned_word is None:
5
  cleaned_word = clean_word(input_word)
6
 
@@ -18,6 +18,7 @@ def generic_template(input_word, cleaned_word=None, similarity_score=None, confi
18
  'dry_matter_content': dry_matter_content,
19
  'water_content': water_content,
20
  'leakage': leakage,
 
21
  }
22
 
23
  def empty_template(input_word, cleaned_word=None):
@@ -38,6 +39,7 @@ def empty_template(input_word, cleaned_word=None):
38
  'dry_matter_content': None,
39
  'water_content': None,
40
  'leakage': None,
 
41
  }
42
 
43
  def usda_template(input_word, cleaned_word=None):
@@ -58,6 +60,7 @@ def usda_template(input_word, cleaned_word=None):
58
  'dry_matter_content': None,
59
  'water_content': None,
60
  'leakage': None,
 
61
  }
62
 
63
  def nonfood_template(input_word, cleaned_word=None, food_nonfood_score=None, similar_words=None):
@@ -78,6 +81,7 @@ def nonfood_template(input_word, cleaned_word=None, food_nonfood_score=None, sim
78
  'dry_matter_content': 0,
79
  'water_content': 0,
80
  'leakage': 0,
 
81
  }
82
 
83
  def heterogeneous_template(input_word, cleaned_word=None):
@@ -98,6 +102,7 @@ def heterogeneous_template(input_word, cleaned_word=None):
98
  'dry_matter_content': 0.27,
99
  'water_content': 0.73,
100
  'leakage': 0.1
 
101
  }
102
 
103
  def multi_item_template(input_word, cleaned_word=None, conservative_mapping=None):
@@ -117,6 +122,7 @@ def multi_item_template(input_word, cleaned_word=None, conservative_mapping=None
117
  'sr_legacy_food_category': conservative_mapping['sr_legacy_food_category'],
118
  'dry_matter_content': conservative_mapping['dry_matter_content'],
119
  'water_content': conservative_mapping['water_content'],
120
- 'leakage': conservative_mapping['leakage']
 
121
  }
122
 
 
1
  from utils import clean_word
2
 
3
+ def generic_template(input_word, cleaned_word=None, similarity_score=None, confidence_score=None, similar_words=None, is_food=None, food_nonfood_score=None, dictionary_word=None, sr_legacy_food_category=None, wweia_category=None, dry_matter_content=None, water_content=None, leakage=None, specificity=None):
4
  if cleaned_word is None:
5
  cleaned_word = clean_word(input_word)
6
 
 
18
  'dry_matter_content': dry_matter_content,
19
  'water_content': water_content,
20
  'leakage': leakage,
21
+ 'specificity': specificity
22
  }
23
 
24
  def empty_template(input_word, cleaned_word=None):
 
39
  'dry_matter_content': None,
40
  'water_content': None,
41
  'leakage': None,
42
+ 'specificity': None
43
  }
44
 
45
  def usda_template(input_word, cleaned_word=None):
 
60
  'dry_matter_content': None,
61
  'water_content': None,
62
  'leakage': None,
63
+ 'specificity': None
64
  }
65
 
66
  def nonfood_template(input_word, cleaned_word=None, food_nonfood_score=None, similar_words=None):
 
81
  'dry_matter_content': 0,
82
  'water_content': 0,
83
  'leakage': 0,
84
+ 'specificity': None
85
  }
86
 
87
  def heterogeneous_template(input_word, cleaned_word=None):
 
102
  'dry_matter_content': 0.27,
103
  'water_content': 0.73,
104
  'leakage': 0.1,
105
+ 'specificity': 'Heterogeneous Mixture'
106
  }
107
 
108
  def multi_item_template(input_word, cleaned_word=None, conservative_mapping=None):
 
122
  'sr_legacy_food_category': conservative_mapping['sr_legacy_food_category'],
123
  'dry_matter_content': conservative_mapping['dry_matter_content'],
124
  'water_content': conservative_mapping['water_content'],
125
+ 'leakage': conservative_mapping['leakage'],
126
+ 'specificity': conservative_mapping['specificity']
127
  }
128
 
post_import_updates.py CHANGED
@@ -118,13 +118,13 @@ for item in categories:
118
  input_word = f"{qualifier} {category}"
119
  print(f"Storing {input_word}")
120
  cleaned_word = clean_word(input_word)
121
- mapping = generic_template(input_word, cleaned_word, 1, 1, None, True, 1, category, category, dry_matter_content, water_content, leakage)
122
  store_mapping_to_db(db_cursor, db_conn, mapping)
123
 
124
  input_word = f"{category} {qualifier}"
125
  print(f"Storing {input_word}")
126
  cleaned_word = clean_word(input_word)
127
- mapping = generic_template(input_word, cleaned_word, 1, 1, None, True, 1, category, category, dry_matter_content, water_content, leakage)
128
  store_mapping_to_db(db_cursor, db_conn, mapping)
129
 
130
 
 
118
  input_word = f"{qualifier} {category}"
119
  print(f"Storing {input_word}")
120
  cleaned_word = clean_word(input_word)
121
+ mapping = generic_template(input_word, cleaned_word, 1, 1, None, True, 1, category, category, dry_matter_content, water_content, leakage, "Category")
122
  store_mapping_to_db(db_cursor, db_conn, mapping)
123
 
124
  input_word = f"{category} {qualifier}"
125
  print(f"Storing {input_word}")
126
  cleaned_word = clean_word(input_word)
127
+ mapping = generic_template(input_word, cleaned_word, 1, 1, None, True, 1, category, category, dry_matter_content, water_content, leakage, "Category")
128
  store_mapping_to_db(db_cursor, db_conn, mapping)
129
 
130
 
specificity_classifier.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.ensemble import RandomForestClassifier
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.pipeline import make_pipeline
4
+
5
+ training_data = [
6
+ ("Bananas", "Specific"),
7
+ ("misc groceries", "Heterogeneous Mixture"),
8
+ ("Produce", "Category"),
9
+ ("Xoconostle", "Specific"),
10
+ ("Banana, hot chocolate & chips", "Specific"),
11
+ ("assorted apples", "Specific"),
12
+ ("breakfast foods", "Heterogeneous Mixture"),
13
+ ("General Groceries", "Heterogeneous Mixture"),
14
+ ("Grocery Assortment", "Heterogeneous Mixture"),
15
+ ("Assorted Grocery Items", "Heterogeneous Mixture"),
16
+ ("Assorted Heterogeneous Mixture", "Heterogeneous Mixture"),
17
+ ("Assorted Non-Perishables", "Heterogeneous Mixture"),
18
+ ("Assorted Packaged Foods", "Heterogeneous Mixture"),
19
+ ("Assorted Perishables", "Heterogeneous Mixture"),
20
+ ("Box Heterogeneous Mixture", "Heterogeneous Mixture"),
21
+ ("Bundle Heterogeneous Mixture", "Heterogeneous Mixture"),
22
+ ("Collection Heterogeneous Mixture", "Heterogeneous Mixture"),
23
+ ("Combo Heterogeneous Mixture", "Heterogeneous Mixture"),
24
+ ("Food Item Collection", "Heterogeneous Mixture"),
25
+ ("Food Item Mix", "Heterogeneous Mixture"),
26
+ ("Food Variety Pack", "Heterogeneous Mixture"),
27
+ ("General Groceries", "Heterogeneous Mixture"),
28
+ ("Grocery Assortment", "Heterogeneous Mixture"),
29
+ ("Grocery Combo Pack", "Heterogeneous Mixture"),
30
+ ("Grocery Mix", "Heterogeneous Mixture"),
31
+ ("Grocery Selection", "Heterogeneous Mixture"),
32
+ ("Grocery Variety Box","Heterogeneous Mixture"),
33
+ ("Various Items", "Heterogeneous Mixture"),
34
+ ('almond', 'Specific'),
35
+ ('Almond - Fresh Almond', 'Specific'),
36
+ ('Apple ', 'Specific'),
37
+ ('Apple Other', 'Specific'),
38
+ ('Apple - Granny Smith Apple', 'Specific'),
39
+ ('Apricot', 'Specific'),
40
+ ('Artichoke', 'Specific'),
41
+ ('asparagus', 'Specific'),
42
+ ('Assorted Beans and Lentils', 'Specific'),
43
+ ('Assorted Condiments and Sauces', 'Specific'),
44
+ ('Avocado', 'Specific'),
45
+ ('Baby Corn (10 lbs )', 'Specific'),
46
+ ('Banana', 'Specific'),
47
+ ('Banana - Burro Banana', 'Specific'),
48
+ ('Banana - Plantain', 'Specific'),
49
+ ('Banana - Thai Banana', 'Specific'),
50
+ ('Banana leaf', 'Specific'),
51
+ ('Basil', 'Specific'),
52
+ ('Basil - Thai Basil', 'Specific'),
53
+ ('Bean - Fava Bean', 'Specific'),
54
+ ('Bean - Garbanzo Bean', 'Specific'),
55
+ ('Bean - Green Bean', 'Specific'),
56
+ ('Bean Green Bean', 'Specific'),
57
+ ('Bean green beann', 'Specific'),
58
+ ('Bean Romano Bean', 'Specific'),
59
+ ('Bean- French Bean (10 Lbs )', 'Specific'),
60
+ ('Beet - Red Beet', 'Specific'),
61
+ ('Beet - Yellow Beet', 'Specific'),
62
+ ('beet Warter melon beet', 'Specific'),
63
+ ('bell peppers', 'Specific'),
64
+ ('Berry - Blackberry', 'Specific'),
65
+ ('Berry - Blueberry', 'Specific'),
66
+ ('Onion - White Onion', 'Specific'),
67
+ ('Onion Green onion iceless', 'Specific'),
68
+ ('Onion-Mexican Green Onion', 'Specific'),
69
+ ('Orange ', 'Specific'),
70
+ ('Orange - Mandarin Orange', 'Specific'),
71
+ ('Orange - Minneola Orange', 'Specific'),
72
+ ('Orange Mandarine (10 Lbs )', 'Specific'),
73
+ ('Organic Ginger', 'Specific'),
74
+ ('Organic Mango', 'Specific'),
75
+ ('Organic Onion - Brown Onion', 'Specific'),
76
+ ('Beef Products Combo', 'Category'),
77
+ ('Collection Beef Products', 'Category'),
78
+ ('Beef Products Collection', 'Category'),
79
+ ('Selection Beef Products', 'Category'),
80
+ ('Beef Products Selection', 'Category'),
81
+ ('Bundle Beef Products', 'Category'),
82
+ ('Beef Products Bundle', 'Category'),
83
+ ('Pack Beef Products', 'Category'),
84
+ ('Beef Products Pack', 'Category'),
85
+ ('Box Beef Products', 'Category'),
86
+ ('Beef Products Box', 'Category'),
87
+ ('Various Beef Products', 'Category'),
88
+ ('Beef Products Various', 'Category'),
89
+ ('Miscellaneous Beef Products', 'Category'),
90
+ ('Assorted Beverages', 'Category'),
91
+ ('Beverages Assorted', 'Category'),
92
+ ('Mixed Beverages', 'Category'),
93
+ ('Beverages Mixed', 'Category'),
94
+ ('Variety Beverages', 'Category'),
95
+ ('Beverages Variety', 'Category'),
96
+ ('Combo Beverages', 'Category'),
97
+ ('Beverages Combo', 'Category'),
98
+ ('Collection Beverages', 'Category'),
99
+ ('Beverages Collection', 'Category'),
100
+ ('Selection Beverages', 'Category'),
101
+ ('Beverages Selection', 'Category'),
102
+ ('Bundle Beverages', 'Category'),
103
+ ('Beverages Bundle', 'Category'),
104
+ ('Pack Beverages', 'Category'),
105
+ ('Beverages Pack', 'Category'),
106
+ ('Box Beverages', 'Category'),
107
+ ('Beverages Box', 'Category'),
108
+ ('Various Beverages', 'Category'),
109
+ ('Beverages Various', 'Category'),
110
+ ('Miscellaneous Beverages', 'Category'),
111
+ ('Beverages Miscellaneous', 'Category'),
112
+ ('Misc Beverages', 'Category'),
113
+ ('Beverages Misc', 'Category'),
114
+ ('Mixture Beverages', 'Category'),
115
+ ('Beverages Mixture', 'Category'),
116
+ ('Bundle Breakfast Cereals', 'Category'),
117
+ ('Breakfast Cereals Bundle', 'Category'),
118
+ ('Pack Breakfast Cereals', 'Category'),
119
+ ('Breakfast Cereals Pack', 'Category'),
120
+ ('Box Breakfast Cereals', 'Category'),
121
+ ('Breakfast Cereals Box', 'Category'),
122
+ ('Various Breakfast Cereals', 'Category'),
123
+ ('Breakfast Cereals Various', 'Category'),
124
+ ('Assorted Breakfast Cereals', 'Category'),
125
+ ('Breakfast Cereals Assorted', 'Category'),
126
+ ('Miscellaneous Breakfast Cereals', 'Category'),
127
+ ('Breakfast Cereals Miscellaneous', 'Category'),
128
+ ('Misc Breakfast Cereals', 'Category'),
129
+ ('Breakfast Cereals Misc', 'Category'),
130
+ ('Mixture Breakfast Cereals', 'Category'),
131
+ ('Breakfast Cereals Mixture', 'Category'),
132
+ ('Mixed Breakfast Cereals', 'Category'),
133
+ ('Breakfast Cereals Mixed', 'Category'),
134
+ ('Variety Breakfast Cereals', 'Category'),
135
+ ('Pack Fats and Oils', 'Category'),
136
+ ('Fats and Oils Pack', 'Category'),
137
+ ('Box Fats and Oils', 'Category'),
138
+ ('Fats and Oils Box', 'Category'),
139
+ ('Various Fats and Oils', 'Category'),
140
+ ('Fats and Oils Various', 'Category'),
141
+ ('Miscellaneous Fats and Oils', 'Category'),
142
+ ('Meals, Entrees, and Side Dishes Various', 'Category'),
143
+ ('Miscellaneous Meals, Entrees, and Side Dishes', 'Category'),
144
+ ('Meals, Entrees, and Side Dishes Miscellaneous', 'Category'),
145
+ ('Misc Meals, Entrees, and Side Dishes', 'Category'),
146
+ ('Meals, Entrees, and Side Dishes Misc', 'Category'),
147
+ ('Mixture Meals, Entrees, and Side Dishes', 'Category'),
148
+ ('Meals, Entrees, and Side Dishes Mixture', 'Category'),
149
+ ('Misc Non-Food Item', 'Category'),
150
+ ('Nut and Seed Products Selection', 'Category'),
151
+ ('Bundle Nut and Seed Products', 'Category'),
152
+ ('Nut and Seed Products Bundle', 'Category'),
153
+ ('Pack Nut and Seed Products', 'Category'),
154
+ ('Nut and Seed Products Pack', 'Category'),
155
+ ('Box Nut and Seed Products', 'Category'),
156
+ ('Nut and Seed Products Box', 'Category'),
157
+ ('Various Nut and Seed Products', 'Category'),
158
+ ('Poultry Products Mixed', 'Category'),
159
+ ('Variety Poultry Products', 'Category'),
160
+ ('Poultry Products Variety', 'Category'),
161
+ ('Combo Poultry Products', 'Category'),
162
+ ('Poultry Products Combo', 'Category'),
163
+ ('Sausages and Luncheon Meats Various', 'Category'),
164
+ ('Miscellaneous Sausages and Luncheon Meats', 'Category'),
165
+ ('Sausages and Luncheon Meats Miscellaneous', 'Category'),
166
+ ('Misc Sausages and Luncheon Meats', 'Category'),
167
+ ('Sausages and Luncheon Meats Misc', 'Category'),
168
+ ('Selection Snacks', 'Category'),
169
+ ('Snacks Selection', 'Category'),
170
+ ('Bundle Snacks', 'Category'),
171
+ ('Snacks Bundle', 'Category'),
172
+ ('Pack Snacks', 'Category'),
173
+ ('Snacks Pack', 'Category'),
174
+ ('Box Snacks', 'Category'),
175
+ ('Pack Sweets', 'Category'),
176
+ ('Sweets Pack', 'Category'),
177
+ ('Box Sweets', 'Category'),
178
+ ('Sweets Box', 'Category'),
179
+ ('Various Sweets', 'Category'),
180
+ ('Sweets Various', 'Category'),
181
+ ('Miscellaneous Vegetables and Vegetable Products', 'Category'),
182
+ ('Vegetables and Vegetable Products Miscellaneous', 'Category'),
183
+ ('Misc Vegetables and Vegetable Products', 'Category'),
184
+ ('Vegetables and Vegetable Products Misc', 'Category'),
185
+ ]
186
+ texts, labels = zip(*training_data)
187
+
188
+ # Create a pipeline with TfidfVectorizer and RandomForestClassifier
189
+ pipeline = make_pipeline(TfidfVectorizer(), RandomForestClassifier())
190
+
191
+ # Train the model
192
+ pipeline.fit(texts, labels)
193
+
194
+ # Function to classify text using the trained model
195
+ def classify_text_to_type(text):
196
+ return pipeline.predict([text])[0]
197
+
198
+
199
+ from db.db_utils import get_connection
200
+ db_conn = get_connection()
201
+ db_cursor = db_conn.cursor()
202
+
203
+ db_cursor.execute("SELECT input_word FROM mappings WHERE specificity IS NULL")
204
+ results = db_cursor.fetchall()
205
+
206
+ for row in results:
207
+ input_word = row[0]
208
+ specificity = classify_text_to_type(input_word)
209
+ db_cursor.execute("UPDATE mappings SET specificity = %s WHERE input_word = %s", (specificity, input_word))
210
+ db_conn.commit()
211
+
212
+ db_conn.close()