Spaces:

madebybread
/

brightly-ai

Paused

App Files Files Community

beweinreich committed on 18 days ago

Commit

d93f20c

•

1 Parent(s): dc79224

integrate a specificity

Browse files

Files changed (9) hide show

algo.py +11 -6
ask_gpt.py +1 -1
chatgpt_audit.py +3 -5
chatgpt_audit2.py +3 -3
db/db_utils.py +5 -3
item_or_category.py +0 -35
mapping_template.py +8 -2
post_import_updates.py +2 -2
specificity_classifier.py +212 -0

algo.py CHANGED Viewed

@@ -15,7 +15,7 @@ from ask_gpt import query_gpt
 from multi_food_item_detector import extract_items, has_delimiters
 from mapping_template import empty_template, heterogeneous_template, multi_item_template, nonfood_template, usda_template
 from tasks import insert_result
 logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
 similarity_threshold = 0.78
@@ -81,10 +81,10 @@ class Algo:
                             }
                         )
                         return mapping
-                    elif gpt_recommended_word == 'Mixed Food Items':
                         mapping.update(
                             {
-                                'dictionary_word': 'Mixed Food Items', 'similarity_score': 1.0,
                                 'confidence_score': 1.0
                             }
                         )
@@ -213,6 +213,11 @@ class Algo:
         mapping = self.perform_mapping(input_word)
         food_nonfood_pessimistic = pessimistic_food_nonfood_score(food_nonfood, mapping['similarity_score'])
         mapping.update({
             'is_food': food_nonfood_pessimistic[0],
@@ -298,7 +303,7 @@ class Algo:
     def match_words(self, input_data):
         # input_data is a list of tuples, where each tuple is (description, donor)
-        # results = []
         result_batch = []
         for input_item in tqdm(input_data, desc="Processing input words"):
             input_word = input_item[0]
@@ -333,7 +338,7 @@ class Algo:
                 })
                 result_batch.append(mapping)
                 # store_result_to_db(self.db_cursor, self.db_conn, self.run_key, mapping)
-                # results.append(mapping)
             if len(result_batch) >= 100:
                 insert_result.delay(self.run_key, result_batch)
@@ -344,4 +349,4 @@ class Algo:
             result_batch = []
-        # return results

 from multi_food_item_detector import extract_items, has_delimiters
 from mapping_template import empty_template, heterogeneous_template, multi_item_template, nonfood_template, usda_template
 from tasks import insert_result
+from specificity_classifier import classify_text_to_specificity
 logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
 similarity_threshold = 0.78
                             }
                         )
                         return mapping
+                    elif gpt_recommended_word == 'Heterogeneous Mixture':
                         mapping.update(
                             {
+                                'dictionary_word': 'Heterogeneous Mixture', 'similarity_score': 1.0,
                                 'confidence_score': 1.0
                             }
                         )
         mapping = self.perform_mapping(input_word)
+        specificity = classify_text_to_specificity(input_word_clean)
+        mapping.update({
+            'specificity': specificity
+        })
         food_nonfood_pessimistic = pessimistic_food_nonfood_score(food_nonfood, mapping['similarity_score'])
         mapping.update({
             'is_food': food_nonfood_pessimistic[0],
     def match_words(self, input_data):
         # input_data is a list of tuples, where each tuple is (description, donor)
+        results = []
         result_batch = []
         for input_item in tqdm(input_data, desc="Processing input words"):
             input_word = input_item[0]
                 })
                 result_batch.append(mapping)
                 # store_result_to_db(self.db_cursor, self.db_conn, self.run_key, mapping)
+                results.append(mapping)
             if len(result_batch) >= 100:
                 insert_result.delay(self.run_key, result_batch)
             result_batch = []
+        return results

ask_gpt.py CHANGED Viewed

@@ -20,7 +20,7 @@ def query_gpt(food_item):
         f"Make sure you're accurate about whether it is cooked, prepared, etc or not.\n\n"
         f"But if its an obscure food, you can come up with a extremely similar food item that is similar in DMC.\n\n"
         f"If it's not a food item, return 'Non-Food Item'.\n\n"
-        f"If it's a generic term like 'Mixture of foods', just say: 'Mixed Food Items'.\n\n"
         f"If it's not a food item, but a broad category like 'Various Produce', just say: 'Broad Category'.\n\n"
         f"You should respond in json format with an object that has the key `guess`, and the value is the most similar food item.\n\n"
         f"The food item is: \"{food_item}\""

         f"Make sure you're accurate about whether it is cooked, prepared, etc or not.\n\n"
         f"But if its an obscure food, you can come up with a extremely similar food item that is similar in DMC.\n\n"
         f"If it's not a food item, return 'Non-Food Item'.\n\n"
+        f"If it's a generic term like 'Mixture of foods', or 'grocery items' just say: 'Heterogeneous Mixture'.\n\n"
         f"If it's not a food item, but a broad category like 'Various Produce', just say: 'Broad Category'.\n\n"
         f"You should respond in json format with an object that has the key `guess`, and the value is the most similar food item.\n\n"
         f"The food item is: \"{food_item}\""

chatgpt_audit.py CHANGED Viewed

@@ -41,9 +41,7 @@ def query_gpt(food_item, dictionary_word, similar_words):
     prompt = (
       f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping?
-      Generally, you should prefer the mapped word, but if you believe there is a better fit, please provide it.
-      I will also provide a list of other similar words that you could be a better fit.
       This is important: only return a word from the list of words I provide.
@@ -167,8 +165,8 @@ for row in results:
                     db_cursor.execute(sql, (input_word,))
                     db_conn.commit()
                 elif confirm.lower() == 'm':
-                    print(f" - Mixed food items")
-                    sql = "UPDATE mappings SET reviewed = true, dictionary_word = 'Mixed Food Items', is_food = true WHERE input_word = %s"
                     db_cursor.execute(sql, (input_word,))
                     db_conn.commit()
                 else:

     prompt = (
       f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping?
+      Generally, you should prefer the mapped word, but if you believe there is a better fit from provided list of similar words, please specify it.
       This is important: only return a word from the list of words I provide.
                     db_cursor.execute(sql, (input_word,))
                     db_conn.commit()
                 elif confirm.lower() == 'm':
+                    print(f" - Heterogeneous Mixture")
+                    sql = "UPDATE mappings SET reviewed = true, dictionary_word = 'Heterogeneous Mixture', is_food = true WHERE input_word = %s"
                     db_cursor.execute(sql, (input_word,))
                     db_conn.commit()
                 else:

chatgpt_audit2.py CHANGED Viewed

@@ -103,7 +103,7 @@ for row in results:
     else:
         similar_words_list = mapping['similar_words'].split('|')
         similar_words_list.append('Non-Food Item')
-        similar_words_list.append('Mixed Food Items')
         response = query_gpt(input_word, dictionary_word, similar_words_list)
         if response:
@@ -138,8 +138,8 @@ for row in results:
                       db_cursor.execute(sql, (input_word,))
                       db_conn.commit()
                   elif confirm.lower() == 'm':
-                      print(f" - Mixed food items")
-                      sql = "UPDATE mappings SET reviewed = true, dictionary_word = 'Mixed Food Items', is_food = true WHERE input_word = %s"
                       db_cursor.execute(sql, (input_word,))
                       db_conn.commit()
                   else:

     else:
         similar_words_list = mapping['similar_words'].split('|')
         similar_words_list.append('Non-Food Item')
+        similar_words_list.append('Heterogeneous Mixture')
         response = query_gpt(input_word, dictionary_word, similar_words_list)
         if response:
                       db_cursor.execute(sql, (input_word,))
                       db_conn.commit()
                   elif confirm.lower() == 'm':
+                      print(f" - Heterogeneous Mixture")
+                      sql = "UPDATE mappings SET reviewed = true, dictionary_word = 'Heterogeneous Mixture', is_food = true WHERE input_word = %s"
                       db_cursor.execute(sql, (input_word,))
                       db_conn.commit()
                   else:

db/db_utils.py CHANGED Viewed

@@ -20,6 +20,7 @@ def initialize_db(conn):
             input_word TEXT PRIMARY KEY,
             cleaned_word TEXT,
             dictionary_word TEXT,
             similarity_score REAL,
             confidence_score REAL,
             similar_words TEXT,
@@ -155,8 +156,8 @@ def store_mapping_to_db(cursor, conn, mapping):
     logging.info(f" - Storing new mapping to db: {mapping}")
     try:
         cursor.execute('''
-            INSERT INTO mappings (input_word, cleaned_word, dictionary_word, similarity_score, confidence_score, similar_words, is_food, food_nonfood_score)
-            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
         ''', (
             mapping['input_word'],
             mapping['cleaned_word'],
@@ -165,7 +166,8 @@ def store_mapping_to_db(cursor, conn, mapping):
             mapping['confidence_score'],
             mapping['similar_words'],
             mapping['is_food'],
-            mapping['food_nonfood_score']
         ))
         conn.commit()
     except Exception as e:

             input_word TEXT PRIMARY KEY,
             cleaned_word TEXT,
             dictionary_word TEXT,
+            specificity TEXT,
             similarity_score REAL,
             confidence_score REAL,
             similar_words TEXT,
     logging.info(f" - Storing new mapping to db: {mapping}")
     try:
         cursor.execute('''
+            INSERT INTO mappings (input_word, cleaned_word, dictionary_word, similarity_score, confidence_score, similar_words, is_food, food_nonfood_score, specificity)
+            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
         ''', (
             mapping['input_word'],
             mapping['cleaned_word'],
             mapping['confidence_score'],
             mapping['similar_words'],
             mapping['is_food'],
+            mapping['food_nonfood_score'],
+            mapping['specificity']
         ))
         conn.commit()
     except Exception as e:

item_or_category.py DELETED Viewed

@@ -1,35 +0,0 @@
-import random
-import numpy as np
-import torch
-import logging
-from transformers import pipeline
-from autocorrect import Speller
-# Load a pre-trained SBERT model
-# Set seeds for reproducibility of zero-shot classification
-def set_seed(seed):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
-    torch.backends.cudnn.deterministic = True
-    torch.backends.cudnn.benchmark = False
-set_seed(1)
-# Load a pre-trained model and tokenizer
-classifier = pipeline("zero-shot-classification", model="roberta-large-mnli")
-spell = Speller()
-# Classify item as food or non-food
-def classify_as_item_or_category(item):
-    cleaned_item = item.strip().lower()
-    spell_fix_item = spell(cleaned_item)
-    result = classifier(spell_fix_item, candidate_labels=["single food item", "food category"])
-    label = result["labels"][0]
-    score = result["scores"][0]
-    # logging.info(f"Item: {item}, Label: {label}, Score: {score}")
-    return label, score

mapping_template.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from utils import clean_word
-def generic_template(input_word, cleaned_word=None, similarity_score=None, confidence_score=None, similar_words=None, is_food=None, food_nonfood_score=None, dictionary_word=None, sr_legacy_food_category=None, wweia_category=None, dry_matter_content=None, water_content=None, leakage=None):
     if cleaned_word is None:
         cleaned_word = clean_word(input_word)
@@ -18,6 +18,7 @@ def generic_template(input_word, cleaned_word=None, similarity_score=None, confi
         'dry_matter_content': dry_matter_content,
         'water_content': water_content,
         'leakage': leakage,
     }
 def empty_template(input_word, cleaned_word=None):
@@ -38,6 +39,7 @@ def empty_template(input_word, cleaned_word=None):
         'dry_matter_content': None,
         'water_content': None,
         'leakage': None,
     }
 def usda_template(input_word, cleaned_word=None):
@@ -58,6 +60,7 @@ def usda_template(input_word, cleaned_word=None):
         'dry_matter_content': None,
         'water_content': None,
         'leakage': None,
     }
 def nonfood_template(input_word, cleaned_word=None, food_nonfood_score=None, similar_words=None):
@@ -78,6 +81,7 @@ def nonfood_template(input_word, cleaned_word=None, food_nonfood_score=None, sim
         'dry_matter_content': 0,
         'water_content': 0,
         'leakage': 0,
     }
 def heterogeneous_template(input_word, cleaned_word=None):
@@ -98,6 +102,7 @@ def heterogeneous_template(input_word, cleaned_word=None):
         'dry_matter_content': 0.27,
         'water_content': 0.73,
         'leakage': 0.1
     }
 def multi_item_template(input_word, cleaned_word=None, conservative_mapping=None):
@@ -117,6 +122,7 @@ def multi_item_template(input_word, cleaned_word=None, conservative_mapping=None
         'sr_legacy_food_category': conservative_mapping['sr_legacy_food_category'],
         'dry_matter_content': conservative_mapping['dry_matter_content'],
         'water_content': conservative_mapping['water_content'],
-        'leakage': conservative_mapping['leakage']
     }

 from utils import clean_word
+def generic_template(input_word, cleaned_word=None, similarity_score=None, confidence_score=None, similar_words=None, is_food=None, food_nonfood_score=None, dictionary_word=None, sr_legacy_food_category=None, wweia_category=None, dry_matter_content=None, water_content=None, leakage=None, specificity=None):
     if cleaned_word is None:
         cleaned_word = clean_word(input_word)
         'dry_matter_content': dry_matter_content,
         'water_content': water_content,
         'leakage': leakage,
+        'specificity': specificity
     }
 def empty_template(input_word, cleaned_word=None):
         'dry_matter_content': None,
         'water_content': None,
         'leakage': None,
+        'specificity': None
     }
 def usda_template(input_word, cleaned_word=None):
         'dry_matter_content': None,
         'water_content': None,
         'leakage': None,
+        'specificity': None
     }
 def nonfood_template(input_word, cleaned_word=None, food_nonfood_score=None, similar_words=None):
         'dry_matter_content': 0,
         'water_content': 0,
         'leakage': 0,
+        'specificity': None
     }
 def heterogeneous_template(input_word, cleaned_word=None):
         'dry_matter_content': 0.27,
         'water_content': 0.73,
         'leakage': 0.1
+        'specificity': 'Heterogeneous Mixture'
     }
 def multi_item_template(input_word, cleaned_word=None, conservative_mapping=None):
         'sr_legacy_food_category': conservative_mapping['sr_legacy_food_category'],
         'dry_matter_content': conservative_mapping['dry_matter_content'],
         'water_content': conservative_mapping['water_content'],
+        'leakage': conservative_mapping['leakage'],
+        'specificity': conservative_mapping['specificity']
     }

post_import_updates.py CHANGED Viewed

@@ -118,13 +118,13 @@ for item in categories:
         input_word = f"{qualifier} {category}"
         print(f"Storing {input_word}")
         cleaned_word = clean_word(input_word)
-        mapping = generic_template(input_word, cleaned_word, 1, 1, None, True, 1, category, category, dry_matter_content, water_content, leakage)
         store_mapping_to_db(db_cursor, db_conn, mapping)
         input_word = f"{category} {qualifier}"
         print(f"Storing {input_word}")
         cleaned_word = clean_word(input_word)
-        mapping = generic_template(input_word, cleaned_word, 1, 1, None, True, 1, category, category, dry_matter_content, water_content, leakage)
         store_mapping_to_db(db_cursor, db_conn, mapping)

         input_word = f"{qualifier} {category}"
         print(f"Storing {input_word}")
         cleaned_word = clean_word(input_word)
+        mapping = generic_template(input_word, cleaned_word, 1, 1, None, True, 1, category, category, dry_matter_content, water_content, leakage, "Category")
         store_mapping_to_db(db_cursor, db_conn, mapping)
         input_word = f"{category} {qualifier}"
         print(f"Storing {input_word}")
         cleaned_word = clean_word(input_word)
+        mapping = generic_template(input_word, cleaned_word, 1, 1, None, True, 1, category, category, dry_matter_content, water_content, leakage, "Category")
         store_mapping_to_db(db_cursor, db_conn, mapping)

specificity_classifier.py ADDED Viewed

	@@ -0,0 +1,212 @@

+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.pipeline import make_pipeline
+training_data = [  # (text, label) training pairs; labels used in this list: "Specific", "Heterogeneous Mixture", "Category"
+    ("Bananas", "Specific"),
+    ("misc groceries", "Heterogeneous Mixture"),
+    ("Produce", "Category"),
+    ("Xoconostle", "Specific"),
+    ("Banana, hot chocolate & chips", "Specific"),  # NOTE(review): multi-item text labelled "Specific" - confirm this is intended
+    ("assorted apples", "Specific"),
+    ("breakfast foods", "Heterogeneous Mixture"),
+    ("General Groceries", "Heterogeneous Mixture"),
+    ("Grocery Assortment", "Heterogeneous Mixture"),
+    ("Assorted Grocery Items", "Heterogeneous Mixture"),
+    ("Assorted Heterogeneous Mixture", "Heterogeneous Mixture"),
+    ("Assorted Non-Perishables", "Heterogeneous Mixture"),
+    ("Assorted Packaged Foods", "Heterogeneous Mixture"),
+    ("Assorted Perishables", "Heterogeneous Mixture"),
+    ("Box Heterogeneous Mixture", "Heterogeneous Mixture"),
+    ("Bundle Heterogeneous Mixture", "Heterogeneous Mixture"),
+    ("Collection Heterogeneous Mixture", "Heterogeneous Mixture"),
+    ("Combo Heterogeneous Mixture", "Heterogeneous Mixture"),
+    ("Food Item Collection", "Heterogeneous Mixture"),
+    ("Food Item Mix", "Heterogeneous Mixture"),
+    ("Food Variety Pack", "Heterogeneous Mixture"),
+    ("General Groceries", "Heterogeneous Mixture"),  # NOTE(review): exact duplicate of an earlier row - presumably unintentional; verify
+    ("Grocery Assortment", "Heterogeneous Mixture"),  # NOTE(review): exact duplicate of an earlier row - presumably unintentional; verify
+    ("Grocery Combo Pack", "Heterogeneous Mixture"),
+    ("Grocery Mix", "Heterogeneous Mixture"),
+    ("Grocery Selection", "Heterogeneous Mixture"),
+    ("Grocery Variety Box","Heterogeneous Mixture"),
+    ("Various Items", "Heterogeneous Mixture"),
+    ('almond', 'Specific'),
+    ('Almond - Fresh Almond', 'Specific'),
+    ('Apple ', 'Specific'),
+    ('Apple  Other', 'Specific'),
+    ('Apple - Granny Smith Apple', 'Specific'),
+    ('Apricot', 'Specific'),
+    ('Artichoke', 'Specific'),
+    ('asparagus', 'Specific'),
+    ('Assorted Beans and Lentils', 'Specific'),
+    ('Assorted Condiments and Sauces', 'Specific'),
+    ('Avocado', 'Specific'),
+    ('Baby Corn (10 lbs )', 'Specific'),
+    ('Banana', 'Specific'),
+    ('Banana - Burro Banana', 'Specific'),
+    ('Banana - Plantain', 'Specific'),
+    ('Banana - Thai Banana', 'Specific'),
+    ('Banana leaf', 'Specific'),
+    ('Basil', 'Specific'),
+    ('Basil - Thai Basil', 'Specific'),
+    ('Bean - Fava Bean', 'Specific'),
+    ('Bean - Garbanzo Bean', 'Specific'),
+    ('Bean - Green Bean', 'Specific'),
+    ('Bean Green Bean', 'Specific'),
+    ('Bean green beann', 'Specific'),
+    ('Bean Romano Bean', 'Specific'),
+    ('Bean- French Bean (10 Lbs )', 'Specific'),
+    ('Beet - Red Beet', 'Specific'),
+    ('Beet - Yellow Beet', 'Specific'),
+    ('beet Warter melon beet', 'Specific'),
+    ('bell peppers', 'Specific'),
+    ('Berry - Blackberry', 'Specific'),
+    ('Berry - Blueberry', 'Specific'),
+    ('Onion - White Onion', 'Specific'),
+    ('Onion Green onion  iceless', 'Specific'),
+    ('Onion-Mexican Green Onion', 'Specific'),
+    ('Orange ', 'Specific'),
+    ('Orange - Mandarin Orange', 'Specific'),
+    ('Orange - Minneola Orange', 'Specific'),
+    ('Orange Mandarine (10 Lbs )', 'Specific'),
+    ('Organic Ginger', 'Specific'),
+    ('Organic Mango', 'Specific'),
+    ('Organic Onion - Brown Onion', 'Specific'),
+    ('Beef Products Combo', 'Category'),
+    ('Collection Beef Products', 'Category'),
+    ('Beef Products Collection', 'Category'),
+    ('Selection Beef Products', 'Category'),
+    ('Beef Products Selection', 'Category'),
+    ('Bundle Beef Products', 'Category'),
+    ('Beef Products Bundle', 'Category'),
+    ('Pack Beef Products', 'Category'),
+    ('Beef Products Pack', 'Category'),
+    ('Box Beef Products', 'Category'),
+    ('Beef Products Box', 'Category'),
+    ('Various Beef Products', 'Category'),
+    ('Beef Products Various', 'Category'),
+    ('Miscellaneous Beef Products', 'Category'),
+    ('Assorted Beverages', 'Category'),
+    ('Beverages Assorted', 'Category'),
+    ('Mixed Beverages', 'Category'),
+    ('Beverages Mixed', 'Category'),
+    ('Variety Beverages', 'Category'),
+    ('Beverages Variety', 'Category'),
+    ('Combo Beverages', 'Category'),
+    ('Beverages Combo', 'Category'),
+    ('Collection Beverages', 'Category'),
+    ('Beverages Collection', 'Category'),
+    ('Selection Beverages', 'Category'),
+    ('Beverages Selection', 'Category'),
+    ('Bundle Beverages', 'Category'),
+    ('Beverages Bundle', 'Category'),
+    ('Pack Beverages', 'Category'),
+    ('Beverages Pack', 'Category'),
+    ('Box Beverages', 'Category'),
+    ('Beverages Box', 'Category'),
+    ('Various Beverages', 'Category'),
+    ('Beverages Various', 'Category'),
+    ('Miscellaneous Beverages', 'Category'),
+    ('Beverages Miscellaneous', 'Category'),
+    ('Misc Beverages', 'Category'),
+    ('Beverages Misc', 'Category'),
+    ('Mixture Beverages', 'Category'),
+    ('Beverages Mixture', 'Category'),
+    ('Bundle Breakfast Cereals', 'Category'),
+    ('Breakfast Cereals Bundle', 'Category'),
+    ('Pack Breakfast Cereals', 'Category'),
+    ('Breakfast Cereals Pack', 'Category'),
+    ('Box Breakfast Cereals', 'Category'),
+    ('Breakfast Cereals Box', 'Category'),
+    ('Various Breakfast Cereals', 'Category'),
+    ('Breakfast Cereals Various', 'Category'),
+    ('Assorted Breakfast Cereals', 'Category'),
+    ('Breakfast Cereals Assorted', 'Category'),
+    ('Miscellaneous Breakfast Cereals', 'Category'),
+    ('Breakfast Cereals Miscellaneous', 'Category'),
+    ('Misc Breakfast Cereals', 'Category'),
+    ('Breakfast Cereals Misc', 'Category'),
+    ('Mixture Breakfast Cereals', 'Category'),
+    ('Breakfast Cereals Mixture', 'Category'),
+    ('Mixed Breakfast Cereals', 'Category'),
+    ('Breakfast Cereals Mixed', 'Category'),
+    ('Variety Breakfast Cereals', 'Category'),
+    ('Pack Fats and Oils', 'Category'),
+    ('Fats and Oils Pack', 'Category'),
+    ('Box Fats and Oils', 'Category'),
+    ('Fats and Oils Box', 'Category'),
+    ('Various Fats and Oils', 'Category'),
+    ('Fats and Oils Various', 'Category'),
+    ('Miscellaneous Fats and Oils', 'Category'),
+    ('Meals, Entrees, and Side Dishes Various', 'Category'),
+    ('Miscellaneous Meals, Entrees, and Side Dishes', 'Category'),
+    ('Meals, Entrees, and Side Dishes Miscellaneous', 'Category'),
+    ('Misc Meals, Entrees, and Side Dishes', 'Category'),
+    ('Meals, Entrees, and Side Dishes Misc', 'Category'),
+    ('Mixture Meals, Entrees, and Side Dishes', 'Category'),
+    ('Meals, Entrees, and Side Dishes Mixture', 'Category'),
+    ('Misc Non-Food Item', 'Category'),
+    ('Nut and Seed Products Selection', 'Category'),
+    ('Bundle Nut and Seed Products', 'Category'),
+    ('Nut and Seed Products Bundle', 'Category'),
+    ('Pack Nut and Seed Products', 'Category'),
+    ('Nut and Seed Products Pack', 'Category'),
+    ('Box Nut and Seed Products', 'Category'),
+    ('Nut and Seed Products Box', 'Category'),
+    ('Various Nut and Seed Products', 'Category'),
+    ('Poultry Products Mixed', 'Category'),
+    ('Variety Poultry Products', 'Category'),
+    ('Poultry Products Variety', 'Category'),
+    ('Combo Poultry Products', 'Category'),
+    ('Poultry Products Combo', 'Category'),
+    ('Sausages and Luncheon Meats Various', 'Category'),
+    ('Miscellaneous Sausages and Luncheon Meats', 'Category'),
+    ('Sausages and Luncheon Meats Miscellaneous', 'Category'),
+    ('Misc Sausages and Luncheon Meats', 'Category'),
+    ('Sausages and Luncheon Meats Misc', 'Category'),
+    ('Selection Snacks', 'Category'),
+    ('Snacks Selection', 'Category'),
+    ('Bundle Snacks', 'Category'),
+    ('Snacks Bundle', 'Category'),
+    ('Pack Snacks', 'Category'),
+    ('Snacks Pack', 'Category'),
+    ('Box Snacks', 'Category'),
+    ('Pack Sweets', 'Category'),
+    ('Sweets Pack', 'Category'),
+    ('Box Sweets', 'Category'),
+    ('Sweets Box', 'Category'),
+    ('Various Sweets', 'Category'),
+    ('Sweets Various', 'Category'),
+    ('Miscellaneous Vegetables and Vegetable Products', 'Category'),
+    ('Vegetables and Vegetable Products Miscellaneous', 'Category'),
+    ('Misc Vegetables and Vegetable Products', 'Category'),
+    ('Vegetables and Vegetable Products Misc', 'Category'),
+]
+texts, labels = zip(*training_data)
+# Create a pipeline with TfidfVectorizer and RandomForestClassifier
+pipeline = make_pipeline(TfidfVectorizer(), RandomForestClassifier())
+# Train the model
+pipeline.fit(texts, labels)
+# Function to classify text using the trained model
+def classify_text_to_type(text):
+    return pipeline.predict([text])[0]
+from db.db_utils import get_connection
+db_conn = get_connection()
+db_cursor = db_conn.cursor()
+db_cursor.execute("SELECT input_word FROM mappings WHERE specificity IS NULL")
+results = db_cursor.fetchall()
+for row in results:
+    input_word = row[0]
+    specificity = classify_text_to_type(input_word)
+    db_cursor.execute("UPDATE mappings SET specificity = %s WHERE input_word = %s", (specificity, input_word))
+    db_conn.commit()
+db_conn.close()