brightly-ai / multi_food_item_detector.py
beweinreich's picture
prefer food items, but if there are none, then let's skip it
4a1be51
raw
history blame
No virus
1.75 kB
import spacy
import re
# Load the spaCy "en_core_web_trf" pipeline once, at import time; the
# functions in this module all reuse this shared `nlp` object.
nlp = spacy.load("en_core_web_trf")
def get_nouns(text):
    """Return the text of every token in *text* that spaCy tags as ``NOUN``.

    Args:
        text: The string to run through the module-level ``nlp`` pipeline.

    Returns:
        A list of token strings, in document order, whose ``pos_`` is
        ``"NOUN"``. Empty when no token carries that tag.
    """
    found = []
    for tok in nlp(text):
        if tok.pos_ == "NOUN":
            found.append(tok.text)
    return found
def extract_food_phrases(text):
    """Return the delimited pieces of *text* that contain at least one noun.

    The text is split on ``/`` when one is present, otherwise on ``,``.
    Text containing neither delimiter is returned unchanged as a
    single-element list, without running the spaCy pipeline.

    "Food" is only approximated here: a piece is kept when spaCy tags any
    of its tokens as ``NOUN``; no food vocabulary is consulted.

    Args:
        text: The raw description to split.

    Returns:
        The whitespace-stripped pieces that contain a noun, in their
        original order. May be empty when no piece contains one.
    """
    # Determine the delimiter; a slash takes precedence over a comma.
    if '/' in text:
        delimiter = '/'
    elif ',' in text:
        delimiter = ','
    else:
        # Not comma or slash delimited: return the text as is.
        # This is an edge case to be handled later.
        return [text]

    # Split once and strip here, so the pieces need no further cleanup.
    items = [item.strip() for item in text.split(delimiter)]

    # Keep a piece as soon as any of its tokens is tagged as a noun.
    return [
        item
        for item in items
        if any(token.pos_ == "NOUN" for token in nlp(item))
    ]
def extract_items(text):
    """Split *text* into items, preferring the ones that look like food.

    The text is split on ``/`` when one is present, otherwise on ``,``.
    Text containing neither delimiter is returned unchanged as a
    single-element list.

    Args:
        text: The raw description to split.

    Returns:
        The pieces reported by ``extract_food_phrases`` when it finds any;
        otherwise every whitespace-stripped piece of the split text.
    """
    # Determine the delimiter; a slash takes precedence over a comma.
    if '/' in text:
        delimiter = '/'
    elif ',' in text:
        delimiter = ','
    else:
        # Not comma or slash delimited: return the text as is.
        return [text]

    # Split the text using the identified delimiter.
    items = [item.strip() for item in text.split(delimiter)]

    # Prefer the food items; when none are found, fall back to all items
    # rather than returning nothing.
    food_items = extract_food_phrases(text)
    return food_items if food_items else items