brightly-ai / multi_food_item_detector.py
beweinreich's picture
export csv script
5983c9d
raw
history blame
No virus
3.29 kB
import re
import spacy
import logging
# Load the spaCy "en_core_web_trf" pipeline once, at import time, so that
# analyze_text() can reuse the same `nlp` object on every call.
nlp = spacy.load("en_core_web_trf")
def analyze_text(text):
    """Break *text* into candidate items using delimiters and spaCy's parse.

    Slashes, whitespace-delimited ampersands and the whitespace-delimited
    word "and" are rewritten to commas, the result is parsed with the
    module-level ``nlp`` pipeline, and tokens are grouped into items that
    are closed at each comma or at a ROOT/appos token attached to the root.

    Args:
        text: The raw description to analyse.

    Returns:
        A tuple ``(is_single_noun_phrase, delimiter, items)`` where
        ``items`` is the list of non-empty, stripped item strings,
        ``is_single_noun_phrase`` is True when exactly one item was found,
        and ``delimiter`` is the value of ``determine_delimiter(text)`` for
        the original, un-normalised text.
    """
    # Normalise every supported delimiter to a comma.
    normalized_text = text.replace('/', ',')
    # An ampersand only counts as a delimiter when it has whitespace on both
    # sides; a bare "&" inside a token (e.g. "M&Ms") must stay intact. This
    # matches how determine_delimiter() detects ' & '.
    normalized_text = re.sub(r'\s+&\s+', ',', normalized_text)
    # Likewise, the word 'and' is a delimiter only when whitespace-delimited.
    normalized_text = re.sub(r'\s+and\s+', ',', normalized_text)

    doc = nlp(normalized_text)

    # Log each token with the attributes the grouping below relies on.
    for token in doc:
        logging.info(
            "Text: %s, POS: %s, Dep: %s, Head: %s",
            token.text, token.pos_, token.dep_, token.head.text,
        )

    items = []
    current_item = []

    for token in doc:
        # A comma closes whatever item is currently being built.
        if token.pos_ == 'PUNCT' and token.text == ',':
            if current_item:
                items.append(" ".join(current_item))
                current_item = []
            continue

        # Modifiers, head nouns and every other token all extend the item.
        current_item.append(token.text)

        # A ROOT/appos token whose head is the root completes the item
        # (a ROOT token is its own head, so it always qualifies).
        if token.dep_ in ('ROOT', 'appos') and token.head.dep_ == 'ROOT':
            items.append(" ".join(current_item))
            current_item = []

    # Flush the trailing item, if any.
    if current_item:
        items.append(" ".join(current_item))

    # Clean up before counting so that blank fragments (e.g. items made only
    # of whitespace tokens) cannot make a single phrase look like several.
    items = [item.strip() for item in items]
    items = [item for item in items if item]

    is_single_noun_phrase = len(items) == 1
    delimiter = determine_delimiter(text)

    return is_single_noun_phrase, delimiter, items
def determine_delimiter(text):
    """Return the delimiter that most likely separates items in *text*.

    Candidates are checked in priority order: slash, comma, ``' & '`` and
    ``' and '``. The ampersand and the word "and" only count when they have
    a space on both sides.

    Args:
        text: The raw description to inspect.

    Returns:
        ``'/'``, ``','``, ``'&'`` or ``'and'`` for the first candidate that
        applies, or a single space ``' '`` when none of them is present.
    """
    number_of_slashes = text.count('/')
    number_of_commas = text.count(',')

    # Prefer slash over comma on a tie, since it is the rarer character.
    if number_of_slashes > 0 and number_of_slashes >= number_of_commas:
        return '/'
    if number_of_commas > 0:
        return ','
    if ' & ' in text:
        return '&'
    if ' and ' in text:
        return 'and'
    # No explicit delimiter found; a space is the "no delimiter" marker.
    return ' '
def has_delimiters(text):
    """Tell whether *text* contains any recognised item delimiter.

    ``determine_delimiter`` returns a single space when it finds no slash,
    comma, ``' & '`` or ``' and '``; any other result means a delimiter is
    present.
    """
    detected = determine_delimiter(text)
    no_delimiter_marker = ' '
    return detected != no_delimiter_marker
def extract_items(text):
    """Split *text* into its individual items.

    ``analyze_text`` decides whether the text is a single noun phrase and
    which delimiter it uses. A single phrase is returned unchanged as a
    one-element list; otherwise the text is split on the delimiter.

    Args:
        text: The raw description to split.

    Returns:
        A list of stripped, non-empty item strings, or ``[text]`` when the
        text is a single noun phrase.
    """
    is_single_noun_phrase, delimiter, _ = analyze_text(text)
    if is_single_noun_phrase:
        return [text]

    if delimiter in ('&', 'and'):
        # These delimiters are only detected when surrounded by whitespace,
        # so split on that form too; a plain str.split('and') would cut
        # words such as "sandwich" or "candy" apart.
        pieces = re.split(rf'\s+{re.escape(delimiter)}\s+', text)
    else:
        pieces = text.split(delimiter)

    items = []
    for piece in pieces:
        piece = piece.strip()
        # Drop a leading conjunction left over from mixed lists such as
        # "a, b, and c" or "a, b, & c".
        piece = re.sub(r'^& ', '', piece)
        piece = re.sub(r'^and ', '', piece)
        piece = piece.strip()
        # Filter after stripping so whitespace-only pieces are removed too.
        if piece:
            items.append(piece)
    return items