Spaces:

madebybread
/

brightly-ai

Paused

App Files Files Community

brightly-ai / multi_food_item_detector.py

beweinreich

export csv script

5983c9d 24 days ago

raw

history blame

No virus

3.29 kB

	import re
	import spacy
	import logging

	# Load the spaCy "en_core_web_trf" pipeline once, at import time; analyze_text
	# reuses this module-level object for every call instead of reloading it.
	nlp = spacy.load("en_core_web_trf")

	def analyze_text(text):
	# Parse `text` with the module-level spaCy pipeline and group its tokens
	# into candidate items.
	#
	# Returns a 3-tuple (is_single_noun_phrase, delimiter, items):
	#   is_single_noun_phrase - True when exactly one item was collected
	#   delimiter             - result of determine_delimiter() on the ORIGINAL
	#                           text (not the comma-normalized copy)
	#   items                 - collected items, stripped, empty strings removed
	#
	# NOTE(review): the nested indentation of this function is not visible in
	# this copy of the file, so the comments below describe each statement
	# without asserting how deeply it is nested.

	# Replace different delimiters with a uniform delimiter (comma)
	normalized_text = re.sub(r'[\/,]', ',', text)

	# an ampersand with spaces on both sides is a delimiter
	normalized_text = re.sub(r'\s&\s', ',', normalized_text)

	# the word 'and' with spaces on both sides is a delimiter
	normalized_text = re.sub(r'\s+and\s+', ',', normalized_text)

	doc = nlp(normalized_text)

	# Log (at INFO level) each token's text, POS tag, dependency label and head
	for token in doc:
	logging.info(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")

	# `items` holds finished items; `current_item` holds the token texts of the
	# item currently being assembled.
	items = []
	current_item = []

	for token in doc:
	# If the token is punctuation, finalize the current item
	if token.pos_ == 'PUNCT' and token.text == ',':
	if current_item:
	items.append(" ".join(current_item))
	current_item = []
	else:
	# If token is part of a compound noun or an adjective, add to the current item
	if token.dep_ in ('compound', 'amod'):
	current_item.append(token.text)
	elif token.dep_ in ('ROOT', 'appos'):
	# Both branches below leave current_item ending with this token's text.
	if current_item:
	current_item.append(token.text)
	else:
	current_item = [token.text]
	# NOTE(review): cannot tell from this copy whether the check below is nested
	# under the preceding `else` or runs for every ROOT/appos token, nor which
	# `if` the following `else` pairs with - confirm against the real file.
	if token.head.dep_ == 'ROOT':
	items.append(" ".join(current_item))
	current_item = []
	else:
	current_item.append(token.text)

	# Add the last item if it exists
	if current_item:
	items.append(" ".join(current_item))

	# Determine if the text is a single noun phrase or multiple items
	# NOTE(review): this count is taken before the strip/empty-filter below, so
	# the flag can disagree with len(items) as returned if an item strips to "".
	is_single_noun_phrase = len(items) == 1

	# The delimiter is detected on the caller's original text.
	delimiter = determine_delimiter(text)

	items = [item.strip() for item in items]
	# remove empty strings
	items = [item for item in items if item]

	return is_single_noun_phrase, delimiter, items

	def determine_delimiter(text):
	    """Return the separator that most plausibly divides ``text`` into items.

	    Candidates are checked in priority order:

	    1. ``'/'``   - at least one slash and no fewer slashes than commas
	    2. ``','``   - at least one comma
	    3. ``'&'``   - an ampersand with a space on each side (``' & '``)
	    4. ``'and'`` - the word "and" with a space on each side (``' and '``)
	    5. ``' '``   - fallback when none of the above occur

	    Args:
	        text: The string to inspect.

	    Returns:
	        One of ``'/'``, ``','``, ``'&'``, ``'and'`` or ``' '``. For the last
	        three, detection requires surrounding spaces but the returned marker
	        does not include them.
	    """
	    slash_count = text.count('/')
	    comma_count = text.count(',')

	    # Prefer slash over comma on a tie, since a slash is the rarer character
	    # and therefore the stronger signal.
	    if slash_count > 0 and slash_count >= comma_count:
	        return '/'
	    if comma_count > 0:
	        return ','
	    # Only space-delimited forms count, so "M&Ms" or "sandwich" do not match.
	    if ' & ' in text:
	        return '&'
	    if ' and ' in text:
	        return 'and'
	    return ' '

	def has_delimiters(text):
	    """Report whether ``text`` contains any recognised item separator."""
	    # determine_delimiter returns a single space when nothing else matched.
	    detected = determine_delimiter(text)
	    return detected != ' '

	def extract_items(text):
	    """Split ``text`` into its individual item strings.

	    ``analyze_text`` decides whether the text is a single noun phrase and
	    which delimiter dominates it. A single noun phrase is returned unchanged
	    as a one-element list; otherwise the text is split on the delimiter and
	    each piece is cleaned up.

	    Args:
	        text: The raw description to split.

	    Returns:
	        A list of item strings. In the multi-item case every entry is
	        stripped, has a leading ``"& "`` / ``"and "`` removed, and is
	        non-empty.
	    """
	    is_single_noun_phrase, delimiter, _ = analyze_text(text)

	    if is_single_noun_phrase:
	        return [text]

	    # The 'and' / '&' markers are only reported when they occur surrounded by
	    # whitespace, so split on that same whitespace-bounded form. A plain
	    # str.split('and') would cut inside words such as "sandwich" or "candy",
	    # and str.split('&') would cut inside "M&Ms".
	    if delimiter == 'and':
	        pieces = re.split(r'\s+and\s+', text)
	    elif delimiter == '&':
	        pieces = re.split(r'\s&\s', text)
	    else:
	        pieces = text.split(delimiter)

	    items = []
	    for piece in pieces:
	        piece = piece.strip()
	        # A trailing conjunction in a list ("a, b, and c" / "a, b, & c")
	        # leaves "and c" / "& c" after splitting; drop the conjunction.
	        piece = re.sub(r'^& ', '', piece)
	        piece = re.sub(r'^and ', '', piece)
	        piece = piece.strip()
	        # Filter after stripping so whitespace-only pieces are discarded too.
	        if piece:
	            items.append(piece)

	    return items