# NOTE: three lines of Hugging Face Spaces UI residue ("Spaces:" / "Paused" /
# "Paused") were captured here during extraction; they are not part of the program.
import spacy
import re

# Load the spaCy transformer pipeline once at import time.
# NOTE(review): this is slow and requires the "en_core_web_trf" model to be
# installed (e.g. `python -m spacy download en_core_web_trf`) — confirm the
# deployment environment ships it.
nlp = spacy.load("en_core_web_trf")
def analyze_text(text):
    """Tokenize *text* with spaCy and group tokens into delimiter-separated items.

    Commas and slashes are treated as item delimiters. Slash positions in the
    original text are remembered so slash-separated fragments can later be
    recombined into single "a/b" items.

    Parameters
    ----------
    text : str
        The raw input string.

    Returns
    -------
    tuple[bool, str, list[str]]
        ``(is_single_noun_phrase, delimiter, combined_items)`` where
        ``is_single_noun_phrase`` is True when exactly one non-delimiter item
        was found, ``delimiter`` comes from :func:`determine_delimiter`, and
        ``combined_items`` holds the extracted item strings.
    """
    # Track the character offsets of slashes in the original text.
    original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]

    # Replace both delimiters with a comma so spaCy sees a uniform list.
    normalized_text = re.sub(r'[\/,]', ',', text)
    doc = nlp(normalized_text)

    # Debug output: one line per token with its POS / dependency attributes.
    for token in doc:
        print(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")

    items = []            # finished items interleaved with delimiter markers (',' or '/')
    current_item = []     # words of the item currently being accumulated
    current_position = 0  # search cursor into the ORIGINAL text
    root_noun_found = False

    for token in doc:
        # Locate the token in the original text so comma positions can be
        # compared against the recorded slash offsets.
        # NOTE(review): str.find returns -1 when spaCy's token text does not
        # appear verbatim in the raw text — offsets may then drift; confirm
        # inputs never trigger this.
        token_start = text.find(token.text, current_position)
        token_end = token_start + len(token.text)

        if token.pos_ == 'PUNCT' and token.text == ',':
            # A delimiter finalizes the current item once its head noun was seen.
            if root_noun_found:
                items.append(" ".join(current_item))
                current_item = []
                root_noun_found = False
            # Emit '/' if this comma was originally a slash, ',' otherwise.
            if token_start in original_slash_positions:
                items.append('/')
            else:
                items.append(',')
        else:
            if token.dep_ in ('compound', 'amod'):
                # Compound-noun parts and adjectives attach to the item being built.
                current_item.append(token.text)
            elif token.dep_ == 'ROOT' and token.pos_ == 'NOUN':
                current_item.append(token.text)
                root_noun_found = True
            elif token.dep_ == 'appos':
                # Appositions either extend the current item or start a new one.
                if current_item:
                    current_item.append(token.text)
                else:
                    current_item = [token.text]
                root_noun_found = True
            else:
                current_item.append(token.text)

        current_position = token_end

    # Flush the trailing item, if any.
    if current_item:
        items.append(" ".join(current_item))

    # Collapse consecutive word entries between delimiters into single strings;
    # keep '/' markers so slash-joined items can be recombined below.
    final_items = []
    temp_item = []
    for item in items:
        if item in (',', '/'):
            if temp_item:
                final_items.append("".join(temp_item).strip())
                temp_item = []
            if item == '/':
                final_items.append('/')
        else:
            temp_item.append(item + " ")
    if temp_item:
        final_items.append("".join(temp_item).strip())

    # Re-join items separated by slashes into single "a/b" items.
    combined_items = []
    i = 0
    while i < len(final_items):
        # BUGFIX: guard both indexings — the original raised IndexError on a
        # leading '/' (combined_items[-1] on an empty list) or a trailing '/'
        # (final_items[i + 1] past the end).
        if final_items[i] == '/' and combined_items and i + 1 < len(final_items):
            combined_items[-1] += '/' + final_items[i + 1]
            i += 2
        else:
            combined_items.append(final_items[i])
            i += 1

    # Exactly one non-delimiter item means the whole text is one noun phrase.
    non_delimiter_items = [item for item in combined_items if item not in (',', '/')]
    is_single_noun_phrase = len(non_delimiter_items) == 1
    delimiter = determine_delimiter(text)

    return is_single_noun_phrase, delimiter, combined_items
def determine_delimiter(text):
    """Return the delimiter most likely separating items in *text*.

    Preference order: '/' when slashes are at least as frequent as commas,
    then ',' when any commas exist, otherwise ' '.

    Parameters
    ----------
    text : str
        The raw input string.

    Returns
    -------
    str
        One of '/', ',' or ' '.
    """
    number_of_slashes = text.count('/')
    number_of_commas = text.count(',')
    # (Removed an unused space count that the original computed but never read.)
    if number_of_slashes > 0 and number_of_slashes >= number_of_commas:
        # Prefer slash over comma, since it's rarer.
        return '/'
    elif number_of_commas > 0:
        return ','
    else:
        return ' '
def extract_items(text):
    """Split *text* into items using the delimiter chosen by :func:`analyze_text`.

    Parameters
    ----------
    text : str
        The raw input string.

    Returns
    -------
    list[str]
        ``[text]`` unchanged when the analysis says it is a single noun
        phrase; otherwise the delimiter-split items.
    """
    is_single_noun_phrase, delimiter, _ = analyze_text(text)
    if is_single_noun_phrase:
        return [text]
    # BUGFIX: the original returned raw split fragments, so "a, b" yielded
    # " b" (leading space) and repeated delimiters yielded empty strings.
    # Strip each fragment and drop the empties.
    items = [item.strip() for item in text.split(delimiter)]
    return [item for item in items if item]