"""Split short free-text phrases into their individual items.

Slashes, commas, and whitespace-delimited ``&`` / ``and`` are treated as item
separators. spaCy's dependency parse is used to decide whether a phrase is a
single noun phrase or a list of several items.
"""

import logging
import re
from typing import List, Tuple

import spacy

# Load the spaCy model once at import time; analyze_text reuses it on every call.
nlp = spacy.load("en_core_web_trf")

# Delimiter patterns, compiled once instead of on every call.
_SLASH_OR_COMMA = re.compile(r'[\/,]')
# '&' and 'and' only count as delimiters when surrounded by whitespace, so
# names such as "AT&T" or words such as "sand" are left intact.
_SPACED_AMPERSAND = re.compile(r'\s+&\s+')
_SPACED_AND = re.compile(r'\s+and\s+')

# Maps the delimiter names returned by determine_delimiter to the pattern that
# must be used to split on them (a plain str.split would also cut inside words).
_SPACED_DELIMITER_PATTERNS = {
    '&': _SPACED_AMPERSAND,
    'and': _SPACED_AND,
}


def analyze_text(text: str) -> Tuple[bool, str, List[str]]:
    """Parse *text* with spaCy and group its tokens into items.

    All supported delimiters are first normalized to commas, the result is
    run through the spaCy pipeline, and every token is logged at INFO level
    on the root logger. Tokens are then accumulated into items that end at
    a comma or at a ROOT/appos token attached to the root.

    Args:
        text: The raw phrase to analyze.

    Returns:
        A tuple ``(is_single_noun_phrase, delimiter, items)`` where
        ``is_single_noun_phrase`` is True when exactly one non-empty item was
        found, ``delimiter`` is the result of ``determine_delimiter(text)``
        on the original text, and ``items`` is the list of stripped,
        non-empty item strings.
    """
    # Replace different delimiters with a uniform delimiter (comma)
    normalized_text = _SLASH_OR_COMMA.sub(',', text)
    # an ampersand with whitespace on both sides is a delimiter
    normalized_text = _SPACED_AMPERSAND.sub(',', normalized_text)
    # the word 'and' with whitespace on both sides is a delimiter
    normalized_text = _SPACED_AND.sub(',', normalized_text)

    doc = nlp(normalized_text)

    # Log tokens with their attributes (lazy %-args: formatted only if emitted)
    for token in doc:
        logging.info(
            "Text: %s, POS: %s, Dep: %s, Head: %s",
            token.text, token.pos_, token.dep_, token.head.text,
        )

    items = []
    current_item = []
    for token in doc:
        # A comma closes the item collected so far (if any) and is dropped.
        if token.pos_ == 'PUNCT' and token.text == ',':
            if current_item:
                items.append(" ".join(current_item))
                current_item = []
            continue

        # Every other token (compound, amod, ROOT, appos, anything else)
        # becomes part of the current item.
        current_item.append(token.text)

        # A ROOT/appos token whose head is the root also closes the item.
        # NOTE(review): this nesting was reconstructed from source whose
        # indentation had been lost - confirm against the original layout.
        if token.dep_ in ('ROOT', 'appos') and token.head.dep_ == 'ROOT':
            items.append(" ".join(current_item))
            current_item = []

    # Add the last item if it exists
    if current_item:
        items.append(" ".join(current_item))

    # Strip first, then drop blanks, so whitespace-only items cannot survive.
    stripped_items = (item.strip() for item in items)
    items = [item for item in stripped_items if item]

    # Decide single-vs-multiple on the cleaned list so the flag always agrees
    # with the items actually returned.
    is_single_noun_phrase = len(items) == 1
    delimiter = determine_delimiter(text)

    return is_single_noun_phrase, delimiter, items


def determine_delimiter(text: str) -> str:
    """Return the delimiter that *text* most plausibly uses.

    Checked in priority order:
      * ``'/'`` when there is at least one slash and at least as many
        slashes as commas (slash is preferred because it is rarer),
      * ``','`` when there is at least one comma,
      * ``'&'`` when ``' & '`` occurs,
      * ``'and'`` when ``' and '`` occurs,
      * ``' '`` otherwise (no delimiter found).
    """
    number_of_slashes = text.count('/')
    number_of_commas = text.count(',')

    if number_of_slashes > 0 and number_of_slashes >= number_of_commas:
        # prefer slash over comma, since it's rarer
        return '/'
    if number_of_commas > 0:
        return ','
    if ' & ' in text:
        return '&'
    if ' and ' in text:
        return 'and'
    return ' '


def has_delimiters(text: str) -> bool:
    """Return True when *text* contains any delimiter other than a space."""
    return determine_delimiter(text) != ' '


def extract_items(text: str) -> List[str]:
    """Split *text* into its items.

    When ``analyze_text`` reports a single noun phrase the original text is
    returned unchanged as a one-element list. Otherwise the text is split on
    the delimiter chosen by ``determine_delimiter``; only that one delimiter
    is used, even if the text also contains others.

    Returns:
        The list of stripped, non-empty items.
    """
    is_single_noun_phrase, delimiter, _ = analyze_text(text)
    if is_single_noun_phrase:
        return [text]

    # '&' and 'and' are only detected with surrounding spaces, so they must
    # also be split that way; a bare str.split('and') would cut "sand" in two.
    pattern = _SPACED_DELIMITER_PATTERNS.get(delimiter)
    if pattern is not None:
        parts = pattern.split(text)
    else:
        parts = text.split(delimiter)

    # Strip before filtering so whitespace-only pieces are removed as well.
    stripped_parts = (part.strip() for part in parts)
    return [part for part in stripped_parts if part]