import re
from functools import lru_cache


@lru_cache(maxsize=1)
def _get_nlp():
    """Load and cache the spaCy transformer pipeline on first use.

    Loading is deferred so that importing this module has no heavy side
    effects and does not require the model to be installed up front.
    """
    import spacy  # local import: heavy, optional dependency

    return spacy.load("en_core_web_trf")


def analyze_text(text, *, verbose=False):
    """Analyze *text* and segment it into noun-phrase items.

    Delimiters ('/' and ',') are normalized to commas for parsing, while the
    original positions of slashes are remembered so slash-joined items can be
    recombined afterwards (e.g. "a/b" stays one item "a/b").

    Parameters:
        text: the raw input string.
        verbose: keyword-only; when True, print each token's POS/dep/head
            for debugging (the original always printed; now opt-in).

    Returns:
        (is_single_noun_phrase, delimiter, combined_items) where
        is_single_noun_phrase is True when exactly one non-delimiter item was
        found, delimiter comes from determine_delimiter(text), and
        combined_items is the list of extracted item strings.
    """
    # Positions of '/' in the original text.  The normalization below replaces
    # each '/' with ',' (same length), so these indices are equally valid in
    # normalized_text.
    original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]

    # Replace different delimiters with a uniform delimiter (comma).
    normalized_text = re.sub(r'[\/,]', ',', text)
    doc = _get_nlp()(normalized_text)

    if verbose:
        # Debug dump of the parse; opt-in rather than unconditional.
        for token in doc:
            print(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")

    items = []
    current_item = []
    current_position = 0
    root_noun_found = False

    for token in doc:
        # BUG FIX: search normalized_text, not the original text.  Tokens come
        # from normalized_text; a ',' that used to be '/' does not exist at
        # that position in the original, so text.find() would return -1 or a
        # later, wrong index and the slash-position check below would fail.
        token_start = normalized_text.find(token.text, current_position)
        if token_start == -1:
            # Defensive: should not happen for tokenizer output, but do not
            # let a mismatch corrupt subsequent position tracking.
            token_start = current_position
        token_end = token_start + len(token.text)

        if token.pos_ == 'PUNCT' and token.text == ',':
            # A comma finalizes the current item once its head noun was seen.
            if root_noun_found:
                items.append(" ".join(current_item))
                current_item = []
                root_noun_found = False
                # Record whether the comma was originally a slash.
                if token_start in original_slash_positions:
                    items.append('/')
                else:
                    items.append(',')
        else:
            # Compound nouns and adjectives extend the current item.
            if token.dep_ in ('compound', 'amod'):
                current_item.append(token.text)
            elif token.dep_ == 'ROOT' and token.pos_ == 'NOUN':
                current_item.append(token.text)
                root_noun_found = True
            elif token.dep_ == 'appos':
                if current_item:
                    current_item.append(token.text)
                else:
                    current_item = [token.text]
                root_noun_found = True
            else:
                current_item.append(token.text)
        current_position = token_end

    # Add the last item if it exists.
    if current_item:
        items.append(" ".join(current_item))

    # Merge word runs between delimiter markers back into single strings.
    final_items = []
    temp_item = []
    for item in items:
        if item in (',', '/'):
            if temp_item:
                final_items.append("".join(temp_item).strip())
                temp_item = []
            if item == '/':
                final_items.append('/')
        else:
            temp_item.append(item + " ")
    if temp_item:
        final_items.append("".join(temp_item).strip())

    # Combine items separated by slashes into single items.
    combined_items = []
    i = 0
    while i < len(final_items):
        if final_items[i] == '/':
            # BUG FIX: the original did combined_items[-1] += ... blindly,
            # which raises IndexError on a leading '/' (empty combined_items)
            # or a trailing '/' (no final_items[i + 1]).  Guard both.
            if combined_items and i + 1 < len(final_items):
                combined_items[-1] += '/' + final_items[i + 1]
                i += 2
            else:
                i += 1
        else:
            combined_items.append(final_items[i])
            i += 1

    # Determine if the text is a single noun phrase or multiple items.
    non_delimiter_items = [item for item in combined_items if item not in (',', '/')]
    is_single_noun_phrase = len(non_delimiter_items) == 1
    delimiter = determine_delimiter(text)

    return is_single_noun_phrase, delimiter, combined_items


def determine_delimiter(text):
    """Return the delimiter character that best splits *text*.

    Preference order: '/' when slashes are present and at least as numerous
    as commas (slash is rarer, so its presence is more meaningful), then ','
    when any comma is present, else ' '.
    """
    number_of_slashes = text.count('/')
    number_of_commas = text.count(',')

    if number_of_slashes > 0 and number_of_slashes >= number_of_commas:
        # Prefer slash over comma, since its rarer.
        return '/'
    elif number_of_commas > 0:
        return ','
    else:
        return ' '


def extract_items(text):
    """Split *text* into items using the delimiter chosen by analyze_text.

    Returns [text] unchanged when the whole input parses as a single noun
    phrase; otherwise a naive split on the detected delimiter.
    """
    is_single_noun_phrase, delimiter, _ = analyze_text(text)

    if is_single_noun_phrase:
        return [text]
    return text.split(delimiter)