# Spaces: Paused
import logging
import re

import spacy
# Load the spaCy "en_core_web_trf" pipeline once at import time.
# analyze_text() calls this module-level object to parse every input.
nlp = spacy.load("en_core_web_trf")
def analyze_text(text):
    """Break *text* into candidate items using delimiters and the spaCy parse.

    Slashes, commas, a whitespace-delimited ``&`` and the whitespace-delimited
    word ``and`` are first rewritten to commas. The normalized string is then
    parsed with the module-level ``nlp`` pipeline and its tokens are grouped
    into items.

    Args:
        text: Raw input string.

    Returns:
        A 3-tuple ``(is_single_noun_phrase, delimiter, items)``:

        * ``is_single_noun_phrase`` -- True when exactly one item was
          collected from the parse.
        * ``delimiter`` -- result of ``determine_delimiter(text)`` on the
          original (non-normalized) text.
        * ``items`` -- the collected items, stripped, with blanks removed.
    """
    # Replace different delimiters with a uniform delimiter (comma)
    normalized_text = re.sub(r'[/,]', ',', text)
    # An ampersand counts as a delimiter only with whitespace on both sides,
    # so tokens such as "R&B" are left intact.
    normalized_text = re.sub(r'\s+&\s+', ',', normalized_text)
    # The word 'and' with whitespace on both sides is a delimiter
    normalized_text = re.sub(r'\s+and\s+', ',', normalized_text)

    doc = nlp(normalized_text)

    # Log tokens with their attributes; %-style arguments defer the string
    # formatting until a handler actually emits the record.
    for token in doc:
        logging.info(
            "Text: %s, POS: %s, Dep: %s, Head: %s",
            token.text, token.pos_, token.dep_, token.head.text,
        )

    items = []
    current_item = []
    for token in doc:
        if token.pos_ == 'PUNCT' and token.text == ',':
            # A comma closes whatever item is being built
            if current_item:
                items.append(" ".join(current_item))
                current_item = []
        elif token.dep_ in ('compound', 'amod'):
            # Modifier of a following noun: keep accumulating
            current_item.append(token.text)
        elif token.dep_ in ('ROOT', 'appos'):
            current_item.append(token.text)
            # The root itself, or a token attached directly to the root,
            # completes the current item
            if token.head.dep_ == 'ROOT':
                items.append(" ".join(current_item))
                current_item = []
        else:
            current_item.append(token.text)

    # Add the last item if it exists
    if current_item:
        items.append(" ".join(current_item))

    # Determine if the text is a single noun phrase or multiple items.
    # NOTE: this is counted before blank items are dropped below, so a
    # whitespace-only item still counts towards the total.
    is_single_noun_phrase = len(items) == 1

    delimiter = determine_delimiter(text)

    # Strip each item and remove empty strings
    stripped_items = (item.strip() for item in items)
    items = [item for item in stripped_items if item]

    return is_single_noun_phrase, delimiter, items
def determine_delimiter(text):
    """Pick the delimiter that separates the items in *text*.

    Candidates are checked in priority order: ``/``, ``,``, ``' & '`` and
    ``' and '`` (the last two must be surrounded by single spaces).

    Args:
        text: Raw input string.

    Returns:
        One of ``'/'``, ``','``, ``'&'``, ``'and'``, or ``' '`` when none of
        the candidates occurs in *text*.
    """
    slash_count = text.count('/')
    # Prefer slash over comma when it is at least as frequent, since it is
    # the rarer character.
    if slash_count > 0 and slash_count >= text.count(','):
        return '/'
    if ',' in text:
        return ','
    if ' & ' in text:
        return '&'
    if ' and ' in text:
        return 'and'
    return ' '
def has_delimiters(text):
    """Return True when *text* contains a recognized item delimiter.

    ``determine_delimiter`` falls back to a single space when it finds no
    delimiter, so any other result means one was found.
    """
    fallback = ' '
    found = determine_delimiter(text)
    return found != fallback
def extract_items(text):
    """Split *text* into its individual items.

    Args:
        text: Raw input string.

    Returns:
        ``[text]`` unchanged when ``analyze_text`` reports a single noun
        phrase; otherwise the pieces of *text* split on the delimiter
        reported by ``analyze_text``, stripped, with empty pieces removed.
    """
    is_single_noun_phrase, delimiter, _ = analyze_text(text)
    if is_single_noun_phrase:
        return [text]

    # 'and' and '&' only act as delimiters when surrounded by whitespace.
    # A plain str.split would also cut inside words ("candy" -> "c", "y")
    # or inside names such as "M&Ms".
    if delimiter == 'and':
        parts = re.split(r'\s+and\s+', text)
    elif delimiter == '&':
        parts = re.split(r'\s+&\s+', text)
    else:
        parts = text.split(delimiter)

    # Strip first, then drop empties, so whitespace-only pieces are removed
    stripped_parts = (part.strip() for part in parts)
    return [part for part in stripped_parts if part]