# NOTE: three lines of Hugging Face Spaces UI residue ("Spaces:" / "Paused" /
# "Paused") were captured here during extraction; they are not part of the program.
import spacy
import re

# Load the spaCy transformer pipeline once at import time.
# NOTE(review): this is slow and requires the "en_core_web_trf" model to be
# installed (e.g. `python -m spacy download en_core_web_trf`) — confirm the
# deployment environment ships it.
nlp = spacy.load("en_core_web_trf")
def analyze_text(text):
    """Tokenize *text* with spaCy and group tokens into delimiter-separated items.

    Commas and slashes are treated as item delimiters. Slash positions in the
    original text are remembered so slash-separated fragments can later be
    recombined into single "a/b" items.

    Parameters
    ----------
    text : str
        The raw input string.

    Returns
    -------
    tuple[bool, str, list[str]]
        ``(is_single_noun_phrase, delimiter, combined_items)`` where
        ``is_single_noun_phrase`` is True when exactly one non-delimiter item
        was found, ``delimiter`` comes from :func:`determine_delimiter`, and
        ``combined_items`` holds the extracted item strings.
    """
    # Track the character offsets of slashes in the original text.
    original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]

    # Replace both delimiters with a comma so spaCy sees a uniform list.
    normalized_text = re.sub(r'[\/,]', ',', text)
    doc = nlp(normalized_text)

    # Debug output: one line per token with its POS / dependency attributes.
    for token in doc:
        print(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")

    items = []            # finished items interleaved with delimiter markers (',' or '/')
    current_item = []     # words of the item currently being accumulated
    current_position = 0  # search cursor into the ORIGINAL text
    root_noun_found = False

    for token in doc:
        # Locate the token in the original text so comma positions can be
        # compared against the recorded slash offsets.
        # NOTE(review): str.find returns -1 when spaCy's token text does not
        # appear verbatim in the raw text — offsets may then drift; confirm
        # inputs never trigger this.
        token_start = text.find(token.text, current_position)
        token_end = token_start + len(token.text)

        if token.pos_ == 'PUNCT' and token.text == ',':
            # A delimiter finalizes the current item once its head noun was seen.
            if root_noun_found:
                items.append(" ".join(current_item))
                current_item = []
                root_noun_found = False
            # Emit '/' if this comma was originally a slash, ',' otherwise.
            if token_start in original_slash_positions:
                items.append('/')
            else:
                items.append(',')
        else:
            if token.dep_ in ('compound', 'amod'):
                # Compound-noun parts and adjectives attach to the item being built.
                current_item.append(token.text)
            elif token.dep_ == 'ROOT' and token.pos_ == 'NOUN':
                current_item.append(token.text)
                root_noun_found = True
            elif token.dep_ == 'appos':
                # Appositions either extend the current item or start a new one.
                if current_item:
                    current_item.append(token.text)
                else:
                    current_item = [token.text]
                root_noun_found = True
            else:
                current_item.append(token.text)

        current_position = token_end

    # Flush the trailing item, if any.
    if current_item:
        items.append(" ".join(current_item))

    # Collapse consecutive word entries between delimiters into single strings;
    # keep '/' markers so slash-joined items can be recombined below.
    final_items = []
    temp_item = []
    for item in items:
        if item in (',', '/'):
            if temp_item:
                final_items.append("".join(temp_item).strip())
                temp_item = []
            if item == '/':
                final_items.append('/')
        else:
            temp_item.append(item + " ")
    if temp_item:
        final_items.append("".join(temp_item).strip())

    # Re-join items separated by slashes into single "a/b" items.
    combined_items = []
    i = 0
    while i < len(final_items):
        # BUGFIX: guard both indexings — the original raised IndexError on a
        # leading '/' (combined_items[-1] on an empty list) or a trailing '/'
        # (final_items[i + 1] past the end).
        if final_items[i] == '/' and combined_items and i + 1 < len(final_items):
            combined_items[-1] += '/' + final_items[i + 1]
            i += 2
        else:
            combined_items.append(final_items[i])
            i += 1

    # Exactly one non-delimiter item means the whole text is one noun phrase.
    non_delimiter_items = [item for item in combined_items if item not in (',', '/')]
    is_single_noun_phrase = len(non_delimiter_items) == 1
    delimiter = determine_delimiter(text)

    return is_single_noun_phrase, delimiter, combined_items
def determine_delimiter(text):
    """Return the delimiter most likely separating items in *text*.

    Preference order: '/' when slashes are at least as frequent as commas,
    then ',' when any commas exist, otherwise ' '.

    Parameters
    ----------
    text : str
        The raw input string.

    Returns
    -------
    str
        One of '/', ',' or ' '.
    """
    number_of_slashes = text.count('/')
    number_of_commas = text.count(',')
    # (Removed an unused space count that the original computed but never read.)
    if number_of_slashes > 0 and number_of_slashes >= number_of_commas:
        # Prefer slash over comma, since it's rarer.
        return '/'
    elif number_of_commas > 0:
        return ','
    else:
        return ' '
def extract_items(text):
    """Split *text* into items using the delimiter chosen by :func:`analyze_text`.

    Parameters
    ----------
    text : str
        The raw input string.

    Returns
    -------
    list[str]
        ``[text]`` unchanged when the analysis says it is a single noun
        phrase; otherwise the delimiter-split items.
    """
    is_single_noun_phrase, delimiter, _ = analyze_text(text)
    if is_single_noun_phrase:
        return [text]
    # BUGFIX: the original returned raw split fragments, so "a, b" yielded
    # " b" (leading space) and repeated delimiters yielded empty strings.
    # Strip each fragment and drop the empties.
    items = [item.strip() for item in text.split(delimiter)]
    return [item for item in items if item]