# Delimiter-aware noun-phrase analysis demo using spaCy.
# (Replaced stray editor-artifact lines "Spaces:" / "Paused" that were not code.)
import re

import spacy

# Load the transformer-based English pipeline once at import time; the
# model provides the POS tags and dependency labels used by analyze_text.
nlp = spacy.load("en_core_web_trf")
def analyze_text(text):
    """Split *text* into item phrases, honoring ',' and '/' delimiters.

    Prints per-token diagnostics from the spaCy parse, then reports whether
    the text is a single noun phrase or a list of items. Slash-joined
    alternatives (e.g. "chips/bananas") are kept together as one item.

    Args:
        text: The raw input string to analyze.
    """
    # Record where '/' occurs in the original text.  The normalization below
    # replaces each '/' with ',' (a 1-char-for-1-char substitution), so these
    # offsets remain valid in the normalized text as well.
    original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]
    # Replace different delimiters with a uniform delimiter (comma) so the
    # parser sees a consistent list syntax.
    normalized_text = re.sub(r'[\/,]', ',', text)
    doc = nlp(normalized_text)

    # Diagnostic dump of the parse.
    for token in doc:
        print(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")

    items = []
    current_item = []
    current_position = 0
    root_noun_found = False
    for token in doc:
        # BUG FIX: locate the token in normalized_text (the string the tokens
        # actually came from), not in the original text.  A ',' token that was
        # produced from a '/' does not exist in the original, so find()
        # returned -1 there: the slash check below always failed and
        # current_position was corrupted (-1 + 1 == 0) for every later token.
        token_start = normalized_text.find(token.text, current_position)
        token_end = token_start + len(token.text)
        # If the token is a delimiter comma, finalize the current item (when a
        # root noun has been found) and record which delimiter it originally was.
        if token.pos_ == 'PUNCT' and token.text == ',':
            if root_noun_found:
                items.append(" ".join(current_item))
                current_item = []
                root_noun_found = False
            # Check if the comma was originally a slash.
            if token_start in original_slash_positions:
                items.append('/')
            else:
                items.append(',')
        else:
            # Compound-noun parts and adjectives accumulate onto the current item.
            if token.dep_ in ('compound', 'amod'):
                current_item.append(token.text)
            elif token.dep_ == 'ROOT' and token.pos_ == 'NOUN':
                current_item.append(token.text)
                root_noun_found = True
            elif token.dep_ == 'appos':
                # An apposition can itself head an item (e.g. "raw" in a list).
                if current_item:
                    current_item.append(token.text)
                else:
                    current_item = [token.text]
                root_noun_found = True
            else:
                current_item.append(token.text)
        current_position = token_end

    # Add the last item if it exists.
    if current_item:
        items.append(" ".join(current_item))

    # Collapse the flat token/delimiter stream back into item strings,
    # keeping '/' markers so slash-joined items can be recombined below.
    final_items = []
    temp_item = []
    for item in items:
        if item in (',', '/'):
            if temp_item:
                final_items.append("".join(temp_item).strip())
                temp_item = []
            if item == '/':
                final_items.append('/')
        else:
            temp_item.append(item + " ")
    if temp_item:
        final_items.append("".join(temp_item).strip())

    # Combine items separated by slashes into single items.
    combined_items = []
    i = 0
    while i < len(final_items):
        # ROBUSTNESS: guard against a leading or trailing '/' marker, which
        # previously raised IndexError (empty combined_items / i+1 past end).
        if final_items[i] == '/' and combined_items and i + 1 < len(final_items):
            combined_items[-1] += '/' + final_items[i + 1]
            i += 2
        else:
            combined_items.append(final_items[i])
            i += 1

    # Determine if the text is a single noun phrase or multiple items.
    non_delimiter_items = [item for item in combined_items if item not in (',', '/')]
    if len(non_delimiter_items) == 1:
        print("The text is a single noun phrase.")
    else:
        print("The text contains multiple items.")
        print("Items identified:", non_delimiter_items)
# Example usage — guarded so importing this module does not run the demo
# (and does not force the expensive model/analysis work on importers).
if __name__ == "__main__":
    texts = [
        "apple",
        "italian squash, raw, unpeeled",
        "chocolate chips, bananas",
        "chocolate chips/bananas",
        "chocolate chips / bananas",
        "chocolate chips, bananas, 1/2 lb carrots",
        "pink berries/raw carrots/chcolate, raw/winter squash",
    ]
    for text in texts:
        print(f"Analyzing: {text}")
        analyze_text(text)
        print()