brightly-ai / playground.py
beweinreich's picture
improvements to multi-item classifier, and adding dictionary data to mappings
b1c94e2
raw
history blame
No virus
3.65 kB
import spacy
import re
# Load the spaCy model once at import time; analyze_text() parses its input
# through this shared module-level `nlp` object.
nlp = spacy.load("en_core_web_trf")
def analyze_text(text):
    """Split *text* into item phrases and print whether it holds one or many.

    Commas and slashes are both treated as delimiters.  The text is parsed
    with the module-level spaCy pipeline ``nlp``; tokens are grouped into
    phrases, phrases joined by a slash in the original text are merged into
    one item (``"a/b"``), and the resulting items are printed.

    Args:
        text: The raw description to analyze, e.g. ``"chocolate chips/bananas"``.

    Returns:
        None. All results are written to stdout.

    NOTE(review): the nesting of three statements was reconstructed from a
    listing that had lost its indentation -- (a) the delimiter marker is
    recorded for every comma token, not only after a root noun, (b) an
    appositive always marks the item as having its head noun, and (c) the
    final "Items identified" line is printed in every case.  Confirm against
    the original file.
    """
    # Each '/' or ',' is replaced by exactly one ',' so the normalized text
    # has the same length as *text*; token offsets in the parsed document
    # therefore index straight back into the original string.
    normalized_text = re.sub(r'[/,]', ',', text)
    doc = nlp(normalized_text)

    # Print tokens with their attributes
    for token in doc:
        print(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")

    items = _collect_items(doc, text)
    phrases = _combine_slash_groups(_merge_between_delimiters(items))

    # Determine if the text is a single noun phrase or multiple items
    if not phrases:
        # Empty or delimiter-only input: do not report it as "multiple".
        print("No items identified.")
    elif len(phrases) == 1:
        print("The text is a single noun phrase.")
    else:
        print("The text contains multiple items.")
    print("Items identified:", phrases)


def _collect_items(doc, text):
    """Return phrases from *doc* interleaved with ',' and '/' delimiter markers."""
    items = []
    current_item = []
    root_noun_found = False
    for token in doc:
        if token.pos_ == 'PUNCT' and token.text == ',':
            # Only close the running phrase once it contains its head noun;
            # otherwise the words keep accumulating across the delimiter.
            if root_noun_found:
                items.append(" ".join(current_item))
                current_item = []
                root_noun_found = False
            # token.idx is the token's character offset in the parsed text,
            # which lines up 1:1 with *text*.  Searching *text* for ',' (as
            # the previous implementation did) can never land on a '/', so
            # slashes were silently reported as commas.
            items.append('/' if text[token.idx] == '/' else ',')
            continue

        # Every non-delimiter token joins the running phrase; a ROOT noun or
        # an appositive additionally marks the phrase as having its head.
        current_item.append(token.text)
        if (token.dep_ == 'ROOT' and token.pos_ == 'NOUN') or token.dep_ == 'appos':
            root_noun_found = True

    # Add the last item if it exists
    if current_item:
        items.append(" ".join(current_item))
    return items


def _merge_between_delimiters(items):
    """Join adjacent phrases, drop ',' markers and keep '/' markers."""
    merged = []
    pending = []
    for item in items:
        if item in (',', '/'):
            if pending:
                merged.append(" ".join(pending).strip())
                pending = []
            if item == '/':
                merged.append('/')
        else:
            pending.append(item)
    if pending:
        merged.append(" ".join(pending).strip())
    return merged


def _combine_slash_groups(parts):
    """Merge phrases separated by '/' markers into single 'a/b' items."""
    combined = []
    join_next = False
    for part in parts:
        if part == '/':
            # A leading slash has nothing to attach to, and a trailing or
            # repeated slash has nothing to add; both are ignored instead of
            # raising IndexError.
            join_next = bool(combined)
            continue
        if join_next:
            combined[-1] += '/' + part
        else:
            combined.append(part)
        join_next = False
    return combined
# Example usage: sample inputs covering single items, comma lists and
# slash-separated alternatives.
texts = [
    "apple",
    "italian squash, raw, unpeeled",
    "chocolate chips, bananas",
    "chocolate chips/bananas",
    "chocolate chips / bananas",
    "chocolate chips, bananas, 1/2 lb carrots",
    "pink berries/raw carrots/chcolate, raw/winter squash",
]

# Run the demo only when executed as a script, so importing this module does
# not trigger the analysis as a side effect.
if __name__ == "__main__":
    for text in texts:
        print(f"Analyzing: {text}")
        analyze_text(text)
        print()