Spaces:
Paused
Paused
File size: 2,996 Bytes
9189e38 b72dd6f 9189e38 313433a 9189e38 b1c94e2 8e06613 b1c94e2 b72dd6f b1c94e2 306cc03 b1c94e2 306cc03 b1c94e2 306cc03 b1c94e2 306cc03 b1c94e2 9189e38 b1c94e2 ecfb899 b1c94e2 306cc03 b1c94e2 fb1ac97 c41a309 fb1ac97 306cc03 ecfb899 b1c94e2 9a6b725 b1c94e2 9a6b725 b1c94e2 d3d3a5b b1c94e2 7450395 ecfb899 b1c94e2 c41a309 e4c350f b1c94e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import re
import spacy
import logging
# Load the spaCy transformer-based English model once at import time.
# NOTE(review): module-level side effect — requires the en_core_web_trf
# package to be installed; loading is slow, so it is done exactly once.
nlp = spacy.load("en_core_web_trf")
def analyze_text(text):
    """Analyze *text* and decide whether it is one noun phrase or a list.

    The raw text's delimiters (slash, comma, " & ", " and ") are first
    normalized to commas so the parser sees a uniform list, then the
    spaCy dependency parse is walked to group tokens into items.

    Returns:
        tuple: ``(is_single_noun_phrase, delimiter, items)`` where
        ``is_single_noun_phrase`` is True when exactly one item was
        found, ``delimiter`` is the dominant delimiter in the raw text
        (see ``determine_delimiter``), and ``items`` is the list of
        non-empty, stripped item strings.
    """
    # Replace different delimiters with a uniform delimiter (comma).
    normalized_text = re.sub(r'[/,]', ',', text)
    # An ampersand with spaces on both sides is a delimiter.
    normalized_text = re.sub(r'\s*&\s*', ',', normalized_text)
    # The word 'and' with spaces on both sides is a delimiter.
    normalized_text = re.sub(r'\s+and\s+', ',', normalized_text)

    doc = nlp(normalized_text)

    # Log tokens with their attributes. Lazy %-args keep the string
    # formatting out of the hot path when INFO logging is disabled.
    for token in doc:
        logging.info("Text: %s, POS: %s, Dep: %s, Head: %s",
                     token.text, token.pos_, token.dep_, token.head.text)

    items = []
    current_item = []
    for token in doc:
        if token.pos_ == 'PUNCT' and token.text == ',':
            # A comma finalizes the current item.
            if current_item:
                items.append(" ".join(current_item))
                current_item = []
        elif token.dep_ in ('compound', 'amod'):
            # Compound nouns and adjectives extend the current item.
            current_item.append(token.text)
        elif token.dep_ in ('ROOT', 'appos'):
            if current_item:
                current_item.append(token.text)
            else:
                current_item = [token.text]
            # A token attached directly to the ROOT closes the item.
            if token.head.dep_ == 'ROOT':
                items.append(" ".join(current_item))
                current_item = []
        else:
            current_item.append(token.text)

    # Flush the last item if it exists.
    if current_item:
        items.append(" ".join(current_item))

    # Single noun phrase iff the parse produced exactly one item.
    is_single_noun_phrase = len(items) == 1
    delimiter = determine_delimiter(text)

    # Strip whitespace and remove empty strings.
    items = [item.strip() for item in items]
    items = [item for item in items if item]
    return is_single_noun_phrase, delimiter, items
def determine_delimiter(text):
    """Return the delimiter that dominates *text*.

    Preference order: ``'/'`` (wins ties with commas because it is the
    rarer character), then ``','``, then ``'&'`` (only when surrounded
    by spaces), then the word ``'and'`` (only when surrounded by
    spaces), falling back to a single space when no explicit delimiter
    is present.
    """
    number_of_slashes = text.count('/')
    number_of_commas = text.count(',')
    number_of_ampersands = text.count(' & ')
    number_of_ands = text.count(' and ')
    if number_of_slashes > 0 and number_of_slashes >= number_of_commas:
        # Prefer slash over comma, since it is rarer.
        return '/'
    if number_of_commas > 0:
        return ','
    if number_of_ampersands > 0:
        return '&'
    if number_of_ands > 0:
        return 'and'
    return ' '
def has_delimiters(text):
    """Return True when *text* contains an explicit delimiter.

    Any result from ``determine_delimiter`` other than the single-space
    fallback counts as an explicit delimiter.
    """
    delimiter = determine_delimiter(text)
    return delimiter != ' '
def extract_items(text):
    """Split *text* into its component items.

    Uses ``analyze_text`` to decide whether the text is a single noun
    phrase (returned unchanged as a one-element list) or a delimited
    list; in the latter case the raw text is split on the detected
    delimiter.

    Returns:
        list[str]: stripped, non-empty item strings.
    """
    is_single_noun_phrase, delimiter, _ = analyze_text(text)
    if is_single_noun_phrase:
        return [text]
    if delimiter == 'and':
        # BUG FIX: a plain text.split('and') splits inside words such
        # as "sandwich". Split only on whole-word 'and' surrounded by
        # whitespace, mirroring the normalization in analyze_text.
        parts = re.split(r'\s+and\s+', text)
    else:
        parts = text.split(delimiter)
    # Strip whitespace, then remove entries that are empty after
    # stripping (the old order kept whitespace-only fragments as '').
    return [item.strip() for item in parts if item.strip()]
|