File size: 2,575 Bytes
b1c94e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306cc03
b1c94e2
306cc03
b1c94e2
 
 
 
 
 
306cc03
b1c94e2
 
 
 
306cc03
 
 
b1c94e2
 
 
 
 
 
73fda7b
b1c94e2
73fda7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1c94e2
73fda7b
 
 
 
b1c94e2
73fda7b
 
 
 
 
b1c94e2
306cc03
b1c94e2
73fda7b
b1c94e2
 
 
 
73fda7b
 
 
 
b1c94e2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import spacy
import re

# Load the spaCy transformer pipeline once at module import.
# NOTE(review): en_core_web_trf must be downloaded separately
# (python -m spacy download en_core_web_trf) — spacy.load raises OSError otherwise.
nlp = spacy.load("en_core_web_trf")

def analyze_text(text):
    """Parse *text* with spaCy and recover the list of candidate items.

    Slashes are first normalized to commas so the parser sees a single
    delimiter. Phrases are then accumulated from the dependency parse:
    modifiers (compound/amod) extend the current phrase, a ROOT/appos
    token may close it, and a comma always closes it.

    Returns:
        A tuple ``(is_single_noun_phrase, delimiter, items)`` where
        ``delimiter`` is the character ``determine_delimiter`` picked
        from the ORIGINAL (un-normalized) text.
    """
    # Replace different delimiters with a uniform delimiter (comma).
    unified = re.sub(r'[\/,]', ',', text)

    doc = nlp(unified)

    # Debug dump of the parse so misclassified tokens are easy to spot.
    for token in doc:
        print(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")

    phrases = []
    buffer = []

    for token in doc:
        # A comma finalizes whatever phrase is being accumulated.
        if token.pos_ == 'PUNCT' and token.text == ',':
            if buffer:
                phrases.append(" ".join(buffer))
                buffer = []
            continue

        dep = token.dep_
        if dep in ('compound', 'amod'):
            # Modifiers always attach to the phrase in progress.
            buffer.append(token.text)
        elif dep in ('ROOT', 'appos'):
            buffer.append(token.text)
            # A head that is itself the ROOT closes the phrase here.
            if token.head.dep_ == 'ROOT':
                phrases.append(" ".join(buffer))
                buffer = []
        else:
            buffer.append(token.text)

    # Flush a trailing phrase that no comma terminated.
    if buffer:
        phrases.append(" ".join(buffer))

    is_single_noun_phrase = len(phrases) == 1
    delimiter = determine_delimiter(text)

    return is_single_noun_phrase, delimiter, phrases
    
def determine_delimiter(text):
    """Guess which delimiter separates items in *text*.

    A slash wins over a comma when it appears at least as often, since
    slashes are rarer in prose and therefore a stronger delimiter
    signal. With neither present, a plain space is assumed.

    Args:
        text: The raw, un-normalized input string.

    Returns:
        One of ``'/'``, ``','`` or ``' '``.
    """
    # Fixed: dropped the unused number_of_spaces local the original computed.
    number_of_slashes = text.count('/')
    number_of_commas = text.count(',')

    if number_of_slashes > 0 and number_of_slashes >= number_of_commas:
        # prefer slash over comma, since its rarer
        return '/'
    if number_of_commas > 0:
        return ','
    # No explicit delimiter found: fall back to whitespace.
    return ' '

def extract_items(text):
    """Split *text* into a list of items.

    Delegates to ``analyze_text``: a text judged to be a single noun
    phrase is returned whole as a one-element list; otherwise the raw
    text is split on the delimiter that was detected.
    """
    single, delimiter, _ = analyze_text(text)
    return [text] if single else text.split(delimiter)

# Example usages exercising the full analysis pipeline.
texts = [
    "salads, sandwiches & sushi",
]

for sample in texts:
    print(f"Analyzing: {sample}")
    single, delim, found = analyze_text(sample)
    print(f"Is single noun phrase: {single}")
    print(f"Delimiter: {delim}")
    print(f"Items: {found}")
    print()