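"""Delimiter-aware item extraction.

Use spaCy's dependency parse to decide whether a short text is a single
noun phrase or a delimited list, and split it on the most likely delimiter
(slash, comma, ampersand, 'and', or whitespace).
"""
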
import re
import spacy
import logging

# Surface the INFO-level token log emitted in analyze_text; under the
# default WARNING level, logging.info() would be silent
logging.basicConfig(level=logging.INFO)

# Load the transformer-based spaCy English model
# (requires: python -m spacy download en_core_web_trf)
nlp = spacy.load("en_core_web_trf")

def analyze_text(text):
    """Return (is_single_noun_phrase, delimiter, items) for the given text."""
    # Normalize slashes and commas to a uniform delimiter (comma)
    normalized_text = re.sub(r'[/,]', ',', text)

    # An ampersand, with or without surrounding spaces, is a delimiter
    normalized_text = re.sub(r'\s*&\s*', ',', normalized_text)

    # The word 'and' with spaces on both sides is a delimiter
    normalized_text = re.sub(r'\s+and\s+', ',', normalized_text)
    
    doc = nlp(normalized_text)
    
    # Log each token with its part-of-speech and dependency attributes
    for token in doc:
        logging.info(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")
    
    items = []
    current_item = []
    
    # Walk the dependency parse, grouping modifiers with their heads
    for token in doc:
        # A comma finalizes the current item
        if token.pos_ == 'PUNCT' and token.text == ',':
            if current_item:
                items.append(" ".join(current_item))
                current_item = []
        else:
            # If token is part of a compound noun or an adjective, add to the current item
            if token.dep_ in ('compound', 'amod'):
                current_item.append(token.text)
            # A ROOT or appositive token can complete the current item
            elif token.dep_ in ('ROOT', 'appos'):
                if current_item:
                    current_item.append(token.text)
                else:
                    current_item = [token.text]
                if token.head.dep_ == 'ROOT':
                    items.append(" ".join(current_item))
                    current_item = []
            else:
                current_item.append(token.text)

    # Add the last item if it exists
    if current_item:
        items.append(" ".join(current_item))

    # Determine if the text is a single noun phrase or multiple items
    is_single_noun_phrase = len(items) == 1

    delimiter = determine_delimiter(text)

    items = [item.strip() for item in items]
    # remove empty strings
    items = [item for item in items if item]

    return is_single_noun_phrase, delimiter, items
    
def determine_delimiter(text):
    """Pick the most likely item delimiter in text, or ' ' if none is found."""
    number_of_slashes = text.count('/')
    number_of_commas = text.count(',')
    number_of_ampersands = text.count(' & ')
    number_of_ands = text.count(' and ')

    if number_of_slashes > 0 and number_of_slashes >= number_of_commas:
        # Prefer slash over comma, since it's rarer
        return '/'
    elif number_of_commas > 0:
        return ','
    # Return '&' and 'and' with their surrounding spaces so that splitting
    # on them cannot match inside a word (e.g. the 'and' in 'sandwich')
    elif number_of_ampersands > 0:
        return ' & '
    elif number_of_ands > 0:
        return ' and '
    else:
        return ' '

def has_delimiters(text):
    return determine_delimiter(text) != ' '

def extract_items(text):
    """Split text into items, unless the parse says it is one noun phrase."""
    is_single_noun_phrase, delimiter, _ = analyze_text(text)
    
    if is_single_noun_phrase:
        return [text]
    else:
        items = text.split(delimiter)
        # remove empty strings
        items = [item.strip() for item in items if item]
        return items
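
# Minimal usage sketch (hypothetical sample inputs, not part of the original file)
if __name__ == "__main__":
    for sample in ("salt, pepper and garlic", "peanut butter"):
        single, delimiter, parts = analyze_text(sample)
        print(f"{sample!r}: single_noun_phrase={single}, delimiter={delimiter!r}, items={parts}")
        print(f"extract_items -> {extract_items(sample)}")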