Spaces:
Paused
Paused
Commit
•
306cc03
1
Parent(s):
88cdfd7
bugfix for multi-item items
Browse files
- multi_food_item_detector.py +8 -53
- playground.py +11 -59
multi_food_item_detector.py
CHANGED
@@ -5,9 +5,6 @@ import re
|
|
5 |
nlp = spacy.load("en_core_web_trf")
|
6 |
|
7 |
def analyze_text(text):
|
8 |
-
# Track the positions of slashes in the original text
|
9 |
-
original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]
|
10 |
-
|
11 |
# Replace different delimiters with a uniform delimiter (comma)
|
12 |
normalized_text = re.sub(r'[\/,]', ',', text)
|
13 |
|
@@ -19,80 +16,38 @@ def analyze_text(text):
|
|
19 |
|
20 |
items = []
|
21 |
current_item = []
|
22 |
-
current_position = 0
|
23 |
-
root_noun_found = False
|
24 |
|
25 |
for token in doc:
|
26 |
-
|
27 |
-
token_end = token_start + len(token.text)
|
28 |
-
|
29 |
-
# If the token is punctuation and a root noun has been found, finalize the current item
|
30 |
if token.pos_ == 'PUNCT' and token.text == ',':
|
31 |
-
if
|
32 |
items.append(" ".join(current_item))
|
33 |
current_item = []
|
34 |
-
root_noun_found = False
|
35 |
-
# Check if the comma was originally a slash
|
36 |
-
if token_start in original_slash_positions:
|
37 |
-
items.append('/')
|
38 |
-
else:
|
39 |
-
items.append(',')
|
40 |
else:
|
41 |
# If token is part of a compound noun or an adjective, add to the current item
|
42 |
if token.dep_ in ('compound', 'amod'):
|
43 |
current_item.append(token.text)
|
44 |
-
elif token.dep_
|
45 |
-
current_item.append(token.text)
|
46 |
-
root_noun_found = True
|
47 |
-
elif token.dep_ == 'appos':
|
48 |
if current_item:
|
49 |
current_item.append(token.text)
|
50 |
else:
|
51 |
current_item = [token.text]
|
52 |
-
|
|
|
|
|
53 |
else:
|
54 |
current_item.append(token.text)
|
55 |
-
|
56 |
-
current_position = token_end
|
57 |
|
58 |
# Add the last item if it exists
|
59 |
if current_item:
|
60 |
items.append(" ".join(current_item))
|
61 |
-
|
62 |
-
# Process items to handle delimiters correctly
|
63 |
-
final_items = []
|
64 |
-
temp_item = []
|
65 |
-
for item in items:
|
66 |
-
if item in [',', '/']:
|
67 |
-
if temp_item:
|
68 |
-
final_items.append("".join(temp_item).strip())
|
69 |
-
temp_item = []
|
70 |
-
if item == '/':
|
71 |
-
final_items.append('/')
|
72 |
-
else:
|
73 |
-
temp_item.append(item + " ")
|
74 |
-
|
75 |
-
if temp_item:
|
76 |
-
final_items.append("".join(temp_item).strip())
|
77 |
-
|
78 |
-
# Combine items separated by slashes into single items
|
79 |
-
combined_items = []
|
80 |
-
i = 0
|
81 |
-
while i < len(final_items):
|
82 |
-
if final_items[i] == '/':
|
83 |
-
combined_items[-1] += '/' + final_items[i + 1]
|
84 |
-
i += 2
|
85 |
-
else:
|
86 |
-
combined_items.append(final_items[i])
|
87 |
-
i += 1
|
88 |
|
89 |
# Determine if the text is a single noun phrase or multiple items
|
90 |
-
|
91 |
-
is_single_noun_phrase = len(non_delimiter_items) == 1
|
92 |
|
93 |
delimiter = determine_delimiter(text)
|
94 |
|
95 |
-
return is_single_noun_phrase, delimiter,
|
96 |
|
97 |
def determine_delimiter(text):
|
98 |
number_of_slashes = text.count('/')
|
|
|
5 |
nlp = spacy.load("en_core_web_trf")
|
6 |
|
7 |
def analyze_text(text):
|
|
|
|
|
|
|
8 |
# Replace different delimiters with a uniform delimiter (comma)
|
9 |
normalized_text = re.sub(r'[\/,]', ',', text)
|
10 |
|
|
|
16 |
|
17 |
items = []
|
18 |
current_item = []
|
|
|
|
|
19 |
|
20 |
for token in doc:
|
21 |
+
# If the token is punctuation, finalize the current item
|
|
|
|
|
|
|
22 |
if token.pos_ == 'PUNCT' and token.text == ',':
|
23 |
+
if current_item:
|
24 |
items.append(" ".join(current_item))
|
25 |
current_item = []
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
else:
|
27 |
# If token is part of a compound noun or an adjective, add to the current item
|
28 |
if token.dep_ in ('compound', 'amod'):
|
29 |
current_item.append(token.text)
|
30 |
+
elif token.dep_ in ('ROOT', 'appos'):
|
|
|
|
|
|
|
31 |
if current_item:
|
32 |
current_item.append(token.text)
|
33 |
else:
|
34 |
current_item = [token.text]
|
35 |
+
if token.head.dep_ == 'ROOT':
|
36 |
+
items.append(" ".join(current_item))
|
37 |
+
current_item = []
|
38 |
else:
|
39 |
current_item.append(token.text)
|
|
|
|
|
40 |
|
41 |
# Add the last item if it exists
|
42 |
if current_item:
|
43 |
items.append(" ".join(current_item))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
# Determine if the text is a single noun phrase or multiple items
|
46 |
+
is_single_noun_phrase = len(items) == 1
|
|
|
47 |
|
48 |
delimiter = determine_delimiter(text)
|
49 |
|
50 |
+
return is_single_noun_phrase, delimiter, items
|
51 |
|
52 |
def determine_delimiter(text):
|
53 |
number_of_slashes = text.count('/')
|
playground.py
CHANGED
@@ -5,9 +5,6 @@ import re
|
|
5 |
nlp = spacy.load("en_core_web_trf")
|
6 |
|
7 |
def analyze_text(text):
|
8 |
-
# Track the positions of slashes in the original text
|
9 |
-
original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]
|
10 |
-
|
11 |
# Replace different delimiters with a uniform delimiter (comma)
|
12 |
normalized_text = re.sub(r'[\/,]', ',', text)
|
13 |
|
@@ -19,91 +16,46 @@ def analyze_text(text):
|
|
19 |
|
20 |
items = []
|
21 |
current_item = []
|
22 |
-
current_position = 0
|
23 |
-
root_noun_found = False
|
24 |
|
25 |
for token in doc:
|
26 |
-
|
27 |
-
token_end = token_start + len(token.text)
|
28 |
-
|
29 |
-
# If the token is punctuation and a root noun has been found, finalize the current item
|
30 |
if token.pos_ == 'PUNCT' and token.text == ',':
|
31 |
-
if
|
32 |
items.append(" ".join(current_item))
|
33 |
current_item = []
|
34 |
-
root_noun_found = False
|
35 |
-
# Check if the comma was originally a slash
|
36 |
-
if token_start in original_slash_positions:
|
37 |
-
items.append('/')
|
38 |
-
else:
|
39 |
-
items.append(',')
|
40 |
else:
|
41 |
# If token is part of a compound noun or an adjective, add to the current item
|
42 |
if token.dep_ in ('compound', 'amod'):
|
43 |
current_item.append(token.text)
|
44 |
-
elif token.dep_
|
45 |
-
current_item.append(token.text)
|
46 |
-
root_noun_found = True
|
47 |
-
elif token.dep_ == 'appos':
|
48 |
if current_item:
|
49 |
current_item.append(token.text)
|
50 |
else:
|
51 |
current_item = [token.text]
|
52 |
-
|
|
|
|
|
53 |
else:
|
54 |
current_item.append(token.text)
|
55 |
-
|
56 |
-
current_position = token_end
|
57 |
|
58 |
# Add the last item if it exists
|
59 |
if current_item:
|
60 |
items.append(" ".join(current_item))
|
61 |
|
62 |
-
# Process items to handle delimiters correctly
|
63 |
-
final_items = []
|
64 |
-
temp_item = []
|
65 |
-
for item in items:
|
66 |
-
if item in [',', '/']:
|
67 |
-
if temp_item:
|
68 |
-
final_items.append("".join(temp_item).strip())
|
69 |
-
temp_item = []
|
70 |
-
if item == '/':
|
71 |
-
final_items.append('/')
|
72 |
-
else:
|
73 |
-
temp_item.append(item + " ")
|
74 |
-
|
75 |
-
if temp_item:
|
76 |
-
final_items.append("".join(temp_item).strip())
|
77 |
-
|
78 |
-
# Combine items separated by slashes into single items
|
79 |
-
combined_items = []
|
80 |
-
i = 0
|
81 |
-
while i < len(final_items):
|
82 |
-
if final_items[i] == '/':
|
83 |
-
combined_items[-1] += '/' + final_items[i + 1]
|
84 |
-
i += 2
|
85 |
-
else:
|
86 |
-
combined_items.append(final_items[i])
|
87 |
-
i += 1
|
88 |
-
|
89 |
# Determine if the text is a single noun phrase or multiple items
|
90 |
-
|
91 |
-
if len(non_delimiter_items) == 1:
|
92 |
print("The text is a single noun phrase.")
|
93 |
else:
|
94 |
print("The text contains multiple items.")
|
95 |
|
96 |
-
print("Items identified:",
|
97 |
|
98 |
-
# Example
|
99 |
texts = [
|
100 |
-
"
|
101 |
-
"italian squash, raw, unpeeled",
|
102 |
-
"chocolate chips, bananas",
|
103 |
"chocolate chips/bananas",
|
104 |
"chocolate chips / bananas",
|
105 |
-
"chocolate chips, bananas, 1/2 lb carrots"
|
106 |
-
"pink berries/raw carrots/chcolate, raw/winter squash",
|
107 |
]
|
108 |
|
109 |
for text in texts:
|
|
|
5 |
nlp = spacy.load("en_core_web_trf")
|
6 |
|
7 |
def analyze_text(text):
|
|
|
|
|
|
|
8 |
# Replace different delimiters with a uniform delimiter (comma)
|
9 |
normalized_text = re.sub(r'[\/,]', ',', text)
|
10 |
|
|
|
16 |
|
17 |
items = []
|
18 |
current_item = []
|
|
|
|
|
19 |
|
20 |
for token in doc:
|
21 |
+
# If the token is punctuation, finalize the current item
|
|
|
|
|
|
|
22 |
if token.pos_ == 'PUNCT' and token.text == ',':
|
23 |
+
if current_item:
|
24 |
items.append(" ".join(current_item))
|
25 |
current_item = []
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
else:
|
27 |
# If token is part of a compound noun or an adjective, add to the current item
|
28 |
if token.dep_ in ('compound', 'amod'):
|
29 |
current_item.append(token.text)
|
30 |
+
elif token.dep_ in ('ROOT', 'appos'):
|
|
|
|
|
|
|
31 |
if current_item:
|
32 |
current_item.append(token.text)
|
33 |
else:
|
34 |
current_item = [token.text]
|
35 |
+
if token.head.dep_ == 'ROOT':
|
36 |
+
items.append(" ".join(current_item))
|
37 |
+
current_item = []
|
38 |
else:
|
39 |
current_item.append(token.text)
|
|
|
|
|
40 |
|
41 |
# Add the last item if it exists
|
42 |
if current_item:
|
43 |
items.append(" ".join(current_item))
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
# Determine if the text is a single noun phrase or multiple items
|
46 |
+
if len(items) == 1:
|
|
|
47 |
print("The text is a single noun phrase.")
|
48 |
else:
|
49 |
print("The text contains multiple items.")
|
50 |
|
51 |
+
print("Items identified:", items)
|
52 |
|
53 |
+
# Example usages
|
54 |
texts = [
|
55 |
+
"chocolate, bananas",
|
|
|
|
|
56 |
"chocolate chips/bananas",
|
57 |
"chocolate chips / bananas",
|
58 |
+
"chocolate chips, bananas, 1/2 lb carrots"
|
|
|
59 |
]
|
60 |
|
61 |
for text in texts:
|