beweinreich committed on
Commit
306cc03
1 Parent(s): 88cdfd7

bugfix for multi-food-item detection

Browse files
Files changed (2) hide show
  1. multi_food_item_detector.py +8 -53
  2. playground.py +11 -59
multi_food_item_detector.py CHANGED
@@ -5,9 +5,6 @@ import re
5
  nlp = spacy.load("en_core_web_trf")
6
 
7
  def analyze_text(text):
8
- # Track the positions of slashes in the original text
9
- original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]
10
-
11
  # Replace different delimiters with a uniform delimiter (comma)
12
  normalized_text = re.sub(r'[\/,]', ',', text)
13
 
@@ -19,80 +16,38 @@ def analyze_text(text):
19
 
20
  items = []
21
  current_item = []
22
- current_position = 0
23
- root_noun_found = False
24
 
25
  for token in doc:
26
- token_start = text.find(token.text, current_position)
27
- token_end = token_start + len(token.text)
28
-
29
- # If the token is punctuation and a root noun has been found, finalize the current item
30
  if token.pos_ == 'PUNCT' and token.text == ',':
31
- if root_noun_found:
32
  items.append(" ".join(current_item))
33
  current_item = []
34
- root_noun_found = False
35
- # Check if the comma was originally a slash
36
- if token_start in original_slash_positions:
37
- items.append('/')
38
- else:
39
- items.append(',')
40
  else:
41
  # If token is part of a compound noun or an adjective, add to the current item
42
  if token.dep_ in ('compound', 'amod'):
43
  current_item.append(token.text)
44
- elif token.dep_ == 'ROOT' and token.pos_ == 'NOUN':
45
- current_item.append(token.text)
46
- root_noun_found = True
47
- elif token.dep_ == 'appos':
48
  if current_item:
49
  current_item.append(token.text)
50
  else:
51
  current_item = [token.text]
52
- root_noun_found = True
 
 
53
  else:
54
  current_item.append(token.text)
55
-
56
- current_position = token_end
57
 
58
  # Add the last item if it exists
59
  if current_item:
60
  items.append(" ".join(current_item))
61
-
62
- # Process items to handle delimiters correctly
63
- final_items = []
64
- temp_item = []
65
- for item in items:
66
- if item in [',', '/']:
67
- if temp_item:
68
- final_items.append("".join(temp_item).strip())
69
- temp_item = []
70
- if item == '/':
71
- final_items.append('/')
72
- else:
73
- temp_item.append(item + " ")
74
-
75
- if temp_item:
76
- final_items.append("".join(temp_item).strip())
77
-
78
- # Combine items separated by slashes into single items
79
- combined_items = []
80
- i = 0
81
- while i < len(final_items):
82
- if final_items[i] == '/':
83
- combined_items[-1] += '/' + final_items[i + 1]
84
- i += 2
85
- else:
86
- combined_items.append(final_items[i])
87
- i += 1
88
 
89
  # Determine if the text is a single noun phrase or multiple items
90
- non_delimiter_items = [item for item in combined_items if item not in [',', '/']]
91
- is_single_noun_phrase = len(non_delimiter_items) == 1
92
 
93
  delimiter = determine_delimiter(text)
94
 
95
- return is_single_noun_phrase, delimiter, combined_items
96
 
97
  def determine_delimiter(text):
98
  number_of_slashes = text.count('/')
 
5
  nlp = spacy.load("en_core_web_trf")
6
 
7
  def analyze_text(text):
 
 
 
8
  # Replace different delimiters with a uniform delimiter (comma)
9
  normalized_text = re.sub(r'[\/,]', ',', text)
10
 
 
16
 
17
  items = []
18
  current_item = []
 
 
19
 
20
  for token in doc:
21
+ # If the token is punctuation, finalize the current item
 
 
 
22
  if token.pos_ == 'PUNCT' and token.text == ',':
23
+ if current_item:
24
  items.append(" ".join(current_item))
25
  current_item = []
 
 
 
 
 
 
26
  else:
27
  # If token is part of a compound noun or an adjective, add to the current item
28
  if token.dep_ in ('compound', 'amod'):
29
  current_item.append(token.text)
30
+ elif token.dep_ in ('ROOT', 'appos'):
 
 
 
31
  if current_item:
32
  current_item.append(token.text)
33
  else:
34
  current_item = [token.text]
35
+ if token.head.dep_ == 'ROOT':
36
+ items.append(" ".join(current_item))
37
+ current_item = []
38
  else:
39
  current_item.append(token.text)
 
 
40
 
41
  # Add the last item if it exists
42
  if current_item:
43
  items.append(" ".join(current_item))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  # Determine if the text is a single noun phrase or multiple items
46
+ is_single_noun_phrase = len(items) == 1
 
47
 
48
  delimiter = determine_delimiter(text)
49
 
50
+ return is_single_noun_phrase, delimiter, items
51
 
52
  def determine_delimiter(text):
53
  number_of_slashes = text.count('/')
playground.py CHANGED
@@ -5,9 +5,6 @@ import re
5
  nlp = spacy.load("en_core_web_trf")
6
 
7
  def analyze_text(text):
8
- # Track the positions of slashes in the original text
9
- original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]
10
-
11
  # Replace different delimiters with a uniform delimiter (comma)
12
  normalized_text = re.sub(r'[\/,]', ',', text)
13
 
@@ -19,91 +16,46 @@ def analyze_text(text):
19
 
20
  items = []
21
  current_item = []
22
- current_position = 0
23
- root_noun_found = False
24
 
25
  for token in doc:
26
- token_start = text.find(token.text, current_position)
27
- token_end = token_start + len(token.text)
28
-
29
- # If the token is punctuation and a root noun has been found, finalize the current item
30
  if token.pos_ == 'PUNCT' and token.text == ',':
31
- if root_noun_found:
32
  items.append(" ".join(current_item))
33
  current_item = []
34
- root_noun_found = False
35
- # Check if the comma was originally a slash
36
- if token_start in original_slash_positions:
37
- items.append('/')
38
- else:
39
- items.append(',')
40
  else:
41
  # If token is part of a compound noun or an adjective, add to the current item
42
  if token.dep_ in ('compound', 'amod'):
43
  current_item.append(token.text)
44
- elif token.dep_ == 'ROOT' and token.pos_ == 'NOUN':
45
- current_item.append(token.text)
46
- root_noun_found = True
47
- elif token.dep_ == 'appos':
48
  if current_item:
49
  current_item.append(token.text)
50
  else:
51
  current_item = [token.text]
52
- root_noun_found = True
 
 
53
  else:
54
  current_item.append(token.text)
55
-
56
- current_position = token_end
57
 
58
  # Add the last item if it exists
59
  if current_item:
60
  items.append(" ".join(current_item))
61
 
62
- # Process items to handle delimiters correctly
63
- final_items = []
64
- temp_item = []
65
- for item in items:
66
- if item in [',', '/']:
67
- if temp_item:
68
- final_items.append("".join(temp_item).strip())
69
- temp_item = []
70
- if item == '/':
71
- final_items.append('/')
72
- else:
73
- temp_item.append(item + " ")
74
-
75
- if temp_item:
76
- final_items.append("".join(temp_item).strip())
77
-
78
- # Combine items separated by slashes into single items
79
- combined_items = []
80
- i = 0
81
- while i < len(final_items):
82
- if final_items[i] == '/':
83
- combined_items[-1] += '/' + final_items[i + 1]
84
- i += 2
85
- else:
86
- combined_items.append(final_items[i])
87
- i += 1
88
-
89
  # Determine if the text is a single noun phrase or multiple items
90
- non_delimiter_items = [item for item in combined_items if item not in [',', '/']]
91
- if len(non_delimiter_items) == 1:
92
  print("The text is a single noun phrase.")
93
  else:
94
  print("The text contains multiple items.")
95
 
96
- print("Items identified:", non_delimiter_items)
97
 
98
- # Example usage
99
  texts = [
100
- "apple",
101
- "italian squash, raw, unpeeled",
102
- "chocolate chips, bananas",
103
  "chocolate chips/bananas",
104
  "chocolate chips / bananas",
105
- "chocolate chips, bananas, 1/2 lb carrots",
106
- "pink berries/raw carrots/chcolate, raw/winter squash",
107
  ]
108
 
109
  for text in texts:
 
5
  nlp = spacy.load("en_core_web_trf")
6
 
7
  def analyze_text(text):
 
 
 
8
  # Replace different delimiters with a uniform delimiter (comma)
9
  normalized_text = re.sub(r'[\/,]', ',', text)
10
 
 
16
 
17
  items = []
18
  current_item = []
 
 
19
 
20
  for token in doc:
21
+ # If the token is punctuation, finalize the current item
 
 
 
22
  if token.pos_ == 'PUNCT' and token.text == ',':
23
+ if current_item:
24
  items.append(" ".join(current_item))
25
  current_item = []
 
 
 
 
 
 
26
  else:
27
  # If token is part of a compound noun or an adjective, add to the current item
28
  if token.dep_ in ('compound', 'amod'):
29
  current_item.append(token.text)
30
+ elif token.dep_ in ('ROOT', 'appos'):
 
 
 
31
  if current_item:
32
  current_item.append(token.text)
33
  else:
34
  current_item = [token.text]
35
+ if token.head.dep_ == 'ROOT':
36
+ items.append(" ".join(current_item))
37
+ current_item = []
38
  else:
39
  current_item.append(token.text)
 
 
40
 
41
  # Add the last item if it exists
42
  if current_item:
43
  items.append(" ".join(current_item))
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  # Determine if the text is a single noun phrase or multiple items
46
+ if len(items) == 1:
 
47
  print("The text is a single noun phrase.")
48
  else:
49
  print("The text contains multiple items.")
50
 
51
+ print("Items identified:", items)
52
 
53
+ # Example usages
54
  texts = [
55
+ "chocolate, bananas",
 
 
56
  "chocolate chips/bananas",
57
  "chocolate chips / bananas",
58
+ "chocolate chips, bananas, 1/2 lb carrots"
 
59
  ]
60
 
61
  for text in texts: