beweinreich committed on
Commit
306cc03
1 Parent(s): 88cdfd7

bugfix for multi-food-item detection

Browse files
Files changed (2) hide show
  1. multi_food_item_detector.py +8 -53
  2. playground.py +11 -59
multi_food_item_detector.py CHANGED
@@ -5,9 +5,6 @@ import re
5
  nlp = spacy.load("en_core_web_trf")
6
 
7
  def analyze_text(text):
8
- # Track the positions of slashes in the original text
9
- original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]
10
-
11
  # Replace different delimiters with a uniform delimiter (comma)
12
  normalized_text = re.sub(r'[\/,]', ',', text)
13
 
@@ -19,80 +16,38 @@ def analyze_text(text):
19
 
20
  items = []
21
  current_item = []
22
- current_position = 0
23
- root_noun_found = False
24
 
25
  for token in doc:
26
- token_start = text.find(token.text, current_position)
27
- token_end = token_start + len(token.text)
28
-
29
- # If the token is punctuation and a root noun has been found, finalize the current item
30
  if token.pos_ == 'PUNCT' and token.text == ',':
31
- if root_noun_found:
32
  items.append(" ".join(current_item))
33
  current_item = []
34
- root_noun_found = False
35
- # Check if the comma was originally a slash
36
- if token_start in original_slash_positions:
37
- items.append('/')
38
- else:
39
- items.append(',')
40
  else:
41
  # If token is part of a compound noun or an adjective, add to the current item
42
  if token.dep_ in ('compound', 'amod'):
43
  current_item.append(token.text)
44
- elif token.dep_ == 'ROOT' and token.pos_ == 'NOUN':
45
- current_item.append(token.text)
46
- root_noun_found = True
47
- elif token.dep_ == 'appos':
48
  if current_item:
49
  current_item.append(token.text)
50
  else:
51
  current_item = [token.text]
52
- root_noun_found = True
 
 
53
  else:
54
  current_item.append(token.text)
55
-
56
- current_position = token_end
57
 
58
  # Add the last item if it exists
59
  if current_item:
60
  items.append(" ".join(current_item))
61
-
62
- # Process items to handle delimiters correctly
63
- final_items = []
64
- temp_item = []
65
- for item in items:
66
- if item in [',', '/']:
67
- if temp_item:
68
- final_items.append("".join(temp_item).strip())
69
- temp_item = []
70
- if item == '/':
71
- final_items.append('/')
72
- else:
73
- temp_item.append(item + " ")
74
-
75
- if temp_item:
76
- final_items.append("".join(temp_item).strip())
77
-
78
- # Combine items separated by slashes into single items
79
- combined_items = []
80
- i = 0
81
- while i < len(final_items):
82
- if final_items[i] == '/':
83
- combined_items[-1] += '/' + final_items[i + 1]
84
- i += 2
85
- else:
86
- combined_items.append(final_items[i])
87
- i += 1
88
 
89
  # Determine if the text is a single noun phrase or multiple items
90
- non_delimiter_items = [item for item in combined_items if item not in [',', '/']]
91
- is_single_noun_phrase = len(non_delimiter_items) == 1
92
 
93
  delimiter = determine_delimiter(text)
94
 
95
- return is_single_noun_phrase, delimiter, combined_items
96
 
97
  def determine_delimiter(text):
98
  number_of_slashes = text.count('/')
 
5
  nlp = spacy.load("en_core_web_trf")
6
 
7
  def analyze_text(text):
 
 
 
8
  # Replace different delimiters with a uniform delimiter (comma)
9
  normalized_text = re.sub(r'[\/,]', ',', text)
10
 
 
16
 
17
  items = []
18
  current_item = []
 
 
19
 
20
  for token in doc:
21
+ # If the token is punctuation, finalize the current item
 
 
 
22
  if token.pos_ == 'PUNCT' and token.text == ',':
23
+ if current_item:
24
  items.append(" ".join(current_item))
25
  current_item = []
 
 
 
 
 
 
26
  else:
27
  # If token is part of a compound noun or an adjective, add to the current item
28
  if token.dep_ in ('compound', 'amod'):
29
  current_item.append(token.text)
30
+ elif token.dep_ in ('ROOT', 'appos'):
 
 
 
31
  if current_item:
32
  current_item.append(token.text)
33
  else:
34
  current_item = [token.text]
35
+ if token.head.dep_ == 'ROOT':
36
+ items.append(" ".join(current_item))
37
+ current_item = []
38
  else:
39
  current_item.append(token.text)
 
 
40
 
41
  # Add the last item if it exists
42
  if current_item:
43
  items.append(" ".join(current_item))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  # Determine if the text is a single noun phrase or multiple items
46
+ is_single_noun_phrase = len(items) == 1
 
47
 
48
  delimiter = determine_delimiter(text)
49
 
50
+ return is_single_noun_phrase, delimiter, items
51
 
52
  def determine_delimiter(text):
53
  number_of_slashes = text.count('/')
playground.py CHANGED
@@ -5,9 +5,6 @@ import re
5
  nlp = spacy.load("en_core_web_trf")
6
 
7
  def analyze_text(text):
8
- # Track the positions of slashes in the original text
9
- original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]
10
-
11
  # Replace different delimiters with a uniform delimiter (comma)
12
  normalized_text = re.sub(r'[\/,]', ',', text)
13
 
@@ -19,91 +16,46 @@ def analyze_text(text):
19
 
20
  items = []
21
  current_item = []
22
- current_position = 0
23
- root_noun_found = False
24
 
25
  for token in doc:
26
- token_start = text.find(token.text, current_position)
27
- token_end = token_start + len(token.text)
28
-
29
- # If the token is punctuation and a root noun has been found, finalize the current item
30
  if token.pos_ == 'PUNCT' and token.text == ',':
31
- if root_noun_found:
32
  items.append(" ".join(current_item))
33
  current_item = []
34
- root_noun_found = False
35
- # Check if the comma was originally a slash
36
- if token_start in original_slash_positions:
37
- items.append('/')
38
- else:
39
- items.append(',')
40
  else:
41
  # If token is part of a compound noun or an adjective, add to the current item
42
  if token.dep_ in ('compound', 'amod'):
43
  current_item.append(token.text)
44
- elif token.dep_ == 'ROOT' and token.pos_ == 'NOUN':
45
- current_item.append(token.text)
46
- root_noun_found = True
47
- elif token.dep_ == 'appos':
48
  if current_item:
49
  current_item.append(token.text)
50
  else:
51
  current_item = [token.text]
52
- root_noun_found = True
 
 
53
  else:
54
  current_item.append(token.text)
55
-
56
- current_position = token_end
57
 
58
  # Add the last item if it exists
59
  if current_item:
60
  items.append(" ".join(current_item))
61
 
62
- # Process items to handle delimiters correctly
63
- final_items = []
64
- temp_item = []
65
- for item in items:
66
- if item in [',', '/']:
67
- if temp_item:
68
- final_items.append("".join(temp_item).strip())
69
- temp_item = []
70
- if item == '/':
71
- final_items.append('/')
72
- else:
73
- temp_item.append(item + " ")
74
-
75
- if temp_item:
76
- final_items.append("".join(temp_item).strip())
77
-
78
- # Combine items separated by slashes into single items
79
- combined_items = []
80
- i = 0
81
- while i < len(final_items):
82
- if final_items[i] == '/':
83
- combined_items[-1] += '/' + final_items[i + 1]
84
- i += 2
85
- else:
86
- combined_items.append(final_items[i])
87
- i += 1
88
-
89
  # Determine if the text is a single noun phrase or multiple items
90
- non_delimiter_items = [item for item in combined_items if item not in [',', '/']]
91
- if len(non_delimiter_items) == 1:
92
  print("The text is a single noun phrase.")
93
  else:
94
  print("The text contains multiple items.")
95
 
96
- print("Items identified:", non_delimiter_items)
97
 
98
- # Example usage
99
  texts = [
100
- "apple",
101
- "italian squash, raw, unpeeled",
102
- "chocolate chips, bananas",
103
  "chocolate chips/bananas",
104
  "chocolate chips / bananas",
105
- "chocolate chips, bananas, 1/2 lb carrots",
106
- "pink berries/raw carrots/chcolate, raw/winter squash",
107
  ]
108
 
109
  for text in texts:
 
5
  nlp = spacy.load("en_core_web_trf")
6
 
7
  def analyze_text(text):
 
 
 
8
  # Replace different delimiters with a uniform delimiter (comma)
9
  normalized_text = re.sub(r'[\/,]', ',', text)
10
 
 
16
 
17
  items = []
18
  current_item = []
 
 
19
 
20
  for token in doc:
21
+ # If the token is punctuation, finalize the current item
 
 
 
22
  if token.pos_ == 'PUNCT' and token.text == ',':
23
+ if current_item:
24
  items.append(" ".join(current_item))
25
  current_item = []
 
 
 
 
 
 
26
  else:
27
  # If token is part of a compound noun or an adjective, add to the current item
28
  if token.dep_ in ('compound', 'amod'):
29
  current_item.append(token.text)
30
+ elif token.dep_ in ('ROOT', 'appos'):
 
 
 
31
  if current_item:
32
  current_item.append(token.text)
33
  else:
34
  current_item = [token.text]
35
+ if token.head.dep_ == 'ROOT':
36
+ items.append(" ".join(current_item))
37
+ current_item = []
38
  else:
39
  current_item.append(token.text)
 
 
40
 
41
  # Add the last item if it exists
42
  if current_item:
43
  items.append(" ".join(current_item))
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  # Determine if the text is a single noun phrase or multiple items
46
+ if len(items) == 1:
 
47
  print("The text is a single noun phrase.")
48
  else:
49
  print("The text contains multiple items.")
50
 
51
+ print("Items identified:", items)
52
 
53
+ # Example usages
54
  texts = [
55
+ "chocolate, bananas",
 
 
56
  "chocolate chips/bananas",
57
  "chocolate chips / bananas",
58
+ "chocolate chips, bananas, 1/2 lb carrots"
 
59
  ]
60
 
61
  for text in texts: