beweinreich committed
Commit
b1c94e2
1 Parent(s): e5e36ab

improvements to multi-item classifier, and adding dictionary data to mappings

Files changed (5)
  1. algo.py +20 -5
  2. app.py +3 -11
  3. db/db_utils.py +8 -0
  4. multi_food_item_detector.py +103 -47
  5. playground.py +112 -1
algo.py CHANGED
@@ -107,7 +107,7 @@ class Algo:
         # If it has equal number of commas and slashes, we'll go with slashes
 
         input_word_parts = extract_items(input_word)
-
+        print(f" - Extracted items: {input_word_parts}")
         mappings = []
         for part in input_word_parts:
             mapping = self.handle_single_item(part)
@@ -133,7 +133,7 @@
                 break
 
         dictionary_words = [mapping['dictionary_word'] for mapping in mappings]
-        print("dictionary words -> ", dictionary_words)
+
        if len(set(dictionary_words)) == 0:
            return {
                'input_word': input_word,
@@ -175,7 +175,7 @@
 
         # try the singular form of the word
         singular = self.pluralizer.pluralize(input_word_clean, 1)
-        mapping = get_mapping_from_db(self.db_cursor, singular)
+        mapping = self.wrap_mapping_with_dictionary_data(get_mapping_from_db(self.db_cursor, singular))
         if mapping:
             print(f" - Found mapping in db: {mapping}")
             return mapping
@@ -204,7 +204,7 @@
                 'food_nonfood_score': food_nonfood[1]
             }
             store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
-            return mapping
+            return self.wrap_mapping_with_dictionary_data(mapping)
 
         mapping = self.perform_mapping(input_word)
 
@@ -216,6 +216,22 @@
 
         print(f" - Storing new mapping to db: {mapping}")
         store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
+
+        return self.wrap_mapping_with_dictionary_data(mapping)
+
+    def wrap_mapping_with_dictionary_data(self, mapping):
+        if not mapping:
+            return mapping
+
+        dictionary_result = get_dictionary_data_from_db(self.db_cursor, mapping['dictionary_word'])
+
+        mapping.update({
+            'wweia_category': dictionary_result['wweia_category'],
+            'water_content': dictionary_result['water_content'],
+            'dry_matter_content': dictionary_result['dry_matter_content'],
+            'leakage': dictionary_result['leakage']
+        })
+
         return mapping
 
     def match_words(self, input_words, stream_results=False):
@@ -227,7 +243,6 @@
             print()
             print(f"Processing: {input_word}")
 
-            # if the word has a "," or "/" in it, let's skip it for now
             if ',' in input_word or '/' in input_word:
                 mapping = self.handle_multi_item(input_word)
             else:
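For clarity, a standalone sketch of what the new wrap_mapping_with_dictionary_data step does to a mapping. The sample values below are hypothetical; real rows come from get_mapping_from_db and get_dictionary_data_from_db.

# Hypothetical mapping row and dictionary row, shaped like the db results above
mapping = {'input_word': 'bananas', 'dictionary_word': 'banana'}
dictionary_result = {'wweia_category': 'Fruits', 'water_content': 75.0,
                     'dry_matter_content': 25.0, 'leakage': 0.05}

# The method copies dictionary-level attributes onto the per-input mapping
mapping.update({
    'wweia_category': dictionary_result['wweia_category'],
    'water_content': dictionary_result['water_content'],
    'dry_matter_content': dictionary_result['dry_matter_content'],
    'leakage': dictionary_result['leakage'],
})

print(mapping)
# {'input_word': 'bananas', 'dictionary_word': 'banana', 'wweia_category': 'Fruits',
#  'water_content': 75.0, 'dry_matter_content': 25.0, 'leakage': 0.05}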
app.py CHANGED
@@ -29,18 +29,10 @@ def process_input(input_text, csv_file):
     # Process the single input text
     results = algo.match_words([input_text])
 
-    # Fetch the dictionary data for each word
-    # This needs to be more performant, but its just for demo purposes / gradio
-    for result in results:
-        dictionary_word = result['dictionary_word']
-        dictionary_data = fetch_the_dictionary_data(dictionary_word)
-        print(dictionary_data)
-        result['dry_matter_content'] = dictionary_data['dry_matter_content'] if dictionary_data else None
-        result['water_content'] = dictionary_data['water_content'] if dictionary_data else None
-
-    df = pd.DataFrame(results, columns=["input_word", "cleaned_word", 'matching_word', 'dictionary_word', 'similarity_score', 'confidence_score', 'similar_words', 'is_food', 'food_nonfood_score', 'dry_matter_content', 'water_content'])
+    df = pd.DataFrame(results, columns=["input_word", "cleaned_word", 'matching_word', 'dictionary_word', 'wweia_category', 'dry_matter_content',
+                                        'water_content', 'similarity_score', 'confidence_score', 'similar_words', 'is_food', 'food_nonfood_score'])
     # Filter to only required columns
-    df_filtered = df[["input_word", "dictionary_word", "is_food", "dry_matter_content", "water_content", "similarity_score", "food_nonfood_score"]]
+    df_filtered = df[["input_word", "dictionary_word", "is_food", 'wweia_category', 'dry_matter_content', "water_content", "similarity_score", "food_nonfood_score"]]
     return df_filtered
 
 # Gradio interface
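One detail behind the DataFrame change above: pandas fills any column listed in columns= that is missing from the result dicts with NaN rather than raising, which is why the enrichment in algo.py must supply wweia_category and the content fields before this filter runs. A minimal sketch with made-up values:

import pandas as pd

# Made-up rows shaped like algo.match_words output after enrichment
results = [{'input_word': 'bananas', 'dictionary_word': 'banana', 'is_food': True,
            'wweia_category': 'Fruits', 'dry_matter_content': 25.0,
            'water_content': 75.0, 'similarity_score': 0.97,
            'food_nonfood_score': 0.99}]

df = pd.DataFrame(results, columns=['input_word', 'dictionary_word', 'is_food',
                                    'wweia_category', 'water_content', 'hypothetical_missing'])
print(df['hypothetical_missing'].isna().all())  # True: absent keys become NaN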
db/db_utils.py CHANGED
@@ -60,6 +60,14 @@ def get_mapping_from_db(cursor, cleaned_word):
         return dict(zip(columns, row))
     return None
 
+def get_dictionary_data_from_db(cursor, dictionary_word):
+    cursor.execute('SELECT * FROM dictionary WHERE description = %s', (dictionary_word,))
+    row = cursor.fetchone()
+    if row:
+        columns = [col[0] for col in cursor.description]
+        return dict(zip(columns, row))
+    return None
+
 def store_mapping_to_db(cursor, conn, mapping):
     try:
         cursor.execute('''
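A usage sketch for the new helper, assuming a psycopg2-style cursor (the %s paramstyle matches psycopg2; the DSN is a placeholder). Besides description, the columns read below are the ones algo.py consumes:

import psycopg2
from db.db_utils import get_dictionary_data_from_db

conn = psycopg2.connect('dbname=food user=postgres')  # placeholder DSN
cursor = conn.cursor()

row = get_dictionary_data_from_db(cursor, 'banana')  # hypothetical description value
if row:
    print(row['wweia_category'], row['water_content'], row['dry_matter_content'])
else:
    print('no dictionary entry for this description')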
multi_food_item_detector.py CHANGED
@@ -4,58 +4,114 @@ import re
 # Load the spaCy model
 nlp = spacy.load("en_core_web_trf")
 
-def get_nouns(text):
-    doc = nlp(text)
-    nouns = [token.text for token in doc if token.pos_ == "NOUN"]
-    return nouns
-
-def extract_food_phrases(text):
-    # Determine the delimiter
-    if '/' in text:
-        delimiter = '/'
-    elif ',' in text:
-        delimiter = ','
-    else:
-        # If it's not comma or slash delimited, return the text as is
-        # this will be an edge-case and we'll handle it later
-        return [text]
-
-    # Split the text using the identified delimiter
-    items = [item.strip() for item in text.split(delimiter)]
-
-    # Process each item to find food items
-    food_items = []
-    for item in items:
-        doc = nlp(item)
-        tokens = [token.text for token in doc]
-        # Check if any noun in the list of known nouns is present in the tokens
-        for token in doc:
-            if token.pos_ == "NOUN":
-                food_items.append(item.strip())
-                break
-
-    return food_items
-
-def extract_items(text):
-    # Determine the delimiter
-    if '/' in text:
-        delimiter = '/'
-    elif ',' in text:
-        delimiter = ','
-    else:
-        # If it's not comma or slash delimited, return the text as is
-        return [text]
-
-    # Split the text using the identified delimiter
-    items = [item.strip() for item in text.split(delimiter)]
-
-    # Get the food items
-    food_items = extract_food_phrases(text)
-    if len(food_items) > 0:
-        return food_items
-
-    # Find the items that were not matched as food items
-    non_food_items = [item for item in items if item not in food_items]
-
-    # Combine the food items and non_food_items
-    return food_items + non_food_items
+def analyze_text(text):
+    # Track the positions of slashes in the original text
+    original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]
+
+    # Replace different delimiters with a uniform delimiter (comma)
+    normalized_text = re.sub(r'[\/,]', ',', text)
+
+    doc = nlp(normalized_text)
+
+    # Print tokens with their attributes
+    for token in doc:
+        print(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")
+
+    items = []
+    current_item = []
+    current_position = 0
+    root_noun_found = False
+
+    for token in doc:
+        token_start = text.find(token.text, current_position)
+        token_end = token_start + len(token.text)
+
+        # If the token is punctuation and a root noun has been found, finalize the current item
+        if token.pos_ == 'PUNCT' and token.text == ',':
+            if root_noun_found:
+                items.append(" ".join(current_item))
+                current_item = []
+                root_noun_found = False
+            # Check if the comma was originally a slash
+            if token_start in original_slash_positions:
+                items.append('/')
+            else:
+                items.append(',')
+        else:
+            # If token is part of a compound noun or an adjective, add to the current item
+            if token.dep_ in ('compound', 'amod'):
+                current_item.append(token.text)
+            elif token.dep_ == 'ROOT' and token.pos_ == 'NOUN':
+                current_item.append(token.text)
+                root_noun_found = True
+            elif token.dep_ == 'appos':
+                if current_item:
+                    current_item.append(token.text)
+                else:
+                    current_item = [token.text]
+                    root_noun_found = True
+            else:
+                current_item.append(token.text)
+
+        current_position = token_end
+
+    # Add the last item if it exists
+    if current_item:
+        items.append(" ".join(current_item))
+
+    # Process items to handle delimiters correctly
+    final_items = []
+    temp_item = []
+    for item in items:
+        if item in [',', '/']:
+            if temp_item:
+                final_items.append("".join(temp_item).strip())
+                temp_item = []
+            if item == '/':
+                final_items.append('/')
+        else:
+            temp_item.append(item + " ")
+
+    if temp_item:
+        final_items.append("".join(temp_item).strip())
+
+    # Combine items separated by slashes into single items
+    combined_items = []
+    i = 0
+    while i < len(final_items):
+        if final_items[i] == '/':
+            combined_items[-1] += '/' + final_items[i + 1]
+            i += 2
+        else:
+            combined_items.append(final_items[i])
+            i += 1
+
+    # Determine if the text is a single noun phrase or multiple items
+    non_delimiter_items = [item for item in combined_items if item not in [',', '/']]
+    is_single_noun_phrase = len(non_delimiter_items) == 1
+
+    delimiter = determine_delimiter(text)
+
+    return is_single_noun_phrase, delimiter, combined_items
+
+def determine_delimiter(text):
+    number_of_slashes = text.count('/')
+    number_of_commas = text.count(',')
+    number_of_spaces = text.count(' ')
+
+    if number_of_slashes > 0 and number_of_slashes >= number_of_commas:
+        # prefer slash over comma, since it's rarer
+        return '/'
+    elif number_of_commas > 0:
+        return ','
+    else:
+        return ' '
+
+def extract_items(text):
+    is_single_noun_phrase, delimiter, combined_items = analyze_text(text)
+
+    if is_single_noun_phrase:
+        return [text]
+    else:
+        items = text.split(delimiter)
+        return items
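To make the rewritten flow concrete, a small driver for extract_items. Splits ultimately depend on how en_core_web_trf parses each string, so the comments describe intent rather than guaranteed output:

from multi_food_item_detector import extract_items  # loads the spaCy model on import

samples = [
    'apple',                                     # no delimiter: treated as one item
    'chocolate chips, bananas',                  # comma-delimited: two items expected
    'chocolate chips/bananas',                   # slash-delimited: '/' wins in determine_delimiter
    'chocolate chips, bananas, 1/2 lb carrots',  # more commas than slashes, so ',' is chosen
]

for text in samples:
    # analyze_text decides single vs. multiple; extract_items then splits
    # on the delimiter chosen by determine_delimiter
    print(text, '->', extract_items(text))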
playground.py CHANGED
@@ -1 +1,112 @@
-# Nothing here
+import spacy
+import re
+
+# Load the spaCy model
+nlp = spacy.load("en_core_web_trf")
+
+def analyze_text(text):
+    # Track the positions of slashes in the original text
+    original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]
+
+    # Replace different delimiters with a uniform delimiter (comma)
+    normalized_text = re.sub(r'[\/,]', ',', text)
+
+    doc = nlp(normalized_text)
+
+    # Print tokens with their attributes
+    for token in doc:
+        print(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")
+
+    items = []
+    current_item = []
+    current_position = 0
+    root_noun_found = False
+
+    for token in doc:
+        token_start = text.find(token.text, current_position)
+        token_end = token_start + len(token.text)
+
+        # If the token is punctuation and a root noun has been found, finalize the current item
+        if token.pos_ == 'PUNCT' and token.text == ',':
+            if root_noun_found:
+                items.append(" ".join(current_item))
+                current_item = []
+                root_noun_found = False
+            # Check if the comma was originally a slash
+            if token_start in original_slash_positions:
+                items.append('/')
+            else:
+                items.append(',')
+        else:
+            # If token is part of a compound noun or an adjective, add to the current item
+            if token.dep_ in ('compound', 'amod'):
+                current_item.append(token.text)
+            elif token.dep_ == 'ROOT' and token.pos_ == 'NOUN':
+                current_item.append(token.text)
+                root_noun_found = True
+            elif token.dep_ == 'appos':
+                if current_item:
+                    current_item.append(token.text)
+                else:
+                    current_item = [token.text]
+                    root_noun_found = True
+            else:
+                current_item.append(token.text)
+
+        current_position = token_end
+
+    # Add the last item if it exists
+    if current_item:
+        items.append(" ".join(current_item))
+
+    # Process items to handle delimiters correctly
+    final_items = []
+    temp_item = []
+    for item in items:
+        if item in [',', '/']:
+            if temp_item:
+                final_items.append("".join(temp_item).strip())
+                temp_item = []
+            if item == '/':
+                final_items.append('/')
+        else:
+            temp_item.append(item + " ")
+
+    if temp_item:
+        final_items.append("".join(temp_item).strip())
+
+    # Combine items separated by slashes into single items
+    combined_items = []
+    i = 0
+    while i < len(final_items):
+        if final_items[i] == '/':
+            combined_items[-1] += '/' + final_items[i + 1]
+            i += 2
+        else:
+            combined_items.append(final_items[i])
+            i += 1
+
+    # Determine if the text is a single noun phrase or multiple items
+    non_delimiter_items = [item for item in combined_items if item not in [',', '/']]
+    if len(non_delimiter_items) == 1:
+        print("The text is a single noun phrase.")
+    else:
+        print("The text contains multiple items.")
+
+    print("Items identified:", non_delimiter_items)
+
+# Example usage
+texts = [
+    "apple",
+    "italian squash, raw, unpeeled",
+    "chocolate chips, bananas",
+    "chocolate chips/bananas",
+    "chocolate chips / bananas",
+    "chocolate chips, bananas, 1/2 lb carrots",
+    "pink berries/raw carrots/chcolate, raw/winter squash",
+]
+
+for text in texts:
+    print(f"Analyzing: {text}")
+    analyze_text(text)
+    print()