beweinreich committed
Commit
b1c94e2
1 Parent(s): e5e36ab

improvements to multi-item classifier, and adding dictionary data to mappings

Files changed (5)
  1. algo.py +20 -5
  2. app.py +3 -11
  3. db/db_utils.py +8 -0
  4. multi_food_item_detector.py +103 -47
  5. playground.py +112 -1
algo.py CHANGED
@@ -107,7 +107,7 @@ class Algo:
         # If it has equal number of commas and slashes, we'll go with slashes
 
         input_word_parts = extract_items(input_word)
-
+        print(f" - Extracted items: {input_word_parts}")
         mappings = []
         for part in input_word_parts:
             mapping = self.handle_single_item(part)
@@ -133,7 +133,7 @@
                 break
 
         dictionary_words = [mapping['dictionary_word'] for mapping in mappings]
-        print("dictionary words -> ", dictionary_words)
+
        if len(set(dictionary_words)) == 0:
            return {
                'input_word': input_word,
@@ -175,7 +175,7 @@
 
         # try the singular form of the word
         singular = self.pluralizer.pluralize(input_word_clean, 1)
-        mapping = get_mapping_from_db(self.db_cursor, singular)
+        mapping = self.wrap_mapping_with_dictionary_data(get_mapping_from_db(self.db_cursor, singular))
         if mapping:
             print(f" - Found mapping in db: {mapping}")
             return mapping
@@ -204,7 +204,7 @@
                 'food_nonfood_score': food_nonfood[1]
             }
             store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
-            return mapping
+            return self.wrap_mapping_with_dictionary_data(mapping)
 
         mapping = self.perform_mapping(input_word)
 
@@ -216,6 +216,22 @@
 
         print(f" - Storing new mapping to db: {mapping}")
         store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
+
+        return self.wrap_mapping_with_dictionary_data(mapping)
+
+    def wrap_mapping_with_dictionary_data(self, mapping):
+        if not mapping:
+            return mapping
+
+        dictionary_result = get_dictionary_data_from_db(self.db_cursor, mapping['dictionary_word'])
+
+        mapping.update({
+            'wweia_category': dictionary_result['wweia_category'],
+            'water_content': dictionary_result['water_content'],
+            'dry_matter_content': dictionary_result['dry_matter_content'],
+            'leakage': dictionary_result['leakage']
+        })
+
         return mapping
 
     def match_words(self, input_words, stream_results=False):
@@ -227,7 +243,6 @@
             print()
             print(f"Processing: {input_word}")
 
-            # if the word has a "," or "/" in it, let's skip it for now
             if ',' in input_word or '/' in input_word:
                 mapping = self.handle_multi_item(input_word)
             else:
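For clarity, a standalone sketch of what the new wrap_mapping_with_dictionary_data step does to a mapping. The sample values below are hypothetical; real rows come from get_mapping_from_db and get_dictionary_data_from_db.

# Hypothetical mapping row and dictionary row, shaped like the db results above
mapping = {'input_word': 'bananas', 'dictionary_word': 'banana'}
dictionary_result = {'wweia_category': 'Fruits', 'water_content': 75.0,
                     'dry_matter_content': 25.0, 'leakage': 0.05}

# The method copies dictionary-level attributes onto the per-input mapping
mapping.update({
    'wweia_category': dictionary_result['wweia_category'],
    'water_content': dictionary_result['water_content'],
    'dry_matter_content': dictionary_result['dry_matter_content'],
    'leakage': dictionary_result['leakage'],
})

print(mapping)
# {'input_word': 'bananas', 'dictionary_word': 'banana', 'wweia_category': 'Fruits',
#  'water_content': 75.0, 'dry_matter_content': 25.0, 'leakage': 0.05}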
app.py CHANGED
@@ -29,18 +29,10 @@ def process_input(input_text, csv_file):
     # Process the single input text
     results = algo.match_words([input_text])
 
-    # Fetch the dictionary data for each word
-    # This needs to be more performant, but its just for demo purposes / gradio
-    for result in results:
-        dictionary_word = result['dictionary_word']
-        dictionary_data = fetch_the_dictionary_data(dictionary_word)
-        print(dictionary_data)
-        result['dry_matter_content'] = dictionary_data['dry_matter_content'] if dictionary_data else None
-        result['water_content'] = dictionary_data['water_content'] if dictionary_data else None
-
-    df = pd.DataFrame(results, columns=["input_word", "cleaned_word", 'matching_word', 'dictionary_word', 'similarity_score', 'confidence_score', 'similar_words', 'is_food', 'food_nonfood_score', 'dry_matter_content', 'water_content'])
+    df = pd.DataFrame(results, columns=["input_word", "cleaned_word", 'matching_word', 'dictionary_word', 'wweia_category', 'dry_matter_content',
+                                        'water_content', 'similarity_score', 'confidence_score', 'similar_words', 'is_food', 'food_nonfood_score'])
     # Filter to only required columns
-    df_filtered = df[["input_word", "dictionary_word", "is_food", "dry_matter_content", "water_content", "similarity_score", "food_nonfood_score"]]
+    df_filtered = df[["input_word", "dictionary_word", "is_food", 'wweia_category', 'dry_matter_content', "water_content", "similarity_score", "food_nonfood_score"]]
     return df_filtered
 
 # Gradio interface
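One detail behind the DataFrame change above: pandas fills any column listed in columns= that is missing from the result dicts with NaN rather than raising, which is why the enrichment in algo.py must supply wweia_category and the content fields before this filter runs. A minimal sketch with made-up values:

import pandas as pd

# Made-up rows shaped like algo.match_words output after enrichment
results = [{'input_word': 'bananas', 'dictionary_word': 'banana', 'is_food': True,
            'wweia_category': 'Fruits', 'dry_matter_content': 25.0,
            'water_content': 75.0, 'similarity_score': 0.97,
            'food_nonfood_score': 0.99}]

df = pd.DataFrame(results, columns=['input_word', 'dictionary_word', 'is_food',
                                    'wweia_category', 'water_content', 'hypothetical_missing'])
print(df['hypothetical_missing'].isna().all())  # True: absent keys become NaN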
db/db_utils.py CHANGED
@@ -60,6 +60,14 @@ def get_mapping_from_db(cursor, cleaned_word):
         return dict(zip(columns, row))
     return None
 
+def get_dictionary_data_from_db(cursor, dictionary_word):
+    cursor.execute('SELECT * FROM dictionary WHERE description = %s', (dictionary_word,))
+    row = cursor.fetchone()
+    if row:
+        columns = [col[0] for col in cursor.description]
+        return dict(zip(columns, row))
+    return None
+
 def store_mapping_to_db(cursor, conn, mapping):
     try:
         cursor.execute('''
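A usage sketch for the new helper, assuming a psycopg2-style cursor (the %s paramstyle matches psycopg2; the DSN is a placeholder). Besides description, the columns read below are the ones algo.py consumes:

import psycopg2
from db.db_utils import get_dictionary_data_from_db

conn = psycopg2.connect('dbname=food user=postgres')  # placeholder DSN
cursor = conn.cursor()

row = get_dictionary_data_from_db(cursor, 'banana')  # hypothetical description value
if row:
    print(row['wweia_category'], row['water_content'], row['dry_matter_content'])
else:
    print('no dictionary entry for this description')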
multi_food_item_detector.py CHANGED
@@ -4,58 +4,114 @@ import re
 # Load the spaCy model
 nlp = spacy.load("en_core_web_trf")
 
-def get_nouns(text):
-    doc = nlp(text)
-    nouns = [token.text for token in doc if token.pos_ == "NOUN"]
-    return nouns
-
-def extract_food_phrases(text):
-    # Determine the delimiter
-    if '/' in text:
-        delimiter = '/'
-    elif ',' in text:
-        delimiter = ','
-    else:
-        # If it's not comma or slash delimited, return the text as is
-        # this will be an edge-case and we'll handle it later
-        return [text]
-
-    # Split the text using the identified delimiter
-    items = [item.strip() for item in text.split(delimiter)]
-
-    # Process each item to find food items
-    food_items = []
-    for item in items:
-        doc = nlp(item)
-        tokens = [token.text for token in doc]
-        # Check if any noun in the list of known nouns is present in the tokens
-        for token in doc:
-            if token.pos_ == "NOUN":
-                food_items.append(item.strip())
-                break
-
-    return food_items
-
-def extract_items(text):
-    # Determine the delimiter
-    if '/' in text:
-        delimiter = '/'
-    elif ',' in text:
-        delimiter = ','
-    else:
-        # If it's not comma or slash delimited, return the text as is
-        return [text]
-
-    # Split the text using the identified delimiter
-    items = [item.strip() for item in text.split(delimiter)]
-
-    # Get the food items
-    food_items = extract_food_phrases(text)
-    if len(food_items) > 0:
-        return food_items
-
-    # Find the items that were not matched as food items
-    non_food_items = [item for item in items if item not in food_items]
-
-    # Combine the food items and non_food_items
-    return food_items + non_food_items
+def analyze_text(text):
+    # Track the positions of slashes in the original text
+    original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]
+
+    # Replace different delimiters with a uniform delimiter (comma)
+    normalized_text = re.sub(r'[\/,]', ',', text)
+
+    doc = nlp(normalized_text)
+
+    # Print tokens with their attributes
+    for token in doc:
+        print(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")
+
+    items = []
+    current_item = []
+    current_position = 0
+    root_noun_found = False
+
+    for token in doc:
+        token_start = text.find(token.text, current_position)
+        token_end = token_start + len(token.text)
+
+        # If the token is punctuation and a root noun has been found, finalize the current item
+        if token.pos_ == 'PUNCT' and token.text == ',':
+            if root_noun_found:
+                items.append(" ".join(current_item))
+                current_item = []
+                root_noun_found = False
+            # Check if the comma was originally a slash
+            if token_start in original_slash_positions:
+                items.append('/')
+            else:
+                items.append(',')
+        else:
+            # If token is part of a compound noun or an adjective, add to the current item
+            if token.dep_ in ('compound', 'amod'):
+                current_item.append(token.text)
+            elif token.dep_ == 'ROOT' and token.pos_ == 'NOUN':
+                current_item.append(token.text)
+                root_noun_found = True
+            elif token.dep_ == 'appos':
+                if current_item:
+                    current_item.append(token.text)
+                else:
+                    current_item = [token.text]
+                    root_noun_found = True
+            else:
+                current_item.append(token.text)
+
+        current_position = token_end
+
+    # Add the last item if it exists
+    if current_item:
+        items.append(" ".join(current_item))
+
+    # Process items to handle delimiters correctly
+    final_items = []
+    temp_item = []
+    for item in items:
+        if item in [',', '/']:
+            if temp_item:
+                final_items.append("".join(temp_item).strip())
+                temp_item = []
+            if item == '/':
+                final_items.append('/')
+        else:
+            temp_item.append(item + " ")
+
+    if temp_item:
+        final_items.append("".join(temp_item).strip())
+
+    # Combine items separated by slashes into single items
+    combined_items = []
+    i = 0
+    while i < len(final_items):
+        if final_items[i] == '/':
+            combined_items[-1] += '/' + final_items[i + 1]
+            i += 2
+        else:
+            combined_items.append(final_items[i])
+            i += 1
+
+    # Determine if the text is a single noun phrase or multiple items
+    non_delimiter_items = [item for item in combined_items if item not in [',', '/']]
+    is_single_noun_phrase = len(non_delimiter_items) == 1
+
+    delimiter = determine_delimiter(text)
+
+    return is_single_noun_phrase, delimiter, combined_items
+
+def determine_delimiter(text):
+    number_of_slashes = text.count('/')
+    number_of_commas = text.count(',')
+    number_of_spaces = text.count(' ')
+
+    if number_of_slashes > 0 and number_of_slashes >= number_of_commas:
+        # prefer slash over comma, since it's rarer
+        return '/'
+    elif number_of_commas > 0:
+        return ','
+    else:
+        return ' '
+
+def extract_items(text):
+    is_single_noun_phrase, delimiter, combined_items = analyze_text(text)
+
+    if is_single_noun_phrase:
+        return [text]
+    else:
+        items = text.split(delimiter)
+        return items
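To make the rewritten flow concrete, a small driver for extract_items. Splits ultimately depend on how en_core_web_trf parses each string, so the comments describe intent rather than guaranteed output:

from multi_food_item_detector import extract_items  # loads the spaCy model on import

samples = [
    'apple',                                     # no delimiter: treated as one item
    'chocolate chips, bananas',                  # comma-delimited: two items expected
    'chocolate chips/bananas',                   # slash-delimited: '/' wins in determine_delimiter
    'chocolate chips, bananas, 1/2 lb carrots',  # more commas than slashes, so ',' is chosen
]

for text in samples:
    # analyze_text decides single vs. multiple; extract_items then splits
    # on the delimiter chosen by determine_delimiter
    print(text, '->', extract_items(text))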
playground.py CHANGED
@@ -1 +1,112 @@
-# Nothing here
+import spacy
+import re
+
+# Load the spaCy model
+nlp = spacy.load("en_core_web_trf")
+
+def analyze_text(text):
+    # Track the positions of slashes in the original text
+    original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]
+
+    # Replace different delimiters with a uniform delimiter (comma)
+    normalized_text = re.sub(r'[\/,]', ',', text)
+
+    doc = nlp(normalized_text)
+
+    # Print tokens with their attributes
+    for token in doc:
+        print(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")
+
+    items = []
+    current_item = []
+    current_position = 0
+    root_noun_found = False
+
+    for token in doc:
+        token_start = text.find(token.text, current_position)
+        token_end = token_start + len(token.text)
+
+        # If the token is punctuation and a root noun has been found, finalize the current item
+        if token.pos_ == 'PUNCT' and token.text == ',':
+            if root_noun_found:
+                items.append(" ".join(current_item))
+                current_item = []
+                root_noun_found = False
+            # Check if the comma was originally a slash
+            if token_start in original_slash_positions:
+                items.append('/')
+            else:
+                items.append(',')
+        else:
+            # If token is part of a compound noun or an adjective, add to the current item
+            if token.dep_ in ('compound', 'amod'):
+                current_item.append(token.text)
+            elif token.dep_ == 'ROOT' and token.pos_ == 'NOUN':
+                current_item.append(token.text)
+                root_noun_found = True
+            elif token.dep_ == 'appos':
+                if current_item:
+                    current_item.append(token.text)
+                else:
+                    current_item = [token.text]
+                    root_noun_found = True
+            else:
+                current_item.append(token.text)
+
+        current_position = token_end
+
+    # Add the last item if it exists
+    if current_item:
+        items.append(" ".join(current_item))
+
+    # Process items to handle delimiters correctly
+    final_items = []
+    temp_item = []
+    for item in items:
+        if item in [',', '/']:
+            if temp_item:
+                final_items.append("".join(temp_item).strip())
+                temp_item = []
+            if item == '/':
+                final_items.append('/')
+        else:
+            temp_item.append(item + " ")
+
+    if temp_item:
+        final_items.append("".join(temp_item).strip())
+
+    # Combine items separated by slashes into single items
+    combined_items = []
+    i = 0
+    while i < len(final_items):
+        if final_items[i] == '/':
+            combined_items[-1] += '/' + final_items[i + 1]
+            i += 2
+        else:
+            combined_items.append(final_items[i])
+            i += 1
+
+    # Determine if the text is a single noun phrase or multiple items
+    non_delimiter_items = [item for item in combined_items if item not in [',', '/']]
+    if len(non_delimiter_items) == 1:
+        print("The text is a single noun phrase.")
+    else:
+        print("The text contains multiple items.")
+
+    print("Items identified:", non_delimiter_items)
+
+# Example usage
+texts = [
+    "apple",
+    "italian squash, raw, unpeeled",
+    "chocolate chips, bananas",
+    "chocolate chips/bananas",
+    "chocolate chips / bananas",
+    "chocolate chips, bananas, 1/2 lb carrots",
+    "pink berries/raw carrots/chcolate, raw/winter squash",
+]
+
+for text in texts:
+    print(f"Analyzing: {text}")
+    analyze_text(text)
+    print()