beweinreich committed on
Commit
ecfb899
1 Parent(s): c92bac3

return all items from list no matter if it's a noun

Browse files
Files changed (3) hide show
  1. algo.py +57 -8
  2. multi_food_item_detector.py +23 -2
  3. similarity_fast.py +0 -2
algo.py CHANGED
@@ -7,7 +7,7 @@ from food_nonfood import classify_as_food_nonfood, pessimistic_food_nonfood_scor
7
  from utils import clean_word
8
  from db.db_utils import store_mapping_to_db, get_mapping_from_db
9
  from ask_gpt import query_gpt
10
- from multi_food_item_detector import extract_food_phrases
11
 
12
  similarity_threshold = 0.75
13
 
@@ -106,15 +106,68 @@ class Algo:
106
  # If it has more slashes, its slash-delimited
107
  # If it has equal number of commas and slashes, we'll go with slashes
108
 
109
- input_word_parts = extract_food_phrases(input_word)
110
 
111
  mappings = []
112
  for part in input_word_parts:
113
  mapping = self.handle_single_item(part)
114
  mappings.append(mapping)
115
 
116
- # TODO categorize the whole mapping list as homogenous, heterogenous, or non-food item
117
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  def handle_single_item(self, input_word):
120
  input_word_clean = clean_word(input_word)
@@ -173,10 +226,6 @@ class Algo:
173
  print()
174
  print(f"Processing: {input_word}")
175
 
176
- if "&" in input_word or "and" in input_word:
177
- print(" - Skipping multi-item word")
178
- continue
179
-
180
  # if the word has a "," or "/" in it, let's skip it for now
181
  if ',' in input_word or '/' in input_word:
182
  mapping = self.handle_multi_item(input_word)
 
7
  from utils import clean_word
8
  from db.db_utils import store_mapping_to_db, get_mapping_from_db
9
  from ask_gpt import query_gpt
10
+ from multi_food_item_detector import extract_items
11
 
12
  similarity_threshold = 0.75
13
 
 
106
  # If it has more slashes, its slash-delimited
107
  # If it has equal number of commas and slashes, we'll go with slashes
108
 
109
+ input_word_parts = extract_items(input_word)
110
 
111
  mappings = []
112
  for part in input_word_parts:
113
  mapping = self.handle_single_item(part)
114
  mappings.append(mapping)
115
 
116
+ # look up the dictionary values for each mapping
117
+ # find the wweia category
118
+ # if all mappings have the same wweia category, return "homogenous", else "heterogenous"
119
+ # if is_food is False for any mappings, return "Non-Food Item" as dictionary word
120
+ for mapping in mappings:
121
+ if mapping['is_food'] == False:
122
+ return {
123
+ 'input_word': input_word,
124
+ 'cleaned_word': mapping['cleaned_word'],
125
+ 'matching_word': 'Non-Food Item',
126
+ 'dictionary_word': 'Non-Food Item',
127
+ 'similarity_score': None,
128
+ 'confidence_score': None,
129
+ 'similar_words': None,
130
+ 'is_food': False,
131
+ 'food_nonfood_score': 1.0
132
+ }
133
+ break
134
+
135
+ dictionary_words = [mapping['dictionary_word'] for mapping in mappings]
136
+ if len(set(dictionary_words)) == 0:
137
+ return {
138
+ 'input_word': input_word,
139
+ 'cleaned_word': None,
140
+ 'matching_word': None,
141
+ 'dictionary_word': None,
142
+ 'similarity_score': None,
143
+ 'confidence_score': None,
144
+ 'similar_words': None,
145
+ 'is_food': None,
146
+ 'food_nonfood_score': None
147
+ }
148
+
149
+ self.db_cursor.execute(
150
+ f"SELECT DISTINCT wweia_category FROM dictionary WHERE description IN ({','.join(['%s']*len(dictionary_words))})",
151
+ dictionary_words
152
+ )
153
+ wweia_categories = self.db_cursor.fetchall()
154
+ wweia_categories = [x[0] for x in wweia_categories]
155
+ print("categories -> ", wweia_categories)
156
+ mixture = "Heterogeneous Mixture"
157
+ if len(set(wweia_categories)) == 1:
158
+ mixture = "Homogenous Mixture"
159
+
160
+ return {
161
+ 'input_word': input_word,
162
+ 'cleaned_word': None,
163
+ 'matching_word': mixture,
164
+ 'dictionary_word': mixture,
165
+ 'similarity_score': None,
166
+ 'confidence_score': None,
167
+ 'similar_words': None,
168
+ 'is_food': True,
169
+ 'food_nonfood_score': 1.0,
170
+ }
171
 
172
  def handle_single_item(self, input_word):
173
  input_word_clean = clean_word(input_word)
 
226
  print()
227
  print(f"Processing: {input_word}")
228
 
 
 
 
 
229
  # if the word has a "," or "/" in it, let's skip it for now
230
  if ',' in input_word or '/' in input_word:
231
  mapping = self.handle_multi_item(input_word)
multi_food_item_detector.py CHANGED
@@ -16,8 +16,8 @@ def extract_food_phrases(text):
16
  elif ',' in text:
17
  delimiter = ','
18
  else:
19
- # if it's not comma or slash delimited, return the text as is
20
- # this will be an edge-case and we'll handle it later
21
  return [text]
22
 
23
  # Split the text using the identified delimiter
@@ -36,3 +36,24 @@ def extract_food_phrases(text):
36
 
37
  return food_items
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  elif ',' in text:
17
  delimiter = ','
18
  else:
19
+ # If it's not comma or slash delimited, return the text as is
20
+ # this will be an edge-case and we'll handle it later
21
  return [text]
22
 
23
  # Split the text using the identified delimiter
 
36
 
37
  return food_items
38
 
39
+ def extract_items(text):
40
+ # Determine the delimiter
41
+ if '/' in text:
42
+ delimiter = '/'
43
+ elif ',' in text:
44
+ delimiter = ','
45
+ else:
46
+ # If it's not comma or slash delimited, return the text as is
47
+ return [text]
48
+
49
+ # Split the text using the identified delimiter
50
+ items = [item.strip() for item in text.split(delimiter)]
51
+
52
+ # Get the food items
53
+ food_items = extract_food_phrases(text)
54
+
55
+ # Find the items that were not matched as food items
56
+ non_food_items = [item for item in items if item not in food_items]
57
+
58
+ # Combine the food items and non_food_items
59
+ return food_items + non_food_items
similarity_fast.py CHANGED
@@ -64,8 +64,6 @@ class SimilarityFast:
64
 
65
  return most_similar_word, dictionary_word, highest_score, confidence_score, similar_words_str
66
 
67
-
68
-
69
  def find_most_similar_word(self, input_word):
70
  if not isinstance(input_word, str) or not input_word:
71
  return None
 
64
 
65
  return most_similar_word, dictionary_word, highest_score, confidence_score, similar_words_str
66
 
 
 
67
  def find_most_similar_word(self, input_word):
68
  if not isinstance(input_word, str) or not input_word:
69
  return None