Spaces:
Paused
Paused
beweinreich
committed on
Commit
•
ecfb899
1
Parent(s):
c92bac3
return all items from list no matter if it's a noun
Browse files- algo.py +57 -8
- multi_food_item_detector.py +23 -2
- similarity_fast.py +0 -2
algo.py
CHANGED
@@ -7,7 +7,7 @@ from food_nonfood import classify_as_food_nonfood, pessimistic_food_nonfood_scor
|
|
7 |
from utils import clean_word
|
8 |
from db.db_utils import store_mapping_to_db, get_mapping_from_db
|
9 |
from ask_gpt import query_gpt
|
10 |
-
from multi_food_item_detector import
|
11 |
|
12 |
similarity_threshold = 0.75
|
13 |
|
@@ -106,15 +106,68 @@ class Algo:
|
|
106 |
# If it has more slashes, it's slash-delimited
|
107 |
# If it has equal number of commas and slashes, we'll go with slashes
|
108 |
|
109 |
-
input_word_parts =
|
110 |
|
111 |
mappings = []
|
112 |
for part in input_word_parts:
|
113 |
mapping = self.handle_single_item(part)
|
114 |
mappings.append(mapping)
|
115 |
|
116 |
-
#
|
117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
def handle_single_item(self, input_word):
|
120 |
input_word_clean = clean_word(input_word)
|
@@ -173,10 +226,6 @@ class Algo:
|
|
173 |
print()
|
174 |
print(f"Processing: {input_word}")
|
175 |
|
176 |
-
if "&" in input_word or "and" in input_word:
|
177 |
-
print(" - Skipping multi-item word")
|
178 |
-
continue
|
179 |
-
|
180 |
# if the word has a "," or "/" in it, treat it as a multi-item input
|
181 |
if ',' in input_word or '/' in input_word:
|
182 |
mapping = self.handle_multi_item(input_word)
|
|
|
7 |
from utils import clean_word
|
8 |
from db.db_utils import store_mapping_to_db, get_mapping_from_db
|
9 |
from ask_gpt import query_gpt
|
10 |
+
from multi_food_item_detector import extract_items
|
11 |
|
12 |
similarity_threshold = 0.75
|
13 |
|
|
|
106 |
# If it has more slashes, it's slash-delimited
|
107 |
# If it has equal number of commas and slashes, we'll go with slashes
|
108 |
|
109 |
+
input_word_parts = extract_items(input_word)
|
110 |
|
111 |
mappings = []
|
112 |
for part in input_word_parts:
|
113 |
mapping = self.handle_single_item(part)
|
114 |
mappings.append(mapping)
|
115 |
|
116 |
+
# look up the dictionary values for each mapping
|
117 |
+
# find the wweia category
|
118 |
+
# if all mappings have the same wweia category, return "Homogenous Mixture", else "Heterogeneous Mixture"
|
119 |
+
# if is_food is False for any mappings, return "Non-Food Item" as dictionary word
|
120 |
+
for mapping in mappings:
|
121 |
+
if mapping['is_food'] == False:
|
122 |
+
return {
|
123 |
+
'input_word': input_word,
|
124 |
+
'cleaned_word': mapping['cleaned_word'],
|
125 |
+
'matching_word': 'Non-Food Item',
|
126 |
+
'dictionary_word': 'Non-Food Item',
|
127 |
+
'similarity_score': None,
|
128 |
+
'confidence_score': None,
|
129 |
+
'similar_words': None,
|
130 |
+
'is_food': False,
|
131 |
+
'food_nonfood_score': 1.0
|
132 |
+
}
|
133 |
+
break
|
134 |
+
|
135 |
+
dictionary_words = [mapping['dictionary_word'] for mapping in mappings]
|
136 |
+
if len(set(dictionary_words)) == 0:
|
137 |
+
return {
|
138 |
+
'input_word': input_word,
|
139 |
+
'cleaned_word': None,
|
140 |
+
'matching_word': None,
|
141 |
+
'dictionary_word': None,
|
142 |
+
'similarity_score': None,
|
143 |
+
'confidence_score': None,
|
144 |
+
'similar_words': None,
|
145 |
+
'is_food': None,
|
146 |
+
'food_nonfood_score': None
|
147 |
+
}
|
148 |
+
|
149 |
+
self.db_cursor.execute(
|
150 |
+
f"SELECT DISTINCT wweia_category FROM dictionary WHERE description IN ({','.join(['%s']*len(dictionary_words))})",
|
151 |
+
dictionary_words
|
152 |
+
)
|
153 |
+
wweia_categories = self.db_cursor.fetchall()
|
154 |
+
wweia_categories = [x[0] for x in wweia_categories]
|
155 |
+
print("categories -> ", wweia_categories)
|
156 |
+
mixture = "Heterogeneous Mixture"
|
157 |
+
if len(set(wweia_categories)) == 1:
|
158 |
+
mixture = "Homogenous Mixture"
|
159 |
+
|
160 |
+
return {
|
161 |
+
'input_word': input_word,
|
162 |
+
'cleaned_word': None,
|
163 |
+
'matching_word': mixture,
|
164 |
+
'dictionary_word': mixture,
|
165 |
+
'similarity_score': None,
|
166 |
+
'confidence_score': None,
|
167 |
+
'similar_words': None,
|
168 |
+
'is_food': True,
|
169 |
+
'food_nonfood_score': 1.0,
|
170 |
+
}
|
171 |
|
172 |
def handle_single_item(self, input_word):
|
173 |
input_word_clean = clean_word(input_word)
|
|
|
226 |
print()
|
227 |
print(f"Processing: {input_word}")
|
228 |
|
|
|
|
|
|
|
|
|
229 |
# if the word has a "," or "/" in it, treat it as a multi-item input
|
230 |
if ',' in input_word or '/' in input_word:
|
231 |
mapping = self.handle_multi_item(input_word)
|
multi_food_item_detector.py
CHANGED
@@ -16,8 +16,8 @@ def extract_food_phrases(text):
|
|
16 |
elif ',' in text:
|
17 |
delimiter = ','
|
18 |
else:
|
19 |
-
|
20 |
-
|
21 |
return [text]
|
22 |
|
23 |
# Split the text using the identified delimiter
|
@@ -36,3 +36,24 @@ def extract_food_phrases(text):
|
|
36 |
|
37 |
return food_items
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
elif ',' in text:
|
17 |
delimiter = ','
|
18 |
else:
|
19 |
+
# If it's not comma or slash delimited, return the text as is
|
20 |
+
# this will be an edge-case and we'll handle it later
|
21 |
return [text]
|
22 |
|
23 |
# Split the text using the identified delimiter
|
|
|
36 |
|
37 |
return food_items
|
38 |
|
39 |
+
def extract_items(text):
|
40 |
+
# Determine the delimiter
|
41 |
+
if '/' in text:
|
42 |
+
delimiter = '/'
|
43 |
+
elif ',' in text:
|
44 |
+
delimiter = ','
|
45 |
+
else:
|
46 |
+
# If it's not comma or slash delimited, return the text as is
|
47 |
+
return [text]
|
48 |
+
|
49 |
+
# Split the text using the identified delimiter
|
50 |
+
items = [item.strip() for item in text.split(delimiter)]
|
51 |
+
|
52 |
+
# Get the food items
|
53 |
+
food_items = extract_food_phrases(text)
|
54 |
+
|
55 |
+
# Find the items that were not matched as food items
|
56 |
+
non_food_items = [item for item in items if item not in food_items]
|
57 |
+
|
58 |
+
# Combine the food items and non_food_items
|
59 |
+
return food_items + non_food_items
|
similarity_fast.py
CHANGED
@@ -64,8 +64,6 @@ class SimilarityFast:
|
|
64 |
|
65 |
return most_similar_word, dictionary_word, highest_score, confidence_score, similar_words_str
|
66 |
|
67 |
-
|
68 |
-
|
69 |
def find_most_similar_word(self, input_word):
|
70 |
if not isinstance(input_word, str) or not input_word:
|
71 |
return None
|
|
|
64 |
|
65 |
return most_similar_word, dictionary_word, highest_score, confidence_score, similar_words_str
|
66 |
|
|
|
|
|
67 |
def find_most_similar_word(self, input_word):
|
68 |
if not isinstance(input_word, str) or not input_word:
|
69 |
return None
|