Commit b1c94e2 · Parent(s): e5e36ab
improvements to multi-item classifier, and adding dictionary data to mappings
Files changed:
- algo.py +17 -5
- app.py +3 -11
- db/db_utils.py +8 -0
- multi_food_item_detector.py +103 -47
- playground.py +112 -1
algo.py
CHANGED
@@ -107,7 +107,7 @@ class Algo:
         # If it has equal number of commas and slashes, we'll go with slashes

         input_word_parts = extract_items(input_word)
-
+        print(f" - Extracted items: {input_word_parts}")
         mappings = []
         for part in input_word_parts:
             mapping = self.handle_single_item(part)
@@ -133,7 +133,7 @@ class Algo:
                 break

         dictionary_words = [mapping['dictionary_word'] for mapping in mappings]
-
+
         if len(set(dictionary_words)) == 0:
             return {
                 'input_word': input_word,
@@ -175,7 +175,7 @@ class Algo:

         # try the singular form of the word
         singular = self.pluralizer.pluralize(input_word_clean, 1)
-        mapping = get_mapping_from_db(self.db_cursor, singular)
+        mapping = self.wrap_mapping_with_dictionary_data(get_mapping_from_db(self.db_cursor, singular))
         if mapping:
             print(f" - Found mapping in db: {mapping}")
             return mapping
@@ -204,7 +204,7 @@ class Algo:
                 'food_nonfood_score': food_nonfood[1]
             }
             store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
-            return mapping
+            return self.wrap_mapping_with_dictionary_data(mapping)

         mapping = self.perform_mapping(input_word)

@@ -216,6 +216,19 @@ class Algo:

         print(f" - Storing new mapping to db: {mapping}")
         store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
+
+        return self.wrap_mapping_with_dictionary_data(mapping)
+
+    def wrap_mapping_with_dictionary_data(self, mapping):
+        dictionary_result = get_dictionary_data_from_db(self.db_cursor, mapping['dictionary_word'])
+
+        mapping.update({
+            'wweia_category': dictionary_result['wweia_category'],
+            'water_content': dictionary_result['water_content'],
+            'dry_matter_content': dictionary_result['dry_matter_content'],
+            'leakage': dictionary_result['leakage']
+        })
+
         return mapping

     def match_words(self, input_words, stream_results=False):
@@ -227,7 +240,6 @@ class Algo:
             print()
             print(f"Processing: {input_word}")

-            # if the word has a "," or "/" in it, let's skip it for now
             if ',' in input_word or '/' in input_word:
                 mapping = self.handle_multi_item(input_word)
             else:
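Note on the new method: wrap_mapping_with_dictionary_data looks up the mapping's dictionary_word in the dictionary table and merges the WWEIA category, water content, dry matter content, and leakage columns into the mapping before every return path hands it back. A minimal sketch of the enrichment step (the mapping and dictionary row below are made-up placeholders, not real data):

# Hypothetical mapping and dictionary row, for illustration only
mapping = {'input_word': 'bananas', 'dictionary_word': 'banana', 'is_food': True}
dictionary_result = {'wweia_category': 'Fruits', 'water_content': 0.75,
                     'dry_matter_content': 0.25, 'leakage': 0.02}  # placeholder values

mapping.update({
    'wweia_category': dictionary_result['wweia_category'],
    'water_content': dictionary_result['water_content'],
    'dry_matter_content': dictionary_result['dry_matter_content'],
    'leakage': dictionary_result['leakage']
})
# mapping now carries the dictionary columns alongside the match metadata

One caveat: as committed, the method indexes into the lookup result directly, so a dictionary_word missing from the dictionary table (get_dictionary_data_from_db returning None) would raise a TypeError rather than fall through.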
app.py
CHANGED
@@ -29,18 +29,10 @@ def process_input(input_text, csv_file):
     # Process the single input text
     results = algo.match_words([input_text])

-
-
-    for result in results:
-        dictionary_word = result['dictionary_word']
-        dictionary_data = fetch_the_dictionary_data(dictionary_word)
-        print(dictionary_data)
-        result['dry_matter_content'] = dictionary_data['dry_matter_content'] if dictionary_data else None
-        result['water_content'] = dictionary_data['water_content'] if dictionary_data else None
-
-    df = pd.DataFrame(results, columns=["input_word", "cleaned_word", 'matching_word', 'dictionary_word', 'similarity_score', 'confidence_score', 'similar_words', 'is_food', 'food_nonfood_score', 'dry_matter_content', 'water_content'])
+    df = pd.DataFrame(results, columns=["input_word", "cleaned_word", 'matching_word', 'dictionary_word', 'wweia_category', 'dry_matter_content',
+                                        'water_content', 'similarity_score', 'confidence_score', 'similar_words', 'is_food', 'food_nonfood_score'])
     # Filter to only required columns
-    df_filtered = df[["input_word", "dictionary_word", "is_food",
+    df_filtered = df[["input_word", "dictionary_word", "is_food", 'wweia_category', 'dry_matter_content', "water_content", "similarity_score", "food_nonfood_score"]]
     return df_filtered

 # Gradio interface
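process_input no longer re-fetches dictionary data per result; the wweia_category, dry_matter_content, and water_content fields now arrive already attached to each mapping from algo.py. A rough sketch of the reshaped output under that assumption (the row values are placeholders):

import pandas as pd

# One hypothetical row shaped like algo.match_words output after this commit
results = [{'input_word': 'bananas', 'cleaned_word': 'bananas', 'matching_word': 'banana',
            'dictionary_word': 'banana', 'wweia_category': 'Fruits',
            'dry_matter_content': 0.25, 'water_content': 0.75,
            'similarity_score': 0.97, 'confidence_score': 0.9, 'similar_words': [],
            'is_food': True, 'food_nonfood_score': 0.99}]

df = pd.DataFrame(results)
df_filtered = df[["input_word", "dictionary_word", "is_food", "wweia_category",
                  "dry_matter_content", "water_content", "similarity_score", "food_nonfood_score"]]
print(df_filtered)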
db/db_utils.py
CHANGED
@@ -60,6 +60,14 @@ def get_mapping_from_db(cursor, cleaned_word):
         return dict(zip(columns, row))
     return None

+def get_dictionary_data_from_db(cursor, dictionary_word):
+    cursor.execute('SELECT * FROM dictionary WHERE description = %s', (dictionary_word,))
+    row = cursor.fetchone()
+    if row:
+        columns = [col[0] for col in cursor.description]
+        return dict(zip(columns, row))
+    return None
+
 def store_mapping_to_db(cursor, conn, mapping):
     try:
         cursor.execute('''
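get_dictionary_data_from_db mirrors get_mapping_from_db: it fetches one row by exact description match, zips the column names from cursor.description into a dict, and returns None on a miss. The %s placeholder style suggests a psycopg2-style cursor; a usage sketch under that assumption (the connection string is hypothetical):

import psycopg2  # assumed driver, inferred from the %s parameter style

conn = psycopg2.connect("dbname=food_mappings")  # hypothetical connection string
cursor = conn.cursor()

entry = get_dictionary_data_from_db(cursor, "banana")
if entry is None:
    print("no dictionary row for this description")
else:
    print(entry['wweia_category'], entry['water_content'])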
multi_food_item_detector.py
CHANGED
@@ -4,58 +4,114 @@ import re
 # Load the spaCy model
 nlp = spacy.load("en_core_web_trf")

-[previous implementation removed (47 lines); its content is not recoverable from the rendered diff]
+def analyze_text(text):
+    # Track the positions of slashes in the original text
+    original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]
+
+    # Replace different delimiters with a uniform delimiter (comma)
+    normalized_text = re.sub(r'[\/,]', ',', text)
+
+    doc = nlp(normalized_text)
+
+    # Print tokens with their attributes
+    for token in doc:
+        print(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")
+
+    items = []
+    current_item = []
+    current_position = 0
+    root_noun_found = False
+
+    for token in doc:
+        token_start = text.find(token.text, current_position)
+        token_end = token_start + len(token.text)
+
+        # If the token is punctuation and a root noun has been found, finalize the current item
+        if token.pos_ == 'PUNCT' and token.text == ',':
+            if root_noun_found:
+                items.append(" ".join(current_item))
+                current_item = []
+                root_noun_found = False
+            # Check if the comma was originally a slash
+            if token_start in original_slash_positions:
+                items.append('/')
+            else:
+                items.append(',')
+        else:
+            # If token is part of a compound noun or an adjective, add to the current item
+            if token.dep_ in ('compound', 'amod'):
+                current_item.append(token.text)
+            elif token.dep_ == 'ROOT' and token.pos_ == 'NOUN':
+                current_item.append(token.text)
+                root_noun_found = True
+            elif token.dep_ == 'appos':
+                if current_item:
+                    current_item.append(token.text)
+                else:
+                    current_item = [token.text]
+                root_noun_found = True
+            else:
+                current_item.append(token.text)
+
+        current_position = token_end
+
+    # Add the last item if it exists
+    if current_item:
+        items.append(" ".join(current_item))
+
+    # Process items to handle delimiters correctly
+    final_items = []
+    temp_item = []
+    for item in items:
+        if item in [',', '/']:
+            if temp_item:
+                final_items.append("".join(temp_item).strip())
+                temp_item = []
+            if item == '/':
+                final_items.append('/')
+        else:
+            temp_item.append(item + " ")
+
+    if temp_item:
+        final_items.append("".join(temp_item).strip())
+
+    # Combine items separated by slashes into single items
+    combined_items = []
+    i = 0
+    while i < len(final_items):
+        if final_items[i] == '/':
+            combined_items[-1] += '/' + final_items[i + 1]
+            i += 2
+        else:
+            combined_items.append(final_items[i])
+            i += 1
+
+    # Determine if the text is a single noun phrase or multiple items
+    non_delimiter_items = [item for item in combined_items if item not in [',', '/']]
+    is_single_noun_phrase = len(non_delimiter_items) == 1
+
+    delimiter = determine_delimiter(text)
+
+    return is_single_noun_phrase, delimiter, combined_items
+
+def determine_delimiter(text):
+    number_of_slashes = text.count('/')
+    number_of_commas = text.count(',')
+    number_of_spaces = text.count(' ')
+
+    if number_of_slashes > 0 and number_of_slashes >= number_of_commas:
+        # prefer slash over comma, since it's rarer
+        return '/'
+    elif number_of_commas > 0:
+        return ','
+    else:
+        return ' '
+
+def extract_items(text):
+    is_single_noun_phrase, delimiter, _ = analyze_text(text)
+
+    if is_single_noun_phrase:
+        return [text]
+    else:
+        items = text.split(delimiter)
+        return items
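determine_delimiter encodes a simple priority: a slash wins any tie with commas because it is the rarer delimiter, a comma beats whitespace, and a single space is the fallback. extract_items only splits when analyze_text judges the text to be more than one noun phrase. A behavior sketch (the delimiter counts follow directly from the code above; the extract_items result assumes the spaCy pass flags the text as multi-item):

from multi_food_item_detector import determine_delimiter, extract_items

assert determine_delimiter("chocolate chips/bananas") == '/'   # slash present, wins the tie
assert determine_delimiter("chocolate chips, bananas, 1/2 lb carrots") == ','  # 2 commas > 1 slash
assert determine_delimiter("apple") == ' '                     # fallback

# Assuming analyze_text reports multiple items, the text splits on the chosen delimiter:
print(extract_items("chocolate chips, bananas"))  # ['chocolate chips', ' bananas']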
playground.py
CHANGED
@@ -1 +1,112 @@
-
+import spacy
+import re
+
+# Load the spaCy model
+nlp = spacy.load("en_core_web_trf")
+
+def analyze_text(text):
+    # Track the positions of slashes in the original text
+    original_slash_positions = [m.start() for m in re.finditer(r'\/', text)]
+
+    # Replace different delimiters with a uniform delimiter (comma)
+    normalized_text = re.sub(r'[\/,]', ',', text)
+
+    doc = nlp(normalized_text)
+
+    # Print tokens with their attributes
+    for token in doc:
+        print(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")
+
+    items = []
+    current_item = []
+    current_position = 0
+    root_noun_found = False
+
+    for token in doc:
+        token_start = text.find(token.text, current_position)
+        token_end = token_start + len(token.text)
+
+        # If the token is punctuation and a root noun has been found, finalize the current item
+        if token.pos_ == 'PUNCT' and token.text == ',':
+            if root_noun_found:
+                items.append(" ".join(current_item))
+                current_item = []
+                root_noun_found = False
+            # Check if the comma was originally a slash
+            if token_start in original_slash_positions:
+                items.append('/')
+            else:
+                items.append(',')
+        else:
+            # If token is part of a compound noun or an adjective, add to the current item
+            if token.dep_ in ('compound', 'amod'):
+                current_item.append(token.text)
+            elif token.dep_ == 'ROOT' and token.pos_ == 'NOUN':
+                current_item.append(token.text)
+                root_noun_found = True
+            elif token.dep_ == 'appos':
+                if current_item:
+                    current_item.append(token.text)
+                else:
+                    current_item = [token.text]
+                root_noun_found = True
+            else:
+                current_item.append(token.text)
+
+        current_position = token_end
+
+    # Add the last item if it exists
+    if current_item:
+        items.append(" ".join(current_item))
+
+    # Process items to handle delimiters correctly
+    final_items = []
+    temp_item = []
+    for item in items:
+        if item in [',', '/']:
+            if temp_item:
+                final_items.append("".join(temp_item).strip())
+                temp_item = []
+            if item == '/':
+                final_items.append('/')
+        else:
+            temp_item.append(item + " ")
+
+    if temp_item:
+        final_items.append("".join(temp_item).strip())
+
+    # Combine items separated by slashes into single items
+    combined_items = []
+    i = 0
+    while i < len(final_items):
+        if final_items[i] == '/':
+            combined_items[-1] += '/' + final_items[i + 1]
+            i += 2
+        else:
+            combined_items.append(final_items[i])
+            i += 1
+
+    # Determine if the text is a single noun phrase or multiple items
+    non_delimiter_items = [item for item in combined_items if item not in [',', '/']]
+    if len(non_delimiter_items) == 1:
+        print("The text is a single noun phrase.")
+    else:
+        print("The text contains multiple items.")
+
+    print("Items identified:", non_delimiter_items)
+
+# Example usage
+texts = [
+    "apple",
+    "italian squash, raw, unpeeled",
+    "chocolate chips, bananas",
+    "chocolate chips/bananas",
+    "chocolate chips / bananas",
+    "chocolate chips, bananas, 1/2 lb carrots",
+    "pink berries/raw carrots/chcolate, raw/winter squash",
+]
+
+for text in texts:
+    print(f"Analyzing: {text}")
+    analyze_text(text)
+    print()