import time
from tqdm import tqdm
import pandas as pd
from pluralizer import Pluralizer
from similarity_fast import SimilarityFast
from food_nonfood import classify_as_food_nonfood, pessimistic_food_nonfood_score
from utils import clean_word
from db.db_utils import store_mapping_to_db, get_mapping_from_db, get_dictionary_data_from_db
from ask_gpt import query_gpt
from multi_food_item_detector import extract_items

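# below this fast-similarity score (and absent a substring match) the GPT fallback is attempted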
similarity_threshold = 0.75


class Algo:
    def __init__(self, db_conn, enable_csv=False):
        self.db_conn = db_conn
        self.enable_csv = enable_csv
        self.db_cursor = db_conn.cursor()
        self.similarity_fast = SimilarityFast(self.db_cursor)
        # self.similarity_slow = SimilaritySlow(self.db_cursor, self.db_conn)
        self.pluralizer = Pluralizer()

    def save_to_csv(self, results):
        if not self.enable_csv:
            return
        output_file_path = f'./results/{int(time.time())}.csv'
        df_results = pd.DataFrame(results, columns=[
            'date', "input_word", "dictionary_word", "is_food", 'wweia_category', 'dry_matter_content', "water_content", 'carbon_credits',  'weight', 'donor', "similarity_score", "food_nonfood_score"
        ])
        df_results.to_csv(output_file_path, index=False)

    def perform_mapping(self, input_word, attempts=0):
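        # Strategy: USDA shortcut -> fast embedding similarity -> GPT fallback when the
        # similarity score is below the threshold and the cleaned word is not a substring
        # of its match.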

        # if the input word is a USDA food item, we can skip the similarity check
        # this is a special case because the USDA food items are Government Donation (Not Counted) items
        if 'usda' in input_word.lower():
            return {
                'input_word': input_word, 
                'cleaned_word': clean_word(input_word), 
                'matching_word': 'USDA Food Item', 
                'dictionary_word': 'USDA Food Item', 
                'similarity_score': 1.0, 
                'confidence_score': 1.0, 
                'similar_words': None,
                'is_food': True,
                'food_nonfood_score': 1.0
            }

        mapping = self.similarity_fast.find_most_similar_word(input_word)

        # skip slow mapping for now
        # if mapping['similarity_score'] < similarity_threshold:
        #     print("Attempting slow mapping")
        #     slow_mapping = self.similarity_slow.find_most_similar_word(input_word)
        #     print(f" - Slow: {slow_mapping}")
        #     if slow_mapping['similarity_score'] > mapping['similarity_score']:
        #         mapping = slow_mapping

        # if mapping['similarity_score'] < similarity_threshold and len(input_word.split(' ')) > 1:
        #     print(" - Attempting reverse mapping")
        #     reversed_input_word = ' '.join(input_word.split(' ')[::-1])
        #     reversed_mapping = self.similarity_fast.find_most_similar_word(reversed_input_word)
        #     if reversed_mapping['similarity_score'] > mapping['similarity_score']:
        #         reversed_mapping.update(
        #             {
        #                 'input_word': input_word, 
        #                 'cleaned_word': mapping['cleaned_word']
        #             }
        #         )
        #         mapping = reversed_mapping

        # check if the cleaned_word is a substring of the matching_word
        is_substring = mapping['cleaned_word'] in mapping['matching_word']

        if mapping['similarity_score'] < similarity_threshold and not is_substring:
            print(" - Attempting GPT mapping")
            try:
                gpt_recommended_word = query_gpt(input_word)
                if gpt_recommended_word:

                    if gpt_recommended_word == 'Non-Food Item':
                        mapping.update(
                            {
                                'similarity_score': 1.0, 
                                'confidence_score': 1.0,
                                'is_food': False,
                                'food_nonfood_score': 1.0
                            }
                        )
                        return mapping
                    elif gpt_recommended_word == 'Mixed Food Items':
                        mapping.update(
                            {
                                'matching_word': 'Mixed Food Items',
                                'dictionary_word': 'Mixed Food Items', 'similarity_score': 1.0, 
                                'confidence_score': 1.0
                            }
                        )
                        return mapping
                    else:
                        gpt_mapping = self.similarity_fast.find_most_similar_word(gpt_recommended_word)
                        # only keep the GPT-suggested mapping if it scores better than the fast match
                        if gpt_mapping['similarity_score'] > mapping['similarity_score']:
                            gpt_mapping.update(
                                {
                                    'input_word': input_word,
                                    'cleaned_word': mapping['cleaned_word']
                                }
                            )
                            mapping = gpt_mapping
            except Exception as e:
                print(f" - Error querying GPT: {e}")

        return mapping

    def handle_multi_item(self, input_word):
        # The input word has a comma or a slash in it.
        # If it has more commas, it's comma-delimited;
        # if it has more slashes, it's slash-delimited;
        # if it has an equal number of commas and slashes, we go with slashes.
       
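        # e.g. "rice, beans" is expected to split into ["rice", "beans"]
        # (the actual splitting logic lives in multi_food_item_detector.extract_items)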
        input_word_parts = extract_items(input_word)
        print(f" - Extracted items: {input_word_parts}")
        mappings = []
        for part in input_word_parts:
            mapping = self.handle_single_item(part)
            mappings.append(mapping)

        # look up the dictionary values for each mapping:
        # - if any mapping is non-food, return "Non-Food Item" as the dictionary word
        # - if any mapping's wweia category is "Heterogeneous Mixture", treat the whole
        #   entry as a heterogeneous mixture
        # - otherwise fall back to the most conservative (lowest dry matter content) mapping
        for mapping in mappings:
            if not mapping['is_food']:
                return {
                    'input_word': input_word,
                    'cleaned_word': mapping['cleaned_word'],
                    'matching_word': 'Non-Food Item',
                    'dictionary_word': 'Non-Food Item',
                    'similarity_score': None,
                    'confidence_score': None,
                    'similar_words': None,
                    'is_food': False,
                    'food_nonfood_score': 1.0
                }

        dictionary_words = [mapping['dictionary_word'] for mapping in mappings]

        # nothing could be extracted or mapped from the multi-item string
        if not dictionary_words:
            return {
                'input_word': input_word, 
                'cleaned_word': None, 
                'matching_word': None, 
                'dictionary_word': None, 
                'similarity_score': None, 
                'confidence_score': None, 
                'similar_words': None,
                'is_food': None,
                'food_nonfood_score': None
            }

        # check if "heterogeneous" is in the wweia category of any of the mappings
        # otherwise we find the mapping with the lowest DMC value, and return that as the dictionary word, dmc, wc, and leakage values
        heterogeneous_exists = False
        most_conservative_mapping = None
        for mapping in mappings:
            if mapping['wweia_category'] == "Heterogeneous Mixture":
                heterogeneous_exists = True
                break
            else:
                if most_conservative_mapping is None or mapping['dry_matter_content'] < most_conservative_mapping['dry_matter_content']:
                    most_conservative_mapping = mapping

        mixture_data = {}
        if heterogeneous_exists:
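            # fixed fallback values used whenever any component is a heterogeneous mixture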
            mixture_data = {
                'matching_word': 'Heterogeneous Mixture',
                'dictionary_word': 'Heterogeneous Mixture',
                'wweia_category': 'Heterogeneous Mixture',
                'dry_matter_content': 0.27,
                'water_content': 0.73,
                'leakage': 0.1
            }
        else:
            mixture_data = {
                'matching_word': most_conservative_mapping['matching_word'],
                'dictionary_word': f"{most_conservative_mapping['dictionary_word']} (Lowest DMC)",
                'wweia_category': most_conservative_mapping['wweia_category'],
                'dry_matter_content': most_conservative_mapping['dry_matter_content'],
                'water_content': most_conservative_mapping['water_content'],
                'leakage': most_conservative_mapping['leakage']
            }

        print(f" - Mixture data: {mixture_data}")

        return {
            'input_word': input_word, 
            'cleaned_word': None,
            'similarity_score': None, 
            'confidence_score': None,
            'similar_words': None,
            'is_food': True,
            'food_nonfood_score': 1.0,
            **mixture_data
        }

    def handle_single_item(self, input_word):
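        # Lookup order: cached DB mapping (singular, then plural form) -> food/non-food
        # screen -> similarity/GPT mapping, which is then cached in the DB.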
        input_word_clean = clean_word(input_word)

        # try the singular form of the word
        singular = self.pluralizer.pluralize(input_word_clean, 1)
        mapping = get_mapping_from_db(self.db_cursor, singular)
        if mapping:
            print(f" - Found mapping in db: {mapping}")
            return self.wrap_mapping_with_dictionary_data(mapping)

        # try the plural form of the word
        plural = self.pluralizer.pluralize(input_word_clean, 2)
        mapping = get_mapping_from_db(self.db_cursor, plural)
        if mapping:
            print(f" - Found mapping in db: {mapping}")
            return self.wrap_mapping_with_dictionary_data(mapping)

        food_nonfood = classify_as_food_nonfood(input_word)

        # if we're very confident that the word is non-food, skip the mapping step entirely
        if food_nonfood[1] > 0.9 and not food_nonfood[0]:
            mapping = {
                'input_word': input_word, 
                'cleaned_word': input_word_clean, 
                'matching_word': 'Non-Food Item', 
                'dictionary_word': 'Non-Food Item', 
                'similarity_score': None, 
                'confidence_score': None, 
                'similar_words': None,
                'is_food': False,
                'food_nonfood_score': food_nonfood[1]
            }
            store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
            return self.wrap_mapping_with_dictionary_data(mapping)

        mapping = self.perform_mapping(input_word)

        food_nonfood_pessimistic = pessimistic_food_nonfood_score(food_nonfood, mapping['similarity_score'])
        mapping.update({
            'is_food': food_nonfood_pessimistic[0],
            'food_nonfood_score': food_nonfood_pessimistic[1]
        })

        print(f" - Storing new mapping to db: {mapping}")
        store_mapping_to_db(self.db_cursor, self.db_conn, mapping)

        return self.wrap_mapping_with_dictionary_data(mapping)

    def wrap_mapping_with_dictionary_data(self, mapping):
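        # attach the dictionary fields (wweia category, water/dry matter content, leakage)
        # associated with the mapping's dictionary word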
        if not mapping:
            return None

        dictionary_result = get_dictionary_data_from_db(self.db_cursor, mapping['dictionary_word'])

        mapping.update({
            'wweia_category': dictionary_result['wweia_category'] if dictionary_result else None,
            'water_content': dictionary_result['water_content'] if dictionary_result else None,
            'dry_matter_content': dictionary_result['dry_matter_content'] if dictionary_result else None,
            'leakage': dictionary_result['leakage'] if dictionary_result else None
        })

        return mapping
    
    def add_carbon_credit_data(self, mapping, donor, date, weight):
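        # attach donor/date/weight plus the fixed transport and emission-factor constants,
        # then derive the emissions figures (weight is expected in pounds)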
        if not mapping:
            return None

        mapping.update({
            'donor': donor
        })
        mapping.update({
            'date': date,
            'weight': weight,
            'weight_metric_tonnes': float(weight) * 0.000453592,
            'distance': 250,
            'ef': 2.968073544,
            'mt_lb_mile': 0.0000000809,
        })
        
        if not mapping['is_food']:
            return {
                'baseline_emissions': None,
                'leakage_emissions': None,
                'project_emissions': None,
                'total_emissions_reduction': None,
                **mapping
            }


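        # Illustrative arithmetic (not from the source data): for 1,000 lb of food with
        # dry_matter_content = 0.27 and leakage = 0.1:
        #   weight_metric_tonnes      = 1000 * 0.000453592           ~= 0.4536 t
        #   baseline_emissions        = 0.4536 * 0.27 * 2.968073544  ~= 0.3635
        #   leakage_emissions         = 0.1 * 0.3635                 ~= 0.0364
        #   project_emissions         = 250 * 0.0000000809 * 0.3635  ~= 0.0000074
        #   total_emissions_reduction ~= 0.3635 - 0.0364 - 0.0000074 ~= 0.327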
        baseline_emissions = mapping['weight_metric_tonnes'] * mapping['dry_matter_content'] * mapping['ef']
        leakage_emissions = mapping['leakage'] * baseline_emissions
        project_emissions = mapping['distance'] * mapping['mt_lb_mile'] * baseline_emissions
        total_emissions_reduction = baseline_emissions - leakage_emissions - project_emissions
        mapping.update({
            'baseline_emissions': baseline_emissions,
            'leakage_emissions': leakage_emissions,
            'project_emissions': project_emissions,
            'total_emissions_reduction': total_emissions_reduction
        })

        return mapping

    def match_words(self, input_data, stream_results=False):
        # input_data is a list of tuples: (description, donor, date, weight);
        # all fields after description are optional
        results = []
        for input_item in tqdm(input_data, desc="Processing input words"):
            input_word = input_item[0]
            input_donor = input_item[1] if len(input_item) > 1 else None
            input_date = input_item[2] if len(input_item) > 2 else None
            input_weight = input_item[3] if len(input_item) > 3 else None

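            # skip blank, NaN, or non-string descriptions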
            if not isinstance(input_word, str) or pd.isna(input_word) or input_word == "" or input_word.lower() == "nan":
                continue

            print()
            print(f"Processing: {input_word}")
            
            if ',' in input_word or '/' in input_word:
                mapping = self.handle_multi_item(input_word)
            else:
                mapping = self.handle_single_item(input_word)

            if mapping:
                mapping = self.add_carbon_credit_data(mapping, input_donor, input_date, input_weight)
                results.append(mapping)
            
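            # in streaming mode, return as soon as the first item has been processed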
            if stream_results:
                return mapping

        self.save_to_csv(results)
        
        return results
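

# Illustrative usage sketch (not part of the original module): shows how Algo is
# constructed and fed (description, donor, date, weight) rows. The sqlite3 connection
# below is a stand-in assumption; db.db_utils may expect a different database driver.
if __name__ == '__main__':
    import sqlite3

    conn = sqlite3.connect('example.db')  # hypothetical database file
    algo = Algo(conn, enable_csv=True)
    sample_rows = [
        ('Canned black beans', 'Example Donor', '2024-01-01', '120'),
        ('apples, oranges', 'Example Donor', '2024-01-02', '55'),
    ]
    for result in algo.match_words(sample_rows):
        print(result)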