# algo.py
import math
import time
import logging
from tqdm import tqdm
from pluralizer import Pluralizer
from similarity_fast import SimilarityFast
from food_nonfood import classify_as_food_nonfood, pessimistic_food_nonfood_score
from utils import clean_word, is_empty_word
from db.db_utils import store_mapping_to_db, cached_get_mapping_from_db, get_dictionary_data_from_db, store_result_to_db
from ask_gpt import query_gpt
from multi_food_item_detector import extract_items, has_delimiters
from mapping_template import empty_template, heterogeneous_template, multi_item_template, nonfood_template, usda_template
# from tasks import insert_result
from specificity_classifier import classify_text_to_specificity

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
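# mappings scoring below this threshold fall back to a GPT query
# (see perform_mapping)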
similarity_threshold = 0.78


def insert_result(db_conn, run_key, mappings):
    db_cursor = db_conn.cursor()
    for mapping in mappings:
        store_result_to_db(db_cursor, db_conn, run_key, mapping)


class Algo:
    def __init__(self, db_conn, run_key=None):
        self.db_conn = db_conn
        self.run_key = run_key if run_key else int(time.time())

        self.db_cursor = db_conn.cursor()
        self.similarity_fast = SimilarityFast(self.db_cursor)
        # self.similarity_slow = SimilaritySlow(self.db_cursor, self.db_conn)
        self.pluralizer = Pluralizer()
        self.mappings_with_dictionary = self.initialize_mappings()

    def initialize_mappings(self):
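        # Build an in-memory cache of existing mappings joined with their
        # dictionary rows, keyed by cleaned_word, so repeated words skip the DB.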
        self.db_cursor.execute(
            'SELECT cleaned_word, dictionary_word, is_food, similarity_score, '
            'food_nonfood_score, wweia_category, sr_legacy_food_category, '
            'water_content, dry_matter_content, leakage, ignore, specificity '
            'FROM mappings JOIN dictionary ON mappings.dictionary_word = dictionary.description'
        )
        rows = self.db_cursor.fetchall()
        mappings_with_dictionary = {}
        for row in rows:
            mappings_with_dictionary[row[0]] = {
                'cleaned_word': row[0],
                'dictionary_word': row[1],
                'is_food': row[2],
                'similarity_score': row[3],
                'food_nonfood_score': row[4],
                'wweia_category': row[5],
                'sr_legacy_food_category': row[6],
                'water_content': row[7],
                'dry_matter_content': row[8],
                'leakage': row[9],
                'ignore': row[10],
                'specificity': row[11]
            }
        return mappings_with_dictionary

    def perform_mapping(self, input_word, attempts=0):
        # if the input word is a USDA food item, we can skip the similarity check
        # this is a special case because the USDA food items are Government Donation (Not Counted) items
        if 'usda' in input_word.lower():
            return usda_template(input_word, clean_word(input_word))

        mapping = self.similarity_fast.find_most_similar_word(input_word)

        logging.info(f" - Simlarity Fast mapping: {mapping}")
        # check if the cleaned_word is a substring of the most_similar_word
        is_substring = mapping['cleaned_word'] in mapping['most_similar_word']

        if mapping['similarity_score'] < similarity_threshold and not is_substring:
            logging.info(" - Attempting GPT mapping")
            try:
                gpt_recommended_word = query_gpt(input_word)
                if gpt_recommended_word:

                    if gpt_recommended_word == 'Non-Food Item':
                        mapping.update(
                            {
                                'similarity_score': 1.0, 
                                'confidence_score': 1.0,
                                'is_food': False,
                                'food_nonfood_score': 1.0
                            }
                        )
                        return mapping
                    elif gpt_recommended_word == 'Heterogeneous Mixture':
                        mapping.update(
                            {
                                'dictionary_word': 'Heterogeneous Mixture', 'similarity_score': 1.0, 
                                'confidence_score': 1.0
                            }
                        )
                        return mapping
                    elif gpt_recommended_word == 'Broad Category':
                        category_mapping = self.similarity_fast.find_most_similar_word(input_word, True)
                        mapping.update(
                            {
                                'dictionary_word': category_mapping['dictionary_word'],
                                'similarity_score': category_mapping['similarity_score'],
                                'confidence_score': category_mapping['confidence_score']
                            }
                        )
                    else:
                        gpt_mapping = self.similarity_fast.find_most_similar_word(gpt_recommended_word)
                        # only adopt the GPT-suggested mapping when it scores
                        # better than the direct mapping
                        if gpt_mapping['similarity_score'] > mapping['similarity_score']:
                            gpt_mapping.update(
                                {
                                    'input_word': input_word,
                                    'cleaned_word': mapping['cleaned_word']
                                }
                            )
                            mapping = gpt_mapping
            except Exception as e:
                logging.warning(f" - Error querying GPT: {e}")

        return mapping

    def handle_multi_item(self, input_word):
        # The input word has a comma or a slash in it.
        # If it has more commas, it's comma-delimited;
        # if it has more slashes, it's slash-delimited.
        # If the counts are equal, we go with slashes.
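        # e.g. (assuming extract_items implements the rule above)
        #   "bread, milk / eggs" -> ["bread, milk", "eggs"]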
        logging.info(f"Handling multi-item {input_word}")
       
        input_word_parts = extract_items(input_word)
        logging.info(f" - Extracted items: {input_word_parts}")
        mappings = []
        for part in input_word_parts:
            mapping = self.handle_single_item(part)
            if mapping:
                # Some words in the mapping can be ignored because they are
                # just filler words that don't add any value to the mapping
                if not mapping['ignore']:
                    mappings.append(mapping)

        # if any item maps to non-food, treat the whole entry as a Non-Food Item
        for mapping in mappings:
            if not mapping['is_food']:
                return nonfood_template(
                    input_word,
                    mapping['cleaned_word'],
                    mapping['food_nonfood_score']
                )

        dictionary_words = [mapping['dictionary_word'] for mapping in mappings]

        if not dictionary_words:
            return empty_template(input_word)

        # check whether any mapping's SR legacy food category is "Heterogeneous
        # Mixture"; otherwise pick the mapping with the lowest dry matter
        # content (DMC) and use its dictionary word, DMC, water content, and
        # leakage values
        heterogeneous_exists = False
        most_conservative_mapping = None

        for mapping in mappings:
            if mapping['sr_legacy_food_category'] == "Heterogeneous Mixture":
                heterogeneous_exists = True
                break
            else:
                dry_matter_content = mapping.get('dry_matter_content')
                if dry_matter_content is not None:
                    if most_conservative_mapping is None or dry_matter_content < most_conservative_mapping.get('dry_matter_content', float('inf')):
                        most_conservative_mapping = mapping

        if heterogeneous_exists:
            return heterogeneous_template(input_word)
        elif most_conservative_mapping is not None:
            return multi_item_template(input_word, None, most_conservative_mapping)
        else:
            logging.warning(f" - No mappings found for {input_word}")
            return None

    def handle_single_item(self, input_word):
        input_word_clean = clean_word(input_word)

        # covers both None and the empty string
        if not input_word_clean:
            return None

        # try the singular, then the plural form of the word against the
        # in-memory mapping cache
        for form in (self.pluralizer.pluralize(input_word_clean, 1),
                     self.pluralizer.pluralize(input_word_clean, 2)):
            mapping_with_dict = self.mappings_with_dictionary.get(form)
            if mapping_with_dict:
                mapping_with_dict.update({
                    'input_word': input_word,
                })
                logging.info(f" - Found mapping in db: {mapping_with_dict}")
                return mapping_with_dict

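        # classify_as_food_nonfood returns an (is_food, confidence) tuple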
        food_nonfood = classify_as_food_nonfood(input_word_clean)

        # if we're very confident the word is non-food, skip the similarity
        # mapping entirely and store a non-food result directly
        if food_nonfood[1] > 0.9 and not food_nonfood[0]:
            mapping = nonfood_template(input_word, input_word_clean, food_nonfood[1])
            store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
            self.mappings_with_dictionary[input_word_clean] = mapping
            return self.wrap_mapping_with_dictionary_data(mapping)

        mapping = self.perform_mapping(input_word)

        specificity = classify_text_to_specificity(input_word_clean)
        mapping.update({
            'specificity': specificity
        })

        food_nonfood_pessimistic = pessimistic_food_nonfood_score(food_nonfood, mapping['similarity_score'])
        mapping.update({
            'is_food': food_nonfood_pessimistic[0],
            'food_nonfood_score': food_nonfood_pessimistic[1]
        })

        store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
        self.mappings_with_dictionary[input_word_clean] = mapping

        return self.wrap_mapping_with_dictionary_data(mapping)

    def wrap_mapping_with_dictionary_data(self, mapping):
        if not mapping:
            return None

        dictionary_result = get_dictionary_data_from_db(self.db_cursor, mapping['dictionary_word'])

        # default 'ignore' to False when the mapping doesn't carry it
        ignore = mapping.get('ignore', False)

        mapping.update({
            'wweia_category': dictionary_result['wweia_category'] if dictionary_result else None,
            'sr_legacy_food_category': dictionary_result['sr_legacy_food_category'] if dictionary_result else None,
            'water_content': dictionary_result['water_content'] if dictionary_result else None,
            'dry_matter_content': dictionary_result['dry_matter_content'] if dictionary_result else None,
            'leakage': dictionary_result['leakage'] if dictionary_result else None,
            'ignore': ignore
        })

        return mapping
    
    def add_carbon_credit_data(self, mapping, donor, date, weight):
        if not mapping:
            return None

        mapping.update({
            'donor': donor
        })
        try:
            weight = float(weight)
        except ValueError:
            weight = 0
        except Exception as e:
            logging.warning(f" - Error converting weight to float: {e}")
            weight = 0

        if math.isnan(weight):
            weight = 0

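        # The fixed values below are model parameters carried over from the
        # original code: weight arrives in pounds (1 lb = 0.000453592 metric
        # tonnes); 'distance' appears to be an assumed average haul in miles,
        # 'ef' an emission factor, and 'mt_lb_mile' transport emissions per
        # pound-mile.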
        mapping.update({
            'date': date,
            'weight': weight,
            'weight_metric_tonnes': weight * 0.000453592,
            'distance': 250,
            'ef': 2.968073544,
            'mt_lb_mile': 0.0000000809,
        })

        required_fields_exist = (
            mapping.get('leakage') is not None
            and mapping.get('dry_matter_content') is not None
        )

        if not mapping['is_food'] or not required_fields_exist:
            return {
                'baseline_emissions': None,
                'leakage_emissions': None,
                'project_emissions': None,
                'total_emissions_reduction': None,
                **mapping
            }

        logging.info(f" - Calculating carbon credits for: {mapping}")
        baseline_emissions = mapping['weight_metric_tonnes'] * mapping['dry_matter_content'] * mapping['ef']
        leakage_emissions = mapping['leakage'] * baseline_emissions
        project_emissions = mapping['distance'] * mapping['mt_lb_mile'] * baseline_emissions
        total_emissions_reduction = baseline_emissions - leakage_emissions - project_emissions
        mapping.update({
            'baseline_emissions': baseline_emissions,
            'leakage_emissions': leakage_emissions,
            'project_emissions': project_emissions,
            'total_emissions_reduction': total_emissions_reduction
        })

        return mapping

    def match_words(self, input_data):
        # input_data is a list of tuples:
        # (description, alt_description, row_num, donor, date, weight),
        # where everything after description is optional
        results = []
        result_batch = []
        for input_item in tqdm(input_data, desc="Processing input words"):
            input_word = input_item[0]
            input_word_alt = input_item[1] if len(input_item) > 1 else None
            input_row_num = input_item[2] if len(input_item) > 2 else None
            input_donor = input_item[3] if len(input_item) > 3 else None
            input_date = input_item[4] if len(input_item) > 4 else None
            input_weight = input_item[5] if len(input_item) > 5 else None

            logging.info("")
            logging.info(f"Processing: {input_word}")

            is_empty = False
            if is_empty_word(input_word):
                if is_empty_word(input_word_alt):
                    mapping = empty_template(input_word)
                    is_empty = True
                else:
                    input_word = input_word_alt
            
            if not is_empty:
                if has_delimiters(input_word):
                    mapping = self.handle_multi_item(input_word)
                else:
                    mapping = self.handle_single_item(input_word)

            if mapping:
                mapping = dict(mapping)
                mapping = self.add_carbon_credit_data(mapping, input_donor, input_date, input_weight)
                mapping.update({
                    'run_row': input_row_num
                })
                result_batch.append(mapping)
                # store_result_to_db(self.db_cursor, self.db_conn, self.run_key, mapping)
                results.append(mapping)

            if len(result_batch) >= 100:
                insert_result(self.db_conn, self.run_key, result_batch)
                result_batch = []

        if result_batch:
            insert_result(self.db_conn, self.run_key, result_batch)

        return results
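

# Minimal usage sketch (not part of the original module). It assumes a DB-API
# connection whose schema matches what db.db_utils expects; sqlite3 and the
# 'mappings.db' path are hypothetical stand-ins.
if __name__ == '__main__':
    import sqlite3

    conn = sqlite3.connect('mappings.db')  # hypothetical database path
    algo = Algo(conn)
    rows = [
        ('apples, bananas', None, 1, 'Donor A', '2024-01-01', '120'),
        ('USDA canned corn', None, 2, 'Donor B', '2024-01-02', '35'),
    ]
    for result in algo.match_words(rows):
        print(result['input_word'], '->', result.get('dictionary_word'))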