# Source: Hugging Face Space "madebybread/brightly-ai" (status: Paused).
# File size: 14,342 bytes.

import time
from tqdm import tqdm
import pandas as pd
from pluralizer import Pluralizer
from similarity_fast import SimilarityFast
from food_nonfood import classify_as_food_nonfood, pessimistic_food_nonfood_score
from utils import clean_word
from db.db_utils import store_mapping_to_db, get_mapping_from_db, get_dictionary_data_from_db
from ask_gpt import query_gpt
from multi_food_item_detector import extract_items

# Similarity score below which Algo.perform_mapping stops trusting the fast
# similarity match and (unless the cleaned word is a substring of the matched
# word) asks GPT for a better candidate.
similarity_threshold = 0.75


class Algo:
    def __init__(self, db_conn, enable_csv=False):
        """Set up the matcher around an open database connection.

        Args:
            db_conn: Database connection; a cursor is opened from it and
                shared with the fast-similarity engine.
            enable_csv: When true, ``save_to_csv`` writes results to disk;
                otherwise it is a no-op.
        """
        cursor = db_conn.cursor()

        self.db_conn = db_conn
        self.db_cursor = cursor
        self.enable_csv = enable_csv

        # Only the fast similarity engine is wired up; the slow one is
        # currently disabled.
        self.similarity_fast = SimilarityFast(cursor)
        self.pluralizer = Pluralizer()

    def save_to_csv(self, results):
        if not self.enable_csv:
            return
        output_file_path = f'./results/{int(time.time())}.csv'
        df_results = pd.DataFrame(results, columns=[
            'date', "input_word", "dictionary_word", "is_food", 'wweia_category', 'dry_matter_content', "water_content", 'carbon_credits',  'weight', 'donor', "similarity_score", "food_nonfood_score"
        ])
        df_results.to_csv(output_file_path, index=False)

    def perform_mapping(self, input_word, attempts=0):
        """Map ``input_word`` to its closest dictionary entry.

        The fast similarity engine is tried first. If its score is below
        ``similarity_threshold`` and the cleaned word is not a substring of
        the matched word, GPT is asked for a better candidate:

        * ``'Non-Food Item'`` marks the mapping as non-food with full
          confidence.
        * ``'Mixed Food Items'`` maps to that dictionary word with full
          confidence.
        * any other suggestion is run through the fast engine and adopted
          only when it scores strictly higher than the original match.

        GPT failures are logged and the original fast match is returned.

        Args:
            input_word: Raw description to map.
            attempts: Unused; kept for backward compatibility.

        Returns:
            A mapping dict as produced by
            ``SimilarityFast.find_most_similar_word`` (or a hand-built one
            for USDA items), possibly updated as described above.
        """
        # if the input word is a USDA food item, we can skip the similarity check
        # this is a special case because the USDA food items are Government Donation (Not Counted) items
        if 'usda' in input_word.lower():
            return {
                'input_word': input_word,
                'cleaned_word': clean_word(input_word),
                'matching_word': 'USDA Food Item',
                'dictionary_word': 'USDA Food Item',
                'similarity_score': 1.0,
                'confidence_score': 1.0,
                'similar_words': None,
                'is_food': True,
                'food_nonfood_score': 1.0
            }

        mapping = self.similarity_fast.find_most_similar_word(input_word)

        # NOTE: the slow-similarity fallback and the reversed-word-order
        # fallback are currently disabled.

        # check if the cleaned_word is a substring of the matching_word
        is_substring = mapping['cleaned_word'] in mapping['matching_word']

        needs_gpt = mapping['similarity_score'] < similarity_threshold and not is_substring
        if not needs_gpt:
            return mapping

        print(" - Attempting GPT mapping")
        try:
            gpt_recommended_word = query_gpt(input_word)
            if gpt_recommended_word:

                if gpt_recommended_word == 'Non-Food Item':
                    # NOTE(review): matching_word/dictionary_word keep the
                    # low-scoring fast match here, unlike the non-food path
                    # in handle_single_item which sets both to
                    # 'Non-Food Item' -- confirm whether that is intended.
                    mapping.update(
                        {
                            'similarity_score': 1.0,
                            'confidence_score': 1.0,
                            'is_food': False,
                            'food_nonfood_score': 1.0
                        }
                    )
                    return mapping
                elif gpt_recommended_word == 'Mixed Food Items':
                    mapping.update(
                        {
                            'matching_word': 'Mixed Food Items',
                            'dictionary_word': 'Mixed Food Items',
                            'similarity_score': 1.0,
                            'confidence_score': 1.0
                        }
                    )
                    return mapping
                else:
                    gpt_mapping = self.similarity_fast.find_most_similar_word(gpt_recommended_word)
                    if gpt_mapping['similarity_score'] > mapping['similarity_score']:
                        # Keep the caller's original word on the adopted
                        # mapping so it is stored/reported under the input,
                        # not under GPT's suggestion.
                        gpt_mapping.update(
                            {
                                'input_word': input_word,
                                'cleaned_word': mapping['cleaned_word']
                            }
                        )
                        # Only replace the original match when GPT's
                        # candidate actually scores higher.
                        mapping = gpt_mapping
        except Exception as e:
            # Best effort: a GPT failure must not abort the mapping.
            print(f" - Error querying GPT: {e}")

        return mapping

    def handle_multi_item(self, input_word):
        """Map a description that lists several items to a single mapping.

        The description is split with ``extract_items`` and every part is
        mapped through ``handle_single_item``. The combined result is:

        * a ``'Non-Food Item'`` mapping if any part is non-food;
        * an all-``None`` mapping if no parts were extracted;
        * a fixed ``'Heterogeneous Mixture'`` mapping if any part's WWEIA
          category is ``"Heterogeneous Mixture"``;
        * otherwise the part with the lowest dry matter content (DMC), with
          ``" (Lowest DMC)"`` appended to its dictionary word.

        Parts whose DMC is unknown (``None``) are ignored when choosing the
        lowest DMC; if no part has a known DMC the first part is used.

        Args:
            input_word: Raw description containing a comma or slash.

        Returns:
            A mapping dict describing the mixture.
        """
        input_word_parts = extract_items(input_word)
        print(f" - Extracted items: {input_word_parts}")
        mappings = [self.handle_single_item(part) for part in input_word_parts]

        # if is_food is False for any mappings, return "Non-Food Item" as dictionary word
        for mapping in mappings:
            if mapping['is_food'] == False:
                return {
                    'input_word': input_word,
                    'cleaned_word': mapping['cleaned_word'],
                    'matching_word': 'Non-Food Item',
                    'dictionary_word': 'Non-Food Item',
                    'similarity_score': None,
                    'confidence_score': None,
                    'similar_words': None,
                    'is_food': False,
                    'food_nonfood_score': 1.0
                }

        # nothing was extracted, so there is nothing to combine
        if not mappings:
            return {
                'input_word': input_word,
                'cleaned_word': None,
                'matching_word': None,
                'dictionary_word': None,
                'similarity_score': None,
                'confidence_score': None,
                'similar_words': None,
                'is_food': None,
                'food_nonfood_score': None
            }

        # check if "heterogeneous" is in the wweia category of any of the mappings
        # otherwise we find the mapping with the lowest DMC value, and return that as the dictionary word, dmc, wc, and leakage values
        heterogeneous_exists = any(
            mapping['wweia_category'] == "Heterogeneous Mixture" for mapping in mappings
        )

        if heterogeneous_exists:
            mixture_data = {
                'matching_word': 'Heterogeneous Mixture',
                'dictionary_word': 'Heterogeneous Mixture',
                'wweia_category': 'Heterogeneous Mixture',
                'dry_matter_content': 0.27,
                'water_content': 0.73,
                'leakage': 0.1
            }
        else:
            # A part without dictionary data has dry_matter_content None,
            # which cannot be ordered against numbers; leave those out of the
            # comparison. min() keeps the first of equal minima, matching a
            # strict "<" scan.
            with_known_dmc = [
                mapping for mapping in mappings
                if mapping['dry_matter_content'] is not None
            ]
            if with_known_dmc:
                most_conservative_mapping = min(
                    with_known_dmc, key=lambda mapping: mapping['dry_matter_content']
                )
            else:
                most_conservative_mapping = mappings[0]

            mixture_data = {
                'matching_word': most_conservative_mapping['matching_word'],
                'dictionary_word': f"{most_conservative_mapping['dictionary_word']} (Lowest DMC)",
                'wweia_category': most_conservative_mapping['wweia_category'],
                'dry_matter_content': most_conservative_mapping['dry_matter_content'],
                'water_content': most_conservative_mapping['water_content'],
                'leakage': most_conservative_mapping['leakage']
            }

        print(f" - Mixture data: {mixture_data}")

        return {
            'input_word': input_word,
            'cleaned_word': None,
            'similarity_score': None,
            'confidence_score': None,
            'similar_words': None,
            'is_food': True,
            'food_nonfood_score': 1.0,
            **mixture_data
        }

    def handle_single_item(self, input_word):
        """Map one description to a dictionary entry, using the db as a cache.

        Lookup order:

        1. a stored mapping for the singular, then the plural, form of the
           cleaned word;
        2. a confident (> 0.9) non-food classification, stored as a
           ``'Non-Food Item'`` mapping;
        3. a fresh mapping from ``perform_mapping``, whose food/non-food
           fields are replaced by the pessimistic score before it is stored.

        Args:
            input_word: Raw description of a single item.

        Returns:
            The mapping dict, extended with dictionary data by
            ``wrap_mapping_with_dictionary_data``.
        """
        cleaned = clean_word(input_word)

        # 1 -> singular form, 2 -> plural form; the first cached hit wins
        for count in (1, 2):
            word_form = self.pluralizer.pluralize(cleaned, count)
            cached = get_mapping_from_db(self.db_cursor, word_form)
            if cached:
                print(f" - Found mapping in db: {cached}")
                return self.wrap_mapping_with_dictionary_data(cached)

        food_nonfood = classify_as_food_nonfood(input_word)

        # a confident non-food verdict short-circuits the similarity search
        confidently_non_food = food_nonfood[1] > 0.9 and food_nonfood[0] == False
        if confidently_non_food:
            non_food_mapping = {
                'input_word': input_word,
                'cleaned_word': cleaned,
                'matching_word': 'Non-Food Item',
                'dictionary_word': 'Non-Food Item',
                'similarity_score': None,
                'confidence_score': None,
                'similar_words': None,
                'is_food': False,
                'food_nonfood_score': food_nonfood[1]
            }
            store_mapping_to_db(self.db_cursor, self.db_conn, non_food_mapping)
            return self.wrap_mapping_with_dictionary_data(non_food_mapping)

        mapping = self.perform_mapping(input_word)

        pessimistic = pessimistic_food_nonfood_score(food_nonfood, mapping['similarity_score'])
        mapping.update(
            is_food=pessimistic[0],
            food_nonfood_score=pessimistic[1],
        )

        print(f" - Storing new mapping to db: {mapping}")
        store_mapping_to_db(self.db_cursor, self.db_conn, mapping)

        return self.wrap_mapping_with_dictionary_data(mapping)

    def wrap_mapping_with_dictionary_data(self, mapping):
        if not mapping:
            return None

        dictionary_result = get_dictionary_data_from_db(self.db_cursor, mapping['dictionary_word'])

        mapping.update({
            'wweia_category': dictionary_result['wweia_category'] if dictionary_result else None,
            'water_content': dictionary_result['water_content'] if dictionary_result else None,
            'dry_matter_content': dictionary_result['dry_matter_content'] if dictionary_result else None,
            'leakage': dictionary_result['leakage'] if dictionary_result else None
        })

        return mapping
    
    def add_carbon_credit_data(self, mapping, donor, date, weight):
        if not mapping:
            return None

        mapping.update({
            'donor': donor
        })
        mapping.update({
            'date': date,
            'weight': weight,
            'weight_metric_tonnes': float(weight) * 0.000453592,
            'distance': 250,
            'ef': 2.968073544,
            'mt_lb_mile': 0.0000000809,
        })
        
        if mapping['is_food'] == False:
            return {
                'baseline_emissions': None,
                'leakage_emissions': None,
                'project_emissions': None,
                'total_emissions_reduction': None,
                **mapping
            }


        baseline_emissions = mapping['weight_metric_tonnes'] * mapping['dry_matter_content'] * mapping['ef']
        leakage_emissions = mapping['leakage'] * baseline_emissions
        project_emissions = mapping['distance'] * mapping['mt_lb_mile'] * baseline_emissions
        total_emissions_reduction = baseline_emissions - leakage_emissions - project_emissions
        mapping.update({
            'baseline_emissions': baseline_emissions,
            'leakage_emissions': leakage_emissions,
            'project_emissions': project_emissions,
            'total_emissions_reduction': total_emissions_reduction
        })

        return mapping

    def match_words(self, input_data, stream_results=False):
        """Map every row of ``input_data`` and attach carbon-credit data.

        Each row is an indexable sequence: element 0 is the description and
        elements 1-3, when present, are the donor, date and weight (missing
        ones default to ``None``). Rows whose description is not a string, is
        empty, or is the text "nan" (any case) are skipped. Descriptions
        containing a comma or slash go through ``handle_multi_item``; all
        others through ``handle_single_item``.

        Args:
            input_data: Iterable of rows as described above.
            stream_results: When truthy, return the mapping of the first
                non-skipped row immediately (no CSV is written).

        Returns:
            The list of mappings for all processed rows, or a single mapping
            when ``stream_results`` is truthy and a row was processed.
        """
        results = []
        for row in tqdm(input_data, desc="Processing input words"):
            description = row[0]
            donor, date, weight = (
                row[idx] if len(row) > idx else None
                for idx in (1, 2, 3)
            )

            # skip rows without a usable description
            if not isinstance(description, str):
                continue
            if pd.isna(description) or description == "" or description.lower() == "nan":
                continue

            print()
            print(f"Processing: {description}")

            is_multi_item = ',' in description or '/' in description
            handler = self.handle_multi_item if is_multi_item else self.handle_single_item
            mapping = handler(description)

            if mapping:
                mapping = self.add_carbon_credit_data(mapping, donor, date, weight)
                results.append(mapping)

            if stream_results:
                return mapping

        self.save_to_csv(results)

        return results