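"""Core mapping pipeline for brightly-ai.

Maps free-text food descriptions to dictionary entries using a fast
similarity search with a GPT fallback, classifies items as food or
non-food, and attaches carbon-credit calculations to each mapping.
"""
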
import time
from tqdm import tqdm
import pandas as pd
from pluralizer import Pluralizer
from similarity_fast import SimilarityFast
from food_nonfood import classify_as_food_nonfood, pessimistic_food_nonfood_score
from utils import clean_word
from db.db_utils import store_mapping_to_db, get_mapping_from_db, get_dictionary_data_from_db
from ask_gpt import query_gpt
from multi_food_item_detector import extract_items

similarity_threshold = 0.75


class Algo:
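    """Maps food descriptions to dictionary words and enriches them with carbon-credit data."""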

    def __init__(self, db_conn, enable_csv=False):
        self.db_conn = db_conn
        self.enable_csv = enable_csv
        self.db_cursor = db_conn.cursor()
        self.similarity_fast = SimilarityFast(self.db_cursor)
        # self.similarity_slow = SimilaritySlow(self.db_cursor, self.db_conn)
        self.pluralizer = Pluralizer()

    def save_to_csv(self, results):
        if not self.enable_csv:
            return

        output_file_path = f'./results/{int(time.time())}.csv'
        df_results = pd.DataFrame(results, columns=[
            'date',
            'input_word',
            'dictionary_word',
            'is_food',
            'wweia_category',
            'dry_matter_content',
            'water_content',
            'carbon_credits',
            'weight',
            'weight_metric_tonnes',
            'donor',
            'similarity_score',
            'food_nonfood_score',
            'distance',
            'ef',
            'mt_lb_mile',
            'baseline_emissions',
            'leakage_emissions',
            'project_emissions',
            'total_emissions_reduction',
        ])
        df_results.to_csv(output_file_path, index=False)

    def perform_mapping(self, input_word, attempts=0):
        """Map an input description to a dictionary word.

        Short-circuits USDA items, then uses the fast similarity search,
        falling back to a GPT recommendation when the similarity score is
        below similarity_threshold and the cleaned word is not a substring
        of the match.
        """
        # if the input word is a USDA food item, we can skip the similarity check
        # this is a special case because the USDA food items are Government Donation (Not Counted) items
        if 'usda' in input_word.lower():
            return {
                'input_word': input_word,
                'cleaned_word': clean_word(input_word),
                'matching_word': 'USDA Food Item',
                'dictionary_word': 'USDA Food Item',
                'similarity_score': 1.0,
                'confidence_score': 1.0,
                'similar_words': None,
                'is_food': True,
                'food_nonfood_score': 1.0
            }

        mapping = self.similarity_fast.find_most_similar_word(input_word)

        # skip slow mapping for now
        # if mapping['similarity_score'] < similarity_threshold:
        #     print("Attempting slow mapping")
        #     slow_mapping = self.similarity_slow.find_most_similar_word(input_word)
        #     print(f" - Slow: {slow_mapping}")
        #     if slow_mapping['similarity_score'] > mapping['similarity_score']:
        #         mapping = slow_mapping

        # if mapping['similarity_score'] < similarity_threshold and len(input_word.split(' ')) > 1:
        #     print(" - Attempting reverse mapping")
        #     reversed_input_word = ' '.join(input_word.split(' ')[::-1])
        #     reversed_mapping = self.similarity_fast.find_most_similar_word(reversed_input_word)
        #     if reversed_mapping['similarity_score'] > mapping['similarity_score']:
        #         reversed_mapping.update(
        #             {
        #                 'input_word': input_word,
        #                 'cleaned_word': mapping['cleaned_word']
        #             }
        #         )
        #         mapping = reversed_mapping

        # check if the cleaned_word is a substring of the matching_word
        is_substring = mapping['cleaned_word'] in mapping['matching_word']

        if mapping['similarity_score'] < similarity_threshold and not is_substring:
            print(" - Attempting GPT mapping")
            try:
                gpt_recommended_word = query_gpt(input_word)
                if gpt_recommended_word:
                    if gpt_recommended_word == 'Non-Food Item':
                        mapping.update(
                            {
                                'similarity_score': 1.0,
                                'confidence_score': 1.0,
                                'is_food': False,
                                'food_nonfood_score': 1.0
                            }
                        )
                        return mapping
                    elif gpt_recommended_word == 'Mixed Food Items':
                        mapping.update(
                            {
                                'matching_word': 'Mixed Food Items',
                                'dictionary_word': 'Mixed Food Items',
                                'similarity_score': 1.0,
                                'confidence_score': 1.0
                            }
                        )
                        return mapping
                    else:
                        gpt_mapping = self.similarity_fast.find_most_similar_word(gpt_recommended_word)
                        if gpt_mapping['similarity_score'] > mapping['similarity_score']:
                            gpt_mapping.update(
                                {
                                    'input_word': input_word,
                                    'cleaned_word': mapping['cleaned_word']
                                }
                            )
                            mapping = gpt_mapping
            except Exception as e:
                print(f" - Error querying GPT: {e}")

        return mapping

    def handle_multi_item(self, input_word):
        """Handle a description containing multiple items and return a combined mapping."""
        # The input word has a comma or a slash in it.
        # If it has more commas, it's comma-delimited.
        # If it has more slashes, it's slash-delimited.
        # If it has an equal number of commas and slashes, we'll go with slashes.
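        # Illustrative splits under the rule above (hypothetical examples,
        # not captured extract_items output):
        #   "rice, beans, lentils" -> ["rice", "beans", "lentils"]
        #   "chicken/beef"         -> ["chicken", "beef"]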
        input_word_parts = extract_items(input_word)
        print(f" - Extracted items: {input_word_parts}")

        mappings = []
        for part in input_word_parts:
            mapping = self.handle_single_item(part)
            mappings.append(mapping)

        # look up the dictionary values for each mapping
        # find the wweia category
        # if all mappings have the same wweia category, return "homogenous", else "heterogeneous"
        # if is_food is False for any mapping, return "Non-Food Item" as the dictionary word
        for mapping in mappings:
            if mapping['is_food'] is False:
                return {
                    'input_word': input_word,
                    'cleaned_word': mapping['cleaned_word'],
                    'matching_word': 'Non-Food Item',
                    'dictionary_word': 'Non-Food Item',
                    'similarity_score': None,
                    'confidence_score': None,
                    'similar_words': None,
                    'is_food': False,
                    'food_nonfood_score': 1.0
                }

        dictionary_words = [mapping['dictionary_word'] for mapping in mappings]
        if len(set(dictionary_words)) == 0:
            return {
                'input_word': input_word,
                'cleaned_word': None,
                'matching_word': None,
                'dictionary_word': None,
                'similarity_score': None,
                'confidence_score': None,
                'similar_words': None,
                'is_food': None,
                'food_nonfood_score': None
            }

        # check if "Heterogeneous Mixture" is the wweia category of any of the mappings
        # otherwise we find the mapping with the lowest DMC value, and return that as the dictionary word, dmc, wc, and leakage values
        heterogeneous_exists = False
        most_conservative_mapping = None
        for mapping in mappings:
            if mapping['wweia_category'] == "Heterogeneous Mixture":
                heterogeneous_exists = True
                break
            else:
                dry_matter_content = mapping.get('dry_matter_content')
                if dry_matter_content is not None:
                    if most_conservative_mapping is None or dry_matter_content < most_conservative_mapping['dry_matter_content']:
                        most_conservative_mapping = mapping

        mixture_data = {}
        if heterogeneous_exists:
            mixture_data = {
                'matching_word': 'Heterogeneous Mixture',
                'dictionary_word': 'Heterogeneous Mixture',
                'wweia_category': 'Heterogeneous Mixture',
                'dry_matter_content': 0.27,
                'water_content': 0.73,
                'leakage': 0.1
            }
        elif most_conservative_mapping is not None:
            mixture_data = {
                'matching_word': most_conservative_mapping['matching_word'],
                'dictionary_word': f"{most_conservative_mapping['dictionary_word']} (Lowest DMC)",
                'wweia_category': most_conservative_mapping['wweia_category'],
                'dry_matter_content': most_conservative_mapping['dry_matter_content'],
                'water_content': most_conservative_mapping['water_content'],
                'leakage': most_conservative_mapping['leakage']
            }
        # if no component carried dry matter data, mixture_data stays empty and the
        # carbon-credit step will skip the emissions calculation for this row

        print(f" - Mixture data: {mixture_data}")
        return {
            'input_word': input_word,
            'cleaned_word': None,
            'similarity_score': None,
            'confidence_score': None,
            'similar_words': None,
            'is_food': True,
            'food_nonfood_score': 1.0,
            **mixture_data
        }

    def handle_single_item(self, input_word):
        """Map one description, using cached DB mappings before computing a new one."""
        input_word_clean = clean_word(input_word)

        # try the singular form of the word
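        # (pluralize(word, 1) gives the singular form and pluralize(word, 2)
        # the plural, e.g. "apples" -> "apple" and "apple" -> "apples")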
        singular = self.pluralizer.pluralize(input_word_clean, 1)
        mapping = get_mapping_from_db(self.db_cursor, singular)
        if mapping:
            print(f" - Found mapping in db: {mapping}")
            return self.wrap_mapping_with_dictionary_data(mapping)

        # try the plural form of the word
        plural = self.pluralizer.pluralize(input_word_clean, 2)
        mapping = get_mapping_from_db(self.db_cursor, plural)
        if mapping:
            print(f" - Found mapping in db: {mapping}")
            return self.wrap_mapping_with_dictionary_data(mapping)

        food_nonfood = classify_as_food_nonfood(input_word)

        # if we're very confident that the word is non-food, let's not even classify it
        if food_nonfood[1] > 0.9 and food_nonfood[0] is False:
            mapping = {
                'input_word': input_word,
                'cleaned_word': input_word_clean,
                'matching_word': 'Non-Food Item',
                'dictionary_word': 'Non-Food Item',
                'similarity_score': None,
                'confidence_score': None,
                'similar_words': None,
                'is_food': False,
                'food_nonfood_score': food_nonfood[1]
            }
            store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
            return self.wrap_mapping_with_dictionary_data(mapping)

        mapping = self.perform_mapping(input_word)
        food_nonfood_pessimistic = pessimistic_food_nonfood_score(food_nonfood, mapping['similarity_score'])
        mapping.update({
            'is_food': food_nonfood_pessimistic[0],
            'food_nonfood_score': food_nonfood_pessimistic[1]
        })

        print(f" - Storing new mapping to db: {mapping}")
        store_mapping_to_db(self.db_cursor, self.db_conn, mapping)
        return self.wrap_mapping_with_dictionary_data(mapping)

    def wrap_mapping_with_dictionary_data(self, mapping):
        if not mapping:
            return None

        dictionary_result = get_dictionary_data_from_db(self.db_cursor, mapping['dictionary_word'])
        mapping.update({
            'wweia_category': dictionary_result['wweia_category'] if dictionary_result else None,
            'water_content': dictionary_result['water_content'] if dictionary_result else None,
            'dry_matter_content': dictionary_result['dry_matter_content'] if dictionary_result else None,
            'leakage': dictionary_result['leakage'] if dictionary_result else None
        })
        return mapping

    def add_carbon_credit_data(self, mapping, donor, date, weight):
        if not mapping:
            return None

        mapping.update({
            'donor': donor,
            'date': date,
            'weight': weight,
            # pounds to metric tonnes; weight can be missing for some rows
            'weight_metric_tonnes': float(weight) * 0.000453592 if weight is not None else None,
            'distance': 250,
            'ef': 2.968073544,
            'mt_lb_mile': 0.0000000809,
        })

        required_fields_exist = (
            mapping.get('leakage') is not None
            and mapping.get('dry_matter_content') is not None
            and mapping['weight_metric_tonnes'] is not None
        )
        if mapping['is_food'] is False or not required_fields_exist:
            return {
                'baseline_emissions': None,
                'leakage_emissions': None,
                'project_emissions': None,
                'total_emissions_reduction': None,
                **mapping
            }
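
        # Worked example of the formulas below (hypothetical inputs: 1,000 lb
        # of a food with dry_matter_content 0.27 and leakage 0.1):
        #   weight_metric_tonnes      = 1000 * 0.000453592            ~= 0.4536
        #   baseline_emissions        = 0.4536 * 0.27 * 2.968073544   ~= 0.3635
        #   leakage_emissions         = 0.1 * 0.3635                  ~= 0.0363
        #   project_emissions         = 250 * 0.0000000809 * 0.3635   ~= 0.0000074
        #   total_emissions_reduction = 0.3635 - 0.0363 - 0.0000074   ~= 0.327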
        print(f" - Calculating carbon credits for: {mapping}")
        baseline_emissions = mapping['weight_metric_tonnes'] * mapping['dry_matter_content'] * mapping['ef']
        leakage_emissions = mapping['leakage'] * baseline_emissions
        project_emissions = mapping['distance'] * mapping['mt_lb_mile'] * baseline_emissions
        total_emissions_reduction = baseline_emissions - leakage_emissions - project_emissions

        mapping.update({
            'baseline_emissions': baseline_emissions,
            'leakage_emissions': leakage_emissions,
            'project_emissions': project_emissions,
            'total_emissions_reduction': total_emissions_reduction
        })
        return mapping

    def match_words(self, input_data, stream_results=False):
        """Process a batch of (description, donor, date, weight) tuples.

        Only the description is required; donor, date, and weight default to
        None when a tuple is shorter. When stream_results is True, the first
        processed mapping is returned immediately instead of the full list.
        """
        results = []
        for input_item in tqdm(input_data, desc="Processing input words"):
            input_word = input_item[0]
            input_donor = input_item[1] if len(input_item) > 1 else None
            input_date = input_item[2] if len(input_item) > 2 else None
            input_weight = input_item[3] if len(input_item) > 3 else None

            if not isinstance(input_word, str) or pd.isna(input_word) or input_word == "" or input_word.lower() == "nan":
                continue

            print()
            print(f"Processing: {input_word}")

            if ',' in input_word or '/' in input_word:
                mapping = self.handle_multi_item(input_word)
            else:
                mapping = self.handle_single_item(input_word)

            if mapping:
                mapping = self.add_carbon_credit_data(mapping, input_donor, input_date, input_weight)
                results.append(mapping)
                if stream_results:
                    return mapping

        self.save_to_csv(results)
        return results
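

# Minimal usage sketch (illustrative only): `open_connection` stands in for
# however the project actually creates its DB-API connection, and the rows
# below are made-up examples of (description, donor, date, weight in lb).
#
#     conn = open_connection()          # hypothetical helper
#     algo = Algo(conn, enable_csv=True)
#     results = algo.match_words([
#         ("canned black beans", "Donor A", "2024-01-15", 120),
#         ("apples/bread", "Donor B", "2024-01-16", 45),
#     ])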