# This script loads the list and profiles of our ingredients selection.
# It defines rules to recognize ingredients from the list in recipes and the
# functions to extract that information from ingredient strings.
import numpy as np
import pandas as pd

from src.cocktails.config import INGREDIENTS_LIST_PATH, COCKTAILS_CSV_DATA

ingredient_profiles = pd.read_csv(INGREDIENTS_LIST_PATH)
ingredient_list = [ing.lower() for ing in ingredient_profiles['ingredient']]
n_ingredients = len(ingredient_list)
ingredient2ingredient_id = dict(zip(ingredient_list, range(n_ingredients)))

ingredients_types = sorted(set(ingredient_profiles['type']))
# for each type, get all ingredients (rows of the profile table are aligned
# with ingredient_list, so both can be walked in lockstep)
_ingredient_type_column = list(ingredient_profiles['type'])
ing_per_type = [
    [ing for ing, ing_type in zip(ingredient_list, _ingredient_type_column) if ing_type == type_name]
    for type_name in ingredients_types
]
ingredients_per_type = dict(zip(ingredients_types, ing_per_type))

bubble_ingredients = ['soda', 'ginger beer', 'tonic', 'sparkling wine']

# Rules to recognize ingredients in recipes.
# Each ingredient maps to a list of rules combined with an OR relation: only
# one rule needs to be satisfied.
# A rule is either a single expression (a string) or a nested list of
# expressions combined with an AND relation: all of them must be satisfied.
# A leading ~ indicates that the following expression must NOT appear.
# A plain expression indicates that the expression MUST appear.
# NOTE(review): this file was reformatted from a whitespace-collapsed source;
# which entries are active vs. commented out was reconstructed -- confirm with
# the assert below against the ingredient list CSV.
ingredient_search = {
    # 'salt': ['salt'],
    'lime juice': [['lime', '~soda', '~lemonade', '~cordial']],
    'lemon juice': [['lemon', '~soda', '~lemonade']],
    'angostura': [['angostura', '~orange'],
                  ['bitter', '~campari', '~orange', '~red', '~italian', '~fernet']],
    'orange bitters': [['orange', 'bitter', '~bittersweet']],
    'orange juice': [['orange', '~bitter', '~jam', '~marmalade', '~liqueur', '~water'],
                     ['orange', 'squeeze']],
    'pineapple juice': [['pineapple']],
    # 'apple juice': [['apple', 'juice', '~pine']],
    'cranberry juice': [['cranberry', 'juice']],
    'cointreau': ['cointreau', 'triple sec', 'grand marnier', 'curaçao', 'curacao'],
    'luxardo maraschino': ['luxardo', 'maraschino', 'kirsch'],
    'amaretto': ['amaretto'],
    'benedictine': ['benedictine', 'bénédictine', 'bénedictine', 'benédictine'],
    'campari': ['campari', ['italian', 'red', 'bitter'], 'aperol', 'bittersweet', 'aperitivo', 'orange-red'],
    # 'campari': ['campari', ['italian', 'red', 'bitter']],
    # 'crème de violette': [['violette', 'crème'], ['crême', 'violette'], ['liqueur', 'violette']],
    # 'aperol': ['aperol', 'bittersweet', 'aperitivo', 'orange-red'],
    'green chartreuse': ['chartreuse'],
    'black raspberry liqueur': [['cassis', 'liqueur'],
                                ['black raspberry', 'liqueur'],
                                ['raspberry', 'liqueur'],
                                ['strawberry', 'liqueur'],
                                ['blackberry', 'liqueur'],
                                ['violette', 'crème'],
                                ['crême', 'violette'],
                                ['liqueur', 'violette']],
    # 'simple syrup': [],
    # 'drambuie': ['drambuie'],
    # 'fernet branca': ['fernet', 'branca'],
    'gin': [['gin', '~sloe', '~ginger']],
    'vodka': ['vodka'],
    'cuban rum': [['rum', 'puerto rican'],
                  ['light', 'rum'],
                  ['white', 'rum'],
                  ['rum', 'havana', '~7'],
                  ['rum', 'bacardi']],
    'cognac': [['cognac', '~grand marnier', '~cointreau', '~orange']],
    # 'bourbon': [['bourbon', '~liqueur']],
    # 'tequila': ['tequila', 'pisco'],
    # 'tequila': ['tequila'],
    'scotch': ['scotch'],
    'dark rum': [['rum', 'age', '~bacardi', '~havana'],
                 ['rum', 'dark', '~bacardi', '~havana'],
                 ['rum', 'old', '~bacardi', '~havana'],
                 ['rum', 'old', '7'],
                 ['rum', 'havana', '7'],
                 ['havana', 'rum', 'especial']],
    'absinthe': ['absinthe'],
    'rye whiskey': ['rye', ['bourbon', '~liqueur']],
    # 'rye whiskey': ['rye'],
    'apricot brandy': [['apricot', 'brandy']],
    # 'pisco': ['pisco'],
    # 'cachaça': ['cachaça', 'cachaca'],
    'egg': [['egg', 'white', '~yolk', '~whole']],
    'soda': [['soda', 'water', '~lemon', '~lime']],
    'mint': ['mint'],
    'sparkling wine': ['sparkling wine', 'prosecco', 'champagne'],
    'ginger beer': [['ginger', 'beer'], ['ginger', 'ale']],
    'tonic': [['tonic'], ['7up'], ['sprite']],
    # 'espresso': ['espresso', 'expresso', ['café', '~liqueur', '~cream'],
    #              ['cafe', '~liqueur', '~cream'],
    #              ['coffee', '~liqueur', '~cream']],
    # 'southern comfort': ['southern comfort'],
    # 'cola': ['cola', 'coke', 'pepsi'],
    'double syrup': [['sugar', '~raspberry'], ['simple', 'syrup'], ['double', 'syrup']],
    # 'grenadine': ['grenadine', ['pomegranate', 'syrup']],
    'grenadine': ['grenadine', ['pomegranate', 'syrup'], ['raspberry', 'syrup', '~black']],
    'honey syrup': ['honey', ['maple', 'syrup']],
    # 'raspberry syrup': [['raspberry', 'syrup', '~black']],
    'dry vermouth': [['vermouth', 'dry'], ['vermouth', 'white'], ['vermouth', 'french'], 'lillet'],
    'sweet vermouth': [['vermouth', 'sweet'], ['vermouth', 'red'], ['vermouth', 'italian']],
    # 'lillet blanc': ['lillet'],
    'water': [['water', '~sugar', '~coconut', '~soda', '~tonic', '~honey', '~orange', '~melon']],
}

# check that there is a rule for all ingredients in the list
assert sorted(ingredient_list) == sorted(ingredient_search.keys()), 'ing search dict keys do not match ingredient list'


def get_ingredients_info():
    """Summarize the ingredients used in the cocktail dataset.

    Reads the CSV at COCKTAILS_CSV_DATA and scans its recipes.

    Returns:
        A tuple ``(max_ingredients, ingredient_list, ind_alcohol)`` where
        ``max_ingredients`` is the largest number of ingredients in a single
        recipe, ``ingredient_list`` is the sorted list of ingredients that
        occur in the data, and ``ind_alcohol`` lists the indices (into that
        sorted list) of the ingredients typed as liquor, liqueur or vermouth,
        plus 'sparkling wine'.
    """
    data = pd.read_csv(COCKTAILS_CSV_DATA)
    max_ingredients, ingredient_set, liquor_set, liqueur_set, vermouth_set = get_max_n_ingredients(data)
    found_ingredients = sorted(ingredient_set)
    alcohol = liquor_set | liqueur_set | vermouth_set | {'sparkling wine'}
    ind_alcohol = [i for i, ing in enumerate(found_ingredients) if ing in alcohol]
    return max_ingredients, found_ingredients, ind_alcohol


def get_max_n_ingredients(data):
    """Scan all recipes and collect ingredient statistics.

    Args:
        data: table-like object with an 'ingredients_str' column holding
            strings in the format produced by ``format_ingredients``.

    Returns:
        A tuple ``(max_count, ingredient_set, alcohol_set, liqueur_set,
        vermouth_set)``: the maximum number of ingredients in one recipe, the
        set of all ingredients seen, and the subsets of those typed as
        'liquor', 'liqueur' and 'vermouth' in the ingredient profiles.
    """
    max_count = 0
    ingredient_set = set()
    alcohol_set = set()
    liqueur_set = set()
    vermouth_set = set()
    # Build the per-type membership sets once instead of scanning lists for
    # every ingredient of every recipe.
    liquors = set(ingredients_per_type['liquor'])
    liqueurs = set(ingredients_per_type['liqueur'])
    vermouths = set(ingredients_per_type['vermouth'])
    for recipe_str in np.array(data['ingredients_str']):
        ingredients, _ = extract_ingredients(recipe_str)
        max_count = max(max_count, len(ingredients))
        for ing in ingredients:
            ingredient_set.add(ing)
            if ing in liquors:
                alcohol_set.add(ing)
            if ing in liqueurs:
                liqueur_set.add(ing)
            if ing in vermouths:
                vermouth_set.add(ing)
    return max_count, ingredient_set, alcohol_set, liqueur_set, vermouth_set


def _rule_matches(rule, text):
    """Return True when ``text`` satisfies one rule from ``ingredient_search``.

    A rule is a single expression (str) or a list of expressions that must
    all hold. An expression starting with '~' must be absent from ``text``;
    any other expression must be present.
    """
    if isinstance(rule, str):
        expressions = [rule]
    elif isinstance(rule, list):
        expressions = rule
    else:
        raise ValueError(f'invalid ingredient rule: {rule!r}')
    for expression in expressions:
        if expression.startswith('~'):
            if expression[1:] in text:
                return False
        elif expression not in text:
            return False
    return True


def find_ingredient_from_str(ing_str):
    """Assign an ingredient string to one of the known ingredients.

    The string is lower-cased and tested against the rules defined in
    ``ingredient_search`` for every entry of ``ingredient_list``.

    Returns:
        A tuple ``(flag, ingredient)``:
        * ``(False, name)`` when exactly one ingredient matches;
        * ``(True, None)`` when no ingredient matches;
        * ``(True, name)`` when several ingredients match; ``name`` is the
          first match in ``ingredient_list`` order, and the string together
          with all candidates is printed.
        The flag is therefore True whenever the assignment is missing or
        ambiguous.

    Raises:
        ValueError: if a rule is neither a string nor a list.
    """
    ing_str = ing_str.lower()
    matches = [
        name for name in ingredient_list
        if any(_rule_matches(rule, ing_str) for rule in ingredient_search[name])
    ]
    if len(matches) > 1:
        # Ambiguous string: report every candidate to help refine the rules.
        print(ing_str)
        for name in matches:
            print(name)
        return True, matches[0]
    if not matches:
        return True, None
    return False, matches[0]


def get_cocktails_per_ingredient(ing_strs):
    """Map each known ingredient to the indices of the recipes that use it.

    Args:
        ing_strs: iterable of formatted ingredient strings (see
            ``format_ingredients``).

    Returns:
        A dict with one key per entry of ``ingredient_list``; each value is
        the list of positions in ``ing_strs`` whose recipe contains that
        ingredient.

    Raises:
        KeyError: if a recipe contains an ingredient absent from
            ``ingredient_list``.
    """
    cocktails_per_ing = {ing: [] for ing in ingredient_list}
    for i_recipe, ing_str in enumerate(ing_strs):
        ingredients, _ = extract_ingredients(ing_str)
        for ing in ingredients:
            cocktails_per_ing[ing].append(i_recipe)
    return cocktails_per_ing


def extract_ingredients(ingredient_str):
    """Parse a formatted ingredient string (reverse of ``format_ingredients``).

    Args:
        ingredient_str: string such as ``'[(gin,30.0),(lime juice,15.0)]'``.

    Returns:
        A tuple ``(ingredients, quantities)`` of two parallel lists: the
        ingredient names and their quantities as floats.
    """
    # Drop the enclosing brackets; the remaining comma-separated fields
    # alternate between '(name' and 'quantity)'.
    fields = ingredient_str[1:-1].split(',')
    ingredients = []
    quantities = []
    for name_field, quantity_field in zip(fields[0::2], fields[1::2]):
        ingredients.append(name_field[1:])  # strip the leading '('
        quantities.append(float(quantity_field[:-1]))  # strip the trailing ')'
    return ingredients, quantities


def format_ingredients(ingredients, quantities):
    """Build an ingredient string (reverse of ``extract_ingredients``).

    Args:
        ingredients: ingredient names; a single trailing space is removed
            from each name.
        quantities: quantities, paired positionally with ``ingredients``.

    Returns:
        A string of the form ``'[(name,quantity),(name,quantity)]'``; empty
        inputs give ``'[]'``.
    """
    parts = []
    for ing, q in zip(ingredients, quantities):
        name = ing[:-1] if ing.endswith(' ') else ing
        parts.append(f'({name},{q})')
    # Joining (rather than trimming a trailing comma) keeps the opening
    # bracket intact when there are no ingredients.
    return '[' + ','.join(parts) + ']'


def get_ingredient_count(data):
    """Count how many kept recipes use each known ingredient.

    Args:
        data: table-like object with aligned 'to_keep' and 'ingredients_str'
            columns; rows with a falsy 'to_keep' value are skipped.

    Returns:
        A dict mapping every entry of ``ingredient_list`` to its number of
        occurrences across the kept recipes.
    """
    ingredient_counts = dict.fromkeys(ingredient_list, 0)
    # Walk the two columns positionally so a non-default row index is fine.
    for keep, recipe_str in zip(data['to_keep'], data['ingredients_str']):
        if keep:
            ingredients, _ = extract_ingredients(recipe_str)
            for ing in ingredients:
                ingredient_counts[ing] += 1
    return ingredient_counts


def add_counts_to_ingredient_list(data):
    """Add the per-ingredient occurrence counts to the ingredient profiles.

    Stores the counts computed by ``get_ingredient_count`` in a 'counts'
    column of ``ingredient_profiles`` and writes the table back to
    INGREDIENTS_LIST_PATH.
    """
    ingredient_counts = get_ingredient_count(data)
    ingredient_profiles['counts'] = [ingredient_counts[k] for k in ingredient_list]
    ingredient_profiles.to_csv(INGREDIENTS_LIST_PATH, index=False)