File size: 11,404 Bytes
93c029f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# This script loads the list and profiles of our ingredients selection.
# It defines rules to recognize ingredients from the list in recipes and the function to extract that information from ingredient strings.

import pandas as pd
from src.cocktails.config import INGREDIENTS_LIST_PATH, COCKTAILS_CSV_DATA
import numpy as np

# Table of ingredient profiles, read once at import time from the configured CSV.
ingredient_profiles = pd.read_csv(INGREDIENTS_LIST_PATH)
# Lower-cased ingredient names, in the row order of the profiles table.
ingredient_list = [ing.lower() for ing in ingredient_profiles['ingredient']]
n_ingredients = len(ingredient_list)
# Position of each ingredient name in ingredient_list.
ingredient2ingredient_id = dict(zip(ingredient_list, range(n_ingredients)))

# Distinct values of the 'type' column, in sorted order.
ingredients_types = sorted(set(ingredient_profiles['type']))
# For each type, get all ingredients of that type. Names and types are paired row by row,
# which avoids a list.index() scan per ingredient and does not shadow the builtin `type`.
ing_per_type = [[ing for ing, ing_type in zip(ingredient_list, ingredient_profiles['type'])
                 if ing_type == ingredient_type]
                for ingredient_type in ingredients_types]
ingredients_per_type = dict(zip(ingredients_types, ing_per_type))

# Ingredient names listed as "bubble" ingredients; not referenced elsewhere in this module.
bubble_ingredients = ['soda', 'ginger beer', 'tonic', 'sparkling wine']
# Rules used to recognize each ingredient in the free-text ingredient strings of recipes.
# Each key is an ingredient name; its value is a list of alternative rules with an OR relation:
# only one of them needs to be satisfied for the ingredient to match.
#   - a rule written as a plain string is satisfied when that expression appears in the text;
#   - a rule written as a list applies its elements with an AND relation: all of them must be satisfied.
# Inside a rule, ~ indicates that the following expression must NOT appear;
# an expression without ~ MUST appear.
# Expressions are matched as substrings of the lower-cased text (see find_ingredient_from_str).
ingredient_search = {#'salt': ['salt'],
                     'lime juice': [['lime', '~soda', '~lemonade', '~cordial']],
                     'lemon juice': [['lemon', '~soda', '~lemonade']],
                     'angostura': [['angostura', '~orange'],
                                   ['bitter', '~campari', '~orange', '~red', '~italian', '~fernet']],
                     'orange bitters': [['orange', 'bitter', '~bittersweet']],
                     'orange juice': [['orange', '~bitter', '~jam', '~marmalade', '~liqueur', '~water'],
                                      ['orange', 'squeeze']],
                     'pineapple juice': [['pineapple']],
                     # 'apple juice': [['apple', 'juice', '~pine']],
                     'cranberry juice': [['cranberry', 'juice']],
                     'cointreau': ['cointreau', 'triple sec', 'grand marnier', 'curaçao', 'curacao'],
                     'luxardo maraschino': ['luxardo', 'maraschino', 'kirsch'],
                     'amaretto': ['amaretto'],
                     'benedictine': ['benedictine', 'bénédictine', 'bénedictine', 'benédictine'],
                     'campari': ['campari', ['italian', 'red', 'bitter'], 'aperol', 'bittersweet', 'aperitivo', 'orange-red'],
                     # 'campari': ['campari', ['italian', 'red', 'bitter']],
                     # 'crème de violette': [['violette', 'crème'], ['crême', 'violette'], ['liqueur', 'violette']],
                     # 'aperol': ['aperol', 'bittersweet', 'aperitivo', 'orange-red'],
                     'green chartreuse': ['chartreuse'],
                     'black raspberry liqueur': [['cassis', 'liqueur'],
                                                 ['black raspberry', 'liqueur'],
                                                 ['raspberry', 'liqueur'],
                                                 ['strawberry', 'liqueur'],
                                                 ['blackberry', 'liqueur'],
                                                 ['violette', 'crème'], ['crême', 'violette'], ['liqueur', 'violette']],
                     # 'simple syrup': [],
                     # 'drambuie': ['drambuie'],
                     # 'fernet branca': ['fernet', 'branca'],
                     'gin': [['gin', '~sloe', '~ginger']],
                     'vodka': ['vodka'],
                     'cuban rum': [['rum', 'puerto rican'], ['light', 'rum'], ['white', 'rum'], ['rum', 'havana', '~7'], ['rum', 'bacardi']],
                     'cognac': [['cognac', '~grand marnier', '~cointreau', '~orange']],
                     # 'bourbon': [['bourbon', '~liqueur']],
                     # 'tequila': ['tequila', 'pisco'],
                     # 'tequila': ['tequila'],
                     'scotch': ['scotch'],
                     'dark rum': [['rum', 'age', '~bacardi', '~havana'],
                                  ['rum', 'dark', '~bacardi', '~havana'],
                                  ['rum', 'old', '~bacardi', '~havana'],
                                  ['rum', 'old', '7'],
                                  ['rum', 'havana', '7'],
                                  ['havana', 'rum', 'especial']],
                     'absinthe': ['absinthe'],
                     'rye whiskey': ['rye', ['bourbon', '~liqueur']],
                     # 'rye whiskey': ['rye'],
                     'apricot brandy': [['apricot', 'brandy']],
                     # 'pisco': ['pisco'],
                     # 'cachaça': ['cachaça', 'cachaca'],
                     'egg': [['egg', 'white', '~yolk', '~whole']],
                     'soda': [['soda', 'water', '~lemon', '~lime']],
                     'mint': ['mint'],
                     'sparkling wine': ['sparkling wine', 'prosecco', 'champagne'],
                     'ginger beer': [['ginger', 'beer'], ['ginger', 'ale']],
                     'tonic': [['tonic'], ['7up'], ['sprite']],
                     # 'espresso': ['espresso', 'expresso', ['café', '~liqueur', '~cream'],
                     #              ['cafe', '~liqueur', '~cream'],
                     #              ['coffee', '~liqueur', '~cream']],
                     # 'southern comfort': ['southern comfort'],
                     # 'cola': ['cola', 'coke', 'pepsi'],
                     'double syrup': [['sugar','~raspberry'], ['simple', 'syrup'], ['double', 'syrup']],
                     # 'grenadine': ['grenadine', ['pomegranate', 'syrup']],
                     'grenadine': ['grenadine', ['pomegranate', 'syrup'], ['raspberry', 'syrup', '~black']],
                     'honey syrup': ['honey', ['maple', 'syrup']],
                     # 'raspberry syrup': [['raspberry', 'syrup', '~black']],
                     'dry vermouth': [['vermouth', 'dry'], ['vermouth', 'white'], ['vermouth', 'french'], 'lillet'],
                     'sweet vermouth': [['vermouth', 'sweet'], ['vermouth', 'red'], ['vermouth', 'italian']],
                     # 'lillet blanc': ['lillet'],
                     'water': [['water', '~sugar', '~coconut', '~soda', '~tonic', '~honey', '~orange', '~melon']]
                     }
# Check that the rule keys and the ingredient list contain exactly the same names:
# every ingredient in the list has a rule, and no rule refers to an ingredient outside the list.
assert sorted(ingredient_list) == sorted(ingredient_search.keys()), 'ing search dict keys do not match ingredient list'

def get_ingredients_info():
    """Summarize the ingredients that appear in the cocktail dataset.

    Reads the CSV at COCKTAILS_CSV_DATA and returns a tuple
    (max_ingredients, ingredient_list, ind_alcohol) where:
      - max_ingredients is the largest number of ingredients in one recipe,
      - ingredient_list is the sorted list of ingredients found in the recipes,
      - ind_alcohol holds the positions, in that sorted list, of the liquors,
        liqueurs, vermouths and of 'sparkling wine'.
    """
    cocktails = pd.read_csv(COCKTAILS_CSV_DATA)
    max_ingredients, found, liquors, liqueurs, vermouths = get_max_n_ingredients(cocktails)
    sorted_ingredients = sorted(found)
    # 'sparkling wine' is added explicitly to the alcoholic ingredients.
    alcoholic = sorted(liquors | liqueurs | vermouths | {'sparkling wine'})
    alcohol_positions = [pos for pos, name in enumerate(sorted_ingredients) if name in alcoholic]
    return max_ingredients, sorted_ingredients, alcohol_positions

def get_max_n_ingredients(data):
    """Scan every recipe of `data` and collect ingredient statistics.

    `data` is indexed by the columns 'names' and 'ingredients_str'; each entry of
    'ingredients_str' is parsed with extract_ingredients.

    Returns a tuple (max_count, ingredient_set, liquor_set, liqueur_set, vermouth_set):
    the largest number of ingredients in a single recipe, the set of all ingredients
    seen, and the subsets of those that belong to the 'liquor', 'liqueur' and
    'vermouth' types of ingredients_per_type.
    """
    seen = set()
    # Insertion order fixes the order in which the types are checked for each ingredient.
    seen_by_type = {'liquor': set(), 'liqueur': set(), 'vermouth': set()}
    largest = 0
    recipe_strings = np.array(data['ingredients_str'])
    for row in range(len(data['names'])):
        names, _ = extract_ingredients(recipe_strings[row])
        if len(names) > largest:
            largest = len(names)
        for name in names:
            seen.add(name)
            for type_name, members in seen_by_type.items():
                if name in ingredients_per_type[type_name]:
                    members.add(name)
    return largest, seen, seen_by_type['liquor'], seen_by_type['liqueur'], seen_by_type['vermouth']

def _expression_matches(expression, text):
    # A leading '~' means the rest of the expression must NOT appear in text; otherwise it must appear.
    if expression[0] == '~':
        return expression[1:] not in text
    return expression in text


def _rule_matches(rule, text):
    # A string rule is a single expression; a list rule requires all of its expressions (AND).
    if isinstance(rule, str):
        return _expression_matches(rule, text)
    if isinstance(rule, list):
        return all([_expression_matches(expression, text) for expression in rule])
    raise ValueError(f'ingredient rule must be a str or a list, got {type(rule).__name__}')


def find_ingredient_from_str(ing_str):
    """Assign an ingredient string to one ingredient of ingredient_list, if possible.

    The string is lower-cased and tested against the rules of ingredient_search:
    an ingredient matches when at least one of its rules is satisfied.

    Returns a tuple (flag, ingredient). Note that the flag is True when the string
    could NOT be resolved to a single ingredient:
      - (False, name): exactly one ingredient matches;
      - (True, None): no ingredient matches;
      - (True, name): several ingredients match; the string and all matching
        ingredients are printed, and the first match in ingredient_list order
        is returned.
    NOTE(review): the previous comment described the flag the other way round
    ("when flag is false, the ingredient has not been found"); callers should be
    checked against the behaviour documented here, which is what the code does.

    Raises:
        ValueError: if a rule in ingredient_search is neither a str nor a list.
    """
    ing_str = ing_str.lower()
    # Every rule of every ingredient is evaluated (no short-circuit), so a malformed
    # rule is always reported, whatever the input string.
    matches = [name for name in ingredient_list
               if any([_rule_matches(rule, ing_str) for rule in ingredient_search[name]])]
    if len(matches) > 1:
        # Ambiguous string: report it together with all candidate ingredients.
        print(ing_str)
        for name in matches:
            print(name)
        return True, matches[0]
    if not matches:
        return True, None
    return False, matches[0]

def get_cocktails_per_ingredient(ing_strs):
    """Map every ingredient of ingredient_list to the indices of the recipes using it.

    `ing_strs` is an iterable of formatted ingredient strings (as parsed by
    extract_ingredients); the indices stored are positions in that iterable.
    Ingredients used by no recipe map to an empty list.
    """
    cocktails_per_ing = {name: [] for name in ingredient_list}
    for cocktail_idx, recipe in enumerate(ing_strs):
        names = extract_ingredients(recipe)[0]
        for name in names:
            cocktails_per_ing[name].append(cocktail_idx)
    return cocktails_per_ing

def extract_ingredients(ingredient_str):
    """Parse a formatted ingredient string into ingredients and quantities.

    This is the reverse of format_ingredients: the input looks like
    '[(name,quantity),(name,quantity)]'. Returns a tuple (ingredients, quantities)
    with the names as strings and the quantities converted to float.
    """
    # Drop the surrounding brackets, then split into alternating '(name' / 'quantity)' fields.
    fields = ingredient_str[1:-1].split(',')
    ingredients = []
    quantities = []
    # Pair even-positioned fields with the following odd ones; a dangling last field is ignored.
    for name_field, quantity_field in zip(fields[0::2], fields[1::2]):
        ingredients.append(name_field[1:])  # strip the opening parenthesis
        quantities.append(float(quantity_field[:-1]))  # strip the closing parenthesis
    return ingredients, quantities

def format_ingredients(ingredients, quantities):
    """Format lists of ingredients and quantities into an ingredient string.

    This is the reverse of extract_ingredients: the result looks like
    '[(name,quantity),(name,quantity)]'. Ingredients and quantities are paired in
    order, stopping at the shorter of the two sequences. One trailing space, if
    present, is removed from each ingredient name. Empty input gives '[]'.
    """
    pairs = []
    for ing, q in zip(ingredients, quantities):
        # endswith is also safe for an empty name, unlike indexing ing[-1].
        name = ing[:-1] if ing.endswith(' ') else ing
        pairs.append(f'({name},{q})')
    # Joining keeps the brackets intact when there are no pairs at all.
    return '[' + ','.join(pairs) + ']'


def get_ingredient_count(data):
    """Count, over the whole dataset, how many kept recipes use each ingredient.

    `data` is indexed by the columns 'names', 'to_keep' and 'ingredients_str'.
    Only rows whose 'to_keep' entry is truthy are counted. Returns a dict mapping
    every ingredient of ingredient_list to its number of occurrences.
    """
    ingredient_counts = dict.fromkeys(ingredient_list, 0)
    for row in range(len(data['names'])):
        if not data['to_keep'][row]:
            continue
        names, _ = extract_ingredients(data['ingredients_str'][row])
        for name in names:
            ingredient_counts[name] += 1
    return ingredient_counts

def add_counts_to_ingredient_list(data):
    """Store the dataset occurrence count of each ingredient in the profiles table.

    Sets the 'counts' column of ingredient_profiles (one value per ingredient, in
    ingredient_list order) and writes the table back to INGREDIENTS_LIST_PATH
    without the index column.
    """
    occurrences = get_ingredient_count(data)
    ingredient_profiles['counts'] = [occurrences[name] for name in ingredient_list]
    ingredient_profiles.to_csv(INGREDIENTS_LIST_PATH, index=False)