ccolas committed on
Commit 93c029f
1 Parent(s): 981764f

Upload 174 files

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. src/__init__.py +0 -0
  2. src/cocktails/__init__.py +0 -0
  3. src/cocktails/__pycache__/__init__.cpython-39.pyc +0 -0
  4. src/cocktails/__pycache__/config.cpython-39.pyc +0 -0
  5. src/cocktails/config.py +21 -0
  6. src/cocktails/pipeline/__init__.py +0 -0
  7. src/cocktails/pipeline/__pycache__/__init__.cpython-39.pyc +0 -0
  8. src/cocktails/pipeline/__pycache__/cocktail2affect.cpython-39.pyc +0 -0
  9. src/cocktails/pipeline/__pycache__/cocktailrep2recipe.cpython-39.pyc +0 -0
  10. src/cocktails/pipeline/__pycache__/get_affect2affective_cluster.cpython-39.pyc +0 -0
  11. src/cocktails/pipeline/__pycache__/get_cocktail2affective_cluster.cpython-39.pyc +0 -0
  12. src/cocktails/pipeline/cocktail2affect.py +372 -0
  13. src/cocktails/pipeline/cocktailrep2recipe.py +329 -0
  14. src/cocktails/pipeline/get_affect2affective_cluster.py +23 -0
  15. src/cocktails/pipeline/get_cocktail2affective_cluster.py +9 -0
  16. src/cocktails/representation_learning/__init__.py +0 -0
  17. src/cocktails/representation_learning/__pycache__/__init__.cpython-39.pyc +0 -0
  18. src/cocktails/representation_learning/__pycache__/dataset.cpython-39.pyc +0 -0
  19. src/cocktails/representation_learning/__pycache__/multihead_model.cpython-39.pyc +0 -0
  20. src/cocktails/representation_learning/__pycache__/run.cpython-39.pyc +0 -0
  21. src/cocktails/representation_learning/__pycache__/run_without_vae.cpython-39.pyc +0 -0
  22. src/cocktails/representation_learning/__pycache__/simple_model.cpython-39.pyc +0 -0
  23. src/cocktails/representation_learning/__pycache__/vae_model.cpython-39.pyc +0 -0
  24. src/cocktails/representation_learning/dataset.py +324 -0
  25. src/cocktails/representation_learning/multihead_model.py +148 -0
  26. src/cocktails/representation_learning/run.py +557 -0
  27. src/cocktails/representation_learning/run_simple_net.py +302 -0
  28. src/cocktails/representation_learning/run_without_vae.py +514 -0
  29. src/cocktails/representation_learning/simple_model.py +54 -0
  30. src/cocktails/representation_learning/vae_model.py +238 -0
  31. src/cocktails/utilities/__init__.py +0 -0
  32. src/cocktails/utilities/__pycache__/__init__.cpython-39.pyc +0 -0
  33. src/cocktails/utilities/__pycache__/cocktail_category_detection_utilities.cpython-39.pyc +0 -0
  34. src/cocktails/utilities/__pycache__/cocktail_utilities.cpython-39.pyc +0 -0
  35. src/cocktails/utilities/__pycache__/glass_and_volume_utilities.cpython-39.pyc +0 -0
  36. src/cocktails/utilities/__pycache__/ingredients_utilities.cpython-39.pyc +0 -0
  37. src/cocktails/utilities/__pycache__/other_scrubbing_utilities.cpython-39.pyc +0 -0
  38. src/cocktails/utilities/analysis_utilities.py +189 -0
  39. src/cocktails/utilities/cocktail_category_detection_utilities.py +221 -0
  40. src/cocktails/utilities/cocktail_generation_utilities/__init__.py +0 -0
  41. src/cocktails/utilities/cocktail_generation_utilities/__pycache__/__init__.cpython-39.pyc +0 -0
  42. src/cocktails/utilities/cocktail_generation_utilities/__pycache__/individual.cpython-39.pyc +0 -0
  43. src/cocktails/utilities/cocktail_generation_utilities/__pycache__/population.cpython-39.pyc +0 -0
  44. src/cocktails/utilities/cocktail_generation_utilities/individual.py +587 -0
  45. src/cocktails/utilities/cocktail_generation_utilities/population.py +213 -0
  46. src/cocktails/utilities/cocktail_utilities.py +220 -0
  47. src/cocktails/utilities/glass_and_volume_utilities.py +42 -0
  48. src/cocktails/utilities/ingredients_utilities.py +209 -0
  49. src/cocktails/utilities/other_scrubbing_utilities.py +240 -0
  50. src/debugger.py +180 -0
src/__init__.py ADDED
File without changes
src/cocktails/__init__.py ADDED
File without changes
src/cocktails/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (198 Bytes)
src/cocktails/__pycache__/config.cpython-39.pyc ADDED
Binary file (961 Bytes)
src/cocktails/config.py ADDED
@@ -0,0 +1,21 @@
+ import os
+
+ REPO_PATH = '/'.join(os.path.abspath(__file__).split('/')[:-3]) + '/'
+
+ # QUADRUPLETS_PATH = REPO_PATH + 'checkpoints/cocktail_representation/quadruplets.pickle'
+ INGREDIENTS_LIST_PATH = REPO_PATH + 'checkpoints/cocktail_representation/ingredient_list.csv'
+ # ING_MATCH_SCORE_Q_PATH = REPO_PATH + 'checkpoints/cocktail_representation/ingredient_match_score_q.txt'
+ # ING_MATCH_SCORE_COUNT_PATH = REPO_PATH + 'checkpoints/cocktail_representation/ingredient_match_score_count.txt'
+ # COCKTAIL_DATA_FOLDER_PATH = REPO_PATH + 'checkpoints/cocktail_representation/'
+ COCKTAILS_CSV_DATA = REPO_PATH + 'checkpoints/cocktail_representation/cocktails_data.csv'
+ # COCKTAILS_PKL_DATA = REPO_PATH + 'checkpoints/cocktail_representation/cocktails_data.pkl'
+ # COCKTAILS_URL_DATA = REPO_PATH + 'checkpoints/cocktail_representation/cocktails_names_urls.pkl'
+ EXPERIMENT_PATH = REPO_PATH + 'experiments/cocktails/representation_learning/'
+ # ANALYSIS_PATH = REPO_PATH + 'experiments/cocktails/representation_analysis/'
+ # REPRESENTATIONS_PATH = REPO_PATH + 'experiments/cocktails/learned_representations/'
+
+ FULL_COCKTAIL_REP_PATH = REPO_PATH + "/checkpoints/cocktail_representation/handcoded_reps/cocktail_handcoded_reps_minmax_norm-1_1_dim13_customkeys.txt"
+ RECIPE2FEATURES_PATH = REPO_PATH + "/checkpoints/cocktail_representation/"  # get this by running run_without_vae
+ COCKTAIL_REP_CHKPT_PATH = REPO_PATH + "/checkpoints/cocktail_representation/handcoded_reps/"
+ # FULL_COCKTAIL_REP_PATH = REPO_PATH + "experiments/cocktails/representation_analysis/affective_mapping/clustered_representations/all_cocktail_reps_norm-1_1_custom_keys_dim13.txt"
+ COCKTAIL_NN_PATH = REPO_PATH + "/checkpoints/cocktail_representation/handcoded_reps/nn_model.pickle"
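For reference, a minimal sketch of what the REPO_PATH expression above evaluates to, using a hypothetical checkout location (the path below is illustrative, not part of the repository):

# Hypothetical absolute location of src/cocktails/config.py.
file_path = '/home/user/pianocktail/src/cocktails/config.py'

# Dropping the last three components ('src', 'cocktails', 'config.py')
# leaves the repository root, with a trailing slash appended.
repo_path = '/'.join(file_path.split('/')[:-3]) + '/'
assert repo_path == '/home/user/pianocktail/'

Note that the constants built as REPO_PATH + "/checkpoints/..." produce a double slash, since REPO_PATH already ends in '/'; POSIX paths tolerate this, so the files still resolve.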
src/cocktails/pipeline/__init__.py ADDED
File without changes
src/cocktails/pipeline/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (207 Bytes)
src/cocktails/pipeline/__pycache__/cocktail2affect.cpython-39.pyc ADDED
Binary file (13.5 kB)
src/cocktails/pipeline/__pycache__/cocktailrep2recipe.cpython-39.pyc ADDED
Binary file (10.6 kB)
src/cocktails/pipeline/__pycache__/get_affect2affective_cluster.cpython-39.pyc ADDED
Binary file (1.15 kB)
src/cocktails/pipeline/__pycache__/get_cocktail2affective_cluster.cpython-39.pyc ADDED
Binary file (789 Bytes)
src/cocktails/pipeline/cocktail2affect.py ADDED
@@ -0,0 +1,372 @@
+ import pandas as pd
+ import numpy as np
+ import os
+ from src.cocktails.utilities.cocktail_utilities import get_bunch_of_rep_keys
+ from src.cocktails.utilities.other_scrubbing_utilities import print_recipe
+ from src.cocktails.config import COCKTAILS_CSV_DATA
+ from src.music.config import CHECKPOINTS_PATH, EXPERIMENT_PATH
+ import matplotlib.pyplot as plt
+ from sklearn.cluster import KMeans
+ from sklearn.mixture import GaussianMixture
+ from sklearn.neighbors import NearestNeighbors
+ import pickle
+ import random
+
+ experiment_path = EXPERIMENT_PATH + '/cocktails/representation_analysis/affective_mapping/'
+ min_max_path = CHECKPOINTS_PATH + "/cocktail_representation/minmax/"
+ cluster_model_path = CHECKPOINTS_PATH + "/music2cocktails/affects2affect_cluster/cluster_model.pickle"
+ affective_space_dimensions = ((-1, 1), (-1, 1), (-1, 1))  # valence, arousal, dominance
+ n_splits = (3, 3, 2)  # number of bins per dimension
+ # dimensions_weights = [1, 1, 0.5]
+ dimensions_weights = [1, 1, 1]
+ total_n_clusters = np.prod(n_splits)  # total number of bins
+ affective_boundaries = [np.arange(asd[0], asd[1] + 1e-6, (asd[1] - asd[0]) / n_split) for asd, n_split in zip(affective_space_dimensions, n_splits)]
+ for af in affective_boundaries:
+     af[-1] += 1e-6
+ all_keys = get_bunch_of_rep_keys()['custom']
+ original_affective_keys = get_bunch_of_rep_keys()['affective']
+ affective_keys = [a.split(' ')[1] for a in original_affective_keys]
+ random.seed(0)
+ cluster_colors = ['#%06X' % random.randint(0, 0xFFFFFF) for _ in range(total_n_clusters)]
+
+ clustering_method = 'k_means'  # 'k_means', 'handcoded', 'agglo', 'spectral'
+ if clustering_method != 'handcoded':
+     total_n_clusters = 10
+ min_arousal = np.loadtxt(min_max_path + 'min_arousal.txt')
+ max_arousal = np.loadtxt(min_max_path + 'max_arousal.txt')
+ min_val = np.loadtxt(min_max_path + 'min_valence.txt')
+ max_val = np.loadtxt(min_max_path + 'max_valence.txt')
+ min_dom = np.loadtxt(min_max_path + 'min_dominance.txt')
+ max_dom = np.loadtxt(min_max_path + 'max_dominance.txt')
+
+ def get_cocktail_reps(path, save=False):
+     cocktail_data = pd.read_csv(path)
+     cocktail_reps = np.array([cocktail_data[k] for k in original_affective_keys]).transpose()
+     n_data, dim_rep = cocktail_reps.shape
+     # print(f'{n_data} data points of {dim_rep} dimensions: {affective_keys}')
+     cocktail_reps = normalize_cocktail_reps_affective(cocktail_reps, save=save)
+     if save:
+         np.savetxt(experiment_path + f'cocktail_reps_for_affective_mapping_-1_1_norm_sigmoid_rescaling_{dim_rep}_keys.txt', cocktail_reps)
+     return cocktail_reps
+
+ def sigmoid(x, shift, beta):
+     return (1 / (1 + np.exp(-(x + shift) * beta)) - 0.5) * 2
+
+ def normalize_cocktail_reps_affective(cocktail_reps, save=False):
+     if save:
+         min_cr = cocktail_reps.min(axis=0)
+         max_cr = cocktail_reps.max(axis=0)
+         np.savetxt(min_max_path + 'min_cocktail_reps_affective.txt', min_cr)
+         np.savetxt(min_max_path + 'max_cocktail_reps_affective.txt', max_cr)
+     else:
+         min_cr = np.loadtxt(min_max_path + 'min_cocktail_reps_affective.txt')
+         max_cr = np.loadtxt(min_max_path + 'max_cocktail_reps_affective.txt')
+     cocktail_reps = ((cocktail_reps - min_cr) / (max_cr - min_cr) - 0.5) * 2
+     cocktail_reps[:, 0] = sigmoid(cocktail_reps[:, 0], shift=0.05, beta=4)
+     cocktail_reps[:, 1] = sigmoid(cocktail_reps[:, 1], shift=0.3, beta=5)
+     cocktail_reps[:, 2] = sigmoid(cocktail_reps[:, 2], shift=0.15, beta=3)
+     cocktail_reps[:, 3] = sigmoid(cocktail_reps[:, 3], shift=0.9, beta=20)
+     cocktail_reps[:, 4] = sigmoid(cocktail_reps[:, 4], shift=0, beta=4)
+     cocktail_reps[:, 5] = sigmoid(cocktail_reps[:, 5], shift=0.2, beta=3)
+     cocktail_reps[:, 6] = sigmoid(cocktail_reps[:, 6], shift=0.5, beta=5)
+     cocktail_reps[:, 7] = sigmoid(cocktail_reps[:, 7], shift=0.2, beta=6)
+     return cocktail_reps
+
+ def plot(cocktail_reps):
+     dim_rep = cocktail_reps.shape[1]
+     for i in range(dim_rep):
+         for j in range(i + 1, dim_rep):
+             plt.figure()
+             plt.scatter(cocktail_reps[:, i], cocktail_reps[:, j], s=150, alpha=0.5)
+             plt.xlabel(affective_keys[i])
+             plt.ylabel(affective_keys[j])
+             plt.savefig(experiment_path + f'scatters/{affective_keys[i]}_vs_{affective_keys[j]}.png', dpi=300)
+             plt.close('all')
+         plt.figure()
+         plt.hist(cocktail_reps[:, i])
+         plt.xlabel(affective_keys[i])
+         plt.savefig(experiment_path + f'hists/{affective_keys[i]}.png', dpi=300)
+         plt.close('all')
+
+ def get_clusters(affective_coordinates, save=False):
+     if clustering_method in ['k_means', 'gmm']:
+         if clustering_method == 'k_means': model = KMeans(n_clusters=total_n_clusters)
+         elif clustering_method == 'gmm': model = GaussianMixture(n_components=total_n_clusters, covariance_type="full")
+         model.fit(affective_coordinates * np.array(dimensions_weights))
+
+         def find_cluster(aff_coord):
+             if aff_coord.ndim == 1:
+                 aff_coord = aff_coord.reshape(1, -1)
+             return model.predict(aff_coord * np.array(dimensions_weights))
+         cluster_centers = model.cluster_centers_ if clustering_method == 'k_means' else []
+         if save:
+             to_save = dict(cluster_model=model,
+                            cluster_centers=cluster_centers,
+                            nb_clusters=len(cluster_centers),
+                            dimensions_weights=dimensions_weights)
+             with open(cluster_model_path, 'wb') as f:
+                 pickle.dump(to_save, f)
+             stop = 1
+
+     elif clustering_method == 'handcoded':
+         def find_cluster(aff_coord):
+             if aff_coord.ndim == 1:
+                 aff_coord = aff_coord.reshape(1, -1)
+             cluster_coordinates = []
+             for i in range(aff_coord.shape[0]):
+                 cluster_coordinates.append([np.argwhere(affective_boundaries[j] <= aff_coord[i, j]).flatten()[-1] for j in range(3)])
+             cluster_coordinates = np.array(cluster_coordinates)
+             cluster_ids = cluster_coordinates[:, 0] * np.prod(n_splits[1:]) + cluster_coordinates[:, 1] * n_splits[-1] + cluster_coordinates[:, 2]
+             return cluster_ids
+         # find cluster centers
+         cluster_centers = []
+         for i in range(n_splits[0]):
+             asd = affective_space_dimensions[0]
+             x_coordinate = np.arange(asd[0] + 1 / n_splits[0], asd[1], (asd[1] - asd[0]) / n_splits[0])[i]
+             for j in range(n_splits[1]):
+                 asd = affective_space_dimensions[1]
+                 y_coordinate = np.arange(asd[0] + 1 / n_splits[1], asd[1], (asd[1] - asd[0]) / n_splits[1])[j]
+                 for k in range(n_splits[2]):
+                     asd = affective_space_dimensions[2]
+                     z_coordinate = np.arange(asd[0] + 1 / n_splits[2], asd[1], (asd[1] - asd[0]) / n_splits[2])[k]
+                     cluster_centers.append([x_coordinate, y_coordinate, z_coordinate])
+         cluster_centers = np.array(cluster_centers)
+     else:
+         raise NotImplementedError
+     cluster_ids = find_cluster(affective_coordinates)
+     return cluster_ids, cluster_centers, find_cluster
+
+
+ def cocktail2affect(cocktail_reps, save=False):
+     if cocktail_reps.ndim == 1:
+         cocktail_reps = cocktail_reps.reshape(1, -1)
+
+     assert affective_keys == ['booze', 'sweet', 'sour', 'fizzy', 'complex', 'bitter', 'spicy', 'colorful']
+     all_weights = []
+
+     # valence
+     # + sweet - bitter - booze + colorful
+     weights = np.array([-1, 1, 0, 0, 0, -1, 0, 1])
+     valence = (cocktail_reps * weights).sum(axis=1)
+     if save:
+         min_ = valence.min()
+         max_ = valence.max()
+         np.savetxt(min_max_path + 'min_valence.txt', np.array([min_]))
+         np.savetxt(min_max_path + 'max_valence.txt', np.array([max_]))
+     else:
+         min_ = min_val
+         max_ = max_val
+     valence = 2 * ((valence - min_) / (max_ - min_) - 0.5)
+     valence = sigmoid(valence, shift=0.1, beta=3.5)
+     valence = valence.reshape(-1, 1)
+     all_weights.append(weights.copy())
+
+     # arousal
+     # + booze + sour + fizzy + complex + spicy (earlier weighting kept below)
+     # weights = np.array([0, -1, 1, 1, 1, 1, 1, 0])
+     weights = np.array([0.7, 0, 1.5, 1.5, 0.6, 0, 0.6, 0])
+     arousal = (cocktail_reps * weights).sum(axis=1)
+     if save:
+         min_ = arousal.min()
+         max_ = arousal.max()
+         np.savetxt(min_max_path + 'min_arousal.txt', np.array([min_]))
+         np.savetxt(min_max_path + 'max_arousal.txt', np.array([max_]))
+     else:
+         min_, max_ = min_arousal, max_arousal
+     arousal = 2 * ((arousal - min_) / (max_ - min_) - 0.5)  # normalize to -1, 1
+     arousal = sigmoid(arousal, shift=0.3, beta=4)
+     arousal = arousal.reshape(-1, 1)
+     all_weights.append(weights.copy())
+
+     # dominance
+     # assert affective_keys == ['booze', 'sweet', 'sour', 'fizzy', 'complex', 'bitter', 'spicy', 'colorful']
+     # + booze + fizzy - complex - bitter - sweet
+     weights = np.array([1.5, -0.8, 0, 0.7, -1, -1.5, 0, 0])
+     dominance = (cocktail_reps * weights).sum(axis=1)
+     if save:
+         min_ = dominance.min()
+         max_ = dominance.max()
+         np.savetxt(min_max_path + 'min_dominance.txt', np.array([min_]))
+         np.savetxt(min_max_path + 'max_dominance.txt', np.array([max_]))
+     else:
+         min_, max_ = min_dom, max_dom
+     dominance = 2 * ((dominance - min_) / (max_ - min_) - 0.5)
+     dominance = sigmoid(dominance, shift=-0.05, beta=5)
+     dominance = dominance.reshape(-1, 1)
+     all_weights.append(weights.copy())
+
+     affective_coordinates = np.concatenate([valence, arousal, dominance], axis=1)
+     # if save:
+     #     assert (affective_coordinates.min(axis=0) == np.array([ac[0] for ac in affective_space_dimensions])).all()
+     #     assert (affective_coordinates.max(axis=0) == np.array([ac[1] for ac in affective_space_dimensions])).all()
+     return affective_coordinates, all_weights
+
+ def save_reps(path, affective_cluster_ids):
+     cocktail_data = pd.read_csv(path)
+     rep_keys = get_bunch_of_rep_keys()['custom']
+     cocktail_reps = np.array([cocktail_data[k] for k in rep_keys]).transpose()
+     np.savetxt(experiment_path + 'clustered_representations/' + f'min_cocktail_reps_custom_keys_dim{cocktail_reps.shape[1]}.txt', cocktail_reps.min(axis=0))
+     np.savetxt(experiment_path + 'clustered_representations/' + f'max_cocktail_reps_custom_keys_dim{cocktail_reps.shape[1]}.txt', cocktail_reps.max(axis=0))
+     cocktail_reps = ((cocktail_reps - cocktail_reps.min(axis=0)) / (cocktail_reps.max(axis=0) - cocktail_reps.min(axis=0)) - 0.5) * 2  # normalize in -1, 1
+     np.savetxt(experiment_path + 'clustered_representations/' + f'all_cocktail_reps_norm-1_1_custom_keys_dim{cocktail_reps.shape[1]}.txt', cocktail_reps)
+     np.savetxt(experiment_path + 'clustered_representations/' + 'affective_cluster_ids.txt', affective_cluster_ids)
+     for cluster_id in sorted(set(affective_cluster_ids)):
+         indexes = np.argwhere(affective_cluster_ids == cluster_id).flatten()
+         reps = cocktail_reps[indexes, :]
+         np.savetxt(experiment_path + 'clustered_representations/' + f'rep_cluster{cluster_id}_norm-1_1_custom_keys_dim{cocktail_reps.shape[1]}.txt', reps)
+
+ def study_affects(affective_coordinates, affective_cluster_ids):
+     plt.figure()
+     plt.hist(affective_cluster_ids, bins=total_n_clusters)
+     plt.xlabel('Affective cluster ids')
+     plt.xticks(np.arange(total_n_clusters))
+     plt.savefig(experiment_path + 'affective_cluster_distrib.png')
+     fig = plt.gcf()
+     plt.close(fig)
+
+     fig = plt.figure()
+     ax = fig.add_subplot(projection='3d')
+     ax.set_xlim([-1, 1])
+     ax.set_ylim([-1, 1])
+     ax.set_zlim([-1, 1])
+     for cluster_id in sorted(set(affective_cluster_ids)):
+         indexes = np.argwhere(affective_cluster_ids == cluster_id).flatten()
+         ax.scatter(affective_coordinates[indexes, 0], affective_coordinates[indexes, 1], affective_coordinates[indexes, 2], c=cluster_colors[cluster_id], s=150)
+     ax.set_xlabel('Valence')
+     ax.set_ylabel('Arousal')
+     ax.set_zlabel('Dominance')
+     stop = 1
+     plt.savefig(experiment_path + 'scatters_affect/affective_mapping.png')
+     fig = plt.gcf()
+     plt.close(fig)
+
+     affects = ['Valence', 'Arousal', 'Dominance']
+     for i in range(3):
+         for j in range(i + 1, 3):
+             fig = plt.figure()
+             ax = fig.add_subplot()
+             for cluster_id in sorted(set(affective_cluster_ids)):
+                 indexes = np.argwhere(affective_cluster_ids == cluster_id).flatten()
+                 ax.scatter(affective_coordinates[indexes, i], affective_coordinates[indexes, j], alpha=0.5, c=cluster_colors[cluster_id], s=150)
+             ax.set_xlabel(affects[i])
+             ax.set_ylabel(affects[j])
+             plt.savefig(experiment_path + f'scatters_affect/scatter_{affects[i]}_vs_{affects[j]}.png')
+             fig = plt.gcf()
+             plt.close(fig)
+         plt.figure()
+         plt.hist(affective_coordinates[:, i])
+         plt.xlabel(affects[i])
+         plt.savefig(experiment_path + f'hists_affect/hist_{affects[i]}.png')
+         fig = plt.gcf()
+         plt.close(fig)
+     plt.close('all')
+     stop = 1
+
+ def sample_clusters(path, cocktail_reps, all_weights, affective_cluster_ids, affective_cluster_centers, affective_coordinates, n_samples=4):
+     cocktail_data = pd.read_csv(path)
+     these_cocktail_reps = normalize_cocktail_reps_affective(np.array([cocktail_data[k] for k in original_affective_keys]).transpose())
+
+     names = cocktail_data['names']
+     urls = cocktail_data['urls']
+     ingr_str = cocktail_data['ingredients_str']
+     for cluster_id in sorted(set(affective_cluster_ids)):
+         indexes = np.argwhere(affective_cluster_ids == cluster_id).flatten()
+         print('\n\n\n---------\n----------\n-----------\n')
+         cluster_str = ''
+         cluster_str += f'Affective cluster #{cluster_id}' + \
+                        f'\n\tSize: {len(indexes)}' + \
+                        f'\n\tCenter: ' + \
+                        f'\n\t\tVal: {affective_cluster_centers[cluster_id][0]:.2f}, ' + \
+                        f'\n\t\tArousal: {affective_cluster_centers[cluster_id][1]:.2f}, ' + \
+                        f'\n\t\tDominance: {affective_cluster_centers[cluster_id][2]:.2f}'
+         print(cluster_str)
+         if affective_cluster_centers[cluster_id][2] == np.max(affective_cluster_centers[:, 2]):
+             stop = 1
+         sampled_idx = np.random.choice(indexes, size=min(len(indexes), n_samples), replace=False)
+         cocktail_str = ''
+         for i in sampled_idx:
+             assert np.sum(cocktail_reps[i] - these_cocktail_reps[i]) < 1e-9
+             cocktail_str += f'\n\n-------------'
+             cocktail_str += print_recipe(ingr_str[i], name=names[i], to_print=False)
+             cocktail_str += f'\nUrl: {urls[i]}'
+             cocktail_str += '\n\nRepresentation: ' + ', '.join([f'{af}: {cr:.2f}' for af, cr in zip(affective_keys, cocktail_reps[i])]) + '\n'
+             cocktail_str += '\n' + generate_explanation(cocktail_reps[i], all_weights, affective_coordinates[i])
+             print(cocktail_str)
+             stop = 1
+         cluster_str += '\n' + cocktail_str
+         with open(f"/home/cedric/Documents/pianocktail/experiments/cocktails/representation_analysis/affective_mapping/clusters/cluster_{cluster_id}", 'w') as f:
+             f.write(cluster_str)
+         stop = 1
+
+ def explanation_per_dimension(i, cocktail_rep, all_weights, aff_coord):
+     names = ['valence', 'arousal', 'dominance']
+     weights = all_weights[i]
+     explanation_str = f'\n{names[i].capitalize()} explanation ({aff_coord[i]:.2f}):'
+     strengths = np.abs(weights * cocktail_rep)
+     strengths /= strengths.sum()
+     indexes = np.flip(np.argsort(strengths))
+     for ind in indexes:
+         if strengths[ind] != 0:
+             if np.sign(weights[ind]) == np.sign(cocktail_rep[ind]):
+                 keyword = 'high' if cocktail_rep[ind] > 0 else 'low'
+                 explanation_str += f'\n\t{int(strengths[ind]*100)}%: higher {names[i]} because {keyword} {affective_keys[ind]}'
+             else:
+                 keyword = 'high' if cocktail_rep[ind] > 0 else 'low'
+                 explanation_str += f'\n\t{int(strengths[ind]*100)}%: low {names[i]} because {keyword} {affective_keys[ind]}'
+     return explanation_str
+
+ def generate_explanation(cocktail_rep, all_weights, aff_coord):
+     explanation_str = ''
+     for i in range(3):
+         explanation_str += explanation_per_dimension(i, cocktail_rep, all_weights, aff_coord)
+     return explanation_str
+
+ def cocktails2affect_clusters(cocktail_rep):
+     if cocktail_rep.ndim == 1:
+         cocktail_rep = cocktail_rep.reshape(1, -1)
+     affective_coordinates, _ = cocktail2affect(cocktail_rep)
+     affective_cluster_ids, _, _ = get_clusters(affective_coordinates)
+     return affective_cluster_ids
+
+
+ def setup_affective_space(path, save=False):
+     cocktail_data = pd.read_csv(path)
+     names = cocktail_data['names']
+     recipes = cocktail_data['ingredients_str']
+     urls = cocktail_data['urls']
+     reps = get_cocktail_reps(path)
+     affective_coordinates, all_weights = cocktail2affect(reps)
+     affective_cluster_ids, affective_cluster_centers, find_cluster = get_clusters(affective_coordinates, save=save)
+     nn_model = NearestNeighbors(n_neighbors=1)
+     nn_model.fit(affective_coordinates)
+     def cocktail2affect_cluster(cocktail_rep):
+         affective_coordinates, _ = cocktail2affect(cocktail_rep)
+         return find_cluster(affective_coordinates)
+
+     affective_clusters = dict(affective_coordinates=affective_coordinates,  # coordinates of cocktails in affective space
+                               affective_cluster_ids=affective_cluster_ids,  # cluster ids of cocktails
+                               affective_cluster_centers=affective_cluster_centers,  # cluster centers in affective space
+                               affective_weights=all_weights,  # weights to compute valence, arousal, dominance from cocktail representations
+                               original_affective_keys=original_affective_keys,
+                               cocktail_reps=reps,  # cocktail representations from the dataset (normalized)
+                               find_cluster=find_cluster,  # function to retrieve a cluster from affective coordinates
+                               nn_model=nn_model,  # to predict the nearest neighbor in affective space
+                               names=names,  # names of cocktails in the dataset
+                               urls=urls,  # urls from the dataset
+                               recipes=recipes,  # recipes of the dataset
+                               cocktail2affect=cocktail2affect,  # function to compute affects from cocktail representations
+                               cocktails2affect_clusters=cocktails2affect_clusters,
+                               cocktail2affect_cluster=cocktail2affect_cluster
+                               )
+
+     return affective_clusters
+
+ if __name__ == '__main__':
+     reps = get_cocktail_reps(COCKTAILS_CSV_DATA, save=True)
+     # plot(reps)
+     affective_coordinates, all_weights = cocktail2affect(reps, save=True)
+     affective_cluster_ids, affective_cluster_centers, find_cluster = get_clusters(affective_coordinates)
+     save_reps(COCKTAILS_CSV_DATA, affective_cluster_ids)
+     study_affects(affective_coordinates, affective_cluster_ids)
+     sample_clusters(COCKTAILS_CSV_DATA, reps, all_weights, affective_cluster_ids, affective_cluster_centers, affective_coordinates)
+     setup_affective_space(COCKTAILS_CSV_DATA, save=True)
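To make the affect mapping concrete, here is a small self-contained sketch of the valence branch of cocktail2affect for a single, already-normalized representation. The input vector and the min/max statistics are invented for illustration; in the pipeline they come from the dataset (min/max_valence.txt):

import numpy as np

def sigmoid(x, shift, beta):
    # same squashing as above: a logistic curve rescaled from (0, 1) to (-1, 1)
    return (1 / (1 + np.exp(-(x + shift) * beta)) - 0.5) * 2

# invented rep in key order: [booze, sweet, sour, fizzy, complex, bitter, spicy, colorful]
rep = np.array([0.4, -0.2, 0.1, 0.8, 0.0, -0.5, 0.3, 0.6])
weights = np.array([-1, 1, 0, 0, 0, -1, 0, 1])  # valence: + sweet - bitter - booze + colorful
raw_valence = (rep * weights).sum()  # -0.4 - 0.2 + 0.5 + 0.6 = 0.5

min_val, max_val = -2.0, 2.0  # assumed dataset statistics, illustrative only
valence = 2 * ((raw_valence - min_val) / (max_val - min_val) - 0.5)  # 0.25
valence = sigmoid(valence, shift=0.1, beta=3.5)  # ~0.55, squashed into (-1, 1)

Arousal and dominance follow the same pattern with their own weight vectors, min/max files, and sigmoid parameters.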
src/cocktails/pipeline/cocktailrep2recipe.py ADDED
@@ -0,0 +1,329 @@
+ import time
+ import pickle
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from sklearn.neighbors import NearestNeighbors
+ from src.cocktails.utilities.cocktail_generation_utilities.population import *
+ from src.cocktails.utilities.glass_and_volume_utilities import glass_volume
+ from src.cocktails.utilities.other_scrubbing_utilities import print_recipe
+ from src.cocktails.config import RECIPE2FEATURES_PATH
+ # data, rep_keys, COCKTAIL_REPS, normalize_cocktail, all_reps, Population and
+ # IndividualCocktail are expected to come from the population star import.
+
+ def test_mutation_params(cocktail_reps):
+     indexes = np.arange(cocktail_reps.shape[0])
+     np.random.shuffle(indexes)
+     perfs = []
+     mutated_perfs = []
+     pop_params = dict(mutation_params=dict(p_add_ing=0.7,
+                                            p_remove_ing=0.7,
+                                            p_switch_ing=0.5,
+                                            p_change_q=0.7,
+                                            delta_change_q=0.3,
+                                            asexual_rep=True,
+                                            crossover=True,
+                                            ingredient_addition=(0.1, 0.05)),
+                       nb_generations=100,
+                       pop_size=100,
+                       nb_elites=10,
+                       dist='mse',
+                       n_neighbors=5)
+
+     for i in indexes[:20]:
+         target = cocktail_reps[i]
+         for j in range(100):
+             parent = IndividualCocktail(pop_params=pop_params,
+                                         target_affective_cluster=None,
+                                         target=target.copy())
+             perfs.append(parent.perf)
+             child = parent.get_child()[0]
+             # child.compute_cocktail_rep()
+             # child.compute_perf()
+             if perfs[-1] != child.perf:
+                 mutated_perfs.append(child.perf)
+             else:
+                 perfs.pop(-1)
+     filtered_children = np.argwhere(np.array(mutated_perfs) == -100).flatten()
+     non_filtered_ids = np.argwhere(np.logical_and(np.array(perfs) != -100, np.array(mutated_perfs) != -100)).flatten()
+     print(f'Proportion of filtered: {filtered_children.size} / {len(mutated_perfs)} = {int(filtered_children.size / len(mutated_perfs) * 100)}%')
+     plt.figure()
+     plt.scatter(np.array(perfs)[non_filtered_ids], np.array(mutated_perfs)[non_filtered_ids], s=100, alpha=0.5)
+     plt.xlabel('parent perf')
+     plt.ylabel('child perf')
+     print(np.corrcoef(np.array(perfs)[non_filtered_ids], np.array(mutated_perfs)[non_filtered_ids])[0, 1])
+     plt.show()
+     stop = 1
+
+ def test_crossover(cocktail_reps):
+     indexes = np.arange(cocktail_reps.shape[0])
+     np.random.shuffle(indexes)
+     perfs = []
+     mutated_perfs = []
+     pop_params = dict(mutation_params=dict(p_add_ing=0.7,
+                                            p_remove_ing=0.7,
+                                            p_switch_ing=0.5,
+                                            p_change_q=0.7,
+                                            delta_change_q=0.3,
+                                            asexual_rep=True,
+                                            crossover=True,
+                                            ingredient_addition=(0.1, 0.05)),
+                       nb_generations=100,
+                       pop_size=100,
+                       nb_elites=10,
+                       dist='mse',
+                       n_neighbors=5)
+     for i in indexes[:20]:
+         for j in range(100):
+             target = cocktail_reps[i]
+             parent1 = IndividualCocktail(pop_params=pop_params,
+                                          target_affective_cluster=None,
+                                          target=target.copy())
+             parent2 = IndividualCocktail(pop_params=pop_params,
+                                          target_affective_cluster=None,
+                                          target=target.copy())
+             child = parent1.get_child_with(parent2)[0]
+             # child.compute_cocktail_rep()
+             # child.compute_perf()
+             perfs.append((parent1.perf + parent2.perf) / 2)
+             if perfs[-1] != child.perf:
+                 mutated_perfs.append(child.perf)
+             else:
+                 perfs.pop(-1)
+     filtered_children = np.argwhere(np.array(mutated_perfs) == -100).flatten()
+     non_filtered_ids = np.argwhere(np.logical_and(np.array(perfs) > -45, np.array(mutated_perfs) != -100)).flatten()
+     print(f'Proportion of filtered: {filtered_children.size} / {len(mutated_perfs)} = {int(filtered_children.size / len(mutated_perfs) * 100)}%')
+     plt.figure()
+     plt.scatter(np.array(perfs)[non_filtered_ids], np.array(mutated_perfs)[non_filtered_ids], s=100, alpha=0.5)
+     plt.xlabel('parent perf')
+     plt.ylabel('child perf')
+     print(np.corrcoef(np.array(perfs)[non_filtered_ids], np.array(mutated_perfs)[non_filtered_ids])[0, 1])
+     plt.show()
+     stop = 1
+
+ def run_comparisons():
+     np.random.seed(0)
+     indexes = np.arange(cocktail_reps.shape[0])
+     np.random.shuffle(indexes)
+     for n_neighbors in [0, 5]:
+         id_str_neigh = '5neigh_' if n_neighbors == 5 else '0_neigh_'
+         for asexual_rep in [True, False]:
+             id_str_as = id_str_neigh + 'asexual_' if asexual_rep else id_str_neigh
+             for crossover in [True, False]:
+                 id_str = id_str_as + 'crossover_' if crossover else id_str_as
+                 if crossover or asexual_rep:
+                     mutation_params = dict(p_add_ing=0.5,
+                                            p_remove_ing=0.5,
+                                            p_change_q=0.5,
+                                            delta_change_q=0.3,
+                                            asexual_rep=asexual_rep,
+                                            crossover=crossover,
+                                            ingredient_addition=(0.1, 0.05))
+                     nb_generations = 100
+                     pop_size = 100
+                     nb_elites = 10
+                     dist = 'mse'
+                     results = dict()
+                     print(id_str)
+                     for i, ind in enumerate(indexes[:30]):
+                         print(i + 1)
+                         target_ing_str = data['ingredients_str'][ind]
+                         target = cocktail_reps[ind]
+                         population = Population(nb_generations=nb_generations, pop_size=pop_size, nb_elite=nb_elites,
+                                                 target=target, dist=dist, mutation_params=mutation_params,
+                                                 n_neighbors=n_neighbors, target_ing_str=target_ing_str, true_prep_type=data['category'][ind])
+                         population.run_evolution(verbose=False)
+                         best_scores, best_ind = population.get_best_score()
+                         recipes = [ind.get_recipe()[3] for ind in best_ind[:5]]
+                         results[str(ind)] = dict(best_scores=best_scores[:5], recipes=recipes, target=population.target_individual.get_recipe()[3])
+                     with open(f'/home/cedric/Desktop/ga_tests_{id_str}.pickle', 'wb') as f:
+                         pickle.dump(results, f)
+
+ def get_cocktail_distribution(cocktail_reps):
+     return (np.mean(cocktail_reps, axis=0), np.cov(cocktail_reps, rowvar=0))
+
+ def sample_cocktails(cocktail_reps, n=10, target_affective_cluster=None, to_print=True):
+     distrib = get_cocktail_distribution(cocktail_reps)
+     sampled_cocktail_reps = np.random.multivariate_normal(distrib[0], distrib[1], size=n)
+     recipes = []
+     closest_recipes = []
+     for i_c, cr in enumerate(sampled_cocktail_reps):
+         population = setup_recipe_generation(cr.copy(), target_affective_cluster=target_affective_cluster)
+         closest_recipes.append(population.nn_recipes[0])
+         best_scores, best_individuals = population.run_evolution()
+         recipes.append(best_individuals[0].get_recipe()[3])
+         if to_print:
+             print(f'Sample #{len(recipes)}:')
+             print(recipes[-1])
+             print('Closest from dataset:')
+             print(closest_recipes[-1])
+             stop = 1
+     return recipes, closest_recipes
+
+ def setup_recipe_generation(target, known_target_dict=None, target_affective_cluster=None):
+     # pop_params = dict(mutation_params=dict(p_add_ing=0.7,
+     #                                        p_remove_ing=0.7,
+     #                                        p_switch_ing=0.5,
+     #                                        p_change_q=0.7,
+     #                                        delta_change_q=0.3,
+     #                                        asexual_rep=True,
+     #                                        crossover=True,
+     #                                        ingredient_addition=(0.1, 0.05)),
+     #                   nb_generations=2,  # 100
+     #                   pop_size=5,  # 100
+     #                   nb_elites=2,  # 10
+     #                   dist='mse',
+     #                   n_neighbors=3)  # 5
+     pop_params = dict(mutation_params=dict(p_add_ing=0.4,
+                                            p_remove_ing=1,
+                                            p_switch_ing=0.5,
+                                            p_change_q=1,
+                                            delta_change_q=0.3,
+                                            asexual_rep=True,
+                                            crossover=True,
+                                            ingredient_addition=(0.1, 0.05)),
+                       nb_generations=100,  # 100
+                       pop_size=100,  # 100
+                       nb_elites=10,  # 10
+                       dist='mse',
+                       n_neighbors=5)  # 5
+
+     population = Population(target=target, target_affective_cluster=target_affective_cluster, known_target_dict=known_target_dict, pop_params=pop_params)
+     return population
+
+ def cocktailrep2recipe(cocktail_rep, unit='mL', target_affective_cluster=None, known_target_dict=None, n_output=1, return_ind=False, verbose=True, full_verbose=False, level=0):
+     init_time = time.time()
+     if verbose: print(' ' * level + 'Generating cocktail..')
+     if cocktail_rep.ndim > 1:
+         assert cocktail_rep.shape[0] == 1
+         cocktail_rep = cocktail_rep.flatten()
+         # target_affective_cluster = target_affective_cluster[0]
+     population = setup_recipe_generation(cocktail_rep.copy(), known_target_dict=known_target_dict, target_affective_cluster=target_affective_cluster)
+     if full_verbose:
+         print(' ' * (level + 2) + '3 nearest neighbors:')
+         for i, recipe, score in zip(range(3), population.nn_recipes[:3], population.nn_scores[:3]):
+             print(' ' * (level + 4) + f'#{i + 1}, score: {score:.2f}')
+             print(' ' * (level + 4) + recipe[1:].replace('None ()', '').replace('\t\t', ' ' * (level + 6)))
+     best_scores, best_individuals = population.run_evolution(verbose=full_verbose, level=level + 2)
+     for i in range(n_output):
+         best_individuals[i].make_recipe_fit_the_glass()
+     instructions = [ind.get_instructions() for ind in best_individuals[:n_output]]
+     recipes = [ind.get_recipe(unit=unit)[3] for ind in best_individuals[:n_output]]
+     glasses = [ind.glass for ind in best_individuals[:n_output]]
+     prep_types = [ind.prep_type for ind in best_individuals[:n_output]]
+     for i, g, p, inst in zip(range(len(recipes)), glasses, prep_types, instructions):
+         recipes[i] = recipes[i].replace('Recipe', 'Ingredients') + f'Serve in:\n {g.capitalize()} glass.\n' + inst
+     if full_verbose:
+         print(f'\n--------------\n{n_output} best results:')
+         for i, recipe, score in zip(range(n_output), recipes, best_scores[:n_output]):
+             print(f'#{i + 1}, score: {score:.2f}')
+             print(recipe)
+     if verbose: print(' ' * (level + 2) + f'Generated in {int(time.time() - init_time)} seconds.')
+     if return_ind:
+         return recipes, best_scores[:n_output], best_individuals[:n_output]
+     else:
+         return recipes, best_scores[:n_output]
+
+
+ def interpolate(cocktail_rep1, cocktail_rep2, alpha, verbose=False):
+     recipe, score = cocktailrep2recipe(alpha * cocktail_rep1 + (1 - alpha) * cocktail_rep2, verbose=verbose)
+     return recipe[0], score
+
+ def interpolation_study(n_steps, cocktail_reps):
+     alphas = np.arange(0, 1 + 1e-6, 1 / (n_steps + 1))
+     indexes = np.random.choice(np.arange(cocktail_reps.shape[0]), size=2, replace=False)
+     target_ing_str1, target_ing_str2 = data['ingredients_str'][indexes[0]], data['ingredients_str'][indexes[1]]
+     cocktail_rep1, cocktail_rep2 = cocktail_reps[indexes[0]], cocktail_reps[indexes[1]]
+     recipes, scores = [], []
+     for alpha in alphas:
+         recipe, score = interpolate(cocktail_rep1, cocktail_rep2, alpha)
+         recipes.append(recipe)
+         scores.append(score[0])
+     print('Point A:')
+     print_recipe(ingredient_str=target_ing_str2)
+     for i, alpha in enumerate(alphas):
+         print(f'Alpha = {alpha}, score = {scores[i]}')
+         print(recipes[i])
+     print('Point B:')
+     print_recipe(ingredient_str=target_ing_str1)
+     stop = 1
+
+ def test_robustness_affective_cluster(cocktail_reps):
+     indexes = np.arange(cocktail_reps.shape[0])
+     np.random.shuffle(indexes)
+     matches = []
+     for i in indexes:
+         target_ing_str = data['ingredients_str'][i]
+         true_prep_type = data['category'][i]
+         target = cocktail_reps[i]
+         # get affective cluster; pass the known recipe via known_target_dict
+         recipes, best_scores, best_inds = cocktailrep2recipe(cocktail_rep=target,
+                                                              known_target_dict=dict(ing_str=target_ing_str, prep_type=true_prep_type),
+                                                              n_output=1, verbose=False, return_ind=True)
+
+         matches.append(best_inds[0].does_affective_cluster_match())
+         print(np.mean(matches))
+
+ def test(cocktail_reps):
+     indexes = np.arange(these_cocktail_reps.shape[0])
+     unnormalized_cr = np.array([data[k] for k in rep_keys]).transpose()
+
+     for i in indexes:
+         target_ing_str = data['ingredients_str'][i]
+         true_prep_type = data['category'][i]
+         target = these_cocktail_reps[i]
+         # print('preptype:', true_prep_type)
+         # print('cocktail unnormalized', np.sum(unnormalized_cr[i]), unnormalized_cr[i])
+         # print('cocktail hand normalized', np.sum(normalize_cocktail(unnormalized_cr[i])), normalize_cocktail(unnormalized_cr[i]))
+         # print('cocktail rep normalized', np.sum(these_cocktail_reps[i]), these_cocktail_reps[i])
+         # print('cocktail rep normalized', np.sum(all_reps[i]), all_reps[i])
+
+         population = setup_recipe_generation(target.copy(),
+                                              known_target_dict=dict(ing_str=target_ing_str, prep_type=true_prep_type),
+                                              target_affective_cluster=None)
+         target = population.target_individual
+         target.compute_perf()
+         if target.perf < -50:
+             print(i)
+             print_recipe(target_ing_str)
+             if not target.is_alcohol_present(): print('No alcohol')
+             if not target.is_total_volume_enough(): print('small volume')
+             if not target.does_fit_glass():
+                 print(target.end_volume)
+                 print(glass_volume[target.get_glass_type()] * 0.81)
+                 print('too much volume')
+             if not target.is_alcohol_reasonable():
+                 print(f'amount of alcohol too small or too large: {target.alcohol_precentage}')
+             stop = 1
+
+
+ if __name__ == '__main__':
+     these_cocktail_reps = COCKTAIL_REPS.copy()
+     # test_crossover(these_cocktail_reps)
+     # test_mutation_params(these_cocktail_reps)
+     # test(these_cocktail_reps)
+     # recipes, closest_recipes = sample_cocktails(these_cocktail_reps, n=10)
+     # interpolation_study(n_steps=4, cocktail_reps=these_cocktail_reps)
+     # test_robustness_affective_cluster(these_cocktail_reps)
+     indexes = np.arange(these_cocktail_reps.shape[0])
+     np.random.shuffle(indexes)
+     # test_crossover(mutation_params, dist)
+     # test_mutation_params(mutation_params, dist)
+     stop = 1
+     unnormalized_cr = np.array([data[k] for k in rep_keys]).transpose()
+     for i in indexes:
+         print(i)
+         target_ing_str = data['ingredients_str'][i]
+         target_prep_type = data['category'][i]
+         target_glass = data['glass'][i]
+
+         print('preptype:', target_prep_type)
+         print('cocktail unnormalized', np.sum(unnormalized_cr[i]), unnormalized_cr[i])
+         print('cocktail hand normalized', np.sum(normalize_cocktail(unnormalized_cr[i])), normalize_cocktail(unnormalized_cr[i]))
+         print('cocktail rep normalized', np.sum(these_cocktail_reps[i]), these_cocktail_reps[i])
+         print('cocktail rep normalized', np.sum(all_reps[i]), all_reps[i])
+         print(i)
+
+         print('___________Target')
+         nn_model = NearestNeighbors()
+         nn_model.fit(these_cocktail_reps)
+         dists, indexes = nn_model.kneighbors(these_cocktail_reps[i].reshape(1, -1))
+         print(indexes)
+         print_recipe(target_ing_str)
+         target = these_cocktail_reps[i]
+         known_target_dict = dict(prep_type=target_prep_type,
+                                  ing_str=target_ing_str,
+                                  glass=target_glass)
+         recipes, best_scores = cocktailrep2recipe(cocktail_rep=target, known_target_dict=known_target_dict, n_output=1, verbose=True, full_verbose=True)
+
+         stop = 1
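As a usage sketch for the interpolation helper above, assuming the module-level COCKTAIL_REPS provided by the population star import has loaded (as in the __main__ block):

# Blend two dataset representations and decode a recipe at each mixing ratio.
reps = COCKTAIL_REPS.copy()
rep_a, rep_b = reps[0], reps[1]
for alpha in (1.0, 0.5, 0.0):  # alpha=1 decodes rep_a alone, alpha=0 decodes rep_b
    recipe, score = interpolate(rep_a, rep_b, alpha)
    print(f'alpha={alpha}: score={score[0]:.2f}')
    print(recipe)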
src/cocktails/pipeline/get_affect2affective_cluster.py ADDED
@@ -0,0 +1,23 @@
+ from src.music.config import CHECKPOINTS_PATH
+ import pickle
+ import numpy as np
+
+ # can be computed from cocktail2affect
+ cluster_model_path = CHECKPOINTS_PATH + "/music2cocktails/affects2affect_cluster/cluster_model.pickle"
+
+ def get_affect2affective_cluster():
+     with open(cluster_model_path, 'rb') as f:
+         data = pickle.load(f)
+     model = data['cluster_model']
+     dimensions_weights = data['dimensions_weights']
+     def find_cluster(aff_coord):
+         if aff_coord.ndim == 1:
+             aff_coord = aff_coord.reshape(1, -1)
+         return model.predict(aff_coord * np.array(dimensions_weights))
+     return find_cluster
+
+ def get_affective_cluster_centers():
+     with open(cluster_model_path, 'rb') as f:
+         data = pickle.load(f)
+     return data['cluster_centers']
+
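A short usage sketch for the loader above, with invented affective coordinates; the pickle layout (cluster_model, cluster_centers, dimensions_weights) is the one written by cocktail2affect.get_clusters(save=True):

import numpy as np

find_cluster = get_affect2affective_cluster()  # assumes cluster_model.pickle exists
coords = np.array([0.2, -0.5, 0.8])  # one (valence, arousal, dominance) point, illustrative values
cluster_id = find_cluster(coords)  # 1-D input is reshaped to (1, 3) internally
centers = get_affective_cluster_centers()
print(cluster_id[0], centers[cluster_id[0]])  # cluster index and its center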
src/cocktails/pipeline/get_cocktail2affective_cluster.py ADDED
@@ -0,0 +1,9 @@
+ from src.cocktails.pipeline.get_affect2affective_cluster import get_affect2affective_cluster
+ from src.cocktails.pipeline.cocktail2affect import cocktail2affect
+
+ def get_cocktail2affective_cluster():
+     find_cluster = get_affect2affective_cluster()
+     def cocktail2affect_cluster(cocktail_rep):
+         affective_coordinates, _ = cocktail2affect(cocktail_rep)
+         return find_cluster(affective_coordinates)
+     return cocktail2affect_cluster
src/cocktails/representation_learning/__init__.py ADDED
File without changes
src/cocktails/representation_learning/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (222 Bytes)
src/cocktails/representation_learning/__pycache__/dataset.cpython-39.pyc ADDED
Binary file (8.77 kB)
src/cocktails/representation_learning/__pycache__/multihead_model.cpython-39.pyc ADDED
Binary file (5.36 kB)
src/cocktails/representation_learning/__pycache__/run.cpython-39.pyc ADDED
Binary file (16.1 kB)
src/cocktails/representation_learning/__pycache__/run_without_vae.cpython-39.pyc ADDED
Binary file (15.7 kB)
src/cocktails/representation_learning/__pycache__/simple_model.cpython-39.pyc ADDED
Binary file (1.96 kB)
src/cocktails/representation_learning/__pycache__/vae_model.cpython-39.pyc ADDED
Binary file (8.28 kB)
src/cocktails/representation_learning/dataset.py ADDED
@@ -0,0 +1,324 @@
1
+ from torch.utils.data import Dataset
2
+ import pickle
3
+ from src.cocktails.utilities.ingredients_utilities import extract_ingredients, ingredient_list, ingredient_profiles, ingredients_per_type
4
+ from src.cocktails.utilities.other_scrubbing_utilities import print_recipe
5
+ import numpy as np
6
+
7
+ def get_representation_from_ingredient(ingredients, quantities, max_q_per_ing, index, params):
8
+ assert len(ingredients) == len(quantities)
9
+ ing, q = ingredients[index], quantities[index]
10
+ proportion = q / np.sum(quantities)
11
+ index_ing = ingredient_list.index(ing)
12
+ # add keys of profile
13
+ rep_ingredient = []
14
+ rep_ingredient += [ingredient_profiles[k][index_ing] for k in params['ing_keys']]
15
+ # add category encoding
16
+ # rep_ingredient += list(params['category_encodings'][ingredient_profiles['type'][index_ing]])
17
+ # add quantitiy and relative quantity
18
+ rep_ingredient += [q / max_q_per_ing[ing], proportion]
19
+ ing_one_hot = np.zeros(len(ingredient_list))
20
+ ing_one_hot[index_ing] = 1
21
+ rep_ingredient += list(ing_one_hot)
22
+ indexes_to_normalize = list(range(len(params['ing_keys'])))
23
+ #TODO: should we add ing one hot? Or make sure no 2 ing have same embedding
24
+ return np.array(rep_ingredient), indexes_to_normalize
25
+
26
+ def get_max_n_ingredients(data):
27
+ max_count = 0
28
+ ingredient_set = set()
29
+ alcohol_set = set()
30
+ liqueur_set = set()
31
+ ing_str = np.array(data['ingredients_str'])
32
+ for i in range(len(data['names'])):
33
+ ingredients, quantities = extract_ingredients(ing_str[i])
34
+ max_count = max(max_count, len(ingredients))
35
+ for ing in ingredients:
36
+ ingredient_set.add(ing)
37
+ if ing in ingredients_per_type['liquor']:
38
+ alcohol_set.add(ing)
39
+ if ing in ingredients_per_type['liqueur']:
40
+ liqueur_set.add(ing)
41
+ return max_count, ingredient_set, alcohol_set, liqueur_set
42
+
43
+ # Add your custom dataset class here
44
+ class MyDataset(Dataset):
45
+ def __init__(self, split, params):
46
+ data = params['raw_data']
47
+ self.dim_rep_ingredient = params['dim_rep_ingredient']
48
+ n_data = len(data["names"])
49
+
50
+ preparation_list = sorted(set(data['category']))
51
+ categories_list = sorted(set(data['subcategory']))
52
+ glasses_list = sorted(set(data['glass']))
53
+
54
+ max_ingredients, ingredient_set, liquor_set, liqueur_set = get_max_n_ingredients(data)
55
+ ingredient_set = sorted(ingredient_set)
56
+ self.ingredient_set = ingredient_set
57
+
58
+ ingredient_quantities = [] # output of our network
59
+ ingr_strs = np.array(data['ingredients_str'])
60
+ for i in range(n_data):
61
+ ingredients, quantities = extract_ingredients(ingr_strs[i])
62
+ # get ingredient presence and quantity
63
+ ingredient_q_rep = np.zeros([len(ingredient_set)])
64
+ for ing, q in zip(ingredients, quantities):
65
+ ingredient_q_rep[ingredient_set.index(ing)] = q
66
+ ingredient_quantities.append(ingredient_q_rep)
67
+
68
+ # take care of ingredient quantities (OUTPUTS)
69
+ ingredient_quantities = np.array(ingredient_quantities)
70
+ ingredients_presence = (ingredient_quantities>0).astype(np.int)
71
+
72
+ min_ing_quantities = np.min(ingredient_quantities, axis=0)
73
+ max_ing_quantities = np.max(ingredient_quantities, axis=0)
74
+ def normalize_ing_quantities(ing_quantities):
75
+ return ((ing_quantities - min_ing_quantities) / (max_ing_quantities - min_ing_quantities)).copy()
76
+
77
+ def denormalize_ing_quantities(normalized_ing_quantities):
78
+ return (normalized_ing_quantities * (max_ing_quantities - min_ing_quantities) + min_ing_quantities).copy()
79
+ ing_q_when_present = ingredient_quantities.copy()
80
+ for i in range(len(ing_q_when_present)):
81
+ ing_q_when_present[i, np.where(ing_q_when_present[i, :] == 0)] = np.nan
82
+ self.min_when_present_ing_quantities = np.nanmin(ing_q_when_present, axis=0)
83
+
84
+
85
+ def filter_decoder_output(output):
86
+ output_unnormalized = output * max_ing_quantities
87
+ if output.ndim == 1:
88
+ output_unnormalized[np.where(output_unnormalized<self.min_when_present_ing_quantities)] = 0
89
+ else:
90
+ for i in range(output.shape[0]):
91
+ output_unnormalized[i, np.where(output_unnormalized[i] < self.min_when_present_ing_quantities)] = 0
92
+ return output_unnormalized.copy()
93
+ self.filter_decoder_output = filter_decoder_output
94
+ # arg_mins = np.nanargmin(ing_q_when_present, axis=0)
95
+ #
96
+ # for ing, minq, argminq in zip(ingredient_set, self.min_when_present_ing_quantities, arg_mins):
97
+ # print(f'__\n{ing}: {minq}')
98
+ # print_recipe(ingr_strs[argminq])
99
+ # ingredients, quantities = extract_ingredients(ingr_strs[argminq])
100
+ # # get ingredient presence and quantity
101
+ # ingredient_q_rep = np.zeros([len(ingredient_set)])
102
+ # for ing, q in zip(ingredients, quantities):
103
+ # ingredient_q_rep[ingredient_set.index(ing)] = q
104
+ # print(np.array(data['urls'])[argminq])
105
+ # stop = 1
106
+
107
+ self.max_ing_quantities = max_ing_quantities
108
+ self.mean_ing_quantities = np.mean(ingredient_quantities, axis=0)
109
+ self.std_ing_quantities = np.std(ingredient_quantities, axis=0)
110
+ if split == 'train':
111
+ np.savetxt(params['save_path'] + 'min_when_present_ing_quantities.txt', self.min_when_present_ing_quantities)
112
+ np.savetxt(params['save_path'] + 'max_ing_quantities.txt', max_ing_quantities)
113
+ np.savetxt(params['save_path'] + 'mean_ing_quantities.txt', self.mean_ing_quantities)
114
+ np.savetxt(params['save_path'] + 'std_ing_quantities.txt', self.std_ing_quantities)
115
+
116
+ # print(ingredient_quantities[0])
117
+ # ingredient_quantities = (ingredient_quantities - self.mean_ing_quantities) / self.std_ing_quantities
118
+ # print(ingredient_quantities[0])
119
+ # print(ingredient_quantities[0] * self.std_ing_quantities + self.mean_ing_quantities )
120
+ ingredient_quantities = ingredient_quantities / max_ing_quantities#= normalize_ing_quantities(ingredient_quantities)
121
+
122
+
123
+
124
+
125
+ max_q_per_ing = dict(zip(ingredient_set, max_ing_quantities))
126
+ # print(ingredient_quantities[0])
127
+ #########
128
+ # Process input representation_analysis: list of ingredient representation_analysis
129
+ #########
130
+ input_data = [] # input of ingredient encoders
131
+ all_ing_reps = []
132
+ for i in range(n_data):
133
+ ingredients, quantities = extract_ingredients(ingr_strs[i])
134
+ # get ingredient presence and quantity
135
+ ingredient_q_rep = np.zeros([len(ingredient_set)])
136
+ for ing, q in zip(ingredients, quantities):
137
+ ingredient_q_rep[ingredient_set.index(ing)] = q
138
+ # get main liquor
139
+ cocktail_rep = []
140
+ for j in range(len(ingredients)):
141
+ cocktail_rep.append(get_representation_from_ingredient(ingredients, quantities, max_q_per_ing, index=j, params=params)[0])
142
+ all_ing_reps.append(cocktail_rep[-1].copy())
143
+ input_data.append(cocktail_rep)
144
+
145
+
146
+ all_ing_reps = np.array(all_ing_reps)
147
+ min_ing_reps = np.min(all_ing_reps[:, params['indexes_ing_to_normalize']], axis=0)
148
+ max_ing_reps = np.max(all_ing_reps[:, params['indexes_ing_to_normalize']], axis=0)
149
+
150
+ def normalize_ing_reps(ing_reps):
151
+ if ing_reps.ndim == 1:
152
+ ing_reps = ing_reps.reshape(1, -1)
153
+ out = ing_reps.copy()
154
+ out[:, params['indexes_ing_to_normalize']] = (out[:, params['indexes_ing_to_normalize']] - min_ing_reps) / (max_ing_reps - min_ing_reps)
155
+ return out
156
+
157
+ def denormalize_ing_reps(normalized_ing_reps):
158
+ if normalized_ing_reps.ndim == 1:
159
+ normalized_ing_reps = normalized_ing_reps.reshape(1, -1)
160
+ out = normalized_ing_reps.copy()
161
+ out[:, params['indexes_ing_to_normalize']] = out[:, params['indexes_ing_to_normalize']] * (max_ing_reps - min_ing_reps) + min_ing_reps
162
+ return out
163
+
164
+
165
+ # put everything in a big matrix
166
+ dim_cocktail_rep = max_ingredients * self.dim_rep_ingredient
167
+ input_data2 = []
168
+ nb_ingredients = []
169
+ for d in input_data:
170
+ cocktail_rep = np.zeros([dim_cocktail_rep])
171
+ cocktail_rep.fill(np.nan)
172
+ index = 0
173
+ nb_ingredients.append(len(d))
174
+ for dj in d:
175
+ cocktail_rep[index:index + self.dim_rep_ingredient] = normalize_ing_reps(dj)
176
+ index += self.dim_rep_ingredient
177
+ input_data2.append(cocktail_rep)
178
+ input_data = np.array(input_data2)
179
+ nb_ingredients = np.array(nb_ingredients)
180
+
181
+
182
+
183
+
184
+
185
+ # let us now extract various possible output we might want to predict:
186
+ #########
187
+ # Process output cocktail representation_analysis (computed from ingredient reps)
188
+ #########
189
+ # quantities_indexes = np.arange(20, 456, 57)
190
+ # qs = input_data[0, quantities_indexes]
191
+ # ingredient_quantities[0]
192
+ # get final volume
193
+ volumes = np.array(params['raw_data']['end volume'])
194
+
195
+ min_vol = volumes.min()
196
+ max_vol = volumes.max()
197
+ def normalize_vol(volume):
198
+ return (volume - min_vol) / (max_vol - min_vol)
199
+
200
+ def denormalize_vol(normalized_vol):
201
+ return normalized_vol * (max_vol - min_vol) + min_vol
202
+
203
+ volumes = normalize_vol(volumes)
204
+
205
+
206
+ # computed cocktail representation
207
+ computed_cocktail_reps = params['cocktail_reps']
208
+ self.dim_rep = computed_cocktail_reps.shape[1]
209
+
210
+ #########
211
+ # Process output sub categories
212
+ #########
213
+ categories = np.array([categories_list.index(sc) for sc in data['subcategory']])
214
+ counts = dict(zip(categories_list, [0] * len(categories)))
215
+ for c in data['subcategory']:
216
+ counts[c] += 1
217
+ for k in counts.keys():
218
+ counts[k] /= len(data['subcategory'])
219
+ self.categories = categories_list
220
+ self.categories_weights = []
221
+ for c in self.categories:
222
+ self.categories_weights.append(1/len(self.categories)/counts[c])
223
+ print(counts)
224
+
225
+ #########
226
+ # Process output glass type
227
+ #########
228
+ glasses = np.array([glasses_list.index(sc) for sc in data['glass']])
229
+ counts = dict(zip(glasses_list, [0] * len(set(data['glass']))))
230
+ for c in data['glass']:
231
+ counts[c] += 1
232
+ for k in counts.keys():
233
+ counts[k] /= len(data['glass'])
234
+ self.glasses = glasses_list
235
+ self.glasses_weights = []
236
+ for c in self.glasses:
237
+ self.glasses_weights.append(1 / len(self.glasses) / counts[c])
238
+ print(counts)
239
+
240
+ #########
241
+ # Process output preparation type
242
+ #########
243
+ prep_type = np.array([preparation_list.index(sc) for sc in data['category']])
244
+ counts = dict(zip(preparation_list, [0] * len(preparation_list)))
245
+ for c in data['category']:
246
+ counts[c] += 1
247
+ for k in counts.keys():
248
+ counts[k] /= len(data['category'])
249
+ self.prep_types = preparation_list
250
+ self.prep_types_weights = []
251
+ for c in self.prep_types:
252
+ self.prep_types_weights.append(1 / len(self.prep_types) / counts[c])
253
+ print(counts)
254
+
255
+ taste_reps = list(data['taste_rep'])
256
+ taste_rep_ground_truth = []
257
+ taste_rep_valid = []
258
+ for tr in taste_reps:
259
+ if len(tr) > 2:
260
+ taste_rep_valid.append(True)
261
+ taste_rep_ground_truth.append([float(tr.split('[')[1].split(',')[0]), float(tr.split(']')[0].split(',')[1][1:])])
262
+ else:
263
+ taste_rep_valid.append(False)
264
+ taste_rep_ground_truth.append([np.nan, np.nan])
265
+ taste_rep_ground_truth = np.array(taste_rep_ground_truth)
266
+ taste_rep_valid = np.array(taste_rep_valid)
267
+ taste_rep_ground_truth /= 10
268
+
269
+ auxiliary_data = dict(categories=categories,
270
+ glasses=glasses,
271
+ prep_type=prep_type,
272
+ cocktail_reps=computed_cocktail_reps,
273
+ ingredients_presence=ingredients_presence,
274
+ taste_reps=taste_rep_ground_truth,
275
+ volume=volumes,
276
+ ingredients_quantities=ingredient_quantities)
277
+ self.auxiliary_keys = sorted(params['auxiliaries_dict'].keys())
278
+ assert self.auxiliary_keys == sorted(auxiliary_data.keys())
279
+
280
+ data_preprocessing = dict(min_max_ing_quantities=(min_ing_quantities, max_ing_quantities),
281
+ min_max_ing_reps=(min_ing_reps, max_ing_reps),
282
+ min_max_vol=(min_vol, max_vol))
283
+
284
+ if split == 'train':
285
+ with open(params['save_path'] + 'normalization_funcs.pickle', 'wb') as f:
286
+ pickle.dump(data_preprocessing, f)
287
+
288
+ n_data = len(input_data)
289
+ assert len(ingredient_quantities) == n_data
290
+ for aux in self.auxiliary_keys:
291
+ assert len(auxiliary_data[aux]) == n_data
292
+
293
+ if split == 'train':
294
+ indexes = np.arange(int(0.9 * n_data))
295
+ elif split == 'test':
296
+ indexes = np.arange(int(0.9 * n_data), n_data)
297
+ elif split == 'all':
298
+ indexes = np.arange(n_data)
299
+ else:
300
+ raise ValueError
301
+
302
+ # np.random.shuffle(indexes)
303
+ self.taste_rep_valid = taste_rep_valid[indexes]
304
+ self.input_ingredients = input_data[indexes]
305
+ self.ingredient_quantities = ingredient_quantities[indexes]
306
+ self.computed_cocktail_reps = computed_cocktail_reps[indexes]
307
+ self.auxiliaries = dict()
308
+ for aux in self.auxiliary_keys:
309
+ self.auxiliaries[aux] = auxiliary_data[aux][indexes]
310
+ self.nb_ingredients = nb_ingredients[indexes]
311
+
312
+ def __len__(self):
313
+ return len(self.input_ingredients)
314
+
315
+ def get_auxiliary_data(self, idx):
316
+ out = dict()
317
+ for aux in self.auxiliary_keys:
318
+ out[aux] = self.auxiliaries[aux][idx]
319
+ return out
320
+
321
+ def __getitem__(self, idx):
322
+ assert self.nb_ingredients[idx] == np.argwhere(~np.isnan(self.input_ingredients[idx])).flatten().size / self.dim_rep_ingredient
323
+ return [self.nb_ingredients[idx], self.input_ingredients[idx], self.ingredient_quantities[idx], self.computed_cocktail_reps[idx], self.get_auxiliary_data(idx),
324
+ self.taste_rep_valid[idx]]
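For reference, a minimal usage sketch of this dataset, assuming `params` was built by `get_params()` in run.py below (which fills in 'raw_data', 'cocktail_reps' and the other fields the dataset reads):

    from torch.utils.data import DataLoader

    train_data = MyDataset(split='train', params=params)
    loader = DataLoader(train_data, batch_size=64, shuffle=True)
    # a batch mirrors __getitem__: ingredient count, ingredient reps, quantities,
    # computed cocktail rep, auxiliary-target dict, taste-validity mask
    nb_ing, ing_reps, ing_qs, c_reps, auxiliaries, taste_valid = next(iter(loader))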
src/cocktails/representation_learning/multihead_model.py ADDED
@@ -0,0 +1,148 @@
1
+ import torch; torch.manual_seed(0)
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torch.utils
5
+ import torch.distributions
6
+ import matplotlib.pyplot as plt; plt.rcParams['figure.dpi'] = 200
7
+ from src.cocktails.representation_learning.simple_model import SimpleNet
8
+
9
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
10
+
11
+ def get_activation(activation):
12
+ if activation == 'tanh':
13
+ activ = F.tanh
14
+ elif activation == 'relu':
15
+ activ = F.relu
16
+ elif activation == 'mish':
17
+ activ = F.mish
18
+ elif activation == 'sigmoid':
19
+ activ = F.sigmoid
20
+ elif activation == 'leakyrelu':
21
+ activ = F.leaky_relu
22
+ elif activation == 'exp':
23
+ activ = torch.exp
24
+ else:
25
+ raise ValueError
26
+ return activ
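+ # e.g. get_activation('relu')(torch.tensor([-1.0, 2.0])) -> tensor([0., 2.])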
27
+
28
+ class IngredientEncoder(nn.Module):
29
+ def __init__(self, input_dim, deepset_latent_dim, hidden_dims, activation, dropout):
30
+ super(IngredientEncoder, self).__init__()
31
+ self.linears = nn.ModuleList()
32
+ self.dropouts = nn.ModuleList()
33
+ dims = [input_dim] + hidden_dims + [deepset_latent_dim]
34
+ for d_in, d_out in zip(dims[:-1], dims[1:]):
35
+ self.linears.append(nn.Linear(d_in, d_out))
36
+ self.dropouts.append(nn.Dropout(dropout))
37
+ self.activation = get_activation(activation)
38
+ self.n_layers = len(self.linears)
39
+ self.layer_range = range(self.n_layers)
40
+
41
+ def forward(self, x):
42
+ for i_layer, layer, dropout in zip(self.layer_range, self.linears, self.dropouts):
43
+ x = layer(x)
44
+ if i_layer != self.n_layers - 1:
45
+ x = self.activation(dropout(x))
46
+ return x # no activation or dropout applied after the last layer
47
+
48
+ class DeepsetCocktailEncoder(nn.Module):
49
+ def __init__(self, input_dim, deepset_latent_dim, hidden_dims_ing, activation,
50
+ hidden_dims_cocktail, latent_dim, aggregation, dropout):
51
+ super(DeepsetCocktailEncoder, self).__init__()
52
+ self.input_dim = input_dim # dimension of ingredient representation + quantity
53
+ self.ingredient_encoder = IngredientEncoder(input_dim, deepset_latent_dim, hidden_dims_ing, activation, dropout) # encode each ingredient separately
54
+ self.deepset_latent_dim = deepset_latent_dim # dimension of the deepset aggregation
55
+ self.aggregation = aggregation
56
+ self.latent_dim = latent_dim
57
+ # post aggregation network
58
+ self.linears = nn.ModuleList()
59
+ self.dropouts = nn.ModuleList()
60
+ dims = [deepset_latent_dim] + hidden_dims_cocktail
61
+ for d_in, d_out in zip(dims[:-1], dims[1:]):
62
+ self.linears.append(nn.Linear(d_in, d_out))
63
+ self.dropouts.append(nn.Dropout(dropout))
64
+ self.FC_mean = nn.Linear(hidden_dims_cocktail[-1], latent_dim)
65
+ self.FC_logvar = nn.Linear(hidden_dims_cocktail[-1], latent_dim)
66
+ self.softplus = nn.Softplus()
67
+
68
+ self.activation = get_activation(activation)
69
+ self.n_layers = len(self.linears)
70
+ self.layer_range = range(self.n_layers)
71
+
72
+ def forward(self, nb_ingredients, x):
73
+
74
+ # reshape x in (batch size * nb ingredients, dim_ing_rep)
75
+ batch_size = x.shape[0]
76
+ all_ingredients = []
77
+ for i in range(batch_size):
78
+ for j in range(nb_ingredients[i]):
79
+ all_ingredients.append(x[i, self.input_dim * j: self.input_dim * (j + 1)].reshape(1, -1))
80
+ x = torch.cat(all_ingredients, dim=0)
81
+ # encode ingredients in parallel
82
+ ingredients_encodings = self.ingredient_encoder(x)
83
+ assert ingredients_encodings.shape == (torch.sum(nb_ingredients), self.deepset_latent_dim)
84
+
85
+ # aggregate
86
+ x = []
87
+ index_first = 0
88
+ for i in range(batch_size):
89
+ index_last = index_first + nb_ingredients[i]
90
+ # aggregate
91
+ if self.aggregation == 'sum':
92
+ x.append(torch.sum(ingredients_encodings[index_first:index_last], dim=0).reshape(1, -1))
93
+ elif self.aggregation == 'mean':
94
+ x.append(torch.mean(ingredients_encodings[index_first:index_last], dim=0).reshape(1, -1))
95
+ else:
96
+ raise ValueError
97
+ index_first = index_last
98
+ x = torch.cat(x, dim=0)
99
+ assert x.shape[0] == batch_size
100
+
101
+ for i_layer, layer, dropout in zip(self.layer_range, self.linears, self.dropouts):
102
+ x = self.activation(dropout(layer(x)))
103
+ mean = self.FC_mean(x)
104
+ logvar = self.FC_logvar(x)
105
+ return mean, logvar
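+ # The encoder returns the mean and log-variance of a diagonal Gaussian
+ # posterior; the VAE (in vae_model.py, not shown here) presumably samples the
+ # latent via the reparameterization trick, sketched as:
+ # std = torch.exp(0.5 * logvar); eps = torch.randn_like(std); z = mean + eps * std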
106
+
107
+
108
+ class MultiHeadModel(nn.Module):
109
+ def __init__(self, encoder, auxiliaries_dict, activation, hidden_dims_decoder):
110
+ super(MultiHeadModel, self).__init__()
111
+ self.encoder = encoder
112
+ self.latent_dim = self.encoder.output_dim
113
+ self.auxiliaries_str = []
114
+ self.auxiliaries = nn.ModuleList()
115
+ for aux_str in sorted(auxiliaries_dict.keys()):
116
+ if aux_str == 'taste_reps':
117
+ self.taste_reps_decoder = SimpleNet(input_dim=self.latent_dim, hidden_dims=[], output_dim=auxiliaries_dict[aux_str]['dim_output'],
118
+ activation=activation, dropout=0.0, final_activ=auxiliaries_dict[aux_str]['final_activ'])
119
+ else:
120
+ self.auxiliaries_str.append(aux_str)
121
+ if aux_str == 'ingredients_quantities':
122
+ hd = hidden_dims_decoder
123
+ else:
124
+ hd = []
125
+ self.auxiliaries.append(SimpleNet(input_dim=self.latent_dim, hidden_dims=hd, output_dim=auxiliaries_dict[aux_str]['dim_output'],
126
+ activation=activation, dropout=0.0, final_activ=auxiliaries_dict[aux_str]['final_activ']))
127
+
128
+ def get_all_auxiliaries(self, x):
129
+ return [aux(x) for aux in self.auxiliaries]
130
+
131
+ def get_auxiliary(self, z, aux_str):
132
+ if aux_str == 'taste_reps':
133
+ return self.taste_reps_decoder(z)
134
+ else:
135
+ index = self.auxiliaries_str.index(aux_str)
136
+ return self.auxiliaries[index](z)
137
+
138
+ def forward(self, x, aux_str=None):
139
+ z = self.encoder(x)
140
+ if aux_str is not None:
141
+ return z, self.get_auxiliary(z, aux_str), [aux_str]
142
+ else:
143
+ return z, self.get_all_auxiliaries(z), self.auxiliaries_str
144
+
145
+ def get_multihead_model(input_dim, activation, hidden_dims_cocktail, latent_dim, dropout, auxiliaries_dict, hidden_dims_decoder):
146
+ encoder = SimpleNet(input_dim, hidden_dims_cocktail, latent_dim, activation, dropout)
147
+ model = MultiHeadModel(encoder, auxiliaries_dict, activation, hidden_dims_decoder)
148
+ return model
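A minimal construction sketch (hypothetical dimensions; the real values come from `get_params()` in run_without_vae.py, and `SimpleNet` is assumed to expose the `output_dim` attribute read above):

    import torch

    aux = dict(volume=dict(weight=1, type='regression', final_activ='relu', dim_output=1))
    model = get_multihead_model(input_dim=50, activation='relu', hidden_dims_cocktail=[64],
                                latent_dim=10, dropout=0.0, auxiliaries_dict=aux,
                                hidden_dims_decoder=[32])
    z, outputs, names = model(torch.randn(8, 50))  # z: (8, 10), names == ['volume']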
src/cocktails/representation_learning/run.py ADDED
@@ -0,0 +1,557 @@
1
+ import torch; torch.manual_seed(0)
2
+ import torch.utils
3
+ from torch.utils.data import DataLoader
4
+ import torch.distributions
5
+ import torch.nn as nn
6
+ import matplotlib.pyplot as plt; plt.rcParams['figure.dpi'] = 200
7
+ from src.cocktails.representation_learning.dataset import MyDataset, get_representation_from_ingredient, get_max_n_ingredients
8
+ import json
9
+ import pandas as pd
10
+ import numpy as np
11
+ import os
12
+ from src.cocktails.representation_learning.vae_model import get_vae_model
13
+ from src.cocktails.config import COCKTAILS_CSV_DATA, FULL_COCKTAIL_REP_PATH, EXPERIMENT_PATH
14
+ from src.cocktails.utilities.cocktail_utilities import get_bunch_of_rep_keys
15
+ from src.cocktails.utilities.ingredients_utilities import ingredient_profiles
16
+ from resource import getrusage
17
+ from resource import RUSAGE_SELF
18
+ import gc
19
+ gc.collect(2)
20
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
21
+
22
+ def get_params():
23
+ data = pd.read_csv(COCKTAILS_CSV_DATA)
24
+ max_ingredients, ingredient_set, liquor_set, liqueur_set = get_max_n_ingredients(data)
25
+ num_ingredients = len(ingredient_set)
26
+ rep_keys = get_bunch_of_rep_keys()['custom']
27
+ ing_keys = [k.split(' ')[1] for k in rep_keys]
28
+ ing_keys.remove('volume')
29
+ nb_ing_categories = len(set(ingredient_profiles['type']))
30
+ category_encodings = dict(zip(sorted(set(ingredient_profiles['type'])), np.eye(nb_ing_categories)))
31
+
32
+ params = dict(trial_id='test',
33
+ save_path=EXPERIMENT_PATH + "/deepset_vae/",
34
+ nb_epochs=2000,
35
+ print_every=50,
36
+ plot_every=100,
37
+ batch_size=64,
38
+ lr=0.001,
39
+ dropout=0.,
40
+ nb_epoch_switch_beta=600,
41
+ latent_dim=10,
42
+ beta_vae=0.2,
43
+ ing_keys=ing_keys,
44
+ nb_ingredients=len(ingredient_set),
45
+ hidden_dims_ingredients=[128],
46
+ hidden_dims_cocktail=[32],
47
+ hidden_dims_decoder=[32],
48
+ agg='mean',
49
+ activation='relu',
50
+ auxiliaries_dict=dict(categories=dict(weight=0, type='classif', final_activ=None, dim_output=len(set(data['subcategory']))),
51
+ glasses=dict(weight=0, type='classif', final_activ=None, dim_output=len(set(data['glass']))),
52
+ prep_type=dict(weight=0, type='classif', final_activ=None, dim_output=len(set(data['category']))),
53
+ cocktail_reps=dict(weight=0, type='regression', final_activ=None, dim_output=13),
54
+ volume=dict(weight=0, type='regression', final_activ='relu', dim_output=1),
55
+ taste_reps=dict(weight=0, type='regression', final_activ='relu', dim_output=2),
56
+ ingredients_presence=dict(weight=0, type='multiclassif', final_activ=None, dim_output=num_ingredients)),
57
+ category_encodings=category_encodings
58
+ )
59
+ # params = dict(trial_id='test',
60
+ # save_path=EXPERIMENT_PATH + "/deepset_vae/",
61
+ # nb_epochs=1000,
62
+ # print_every=50,
63
+ # plot_every=100,
64
+ # batch_size=64,
65
+ # lr=0.001,
66
+ # dropout=0.,
67
+ # nb_epoch_switch_beta=500,
68
+ # latent_dim=64,
69
+ # beta_vae=0.3,
70
+ # ing_keys=ing_keys,
71
+ # nb_ingredients=len(ingredient_set),
72
+ # hidden_dims_ingredients=[128],
73
+ # hidden_dims_cocktail=[128, 128],
74
+ # hidden_dims_decoder=[128, 128],
75
+ # agg='mean',
76
+ # activation='mish',
77
+ # auxiliaries_dict=dict(categories=dict(weight=0.5, type='classif', final_activ=None, dim_output=len(set(data['subcategory']))),
78
+ # glasses=dict(weight=0.03, type='classif', final_activ=None, dim_output=len(set(data['glass']))),
79
+ # prep_type=dict(weight=0.02, type='classif', final_activ=None, dim_output=len(set(data['category']))),
80
+ # cocktail_reps=dict(weight=1, type='regression', final_activ=None, dim_output=13),
81
+ # volume=dict(weight=1, type='regression', final_activ='relu', dim_output=1),
82
+ # taste_reps=dict(weight=1, type='regression', final_activ='relu', dim_output=2),
83
+ # ingredients_presence=dict(weight=1.5, type='multiclassif', final_activ=None, dim_output=num_ingredients)),
84
+ # category_encodings=category_encodings
85
+ # )
86
+ water_rep, indexes_to_normalize = get_representation_from_ingredient(ingredients=['water'], quantities=[1],
87
+ max_q_per_ing=dict(zip(ingredient_set, [1] * num_ingredients)), index=0,
88
+ params=params)
89
+ dim_rep_ingredient = water_rep.size
90
+ params['indexes_ing_to_normalize'] = indexes_to_normalize
91
+ params['deepset_latent_dim'] = dim_rep_ingredient * max_ingredients
92
+ params['input_dim'] = dim_rep_ingredient
93
+ params['dim_rep_ingredient'] = dim_rep_ingredient
94
+ params = compute_expe_name_and_save_path(params)
95
+ del params['category_encodings'] # not JSON-serializable; re-added by complete_params
96
+ with open(params['save_path'] + 'params.json', 'w') as f:
97
+ json.dump(params, f)
98
+
99
+ params = complete_params(params)
100
+ return params
101
+
102
+ def complete_params(params):
103
+ data = pd.read_csv(COCKTAILS_CSV_DATA)
104
+ cocktail_reps = np.loadtxt(FULL_COCKTAIL_REP_PATH)
105
+ nb_ing_categories = len(set(ingredient_profiles['type']))
106
+ category_encodings = dict(zip(sorted(set(ingredient_profiles['type'])), np.eye(nb_ing_categories)))
107
+ params['cocktail_reps'] = cocktail_reps
108
+ params['raw_data'] = data
109
+ params['category_encodings'] = category_encodings
110
+ return params
111
+
112
+ def compute_losses_and_accuracies(loss_functions, auxiliaries, auxiliaries_str, outputs, data):
113
+ losses = dict()
114
+ accuracies = dict()
115
+ other_metrics = dict()
116
+ for i_k, k in enumerate(auxiliaries_str):
117
+ # get ground truth
118
+ # compute loss
119
+ if k == 'volume':
120
+ outputs[i_k] = outputs[i_k].flatten()
121
+ ground_truth = auxiliaries[k]
122
+ if ground_truth.dtype == torch.float64:
123
+ losses[k] = loss_functions[k](outputs[i_k], ground_truth.float()).float()
124
+ elif ground_truth.dtype == torch.int64:
125
+ if str(loss_functions[k]) != "BCEWithLogitsLoss()":
126
+ losses[k] = loss_functions[k](outputs[i_k].float(), ground_truth.long()).float()
127
+ else:
128
+ losses[k] = loss_functions[k](outputs[i_k].float(), ground_truth.float()).float()
129
+ else:
130
+ losses[k] = loss_functions[k](outputs[i_k], ground_truth).float()
131
+ # compute accuracies
132
+ if str(loss_functions[k]) == 'CrossEntropyLoss()':
133
+ bs, n_options = outputs[i_k].shape
134
+ predicted = outputs[i_k].argmax(dim=1).detach().numpy()
135
+ true = ground_truth.int().detach().numpy()
136
+ confusion_matrix = np.zeros([n_options, n_options])
137
+ for i in range(bs):
138
+ confusion_matrix[true[i], predicted[i]] += 1
139
+ acc = confusion_matrix.diagonal().sum() / bs
140
+ for i in range(n_options):
141
+ if confusion_matrix[i].sum() != 0:
142
+ confusion_matrix[i] /= confusion_matrix[i].sum()
143
+ other_metrics[k + '_confusion'] = confusion_matrix
144
+ accuracies[k] = np.mean(outputs[i_k].argmax(dim=1).detach().numpy() == ground_truth.int().detach().numpy())
145
+ assert (acc - accuracies[k]) < 1e-5
146
+
147
+ elif str(loss_functions[k]) == 'BCEWithLogitsLoss()':
148
+ assert k == 'ingredients_presence'
149
+ outputs_rescaled = outputs[i_k].detach().numpy() * data.dataset.std_ing_quantities + data.dataset.mean_ing_quantities
150
+ predicted_presence = (outputs_rescaled > 0).astype(bool)
151
+ presence = ground_truth.detach().numpy().astype(bool)
152
+ other_metrics[k + '_false_positive'] = np.mean(np.logical_and(predicted_presence.astype(bool), ~presence.astype(bool)))
153
+ other_metrics[k + '_false_negative'] = np.mean(np.logical_and(~predicted_presence.astype(bool), presence.astype(bool)))
154
+ accuracies[k] = np.mean(predicted_presence == presence) # accuracy for multi class labeling
155
+ elif str(loss_functions[k]) == 'MSELoss()':
156
+ accuracies[k] = np.nan
157
+ else:
158
+ raise ValueError
159
+ return losses, accuracies, other_metrics
160
+
161
+ def compute_metric_output(aux_other_metrics, data, ingredient_quantities, x_hat):
162
+ ing_q = ingredient_quantities.detach().numpy() * data.dataset.std_ing_quantities + data.dataset.mean_ing_quantities
163
+ ing_presence = (ing_q > 0)
164
+ x_hat = x_hat.detach().numpy() * data.dataset.std_ing_quantities + data.dataset.mean_ing_quantities
165
+ # abs_diff = np.abs(ing_q - x_hat) * data.dataset.max_ing_quantities
166
+ abs_diff = np.abs(ing_q - x_hat)
167
+ ing_q_abs_loss_when_present, ing_q_abs_loss_when_absent = [], []
168
+ for i in range(ingredient_quantities.shape[0]):
169
+ ing_q_abs_loss_when_present.append(np.mean(abs_diff[i, np.where(ing_presence[i])]))
170
+ ing_q_abs_loss_when_absent.append(np.mean(abs_diff[i, np.where(~ing_presence[i])]))
171
+ aux_other_metrics['ing_q_abs_loss_when_present'] = np.mean(ing_q_abs_loss_when_present)
172
+ aux_other_metrics['ing_q_abs_loss_when_absent'] = np.mean(ing_q_abs_loss_when_absent)
173
+ return aux_other_metrics
174
+
175
+ def run_epoch(opt, train, model, data, loss_functions, weights, params):
176
+ if train:
177
+ model.train()
178
+ else:
179
+ model.eval()
180
+
181
+ # prepare logging of losses
182
+ losses = dict(kld_loss=[],
183
+ mse_loss=[],
184
+ vae_loss=[],
185
+ volume_loss=[],
186
+ global_loss=[])
187
+ accuracies = dict()
188
+ other_metrics = dict()
189
+ for aux in params['auxiliaries_dict'].keys():
190
+ losses[aux] = []
191
+ accuracies[aux] = []
192
+ if train: opt.zero_grad()
193
+
194
+ for d in data:
195
+ nb_ingredients = d[0]
196
+ batch_size = nb_ingredients.shape[0]
197
+ x_ingredients = d[1].float()
198
+ ingredient_quantities = d[2]
199
+ cocktail_reps = d[3]
200
+ auxiliaries = d[4]
201
+ for k in auxiliaries.keys():
202
+ if auxiliaries[k].dtype == torch.float64: auxiliaries[k] = auxiliaries[k].float()
203
+ taste_valid = d[-1]
204
+ x = x_ingredients.to(device)
205
+ x_hat, z, mean, log_var, outputs, auxiliaries_str = model.forward_direct(ingredient_quantities.float())
206
+ # get auxiliary losses and accuracies
207
+ aux_losses, aux_accuracies, aux_other_metrics = compute_losses_and_accuracies(loss_functions, auxiliaries, auxiliaries_str, outputs, data)
208
+
209
+ # compute vae loss
210
+ mse_loss = ((ingredient_quantities - x_hat) ** 2).mean().float()
211
+ kld_loss = torch.mean(-0.5 * torch.sum(1 + log_var - mean ** 2 - log_var.exp(), dim=1)).float()
212
+ vae_loss = mse_loss + params['beta_vae'] * (params['latent_dim'] / params['nb_ingredients']) * kld_loss
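+ # kld_loss is the closed-form KL between the posterior N(mean, exp(log_var))
+ # and the standard normal prior: -0.5 * sum(1 + log_var - mean^2 - exp(log_var)),
+ # averaged over the batch. The beta weight is rescaled by latent_dim /
+ # nb_ingredients, presumably to keep the KL and reconstruction terms comparable.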
213
+ # compute total volume loss to train decoder
214
+ # volume_loss = ((ingredient_quantities.sum(dim=1) - x_hat.sum(dim=1)) ** 2).mean().float()
215
+ volume_loss = torch.FloatTensor([0])
216
+
217
+ aux_other_metrics = compute_metric_output(aux_other_metrics, data, ingredient_quantities, x_hat)
218
+
219
+ indexes_taste_valid = np.argwhere(taste_valid.detach().numpy()).flatten()
220
+ if indexes_taste_valid.size > 0:
221
+ outputs_taste = model.get_auxiliary(z[indexes_taste_valid], aux_str='taste_reps')
222
+ gt = auxiliaries['taste_reps'][indexes_taste_valid]
223
+ factor_loss = indexes_taste_valid.size / (0.3 * batch_size) # loss scaling: equals 1 when the batch matches the dataset's ~30% ratio of taste-labeled samples, lower with fewer, higher with more
224
+ aux_losses['taste_reps'] = (loss_functions['taste_reps'](outputs_taste, gt) * factor_loss).float()
225
+ else:
226
+ aux_losses['taste_reps'] = torch.FloatTensor([0]).reshape([])
227
+ aux_accuracies['taste_reps'] = 0
228
+
229
+ # aggregate losses
230
+ global_loss = torch.sum(torch.cat([torch.atleast_1d(vae_loss), torch.atleast_1d(volume_loss)] + [torch.atleast_1d(aux_losses[k] * weights[k]) for k in params['auxiliaries_dict'].keys()]))
231
+ # for k in params['auxiliaries_dict'].keys():
232
+ # global_loss += aux_losses[k] * weights[k]
233
+
234
+ if train:
235
+ global_loss.backward()
236
+ opt.step()
237
+ opt.zero_grad()
238
+
239
+ # logging
240
+ losses['global_loss'].append(float(global_loss))
241
+ losses['mse_loss'].append(float(mse_loss))
242
+ losses['vae_loss'].append(float(vae_loss))
243
+ losses['volume_loss'].append(float(volume_loss))
244
+ losses['kld_loss'].append(float(kld_loss))
245
+ for k in params['auxiliaries_dict'].keys():
246
+ losses[k].append(float(aux_losses[k]))
247
+ accuracies[k].append(float(aux_accuracies[k]))
248
+ for k in aux_other_metrics.keys():
249
+ if k not in other_metrics.keys():
250
+ other_metrics[k] = [aux_other_metrics[k]]
251
+ else:
252
+ other_metrics[k].append(aux_other_metrics[k])
253
+
254
+ for k in losses.keys():
255
+ losses[k] = np.mean(losses[k])
256
+ for k in accuracies.keys():
257
+ accuracies[k] = np.mean(accuracies[k])
258
+ for k in other_metrics.keys():
259
+ other_metrics[k] = np.mean(other_metrics[k], axis=0)
260
+ return model, losses, accuracies, other_metrics
261
+
262
+ def prepare_data_and_loss(params):
263
+ train_data = MyDataset(split='train', params=params)
264
+ test_data = MyDataset(split='test', params=params)
265
+
266
+ train_data_loader = DataLoader(train_data, batch_size=params['batch_size'], shuffle=True)
267
+ test_data_loader = DataLoader(test_data, batch_size=params['batch_size'], shuffle=True)
268
+
269
+ loss_functions = dict()
270
+ weights = dict()
271
+ for k in sorted(params['auxiliaries_dict'].keys()):
272
+ if params['auxiliaries_dict'][k]['type'] == 'classif':
273
+ if k == 'glasses':
274
+ classif_weights = train_data.glasses_weights
275
+ elif k == 'prep_type':
276
+ classif_weights = train_data.prep_types_weights
277
+ elif k == 'categories':
278
+ classif_weights = train_data.categories_weights
279
+ else:
280
+ raise ValueError
281
+ loss_functions[k] = nn.CrossEntropyLoss(torch.FloatTensor(classif_weights))
282
+ elif params['auxiliaries_dict'][k]['type'] == 'multiclassif':
283
+ loss_functions[k] = nn.BCEWithLogitsLoss()
284
+ elif params['auxiliaries_dict'][k]['type'] == 'regression':
285
+ loss_functions[k] = nn.MSELoss()
286
+ else:
287
+ raise ValueError
288
+ weights[k] = params['auxiliaries_dict'][k]['weight']
289
+
290
+
291
+ return loss_functions, train_data_loader, test_data_loader, weights
292
+
293
+ def print_losses(train, losses, accuracies, other_metrics):
294
+ keyword = 'Train' if train else 'Eval'
295
+ print(f'\t{keyword} logs:')
296
+ keys = ['global_loss', 'vae_loss', 'mse_loss', 'kld_loss', 'volume_loss']
297
+ for k in keys:
298
+ print(f'\t\t{k} - Loss: {losses[k]:.2f}')
299
+ for k in sorted(accuracies.keys()):
300
+ print(f'\t\t{k} (aux) - Loss: {losses[k]:.2f}, Acc: {accuracies[k]:.2f}')
301
+ for k in sorted(other_metrics.keys()):
302
+ if 'confusion' not in k:
303
+ print(f'\t\t{k} - {other_metrics[k]:.2f}')
304
+
305
+
306
+ def run_experiment(params, verbose=True):
307
+ loss_functions, train_data_loader, test_data_loader, weights = prepare_data_and_loss(params)
308
+ params['filter_decoder_output'] = train_data_loader.dataset.filter_decoder_output
309
+
310
+ model_params = [params[k] for k in ["input_dim", "deepset_latent_dim", "hidden_dims_ingredients", "activation",
311
+ "hidden_dims_cocktail", "hidden_dims_decoder", "nb_ingredients", "latent_dim", "agg", "dropout", "auxiliaries_dict",
312
+ "filter_decoder_output"]]
313
+ model = get_vae_model(*model_params)
314
+ opt = torch.optim.AdamW(model.parameters(), lr=params['lr'])
315
+
316
+
317
+ all_train_losses = []
318
+ all_eval_losses = []
319
+ all_train_accuracies = []
320
+ all_eval_accuracies = []
321
+ all_eval_other_metrics = []
322
+ all_train_other_metrics = []
323
+ best_loss = np.inf
324
+ model, eval_losses, eval_accuracies, eval_other_metrics = run_epoch(opt=opt, train=False, model=model, data=test_data_loader, loss_functions=loss_functions,
325
+ weights=weights, params=params)
326
+ all_eval_losses.append(eval_losses)
327
+ all_eval_accuracies.append(eval_accuracies)
328
+ all_eval_other_metrics.append(eval_other_metrics)
329
+ if verbose: print(f'\n--------\nEpoch #0')
330
+ if verbose: print_losses(train=False, accuracies=eval_accuracies, losses=eval_losses, other_metrics=eval_other_metrics)
331
+ for epoch in range(params['nb_epochs']):
332
+ if verbose and (epoch + 1) % params['print_every'] == 0: print(f'\n--------\nEpoch #{epoch+1}')
333
+ model, train_losses, train_accuracies, train_other_metrics = run_epoch(opt=opt, train=True, model=model, data=train_data_loader, loss_functions=loss_functions,
334
+ weights=weights, params=params)
335
+ if verbose and (epoch + 1) % params['print_every'] == 0: print_losses(train=True, accuracies=train_accuracies, losses=train_losses, other_metrics=train_other_metrics)
336
+ model, eval_losses, eval_accuracies, eval_other_metrics = run_epoch(opt=opt, train=False, model=model, data=test_data_loader, loss_functions=loss_functions,
337
+ weights=weights, params=params)
338
+ if verbose and (epoch + 1) % params['print_every'] == 0: print_losses(train=False, accuracies=eval_accuracies, losses=eval_losses, other_metrics=eval_other_metrics)
339
+ if eval_losses['global_loss'] < best_loss:
340
+ best_loss = eval_losses['global_loss']
341
+ if verbose: print(f'Saving new best model with loss {best_loss:.2f}')
342
+ torch.save(model.state_dict(), params['save_path'] + f'checkpoint_best.save')
343
+
344
+ # log
345
+ all_train_losses.append(train_losses)
346
+ all_train_accuracies.append(train_accuracies)
347
+ all_eval_losses.append(eval_losses)
348
+ all_eval_accuracies.append(eval_accuracies)
349
+ all_eval_other_metrics.append(eval_other_metrics)
350
+ all_train_other_metrics.append(train_other_metrics)
351
+
352
+ # if epoch == params['nb_epoch_switch_beta']:
353
+ # params['beta_vae'] = 2.5
354
+ # params['auxiliaries_dict']['prep_type']['weight'] /= 10
355
+ # params['auxiliaries_dict']['glasses']['weight'] /= 10
356
+
357
+ if (epoch + 1) % params['plot_every'] == 0:
358
+
359
+ plot_results(all_train_losses, all_train_accuracies, all_train_other_metrics,
360
+ all_eval_losses, all_eval_accuracies, all_eval_other_metrics, params['plot_path'], weights)
361
+
362
+ return model
363
+
364
+ def plot_results(all_train_losses, all_train_accuracies, all_train_other_metrics,
365
+ all_eval_losses, all_eval_accuracies, all_eval_other_metrics, plot_path, weights):
366
+
367
+ steps = np.arange(len(all_eval_accuracies))
368
+
369
+ loss_keys = sorted(all_train_losses[0].keys())
370
+ acc_keys = sorted(all_train_accuracies[0].keys())
371
+ metrics_keys = sorted(all_train_other_metrics[0].keys())
372
+
373
+ plt.figure()
374
+ plt.title('Train losses')
375
+ for k in loss_keys:
376
+ factor = 1 # per-loss scaling factor, kept at 1 for all losses
377
+ if k not in weights.keys():
378
+ plt.plot(steps[1:], [train_loss[k] * factor for train_loss in all_train_losses], label=k)
379
+ else:
380
+ if weights[k] != 0:
381
+ plt.plot(steps[1:], [train_loss[k] * factor for train_loss in all_train_losses], label=k)
382
+
383
+ plt.legend()
384
+ plt.ylim([0, 4])
385
+ plt.savefig(plot_path + 'train_losses.png', dpi=200)
386
+ fig = plt.gcf()
387
+ plt.close(fig)
388
+
389
+ plt.figure()
390
+ plt.title('Train accuracies')
391
+ for k in acc_keys:
392
+ if weights[k] != 0:
393
+ plt.plot(steps[1:], [train_acc[k] for train_acc in all_train_accuracies], label=k)
394
+ plt.legend()
395
+ plt.ylim([0, 1])
396
+ plt.savefig(plot_path + 'train_acc.png', dpi=200)
397
+ fig = plt.gcf()
398
+ plt.close(fig)
399
+
400
+ plt.figure()
401
+ plt.title('Train other metrics')
402
+ for k in metrics_keys:
403
+ if 'confusion' not in k and 'presence' in k:
404
+ plt.plot(steps[1:], [train_metric[k] for train_metric in all_train_other_metrics], label=k)
405
+ plt.legend()
406
+ plt.ylim([0, 1])
407
+ plt.savefig(plot_path + 'train_ing_presence_errors.png', dpi=200)
408
+ fig = plt.gcf()
409
+ plt.close(fig)
410
+
411
+ plt.figure()
412
+ plt.title('Train other metrics')
413
+ for k in metrics_keys:
414
+ if 'confusion' not in k and 'presence' not in k:
415
+ plt.plot(steps[1:], [train_metric[k] for train_metric in all_train_other_metrics], label=k)
416
+ plt.legend()
417
+ plt.savefig(plot_path + 'train_ing_q_error.png', dpi=200)
418
+ fig = plt.gcf()
419
+ plt.close(fig)
420
+
421
+ plt.figure()
422
+ plt.title('Eval losses')
423
+ for k in loss_keys:
424
+ factor = 1 # per-loss scaling factor, kept at 1 for all losses
425
+ if k not in weights.keys():
426
+ plt.plot(steps, [eval_loss[k] * factor for eval_loss in all_eval_losses], label=k)
427
+ else:
428
+ if weights[k] != 0:
429
+ plt.plot(steps, [eval_loss[k] * factor for eval_loss in all_eval_losses], label=k)
430
+ plt.legend()
431
+ plt.ylim([0, 4])
432
+ plt.savefig(plot_path + 'eval_losses.png', dpi=200)
433
+ fig = plt.gcf()
434
+ plt.close(fig)
435
+
436
+ plt.figure()
437
+ plt.title('Eval accuracies')
438
+ for k in acc_keys:
439
+ if weights[k] != 0:
440
+ plt.plot(steps, [eval_acc[k] for eval_acc in all_eval_accuracies], label=k)
441
+ plt.legend()
442
+ plt.ylim([0, 1])
443
+ plt.savefig(plot_path + 'eval_acc.png', dpi=200)
444
+ fig = plt.gcf()
445
+ plt.close(fig)
446
+
447
+ plt.figure()
448
+ plt.title('Eval other metrics')
449
+ for k in metrics_keys:
450
+ if 'confusion' not in k and 'presence' in k:
451
+ plt.plot(steps, [eval_metric[k] for eval_metric in all_eval_other_metrics], label=k)
452
+ plt.legend()
453
+ plt.ylim([0, 1])
454
+ plt.savefig(plot_path + 'eval_ing_presence_errors.png', dpi=200)
455
+ fig = plt.gcf()
456
+ plt.close(fig)
457
+
458
+ plt.figure()
459
+ plt.title('Eval other metrics')
460
+ for k in metrics_keys:
461
+ if 'confusion' not in k and 'presence' not in k:
462
+ plt.plot(steps, [eval_metric[k] for eval_metric in all_eval_other_metrics], label=k)
463
+ plt.legend()
464
+ plt.savefig(plot_path + 'eval_ing_q_error.png', dpi=200)
465
+ fig = plt.gcf()
466
+ plt.close(fig)
467
+
468
+
469
+ for k in metrics_keys:
470
+ if 'confusion' in k:
471
+ plt.figure()
472
+ plt.title(k)
473
+ plt.ylabel('True')
474
+ plt.xlabel('Predicted')
475
+ plt.imshow(all_eval_other_metrics[-1][k], vmin=0, vmax=1)
476
+ plt.colorbar()
477
+ plt.savefig(plot_path + f'eval_{k}.png', dpi=200)
478
+ fig = plt.gcf()
479
+ plt.close(fig)
480
+
481
+ for k in metrics_keys:
482
+ if 'confusion' in k:
483
+ plt.figure()
484
+ plt.title(k)
485
+ plt.ylabel('True')
486
+ plt.xlabel('Predicted')
487
+ plt.imshow(all_train_other_metrics[-1][k], vmin=0, vmax=1)
488
+ plt.colorbar()
489
+ plt.savefig(plot_path + f'train_{k}.png', dpi=200)
490
+ fig = plt.gcf()
491
+ plt.close(fig)
492
+
493
+ plt.close('all')
494
+
495
+
496
+ def get_model(model_path):
497
+
498
+ with open(model_path + 'params.json', 'r') as f:
499
+ params = json.load(f)
500
+ params['save_path'] = model_path
501
+ max_ing_quantities = np.loadtxt(params['save_path'] + 'max_ing_quantities.txt')
502
+ mean_ing_quantities = np.loadtxt(params['save_path'] + 'mean_ing_quantities.txt')
503
+ std_ing_quantities = np.loadtxt(params['save_path'] + 'std_ing_quantities.txt')
504
+ min_when_present_ing_quantities = np.loadtxt(params['save_path'] + 'min_when_present_ing_quantities.txt')
505
+ def filter_decoder_output(output):
506
+ output = output.detach().numpy()
507
+ output_unnormalized = output * std_ing_quantities + mean_ing_quantities
508
+ if output.ndim == 1:
509
+ output_unnormalized[np.where(output_unnormalized < min_when_present_ing_quantities)] = 0
510
+ else:
511
+ for i in range(output.shape[0]):
512
+ output_unnormalized[i, np.where(output_unnormalized[i] < min_when_present_ing_quantities)] = 0
513
+ return output_unnormalized.copy()
514
+ params['filter_decoder_output'] = filter_decoder_output
515
+ model_chkpt = model_path + "checkpoint_best.save"
516
+ model_params = [params[k] for k in ["input_dim", "deepset_latent_dim", "hidden_dims_ingredients", "activation",
517
+ "hidden_dims_cocktail", "hidden_dims_decoder", "nb_ingredients", "latent_dim", "agg", "dropout", "auxiliaries_dict",
518
+ "filter_decoder_output"]]
519
+ model = get_vae_model(*model_params)
520
+ model.load_state_dict(torch.load(model_chkpt))
521
+ model.eval()
522
+ return model, filter_decoder_output, params
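+ # Usage sketch (hypothetical run directory; it must contain params.json,
+ # checkpoint_best.save and the *_ing_quantities.txt files saved at training time):
+ # model, filter_output, params = get_model(EXPERIMENT_PATH + '/deepset_vae/<run_dir>/')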
523
+
524
+
525
+ def compute_expe_name_and_save_path(params):
526
+ weights_str = '['
527
+ for aux in params['auxiliaries_dict'].keys():
528
+ weights_str += f'{params["auxiliaries_dict"][aux]["weight"]}, '
529
+ weights_str = weights_str[:-2] + ']'
530
+ save_path = params['save_path'] + params["trial_id"]
531
+ save_path += f'_lr{params["lr"]}'
532
+ save_path += f'_betavae{params["beta_vae"]}'
533
+ save_path += f'_bs{params["batch_size"]}'
534
+ save_path += f'_latentdim{params["latent_dim"]}'
535
+ save_path += f'_hding{params["hidden_dims_ingredients"]}'
536
+ save_path += f'_hdcocktail{params["hidden_dims_cocktail"]}'
537
+ save_path += f'_hddecoder{params["hidden_dims_decoder"]}'
538
+ save_path += f'_agg{params["agg"]}'
539
+ save_path += f'_activ{params["activation"]}'
540
+ save_path += f'_w{weights_str}'
541
+ counter = 0
542
+ while os.path.exists(save_path + f"_{counter}"):
543
+ counter += 1
544
+ save_path = save_path + f"_{counter}" + '/'
545
+ params["save_path"] = save_path
546
+ os.makedirs(save_path)
547
+ os.makedirs(save_path + 'plots/')
548
+ params['plot_path'] = save_path + 'plots/'
549
+ print(f'logging to {save_path}')
550
+ return params
551
+
552
+
553
+
554
+ if __name__ == '__main__':
555
+ params = get_params()
556
+ run_experiment(params)
557
+
src/cocktails/representation_learning/run_simple_net.py ADDED
@@ -0,0 +1,302 @@
1
+ import torch; torch.manual_seed(0)
2
+ import torch.utils
3
+ from torch.utils.data import DataLoader
4
+ import torch.distributions
5
+ import torch.nn as nn
6
+ import matplotlib.pyplot as plt; plt.rcParams['figure.dpi'] = 200
7
+ from src.cocktails.representation_learning.dataset import MyDataset, get_representation_from_ingredient, get_max_n_ingredients
8
+ import json
9
+ import pandas as pd
10
+ import numpy as np
11
+ import os
12
+ from src.cocktails.representation_learning.simple_model import SimpleNet
13
+ from src.cocktails.config import COCKTAILS_CSV_DATA, FULL_COCKTAIL_REP_PATH, EXPERIMENT_PATH
14
+ from src.cocktails.utilities.cocktail_utilities import get_bunch_of_rep_keys
15
+ from src.cocktails.utilities.ingredients_utilities import ingredient_profiles
16
+ from resource import getrusage
17
+ from resource import RUSAGE_SELF
18
+ import gc
19
+ gc.collect(2)
20
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
21
+
22
+ def get_params():
23
+ data = pd.read_csv(COCKTAILS_CSV_DATA)
24
+ max_ingredients, ingredient_set, liquor_set, liqueur_set = get_max_n_ingredients(data)
25
+ num_ingredients = len(ingredient_set)
26
+ rep_keys = get_bunch_of_rep_keys()['custom']
27
+ ing_keys = [k.split(' ')[1] for k in rep_keys]
28
+ ing_keys.remove('volume')
29
+ nb_ing_categories = len(set(ingredient_profiles['type']))
30
+ category_encodings = dict(zip(sorted(set(ingredient_profiles['type'])), np.eye(nb_ing_categories)))
31
+
32
+ params = dict(trial_id='test',
33
+ save_path=EXPERIMENT_PATH + "/simple_net/",
34
+ nb_epochs=100,
35
+ print_every=50,
36
+ plot_every=50,
37
+ batch_size=128,
38
+ lr=0.001,
39
+ dropout=0.15,
40
+ output_keyword='glasses',
41
+ ing_keys=ing_keys,
42
+ nb_ingredients=len(ingredient_set),
43
+ hidden_dims=[16],
44
+ activation='sigmoid',
45
+ auxiliaries_dict=dict(categories=dict(weight=0, type='classif', final_activ=None, dim_output=len(set(data['subcategory']))),
46
+ glasses=dict(weight=0, type='classif', final_activ=None, dim_output=len(set(data['glass']))),
47
+ prep_type=dict(weight=0, type='classif', final_activ=None, dim_output=len(set(data['category']))),
48
+ cocktail_reps=dict(weight=0, type='regression', final_activ=None, dim_output=13),
49
+ volume=dict(weight=0, type='regression', final_activ='relu', dim_output=1),
50
+ taste_reps=dict(weight=0, type='regression', final_activ='relu', dim_output=2),
51
+ ingredients_presence=dict(weight=0, type='multiclassif', final_activ=None, dim_output=num_ingredients),
52
+ ingredients_quantities=dict(weight=0, type='regression', final_activ=None, dim_output=num_ingredients)),
53
+
54
+ category_encodings=category_encodings
55
+ )
56
+ params['output_dim'] = params['auxiliaries_dict'][params['output_keyword']]['dim_output']
57
+ water_rep, indexes_to_normalize = get_representation_from_ingredient(ingredients=['water'], quantities=[1],
58
+ max_q_per_ing=dict(zip(ingredient_set, [1] * num_ingredients)), index=0,
59
+ params=params)
60
+ dim_rep_ingredient = water_rep.size
61
+ params['indexes_ing_to_normalize'] = indexes_to_normalize
62
+ params['deepset_latent_dim'] = dim_rep_ingredient * max_ingredients
63
+ params['dim_rep_ingredient'] = dim_rep_ingredient
64
+ params['input_dim'] = params['nb_ingredients']
65
+ params = compute_expe_name_and_save_path(params)
66
+ del params['category_encodings'] # not JSON-serializable; re-added by complete_params
67
+ with open(params['save_path'] + 'params.json', 'w') as f:
68
+ json.dump(params, f)
69
+
70
+ params = complete_params(params)
71
+ return params
72
+
73
+ def complete_params(params):
74
+ data = pd.read_csv(COCKTAILS_CSV_DATA)
75
+ cocktail_reps = np.loadtxt(FULL_COCKTAIL_REP_PATH)
76
+ nb_ing_categories = len(set(ingredient_profiles['type']))
77
+ category_encodings = dict(zip(sorted(set(ingredient_profiles['type'])), np.eye(nb_ing_categories)))
78
+ params['cocktail_reps'] = cocktail_reps
79
+ params['raw_data'] = data
80
+ params['category_encodings'] = category_encodings
81
+ return params
82
+
83
+ def compute_confusion_matrix_and_accuracy(predictions, ground_truth):
84
+ bs, n_options = predictions.shape
85
+ predicted = predictions.argmax(dim=1).detach().numpy()
86
+ true = ground_truth.int().detach().numpy()
87
+ confusion_matrix = np.zeros([n_options, n_options])
88
+ for i in range(bs):
89
+ confusion_matrix[true[i], predicted[i]] += 1
90
+ acc = confusion_matrix.diagonal().sum() / bs
91
+ for i in range(n_options):
92
+ if confusion_matrix[i].sum() != 0:
93
+ confusion_matrix[i] /= confusion_matrix[i].sum()
94
+ acc2 = np.mean(predicted == true)
95
+ assert (acc - acc2) < 1e-5
96
+ return confusion_matrix, acc
97
+
98
+
99
+ def run_epoch(opt, train, model, data, loss_function, params):
100
+ if train:
101
+ model.train()
102
+ else:
103
+ model.eval()
104
+
105
+ # prepare logging of losses
106
+ losses = []
107
+ accuracies = []
108
+ cf_matrices = []
109
+ if train: opt.zero_grad()
110
+
111
+ for d in data:
112
+ nb_ingredients = d[0]
113
+ batch_size = nb_ingredients.shape[0]
114
+ x_ingredients = d[1].float()
115
+ ingredient_quantities = d[2].float()
116
+ cocktail_reps = d[3].float()
117
+ auxiliaries = d[4]
118
+ for k in auxiliaries.keys():
119
+ if auxiliaries[k].dtype == torch.float64: auxiliaries[k] = auxiliaries[k].float()
120
+ taste_valid = d[-1]
121
+ predictions = model(ingredient_quantities)
122
+ loss = loss_function(predictions, auxiliaries[params['output_keyword']].long()).float()
123
+ cf_matrix, accuracy = compute_confusion_matrix_and_accuracy(predictions, auxiliaries[params['output_keyword']])
124
+ if train:
125
+ loss.backward()
126
+ opt.step()
127
+ opt.zero_grad()
128
+
129
+ losses.append(float(loss))
130
+ cf_matrices.append(cf_matrix)
131
+ accuracies.append(accuracy)
132
+
133
+ return model, np.mean(losses), np.mean(accuracies), np.mean(cf_matrices, axis=0)
134
+
135
+ def prepare_data_and_loss(params):
136
+ train_data = MyDataset(split='train', params=params)
137
+ test_data = MyDataset(split='test', params=params)
138
+
139
+ train_data_loader = DataLoader(train_data, batch_size=params['batch_size'], shuffle=True)
140
+ test_data_loader = DataLoader(test_data, batch_size=params['batch_size'], shuffle=True)
141
+
142
+
143
+ if params['auxiliaries_dict'][params['output_keyword']]['type'] == 'classif':
144
+ if params['output_keyword'] == 'glasses':
145
+ classif_weights = train_data.glasses_weights
146
+ elif params['output_keyword'] == 'prep_type':
147
+ classif_weights = train_data.prep_types_weights
148
+ elif params['output_keyword'] == 'categories':
149
+ classif_weights = train_data.categories_weights
150
+ else:
151
+ raise ValueError
152
+ # classif_weights = (np.array(classif_weights) * 2 + np.ones(len(classif_weights))) / 3
153
+ loss_function = nn.CrossEntropyLoss(torch.FloatTensor(classif_weights))
154
+ # loss_function = nn.CrossEntropyLoss()
155
+
156
+ elif params['auxiliaries_dict'][params['output_keyword']]['type'] == 'multiclassif':
157
+ loss_function = nn.BCEWithLogitsLoss()
158
+ elif params['auxiliaries_dict'][params['output_keyword']]['type'] == 'regression':
159
+ loss_function = nn.MSELoss()
160
+ else:
161
+ raise ValueError
162
+
163
+ return loss_function, train_data_loader, test_data_loader
164
+
165
+ def print_losses(train, loss, accuracy):
166
+ keyword = 'Train' if train else 'Eval'
167
+ print(f'\t{keyword} logs:')
168
+ print(f'\t\t Loss: {loss:.2f}, Acc: {accuracy:.2f}')
169
+
170
+
171
+ def run_experiment(params, verbose=True):
172
+ loss_function, train_data_loader, test_data_loader = prepare_data_and_loss(params)
173
+
174
+ model = SimpleNet(params['input_dim'], params['hidden_dims'], params['output_dim'], params['activation'], params['dropout'])
175
+ opt = torch.optim.AdamW(model.parameters(), lr=params['lr'])
176
+
177
+ all_train_losses = []
178
+ all_eval_losses = []
179
+ all_eval_cf_matrices = []
180
+ all_train_accuracies = []
181
+ all_eval_accuracies = []
182
+ all_train_cf_matrices = []
183
+ best_loss = np.inf
184
+ model, eval_loss, eval_accuracy, eval_cf_matrix = run_epoch(opt=opt, train=False, model=model, data=test_data_loader, loss_function=loss_function, params=params)
185
+ all_eval_losses.append(eval_loss)
186
+ all_eval_accuracies.append(eval_accuracy)
187
+ if verbose: print(f'\n--------\nEpoch #0')
188
+ if verbose: print_losses(train=False, accuracy=eval_accuracy, loss=eval_loss)
189
+ for epoch in range(params['nb_epochs']):
190
+ if verbose and (epoch + 1) % params['print_every'] == 0: print(f'\n--------\nEpoch #{epoch+1}')
191
+ model, train_loss, train_accuracy, train_cf_matrix = run_epoch(opt=opt, train=True, model=model, data=train_data_loader, loss_function=loss_function, params=params)
192
+ if verbose and (epoch + 1) % params['print_every'] == 0: print_losses(train=True, accuracy=train_accuracy, loss=train_loss)
193
+ model, eval_loss, eval_accuracy, eval_cf_matrix = run_epoch(opt=opt, train=False, model=model, data=test_data_loader, loss_function=loss_function, params=params)
194
+ if verbose and (epoch + 1) % params['print_every'] == 0: print_losses(train=False, accuracy=eval_accuracy, loss=eval_loss)
195
+ if eval_loss < best_loss:
196
+ best_loss = eval_loss
197
+ if verbose: print(f'Saving new best model with loss {best_loss:.2f}')
198
+ torch.save(model.state_dict(), params['save_path'] + f'checkpoint_best.save')
199
+
200
+ # log
201
+ all_train_losses.append(train_loss)
202
+ all_train_accuracies.append(train_accuracy)
203
+ all_eval_losses.append(eval_loss)
204
+ all_eval_accuracies.append(eval_accuracy)
205
+ all_eval_cf_matrices.append(eval_cf_matrix)
206
+ all_train_cf_matrices.append(train_cf_matrix)
207
+
208
+ if (epoch + 1) % params['plot_every'] == 0:
209
+
210
+ plot_results(all_train_losses, all_train_accuracies, all_train_cf_matrices,
211
+ all_eval_losses, all_eval_accuracies, all_eval_cf_matrices, params['plot_path'])
212
+
213
+ return model
214
+
215
+ def plot_results(all_train_losses, all_train_accuracies, all_train_cf_matrices,
216
+ all_eval_losses, all_eval_accuracies, all_eval_cf_matrices, plot_path):
217
+
218
+ steps = np.arange(len(all_eval_accuracies))
219
+
220
+ plt.figure()
221
+ plt.title('Losses')
222
+ plt.plot(steps[1:], all_train_losses, label='train')
223
+ plt.plot(steps, all_eval_losses, label='eval')
224
+ plt.legend()
225
+ plt.ylim([0, 4])
226
+ plt.savefig(plot_path + 'losses.png', dpi=200)
227
+ fig = plt.gcf()
228
+ plt.close(fig)
229
+
230
+ plt.figure()
231
+ plt.title('Accuracies')
232
+ plt.plot(steps[1:], all_train_accuracies, label='train')
233
+ plt.plot(steps, all_eval_accuracies, label='eval')
234
+ plt.legend()
235
+ plt.ylim([0, 1])
236
+ plt.savefig(plot_path + 'accs.png', dpi=200)
237
+ fig = plt.gcf()
238
+ plt.close(fig)
239
+
240
+
241
+ plt.figure()
242
+ plt.title('Train confusion matrix')
243
+ plt.ylabel('True')
244
+ plt.xlabel('Predicted')
245
+ plt.imshow(all_train_cf_matrices[-1], vmin=0, vmax=1)
246
+ plt.colorbar()
247
+ plt.savefig(plot_path + f'train_confusion_matrix.png', dpi=200)
248
+ fig = plt.gcf()
249
+ plt.close(fig)
250
+
251
+ plt.figure()
252
+ plt.title('Eval confusion matrix')
253
+ plt.ylabel('True')
254
+ plt.xlabel('Predicted')
255
+ plt.imshow(all_eval_cf_matrices[-1], vmin=0, vmax=1)
256
+ plt.colorbar()
257
+ plt.savefig(plot_path + f'eval_confusion_matrix.png', dpi=200)
258
+ fig = plt.gcf()
259
+ plt.close(fig)
260
+
261
+ plt.close('all')
262
+
263
+
264
+ def get_model(model_path):
265
+ with open(model_path + 'params.json', 'r') as f:
266
+ params = json.load(f)
267
+ params['save_path'] = model_path
268
+ model_chkpt = model_path + "checkpoint_best.save"
269
+ model = SimpleNet(params['input_dim'], params['hidden_dims'], params['output_dim'], params['activation'], params['dropout'])
270
+ model.load_state_dict(torch.load(model_chkpt))
271
+ model.eval()
272
+ return model, params
273
+
274
+
275
+ def compute_expe_name_and_save_path(params):
276
+ weights_str = '['
277
+ for aux in params['auxiliaries_dict'].keys():
278
+ weights_str += f'{params["auxiliaries_dict"][aux]["weight"]}, '
279
+ weights_str = weights_str[:-2] + ']'
280
+ save_path = params['save_path'] + params["trial_id"]
281
+ save_path += f'_lr{params["lr"]}'
282
+ save_path += f'_bs{params["batch_size"]}'
283
+ save_path += f'_hd{params["hidden_dims"]}'
284
+ save_path += f'_activ{params["activation"]}'
285
+ save_path += f'_w{weights_str}'
286
+ counter = 0
287
+ while os.path.exists(save_path + f"_{counter}"):
288
+ counter += 1
289
+ save_path = save_path + f"_{counter}" + '/'
290
+ params["save_path"] = save_path
291
+ os.makedirs(save_path)
292
+ os.makedirs(save_path + 'plots/')
293
+ params['plot_path'] = save_path + 'plots/'
294
+ print(f'logging to {save_path}')
295
+ return params
296
+
297
+
298
+
299
+ if __name__ == '__main__':
300
+ params = get_params()
301
+ run_experiment(params)
302
+
src/cocktails/representation_learning/run_without_vae.py ADDED
@@ -0,0 +1,514 @@
1
+ import torch; torch.manual_seed(0)
2
+ import torch.utils
3
+ from torch.utils.data import DataLoader
4
+ import torch.distributions
5
+ import torch.nn as nn
6
+ import matplotlib.pyplot as plt; plt.rcParams['figure.dpi'] = 200
7
+ from src.cocktails.representation_learning.dataset import MyDataset, get_representation_from_ingredient, get_max_n_ingredients
8
+ import json
9
+ import pandas as pd
10
+ import numpy as np
11
+ import os
12
+ from src.cocktails.representation_learning.multihead_model import get_multihead_model
13
+ from src.cocktails.config import COCKTAILS_CSV_DATA, FULL_COCKTAIL_REP_PATH, EXPERIMENT_PATH
14
+ from src.cocktails.utilities.cocktail_utilities import get_bunch_of_rep_keys
15
+ from src.cocktails.utilities.ingredients_utilities import ingredient_profiles
16
+ from resource import getrusage
17
+ from resource import RUSAGE_SELF
18
+ import gc
19
+ gc.collect(2)
20
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
21
+
22
+ def get_params():
23
+ data = pd.read_csv(COCKTAILS_CSV_DATA)
24
+ max_ingredients, ingredient_set, liquor_set, liqueur_set = get_max_n_ingredients(data)
25
+ num_ingredients = len(ingredient_set)
26
+ rep_keys = get_bunch_of_rep_keys()['custom']
27
+ ing_keys = [k.split(' ')[1] for k in rep_keys]
28
+ ing_keys.remove('volume')
29
+ nb_ing_categories = len(set(ingredient_profiles['type']))
30
+ category_encodings = dict(zip(sorted(set(ingredient_profiles['type'])), np.eye(nb_ing_categories)))
31
+
32
+ params = dict(trial_id='test',
33
+ save_path=EXPERIMENT_PATH + "/multihead_model/",
34
+ nb_epochs=500,
35
+ print_every=50,
36
+ plot_every=50,
37
+ batch_size=128,
38
+ lr=0.001,
39
+ dropout=0.,
40
+ nb_epoch_switch_beta=600,
41
+ latent_dim=10,
42
+ beta_vae=0.2,
43
+ ing_keys=ing_keys,
44
+ nb_ingredients=len(ingredient_set),
45
+ hidden_dims_ingredients=[128],
46
+ hidden_dims_cocktail=[64],
47
+ hidden_dims_decoder=[32],
48
+ agg='mean',
49
+ activation='relu',
50
+ auxiliaries_dict=dict(categories=dict(weight=5, type='classif', final_activ=None, dim_output=len(set(data['subcategory']))), #0.5
51
+ glasses=dict(weight=0.5, type='classif', final_activ=None, dim_output=len(set(data['glass']))), #0.1
52
+ prep_type=dict(weight=0.1, type='classif', final_activ=None, dim_output=len(set(data['category']))),#1
53
+ cocktail_reps=dict(weight=1, type='regression', final_activ=None, dim_output=13),#1
54
+ volume=dict(weight=1, type='regression', final_activ='relu', dim_output=1),#1
55
+ taste_reps=dict(weight=1, type='regression', final_activ='relu', dim_output=2),#1
56
+ ingredients_presence=dict(weight=0, type='multiclassif', final_activ=None, dim_output=num_ingredients),#10
57
+ ingredients_quantities=dict(weight=0, type='regression', final_activ=None, dim_output=num_ingredients)),
58
+ category_encodings=category_encodings
59
+ )
60
+ water_rep, indexes_to_normalize = get_representation_from_ingredient(ingredients=['water'], quantities=[1],
61
+ max_q_per_ing=dict(zip(ingredient_set, [1] * num_ingredients)), index=0,
62
+ params=params)
63
+ dim_rep_ingredient = water_rep.size
64
+ params['indexes_ing_to_normalize'] = indexes_to_normalize
65
+ params['deepset_latent_dim'] = dim_rep_ingredient * max_ingredients
66
+ params['dim_rep_ingredient'] = dim_rep_ingredient
67
+ params['input_dim'] = params['nb_ingredients']
68
+ params = compute_expe_name_and_save_path(params)
69
+ del params['category_encodings'] # not JSON-serializable; re-added by complete_params
70
+ with open(params['save_path'] + 'params.json', 'w') as f:
71
+ json.dump(params, f)
72
+
73
+ params = complete_params(params)
74
+ return params
75
+
76
+ def complete_params(params):
77
+ data = pd.read_csv(COCKTAILS_CSV_DATA)
78
+ cocktail_reps = np.loadtxt(FULL_COCKTAIL_REP_PATH)
79
+ nb_ing_categories = len(set(ingredient_profiles['type']))
80
+ category_encodings = dict(zip(sorted(set(ingredient_profiles['type'])), np.eye(nb_ing_categories)))
81
+ params['cocktail_reps'] = cocktail_reps
82
+ params['raw_data'] = data
83
+ params['category_encodings'] = category_encodings
84
+ return params
85
+
86
+ def compute_losses_and_accuracies(loss_functions, auxiliaries, auxiliaries_str, outputs, data):
87
+ losses = dict()
88
+ accuracies = dict()
89
+ other_metrics = dict()
90
+ for i_k, k in enumerate(auxiliaries_str):
91
+ # get ground truth
92
+ # compute loss
93
+ if k == 'volume':
94
+ outputs[i_k] = outputs[i_k].flatten()
95
+ ground_truth = auxiliaries[k]
96
+ if ground_truth.dtype == torch.float64:
97
+ losses[k] = loss_functions[k](outputs[i_k], ground_truth.float()).float()
98
+ elif ground_truth.dtype == torch.int64:
99
+ if str(loss_functions[k]) != "BCEWithLogitsLoss()":
100
+ losses[k] = loss_functions[k](outputs[i_k].float(), ground_truth.long()).float()
101
+ else:
102
+ losses[k] = loss_functions[k](outputs[i_k].float(), ground_truth.float()).float()
103
+ else:
104
+ losses[k] = loss_functions[k](outputs[i_k], ground_truth).float()
105
+ # compute accuracies
106
+ if str(loss_functions[k]) == 'CrossEntropyLoss()':
107
+ bs, n_options = outputs[i_k].shape
108
+ predicted = outputs[i_k].argmax(dim=1).detach().numpy()
109
+ true = ground_truth.int().detach().numpy()
110
+ confusion_matrix = np.zeros([n_options, n_options])
111
+ for i in range(bs):
112
+ confusion_matrix[true[i], predicted[i]] += 1
113
+ acc = confusion_matrix.diagonal().sum() / bs
114
+ for i in range(n_options):
115
+ if confusion_matrix[i].sum() != 0:
116
+ confusion_matrix[i] /= confusion_matrix[i].sum()
117
+ other_metrics[k + '_confusion'] = confusion_matrix
118
+ accuracies[k] = np.mean(outputs[i_k].argmax(dim=1).detach().numpy() == ground_truth.int().detach().numpy())
119
+ assert (acc - accuracies[k]) < 1e-5
120
+
121
+ elif str(loss_functions[k]) == 'BCEWithLogitsLoss()':
122
+ assert k == 'ingredients_presence'
123
+ outputs_rescaled = outputs[i_k].detach().numpy() * data.dataset.std_ing_quantities + data.dataset.mean_ing_quantities
124
+ predicted_presence = (outputs_rescaled > 0).astype(bool)
125
+ presence = ground_truth.detach().numpy().astype(bool)
126
+ other_metrics[k + '_false_positive'] = np.mean(np.logical_and(predicted_presence.astype(bool), ~presence.astype(bool)))
127
+ other_metrics[k + '_false_negative'] = np.mean(np.logical_and(~predicted_presence.astype(bool), presence.astype(bool)))
128
+ accuracies[k] = np.mean(predicted_presence == presence) # accuracy for multi class labeling
129
+ elif str(loss_functions[k]) == 'MSELoss()':
130
+ accuracies[k] = np.nan
131
+ else:
132
+ raise ValueError
133
+ return losses, accuracies, other_metrics
134
+
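+ # Note: unlike the run.py variant, this version leaves quantities in normalized
+ # form and rescales the absolute error by max_ing_quantities, which assumes the
+ # quantities were max-normalized upstream.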
135
+ def compute_metric_output(aux_other_metrics, data, ingredient_quantities, x_hat):
136
+ ing_q = ingredient_quantities.detach().numpy()# * data.dataset.std_ing_quantities + data.dataset.mean_ing_quantities
137
+ ing_presence = (ing_q > 0)
138
+ x_hat = x_hat.detach().numpy()
139
+ # x_hat = x_hat.detach().numpy() * data.dataset.std_ing_quantities + data.dataset.mean_ing_quantities
140
+ abs_diff = np.abs(ing_q - x_hat) * data.dataset.max_ing_quantities
141
+ # abs_diff = np.abs(ing_q - x_hat)
142
+ ing_q_abs_loss_when_present, ing_q_abs_loss_when_absent = [], []
143
+ for i in range(ingredient_quantities.shape[0]):
144
+ ing_q_abs_loss_when_present.append(np.mean(abs_diff[i, np.where(ing_presence[i])]))
145
+ ing_q_abs_loss_when_absent.append(np.mean(abs_diff[i, np.where(~ing_presence[i])]))
146
+ aux_other_metrics['ing_q_abs_loss_when_present'] = np.mean(ing_q_abs_loss_when_present)
147
+ aux_other_metrics['ing_q_abs_loss_when_absent'] = np.mean(ing_q_abs_loss_when_absent)
148
+ return aux_other_metrics
149
+
+def run_epoch(opt, train, model, data, loss_functions, weights, params):
+    if train:
+        model.train()
+    else:
+        model.eval()
+
+    # prepare logging of losses
+    losses = dict(kld_loss=[],
+                  mse_loss=[],
+                  vae_loss=[],
+                  volume_loss=[],
+                  global_loss=[])
+    accuracies = dict()
+    other_metrics = dict()
+    for aux in params['auxiliaries_dict'].keys():
+        losses[aux] = []
+        accuracies[aux] = []
+    if train: opt.zero_grad()
+
+    for d in data:
+        nb_ingredients = d[0]
+        batch_size = nb_ingredients.shape[0]
+        x_ingredients = d[1].float()
+        ingredient_quantities = d[2]
+        cocktail_reps = d[3]
+        auxiliaries = d[4]
+        for k in auxiliaries.keys():
+            if auxiliaries[k].dtype == torch.float64: auxiliaries[k] = auxiliaries[k].float()
+        taste_valid = d[-1]
+        z, outputs, auxiliaries_str = model.forward(ingredient_quantities.float())
+        # get auxiliary losses and accuracies
+        aux_losses, aux_accuracies, aux_other_metrics = compute_losses_and_accuracies(loss_functions, auxiliaries, auxiliaries_str, outputs, data)
+
+        # compute ingredient-quantity reconstruction metrics
+        aux_other_metrics = compute_metric_output(aux_other_metrics, data, ingredient_quantities, outputs[auxiliaries_str.index('ingredients_quantities')])
+
+        # the taste representation loss is only defined for samples with a valid taste annotation
+        indexes_taste_valid = np.argwhere(taste_valid.detach().numpy()).flatten()
+        if indexes_taste_valid.size > 0:
+            outputs_taste = model.get_auxiliary(z[indexes_taste_valid], aux_str='taste_reps')
+            gt = auxiliaries['taste_reps'][indexes_taste_valid]
+            # rescale the loss by the fraction of annotated samples: at the dataset-wide ratio (30%)
+            # the factor is 1; fewer annotated samples decrease it, more increase it.
+            factor_loss = indexes_taste_valid.size / (0.3 * batch_size)
+            aux_losses['taste_reps'] = (loss_functions['taste_reps'](outputs_taste, gt) * factor_loss).float()
+        else:
+            aux_losses['taste_reps'] = torch.FloatTensor([0]).reshape([])
+            aux_accuracies['taste_reps'] = 0
+
+        # aggregate losses
+        global_loss = torch.sum(torch.cat([torch.atleast_1d(aux_losses[k] * weights[k]) for k in params['auxiliaries_dict'].keys()]))
+
+        if train:
+            global_loss.backward()
+            opt.step()
+            opt.zero_grad()
+
+        # logging
+        losses['global_loss'].append(float(global_loss))
+        for k in params['auxiliaries_dict'].keys():
+            losses[k].append(float(aux_losses[k]))
+            accuracies[k].append(float(aux_accuracies[k]))
+        for k in aux_other_metrics.keys():
+            if k not in other_metrics.keys():
+                other_metrics[k] = [aux_other_metrics[k]]
+            else:
+                other_metrics[k].append(aux_other_metrics[k])
+
+    for k in losses.keys():
+        losses[k] = np.mean(losses[k])
+    for k in accuracies.keys():
+        accuracies[k] = np.mean(accuracies[k])
+    for k in other_metrics.keys():
+        other_metrics[k] = np.mean(other_metrics[k], axis=0)
+    return model, losses, accuracies, other_metrics
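Aside: the rescaling keeps the taste head's gradient magnitude roughly constant across batches. A quick numeric check, assuming (as in the comment) that about 30% of the dataset carries taste annotations:

batch_size = 64
for n_annotated in (6, 19, 32):                 # fewer / typical / more annotated samples
    factor = n_annotated / (0.3 * batch_size)   # 0.3 is the assumed dataset-wide annotation ratio
    print(n_annotated, round(factor, 2))        # 0.31, 0.99, 1.67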
+
+def prepare_data_and_loss(params):
+    train_data = MyDataset(split='train', params=params)
+    test_data = MyDataset(split='test', params=params)
+
+    train_data_loader = DataLoader(train_data, batch_size=params['batch_size'], shuffle=True)
+    test_data_loader = DataLoader(test_data, batch_size=params['batch_size'], shuffle=True)
+
+    loss_functions = dict()
+    weights = dict()
+    for k in sorted(params['auxiliaries_dict'].keys()):
+        if params['auxiliaries_dict'][k]['type'] == 'classif':
+            if k == 'glasses':
+                classif_weights = train_data.glasses_weights
+            elif k == 'prep_type':
+                classif_weights = train_data.prep_types_weights
+            elif k == 'categories':
+                classif_weights = train_data.categories_weights
+            else:
+                raise ValueError
+            loss_functions[k] = nn.CrossEntropyLoss(torch.FloatTensor(classif_weights))
+        elif params['auxiliaries_dict'][k]['type'] == 'multiclassif':
+            loss_functions[k] = nn.BCEWithLogitsLoss()
+        elif params['auxiliaries_dict'][k]['type'] == 'regression':
+            loss_functions[k] = nn.MSELoss()
+        else:
+            raise ValueError
+        weights[k] = params['auxiliaries_dict'][k]['weight']
+
+    return loss_functions, train_data_loader, test_data_loader, weights
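Aside: the function expects each entry of params['auxiliaries_dict'] to carry at least a 'type' and a 'weight' (the heads also read 'dim_output' and 'final_activ'). A hypothetical minimal configuration, names and values illustrative only:

auxiliaries_dict = {
    'categories': {'type': 'classif', 'weight': 0.5, 'dim_output': 11, 'final_activ': None},
    'ingredients_presence': {'type': 'multiclassif', 'weight': 1.0, 'dim_output': 50, 'final_activ': None},
    'volume': {'type': 'regression', 'weight': 0.1, 'dim_output': 1, 'final_activ': None},
}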
+
+def print_losses(train, losses, accuracies, other_metrics):
+    keyword = 'Train' if train else 'Eval'
+    print(f'\t{keyword} logs:')
+    keys = ['global_loss', 'vae_loss', 'mse_loss', 'kld_loss', 'volume_loss']
+    for k in keys:
+        print(f'\t\t{k} - Loss: {losses[k]:.2f}')
+    for k in sorted(accuracies.keys()):
+        print(f'\t\t{k} (aux) - Loss: {losses[k]:.2f}, Acc: {accuracies[k]:.2f}')
+    for k in sorted(other_metrics.keys()):
+        if 'confusion' not in k:
+            print(f'\t\t{k} - {other_metrics[k]:.2f}')
+
+def run_experiment(params, verbose=True):
+    loss_functions, train_data_loader, test_data_loader, weights = prepare_data_and_loss(params)
+
+    model_params = [params[k] for k in ["input_dim", "activation", "hidden_dims_cocktail", "latent_dim", "dropout", "auxiliaries_dict", "hidden_dims_decoder"]]
+    model = get_multihead_model(*model_params)
+    opt = torch.optim.AdamW(model.parameters(), lr=params['lr'])
+
+    all_train_losses = []
+    all_eval_losses = []
+    all_train_accuracies = []
+    all_eval_accuracies = []
+    all_eval_other_metrics = []
+    all_train_other_metrics = []
+    best_loss = np.inf
+    # evaluate once before training to log the initial performance
+    model, eval_losses, eval_accuracies, eval_other_metrics = run_epoch(opt=opt, train=False, model=model, data=test_data_loader, loss_functions=loss_functions,
+                                                                        weights=weights, params=params)
+    all_eval_losses.append(eval_losses)
+    all_eval_accuracies.append(eval_accuracies)
+    all_eval_other_metrics.append(eval_other_metrics)
+    if verbose: print('\n--------\nEpoch #0')
+    if verbose: print_losses(train=False, accuracies=eval_accuracies, losses=eval_losses, other_metrics=eval_other_metrics)
+    for epoch in range(params['nb_epochs']):
+        if verbose and (epoch + 1) % params['print_every'] == 0: print(f'\n--------\nEpoch #{epoch + 1}')
+        model, train_losses, train_accuracies, train_other_metrics = run_epoch(opt=opt, train=True, model=model, data=train_data_loader, loss_functions=loss_functions,
+                                                                               weights=weights, params=params)
+        if verbose and (epoch + 1) % params['print_every'] == 0: print_losses(train=True, accuracies=train_accuracies, losses=train_losses, other_metrics=train_other_metrics)
+        model, eval_losses, eval_accuracies, eval_other_metrics = run_epoch(opt=opt, train=False, model=model, data=test_data_loader, loss_functions=loss_functions,
+                                                                            weights=weights, params=params)
+        if verbose and (epoch + 1) % params['print_every'] == 0: print_losses(train=False, accuracies=eval_accuracies, losses=eval_losses, other_metrics=eval_other_metrics)
+        if eval_losses['global_loss'] < best_loss:
+            best_loss = eval_losses['global_loss']
+            if verbose: print(f'Saving new best model with loss {best_loss:.2f}')
+            torch.save(model.state_dict(), params['save_path'] + 'checkpoint_best.save')
+
+        # log
+        all_train_losses.append(train_losses)
+        all_train_accuracies.append(train_accuracies)
+        all_eval_losses.append(eval_losses)
+        all_eval_accuracies.append(eval_accuracies)
+        all_eval_other_metrics.append(eval_other_metrics)
+        all_train_other_metrics.append(train_other_metrics)
+
+        # if epoch == params['nb_epoch_switch_beta']:
+        #     params['beta_vae'] = 2.5
+        #     params['auxiliaries_dict']['prep_type']['weight'] /= 10
+        #     params['auxiliaries_dict']['glasses']['weight'] /= 10
+
+        if (epoch + 1) % params['plot_every'] == 0:
+            plot_results(all_train_losses, all_train_accuracies, all_train_other_metrics,
+                         all_eval_losses, all_eval_accuracies, all_eval_other_metrics, params['plot_path'], weights)
+
+    return model
+
+def plot_results(all_train_losses, all_train_accuracies, all_train_other_metrics,
+                 all_eval_losses, all_eval_accuracies, all_eval_other_metrics, plot_path, weights):
+
+    steps = np.arange(len(all_eval_accuracies))
+
+    loss_keys = sorted(all_train_losses[0].keys())
+    acc_keys = sorted(all_train_accuracies[0].keys())
+    metrics_keys = sorted(all_train_other_metrics[0].keys())
+
+    plt.figure()
+    plt.title('Train losses')
+    for k in loss_keys:
+        # plot every base loss, and auxiliary losses only when their weight is non-zero
+        if k not in weights.keys() or weights[k] != 0:
+            plt.plot(steps[1:], [train_loss[k] for train_loss in all_train_losses], label=k)
+    plt.legend()
+    plt.ylim([0, 4])
+    plt.savefig(plot_path + 'train_losses.png', dpi=200)
+    fig = plt.gcf()
+    plt.close(fig)
+
+    plt.figure()
+    plt.title('Train accuracies')
+    for k in acc_keys:
+        if weights[k] != 0:
+            plt.plot(steps[1:], [train_acc[k] for train_acc in all_train_accuracies], label=k)
+    plt.legend()
+    plt.ylim([0, 1])
+    plt.savefig(plot_path + 'train_acc.png', dpi=200)
+    fig = plt.gcf()
+    plt.close(fig)
+
+    plt.figure()
+    plt.title('Train ingredient presence errors')
+    for k in metrics_keys:
+        if 'confusion' not in k and 'presence' in k:
+            plt.plot(steps[1:], [train_metric[k] for train_metric in all_train_other_metrics], label=k)
+    plt.legend()
+    plt.ylim([0, 1])
+    plt.savefig(plot_path + 'train_ing_presence_errors.png', dpi=200)
+    fig = plt.gcf()
+    plt.close(fig)
+
+    plt.figure()
+    plt.title('Train ingredient quantity errors')
+    for k in metrics_keys:
+        if 'confusion' not in k and 'presence' not in k:
+            plt.plot(steps[1:], [train_metric[k] for train_metric in all_train_other_metrics], label=k)
+    plt.legend()
+    plt.ylim([0, 15])
+    plt.savefig(plot_path + 'train_ing_q_error.png', dpi=200)
+    fig = plt.gcf()
+    plt.close(fig)
+
+    plt.figure()
+    plt.title('Eval losses')
+    for k in loss_keys:
+        if k not in weights.keys() or weights[k] != 0:
+            plt.plot(steps, [eval_loss[k] for eval_loss in all_eval_losses], label=k)
+    plt.legend()
+    plt.ylim([0, 4])
+    plt.savefig(plot_path + 'eval_losses.png', dpi=200)
+    fig = plt.gcf()
+    plt.close(fig)
+
+    plt.figure()
+    plt.title('Eval accuracies')
+    for k in acc_keys:
+        if weights[k] != 0:
+            plt.plot(steps, [eval_acc[k] for eval_acc in all_eval_accuracies], label=k)
+    plt.legend()
+    plt.ylim([0, 1])
+    plt.savefig(plot_path + 'eval_acc.png', dpi=200)
+    fig = plt.gcf()
+    plt.close(fig)
+
+    plt.figure()
+    plt.title('Eval ingredient presence errors')
+    for k in metrics_keys:
+        if 'confusion' not in k and 'presence' in k:
+            plt.plot(steps, [eval_metric[k] for eval_metric in all_eval_other_metrics], label=k)
+    plt.legend()
+    plt.ylim([0, 1])
+    plt.savefig(plot_path + 'eval_ing_presence_errors.png', dpi=200)
+    fig = plt.gcf()
+    plt.close(fig)
+
+    plt.figure()
+    plt.title('Eval ingredient quantity errors')
+    for k in metrics_keys:
+        if 'confusion' not in k and 'presence' not in k:
+            plt.plot(steps, [eval_metric[k] for eval_metric in all_eval_other_metrics], label=k)
+    plt.legend()
+    plt.ylim([0, 15])
+    plt.savefig(plot_path + 'eval_ing_q_error.png', dpi=200)
+    fig = plt.gcf()
+    plt.close(fig)
+
+    for k in metrics_keys:
+        if 'confusion' in k:
+            plt.figure()
+            plt.title(k)
+            plt.ylabel('True')
+            plt.xlabel('Predicted')
+            plt.imshow(all_eval_other_metrics[-1][k], vmin=0, vmax=1)
+            plt.colorbar()
+            plt.savefig(plot_path + f'eval_{k}.png', dpi=200)
+            fig = plt.gcf()
+            plt.close(fig)
+
+    for k in metrics_keys:
+        if 'confusion' in k:
+            plt.figure()
+            plt.title(k)
+            plt.ylabel('True')
+            plt.xlabel('Predicted')
+            plt.imshow(all_train_other_metrics[-1][k], vmin=0, vmax=1)
+            plt.colorbar()
+            plt.savefig(plot_path + f'train_{k}.png', dpi=200)
+            fig = plt.gcf()
+            plt.close(fig)
+
+    plt.close('all')
+
+
+def get_model(model_path):
+    with open(model_path + 'params.json', 'r') as f:
+        params = json.load(f)
+    params['save_path'] = model_path
+    model_chkpt = model_path + "checkpoint_best.save"
+    model_params = [params[k] for k in ["input_dim", "activation", "hidden_dims_cocktail", "latent_dim", "dropout", "auxiliaries_dict", "hidden_dims_decoder"]]
+    model = get_multihead_model(*model_params)
+    model.load_state_dict(torch.load(model_chkpt))
+    model.eval()
+    max_ing_quantities = np.loadtxt(model_path + 'max_ing_quantities.txt')
+
+    def predict(ing_qs, aux_str):
+        # normalize raw quantities by the per-ingredient maxima seen at training time (in place)
+        ing_qs /= max_ing_quantities
+        input_model = torch.FloatTensor(ing_qs).reshape(1, -1)
+        _, outputs, auxiliaries_str = model.forward(input_model)
+        if isinstance(aux_str, str):
+            return outputs[auxiliaries_str.index(aux_str)].detach().numpy()
+        elif isinstance(aux_str, list):
+            return [outputs[auxiliaries_str.index(aux)].detach().numpy() for aux in aux_str]
+        else:
+            raise ValueError
+    return predict, params
+
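Aside: a hypothetical usage sketch for the returned closure (the checkpoint directory is a placeholder; actual head names come from params['auxiliaries_dict']):

predict, params = get_model('experiments/multi_predictor/')   # placeholder path
ing_qs = np.random.uniform(0, 30, size=params['input_dim'])   # fake recipe, quantities in ml
glass_logits = predict(ing_qs.copy(), 'glasses')              # copy: predict normalizes in place
volume, prep = predict(ing_qs.copy(), ['volume', 'prep_type'])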
+
+def compute_expe_name_and_save_path(params):
+    weights_str = '['
+    for aux in params['auxiliaries_dict'].keys():
+        weights_str += f'{params["auxiliaries_dict"][aux]["weight"]}, '
+    weights_str = weights_str[:-2] + ']'
+    save_path = params['save_path'] + params["trial_id"]
+    save_path += f'_lr{params["lr"]}'
+    save_path += f'_betavae{params["beta_vae"]}'
+    save_path += f'_bs{params["batch_size"]}'
+    save_path += f'_latentdim{params["latent_dim"]}'
+    save_path += f'_hding{params["hidden_dims_ingredients"]}'
+    save_path += f'_hdcocktail{params["hidden_dims_cocktail"]}'
+    save_path += f'_hddecoder{params["hidden_dims_decoder"]}'
+    save_path += f'_agg{params["agg"]}'
+    save_path += f'_activ{params["activation"]}'
+    save_path += f'_w{weights_str}'
+    counter = 0
+    while os.path.exists(save_path + f"_{counter}"):  # avoid overwriting previous runs
+        counter += 1
+    save_path = save_path + f"_{counter}" + '/'
+    params["save_path"] = save_path
+    os.makedirs(save_path)
+    os.makedirs(save_path + 'plots/')
+    params['plot_path'] = save_path + 'plots/'
+    print(f'logging to {save_path}')
+    return params
+
+
+if __name__ == '__main__':
+    params = get_params()
+    run_experiment(params)
+
src/cocktails/representation_learning/simple_model.py ADDED
@@ -0,0 +1,54 @@
+import torch; torch.manual_seed(0)
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils
+import torch.distributions
+import matplotlib.pyplot as plt; plt.rcParams['figure.dpi'] = 200
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+def get_activation(activation):
+    if activation == 'tanh':
+        activ = F.tanh
+    elif activation == 'relu':
+        activ = F.relu
+    elif activation == 'mish':
+        activ = F.mish
+    elif activation == 'sigmoid':
+        activ = torch.sigmoid
+    elif activation == 'leakyrelu':
+        activ = F.leaky_relu
+    elif activation == 'exp':
+        activ = torch.exp
+    else:
+        raise ValueError
+    return activ
+
+
+class SimpleNet(nn.Module):
+    def __init__(self, input_dim, hidden_dims, output_dim, activation, dropout, final_activ=None):
+        super(SimpleNet, self).__init__()
+        self.linears = nn.ModuleList()
+        self.dropouts = nn.ModuleList()
+        self.output_dim = output_dim
+        dims = [input_dim] + hidden_dims + [output_dim]
+        for d_in, d_out in zip(dims[:-1], dims[1:]):
+            self.linears.append(nn.Linear(d_in, d_out))
+            self.dropouts.append(nn.Dropout(dropout))
+        self.activation = get_activation(activation)
+        self.n_layers = len(self.linears)
+        self.layer_range = range(self.n_layers)
+        if final_activ is not None:
+            self.final_activ = get_activation(final_activ)
+            self.use_final_activ = True
+        else:
+            self.use_final_activ = False
+
+    def forward(self, x):
+        for i_layer, layer, dropout in zip(self.layer_range, self.linears, self.dropouts):
+            x = layer(x)
+            if i_layer != self.n_layers - 1:  # no dropout/activation on the output layer
+                x = self.activation(dropout(x))
+        if self.use_final_activ: x = self.final_activ(x)
+        return x
+
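Aside: a minimal usage sketch of SimpleNet, with made-up dimensions (assumes torch and SimpleNet from this module are in scope):

net = SimpleNet(input_dim=50, hidden_dims=[128, 64], output_dim=13,
                activation='relu', dropout=0.1, final_activ='tanh')
x = torch.randn(8, 50)   # batch of 8 fake ingredient vectors
y = net(x)               # shape (8, 13)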
src/cocktails/representation_learning/vae_model.py ADDED
@@ -0,0 +1,238 @@
+import torch; torch.manual_seed(0)
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils
+import torch.distributions
+import matplotlib.pyplot as plt; plt.rcParams['figure.dpi'] = 200
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+def get_activation(activation):
+    if activation == 'tanh':
+        activ = F.tanh
+    elif activation == 'relu':
+        activ = F.relu
+    elif activation == 'mish':
+        activ = F.mish
+    elif activation == 'sigmoid':
+        activ = torch.sigmoid  # F.sigmoid is deprecated
+    elif activation == 'leakyrelu':
+        activ = F.leaky_relu
+    elif activation == 'exp':
+        activ = torch.exp
+    else:
+        raise ValueError
+    return activ
+
+class IngredientEncoder(nn.Module):
+    def __init__(self, input_dim, deepset_latent_dim, hidden_dims, activation, dropout):
+        super(IngredientEncoder, self).__init__()
+        self.linears = nn.ModuleList()
+        self.dropouts = nn.ModuleList()
+        dims = [input_dim] + hidden_dims + [deepset_latent_dim]
+        for d_in, d_out in zip(dims[:-1], dims[1:]):
+            self.linears.append(nn.Linear(d_in, d_out))
+            self.dropouts.append(nn.Dropout(dropout))
+        self.activation = get_activation(activation)
+        self.n_layers = len(self.linears)
+        self.layer_range = range(self.n_layers)
+
+    def forward(self, x):
+        for i_layer, layer, dropout in zip(self.layer_range, self.linears, self.dropouts):
+            x = layer(x)
+            if i_layer != self.n_layers - 1:  # no dropout/activation on the last layer
+                x = self.activation(dropout(x))
+        return x
+
+class DeepsetCocktailEncoder(nn.Module):
+    def __init__(self, input_dim, deepset_latent_dim, hidden_dims_ing, activation,
+                 hidden_dims_cocktail, latent_dim, aggregation, dropout):
+        super(DeepsetCocktailEncoder, self).__init__()
+        self.input_dim = input_dim  # dimension of ingredient representation + quantity
+        self.ingredient_encoder = IngredientEncoder(input_dim, deepset_latent_dim, hidden_dims_ing, activation, dropout)  # encode each ingredient separately
+        self.deepset_latent_dim = deepset_latent_dim  # dimension of the deepset aggregation
+        self.aggregation = aggregation
+        self.latent_dim = latent_dim
+        # post-aggregation network
+        self.linears = nn.ModuleList()
+        self.dropouts = nn.ModuleList()
+        dims = [deepset_latent_dim] + hidden_dims_cocktail
+        for d_in, d_out in zip(dims[:-1], dims[1:]):
+            self.linears.append(nn.Linear(d_in, d_out))
+            self.dropouts.append(nn.Dropout(dropout))
+        self.FC_mean = nn.Linear(hidden_dims_cocktail[-1], latent_dim)
+        self.FC_logvar = nn.Linear(hidden_dims_cocktail[-1], latent_dim)
+        self.softplus = nn.Softplus()
+
+        self.activation = get_activation(activation)
+        self.n_layers = len(self.linears)
+        self.layer_range = range(self.n_layers)
+
+    def forward(self, nb_ingredients, x):
+        # reshape x in (batch size * nb ingredients, dim_ing_rep)
+        batch_size = x.shape[0]
+        all_ingredients = []
+        for i in range(batch_size):
+            for j in range(nb_ingredients[i]):
+                all_ingredients.append(x[i, self.input_dim * j: self.input_dim * (j + 1)].reshape(1, -1))
+        x = torch.cat(all_ingredients, dim=0)
+        # encode ingredients in parallel
+        ingredients_encodings = self.ingredient_encoder(x)
+        assert ingredients_encodings.shape == (torch.sum(nb_ingredients), self.deepset_latent_dim)
+
+        # aggregate each cocktail's ingredient encodings (permutation-invariant)
+        x = []
+        index_first = 0
+        for i in range(batch_size):
+            index_last = index_first + nb_ingredients[i]
+            if self.aggregation == 'sum':
+                x.append(torch.sum(ingredients_encodings[index_first:index_last], dim=0).reshape(1, -1))
+            elif self.aggregation == 'mean':
+                x.append(torch.mean(ingredients_encodings[index_first:index_last], dim=0).reshape(1, -1))
+            else:
+                raise ValueError
+            index_first = index_last
+        x = torch.cat(x, dim=0)
+        assert x.shape[0] == batch_size
+
+        for i_layer, layer, dropout in zip(self.layer_range, self.linears, self.dropouts):
+            x = self.activation(dropout(layer(x)))
+        mean = self.FC_mean(x)
+        logvar = self.FC_logvar(x)
+        return mean, logvar
+
+class Decoder(nn.Module):
+    def __init__(self, latent_dim, hidden_dims, num_ingredients, activation, dropout, filter_output=None):
+        super(Decoder, self).__init__()
+        self.linears = nn.ModuleList()
+        self.dropouts = nn.ModuleList()
+        dims = [latent_dim] + hidden_dims + [num_ingredients]
+        for d_in, d_out in zip(dims[:-1], dims[1:]):
+            self.linears.append(nn.Linear(d_in, d_out))
+            self.dropouts.append(nn.Dropout(dropout))
+        self.activation = get_activation(activation)
+        self.n_layers = len(self.linears)
+        self.layer_range = range(self.n_layers)
+        self.filter = filter_output
+
+    def forward(self, x, to_filter=False):
+        for i_layer, layer, dropout in zip(self.layer_range, self.linears, self.dropouts):
+            x = layer(x)
+            if i_layer != self.n_layers - 1:
+                x = self.activation(dropout(x))
+        if to_filter:
+            x = self.filter(x)
+        return x
+
+class PredictorHead(nn.Module):
+    def __init__(self, latent_dim, dim_output, final_activ):
+        super(PredictorHead, self).__init__()
+        self.linear = nn.Linear(latent_dim, dim_output)
+        if final_activ is not None:
+            self.final_activ = get_activation(final_activ)
+            self.use_final_activ = True
+        else:
+            self.use_final_activ = False
+
+    def forward(self, x):
+        x = self.linear(x)
+        if self.use_final_activ: x = self.final_activ(x)
+        return x
+
+
+class VAEModel(nn.Module):
+    def __init__(self, encoder, decoder, auxiliaries_dict):
+        super(VAEModel, self).__init__()
+        self.encoder = encoder
+        self.decoder = decoder
+        self.latent_dim = self.encoder.latent_dim
+        self.auxiliaries_str = []
+        self.auxiliaries = nn.ModuleList()
+        for aux_str in sorted(auxiliaries_dict.keys()):
+            if aux_str == 'taste_reps':
+                self.taste_reps_decoder = PredictorHead(self.latent_dim, auxiliaries_dict[aux_str]['dim_output'], auxiliaries_dict[aux_str]['final_activ'])
+            else:
+                self.auxiliaries_str.append(aux_str)
+                self.auxiliaries.append(PredictorHead(self.latent_dim, auxiliaries_dict[aux_str]['dim_output'], auxiliaries_dict[aux_str]['final_activ']))
+
+    def reparameterization(self, mean, logvar):
+        std = torch.exp(0.5 * logvar)  # log var -> std
+        epsilon = torch.randn_like(std).to(device)  # sampling epsilon
+        z = mean + std * epsilon  # reparameterization trick
+        return z
+
+    def sample(self, n=1):
+        z = torch.randn(size=(n, self.latent_dim))
+        return self.decoder(z)
+
+    def get_all_auxiliaries(self, x):
+        return [aux(x) for aux in self.auxiliaries]
+
+    def get_auxiliary(self, z, aux_str):
+        if aux_str == 'taste_reps':
+            return self.taste_reps_decoder(z)
+        else:
+            index = self.auxiliaries_str.index(aux_str)
+            return self.auxiliaries[index](z)
+
+    def forward_direct(self, x, aux_str=None, to_filter=False):
+        mean, logvar = self.encoder(x)
+        z = self.reparameterization(mean, logvar)
+        x_hat = self.decoder(mean, to_filter=to_filter)  # decode from the posterior mean, not the sampled z
+        if aux_str is not None:
+            return x_hat, z, mean, logvar, self.get_auxiliary(z, aux_str), [aux_str]
+        else:
+            return x_hat, z, mean, logvar, self.get_all_auxiliaries(z), self.auxiliaries_str
+
+    def forward(self, nb_ingredients, x, aux_str=None, to_filter=False):
+        assert False  # deepset path deliberately disabled; use forward_direct with the SimpleEncoder
+        mean, std = self.encoder(nb_ingredients, x)
+        z = self.reparameterization(mean, std)
+        x_hat = self.decoder(mean, to_filter=to_filter)
+        if aux_str is not None:
+            return x_hat, z, mean, std, self.get_auxiliary(z, aux_str), [aux_str]
+        else:
+            return x_hat, z, mean, std, self.get_all_auxiliaries(z), self.auxiliaries_str
+
+
+class SimpleEncoder(nn.Module):
+    def __init__(self, input_dim, hidden_dims, latent_dim, activation, dropout):
+        super(SimpleEncoder, self).__init__()
+        self.latent_dim = latent_dim
+        self.linears = nn.ModuleList()
+        self.dropouts = nn.ModuleList()
+        dims = [input_dim] + hidden_dims
+        for d_in, d_out in zip(dims[:-1], dims[1:]):
+            self.linears.append(nn.Linear(d_in, d_out))
+            self.dropouts.append(nn.Dropout(dropout))
+        self.FC_mean = nn.Linear(hidden_dims[-1], latent_dim)
+        self.FC_logvar = nn.Linear(hidden_dims[-1], latent_dim)
+
+        self.activation = get_activation(activation)
+        self.n_layers = len(self.linears)
+        self.layer_range = range(self.n_layers)
+
+    def forward(self, x):
+        for i_layer, layer, dropout in zip(self.layer_range, self.linears, self.dropouts):
+            x = self.activation(dropout(layer(x)))
+        mean = self.FC_mean(x)
+        logvar = self.FC_logvar(x)
+        return mean, logvar
+
+def get_vae_model(input_dim, deepset_latent_dim, hidden_dims_ing, activation,
+                  hidden_dims_cocktail, hidden_dims_decoder, num_ingredients, latent_dim, aggregation, dropout, auxiliaries_dict,
+                  filter_decoder_output):
+    # the DeepsetCocktailEncoder is kept above for reference; the simple MLP encoder is used in practice
+    encoder = SimpleEncoder(num_ingredients, hidden_dims_cocktail, latent_dim, activation, dropout)
+    decoder = Decoder(latent_dim, hidden_dims_decoder, num_ingredients, activation, dropout, filter_output=filter_decoder_output)
+    vae = VAEModel(encoder, decoder, auxiliaries_dict)
+    return vae
src/cocktails/utilities/__init__.py ADDED
File without changes
src/cocktails/utilities/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (208 Bytes).
src/cocktails/utilities/__pycache__/cocktail_category_detection_utilities.cpython-39.pyc ADDED
Binary file (9.62 kB).
src/cocktails/utilities/__pycache__/cocktail_utilities.cpython-39.pyc ADDED
Binary file (8.12 kB).
src/cocktails/utilities/__pycache__/glass_and_volume_utilities.cpython-39.pyc ADDED
Binary file (1.19 kB).
src/cocktails/utilities/__pycache__/ingredients_utilities.cpython-39.pyc ADDED
Binary file (6.86 kB).
src/cocktails/utilities/__pycache__/other_scrubbing_utilities.cpython-39.pyc ADDED
Binary file (8.55 kB).
src/cocktails/utilities/analysis_utilities.py ADDED
@@ -0,0 +1,189 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+from src.cocktails.utilities.ingredients_utilities import ingredient_list, extract_ingredients, ingredients_per_type
+
+color_codes = dict(ancestral='#000000',
+                   spirit_forward='#2320D2',
+                   duo='#6E20D2',
+                   champagne_cocktail='#25FFCA',
+                   complex_highball='#068F25',
+                   simple_highball='#25FF57',
+                   collins='#77FF96',
+                   julep='#25B8FF',
+                   simple_sour='#FBD756',
+                   complex_sour='#DCAD07',
+                   simple_sour_with_juice='#FF5033',
+                   complex_sour_with_juice='#D42306',
+                   # simple_sour_with_egg='#FF9C54',
+                   # complex_sour_with_egg='#CF5700',
+                   other='#9B9B9B')
+
+def get_subcategories(data):
+    subcategories = np.array(data['subcategory'])
+    sub_categories_list = sorted(set(subcategories))
+    subcat_count = dict(zip(sub_categories_list, [0] * len(sub_categories_list)))
+    for sc in data['subcategory']:
+        subcat_count[sc] += 1
+    return subcategories, sub_categories_list, subcat_count
+
+def get_ingredient_count(data):
+    ingredient_counts = dict(zip(ingredient_list, [0] * len(ingredient_list)))
+    for ing_str in data['ingredients_str']:
+        ingredients, _ = extract_ingredients(ing_str)
+        for ing in ingredients:
+            ingredient_counts[ing] += 1
+    return ingredient_counts
+
+def compute_eucl_dist(a, b):
+    return np.sqrt(np.sum((a - b)**2))
+
+def recipe_contains(ingredients, stuff):
+    if stuff in ingredient_list:
+        return stuff in ingredients
+    elif stuff == 'juice':
+        return any(['juice' in ing and 'lemon' not in ing and 'lime' not in ing for ing in ingredients])
+    elif stuff == 'bubbles':
+        return any([ing in ['soda', 'tonic', 'cola', 'sparkling wine', 'ginger beer'] for ing in ingredients])
+    elif stuff == 'acid':
+        return any([ing in ['lemon juice', 'lime juice'] for ing in ingredients])
+    elif stuff == 'vermouth':
+        return any([ing in ingredients_per_type['vermouth'] for ing in ingredients])
+    elif stuff == 'plain sweet':
+        plain_sweet = ingredients_per_type['sweeteners']
+        return any([ing in plain_sweet for ing in ingredients])
+    elif stuff == 'sweet':
+        sweet = ingredients_per_type['sweeteners'] + ingredients_per_type['liqueur'] + ['sweet vermouth', 'lillet blanc']
+        return any([ing in sweet for ing in ingredients])
+    elif stuff == 'spirit':
+        return any([ing in ingredients_per_type['liquor'] for ing in ingredients])
+    else:
+        raise ValueError
+
+
+def radar_factory(num_vars, frame='circle'):
+    # adapted from the matplotlib radar chart example
+    """
+    Create a radar chart with `num_vars` axes.
+
+    This function creates a RadarAxes projection and registers it.
+
+    Parameters
+    ----------
+    num_vars : int
+        Number of variables for radar chart.
+    frame : {'circle', 'polygon'}
+        Shape of frame surrounding axes.
+    """
+    from matplotlib.patches import Circle, RegularPolygon
+    from matplotlib.path import Path
+    from matplotlib.projections.polar import PolarAxes
+    from matplotlib.projections import register_projection
+    from matplotlib.spines import Spine
+    from matplotlib.transforms import Affine2D
+    # calculate evenly-spaced axis angles
+    theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)
+
+    class RadarAxes(PolarAxes):
+
+        name = 'radar'
+        # use 1 line segment to connect specified points
+        RESOLUTION = 1
+
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+            # rotate plot such that the first axis is at the top
+            self.set_theta_zero_location('N')
+
+        def fill(self, *args, closed=True, **kwargs):
+            """Override fill so that line is closed by default"""
+            return super().fill(closed=closed, *args, **kwargs)
+
+        def plot(self, *args, **kwargs):
+            """Override plot so that line is closed by default"""
+            lines = super().plot(*args, **kwargs)
+            for line in lines:
+                self._close_line(line)
+
+        def _close_line(self, line):
+            x, y = line.get_data()
+            # FIXME: markers at x[0], y[0] get doubled-up
+            if x[0] != x[-1]:
+                x = np.append(x, x[0])
+                y = np.append(y, y[0])
+                line.set_data(x, y)
+
+        def set_varlabels(self, labels):
+            self.set_thetagrids(np.degrees(theta), labels)
+
+        def _gen_axes_patch(self):
+            # The Axes patch must be centered at (0.5, 0.5) and of radius 0.5
+            # in axes coordinates.
+            if frame == 'circle':
+                return Circle((0.5, 0.5), 0.5)
+            elif frame == 'polygon':
+                return RegularPolygon((0.5, 0.5), num_vars, radius=.5, edgecolor="k")
+            else:
+                raise ValueError("Unknown value for 'frame': %s" % frame)
+
+        def _gen_axes_spines(self):
+            if frame == 'circle':
+                return super()._gen_axes_spines()
+            elif frame == 'polygon':
+                # spine_type must be 'left'/'right'/'top'/'bottom'/'circle'.
+                spine = Spine(axes=self,
+                              spine_type='circle',
+                              path=Path.unit_regular_polygon(num_vars))
+                # unit_regular_polygon gives a polygon of radius 1 centered at
+                # (0, 0) but we want a polygon of radius 0.5 centered at (0.5,
+                # 0.5) in axes coordinates.
+                spine.set_transform(Affine2D().scale(.5).translate(.5, .5)
+                                    + self.transAxes)
+                return {'polar': spine}
+            else:
+                raise ValueError("Unknown value for 'frame': %s" % frame)
+
+    register_projection(RadarAxes)
+    return theta
+
+def plot_radar_cocktail(representation, labels_dim, labels_cocktails, save_path=None, to_show=False, to_save=False):
+    assert to_show or to_save, 'either show or save'
+    assert representation.ndim == 2
+    n_data, dim_rep = representation.shape
+    assert len(labels_cocktails) == n_data
+    assert len(labels_dim) == dim_rep
+    assert n_data <= 5, 'at most 5 representations can be plotted at once'
+
+    theta = radar_factory(dim_rep, frame='circle')
+
+    fig, ax = plt.subplots(figsize=(9, 9), subplot_kw=dict(projection='radar'))
+    fig.subplots_adjust(wspace=0.25, hspace=0.20, top=0.85, bottom=0.05)
+
+    colors = ['b', 'r', 'g', 'm', 'y']
+    ax.set_rgrids([0.2, 0.4, 0.6, 0.8])
+    for d, color in zip(representation, colors):
+        ax.plot(theta, d, color=color)
+    for d, color in zip(representation, colors):
+        ax.fill(theta, d, facecolor=color, alpha=0.25)
+    ax.set_varlabels(labels_dim)
+
+    # add legend relative to top-left plot
+    legend = ax.legend(labels_cocktails, loc=(0.9, .95),
+                       labelspacing=0.1, fontsize='small')
+
+    if to_save:
+        plt.savefig(save_path, bbox_extra_artists=(legend,), bbox_inches='tight', dpi=200)
+    else:
+        plt.show()
+
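Aside: a hypothetical call, with made-up representation values scaled to [0, 1] (the dimension labels follow the taste keys used elsewhere in the repo):

reps = np.array([[0.2, 0.8, 0.5, 0.1, 0.6, 0.3],
                 [0.7, 0.2, 0.4, 0.9, 0.1, 0.5]])
plot_radar_cocktail(reps,
                    labels_dim=['sour', 'sweet', 'booze', 'bitter', 'fruit', 'herb'],
                    labels_cocktails=['daiquiri', 'negroni'],
                    to_show=True)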
src/cocktails/utilities/cocktail_category_detection_utilities.py ADDED
@@ -0,0 +1,221 @@
+# The following functions check whether a cocktail belongs to any of N categories
+import numpy as np
+from src.cocktails.utilities.ingredients_utilities import ingredient_profiles, ingredients_per_type, ingredient2ingredient_id, extract_ingredients
+
+
+def is_ancestral(n, ingredient_indexes, ingredients, quantities):
+    # ancestrals have a strong spirit and some sweetness from sugar, syrup or liqueurs, no citrus.
+    # absinthe can be added up to 3 dashes.
+    # liqueurs are there to bring sweetness, thus must stay below 15 ml (above that it's a duo)
+    if n['spirit'] > 0 and n['citrus'] == 0 and n['plain_sweet'] + n['liqueur'] <= 2:
+        if n['spirit'] > 1 and 'absinthe' in ingredients:
+            if quantities[ingredients.index('absinthe')] < 3:
+                pass
+            else:
+                return False
+        if n['sugar'] < 2 and n['liqueur'] < 3:
+            if n['all'] - n['spirit'] - n['sugar'] - n['syrup'] - n['liqueur'] - n['inconsequentials'] == 0:
+                if n['liqueur'] == 0:
+                    return True
+                else:
+                    q_liqueur = np.sum([quantities[i_ing]
+                                        for i_ind, i_ing in zip(ingredient_indexes, range(len(ingredients)))
+                                        if ingredient_profiles['type'][i_ind].lower() == 'liqueur'])
+                    if q_liqueur <= 15:
+                        return True
+                    else:
+                        return False
+    return False
+
+
+def is_simple_sour(n, ingredient_indexes, ingredients, quantities):
+    # simple sours contain a citrus, at least 1 spirit and non-alcoholic sweetness
+    if n['citrus'] + n['coffee'] > 0 and n['spirit'] > 0 and n['plain_sweet'] > 0 and n['juice'] == 0:
+        if n['all'] - n['citrus'] - n['coffee'] - n['spirit'] - n['plain_sweet'] - n['juice'] - n['egg'] - n['inconsequentials'] == 0:
+            return True
+    return False
+
+def is_complex_sour(n, ingredient_indexes, ingredients, quantities):
+    # complex sours are simple sours that use alcoholic sweetness, at least in part
+    if n['citrus'] + n['coffee'] > 0 and n['all_sweet'] > 0 and n['juice'] == 0:
+        if (n['spirit'] == 0 and n['liqueur'] > 0) or n['spirit'] > 0:
+            if 0 < n['vermouth'] + n['liqueur'] <= 2:
+                if n['all'] - n['coffee'] - n['citrus'] - n['spirit'] - n['sugar'] - n['syrup'] \
+                        - n['liqueur'] - n['vermouth'] - n['egg'] - n['juice'] - n['inconsequentials'] == 0:
+                    return True
+    return False
+
+def is_spirit_forward(n, ingredient_indexes, ingredients, quantities):
+    # spirit forwards contain at least a spirit and vermouth, no citrus. Can contain sweet (sugar, syrups, liqueurs)
+    if n['spirit'] > 0 and n['citrus'] == 0 and n['vermouth'] > 0:
+        if n['all'] - n['spirit'] - n['sugar'] - n['syrup'] - n['liqueur'] - n['egg'] - n['vermouth'] - n['inconsequentials'] == 0:
+            return True
+    return False
+
+def is_duo(n, ingredient_indexes, ingredients, quantities):
+    # duos are made of one spirit and one liqueur (above 15 ml; below that it's an ancestral), no citrus.
+    if n['spirit'] >= 1 and n['citrus'] == 0 and n['sugar'] == 0 and n['liqueur'] > 0 and n['vermouth'] == 0:
+        if n['all'] - n['spirit'] - n['sugar'] - n['liqueur'] - n['vermouth'] - n['inconsequentials'] == 0:
+            q_liqueur = np.sum([quantities[i_ing]
+                                for i_ind, i_ing in zip(ingredient_indexes, range(len(ingredients)))
+                                if ingredient_profiles['type'][i_ind].lower() == 'liqueur'])
+            if q_liqueur > 15:
+                return True
+            else:
+                return False
+    return False
+
+def is_champagne_cocktail(n, ingredient_indexes, ingredients, quantities):
+    return n['sparkling'] > 0
+
+def is_simple_highball(n, ingredient_indexes, ingredients, quantities):
+    # simple highballs have one alcoholic ingredient and bubbles
+    if n['alcoholic'] == 1 and n['bubbles'] > 0:
+        if n['all'] - n['alcoholic'] - n['bubbles'] - n['inconsequentials'] == 0:
+            return True
+    return False
+
+def is_complex_highball(n, ingredient_indexes, ingredients, quantities):
+    # complex highballs have at least one alcoholic ingredient and bubbles (possibly alcoholic).
+    # They also contain extra sugar under any form, and juice
+    if n['alcoholic'] > 0 and (n['bubbles'] + n['sparkling']) == 1 and n['juice'] + n['all_sweet'] + n['sugar_bubbles'] > 0:
+        if n['all'] - n['spirit'] - n['bubbles'] - n['sparkling'] - n['citrus'] - n['juice'] - n['liqueur'] \
+                - n['syrup'] - n['sugar'] - n['vermouth'] - n['egg'] - n['inconsequentials'] == 0:
+            if not is_collins(n, ingredient_indexes, ingredients, quantities) and not is_simple_highball(n, ingredient_indexes, ingredients, quantities):
+                return True
+    return False
+
+def is_collins(n, ingredient_indexes, ingredients, quantities):
+    # collins are a particular kind of highball with sugar and citrus
+    if n['alcoholic'] == 1 and n['bubbles'] == 1 and n['citrus'] > 0 and n['plain_sweet'] + n['sugar_bubbles'] > 0:
+        if n['all'] - n['spirit'] - n['bubbles'] - n['citrus'] - n['sugar'] - n['inconsequentials'] == 0:
+            return True
+    return False
+
+def is_julep(n, ingredient_indexes, ingredients, quantities):
+    # juleps involve smashed mint, sugar and a spirit, no citrus.
+    if 'mint' in ingredients and n['sugar'] > 0 and n['spirit'] > 0 and n['vermouth'] == 0 and n['citrus'] == 0:
+        return True
+    return False
+
+def is_simple_sour_with_juice(n, ingredient_indexes, ingredients, quantities):
+    # simple sours with juice are simple sours that also contain juice
+    if n['juice'] > 0 and n['spirit'] > 0 and n['plain_sweet'] > 0:
+        if n['all'] - n['citrus'] - n['coffee'] - n['juice'] - n['spirit'] - n['sugar'] - n['syrup'] - n['egg'] - n['inconsequentials'] == 0:
+            return True
+    return False
+
+def is_complex_sour_with_juice(n, ingredient_indexes, ingredients, quantities):
+    # complex sours with juice are complex sours that also contain juice
+    if n['juice'] > 0 and n['all_sweet'] > 0:
+        if (n['spirit'] == 0 and n['liqueur'] > 0) or n['spirit'] > 0:
+            if 0 < n['vermouth'] + n['liqueur'] <= 2:
+                if n['all'] - n['coffee'] - n['citrus'] - n['spirit'] - n['sugar'] - n['syrup'] \
+                        - n['liqueur'] - n['vermouth'] - n['egg'] - n['juice'] - n['inconsequentials'] == 0:
+                    return True
+    return False
+
+
+is_sub_category = [is_ancestral, is_complex_sour, is_simple_sour, is_duo, is_champagne_cocktail,
+                   is_spirit_forward, is_simple_highball, is_complex_highball, is_collins,
+                   is_julep, is_simple_sour_with_juice, is_complex_sour_with_juice]
+sub_categories = ['ancestral', 'complex_sour', 'simple_sour', 'duo', 'champagne_cocktail',
+                  'spirit_forward', 'simple_highball', 'complex_highball', 'collins',
+                  'julep', 'simple_sour_with_juice', 'complex_sour_with_juice']
+
+
+# compute the cocktail category as a function of ingredients and quantities; uses the name to check the match
+# between name and category (e.g. XXX Collins should be a collins).
+# Category definitions are based on https://www.seriouseats.com/cocktail-style-guide-categories-of-cocktails-glossary-families-of-drinks
+def find_cocktail_sub_category(ingredients, quantities, name=None):
+    ingredient_indexes = [ingredient2ingredient_id[ing] for ing in ingredients]
+    n_spirit = np.sum([ingredient_profiles['type'][i].lower() == 'liquor' for i in ingredient_indexes])
+    n_citrus = np.sum([ingredient_profiles['type'][i].lower() == 'acid' for i in ingredient_indexes])
+    n_sugar = np.sum([ingredient_profiles['ingredient'][i].lower() in ['double syrup', 'simple syrup', 'honey syrup'] for i in ingredient_indexes])
+    plain_sweet = ingredients_per_type['sweeteners']
+    all_sweet = ingredients_per_type['sweeteners'] + ingredients_per_type['liqueur'] + ['sweet vermouth', 'lillet blanc']
+    n_plain_sweet = np.sum([ingredient_profiles['ingredient'][i].lower() in plain_sweet for i in ingredient_indexes])
+    n_all_sweet = np.sum([ingredient_profiles['ingredient'][i].lower() in all_sweet for i in ingredient_indexes])
+    n_sugar_bubbles = np.sum([ingredient_profiles['ingredient'][i].lower() in ['cola', 'ginger beer', 'tonic'] for i in ingredient_indexes])
+    n_juice = np.sum([ingredient_profiles['type'][i].lower() == 'juice' for i in ingredient_indexes])
+    n_liqueur = np.sum([ingredient_profiles['type'][i].lower() == 'liqueur' for i in ingredient_indexes])
+    alcoholic = ingredients_per_type['liquor'] + ingredients_per_type['liqueur'] + ingredients_per_type['vermouth']
+    n_alcoholic = np.sum([ingredient_profiles['ingredient'][i].lower() in alcoholic for i in ingredient_indexes])
+    n_bitter = np.sum([ingredient_profiles['type'][i].lower() == 'bitters' for i in ingredient_indexes])
+    n_egg = np.sum([ingredient_profiles['ingredient'][i].lower() == 'egg' for i in ingredient_indexes])
+    n_vermouth = np.sum([ingredient_profiles['type'][i].lower() == 'vermouth' for i in ingredient_indexes])
+    n_sparkling = np.sum([ingredient_profiles['ingredient'][i].lower() == 'sparkling wine' for i in ingredient_indexes])
+    n_bubbles = np.sum([ingredient_profiles['ingredient'][i].lower() in ['soda', 'tonic', 'cola', 'ginger beer'] for i in ingredient_indexes])
+    n_syrup = np.sum([ingredient_profiles['ingredient'][i].lower() in ['grenadine', 'raspberry syrup'] for i in ingredient_indexes])
+    n_coffee = np.sum([ingredient_profiles['ingredient'][i].lower() == 'espresso' for i in ingredient_indexes])
+    inconsequentials = ['water', 'salt', 'angostura', 'orange bitters', 'mint']
+    n_inconsequentials = np.sum([ingredient_profiles['ingredient'][i].lower() in inconsequentials for i in ingredient_indexes])
+    n = dict(all=len(ingredients),
+             inconsequentials=n_inconsequentials,
+             sugar_bubbles=n_sugar_bubbles,
+             bubbles=n_bubbles,
+             plain_sweet=n_plain_sweet,
+             all_sweet=n_all_sweet,
+             coffee=n_coffee,
+             alcoholic=n_alcoholic,
+             syrup=n_syrup,
+             sparkling=n_sparkling,
+             sugar=n_sugar,
+             spirit=n_spirit,
+             citrus=n_citrus,
+             juice=n_juice,
+             liqueur=n_liqueur,
+             bitter=n_bitter,
+             egg=n_egg,
+             vermouth=n_vermouth)
+
+    sub_cats = [c for c, test_c in zip(sub_categories, is_sub_category) if test_c(n, ingredient_indexes, ingredients, quantities)]
+    if name is not None:
+        name = name.lower()
+        keywords_to_test = ['julep', 'collins', 'highball', 'sour', 'champagne']
+        for k in keywords_to_test:
+            if k in name and not any([k in cat for cat in sub_cats]):
+                print(k)
+                for ing, q in zip(ingredients, quantities):
+                    print(f'{ing}: {q} ml')
+                print(n)
+                break
+    # resolve ambiguous cases where several tests fire
+    if sorted(sub_cats) == ['champagne_cocktail', 'complex_highball']:
+        sub_cats = ['champagne_cocktail']
+    elif sorted(sub_cats) == ['collins', 'complex_highball']:
+        sub_cats = ['collins']
+    elif sorted(sub_cats) == ['champagne_cocktail', 'complex_highball', 'julep']:
+        sub_cats = ['champagne_cocktail']
+    elif sorted(sub_cats) == ['ancestral', 'julep']:
+        sub_cats = ['julep']
+    elif sorted(sub_cats) == ['complex_highball', 'julep']:
+        sub_cats = ['complex_highball']
+    elif sorted(sub_cats) == ['julep', 'simple_sour_with_juice']:
+        sub_cats = ['simple_sour_with_juice']
+    elif sorted(sub_cats) == ['complex_sour_with_juice', 'julep']:
+        sub_cats = ['complex_sour_with_juice']
+    if len(sub_cats) != 1:  # anything still ambiguous or unmatched falls back to 'other'
+        sub_cats = ['other']
+    assert len(sub_cats) == 1, sub_cats
+    return sub_cats[0], n
+
+def get_cocktails_attributes(ing_strs):
+    attributes = dict()
+    cats = []
+    for ing_str in ing_strs:
+        ingredients, quantities = extract_ingredients(ing_str)
+        cat, atts = find_cocktail_sub_category(ingredients, quantities)
+        for k in atts.keys():
+            if k not in attributes.keys():
+                attributes[k] = [atts[k]]
+            else:
+                attributes[k].append(atts[k])
+        cats.append(cat)
+    return cats, attributes
src/cocktails/utilities/cocktail_generation_utilities/__init__.py ADDED
File without changes
src/cocktails/utilities/cocktail_generation_utilities/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (238 Bytes).
src/cocktails/utilities/cocktail_generation_utilities/__pycache__/individual.cpython-39.pyc ADDED
Binary file (20.2 kB).
src/cocktails/utilities/cocktail_generation_utilities/__pycache__/population.cpython-39.pyc ADDED
Binary file (8.36 kB).
src/cocktails/utilities/cocktail_generation_utilities/individual.py ADDED
@@ -0,0 +1,587 @@
+ from src.cocktails.utilities.ingredients_utilities import get_ingredients_info, format_ingredients, extract_ingredients, ingredients_per_type, bubble_ingredients
2
+ import numpy as np
3
+ from src.cocktails.utilities.other_scrubbing_utilities import print_recipe
4
+ from src.cocktails.utilities.cocktail_utilities import get_cocktail_rep, get_profile, get_bunch_of_rep_keys
5
+ from src.cocktails.utilities.glass_and_volume_utilities import glass_volume
6
+ from src.cocktails.representation_learning.run import get_model
7
+ from src.cocktails.pipeline.get_cocktail2affective_cluster import get_cocktail2affective_cluster
8
+ from src.cocktails.config import COCKTAILS_CSV_DATA, FULL_COCKTAIL_REP_PATH, REPO_PATH, COCKTAIL_REP_CHKPT_PATH, RECIPE2FEATURES_PATH
9
+ from src.cocktails.representation_learning.run_without_vae import get_model
10
+ from src.cocktails.utilities.cocktail_category_detection_utilities import find_cocktail_sub_category
11
+
12
+ import pandas as pd
13
+ import torch
14
+ import time
15
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
16
+
17
+ density_ingredients = np.loadtxt(COCKTAIL_REP_CHKPT_PATH + 'density_ingredients.txt')
18
+ max_ingredients, ingredient_list, ind_alcohol = get_ingredients_info()
19
+ min_ingredients = 2
20
+ factor_max = 1.2 # generated recipes can go up to 1.2 times the max quantity of the ingredient found in the dataset
21
+
22
+ prep_model = get_model(RECIPE2FEATURES_PATH + 'multi_predictor/')[0]
23
+
24
+ all_rep_path = FULL_COCKTAIL_REP_PATH
25
+ all_reps = np.loadtxt(all_rep_path)
26
+ experiment_dir = REPO_PATH + '/experiments/cocktails/'
27
+ rep_keys = get_bunch_of_rep_keys()['custom']
28
+ dict_weights_mse_computation = {'end volume': .1, 'end sour': 2, 'end sweet': 2, 'end booze': 4, 'end bitter': 2, 'end fruit': 1, 'end herb': 1,
29
+ 'end complex': 1, 'end spicy': 5, 'end oaky': 1, 'end fizzy': 10, 'end colorful': 1, 'end eggy': 10}
30
+ assert sorted(dict_weights_mse_computation.keys()) == sorted(rep_keys)
31
+ weights_mse_computation = np.array([dict_weights_mse_computation[k] for k in rep_keys])
32
+ weights_mse_computation /= weights_mse_computation.sum()
33
+ data = pd.read_csv(COCKTAILS_CSV_DATA)
34
+ preparation_list = sorted(set(data['category']))
35
+ glasses_list = sorted(set(data['glass']))
36
+
37
+ weights_perf_n_ing = {2:0.71, 3:0.81, 4:0.93, 5:1., 6:1.03, 7:1.08, 8:1.05}
38
+
39
+ # weights_perf_n_ing = {2:0.75, 3:0.8, 4:0.95, 5:1.05, 6:1.05, 7:1.05, 8:1.05}
40
+ min_ingredients_quantities_when_present = np.loadtxt(COCKTAIL_REP_CHKPT_PATH +'ingredients_min_quantities_when_present.txt')
41
+ min_ingredients_quantities = np.loadtxt(COCKTAIL_REP_CHKPT_PATH +'ingredients_min_quantities.txt')
42
+ max_ingredients_quantities = np.loadtxt(COCKTAIL_REP_CHKPT_PATH + 'ingredients_max_quantities.txt')
43
+ min_cocktail_rep, max_cocktail_rep = np.loadtxt(COCKTAIL_REP_CHKPT_PATH +'cocktail_minmax_dim13_customkeys.txt')
44
+ distrib_nb_ings_2_8 = np.loadtxt(COCKTAIL_REP_CHKPT_PATH + 'distrib_nb_ing.txt')[2:]
45
+ def normalize_cocktail(cocktail_rep):
46
+ return ((cocktail_rep - min_cocktail_rep) / (max_cocktail_rep - min_cocktail_rep) - 0.5) * 2
47
+
48
+ def denormalize_cocktail(cocktail_rep):
49
+ return (cocktail_rep / 2 + 0.5) * (max_cocktail_rep - min_cocktail_rep) + min_cocktail_rep
50
+
51
+ def normalize_ingredient_q_rep(ingredients_q):
52
+ return (ingredients_q - min_ingredients_quantities_when_present) / (max_ingredients_quantities * factor_max - min_ingredients_quantities_when_present)
53
+
54
+ COCKTAIL_REPS = normalize_cocktail(np.array([data[k] for k in rep_keys]).transpose())
55
+ assert np.abs(COCKTAIL_REPS - all_reps).sum() < 1e-8
56
+
57
+ cocktail2affective_cluster = get_cocktail2affective_cluster()
58
+
59
+ original_affective_keys = get_bunch_of_rep_keys()['affective']
60
+ def sigmoid(x, shift, beta):
61
+ return (1 / (1 + np.exp(-(x + shift) * beta)) - 0.5) * 2
62
+
63
+ def get_normalized_affective_cocktail_rep_from_normalized_cocktail_rep(cocktail_rep):
64
+ indexes = np.array([rep_keys.index(key) for key in original_affective_keys])
65
+ cocktail_rep = cocktail_rep[indexes]
66
+ cocktail_rep[0] = sigmoid(cocktail_rep[0], shift=0.05, beta=4)
67
+ cocktail_rep[1] = sigmoid(cocktail_rep[1], shift=0.3, beta=5)
68
+ cocktail_rep[2] = sigmoid(cocktail_rep[2], shift=0.15, beta=3)
69
+ cocktail_rep[3] = sigmoid(cocktail_rep[3], shift=0.9, beta=20)
70
+ cocktail_rep[4] = sigmoid(cocktail_rep[4], shift=0, beta=4)
71
+ cocktail_rep[5] = sigmoid(cocktail_rep[5], shift=0.2, beta=3)
72
+ cocktail_rep[6] = sigmoid(cocktail_rep[6], shift=0.5, beta=5)
73
+ cocktail_rep[7] = sigmoid(cocktail_rep[7], shift=0.2, beta=6)
74
+ return cocktail_rep
+
+ class IndividualCocktail():
+     def __init__(self, pop_params, target, target_affective_cluster, genes_presence=None, genes_quantity=None,
+                  compute_perf=True, known_target_dict=None, run_hard_check=False):
+
+         self.pop_params = pop_params
+         self.n_genes = len(ingredient_list)
+         self.max_ingredients = max_ingredients
+         self.min_ingredients = min_ingredients
+         self.mutation_params = pop_params['mutation_params']
+         self.dist = pop_params['dist']
+         self.target = target
+         self.is_known = known_target_dict is not None
+         self.known_target_dict = known_target_dict
+         self.perf = None
+         self.cocktail_rep = None
+         self.affective_cluster = None
+         self.target_affective_cluster = target_affective_cluster
+         self.ing_list = np.array(ingredient_list)
+         self.ing_set = set(ingredient_list)
+
+         self.ing_ids_per_cat = dict(bubbles=set(self.get_ingredients_ids_from_list(bubble_ingredients)),
+                                     liquor=set(self.get_ingredients_ids_from_list(ingredients_per_type['liquor'])),
+                                     liqueur=set(self.get_ingredients_ids_from_list(ingredients_per_type['liqueur'])),
+                                     citrus=set(self.get_ingredients_ids_from_list(ingredients_per_type['acid'] + ['orange juice'])),
+                                     alcohol=set(ind_alcohol),
+                                     sweeteners=set(self.get_ingredients_ids_from_list(ingredients_per_type['sweeteners'])),
+                                     vermouth=set(self.get_ingredients_ids_from_list(ingredients_per_type['vermouth'])),
+                                     bitters=set(self.get_ingredients_ids_from_list(ingredients_per_type['bitters'])),
+                                     juice=set(self.get_ingredients_ids_from_list(ingredients_per_type['juice'])),
+                                     acid=set(self.get_ingredients_ids_from_list(ingredients_per_type['acid'])),
+                                     egg=set(self.get_ingredients_ids_from_list(['egg'])))
+
+         if genes_presence is not None:
+             assert len(genes_presence) == self.n_genes
+             assert len(genes_quantity) == self.n_genes
+             self.genes_presence = genes_presence
+             self.genes_quantity = genes_quantity
+             if compute_perf:
+                 self.compute_cocktail_rep()
+                 self.compute_perf()
+         else:
+             self.sample_initial_genes()
+             self.compute_cocktail_rep()
+             # self.make_recipe_fit_the_glass()
+             self.compute_perf()
+
+     # # # # # # # # # # # # # # # # # # # # # # # #
+     # Sample initial genes with smart rules
+     # # # # # # # # # # # # # # # # # # # # # # # #
+
+     def sample_initial_genes(self):
+         # rules:
+         # - between min_ingredients and max_ingredients
+         # - at most one type of bubbles
+         # - at least one alcohol
+         # - no egg without lime or lemon
+         # - at most two liqueurs
+         # - at most three liquors
+         # - at most two sweeteners
+         self.genes_quantity = np.random.uniform(0, 1, size=self.n_genes)  # holds quantities for each ingredient
+         n_ingredients = np.random.choice(np.arange(min_ingredients, max_ingredients + 1), p=distrib_nb_ings_2_8)
+         self.genes_presence = np.zeros(self.n_genes)
+         # add one alcohol
+         self.genes_presence[np.random.choice(ind_alcohol)] = 1
+         while self.get_ing_count() < n_ingredients:
+             candidate_ids = self.get_candidate_ingredients_ids(self.genes_presence)
+             probas = density_ingredients[candidate_ids] / np.sum(density_ingredients[candidate_ids])
+             self.genes_presence[np.random.choice(candidate_ids, p=probas)] = 1
+
+     def get_candidate_ingredients_ids(self, genes_presence):
+         candidates = set(np.argwhere(genes_presence == 0).flatten())
+         present_ids = set(np.argwhere(genes_presence == 1).flatten())
+
+         if self.count_in_genes(present_ids, 'bubbles') >= 1:  # at most one type of bubbles
+             candidates = candidates - self.ing_ids_per_cat['bubbles']
+         if self.count_in_genes(present_ids, 'liquor') >= 3:  # at most three liquors
+             candidates = candidates - self.ing_ids_per_cat['liquor']
+         if self.count_in_genes(present_ids, 'liqueur') >= 2:  # at most two liqueurs
+             candidates = candidates - self.ing_ids_per_cat['liqueur']
+         if self.count_in_genes(present_ids, 'sweeteners') >= 2:  # at most two sweeteners
+             candidates = candidates - self.ing_ids_per_cat['sweeteners']
+         if self.count_in_genes(present_ids, 'citrus') == 0:  # no egg without lime or lemon
+             candidates = candidates - self.ing_ids_per_cat['egg']
+         return np.array(sorted(candidates))
+
+     def count_in_genes(self, present_ids, keyword):
+         if keyword == 'citrus': return len(present_ids & self.ing_ids_per_cat['citrus'])
+         elif keyword == 'bubbles': return len(present_ids & self.ing_ids_per_cat['bubbles'])
+         elif keyword == 'liquor': return len(present_ids & self.ing_ids_per_cat['liquor'])
+         elif keyword == 'liqueur': return len(present_ids & self.ing_ids_per_cat['liqueur'])
+         elif keyword == 'alcohol': return len(present_ids & self.ing_ids_per_cat['alcohol'])
+         elif keyword == 'sweeteners': return len(present_ids & self.ing_ids_per_cat['sweeteners'])
+         else: raise ValueError
+
+     def get_ingredients_ids_from_list(self, ing_list):
+         return [ingredient_list.index(ing) for ing in ing_list]
+
+     def get_ing_count(self):
+         return np.sum(self.genes_presence)
+
+     # # # # # # # # # # # # # # # # # # # # # # # #
+     # Compute cocktail representations
+     # # # # # # # # # # # # # # # # # # # # # # # #
+
+     def get_absent_ing(self):
+         return np.argwhere(self.genes_presence == 0).flatten()
+
+     def get_present_ing(self):
+         return np.argwhere(self.genes_presence == 1).flatten()
+
+     def get_ingredient_quantities(self):
+         # unnormalize quantity genes to get real quantities
+         return (self.genes_quantity * (max_ingredients_quantities * factor_max - min_ingredients_quantities_when_present) + min_ingredients_quantities_when_present) * self.genes_presence
+
+     def get_ing_and_q_from_genes(self):
+         present_ings = self.get_present_ing()
+         ing_quantities = self.get_ingredient_quantities()
+         ingredients, quantities = [], []
+         for i_ing in present_ings:
+             ingredients.append(ingredient_list[i_ing])
+             quantities.append(ing_quantities[i_ing])
+         return ingredients, quantities, ing_quantities
+
+     def compute_cocktail_rep(self):
+         # only call when the genes have changed
+         ingredients, quantities, ing_quantities = self.get_ing_and_q_from_genes()
+         # compute the cocktail category
+         self.category = find_cocktail_sub_category(ingredients, quantities)[0]
+         self.prep_type = self.get_prep_type(ing_quantities)
+         cocktail_rep, self.end_volume, self.end_alcohol = get_cocktail_rep(self.prep_type, ingredients, quantities, keys=rep_keys[1:])  # volume is added later
+         self.cocktail_rep = normalize_cocktail(cocktail_rep)
+         self.glass = self.get_glass_type(ing_quantities)
+         if self.is_known:
+             assert np.abs(self.cocktail_rep - self.target).sum() < 1e-6
+         return self.cocktail_rep
+
+     def get_prep_type(self, quantities=None):
+         if self.is_known: return self.known_target_dict['prep_type']
+         else:
+             if quantities is None:
+                 quantities = self.get_ingredient_quantities()
+             if quantities[ingredient_list.index('egg')] > 0:
+                 prep_cat = 'egg_shaken'
+             elif self.category in ['spirit_forward', 'simple_sour_with_juice', 'julep', 'duo', 'ancestral', 'complex_sour_with_juice']:
+                 # use hard-coded rules for the most obvious cases, determined with the correlations_glass_cat_prep_script
+                 if self.category in ['ancestral', 'spirit_forward', 'duo']:
+                     prep_cat = 'stirred'
+                 elif self.category in ['complex_sour_with_juice', 'julep', 'simple_sour_with_juice']:
+                     prep_cat = 'shaken'
+                 else:
+                     raise ValueError
+             else:
+                 output = prep_model(quantities, aux_str='prep_type').flatten()
+                 output[preparation_list.index('egg_shaken')] = -np.inf
+                 prep_cat = preparation_list[np.argmax(output)]
+             return prep_cat
+
+     def get_glass_type(self, quantities=None):
+         if self.is_known: return self.known_target_dict['glass']
+         else:
+             if self.category in ['collins', 'complex_highball', 'simple_highball', 'champagne_cocktail', 'complex_sour']:
+                 # use hard-coded rules for the most obvious cases, determined with the correlations_glass_cat_prep_script
+                 if self.category in ['collins', 'complex_highball', 'simple_highball']:
+                     glass = 'collins'
+                 elif self.category in ['champagne_cocktail', 'complex_sour']:
+                     glass = 'coupe'
+             else:
+                 if quantities is None:
+                     quantities = self.get_ingredient_quantities()
+                 output = prep_model(quantities, aux_str='glasses').flatten()
+                 glass = glasses_list[np.argmax(output)]
+             return glass
+
+     # # # # # # # # # # # # # # # # # # # # # # # #
+     # Adapt recipe to fit the glass
+     # # # # # # # # # # # # # # # # # # # # # # # #
+
+     def is_too_large_for_glass(self):
+         return self.end_volume > glass_volume[self.glass] * 0.80
+
+     def is_too_small_for_glass(self):
+         return self.end_volume < glass_volume[self.glass] * 0.3
+
+     def scale_ing_quantities(self, present_ings, factor):
+         qs = self.get_ingredient_quantities().copy()
+         qs[present_ings] *= factor
+         self.set_genes_from_quantities(present_ings, qs)
+
+     def set_genes_from_quantities(self, present_ings, quantities):
+         # inverse of get_ingredient_quantities: map real quantities back to normalized genes in [0, 1]
+         genes_quantity = np.clip((quantities - min_ingredients_quantities_when_present) /
+                                  (factor_max * max_ingredients_quantities - min_ingredients_quantities_when_present), 0, 1)
+         self.genes_quantity[present_ings] = genes_quantity[present_ings]
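get_ingredient_quantities and set_genes_from_quantities are inverse mappings between normalized genes in [0, 1] and real quantities. A minimal round-trip sketch with made-up bounds (the real min/max arrays come from the dataset, not shown here):

import numpy as np

min_q = np.array([5., 10., 2.])    # hypothetical min quantity when present (mL)
max_q = np.array([60., 90., 10.])  # hypothetical max observed quantity (mL)
factor_max = 1.2                   # head-room multiplier, as in the class above

genes = np.array([0.0, 0.5, 1.0])
presence = np.array([1, 1, 1])

# decode (get_ingredient_quantities)
quantities = (genes * (max_q * factor_max - min_q) + min_q) * presence
# encode back (set_genes_from_quantities)
genes_back = np.clip((quantities - min_q) / (factor_max * max_q - min_q), 0, 1)
assert np.allclose(genes, genes_back)
print(quantities)  # [ 5. 59. 12.]: a gene of 0 maps to the minimum, 1 to factor_max * max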
+
+     def make_recipe_fit_the_glass(self):
+         # check for citrus; if there is none, remove the egg
+         present_ids = np.argwhere(self.genes_presence == 1).flatten()
+         ing_list = self.ing_list[present_ids]
+         present_ids = set(present_ids)
+         if self.count_in_genes(present_ids, 'citrus') == 0 and 'egg' in ing_list:
+             if self.genes_presence.sum() > 2:
+                 i_egg = ingredient_list.index('egg')
+                 self.genes_presence[i_egg] = 0.
+                 self.compute_cocktail_rep()
+
+         # rescale quantities (at most 10 trials) until the end volume fits the glass
+         i_trial = 0
+         present_ings = self.get_present_ing()
+         while self.is_too_large_for_glass():
+             i_trial += 1
+             end_volume = self.end_volume
+             desired_volume = glass_volume[self.glass] * 0.80
+             ratio = desired_volume / end_volume
+             self.scale_ing_quantities(present_ings, factor=ratio)
+             self.compute_cocktail_rep()
+             if end_volume == self.end_volume: break
+             if i_trial == 10: break
+         while self.is_too_small_for_glass():
+             i_trial += 1
+             end_volume = self.end_volume
+             desired_volume = glass_volume[self.glass] * 0.80
+             ratio = desired_volume / end_volume
+             self.scale_ing_quantities(present_ings, factor=ratio)
+             self.compute_cocktail_rep()
+             if end_volume == self.end_volume: break
+             if i_trial == 10: break
+
+     # # # # # # # # # # # # # # # # # # # # # # # #
+     # Compute performance
+     # # # # # # # # # # # # # # # # # # # # # # # #
+
+     def passes_checks(self):
+         present_ids = np.argwhere(self.genes_presence == 1).flatten()
+         # ing_list = self.ing_list[present_ids]
+         present_ids = set(present_ids)
+         if len(present_ids) < 2 or len(present_ids) > 8: return False
+         # if self.is_too_large_for_glass(): return False
+         # if self.is_too_small_for_glass(): return False
+         if self.end_alcohol < 0.05 or self.end_alcohol > 0.31: return False
+         if self.count_in_genes(present_ids, 'sweeteners') > 2: return False
+         if self.count_in_genes(present_ids, 'liqueur') > 2: return False
+         if self.count_in_genes(present_ids, 'liquor') > 3: return False
+         # if self.count_in_genes(present_ids, 'citrus') == 0 and 'egg' in ing_list: return False
+         if self.count_in_genes(present_ids, 'bubbles') > 1: return False
+         return True
+
+     def get_affective_cluster(self):
+         cocktail_rep_affective = get_normalized_affective_cocktail_rep_from_normalized_cocktail_rep(self.cocktail_rep)
+         self.affective_cluster = cocktail2affective_cluster(cocktail_rep_affective)[0]
+         return self.affective_cluster
+
+     def does_affective_cluster_match(self):
+         return True  # check disabled; original condition: self.get_affective_cluster() == self.target_affective_cluster
+
+     def compute_perf(self):
+         if not self.passes_checks(): self.perf = -100
+         else:
+             if self.dist == 'mse':
+                 # self.perf = - np.sqrt(((self.cocktail_rep - self.target)**2).mean())
+                 self.perf = - np.sqrt(np.dot((self.cocktail_rep - self.target)**2, weights_mse_computation))
+                 self.perf *= weights_perf_n_ing[int(self.genes_presence.sum())]
+                 if not self.does_affective_cluster_match():
+                     self.perf *= 2  # perf is negative, so this doubles the penalty
+             else: raise NotImplementedError
+
+     # # # # # # # # # # # # # # # # # # # # # # # #
+     # Mutations and crossover
+     # # # # # # # # # # # # # # # # # # # # # # # #
+
+     def get_child(self):
+         time_dict = dict()
+         init_time = time.time()
+         child = IndividualCocktail(pop_params=self.pop_params, target_affective_cluster=self.target_affective_cluster,
+                                    target=self.target, genes_presence=self.genes_presence.copy(),
+                                    genes_quantity=self.genes_quantity.copy(), compute_perf=False)
+         time_dict[' asexual child creation'] = [time.time() - init_time]
+         init_time = time.time()
+         this_time_dict = child.mutate()
+         time_dict = self.update_time_dict(time_dict, this_time_dict)
+         time_dict[' asexual child mutation'] = [time.time() - init_time]
+         return child, time_dict
+
+     def get_child_with(self, other_parent):
+         time_dict = dict()
+         init_time = time.time()
+         new_genes_presence = np.zeros(self.n_genes)
+         present_ing = self.get_present_ing()
+         other_present_ing = other_parent.get_present_ing()
+         new_genes_quantity = np.random.uniform(0, 1, size=self.n_genes)
+         shared_ingredients = sorted(set(present_ing) & set(other_present_ing))
+         unique_ingredients_one = sorted(set(present_ing) - set(other_present_ing))
+         unique_ingredients_two = sorted(set(other_present_ing) - set(present_ing))
+         for i in shared_ingredients:
+             new_genes_presence[i] = 1
+             new_genes_quantity[i] = (self.genes_quantity[i] + other_parent.genes_quantity[i]) / 2
+         time_dict[' crossover child creation'] = [time.time() - init_time]
+         init_time = time.time()
+         # add one alcohol if none is present
+         if len(set(np.argwhere(new_genes_presence == 1).flatten()).intersection(ind_alcohol)) == 0:
+             new_genes_presence[np.random.choice(ind_alcohol)] = 1
+         # up to here, we respect the constraints (assuming both parents do).
+         candidate_genes = np.array(unique_ingredients_one + unique_ingredients_two)
+         candidate_quantities = np.array([self.genes_quantity[i] for i in unique_ingredients_one] + [other_parent.genes_quantity[i] for i in unique_ingredients_two])
+         indexes = np.arange(len(candidate_genes))
+         np.random.shuffle(indexes)
+         candidate_genes = candidate_genes[indexes]
+         candidate_quantities = candidate_quantities[indexes]
+         time_dict[' crossover prepare selection'] = [time.time() - init_time]
+         init_time = time.time()
+         # now let's try to add each of them while respecting the constraints
+         for i in range(len(indexes)):
+             if np.random.rand() < 0.5 or np.sum(new_genes_presence) < self.min_ingredients:  # only try to add one ingredient in every two
+                 ing_id = candidate_genes[i]
+                 q = candidate_quantities[i]
+                 new_genes_presence[ing_id] = 1
+                 new_genes_quantity[ing_id] = q
+             if np.sum(new_genes_presence) == self.max_ingredients:
+                 break
+         time_dict[' crossover do selection'] = [time.time() - init_time]
+         init_time = time.time()
+         # create the new child
+         child = IndividualCocktail(pop_params=self.pop_params, target_affective_cluster=self.target_affective_cluster, target=self.target,
+                                    genes_presence=new_genes_presence.copy(), genes_quantity=new_genes_quantity.copy(), compute_perf=False)
+         time_dict[' crossover create child'] = [time.time() - init_time]
+         init_time = time.time()
+         this_time_dict = child.mutate()
+         time_dict = self.update_time_dict(time_dict, this_time_dict)
+         time_dict[' crossover child mutation'] = [time.time() - init_time]
+         return child, time_dict
+
+     def mutate(self):
+         time_dict = dict()
+         # remove an ingredient
+         init_time = time.time()
+         present_ids = set(np.argwhere(self.genes_presence == 1).flatten())
+
+         if np.random.rand() < self.mutation_params['p_remove_ing']:
+             if self.get_ing_count() > self.min_ingredients:
+                 candidate_ings = self.get_present_ing()
+                 if self.count_in_genes(present_ids, 'alcohol') == 1:  # make sure we keep at least one alcohol
+                     candidate_ings = np.array(sorted(set(candidate_ings) - set(ind_alcohol)))
+                 index_to_remove = np.random.choice(candidate_ings)
+                 self.genes_presence[index_to_remove] = 0
+         time_dict[' mutation remove ing'] = [time.time() - init_time]
+         init_time = time.time()
+         # add an ingredient
+         if np.random.rand() < self.mutation_params['p_add_ing']:
+             if self.get_ing_count() < self.max_ingredients:
+                 candidate_ings = self.get_candidate_ingredients_ids(self.genes_presence.copy())
+                 index_to_add = np.random.choice(candidate_ings, p=density_ingredients[candidate_ings] / np.sum(density_ingredients[candidate_ings]))
+                 self.genes_presence[index_to_add] = 1
+         time_dict[' mutation add ing'] = [time.time() - init_time]
+
+         init_time = time.time()
+         # replace an ingredient by another one from the same family
+         if np.random.rand() < self.mutation_params['p_switch_ing']:
+             i = np.random.choice(self.get_present_ing())
+             ing_str = ingredient_list[i]
+             if ing_str not in ['sparkling wine', 'orange juice']:
+                 if ing_str in bubble_ingredients:
+                     candidates_ids = np.array(sorted(self.ing_ids_per_cat['bubbles'] - set([i])))
+                     new_bubble = np.random.choice(candidates_ids, p=density_ingredients[candidates_ids] / np.sum(density_ingredients[candidates_ids]))
+                     self.genes_presence[i] = 0
+                     self.genes_presence[new_bubble] = 1
+                     self.genes_quantity[new_bubble] = self.genes_quantity[i]  # copy quantity
+                 categories = ['acid', 'bitters', 'juice', 'liqueur', 'liquor', 'sweeteners', 'vermouth']
+                 for cat in categories:
+                     if ing_str in ingredients_per_type[cat]:
+                         present_ings = self.get_present_ing()
+                         candidates_ids = np.array(sorted(self.ing_ids_per_cat[cat] - set([i]) - set(present_ings)))
+                         if len(candidates_ids) > 0:
+                             replacing_ing = np.random.choice(candidates_ids, p=density_ingredients[candidates_ids] / np.sum(density_ingredients[candidates_ids]))
+                             self.genes_presence[i] = 0
+                             self.genes_presence[replacing_ing] = 1
+                             self.genes_quantity[replacing_ing] = self.genes_quantity[i]  # copy quantity
+                         break
+         time_dict[' mutation switch ing'] = [time.time() - init_time]
+         init_time = time.time()
+         # add noise to the ingredient quantities
+         for i in self.get_present_ing():
+             if np.random.rand() < self.mutation_params['p_change_q']:
+                 self.genes_quantity[i] += np.random.randn() * self.mutation_params['delta_change_q']
+         self.genes_quantity = np.clip(self.genes_quantity, 0, 1)
+         time_dict[' mutation change quantity'] = [time.time() - init_time]
+
+         init_time = time.time()
+         self.compute_cocktail_rep()
+         time_dict[' mutation compute cocktail rep'] = [time.time() - init_time]
+         init_time = time.time()
+         # self.make_recipe_fit_the_glass()
+         time_dict[' mutation check glass fit'] = [time.time() - init_time]
+         init_time = time.time()
+         self.compute_perf()
+         time_dict[' mutation compute perf'] = [time.time() - init_time]
+         return time_dict
+
+     def update_time_dict(self, main_dict, new_dict):
+         for k in new_dict.keys():
+             if k in main_dict.keys():
+                 main_dict[k].append(np.sum(new_dict[k]))
+             else:
+                 main_dict[k] = [np.sum(new_dict[k])]
+         return main_dict
+
+     # # # # # # # # # # # # # # # # # # # # # # # #
+     # Get recipe and print
+     # # # # # # # # # # # # # # # # # # # # # # # #
+
+     def get_recipe(self, unit='mL', name=None):
+         ing_quantities = self.get_ingredient_quantities()
+         ingredients, quantities = [], []
+         for i_ing, q_ing in enumerate(ing_quantities):
+             if q_ing > 0.8:
+                 ingredients.append(ingredient_list[i_ing])
+                 quantities.append(round(q_ing))
+         recipe_str = format_ingredients(ingredients, quantities)
+         recipe_str_readable = print_recipe(unit=unit, ingredient_str=recipe_str, name=name, to_print=False)
+         return ingredients, quantities, recipe_str, recipe_str_readable
+
+     def get_instructions(self):
+         ing_quantities = self.get_ingredient_quantities()
+         ingredients, quantities = [], []
+         for i_ing, q_ing in enumerate(ing_quantities):
+             if q_ing > 0.8:
+                 ingredients.append(ingredient_list[i_ing])
+                 quantities.append(round(q_ing))
+         str_out = 'Instructions:\n '
+
+         if 'mint' in ingredients:
+             i_mint = ingredients.index('mint')
+             n_leaves = quantities[i_mint]
+             str_out += f'Add {n_leaves} mint leaves to a shaker, followed by an ice cube.\n Muddle the mint and ice together with a muddler.\n '
+         bubbles = ['sparkling wine', 'tonic', 'soda', 'ginger beer']
+         other_ings = [ing for ing in ingredients if ing not in ['egg', 'angostura', 'orange bitters'] + bubbles]
+
+         if self.prep_type == 'built':
+             str_out += 'Add a large ice cube in the glass.\n '
+         # add the ingredients to pour
+         str_out += 'Pour'
+         for i, ing in enumerate(other_ings):
+             if i == len(other_ings) - 2:
+                 str_out += f' {ing} and'
+             elif i == len(other_ings) - 1:
+                 str_out += f' {ing}'
+             else:
+                 str_out += f' {ing},'
+
+         if self.prep_type in ['built'] and 'mint' not in ingredients:
+             str_out += ' into the glass.\n '
+         else:
+             str_out += ' into the shaker.\n '
+
+         if self.prep_type == 'egg_shaken' and 'egg' in ingredients:
+             str_out += 'Add the egg white.\n Dry-shake for 15s (without ice), then fill with ice and shake for another 15s.\n Serve into the glass through a strainer.\n '
+         elif 'shaken' in self.prep_type:
+             str_out += 'Fill with ice and shake for 15s.\n Serve into the glass through a strainer.\n '
+         elif self.prep_type == 'stirred':
+             str_out += 'Add ice and stir the cocktail with a spoon for 15s.\n Serve into the glass through a strainer.\n '
+         elif self.prep_type == 'built':
+             str_out += 'Stir two turns with a spoon.\n '
+
+         bubble_ing = [ing for ing in ingredients if ing in bubbles]
+         if len(bubble_ing) > 0:
+             str_out += 'Top up with '
+             for ing in bubble_ing:
+                 str_out += f'{ing}, '
+             str_out = str_out[:-2] + '.\n '
+         bitter_ing = [ing for ing in ingredients if ing in ['angostura', 'orange bitters']]
+         if len(bitter_ing) > 0:
+             if len(bitter_ing) == 1:
+                 q = quantities[ingredients.index(bitter_ing[0])]
+                 n_dashes = max(1, int(q / 0.6))
+                 str_out += f'Add {n_dashes} dash'
+                 if n_dashes > 1:
+                     str_out += 'es'
+                 str_out += f' of {bitter_ing[0]}.\n '
+             elif len(bitter_ing) == 2:
+                 q = quantities[ingredients.index(bitter_ing[0])]
+                 n_dashes = max(1, int(q / 0.6))
+                 str_out += f'Add {n_dashes} dash'
+                 if n_dashes > 1:
+                     str_out += 'es'
+                 str_out += f' of {bitter_ing[0]} and '
+                 q = quantities[ingredients.index(bitter_ing[1])]
+                 n_dashes = max(1, int(q / 0.6))
+                 str_out += f'{n_dashes} dash'
+                 if n_dashes > 1:
+                     str_out += 'es'
+                 str_out += f' of {bitter_ing[1]}.\n '
+         str_out += 'Enjoy!'
+         return str_out
+
+     def print_recipe(self, name=None):
+         print(self.get_recipe(name=name)[3])
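A hedged usage sketch of the class above; the hyperparameter values are illustrative, not the project's, and some_normalized_cocktail_rep stands in for a real normalized representation taken from the dataset:

# only 'mutation_params' and 'dist' are read by IndividualCocktail itself
pop_params = {'dist': 'mse',
              'mutation_params': {'p_remove_ing': 0.5, 'p_add_ing': 0.7, 'p_switch_ing': 0.5,
                                  'p_change_q': 0.7, 'delta_change_q': 0.3}}

ind = IndividualCocktail(pop_params=pop_params, target=some_normalized_cocktail_rep,
                         target_affective_cluster=None)
print(ind.perf)                # -100 if a constraint fails, else a weighted negative RMSE to the target
print(ind.get_recipe()[3])     # human-readable recipe string
print(ind.get_instructions())  # step-by-step preparation instructions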
src/cocktails/utilities/cocktail_generation_utilities/population.py ADDED
@@ -0,0 +1,213 @@
+ from src.cocktails.utilities.cocktail_generation_utilities.individual import *
+ from sklearn.neighbors import NearestNeighbors
+ import time
+ import pickle
+ from src.cocktails.config import COCKTAIL_NN_PATH, COCKTAILS_CSV_DATA
+
+ class Population:
+     def __init__(self, target, pop_params, target_affective_cluster=None, known_target_dict=None):
+         self.pop_params = pop_params
+         self.pop_size = pop_params['pop_size']
+         self.nb_elite = pop_params['nb_elites']
+         self.nb_generations = pop_params['nb_generations']
+         self.target = target
+         self.mutation_params = pop_params['mutation_params']
+         self.dist = pop_params['dist']
+         self.n_neighbors = pop_params['n_neighbors']
+         self.known_target_dict = known_target_dict
+
+         with open(COCKTAIL_NN_PATH, 'rb') as f:
+             data = pickle.load(f)
+         self.nn_model_cocktail = data['nn_model']
+         self.dim_rep_cocktail = data['dim_rep_cocktail']
+         self.n_cocktails = data['n_cocktails']
+         self.cocktail_data = pd.read_csv(COCKTAILS_CSV_DATA)
+
+         if target_affective_cluster is None:
+             cocktail_rep_affective = get_normalized_affective_cocktail_rep_from_normalized_cocktail_rep(target)
+             self.target_affective_cluster = cocktail2affective_cluster(cocktail_rep_affective)[0]
+         else:
+             self.target_affective_cluster = target_affective_cluster
+
+         self.pop_elite = []
+         self.pop = []
+         self.add_target_individual()  # create a target individual (not in pop)
+         self.add_nearest_neighbors_in_pop()  # add the nearest neighbors from the dataset to the population
+
+         # fill the population
+         while self.get_pop_size() < self.pop_size:
+             self.add_individual()
+         while len(self.pop_elite) < self.nb_elite:
+             self.pop_elite.append(IndividualCocktail(pop_params=self.pop_params,
+                                                      target=self.target.copy(),
+                                                      target_affective_cluster=self.target_affective_cluster))
+         self.update_elite_and_get_next_pop()
+
+     def add_target_individual(self):
+         if self.known_target_dict is not None:
+             genes_presence, genes_quantity = self.get_q_rep(*extract_ingredients(self.known_target_dict['ing_str']))
+             self.target_individual = IndividualCocktail(pop_params=self.pop_params,
+                                                         target=self.target.copy(),
+                                                         known_target_dict=self.known_target_dict,
+                                                         target_affective_cluster=self.target_affective_cluster,
+                                                         genes_presence=genes_presence,
+                                                         genes_quantity=genes_quantity)
+         else:
+             self.target_individual = None
+
+     def add_nearest_neighbors_in_pop(self):
+         # add the nearest neighbors from the dataset to the population
+         if self.n_neighbors > 0:
+             dists, indexes = self.nn_model_cocktail.kneighbors(self.target.reshape(1, -1))
+             dists, indexes = dists.flatten(), indexes.flatten()
+             first = 1 if dists[0] == 0 else 0  # avoid taking the target itself when testing with known targets from the dataset
+             indexes = indexes[first:first + self.n_neighbors]
+             self.ing_strs = np.array(self.cocktail_data['ingredients_str'])[indexes]
+             recipes = [extract_ingredients(ing_str) for ing_str in self.ing_strs]
+             for r in recipes:
+                 genes_presence, genes_quantity = self.get_q_rep(r[0], r[1])
+                 genes_presence[-1] = 0  # remove the water ingredient
+                 self.add_individual(genes_presence=genes_presence.copy(), genes_quantity=genes_quantity.copy())
+             self.nn_recipes = [ind.get_recipe()[3] for ind in self.pop]
+             self.nn_scores = [ind.perf for ind in self.pop]
+         else:
+             self.ing_strs = None
+
+     def add_individual(self, genes_presence=None, genes_quantity=None):
+         self.pop.append(IndividualCocktail(pop_params=self.pop_params,
+                                            target=self.target.copy(),
+                                            target_affective_cluster=self.target_affective_cluster,
+                                            genes_presence=genes_presence,
+                                            genes_quantity=genes_quantity))
+
+     def get_elite_perf(self):
+         return np.array([e.perf for e in self.pop_elite])
+
+     def get_pop_perf(self):
+         return np.array([ind.perf for ind in self.pop])
+
+     def update_elite_and_get_next_pop(self):
+         time_dict = dict()
+         init_time = time.time()
+         elite_perfs = self.get_elite_perf()
+         pop_perfs = self.get_pop_perf()
+         all_perfs = np.concatenate([elite_perfs, pop_perfs])
+         temp_list = self.pop_elite + self.pop
+         time_dict[' get pop perfs'] = [time.time() - init_time]
+         init_time = time.time()
+         # update the elite population with the new best individuals
+         indexes_sorted = np.flip(np.argsort(all_perfs))
+         new_pop_elite = [IndividualCocktail(pop_params=self.pop_params,
+                                             target=self.target.copy(),
+                                             target_affective_cluster=self.target_affective_cluster,
+                                             genes_presence=temp_list[i_new_e].genes_presence.copy(),
+                                             genes_quantity=temp_list[i_new_e].genes_quantity.copy()) for i_new_e in indexes_sorted[:self.nb_elite]]
+         time_dict[' recreate elite individuals'] = [time.time() - init_time]
+         init_time = time.time()
+         # select parents with a sampling probability proportional to the perf rank
+         rank_perfs = np.flip(np.arange(len(temp_list)))
+         sampling_probs = rank_perfs / np.sum(rank_perfs)
+         if self.mutation_params['asexual_rep'] and not self.mutation_params['crossover']:
+             new_pop_indexes = np.random.choice(indexes_sorted, p=sampling_probs, size=self.pop_size)
+             self.pop = [temp_list[i].get_child()[0] for i in new_pop_indexes]  # get_child returns (child, time_dict)
+         elif self.mutation_params['crossover'] and not self.mutation_params['asexual_rep']:
+             self.pop = []
+             while len(self.pop) < self.pop_size:
+                 parents = np.random.choice(indexes_sorted, p=sampling_probs, size=2, replace=False)
+                 self.pop.append(temp_list[parents[0]].get_child_with(temp_list[parents[1]])[0])  # get_child_with returns (child, time_dict)
+         elif self.mutation_params['crossover'] and self.mutation_params['asexual_rep']:
+             new_pop_indexes = np.random.choice(indexes_sorted, p=sampling_probs, size=self.pop_size // 2)
+             time_dict[' choose asexual parent indexes'] = [time.time() - init_time]
+             init_time = time.time()
+             self.pop = []
+             for i in new_pop_indexes:
+                 child, this_time_dict = temp_list[i].get_child()
+                 self.pop.append(child)
+                 time_dict = self.update_time_dict(time_dict, this_time_dict)
+             time_dict[' get asexual children'] = [time.time() - init_time]
+             init_time = time.time()
+             while len(self.pop) < self.pop_size:
+                 parents = np.random.choice(indexes_sorted, p=sampling_probs, size=2, replace=False)
+                 child, this_time_dict = temp_list[parents[0]].get_child_with(temp_list[parents[1]])
+                 self.pop.append(child)
+                 time_dict = self.update_time_dict(time_dict, this_time_dict)
+             time_dict[' get sexual children'] = [time.time() - init_time]
+         self.pop_elite = new_pop_elite
+         return time_dict
+
+     def get_pop_size(self):
+         return len(self.pop)
+
+     def get_q_rep(self, ingredients, quantities):
+         ingredient_q_rep = np.zeros([len(ingredient_list)])
+         genes_presence = np.zeros([len(ingredient_list)])
+         for ing, q in zip(ingredients, quantities):
+             ingredient_q_rep[ingredient_list.index(ing)] = q
+             genes_presence[ingredient_list.index(ing)] = 1
+         return genes_presence.copy(), normalize_ingredient_q_rep(ingredient_q_rep)
+
+     def get_best_score(self, affective_cluster_check=False):
+         elite_perfs = self.get_elite_perf()
+         pop_perfs = self.get_pop_perf()
+         all_perfs = np.concatenate([elite_perfs, pop_perfs])
+         temp_list = self.pop_elite + self.pop
+         if affective_cluster_check:
+             indexes = np.array([i for i in range(len(temp_list)) if temp_list[i].does_affective_cluster_match()])
+             if indexes.size > 0:
+                 temp_list = np.array(temp_list)[indexes]
+                 all_perfs = all_perfs[indexes]
+         indexes_best = np.flip(np.argsort(all_perfs))
+         return np.array(all_perfs)[indexes_best], np.array(temp_list)[indexes_best]
+
+     def update_time_dict(self, main_dict, new_dict):
+         for k in new_dict.keys():
+             if k in main_dict.keys():
+                 main_dict[k].append(np.sum(new_dict[k]))
+             else:
+                 main_dict[k] = [np.sum(new_dict[k])]
+         return main_dict
+
+     def run_one_generation(self, verbose=True, affective_cluster_check=False):
+         time_dict = dict()
+         init_time = time.time()
+         this_time_dict = self.update_elite_and_get_next_pop()
+         time_dict['update_elite_and_pop'] = [time.time() - init_time]
+         time_dict = self.update_time_dict(time_dict, this_time_dict)
+         init_time = time.time()
+         best_perfs, best_individuals = self.get_best_score(affective_cluster_check)
+         time_dict['get best scores'] = [time.time() - init_time]
+         return best_perfs[0], time_dict
+
+     def run_evolution(self, verbose=False, print_every=10, affective_cluster_check=False, level=0):
+         best_score = -np.inf
+         time_dict = dict()
+         init_time = time.time()
+         for i in range(self.nb_generations):
+             best_score, this_time_dict = self.run_one_generation(verbose, affective_cluster_check=affective_cluster_check)
+             time_dict = self.update_time_dict(time_dict, this_time_dict)
+             if verbose and (i + 1) % print_every == 0:
+                 print(' ' * level + f'Gen #{i+1} - Current best perf: {best_score:.2f}, time: {time.time() - init_time:.4f}')
+                 init_time = time.time()
+         if verbose: print(' ' * level + f'Evolution over, best perf: {best_score:.2f}')
+         return self.get_best_score()
+
+     def print_results(self, n=3):
+         best_scores, best_ind = self.get_best_score()
+         for i in range(n):
+             best_ind[i].print_recipe(f'Candidate #{i+1}, Score: {best_scores[i]:.2f}')
+
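A minimal end-to-end sketch of the evolutionary loop; the parameter values are illustrative, not the authors', and target_rep stands in for a normalized cocktail representation:

pop_params = {'pop_size': 100, 'nb_elites': 10, 'nb_generations': 100, 'n_neighbors': 5,
              'dist': 'mse',
              'mutation_params': {'asexual_rep': True, 'crossover': True,
                                  'p_remove_ing': 0.5, 'p_add_ing': 0.7, 'p_switch_ing': 0.5,
                                  'p_change_q': 0.7, 'delta_change_q': 0.3}}

population = Population(target=target_rep, pop_params=pop_params)
best_scores, best_individuals = population.run_evolution(verbose=True, print_every=10)
population.print_results(n=3)  # prints the three best candidate recipes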
src/cocktails/utilities/cocktail_utilities.py ADDED
@@ -0,0 +1,220 @@
+ import numpy as np
+ from src.cocktails.utilities.ingredients_utilities import ingredient2ingredient_id, ingredient_profiles, ingredients_per_type, ingredient_list, find_ingredient_from_str
+ from src.cocktails.utilities.cocktail_category_detection_utilities import *
+ import time
+
+ # representation_keys = ['pH', 'sour', 'sweet', 'booze', 'bitter', 'fruit', 'herb',
+ #                        'complex', 'spicy', 'strong', 'oaky', 'fizzy', 'colorful', 'eggy']
+ representation_keys = ['sour', 'sweet', 'booze', 'bitter', 'fruit', 'herb',
+                        'complex', 'spicy', 'oaky', 'fizzy', 'colorful', 'eggy']
+ representation_keys_linear = list(set(representation_keys) - set(['pH', 'complex']))
+
+ ing_reps = np.array([[ingredient_profiles[k][ing_id] for ing_id in ingredient2ingredient_id.values()] for k in representation_keys]).transpose()
+
+
+ def compute_cocktail_representation(profile, ingredients, quantities):
+     # computes the representation of a cocktail from the recipe (ingredients, quantities) and volume
+     n = len(ingredients)
+     assert n == len(quantities)
+     quantities = np.array(quantities)
+
+     weights = quantities / np.sum(quantities)
+     rep = dict()
+
+     ing_ids = np.array([ingredient2ingredient_id[ing] for ing in ingredients])
+     # compute features as a linear combination of ingredient features
+     for k in representation_keys_linear:
+         k_ing = np.array([ingredient_profiles[k][ing_id] for ing_id in ing_ids])
+         rep[k] = np.dot(weights, k_ing)
+
+     # for pH: mix the H+ concentrations, not the pH values (pH = -log10 of the concentration)
+     phs = np.array([ingredient_profiles['pH'][ing_id] for ing_id in ing_ids])
+     concentrations = 10 ** (-phs)
+     mix_c = np.dot(weights, concentrations)
+     rep['pH'] = -np.log10(mix_c)
+
+     # complexity increases with the number of ingredients
+     rep['complex'] = np.mean([ingredient_profiles['complex'][ing_id] for ing_id in ing_ids]) + len(ing_ids)
+
+     # compute the profile after dilution
+     volume_ratio = profile['mix volume'] / profile['end volume']
+     for k in representation_keys:
+         rep['end ' + k] = rep[k] * volume_ratio
+     concentration = 10 ** (-rep['pH'])
+     end_concentration = concentration * volume_ratio
+     rep['end pH'] = -np.log10(end_concentration)
+     return rep
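The pH block mixes H+ concentrations rather than averaging pH values directly, since pH = -log10(concentration). A two-ingredient sanity check:

import numpy as np

# mixing equal parts of pH 2 (lime-juice-like) and pH 7 (water-like)
phs = np.array([2.0, 7.0])
weights = np.array([0.5, 0.5])
mix_ph = -np.log10(np.dot(weights, 10 ** (-phs)))
print(round(mix_ph, 3))  # 2.301: close to the acidic component, not the arithmetic mean of 4.5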
+
+ def get_alcohol_profile(ingredients, quantities):
+     ingredients = ingredients.copy()
+     quantities = quantities.copy()
+     assert len(ingredients) == len(quantities)
+     if 'mint' in ingredients:
+         mint_ind = ingredients.index('mint')
+         ingredients.pop(mint_ind)
+         quantities.pop(mint_ind)
+     alcohol = []
+     volume_mix = np.sum(quantities)
+     weights = quantities / volume_mix
+     assert np.abs(np.sum(weights) - 1) < 1e-4
+     ingredients_list = [ing.lower() for ing in ingredient_list]
+     for ing, q in zip(ingredients, quantities):
+         id = ingredients_list.index(ing)
+         alcohol.append(ingredient_profiles['ethanol'][id])
+     alcohol = np.dot(alcohol, weights)
+     return alcohol, volume_mix
+
+ def get_mix_profile(ingredients, quantities):
+     ingredients = ingredients.copy()
+     quantities = quantities.copy()
+     assert len(ingredients) == len(quantities)
+     if 'mint' in ingredients:
+         mint_ind = ingredients.index('mint')
+         ingredients.pop(mint_ind)
+         quantities.pop(mint_ind)
+     alcohol, sugar, acid = [], [], []
+     volume_mix = np.sum(quantities)
+     weights = quantities / volume_mix
+     assert np.abs(np.sum(weights) - 1) < 1e-4
+     ingredients_list = [ing.lower() for ing in ingredient_list]
+     for ing, q in zip(ingredients, quantities):
+         id = ingredients_list.index(ing)
+         sugar.append(ingredient_profiles['sugar'][id])
+         alcohol.append(ingredient_profiles['ethanol'][id])
+         acid.append(ingredient_profiles['acid'][id])
+     sugar = np.dot(sugar, weights)
+     acid = np.dot(acid, weights)
+     alcohol = np.dot(alcohol, weights)
+     return alcohol, sugar, acid
+
+ def extract_preparation_type(instructions, recipe):
+     flag = False
+     instructions = instructions.lower()
+     egg_in_recipe = any([find_ingredient_from_str(ing_str)[1] == 'egg' for ing_str in recipe[1]])
+     if 'shake' in instructions:
+         prep_type = 'egg_shaken' if egg_in_recipe else 'shaken'
+     elif 'stir' in instructions:
+         prep_type = 'stirred'
+     elif 'blend' in instructions:
+         prep_type = 'blended'
+     else:
+         # 'build', 'mix', 'pour', 'combine', 'place' and anything unrecognized all default to 'built'
+         prep_type = 'built'
+     return flag, prep_type
+
+ def get_dilution_ratio(category, alcohol):
+     # formulas from the Liquid Intelligence book;
+     # the formula for 'built' was invented (half the stirred dilution)
+     if category == 'stirred':
+         return -1.21 * alcohol**2 + 1.246 * alcohol + 0.145
+     elif category in ['shaken', 'egg_shaken']:
+         return -1.567 * alcohol**2 + 1.742 * alcohol + 0.203
+     elif category == 'built':
+         return (-1.21 * alcohol**2 + 1.246 * alcohol + 0.145) / 2
+     else:
+         return 1
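A worked example of the dilution model, assuming alcohol is expressed as a fraction as elsewhere in this module:

alcohol = 0.30                                  # a 60 mL stirred mix at 30% ABV
ratio = get_dilution_ratio('stirred', alcohol)  # -1.21*0.09 + 1.246*0.30 + 0.145 = 0.41
end_volume = 60 + 60 * ratio                    # ~84.6 mL once melted ice is added
end_abv = alcohol * 60 / end_volume             # ~0.213: stirring brings the drink to ~21% ABV
print(round(ratio, 3), round(end_volume, 1), round(end_abv, 3))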
+
+ def get_cocktail_rep(category, ingredients, quantities, keys):
+     ingredients = ingredients.copy()
+     quantities = quantities.copy()
+     assert len(ingredients) == len(quantities)
+
+     volume_mix = np.sum([quantities[i] for i in range(len(ingredients)) if ingredients[i] != 'mint'])
+
+     # compute alcohol content without the mint ingredient
+     ingredients2 = [ing for ing in ingredients if ing != 'mint']
+     quantities2 = [q for ing, q in zip(ingredients, quantities) if ing != 'mint']
+     weights2 = quantities2 / np.sum(quantities2)
+     assert np.abs(np.sum(weights2) - 1) < 1e-4
+     ing_ids2 = np.array([ingredient2ingredient_id[ing] for ing in ingredients2])
+     alcohol = np.array([ingredient_profiles['ethanol'][ing_id] for ing_id in ing_ids2])
+     alcohol = np.dot(alcohol, weights2)
+     dilution_ratio = get_dilution_ratio(category, alcohol)
+     end_volume = volume_mix + volume_mix * dilution_ratio
+     volume_ratio = volume_mix / end_volume
+     end_alcohol = alcohol * volume_ratio
+
+     # computes the representation of a cocktail from the recipe (ingredients, quantities) and volume
+     weights = quantities / np.sum(quantities)
+     assert np.abs(np.sum(weights) - 1) < 1e-4
+     ing_ids = np.array([ingredient2ingredient_id[ing] for ing in ingredients])
+     reps = ing_reps[ing_ids]
+     cocktail_rep = np.dot(weights, reps)
+     i_complex = keys.index('end complex')
+     cocktail_rep[i_complex] = np.mean(reps[:, i_complex]) + len(ing_ids)  # complexity increases with the number of ingredients
+
+     # compute the profile after dilution
+     cocktail_rep = cocktail_rep * volume_ratio
+     cocktail_rep = np.concatenate([[end_volume], cocktail_rep])
+     return cocktail_rep, end_volume, end_alcohol
+
+ def get_profile(category, ingredients, quantities):
+     volume_mix = np.sum([quantities[i] for i in range(len(ingredients)) if ingredients[i] != 'mint'])
+     alcohol, sugar, acid = get_mix_profile(ingredients, quantities)
+     dilution_ratio = get_dilution_ratio(category, alcohol)
+     end_volume = volume_mix + volume_mix * dilution_ratio
+     volume_ratio = volume_mix / end_volume
+     profile = {'mix volume': volume_mix,
+                'mix alcohol': alcohol,
+                'mix sugar': sugar,
+                'mix acid': acid,
+                'dilution ratio': dilution_ratio,
+                'end volume': end_volume,
+                'end alcohol': alcohol * volume_ratio,
+                'end sugar': sugar * volume_ratio,
+                'end acid': acid * volume_ratio}
+     cocktail_rep = compute_cocktail_representation(profile, ingredients, quantities)
+     profile.update(cocktail_rep)
+     return profile
+
+ profile_keys = ['mix volume', 'end volume',
+                 'dilution ratio',
+                 'mix alcohol', 'end alcohol',
+                 'mix sugar', 'end sugar',
+                 'mix acid', 'end acid'] \
+                + representation_keys \
+                + ['end ' + k for k in representation_keys]
+
+ def update_profile_in_datapoint(datapoint, category, ingredients, quantities):
+     profile = get_profile(category, ingredients, quantities)
+     for k in profile_keys:
+         datapoint[k] = profile[k]
+     return datapoint
+
+ # define representation keys
+ def get_bunch_of_rep_keys():
+     dict_rep_keys = dict()
+     # all
+     dict_rep_keys['all'] = profile_keys
+     # only_end
+     dict_rep_keys['only_end'] = [k for k in profile_keys if 'end' in k]
+     # except_end
+     dict_rep_keys['except_end'] = [k for k in profile_keys if 'end' not in k]
+     # custom
+     to_remove = ['end alcohol', 'end sugar', 'end acid', 'end pH', 'end strong']
+     rep_keys = [k for k in profile_keys if 'end' in k]
+     for k in to_remove:
+         if k in rep_keys:
+             rep_keys.remove(k)
+     dict_rep_keys['custom'] = rep_keys
+     # custom restricted
+     to_remove = ['end alcohol', 'end sugar', 'end acid', 'end pH', 'end strong', 'end spicy', 'end oaky']
+     rep_keys = [k for k in profile_keys if 'end' in k]
+     for k in to_remove:
+         if k in rep_keys:
+             rep_keys.remove(k)
+     dict_rep_keys['restricted'] = rep_keys
+     dict_rep_keys['affective'] = ['end booze', 'end sweet', 'end sour', 'end fizzy', 'end complex', 'end bitter', 'end spicy', 'end colorful']
+     return dict_rep_keys
src/cocktails/utilities/glass_and_volume_utilities.py ADDED
@@ -0,0 +1,42 @@
+ glass_conversion = {'coupe': 'coupe',
+                     'martini': 'martini',
+                     'collins': 'collins',
+                     'oldfashion': 'oldfashion',
+                     'Coupe glass': 'coupe',
+                     'Old-fashioned glass': 'oldfashion',
+                     'Martini glass': 'martini',
+                     'Nick & Nora glass': 'coupe',
+                     'Julep tin': 'oldfashion',
+                     'Collins or Pineapple shell glass': 'collins',
+                     'Collins glass': 'collins',
+                     'Rocks glass': 'oldfashion',
+                     'Highball (max 10oz/300ml)': 'collins',
+                     'Wine glass': 'coupe',
+                     'Flute glass': 'coupe',
+                     'Double old-fashioned': 'oldfashion',
+                     'Copa glass': 'coupe',
+                     'Toddy glass': 'oldfashion',
+                     'Sling glass': 'collins',
+                     'Goblet glass': 'oldfashion',
+                     'Fizz or Highball (8oz to 10oz)': 'collins',
+                     'Copper mug or Collins glass': 'collins',
+                     'Tiki mug or collins': 'collins',
+                     'Snifter glass': 'oldfashion',
+                     'Coconut shell or Collins glass': 'collins',
+                     'Martini (large 10oz) glass': 'martini',
+                     'Hurricane glass': 'collins',
+                     'Absinthe glass or old-fashioned glass': 'oldfashion'}
+
+ # glass capacities in mL
+ glass_volume = dict(coupe=200,
+                     collins=350,
+                     martini=200,
+                     oldfashion=320)
+ assert set(glass_conversion.values()) == set(glass_volume.keys())
+
+ # typical end-volume ranges (mL) per preparation type
+ volume_ranges = dict(stirred=(90, 97),
+                      built=(70, 75),
+                      shaken=(98, 112),
+                      egg_shaken=(130, 143),
+                      carbonated=(150, 150))
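The conversion table collapses the scraped glass names onto four canonical glasses, and the assert above guarantees the capacity table covers each of them. A quick check using only the values defined above:

print(glass_conversion['Nick & Nora glass'])              # 'coupe'
print(glass_volume[glass_conversion['Hurricane glass']])  # 350: hurricane glasses map to 'collins'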
src/cocktails/utilities/ingredients_utilities.py ADDED
@@ -0,0 +1,209 @@
+ # This script loads the list and profiles of our ingredient selection.
+ # It defines rules to recognize ingredients from the list in recipes, and the function to extract that information from ingredient strings.
+
+ import pandas as pd
+ from src.cocktails.config import INGREDIENTS_LIST_PATH, COCKTAILS_CSV_DATA
+ import numpy as np
+
+ ingredient_profiles = pd.read_csv(INGREDIENTS_LIST_PATH)
+ ingredient_list = [ing.lower() for ing in ingredient_profiles['ingredient']]
+ n_ingredients = len(ingredient_list)
+ ingredient2ingredient_id = dict(zip(ingredient_list, range(n_ingredients)))
+
+ ingredients_types = sorted(set(ingredient_profiles['type']))
+ # for each type, get all ingredients
+ ing_per_type = [[ing for ing in ingredient_list if ingredient_profiles['type'][ingredient_list.index(ing)] == type] for type in ingredients_types]
+ ingredients_per_type = dict(zip(ingredients_types, ing_per_type))
+
+ bubble_ingredients = ['soda', 'ginger beer', 'tonic', 'sparkling wine']
+ # Rules to recognize ingredients in recipes.
+ # The top-level entries of each list are separate rules with an OR relation: only one needs to be satisfied.
+ # Within a nested list, rules apply with an AND relation: all of them need to be satisfied.
+ # ~ indicates that the following expression must NOT appear;
+ # a plain expression indicates that the expression MUST appear.
+ ingredient_search = {#'salt': ['salt'],
+                      'lime juice': [['lime', '~soda', '~lemonade', '~cordial']],
+                      'lemon juice': [['lemon', '~soda', '~lemonade']],
+                      'angostura': [['angostura', '~orange'],
+                                    ['bitter', '~campari', '~orange', '~red', '~italian', '~fernet']],
+                      'orange bitters': [['orange', 'bitter', '~bittersweet']],
+                      'orange juice': [['orange', '~bitter', '~jam', '~marmalade', '~liqueur', '~water'],
+                                       ['orange', 'squeeze']],
+                      'pineapple juice': [['pineapple']],
+                      # 'apple juice': [['apple', 'juice', '~pine']],
+                      'cranberry juice': [['cranberry', 'juice']],
+                      'cointreau': ['cointreau', 'triple sec', 'grand marnier', 'curaçao', 'curacao'],
+                      'luxardo maraschino': ['luxardo', 'maraschino', 'kirsch'],
+                      'amaretto': ['amaretto'],
+                      'benedictine': ['benedictine', 'bénédictine', 'bénedictine', 'benédictine'],
+                      'campari': ['campari', ['italian', 'red', 'bitter'], 'aperol', 'bittersweet', 'aperitivo', 'orange-red'],
+                      # 'campari': ['campari', ['italian', 'red', 'bitter']],
+                      # 'crème de violette': [['violette', 'crème'], ['crême', 'violette'], ['liqueur', 'violette']],
+                      # 'aperol': ['aperol', 'bittersweet', 'aperitivo', 'orange-red'],
+                      'green chartreuse': ['chartreuse'],
+                      'black raspberry liqueur': [['cassis', 'liqueur'],
+                                                  ['black raspberry', 'liqueur'],
+                                                  ['raspberry', 'liqueur'],
+                                                  ['strawberry', 'liqueur'],
+                                                  ['blackberry', 'liqueur'],
+                                                  ['violette', 'crème'], ['crême', 'violette'], ['liqueur', 'violette']],
+                      # 'simple syrup': [],
+                      # 'drambuie': ['drambuie'],
+                      # 'fernet branca': ['fernet', 'branca'],
+                      'gin': [['gin', '~sloe', '~ginger']],
+                      'vodka': ['vodka'],
+                      'cuban rum': [['rum', 'puerto rican'], ['light', 'rum'], ['white', 'rum'], ['rum', 'havana', '~7'], ['rum', 'bacardi']],
+                      'cognac': [['cognac', '~grand marnier', '~cointreau', '~orange']],
+                      # 'bourbon': [['bourbon', '~liqueur']],
+                      # 'tequila': ['tequila', 'pisco'],
+                      # 'tequila': ['tequila'],
+                      'scotch': ['scotch'],
+                      'dark rum': [['rum', 'age', '~bacardi', '~havana'],
+                                   ['rum', 'dark', '~bacardi', '~havana'],
+                                   ['rum', 'old', '~bacardi', '~havana'],
+                                   ['rum', 'old', '7'],
+                                   ['rum', 'havana', '7'],
+                                   ['havana', 'rum', 'especial']],
+                      'absinthe': ['absinthe'],
+                      'rye whiskey': ['rye', ['bourbon', '~liqueur']],
+                      # 'rye whiskey': ['rye'],
+                      'apricot brandy': [['apricot', 'brandy']],
+                      # 'pisco': ['pisco'],
+                      # 'cachaça': ['cachaça', 'cachaca'],
+                      'egg': [['egg', 'white', '~yolk', '~whole']],
+                      'soda': [['soda', 'water', '~lemon', '~lime']],
+                      'mint': ['mint'],
+                      'sparkling wine': ['sparkling wine', 'prosecco', 'champagne'],
+                      'ginger beer': [['ginger', 'beer'], ['ginger', 'ale']],
+                      'tonic': [['tonic'], ['7up'], ['sprite']],
+                      # 'espresso': ['espresso', 'expresso', ['café', '~liqueur', '~cream'],
+                      #              ['cafe', '~liqueur', '~cream'],
+                      #              ['coffee', '~liqueur', '~cream']],
+                      # 'southern comfort': ['southern comfort'],
+                      # 'cola': ['cola', 'coke', 'pepsi'],
+                      'double syrup': [['sugar', '~raspberry'], ['simple', 'syrup'], ['double', 'syrup']],
+                      # 'grenadine': ['grenadine', ['pomegranate', 'syrup']],
+                      'grenadine': ['grenadine', ['pomegranate', 'syrup'], ['raspberry', 'syrup', '~black']],
+                      'honey syrup': ['honey', ['maple', 'syrup']],
+                      # 'raspberry syrup': [['raspberry', 'syrup', '~black']],
+                      'dry vermouth': [['vermouth', 'dry'], ['vermouth', 'white'], ['vermouth', 'french'], 'lillet'],
+                      'sweet vermouth': [['vermouth', 'sweet'], ['vermouth', 'red'], ['vermouth', 'italian']],
+                      # 'lillet blanc': ['lillet'],
+                      'water': [['water', '~sugar', '~coconut', '~soda', '~tonic', '~honey', '~orange', '~melon']]
+                      }
+ # check that there is a rule for every ingredient in the list
+ assert sorted(ingredient_list) == sorted(ingredient_search.keys()), 'ing search dict keys do not match ingredient list'
+
+ def get_ingredients_info():
+     data = pd.read_csv(COCKTAILS_CSV_DATA)
+     max_ingredients, ingredient_set, liquor_set, liqueur_set, vermouth_set = get_max_n_ingredients(data)
+     ingredient_list = sorted(ingredient_set)
+     alcohol = sorted(liquor_set.union(liqueur_set).union(vermouth_set).union(set(['sparkling wine'])))
+     ind_alcohol = [i for i in range(len(ingredient_list)) if ingredient_list[i] in alcohol]
+     return max_ingredients, ingredient_list, ind_alcohol
+
+ def get_max_n_ingredients(data):
+     max_count = 0
+     ingredient_set = set()
+     alcohol_set = set()
+     liqueur_set = set()
+     vermouth_set = set()
+     ing_str = np.array(data['ingredients_str'])
+     for i in range(len(data['names'])):
+         ingredients, quantities = extract_ingredients(ing_str[i])
+         max_count = max(max_count, len(ingredients))
+         for ing in ingredients:
+             ingredient_set.add(ing)
+             if ing in ingredients_per_type['liquor']:
+                 alcohol_set.add(ing)
+             if ing in ingredients_per_type['liqueur']:
+                 liqueur_set.add(ing)
+             if ing in ingredients_per_type['vermouth']:
+                 vermouth_set.add(ing)
+     return max_count, ingredient_set, alcohol_set, liqueur_set, vermouth_set
+
+ def find_ingredient_from_str(ing_str):
+     # assigns an ingredient string to one of the ingredients of the list if possible, following the rules defined above.
+     # returns a flag and the matched ingredient: the flag is True when the match is anomalous
+     # (several rules matched, or none did, in which case the ingredient is None).
+     ing_str = ing_str.lower()
+     flags = []
+     for k in ingredient_list:
+         or_flags = []  # one flag per OR condition
+         for i_p, pattern in enumerate(ingredient_search[k]):
+             or_flags.append(True)
+             if isinstance(pattern, str):
+                 if pattern[0] == '~' and pattern[1:] in ing_str:
+                     or_flags[-1] = False
+                 elif pattern[0] != '~' and pattern not in ing_str:
+                     or_flags[-1] = False
+             elif isinstance(pattern, list):
+                 for element in pattern:
+                     if element[0] == '~':
+                         or_flags[-1] = or_flags[-1] and not element[1:] in ing_str
+                     else:
+                         or_flags[-1] = or_flags[-1] and element in ing_str
+             else:
+                 raise ValueError
+         flags.append(any(or_flags))
+     if sum(flags) > 1:
+         print(ing_str)
+         for i_f, f in enumerate(flags):
+             if f:
+                 print(ingredient_list[i_f])
+         return True, ingredient_list[flags.index(True)]
+     elif sum(flags) == 0:
+         return True, None
+     else:
+         return False, ingredient_list[flags.index(True)]
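To illustrate the rule mini-language (a sketch; the outputs assume the rules exactly as listed above):

# 'dark rum' includes the rule ['rum', 'age', '~bacardi', '~havana']:
# 'rum' and 'age' must both appear, and neither 'bacardi' nor 'havana' may.
print(find_ingredient_from_str('Aged Jamaican rum'))
# -> (False, 'dark rum'): exactly one rule set matched, so the flag is False

print(find_ingredient_from_str('Havana Club aged rum'))
# -> (False, 'cuban rum'): 'havana' disqualifies the dark-rum rules (absent a '7'),
#    but satisfies the cuban-rum rule ['rum', 'havana', '~7']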
+
+ def get_cocktails_per_ingredient(ing_strs):
+     cocktails_per_ing = dict(zip(ingredient_list, [[] for _ in range(len(ingredient_list))]))
+     for i_ing, ing_str in enumerate(ing_strs):
+         ingredients, _ = extract_ingredients(ing_str)
+         for ing in ingredients:
+             cocktails_per_ing[ing].append(i_ing)
+     return cocktails_per_ing
+
+ def extract_ingredients(ingredient_str):
+     # extract the lists of ingredients and quantities from a formatted ingredient string (reverse of format_ingredients)
+     ingredient_str = ingredient_str[1:-1]
+     words = ingredient_str.split(',')
+     ingredients = []
+     quantities = []
+     for i in range(len(words) // 2):
+         ingredients.append(words[2 * i][1:])
+         quantities.append(float(words[2 * i + 1][:-1]))
+     return ingredients, quantities
+
+ def format_ingredients(ingredients, quantities):
+     # format an ingredient string from the lists of ingredients and quantities (reverse of extract_ingredients)
+     out = '['
+     for ing, q in zip(ingredients, quantities):
+         if ing[-1] == ' ':
+             ingre = ing[:-1]
+         else:
+             ingre = ing
+         out += f'({ingre},{q}),'
+     out = out[:-1] + ']'
+     return out
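The serialized format produced and consumed by these two helpers is '[(ingredient,quantity),...]'; a quick round trip:

s = format_ingredients(['gin', 'lime juice', 'double syrup'], [50.0, 25.0, 15.0])
print(s)                       # [(gin,50.0),(lime juice,25.0),(double syrup,15.0)]
print(extract_ingredients(s))  # (['gin', 'lime juice', 'double syrup'], [50.0, 25.0, 15.0])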
+
+ def get_ingredient_count(data):
+     # count the occurrences of each ingredient in the whole dataset
+     ingredient_counts = dict(zip(ingredient_list, [0] * len(ingredient_list)))
+     for i in range(len(data['names'])):
+         if data['to_keep'][i]:
+             ingredients, _ = extract_ingredients(data['ingredients_str'][i])
+             for ing in ingredients:
+                 ingredient_counts[ing] += 1
+     return ingredient_counts
+
+ def add_counts_to_ingredient_list(data):
+     # update the list of ingredients to add their count of occurrences in the dataset.
+     ingredient_counts = get_ingredient_count(data)
+     counts = [ingredient_counts[k] for k in ingredient_list]
+     ingredient_profiles['counts'] = counts
+     ingredient_profiles.to_csv(INGREDIENTS_LIST_PATH, index=False)
src/cocktails/utilities/other_scrubbing_utilities.py ADDED
@@ -0,0 +1,240 @@
+ import numpy as np
+ import pickle
+ from src.cocktails.utilities.cocktail_utilities import get_profile, profile_keys
+ from src.cocktails.utilities.ingredients_utilities import extract_ingredients, ingredient_list, ingredient_profiles
+ from src.cocktails.utilities.glass_and_volume_utilities import glass_volume, volume_ranges
+
+ # unit conversions, in mL
+ one_dash = 1
+ one_splash = 6
+ one_tablespoon = 15
+ one_barspoon = 5
+ fill_rate = 0.8
+ quantity_factors = {'ml': 1,
+                     'cl': 10,
+                     'splash': one_splash,
+                     'splashes': one_splash,
+                     'dash': one_dash,
+                     'dashes': one_dash,
+                     'spoon': one_barspoon,
+                     'spoons': one_barspoon,
+                     'tablespoon': one_tablespoon,
+                     'barspoons': one_barspoon,
+                     'barspoon': one_barspoon,
+                     'bar spoons': one_barspoon,
+                     'bar spoon': one_barspoon,
+                     'tablespoons': one_tablespoon,
+                     'teaspoon': 5,
+                     'teaspoons': 5,
+                     'drop': 0.05,
+                     'drops': 0.05}
+ # unit names sorted by decreasing length, presumably so that longer names
+ # like 'tablespoons' are matched before substrings like 'spoon'
+ quantity_keys = sorted(quantity_factors.keys())
+ indexes_keys = np.flip(np.argsort([len(k) for k in quantity_keys]))
+ quantity_factors_keys = list(np.array(quantity_keys)[indexes_keys])
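A small sketch of the unit conversion these factors enable (values straight from the table above):

q, unit = 2, 'dashes'
print(q * quantity_factors[unit])  # 2: two dashes at 1 mL each
q, unit = 1.5, 'cl'
print(q * quantity_factors[unit])  # 15.0 mL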
33
+
+keys_to_track = ['names', 'urls', 'glass', 'garnish', 'recipe', 'how_to', 'review', 'taste_rep', 'valid']
+keys_to_add = ['category', 'subcategory', 'ingredients_str', 'ingredients', 'quantities', 'to_keep']
+keys_to_update = ['glass']
+keys_for_csv = ['names', 'category', 'subcategory', 'ingredients_str', 'urls', 'glass', 'garnish', 'how_to', 'review', 'taste_rep'] + profile_keys
+
+to_replace_q = {' fresh': ''}
+to_replace_ing = {'maple syrup': 'honey syrup',
+                  'agave syrup': 'honey syrup',
+                  'basil': 'mint'}
+
+def print_recipe(unit='mL', ingredient_str=None, ingredients=None, quantities=None, name='', cat='', to_print=True):
+    str_out = ''
+    if ingredient_str is None:
+        assert len(ingredients) == len(quantities), 'provide either ingredient_str, or lists of ingredients and quantities'
+    else:
+        assert ingredients is None and quantities is None, 'provide either ingredient_str, or lists of ingredients and quantities'
+        ingredients, quantities = extract_ingredients(ingredient_str)
+
+    str_out += '\nRecipe:'
+    if name != '' and name is not None: str_out += f' {name}'
+    if cat != '': str_out += f' ({cat})'
+    str_out += '\n'
+    for i in range(len(ingredients)):
+        # pick a quantifier (unit suffix) for each ingredient
+        if ingredients[i] == 'egg':
+            quantities[i] = 1
+            ingredients[i] = 'egg white'
+            if unit == 'mL':
+                quantifier = ' (30 mL)'
+            elif unit == 'oz':
+                quantifier = ' (1 fl oz)'
+            else:
+                raise ValueError
+        elif ingredients[i] in ['angostura', 'orange bitters']:
+            quantities[i] = max(1, int(quantities[i] / 0.6))
+            quantifier = ' dash'
+            if quantities[i] > 1: quantifier += 'es'
+        elif ingredients[i] == 'mint':
+            if quantities[i] > 1: quantifier = ' leaves'
+            else: quantifier = ' leaf'
+        else:
+            if unit == 'oz':
+                quantities[i] = float(f'{quantities[i] * 0.033814:.3f}')  # convert mL to fl oz
+                quantifier = ' fl oz'
+            else:
+                quantifier = ' mL'
+        str_out += f'  {quantities[i]}{quantifier} - {ingredients[i]}\n'
+
+    if to_print:
+        print(str_out)
+    return str_out
+
+
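A hedged usage sketch for `print_recipe`: pass either a raw `ingredient_str` or pre-extracted lists, but not both. The string variable below is hypothetical and its format is whatever `extract_ingredients` expects:

    print_recipe(ingredient_str=some_ingredient_str, name='Daiquiri')  # parses, prints and returns the text
    recipe_text = print_recipe(unit='oz', ingredients=['white rum', 'lime juice'],
                               quantities=[45, 25], to_print=False)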
+def test_datapoint(datapoint, category, ingredients, quantities):
+    # run sanity checks on a scrubbed datapoint
+    ingredient_indexes = [ingredient_list.index(ing) for ing in ingredients]
+    profile = get_profile(category, ingredients, quantities)
+    volume = profile['end volume']
+    alcohol = profile['end alcohol']
+    acid = profile['end acid']
+    sugar = profile['end sugar']
+    # check volume: rescale quantities if the drink overflows its glass
+    if datapoint['glass'] is not None:
+        if volume > glass_volume[datapoint['glass']] * fill_rate:
+            ratio = fill_rate * glass_volume[datapoint['glass']] / volume
+            for i_q in range(len(quantities)):
+                quantities[i_q] = float(f'{quantities[i_q] * ratio:.2f}')
+    # check the alcohol, acid and sugar balance
+    assert alcohol < 30, 'too boozy'
+    assert alcohol > 5, 'not boozy enough'
+    assert acid < 2, 'too much acid'
+    assert sugar < 20, 'too much sugar'
+    assert len(ingredients) > 1, 'only one ingredient'
+    # merge a duplicated ingredient when the duplicate is harmless, else reject
+    if len(set(ingredients)) != len(ingredients):
+        i_doubles = []
+        s_ing = set()
+        for i, ing in enumerate(ingredients):
+            if ing in s_ing:
+                i_doubles.append(i)
+            else:
+                s_ing.add(ing)
+        ingredient_double_ok = ['mint', 'cointreau', 'lemon juice', 'cuban rum', 'double syrup']
+        if len(i_doubles) == 1 and ingredients[i_doubles[0]] in ingredient_double_ok:
+            ing_double = ingredients[i_doubles[0]]
+            double_q = np.sum([quantities[i] for i in range(len(ingredients)) if ingredients[i] == ing_double])
+            ingredients.pop(i_doubles[0])
+            quantities.pop(i_doubles[0])
+            quantities[ingredients.index(ing_double)] = double_q
+        else:
+            assert False, f'double ingredient, not {ingredient_double_ok}'
+    lemon_lime_q = np.sum([quantities[i] for i in range(len(ingredients)) if ingredients[i] in ['lime juice', 'lemon juice']])
+    assert lemon_lime_q <= 45, 'too much lemon and lime'
+    salt_q = np.sum([quantities[i] for i in range(len(ingredients)) if ingredients[i] == 'salt'])
+    assert salt_q <= 8, 'too much salt'
+    bitter_q = np.sum([quantities[i] for i in range(len(ingredients)) if ingredients[i] in ['angostura', 'orange bitters']])
+    assert bitter_q <= 5 * one_dash, 'too much bitters'
+    absinthe_q = np.sum([quantities[i] for i in range(len(ingredients)) if ingredients[i] == 'absinthe'])
+    if absinthe_q > 4 * one_dash:
+        mix_volume = np.sum([quantities[i] for i in range(len(ingredients)) if ingredients[i] != 'mint'])
+        assert absinthe_q < 0.5 * mix_volume, 'filter absinthe glasses'
+    # reject hot drinks (unless served as a shot)
+    if any([w in datapoint['how_to'] or any([w in ing.lower() for ing in datapoint['recipe'][1]]) for w in ['warm', 'boil', 'hot']]) and 'shot' not in datapoint['how_to']:
+        assert False, 'hot drink'
+    water_q = np.sum([quantities[i] for i in range(len(ingredients)) if ingredients[i] == 'water'])
+    assert water_q < 40, 'too much water'
+    # n_liqueur = np.sum([ingredient_profiles['type'][i].lower() == 'liqueur' for i in ingredient_indexes])
+    # assert n_liqueur <= 2
+    n_liqueur_and_vermouth = np.sum([ingredient_profiles['type'][i].lower() in ['liqueur', 'vermouth'] for i in ingredient_indexes])
+    assert n_liqueur_and_vermouth <= 3, 'too many liqueurs and vermouths'
+    return ingredients, quantities
+
+def run_battery_checks_difford(datapoint, category, ingredients, quantities):
+    flag = False
+    try:
+        ingredients, quantities = test_datapoint(datapoint, category, ingredients, quantities)
+    except Exception:
+        flag = True
+        print(datapoint['names'])
+        print(datapoint['urls'])
+        ingredients, quantities = None, None
+
+    return flag, ingredients, quantities
+
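A sketch of the scrubbing loop these checks are written for; the datapoint fields ('to_keep' in particular) are assumed from the key lists above:

    flag, ingredients, quantities = run_battery_checks_difford(datapoint, category, ingredients, quantities)
    if flag:
        datapoint['to_keep'] = False  # drop recipes that fail any of the sanity checks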
+def tambouille(q, ingredients_scrubbed, quantities_scrubbed, cat):
+    # ad hoc fix-ups for quantity strings the generic parser cannot handle
+    ing_scrubbed = ingredients_scrubbed[len(quantities_scrubbed)]
+    if q == '4 cube' and ing_scrubbed == 'pineapple juice':
+        q = '20 ml'
+    elif 'top up with' in q:
+        volume_so_far = np.sum([quantities_scrubbed[i] for i in range(len(quantities_scrubbed)) if ingredients_scrubbed[i] != 'mint'])
+        volume_mix = np.sum(volume_ranges[cat]) / 2
+        if (volume_mix - volume_so_far) < 15:
+            q = '15 ml'
+        else:
+            q = str(int(volume_mix - volume_so_far)) + ' ml'
+    elif q == '1 pinch' and ing_scrubbed == 'salt':
+        q = '2 drops'
+    elif 'cube' in q and ing_scrubbed == 'double syrup':
+        q = f'{float(q.split(" ")[0]) * 2 * 1.7:.2f} ml'  # 2 g per cube, 1.7 is the solid / syrup ratio
+    elif 'wedge' in q:
+        if ing_scrubbed == 'orange juice':
+            vol = 70
+        elif ing_scrubbed == 'lime juice':
+            vol = 30
+        elif ing_scrubbed == 'lemon juice':
+            vol = 45
+        elif ing_scrubbed == 'pineapple juice':
+            vol = 140
+        else:
+            raise ValueError(f'unhandled wedge ingredient: {ing_scrubbed}')
+        factor = float(q.split(' ')[0]) * 0.15  # consider a wedge to be 0.15 * the fruit
+        q = f'{factor * vol:.2f} ml'
+    elif 'slice' in q:
+        if ing_scrubbed == 'orange juice':
+            vol = 70
+        elif ing_scrubbed == 'lime juice':
+            vol = 30
+        elif ing_scrubbed == 'lemon juice':
+            vol = 45
+        elif ing_scrubbed == 'pineapple juice':
+            vol = 140
+        else:
+            raise ValueError(f'unhandled slice ingredient: {ing_scrubbed}')
+        f = q.split(' ')[0]
+        if len(f.split('⁄')) > 1:  # unicode fraction slash, e.g. '1⁄2'
+            frac = f.split('⁄')
+            factor = float(frac[0]) / float(frac[1])
+        else:
+            factor = float(f)
+        factor *= 0.1  # consider a slice to be 0.1 * the fruit
+        q = f'{factor * vol:.2f} ml'
+    elif q == '1 whole' and ing_scrubbed == 'luxardo maraschino':
+        q = '10 ml'
+    elif ing_scrubbed == 'egg' and 'ml' not in q:
+        q = f'{float(q) * 30:.2f} ml'  # 30 ml per egg
+    return q
+
+
+def compute_eucl_dist(a, b):
+    return np.sqrt(np.sum((a - b)**2))
+
+def evaluate_with_quadruplets(representations, strategy='all'):
+    with open(QUADRUPLETS_PATH, 'rb') as f:
+        data = pickle.load(f)
+    data = list(data.values())
+    quadruplets = []
+    if strategy == 'all':
+        for d in data:
+            quadruplets.append(d[1:])
+    else:
+        for d in data:
+            if d[0] == strategy:
+                quadruplets.append(d[1:])
+
+    scores = []
+    for q in quadruplets:
+        close = q[0]
+        if len(close) == 2:
+            far = q[1]
+            distance_close = compute_eucl_dist(representations[close[0]], representations[close[1]])
+            distances_far = [compute_eucl_dist(representations[far[i][0]], representations[far[i][1]]) for i in range(len(far))]
+            scores.append(distance_close < np.min(distances_far))
+    if len(scores) == 0:
+        score = np.nan
+    else:
+        score = np.mean(scores)
+    return score
+
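A hedged usage sketch for `evaluate_with_quadruplets`: `representations` must be indexable by the identifiers stored in the quadruplets (assumed here to be integer indexes into the cocktail representation matrix); the returned score is the fraction of quadruplets whose 'close' pair is nearer than every 'far' pair:

    from src.cocktails.config import FULL_COCKTAIL_REP_PATH
    reps = np.loadtxt(FULL_COCKTAIL_REP_PATH)  # reps[i] -> representation of cocktail i
    score = evaluate_with_quadruplets(reps, strategy='all')
    print(f'quadruplet accuracy: {score:.2f}')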
+
src/debugger.py ADDED
@@ -0,0 +1,180 @@
+import os
+import pickle
+import time
+import numpy as np
+import pandas as pd
+import pretty_midi as pm
+from sklearn.neighbors import NearestNeighbors
+
+# from src.music.data_collection.is_audio_solo_piano import calculate_piano_solo_prob
+from src.music.utils import load_audio, get_all_subfiles_with_extension
+from src.music.config import FPS, MUSIC_REP_PATH, MUSIC_NN_PATH
+from src.cocktails.config import FULL_COCKTAIL_REP_PATH, COCKTAIL_NN_PATH, COCKTAILS_CSV_DATA
+# from src.cocktails.pipeline.get_affect2affective_cluster import get_affective_cluster_centers
+from src.cocktails.utilities.other_scrubbing_utilities import print_recipe
+
+keyword = 'b256_r128_represented'
+
+def load_reps(rep_path, sample_size=None):
+    if sample_size:
+        with open(rep_path + f'all_reps_unnormalized_sample{sample_size}.pickle', 'rb') as f:
+            data = pickle.load(f)
+    else:
+        with open(rep_path + 'music_reps_unnormalized.pickle', 'rb') as f:
+            data = pickle.load(f)
+    reps = data['reps']
+    # playlists = [r.split(f'_{keyword}')[0].split('/')[-1] for r in data['paths']]
+    playlists = [r.split(keyword)[1].split('/')[1] for r in data['paths']]
+    n_data, dim_data = reps.shape
+    return reps, data['paths'], playlists, n_data, dim_data
+
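The constructor below builds its nearest-neighbour indexes once and caches them to disk; a minimal standalone sketch of the same sklearn pattern (the toy array is an assumption):

    from sklearn.neighbors import NearestNeighbors
    import numpy as np

    reps = np.random.randn(100, 128)                    # toy stand-in for the music representations
    nn = NearestNeighbors(n_neighbors=6, metric='cosine')
    nn.fit(reps)
    dists, idx = nn.kneighbors(reps[0].reshape(1, -1))  # first hit is the query itself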
+class Debugger:
+    def __init__(self, verbose=True):
+        if verbose: print('Setting up debugger.')
+        # music nearest-neighbor index: build and cache it, or load the cached version
+        if not os.path.exists(MUSIC_NN_PATH):
+            reps_path = MUSIC_REP_PATH + 'music_reps_unnormalized.pickle'
+            if not os.path.exists(reps_path):
+                all_rep_path = get_all_subfiles_with_extension(MUSIC_REP_PATH, max_depth=3, extension='.txt', current_depth=0)
+                all_data = []
+                new_all_rep_path = []
+                for i_r, r in enumerate(all_rep_path):
+                    if 'mean_std' not in r:
+                        all_data.append(np.loadtxt(r))
+                        assert len(all_data[-1]) == 128
+                        new_all_rep_path.append(r)
+                data = np.array(all_data)
+                to_save = dict(reps=data,
+                               paths=new_all_rep_path)
+                with open(reps_path, 'wb') as f:
+                    pickle.dump(to_save, f)
+
+            reps, self.rep_paths, playlists, n_data, self.dim_rep_music = load_reps(MUSIC_REP_PATH)
+            self.nn_model_music = NearestNeighbors(n_neighbors=6, metric='cosine')
+            self.nn_model_music.fit(reps)
+            to_save = dict(nn_model=self.nn_model_music,
+                           rep_paths=self.rep_paths,
+                           dim_rep_music=self.dim_rep_music)
+            with open(MUSIC_NN_PATH, 'wb') as f:
+                pickle.dump(to_save, f)
+        else:
+            with open(MUSIC_NN_PATH, 'rb') as f:
+                data = pickle.load(f)
+            self.nn_model_music = data['nn_model']
+            self.rep_paths = data['rep_paths']
+            self.dim_rep_music = data['dim_rep_music']
+        if verbose: print(f'  {len(self.rep_paths)} songs, representation dim: {self.dim_rep_music}')
+        self.rep_paths = np.array(self.rep_paths)
+        # cocktail nearest-neighbor index: same build-or-load pattern
+        if not os.path.exists(COCKTAIL_NN_PATH):
+            cocktail_reps = np.loadtxt(FULL_COCKTAIL_REP_PATH)
+            # cocktail_reps = (cocktail_reps - cocktail_reps.mean(axis=0)) / cocktail_reps.std(axis=0)
+            self.nn_model_cocktail = NearestNeighbors(n_neighbors=6)
+            self.nn_model_cocktail.fit(cocktail_reps)
+            self.dim_rep_cocktail = cocktail_reps.shape[1]
+            self.n_cocktails = cocktail_reps.shape[0]
+            to_save = dict(nn_model=self.nn_model_cocktail,
+                           dim_rep_cocktail=self.dim_rep_cocktail,
+                           n_cocktails=self.n_cocktails)
+            with open(COCKTAIL_NN_PATH, 'wb') as f:
+                pickle.dump(to_save, f)
+        else:
+            with open(COCKTAIL_NN_PATH, 'rb') as f:
+                data = pickle.load(f)
+            self.nn_model_cocktail = data['nn_model']
+            self.dim_rep_cocktail = data['dim_rep_cocktail']
+            self.n_cocktails = data['n_cocktails']
+        if verbose: print(f'  {self.n_cocktails} cocktails, representation dim: {self.dim_rep_cocktail}')
+
+        self.cocktail_data = pd.read_csv(COCKTAILS_CSV_DATA)
+        # self.affective_cluster_centers = get_affective_cluster_centers()
+        self.keys_to_print = ['mse_reconstruction', 'nearest_cocktail_recipes', 'nearest_cocktail_urls',
+                              'nn_music_dists', 'nn_music', 'dim_rep', 'nb_notes', 'audio_len', 'piano_solo_prob', 'recipe_score', 'cocktail_rep']
+        # 'affect', 'affective_cluster_id', 'affective_cluster_center',
+
+    def get_nearest_songs(self, music_rep):
+        dists, indexes = self.nn_model_music.kneighbors(music_rep.reshape(1, -1))
+        indexes = indexes.flatten()[:5]
+        rep_paths = [r.split('/')[-1] for r in self.rep_paths[indexes]]
+        return rep_paths, dists.flatten()[:5].tolist()  # keep dists aligned with the 5 returned paths
+
+    def get_nearest_cocktails(self, cocktail_rep):
+        dists, indexes = self.nn_model_cocktail.kneighbors(cocktail_rep.reshape(1, -1))
+        indexes = indexes.flatten()
+        nn_names = np.array(self.cocktail_data['names'])[indexes].tolist()
+        nn_urls = np.array(self.cocktail_data['urls'])[indexes].tolist()
+        nn_recipes = [print_recipe(ingredient_str=ing_str, to_print=False) for ing_str in np.array(self.cocktail_data['ingredients_str'])[indexes]]
+        nn_ing_strs = np.array(self.cocktail_data['ingredients_str'])[indexes].tolist()
+        return indexes, nn_names, nn_urls, nn_recipes, nn_ing_strs
+
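A hedged sketch of querying the cocktail index directly, assuming a `Debugger` instance and a query vector of dimension `dim_rep_cocktail` (the zero vector is purely illustrative):

    debugger = Debugger(verbose=False)
    cocktail_rep = np.zeros(debugger.dim_rep_cocktail)  # hypothetical query vector
    indexes, names, urls, recipes, ing_strs = debugger.get_nearest_cocktails(cocktail_rep)
    print(names[0], urls[0])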
+    def extract_info(self, all_paths, affective_cluster_id, affect, cocktail_rep, music_reconstruction, recipe_score, verbose=False, level=0):
+        if verbose: print(' ' * level + 'Extracting debug info...')
+        init_time = time.time()
+        debug_dict = dict()
+        debug_dict['all_paths'] = all_paths
+        debug_dict['recipe_score'] = recipe_score
+
+        if all_paths['audio_path'] is not None:
+            # is it piano?
+            debug_dict['piano_solo_prob'] = None  # float(calculate_piano_solo_prob(all_paths['audio_path'])[0])
+            # how long is the audio?
+            (audio, _) = load_audio(all_paths['audio_path'], sr=FPS, mono=True)
+            debug_dict['audio_len'] = int(len(audio) / FPS)
+        else:
+            debug_dict['piano_solo_prob'] = None
+            debug_dict['audio_len'] = None
+
+        # how many notes?
+        midi = pm.PrettyMIDI(all_paths['processed_path'])
+        debug_dict['nb_notes'] = len(midi.instruments[0].notes)
+
+        # dimension of the music representation
+        representation = np.loadtxt(all_paths['representation_path'])
+        debug_dict['dim_rep'] = representation.shape[0]
+
+        # closest songs in the dataset
+        debug_dict['nn_music'], debug_dict['nn_music_dists'] = self.get_nearest_songs(representation)
+
+        # get affective cluster info
+        # debug_dict['affective_cluster_id'] = affective_cluster_id[0]
+        # debug_dict['affective_cluster_center'] = self.affective_cluster_centers[affective_cluster_id].flatten().tolist()
+        # debug_dict['affect'] = affect.flatten().tolist()
+        indexes, nn_names, nn_urls, nn_recipes, nn_ing_strs = self.get_nearest_cocktails(cocktail_rep)
+        debug_dict['cocktail_rep'] = cocktail_rep.copy().tolist()
+        debug_dict['nearest_cocktail_indexes'] = indexes.tolist()
+        debug_dict['nn_ing_strs'] = nn_ing_strs
+        debug_dict['nearest_cocktail_names'] = nn_names
+        debug_dict['nearest_cocktail_urls'] = nn_urls
+        debug_dict['nearest_cocktail_recipes'] = nn_recipes
+
+        debug_dict['music_reconstruction'] = music_reconstruction.tolist()
+        debug_dict['mse_reconstruction'] = ((music_reconstruction - representation) ** 2).mean()
+        self.debug_dict = debug_dict
+        if verbose: print(' ' * (level + 2) + f'Debug info extracted in {int(time.time() - init_time)} seconds.')
+
+        return self.debug_dict
+
+    def print_debug(self, level=0):
+        print(' ' * level + '__DEBUGGING INFO__')
+        for k in self.keys_to_print:
+            to_print = self.debug_dict[k]
+            if k == 'nearest_cocktail_recipes':
+                to_print = self.debug_dict[k].copy()
+                for i in range(len(to_print)):
+                    to_print[i] = to_print[i].replace('\n', '').replace('\t', '').replace('()', '')
+            if k == 'nn_music':
+                to_print = self.debug_dict[k].copy()
+                for i in range(len(to_print)):
+                    to_print[i] = to_print[i].replace('encoded_new_structured_', '').replace('_represented.txt', '')
+            to_print_str = f'{to_print}'
+            if isinstance(to_print, float):
+                to_print_str = f'{to_print:.2f}'
+            elif isinstance(to_print, list):
+                if isinstance(to_print[0], float):
+                    to_print_str = '['
+                    for element in to_print:
+                        to_print_str += f'{element:.2f}, '
+                    to_print_str = to_print_str[:-2] + ']'
+            print(' ' * (level + 2) + f'{k} : ' + to_print_str)
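
A hedged end-to-end sketch; the arguments are assumed to come from earlier pipeline stages (paths dict, affect outputs, cocktail representation, music reconstruction, recipe score):

    debugger = Debugger(verbose=True)
    debug_info = debugger.extract_info(all_paths, affective_cluster_id, affect,
                                       cocktail_rep, music_reconstruction, recipe_score)
    debugger.print_debug(level=0)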