File size: 3,012 Bytes
79a66df
 
bf8a52d
79a66df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import tensorflow as tf
from tensorflow import keras
from transformer import TransformerEncoder, TransformerDecoderLayer, get_look_ahead_mask, get_padding_mask

class TFPort(keras.Model):
    """Image + ingredient conditioned decoder that emits one non-negative scalar per target step.

    The model encodes an image with a frozen InceptionV3 backbone, encodes the
    ingredient sequence with a ``TransformerEncoder``, concatenates both along
    the sequence axis and lets a stack of ``TransformerDecoderLayer`` attend to
    the result while decoding the embedded targets.

    Args:
        crop_size: Tuple that is concatenated with ``(3,)`` to form the
            backbone's ``input_shape`` (presumably ``(height, width)``).
        embed_dim: Channel width of the 1x1 projection of the image features
            and of the embeddings fed to the decoder.
        encoder_layers: Forwarded to ``TransformerEncoder`` as its first argument.
        decoder_layers: Number of ``TransformerDecoderLayer`` instances to stack.
        input_seq_length: Forwarded to ``TransformerEncoder``.
        target_seq_length: Size of the position-embedding table for the targets.
        hidden_dim: Forwarded to the encoder and to every decoder layer.
        num_heads: Forwarded to the encoder and to every decoder layer.
        input_vocab_size: Forwarded to ``TransformerEncoder``.
        dropout_rate: Dropout rate used by the encoder, the decoder layers and
            the dropout applied to the embedded targets.
    """

    def __init__(self, crop_size, embed_dim, encoder_layers, decoder_layers, input_seq_length, 
                    target_seq_length, hidden_dim, num_heads, input_vocab_size, dropout_rate=0.1):
        super().__init__()

        # ImageNet-pretrained backbone without its classification head; frozen
        # here and always invoked with training=False in call().
        self.image_encoder = keras.applications.InceptionV3(
            include_top=False, 
            weights='imagenet',
            input_shape=crop_size + (3,),
        )
        self.image_encoder.trainable = False
        # 1x1 convolution projecting the backbone channels to embed_dim.
        self.conv = keras.layers.Conv2D(embed_dim, 1)
        self.ingredient_encoder = TransformerEncoder(encoder_layers, input_seq_length, embed_dim, hidden_dim, 
                                                        num_heads, input_vocab_size, dropout_rate=dropout_rate)
        self.portion_embedding = PortionEmbedding(target_seq_length, embed_dim)
        self.dropout = keras.layers.Dropout(dropout_rate)
        self.decoder_layers = [
            TransformerDecoderLayer(embed_dim, hidden_dim, num_heads, dropout_rate=dropout_rate)
            for _ in range(decoder_layers)
        ]
        # Single-unit head; the ReLU keeps every prediction >= 0.
        self.linear = keras.layers.Dense(1, activation="relu")

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Triple ``(image, ingredients, targets)``.
                NOTE(review): ``targets`` is presumably ``(batch, target_len)``
                and ``ingredients`` ``(batch, input_len)`` token ids - confirm
                against the data pipeline and the ``transformer`` helpers.
            training: Enables dropout and training behaviour in the trainable
                sublayers. The image backbone always runs in inference mode.

        Returns:
            The output of the final ``Dense(1)`` head with its trailing size-1
            axis removed (one value per target position).
        """
        image, ingredients, targets = inputs
        padding_mask = get_padding_mask(ingredients)

        # Backbone stays in inference mode regardless of `training`.
        encoded_img = self.image_encoder(image, training=False)
        encoded_img = self.conv(encoded_img, training=training)
        # Flatten the spatial grid: (batch, h, w, c) -> (batch, h * w, c).
        encoded_img = tf.reshape(encoded_img, (tf.shape(encoded_img)[0], -1, tf.shape(encoded_img)[3]))
        encoded_ingr = self.ingredient_encoder(ingredients, padding_mask, training=training)
        # Image positions come first, ingredient positions after them.
        encoder_outputs = tf.concat([encoded_img, encoded_ingr], axis=1)

        # Extend the ingredient padding mask with an all-ones entry per image
        # position so the mask lines up with `encoder_outputs`.
        # NOTE(review): assumes get_padding_mask returns an int32 tensor of
        # shape (batch, 1, input_len) - confirm in the transformer module.
        img_mask = tf.ones((tf.shape(encoded_img)[0], 1, tf.shape(encoded_img)[1]), dtype=tf.int32)
        padding_mask = tf.concat([img_mask, padding_mask], axis=2)
        look_ahead_mask = get_look_ahead_mask(targets)
        
        x = self.portion_embedding(targets)
        x = self.dropout(x, training=training)
        for decoder_layer in self.decoder_layers:
            x = decoder_layer(x, encoder_outputs, look_ahead_mask, training=training, 
                              padding_mask=padding_mask)
        x = self.linear(x)
        # Remove only the channel axis added by Dense(1). A bare tf.squeeze(x)
        # would also collapse a batch or sequence dimension of size 1 and
        # leaves the static rank unknown in graph mode.
        return tf.squeeze(x, axis=-1)


class PortionEmbedding(keras.layers.Layer):
    """Embed scalar portion values and add a learned position embedding.

    Each scalar is given a trailing feature axis and projected to ``embed_dim``
    by a dense layer; a position embedding looked up from a table with
    ``seq_length`` rows is then added.

    Args:
        seq_length: Number of rows in the position-embedding table.
        embed_dim: Output width of both the dense projection and the
            position embedding.
    """

    def __init__(self, seq_length, embed_dim):
        super().__init__()

        self.linear = keras.layers.Dense(embed_dim)
        self.position_embeddings = keras.layers.Embedding(
            input_dim=seq_length,
            output_dim=embed_dim,
        )

    def call(self, inputs):
        """Return the projected inputs plus their position embeddings.

        NOTE(review): assumes ``inputs`` is a float tensor of shape
        ``(batch, seq)`` - confirm against the caller.
        """
        # One position index per entry along the last axis of `inputs`.
        num_positions = tf.shape(inputs)[-1]
        position_ids = tf.range(num_positions)
        position_term = self.position_embeddings(position_ids)

        # Insert a feature axis at index 2 so Dense projects each scalar.
        portion_term = self.linear(tf.expand_dims(inputs, axis=2))

        # The position term has no batch axis and broadcasts over it.
        return portion_term + position_term