chhetri123 committed
Commit 340d736
Parent(s): 2d85d8d
Upload 27 files
- .gitattributes +2 -0
- app.py +182 -0
- library/Multihead_attention.py +57 -0
- library/__pycache__/Multihead_attention.cpython-310.pyc +0 -0
- library/__pycache__/customSchedule.cpython-310.pyc +0 -0
- library/__pycache__/encoder_decoder.cpython-310.pyc +0 -0
- library/__pycache__/imageLoad.cpython-310.pyc +0 -0
- library/__pycache__/prediction.cpython-310.pyc +0 -0
- library/__pycache__/self_attention.cpython-310.pyc +0 -0
- library/__pycache__/transformer.cpython-310.pyc +0 -0
- library/customSchedule.py +19 -0
- library/encoder_decoder.py +143 -0
- library/imageLoad.py +23 -0
- library/prediction.py +63 -0
- library/self_attention.py +101 -0
- library/transformer.py +60 -0
- model/fingerprint.pb +3 -0
- model/model-20/checkpoint +2 -0
- model/model-20/custom_objects-15.pkl +3 -0
- model/model-20/model_weights-15.data-00000-of-00001 +3 -0
- model/model-20/model_weights-15.index +0 -0
- model/model-20/training_validation_accuracy.png +0 -0
- model/model-20/training_validation_loss.png +0 -0
- model/saved_model.pb +3 -0
- model/variables/variables.data-00000-of-00001 +3 -0
- model/variables/variables.index +0 -0
- requirements.txt +11 -0
- transformer/tokenizer.pickle +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model/model-20/model_weights-15.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+model/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
@@ -0,0 +1,182 @@
from flask import Flask, request, jsonify
import cv2
import numpy as np
from keras.applications import ResNet152
from keras.optimizers import Adam
from keras.models import Sequential, Model, load_model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import add
from keras.utils import to_categorical

from tensorflow.keras.applications.resnet import preprocess_input
from keras.preprocessing import image, sequence
from keras_preprocessing.sequence import pad_sequences
from tqdm import tqdm
import pickle
import tensorflow as tf
# from keras.applications.Resnet50 import preprocess_input
from flask_cors import CORS

# Transformer
from library.prediction import evaluate_single_image
from library.transformer import Transformer
from library.customSchedule import learning_rate

top_k = 25000
num_layer = 4
d_model = 512
dff = 2048
num_heads = 8
row_size = 8
col_size = 8
target_vocab_size = top_k + 1
dropout_rate = 0.1


loaded_transformer = Transformer(num_layer, d_model, num_heads, dff, row_size, col_size,
                                 target_vocab_size, max_pos_encoding=target_vocab_size,
                                 rate=dropout_rate)

# Load the weights into the model
loaded_transformer.load_weights('models/Transformer/model')
loaded_transformer.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))
print("Transformer model loaded successfully")
# loaded_transformer.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss=train_loss.result(), metrics=[train_accuracy])

global tokenizer
with open('pickle_files/transformer/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

print("Tokenizer loaded successfully")


# LSTM Model
# incept_model = ResNet152(weights='imagenet', include_top=False)
# last = incept_model.layers[-2].output
# ResNet152Model = Model(inputs=incept_model.input, outputs=last)
ResNet152Model = ResNet152(include_top=False, weights='imagenet', input_shape=(224, 224, 3), pooling='avg')
with open("pickle_files/lstm/words_dict_nepali_sc.pkl", "rb") as f:
    words_dict = pickle.load(f)

# vocab_size = len(words_dict) + 1
vocab_size = 5521
# MAX_LEN = 192
MAX_LEN = 210
inv_dict = {v: k for k, v in words_dict.items()}


# model = tf.keras.models.load_model('models/LSTM/cultural_nepali_50.h5')

# image feature model
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# language sequence model
inputs2 = Input(shape=(MAX_LEN,))
se1 = Embedding(vocab_size, MAX_LEN, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# decoder model
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# tie it together: [image, seq] -> [word]
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.load_weights("models/LSTM/resnet152_lstm_model_weights_50epoch.h5")
print("LSTM model loaded successfully")


app = Flask(__name__)
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 1
cors = CORS(app, resources={r"/*": {"origins": "*"}})
# @app.route('/')
# def index():
#     return render_template('index.html')


@app.route('/tranformer', methods=['POST'])
def tranformer():
    if 'file' not in request.files:
        return 'No file part'

    file = request.files['file']

    if file.filename == '':
        return 'No selected file'

    # Save the file
    file.save('static/file.jpg')
    caption = evaluate_single_image("static/file.jpg", tokenizer, loaded_transformer)
    print(caption)
    return jsonify({'caption': caption})


@app.route('/lstm', methods=['POST'])
def after():
    if 'file' not in request.files:
        return 'No file part'

    file = request.files['file']

    if file.filename == '':
        return 'No selected file'

    # Save the file
    file.save('static/file.jpg')

    # Read the saved file and extract ResNet152 features
    img = cv2.imread('static/file.jpg')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = np.expand_dims(img, axis=0)  # Add batch dimension
    img = preprocess_input(img)
    # img = img.reshape(1, 224, 224, 3)
    test_img_resized = ResNet152Model.predict(img).reshape(1, 2048)

    # Greedy decoding with the LSTM decoder
    text_inp = ['startofseq']
    count = 0
    caption = ''
    while count < MAX_LEN:
        count += 1
        # Convert words to indices, using the index for '<end>' for unknown words
        encoded = [words_dict.get(word, len(words_dict) - 1) for word in text_inp]
        # Pad the sequence to MAX_LEN
        encoded = pad_sequences([encoded], padding='post', truncating='post', maxlen=MAX_LEN)[0]

        data_list = [test_img_resized.reshape(1, -1), encoded.reshape(1, -1)]
        prediction = np.argmax(model.predict(data_list))
        sampled_word = inv_dict[prediction]
        caption = caption + ' ' + sampled_word

        if sampled_word == 'endofseq':
            break
        text_inp.append(sampled_word)

    caption = caption.replace('endofseq', '')
    print(caption.replace(' .', '.'))

    return jsonify({'caption': caption.replace(' .', '.')})


if __name__ == "__main__":
    app.run(debug=True)
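Both routes accept a multipart file upload and return the caption as JSON. A minimal client sketch for quick manual testing; the host, port, and test image name below are assumptions for illustration, not part of this upload:

import requests

# Hypothetical local test against the two routes defined in app.py.
with open("test.jpg", "rb") as f:
    resp = requests.post("http://127.0.0.1:5000/tranformer", files={"file": f})
print(resp.json()["caption"])

with open("test.jpg", "rb") as f:
    resp = requests.post("http://127.0.0.1:5000/lstm", files={"file": f})
print(resp.json()["caption"])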
library/Multihead_attention.py
ADDED
@@ -0,0 +1,57 @@
import tensorflow as tf
from library.self_attention import scaled_dot_product_attention

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth).
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights


def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])
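The layer follows the standard TensorFlow-tutorial formulation of multi-head attention. A minimal shape-check sketch with random tensors, sized to match the d_model=512, num_heads=8 configuration used in app.py (batch size and sequence length are arbitrary):

import tensorflow as tf
from library.Multihead_attention import MultiHeadAttention

# Hypothetical shape check: batch=2, seq_len=10, d_model=512.
mha = MultiHeadAttention(d_model=512, num_heads=8)
x = tf.random.uniform((2, 10, 512))
out, attn = mha(x, x, x, mask=None)
print(out.shape)   # (2, 10, 512)
print(attn.shape)  # (2, 8, 10, 10)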
library/__pycache__/Multihead_attention.cpython-310.pyc
ADDED
Binary file (1.93 kB)

library/__pycache__/customSchedule.cpython-310.pyc
ADDED
Binary file (994 Bytes)

library/__pycache__/encoder_decoder.cpython-310.pyc
ADDED
Binary file (4.59 kB)

library/__pycache__/imageLoad.cpython-310.pyc
ADDED
Binary file (711 Bytes)

library/__pycache__/prediction.cpython-310.pyc
ADDED
Binary file (1.79 kB)

library/__pycache__/self_attention.cpython-310.pyc
ADDED
Binary file (3.18 kB)

library/__pycache__/transformer.cpython-310.pyc
ADDED
Binary file (2.08 kB)
library/customSchedule.py
ADDED
@@ -0,0 +1,19 @@
import tensorflow as tf

d_model = 512

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)  # Ensure d_model is a float32
        self.warmup_steps = tf.cast(warmup_steps, tf.float32)  # Ensure warmup_steps is a float32

    def __call__(self, step):
        step = tf.cast(step, tf.float32)  # Ensure step is a float32

        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

learning_rate = CustomSchedule(d_model)
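This is the warmup schedule from "Attention Is All You Need": lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5), i.e. a linear warmup for the first 4000 steps followed by inverse-square-root decay. A minimal sketch of how such a schedule is typically plugged into Adam; the beta/epsilon values below follow the original paper and are an assumption, not something configured in this repository (app.py uses Adam's defaults):

import tensorflow as tf
from library.customSchedule import learning_rate

# Sketch only: pass the schedule object directly to the optimizer.
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# The schedule can be inspected by calling it with a step number.
print(float(learning_rate(tf.constant(1000.0))))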
library/encoder_decoder.py
ADDED
@@ -0,0 +1,143 @@
import tensorflow as tf
from library.Multihead_attention import MultiHeadAttention, point_wise_feed_forward_network
from library.self_attention import positional_encoding_1d, positional_encoding_2d

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask=None):
        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

        return out2


class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training,
             look_ahead_mask=None, padding_mask=None):
        # enc_output.shape == (batch_size, input_seq_len, d_model)

        # use the look-ahead mask so that during self-attention the current query
        # does not attend to future tokens
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        # use the padding mask to ignore padded positions in both enc_output and dec_input
        attn2, attn_weights_block2 = self.mha2(
            enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2


class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff,
                 row_size, col_size, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Dense(self.d_model, activation='relu')
        self.pos_encoding = positional_encoding_2d(row_size, col_size, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask=None):
        # shape(x) = (batch_size, seq_len(H*W), features)
        seq_len = tf.shape(x)[1]

        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len(H*W), d_model)
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)


class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding_1d(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training,
             look_ahead_mask=None, padding_mask=None):

        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                   look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights
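The Encoder consumes a flattened grid of image features (its "embedding" is a Dense projection rather than a lookup table, since the inputs are continuous CNN features and the positions form a 2-D grid), while the Decoder embeds target tokens and cross-attends to the encoder output. A minimal shape-check sketch using the same sizes as app.py; batch size and target length below are arbitrary assumptions:

import tensorflow as tf
from library.encoder_decoder import Encoder, Decoder

# Hypothetical shape check: an 8x8 grid of 2048-d image features and a 12-token target sequence.
encoder = Encoder(num_layers=4, d_model=512, num_heads=8, dff=2048, row_size=8, col_size=8)
decoder = Decoder(num_layers=4, d_model=512, num_heads=8, dff=2048,
                  target_vocab_size=25001, maximum_position_encoding=25001)

img_features = tf.random.uniform((2, 64, 2048))                    # (batch, H*W, features)
tokens = tf.random.uniform((2, 12), maxval=25001, dtype=tf.int32)  # (batch, target_seq_len)

enc_out = encoder(img_features, training=False)            # (2, 64, 512)
dec_out, attn = decoder(tokens, enc_out, training=False)   # (2, 12, 512)
print(enc_out.shape, dec_out.shape)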
library/imageLoad.py
ADDED
@@ -0,0 +1,23 @@
import tensorflow as tf

def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img, image_path


# Feature extraction

image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                weights='imagenet')
new_input = image_model.input
hidden_layer = image_model.layers[-1].output

image_features_extract_model = tf.keras.Model(new_input, hidden_layer)


# Tokenizer
library/prediction.py
ADDED
@@ -0,0 +1,63 @@
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
import tensorflow as tf
import numpy as np
import pickle

from library.transformer import Transformer
from library.imageLoad import image_features_extract_model, load_image
from library.self_attention import create_masks_decoder, scaled_dot_product_attention


def evaluate(image, tokenizer, loaded_transformer):
    temp_input = tf.expand_dims(load_image(image)[0], 0)
    img_tensor_val = image_features_extract_model(temp_input)
    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))

    start_token = tokenizer.word_index['<start>']
    end_token = tokenizer.word_index['<end>']

    # The decoder input starts with the start token.
    decoder_input = [start_token]
    output = tf.expand_dims(decoder_input, 0)  # tokens
    result = []  # word list

    for i in range(100):
        dec_mask = create_masks_decoder(output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = loaded_transformer(img_tensor_val, output, False, dec_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        # return the result if the predicted_id is equal to the end token
        if predicted_id == end_token:
            return result, tf.squeeze(output, axis=0), attention_weights
        # concatenate the predicted_id to the output, which is fed back
        # to the decoder as its input.
        result.append(tokenizer.index_word[int(predicted_id)])
        output = tf.concat([output, predicted_id], axis=-1)

    return result, tf.squeeze(output, axis=0), attention_weights


# Assuming the evaluate function above is used to generate captions
def evaluate_single_image(image_path, tokenizer, loaded_transformer):
    start_token = tokenizer.word_index['<start>']
    end_token = tokenizer.word_index['<end>']

    # Evaluate the caption for the given image
    caption, _, _ = evaluate(image_path, tokenizer, loaded_transformer)

    # Remove "<unk>" from the result
    caption = [word for word in caption if word != "<unk>"]

    # Remove the trailing <end> token from the result
    result_join = ' '.join(caption)
    result_final = result_join.rsplit(' ', 1)[0]

    return result_final
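evaluate runs greedy decoding: starting from the '<start>' token, it feeds the growing token sequence back into the transformer and stops at '<end>' or after 100 steps; evaluate_single_image then filters '<unk>' tokens and drops the last word. A hypothetical one-off call, assuming tokenizer and loaded_transformer are already set up exactly as in app.py and 'sample.jpg' is a local test image:

from library.prediction import evaluate_single_image

# 'tokenizer' and 'loaded_transformer' must already exist (see app.py);
# 'sample.jpg' is a hypothetical test image.
caption = evaluate_single_image("sample.jpg", tokenizer, loaded_transformer)
print(caption)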
library/self_attention.py
ADDED
@@ -0,0 +1,101 @@
import tensorflow as tf
import numpy as np


def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates


def positional_encoding_1d(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)


def positional_encoding_2d(row, col, d_model):
    assert d_model % 2 == 0
    # the first d_model/2 dimensions encode the row position, the second d_model/2 encode the column position
    row_pos = np.repeat(np.arange(row), col)[:, np.newaxis]
    col_pos = np.repeat(np.expand_dims(np.arange(col), 0), row, axis=0).reshape(-1, 1)
    angle_rads_row = get_angles(row_pos, np.arange(d_model // 2)[np.newaxis, :], d_model // 2)
    angle_rads_col = get_angles(col_pos, np.arange(d_model // 2)[np.newaxis, :], d_model // 2)
    # apply sin and cos to even and odd indices respectively
    angle_rads_row[:, 0::2] = np.sin(angle_rads_row[:, 0::2])
    angle_rads_row[:, 1::2] = np.cos(angle_rads_row[:, 1::2])
    angle_rads_col[:, 0::2] = np.sin(angle_rads_col[:, 0::2])
    angle_rads_col[:, 1::2] = np.cos(angle_rads_col[:, 1::2])
    pos_encoding = np.concatenate([angle_rads_row, angle_rads_col], axis=1)[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)


def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    # add extra dimensions to add the padding
    # to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)


def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)


def create_masks_decoder(tar):
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return combined_mask


def scaled_dot_product_attention(q, k, v, mask):
    """Calculate the attention weights.

    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type (padding or look ahead)
    but it must be broadcastable for addition.

    Args:
      q: query shape == (..., seq_len_q, depth)
      k: key shape == (..., seq_len_k, depth)
      v: value shape == (..., seq_len_v, depth_v)
      mask: Float tensor with shape broadcastable
            to (..., seq_len_q, seq_len_k). Defaults to None.

    Returns:
      output, attention_weights
    """

    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  # add a large negative value where mask is 1 so those positions are ignored by the softmax

    # softmax is normalized on the last axis (seq_len_k) so that the scores add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights
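A small toy example of the attention primitive and the look-ahead mask; the tensor values below are illustrative assumptions only:

import tensorflow as tf
from library.self_attention import scaled_dot_product_attention, create_look_ahead_mask

# Hypothetical toy example: one query attending over three keys/values.
q = tf.constant([[[0.0, 10.0, 0.0]]])                        # (1, 1, 3)
k = tf.constant([[[10.0, 0.0, 0.0],
                  [0.0, 10.0, 0.0],
                  [0.0, 0.0, 10.0]]])                        # (1, 3, 3)
v = tf.constant([[[1.0], [2.0], [3.0]]])                     # (1, 3, 1)

out, weights = scaled_dot_product_attention(q, k, v, mask=None)
print(weights.numpy())  # nearly all weight on the second key
print(out.numpy())      # close to [[2.0]]

# The look-ahead mask is 1 above the diagonal, so position i cannot attend to j > i.
print(create_look_ahead_mask(4).numpy())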
library/transformer.py
ADDED
@@ -0,0 +1,60 @@
import tensorflow as tf
from library.self_attention import create_padding_mask, create_masks_decoder, scaled_dot_product_attention

from library.Multihead_attention import MultiHeadAttention, point_wise_feed_forward_network
from library.customSchedule import learning_rate
from library.encoder_decoder import Encoder, Decoder, EncoderLayer, DecoderLayer
import pickle


def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img, image_path


# Feature extraction

image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                weights='imagenet')
new_input = image_model.input
hidden_layer = image_model.layers[-1].output

image_features_extract_model = tf.keras.Model(new_input, hidden_layer)


class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, row_size, col_size,
                 target_vocab_size, max_pos_encoding, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, row_size, col_size, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               target_vocab_size, max_pos_encoding, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, look_ahead_mask=None, dec_padding_mask=None, enc_padding_mask=None):

        enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(
            tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights


# # Load the custom objects
# with open('models/Transformer/custom_objects-80.pkl', 'rb') as f:
#     custom_objects = pickle.load(f)

# Assuming you have the same model architecture defined in the 'Transformer' class,
# create an instance of the Transformer model (without loading weights).
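A sketch of how these pieces are wired together at inference time, mirroring app.py and library/prediction.py; the weight path is the one app.py references, and 'example.jpg' plus the start-token id of 1 are assumptions for illustration:

import tensorflow as tf
from library.transformer import Transformer, image_features_extract_model, load_image
from library.self_attention import create_masks_decoder

# Sketch only: same hyperparameters as app.py.
transformer = Transformer(num_layers=4, d_model=512, num_heads=8, dff=2048,
                          row_size=8, col_size=8, target_vocab_size=25001,
                          max_pos_encoding=25001, rate=0.1)
transformer.load_weights('models/Transformer/model')  # path taken from app.py

img = tf.expand_dims(load_image('example.jpg')[0], 0)         # 'example.jpg' is hypothetical
features = image_features_extract_model(img)                   # (1, 8, 8, 2048)
features = tf.reshape(features, (1, -1, features.shape[3]))    # (1, 64, 2048)

tokens = tf.constant([[1]])                                     # assumed <start> token id
mask = create_masks_decoder(tokens)
logits, attn = transformer(features, tokens, False, mask)
print(logits.shape)                                             # (1, 1, 25001)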
model/fingerprint.pb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f8a25f566874573317d02eb3d331f81f46a4d188159c889c2624db8a36a343ee
size 58

model/model-20/checkpoint
ADDED
@@ -0,0 +1,2 @@
model_checkpoint_path: "model_weights-15"
all_model_checkpoint_paths: "model_weights-15"

model/model-20/custom_objects-15.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:107c764c4d70f7416ba7b4f6af4b8acd80b494d0bc0b2bf12416a8f513221992
size 47

model/model-20/model_weights-15.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bb6aa6a6153cb29d04510d07dba14c5fea247cc799e25b4f59bc581bac609c01
size 224434664

model/model-20/model_weights-15.index
ADDED
Binary file (11.4 kB)

model/model-20/training_validation_accuracy.png
ADDED

model/model-20/training_validation_loss.png
ADDED

model/saved_model.pb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6361b491875593db4e3fd758462d6ebbe9f1ffd17facc1b5568b028932d17df3
size 57028385

model/variables/variables.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:227114d56ffd12d39a4f84e2e5aa9cd0485dc255925c32ff6d081a4011af0dbd
size 224513816

model/variables/variables.index
ADDED
Binary file (9.91 kB)
requirements.txt
ADDED
@@ -0,0 +1,11 @@
Flask==2.0.2
opencv-python
keras==2.15.0
numpy>=1.23.5
keras-applications==1.0.8
tqdm
flask-cors
Werkzeug==2.0.2
tensorflow
keras_preprocessing==1.1.2
nltk==3.6.5
transformer/tokenizer.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:56ce18abc08dcf20877edbb48701616635801a1c69b8673db5605101f04e623a
size 1368089