chhetri123 committed
Commit
340d736
1 Parent(s): 2d85d8d

Upload 27 files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model/model-20/model_weights-15.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+ model/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,182 @@
+ from flask import Flask, request, jsonify
+ import cv2
+ import numpy as np
+ from keras.applications import ResNet152
+ from keras.optimizers import Adam
+ from keras.models import Sequential, Model, load_model
+ from keras.layers import Input
+ from keras.layers import Dense
+ from keras.layers import LSTM
+ from keras.layers import Embedding
+ from keras.layers import Dropout
+ from keras.layers import add
+ from keras.utils import to_categorical
+
+ from tensorflow.keras.applications.resnet import preprocess_input
+ from keras.preprocessing import image, sequence
+ from keras_preprocessing.sequence import pad_sequences
+ from tqdm import tqdm
+ import pickle
+ import tensorflow as tf
+ # from keras.applications.Resnet50 import preprocess_input
+ from flask_cors import CORS
+
+ # Transformer
+ from library.prediction import evaluate_single_image
+ from library.transformer import Transformer
+ from library.customSchedule import learning_rate
+
+ top_k = 25000
+ num_layer = 4
+ d_model = 512
+ dff = 2048
+ num_heads = 8
+ row_size = 8
+ col_size = 8
+ target_vocab_size = top_k + 1
+ dropout_rate = 0.1
+
+ loaded_transformer = Transformer(num_layer, d_model, num_heads, dff, row_size, col_size,
+                                  target_vocab_size, max_pos_encoding=target_vocab_size,
+                                  rate=dropout_rate)
+
+ # Load the weights into the model
+ loaded_transformer.load_weights('models/Transformer/model')
+ loaded_transformer.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))
+ print("Transformer model loaded successfully")
+ # loaded_transformer.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss=train_loss.result(), metrics=[train_accuracy])
+
+ with open('pickle_files/transformer/tokenizer.pickle', 'rb') as handle:
+     tokenizer = pickle.load(handle)
+ tokenizer.word_index['<pad>'] = 0
+ tokenizer.index_word[0] = '<pad>'
+
+ print("Tokenizer loaded successfully")
+
+ # LSTM Model
+ # incept_model = ResNet152(weights='imagenet', include_top=False)
+ # last = incept_model.layers[-2].output
+ # ResNet152Model = Model(inputs=incept_model.input, outputs=last)
+ ResNet152Model = ResNet152(include_top=False, weights='imagenet', input_shape=(224, 224, 3), pooling='avg')
+ with open("pickle_files/lstm/words_dict_nepali_sc.pkl", "rb") as f:
+     words_dict = pickle.load(f)
+
+ # vocab_size = len(words_dict) + 1
+ vocab_size = 5521
+ # MAX_LEN = 192
+ MAX_LEN = 210
+ inv_dict = {v: k for k, v in words_dict.items()}
+
+ # model = tf.keras.models.load_model('models/LSTM/cultural_nepali_50.h5')
+
+ # image feature model
+ inputs1 = Input(shape=(2048,))
+ fe1 = Dropout(0.5)(inputs1)
+ fe2 = Dense(256, activation='relu')(fe1)
+
+ # language sequence model
+ inputs2 = Input(shape=(MAX_LEN,))
+ se1 = Embedding(vocab_size, MAX_LEN, mask_zero=True)(inputs2)
+ se2 = Dropout(0.4)(se1)
+ se3 = LSTM(256)(se2)
+
+ # decoder model
+ decoder1 = add([fe2, se3])
+ decoder2 = Dense(256, activation='relu')(decoder1)
+ outputs = Dense(vocab_size, activation='softmax')(decoder2)
+
+ # tie it together: [image, seq] -> [word]
+ model = Model(inputs=[inputs1, inputs2], outputs=outputs)
+ # compile model
+ model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+ model.load_weights("models/LSTM/resnet152_lstm_model_weights_50epoch.h5")
+
+ print("LSTM model loaded successfully")
+
+ app = Flask(__name__)
+ app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 1
+ cors = CORS(app, resources={r"/*": {"origins": "*"}})
+
+ # @app.route('/')
+ # def index():
+ #     return render_template('index.html')
+
+
+ @app.route('/transformer', methods=['POST'])
+ def transformer():
+     if 'file' not in request.files:
+         return 'No file part'
+
+     file = request.files['file']
+
+     if file.filename == '':
+         return 'No selected file'
+
+     # Save the file
+     file.save('static/file.jpg')
+     caption = evaluate_single_image("static/file.jpg", tokenizer, loaded_transformer)
+     print(caption)
+     return jsonify({'caption': caption})
+
+
+ @app.route('/lstm', methods=['POST'])
+ def lstm():
+     if 'file' not in request.files:
+         return 'No file part'
+
+     file = request.files['file']
+
+     if file.filename == '':
+         return 'No selected file'
+
+     # Save the file
+     file.save('static/file.jpg')
+
+     # Read the saved file
+     img = cv2.imread('static/file.jpg')
+     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+     img = cv2.resize(img, (224, 224))
+     img = np.expand_dims(img, axis=0)  # Add batch dimension
+     img = preprocess_input(img)
+     # img = img.reshape(1, 224, 224, 3)
+     test_img_resized = ResNet152Model.predict(img).reshape(1, 2048)
+     # test_img_resized = test_img_resized.reshape(test_img_resized.shape[0], -1)
+
+     text_inp = ['startofseq']
+     count = 0
+     caption = ''
+     while count < MAX_LEN:
+         count += 1
+         # Convert words to indices, using the index for '<end>' for unknown words
+         encoded = [words_dict.get(word, len(words_dict) - 1) for word in text_inp]
+         encoded = pad_sequences([encoded], padding='post', truncating='post', maxlen=MAX_LEN)[0]  # Pad sequences
+
+         data_list = [test_img_resized.reshape(1, -1), encoded.reshape(1, -1)]
+         prediction = np.argmax(model.predict(data_list))
+         sampled_word = inv_dict[prediction]
+         caption = caption + ' ' + sampled_word
+
+         if sampled_word == 'endofseq':
+             break
+         text_inp.append(sampled_word)
+
+     caption = caption.replace('endofseq', '')
+     print(caption.replace(' .', '.'))
+
+     return jsonify({'caption': caption.replace(' .', '.')})
+
+
+ if __name__ == "__main__":
+     app.run(debug=True)
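
A minimal client-side sketch of how the two routes above can be exercised. It assumes the app is running locally on Flask's default port 5000, that the `requests` package is available (it is not listed in requirements.txt), and that `example.jpg` is a placeholder image path:

    import requests

    BASE_URL = "http://127.0.0.1:5000"

    # Caption an image with the transformer endpoint
    with open("example.jpg", "rb") as f:
        resp = requests.post(f"{BASE_URL}/transformer", files={"file": f})
    print(resp.json()["caption"])

    # The LSTM endpoint takes the same multipart field named 'file'
    with open("example.jpg", "rb") as f:
        resp = requests.post(f"{BASE_URL}/lstm", files={"file": f})
    print(resp.json()["caption"])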
library/Multihead_attention.py ADDED
@@ -0,0 +1,57 @@
+ import tensorflow as tf
+ from library.self_attention import scaled_dot_product_attention
+
+
+ class MultiHeadAttention(tf.keras.layers.Layer):
+     def __init__(self, d_model, num_heads):
+         super(MultiHeadAttention, self).__init__()
+         self.num_heads = num_heads
+         self.d_model = d_model
+
+         assert d_model % self.num_heads == 0
+
+         self.depth = d_model // self.num_heads
+
+         self.wq = tf.keras.layers.Dense(d_model)
+         self.wk = tf.keras.layers.Dense(d_model)
+         self.wv = tf.keras.layers.Dense(d_model)
+
+         self.dense = tf.keras.layers.Dense(d_model)
+
+     def split_heads(self, x, batch_size):
+         """Split the last dimension into (num_heads, depth).
+         Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth).
+         """
+         x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
+         return tf.transpose(x, perm=[0, 2, 1, 3])
+
+     def call(self, v, k, q, mask=None):
+         batch_size = tf.shape(q)[0]
+
+         q = self.wq(q)  # (batch_size, seq_len, d_model)
+         k = self.wk(k)  # (batch_size, seq_len, d_model)
+         v = self.wv(v)  # (batch_size, seq_len, d_model)
+
+         q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
+         k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
+         v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
+
+         # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
+         # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
+         scaled_attention, attention_weights = scaled_dot_product_attention(
+             q, k, v, mask)
+
+         scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
+
+         concat_attention = tf.reshape(scaled_attention,
+                                       (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)
+
+         output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
+
+         return output, attention_weights
+
+
+ def point_wise_feed_forward_network(d_model, dff):
+     return tf.keras.Sequential([
+         tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
+         tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
+     ])
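
A quick shape check for the layer above, as a sketch with arbitrary illustrative sizes (the tensors and dimensions below are made up for the example, not taken from the training setup):

    import tensorflow as tf
    from library.Multihead_attention import MultiHeadAttention

    mha = MultiHeadAttention(d_model=512, num_heads=8)
    x = tf.random.uniform((1, 64, 512))   # (batch_size, seq_len, d_model)
    out, attn = mha(x, x, x, mask=None)   # self-attention: v, k and q are the same tensor
    print(out.shape)                      # (1, 64, 512)
    print(attn.shape)                     # (1, 8, 64, 64) = (batch, heads, seq_len_q, seq_len_k)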
library/__pycache__/Multihead_attention.cpython-310.pyc ADDED
Binary file (1.93 kB).

library/__pycache__/customSchedule.cpython-310.pyc ADDED
Binary file (994 Bytes).

library/__pycache__/encoder_decoder.cpython-310.pyc ADDED
Binary file (4.59 kB).

library/__pycache__/imageLoad.cpython-310.pyc ADDED
Binary file (711 Bytes).

library/__pycache__/prediction.cpython-310.pyc ADDED
Binary file (1.79 kB).

library/__pycache__/self_attention.cpython-310.pyc ADDED
Binary file (3.18 kB).

library/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (2.08 kB).
 
library/customSchedule.py ADDED
@@ -0,0 +1,19 @@
+ import tensorflow as tf
+
+ d_model = 512
+
+
+ class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
+     def __init__(self, d_model, warmup_steps=4000):
+         super(CustomSchedule, self).__init__()
+
+         self.d_model = tf.cast(d_model, tf.float32)  # Ensure d_model is a float32
+         self.warmup_steps = tf.cast(warmup_steps, tf.float32)  # Ensure warmup_steps is a float32
+
+     def __call__(self, step):
+         step = tf.cast(step, tf.float32)  # Ensure step is a float32
+
+         arg1 = tf.math.rsqrt(step)
+         arg2 = step * (self.warmup_steps ** -1.5)
+
+         return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
+
+
+ learning_rate = CustomSchedule(d_model)
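
The exported `learning_rate` object is consumed the same way app.py consumes it; a small sketch (the step value below is only illustrative):

    import tensorflow as tf
    from library.customSchedule import learning_rate

    # app.py passes the schedule straight to Adam
    optimizer = tf.keras.optimizers.Adam(learning_rate)

    # The schedule grows during warmup and then decays with 1/sqrt(step),
    # peaking at the warmup boundary (4000 steps by default)
    print(float(learning_rate(tf.constant(4000.0))))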
library/encoder_decoder.py ADDED
@@ -0,0 +1,143 @@
+ import tensorflow as tf
+ from library.Multihead_attention import MultiHeadAttention, point_wise_feed_forward_network
+ from library.self_attention import positional_encoding_1d, positional_encoding_2d
+
+
+ class EncoderLayer(tf.keras.layers.Layer):
+     def __init__(self, d_model, num_heads, dff, rate=0.1):
+         super(EncoderLayer, self).__init__()
+
+         self.mha = MultiHeadAttention(d_model, num_heads)
+         self.ffn = point_wise_feed_forward_network(d_model, dff)
+
+         self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+         self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+
+         self.dropout1 = tf.keras.layers.Dropout(rate)
+         self.dropout2 = tf.keras.layers.Dropout(rate)
+
+     def call(self, x, training, mask=None):
+         attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
+         attn_output = self.dropout1(attn_output, training=training)
+         out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
+
+         ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
+         ffn_output = self.dropout2(ffn_output, training=training)
+         out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
+
+         return out2
+
+
+ class DecoderLayer(tf.keras.layers.Layer):
+     def __init__(self, d_model, num_heads, dff, rate=0.1):
+         super(DecoderLayer, self).__init__()
+
+         self.mha1 = MultiHeadAttention(d_model, num_heads)
+         self.mha2 = MultiHeadAttention(d_model, num_heads)
+
+         self.ffn = point_wise_feed_forward_network(d_model, dff)
+
+         self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+         self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+         self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+
+         self.dropout1 = tf.keras.layers.Dropout(rate)
+         self.dropout2 = tf.keras.layers.Dropout(rate)
+         self.dropout3 = tf.keras.layers.Dropout(rate)
+
+     def call(self, x, enc_output, training,
+              look_ahead_mask=None, padding_mask=None):
+         # enc_output.shape == (batch_size, input_seq_len, d_model)
+
+         # use the look-ahead mask so that, during self-attention, the current query does not attend to future tokens
+         attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
+         attn1 = self.dropout1(attn1, training=training)
+         out1 = self.layernorm1(attn1 + x)
+
+         # use the padding mask to ignore padded positions in both enc_output and the decoder input
+         attn2, attn_weights_block2 = self.mha2(
+             enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
+         attn2 = self.dropout2(attn2, training=training)
+         out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
+
+         ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
+         ffn_output = self.dropout3(ffn_output, training=training)
+         out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
+
+         return out3, attn_weights_block1, attn_weights_block2
+
+
+ class Encoder(tf.keras.layers.Layer):
+     def __init__(self, num_layers, d_model, num_heads, dff,
+                  row_size, col_size, rate=0.1):
+         super(Encoder, self).__init__()
+
+         self.d_model = d_model
+         self.num_layers = num_layers
+
+         self.embedding = tf.keras.layers.Dense(self.d_model, activation='relu')
+         self.pos_encoding = positional_encoding_2d(row_size, col_size,
+                                                    self.d_model)
+
+         self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
+                            for _ in range(num_layers)]
+
+         self.dropout = tf.keras.layers.Dropout(rate)
+
+     def call(self, x, training, mask=None):
+         # shape(x) = (batch_size, seq_len(H*W), features)
+         seq_len = tf.shape(x)[1]
+
+         # adding embedding and position encoding.
+         x = self.embedding(x)  # (batch_size, input_seq_len(H*W), d_model)
+         x += self.pos_encoding[:, :seq_len, :]
+
+         x = self.dropout(x, training=training)
+
+         for i in range(self.num_layers):
+             x = self.enc_layers[i](x, training, mask)
+
+         return x  # (batch_size, input_seq_len, d_model)
+
+
+ class Decoder(tf.keras.layers.Layer):
+     def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
+                  maximum_position_encoding, rate=0.1):
+         super(Decoder, self).__init__()
+
+         self.d_model = d_model
+         self.num_layers = num_layers
+
+         self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
+         self.pos_encoding = positional_encoding_1d(maximum_position_encoding, d_model)
+
+         self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
+                            for _ in range(num_layers)]
+         self.dropout = tf.keras.layers.Dropout(rate)
+
+     def call(self, x, enc_output, training,
+              look_ahead_mask=None, padding_mask=None):
+         seq_len = tf.shape(x)[1]
+         attention_weights = {}
+
+         x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
+         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
+         x += self.pos_encoding[:, :seq_len, :]
+
+         x = self.dropout(x, training=training)
+
+         for i in range(self.num_layers):
+             x, block1, block2 = self.dec_layers[i](x, enc_output, training,
+                                                    look_ahead_mask, padding_mask)
+
+             attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
+             attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2
+
+         # x.shape == (batch_size, target_seq_len, d_model)
+         return x, attention_weights
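
A shape-oriented sketch of how the encoder and decoder above fit together (the sizes are illustrative; the deployed model uses the hyperparameters set in app.py):

    import tensorflow as tf
    from library.encoder_decoder import Encoder, Decoder

    enc = Encoder(num_layers=2, d_model=128, num_heads=4, dff=256, row_size=8, col_size=8)
    dec = Decoder(num_layers=2, d_model=128, num_heads=4, dff=256,
                  target_vocab_size=1000, maximum_position_encoding=1000)

    img_features = tf.random.uniform((1, 64, 2048))                   # (batch, H*W, feature_dim)
    tokens = tf.random.uniform((1, 12), maxval=1000, dtype=tf.int32)  # partial caption ids

    enc_out = enc(img_features, training=False)           # (1, 64, 128)
    dec_out, attn = dec(tokens, enc_out, training=False)  # (1, 12, 128)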
library/imageLoad.py ADDED
@@ -0,0 +1,23 @@
+ import tensorflow as tf
+
+
+ def load_image(image_path):
+     img = tf.io.read_file(image_path)
+     img = tf.image.decode_jpeg(img, channels=3)
+     img = tf.image.resize(img, (299, 299))
+     img = tf.keras.applications.inception_v3.preprocess_input(img)
+     return img, image_path
+
+
+ # Feature extraction
+ image_model = tf.keras.applications.InceptionV3(include_top=False,
+                                                 weights='imagenet')
+ new_input = image_model.input
+ hidden_layer = image_model.layers[-1].output
+
+ image_features_extract_model = tf.keras.Model(new_input, hidden_layer)
+
+
+ # Tokenizer
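
The feature extractor above mirrors how prediction.py uses it; a sketch (`example.jpg` is a placeholder path):

    import tensorflow as tf
    from library.imageLoad import load_image, image_features_extract_model

    img, path = load_image("example.jpg")
    batch = tf.expand_dims(img, 0)                   # (1, 299, 299, 3)
    features = image_features_extract_model(batch)   # (1, 8, 8, 2048) for InceptionV3
    features = tf.reshape(features, (features.shape[0], -1, features.shape[3]))  # (1, 64, 2048)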
library/prediction.py ADDED
@@ -0,0 +1,63 @@
 
+ from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
+ import tensorflow as tf
+ import numpy as np
+ import pickle
+
+ from library.transformer import Transformer
+ from library.imageLoad import image_features_extract_model, load_image
+ from library.self_attention import create_masks_decoder, scaled_dot_product_attention
+
+
+ def evaluate(image, tokenizer, loaded_transformer):
+     temp_input = tf.expand_dims(load_image(image)[0], 0)
+     img_tensor_val = image_features_extract_model(temp_input)
+     img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))
+
+     start_token = tokenizer.word_index['<start>']
+     end_token = tokenizer.word_index['<end>']
+
+     # the decoder input starts with the start token.
+     decoder_input = [start_token]
+     output = tf.expand_dims(decoder_input, 0)  # tokens
+     result = []  # word list
+
+     for i in range(100):
+         dec_mask = create_masks_decoder(output)
+
+         # predictions.shape == (batch_size, seq_len, vocab_size)
+         predictions, attention_weights = loaded_transformer(img_tensor_val, output, False, dec_mask)
+
+         # select the last word from the seq_len dimension
+         predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
+
+         predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
+         # return the result if the predicted_id is equal to the end token
+         if predicted_id == end_token:
+             return result, tf.squeeze(output, axis=0), attention_weights
+         # concatenate the predicted_id to the output, which is fed back to the decoder
+         # as its input.
+         result.append(tokenizer.index_word[int(predicted_id)])
+         output = tf.concat([output, predicted_id], axis=-1)
+
+     return result, tf.squeeze(output, axis=0), attention_weights
+
+
+ # Assuming the evaluate function above is used to generate captions
+ def evaluate_single_image(image_path, tokenizer, loaded_transformer):
+     start_token = tokenizer.word_index['<start>']
+     end_token = tokenizer.word_index['<end>']
+
+     # Evaluate the caption for the given image
+     caption, _, _ = evaluate(image_path, tokenizer, loaded_transformer)
+
+     # Remove "<unk>" from the result
+     caption = [word for word in caption if word != "<unk>"]
+
+     # Remove <end> from the result
+     result_join = ' '.join(caption)
+     result_final = result_join.rsplit(' ', 1)[0]
+
+     return result_final
library/self_attention.py ADDED
@@ -0,0 +1,101 @@
+ import tensorflow as tf
+ import numpy as np
+
+
+ def get_angles(pos, i, d_model):
+     angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
+     return pos * angle_rates
+
+
+ def positional_encoding_1d(position, d_model):
+     angle_rads = get_angles(np.arange(position)[:, np.newaxis],
+                             np.arange(d_model)[np.newaxis, :],
+                             d_model)
+
+     # apply sin to even indices in the array; 2i
+     angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
+
+     # apply cos to odd indices in the array; 2i+1
+     angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
+
+     pos_encoding = angle_rads[np.newaxis, ...]
+
+     return tf.cast(pos_encoding, dtype=tf.float32)
+
+
+ def positional_encoding_2d(row, col, d_model):
+     assert d_model % 2 == 0
+     # the first d_model/2 dimensions encode the row position and the second d_model/2 encode the column position
+     row_pos = np.repeat(np.arange(row), col)[:, np.newaxis]
+     col_pos = np.repeat(np.expand_dims(np.arange(col), 0), row, axis=0).reshape(-1, 1)
+     angle_rads_row = get_angles(row_pos, np.arange(d_model // 2)[np.newaxis, :], d_model // 2)
+     angle_rads_col = get_angles(col_pos, np.arange(d_model // 2)[np.newaxis, :], d_model // 2)
+     # apply sin and cos to even and odd indices respectively
+     angle_rads_row[:, 0::2] = np.sin(angle_rads_row[:, 0::2])
+     angle_rads_row[:, 1::2] = np.cos(angle_rads_row[:, 1::2])
+     angle_rads_col[:, 0::2] = np.sin(angle_rads_col[:, 0::2])
+     angle_rads_col[:, 1::2] = np.cos(angle_rads_col[:, 1::2])
+     pos_encoding = np.concatenate([angle_rads_row, angle_rads_col], axis=1)[np.newaxis, ...]
+
+     return tf.cast(pos_encoding, dtype=tf.float32)
+
+
+ def create_padding_mask(seq):
+     seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
+
+     # add extra dimensions to add the padding
+     # to the attention logits.
+     return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
+
+
+ def create_look_ahead_mask(size):
+     mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
+     return mask  # (seq_len, seq_len)
+
+
+ def create_masks_decoder(tar):
+     look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
+     dec_target_padding_mask = create_padding_mask(tar)
+     combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
+     return combined_mask
+
+
+ def scaled_dot_product_attention(q, k, v, mask):
+     """Calculate the attention weights.
+     q, k, v must have matching leading dimensions.
+     k, v must have matching penultimate dimensions, i.e. seq_len_k = seq_len_v.
+     The mask has different shapes depending on its type (padding or look-ahead)
+     but it must be broadcastable for addition.
+
+     Args:
+         q: query shape == (..., seq_len_q, depth)
+         k: key shape == (..., seq_len_k, depth)
+         v: value shape == (..., seq_len_v, depth_v)
+         mask: Float tensor with shape broadcastable
+               to (..., seq_len_q, seq_len_k). Defaults to None.
+
+     Returns:
+         output, attention_weights
+     """
+
+     matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
+
+     # scale matmul_qk
+     dk = tf.cast(tf.shape(k)[-1], tf.float32)
+     scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
+
+     # add the mask to the scaled tensor.
+     if mask is not None:
+         scaled_attention_logits += (mask * -1e9)  # add a large negative value where mask is 1 so those positions are ignored by the softmax
+
+     # softmax is normalized on the last axis (seq_len_k) so that the scores
+     # add up to 1.
+     attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)
+
+     output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
+
+     return output, attention_weights
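
A small sketch of what create_masks_decoder produces for a padded target sequence (the token ids below are arbitrary):

    import tensorflow as tf
    from library.self_attention import create_masks_decoder

    tar = tf.constant([[5, 8, 3, 0, 0]])   # toy caption ids with trailing padding (id 0)
    mask = create_masks_decoder(tar)
    print(mask.shape)          # (1, 1, 5, 5)
    print(mask.numpy()[0, 0])  # 1s above the diagonal (future tokens) and in the padded columns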
library/transformer.py ADDED
@@ -0,0 +1,60 @@
+ import tensorflow as tf
+ from library.self_attention import create_padding_mask, create_masks_decoder, scaled_dot_product_attention
+ from library.Multihead_attention import MultiHeadAttention, point_wise_feed_forward_network
+ from library.customSchedule import learning_rate
+ from library.encoder_decoder import Encoder, Decoder, EncoderLayer, DecoderLayer
+ import pickle
+
+
+ def load_image(image_path):
+     img = tf.io.read_file(image_path)
+     img = tf.image.decode_jpeg(img, channels=3)
+     img = tf.image.resize(img, (299, 299))
+     img = tf.keras.applications.inception_v3.preprocess_input(img)
+     return img, image_path
+
+
+ # Feature extraction
+ image_model = tf.keras.applications.InceptionV3(include_top=False,
+                                                 weights='imagenet')
+ new_input = image_model.input
+ hidden_layer = image_model.layers[-1].output
+
+ image_features_extract_model = tf.keras.Model(new_input, hidden_layer)
+
+
+ class Transformer(tf.keras.Model):
+     def __init__(self, num_layers, d_model, num_heads, dff, row_size, col_size,
+                  target_vocab_size, max_pos_encoding, rate=0.1):
+         super(Transformer, self).__init__()
+
+         self.encoder = Encoder(num_layers, d_model, num_heads, dff, row_size, col_size, rate)
+
+         self.decoder = Decoder(num_layers, d_model, num_heads, dff,
+                                target_vocab_size, max_pos_encoding, rate)
+
+         self.final_layer = tf.keras.layers.Dense(target_vocab_size)
+
+     def call(self, inp, tar, training, look_ahead_mask=None, dec_padding_mask=None, enc_padding_mask=None):
+         enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
+
+         # dec_output.shape == (batch_size, tar_seq_len, d_model)
+         dec_output, attention_weights = self.decoder(
+             tar, enc_output, training, look_ahead_mask, dec_padding_mask)
+
+         final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
+
+         return final_output, attention_weights
+
+
+ # # Load the custom objects
+ # with open('models/Transformer/custom_objects-80.pkl', 'rb') as f:
+ #     custom_objects = pickle.load(f)
+
+ # Assuming the same model architecture is defined in the 'Transformer' class,
+ # create an instance of the Transformer model (without loading weights)
model/fingerprint.pb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f8a25f566874573317d02eb3d331f81f46a4d188159c889c2624db8a36a343ee
+ size 58
model/model-20/checkpoint ADDED
@@ -0,0 +1,2 @@
+ model_checkpoint_path: "model_weights-15"
+ all_model_checkpoint_paths: "model_weights-15"
model/model-20/custom_objects-15.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:107c764c4d70f7416ba7b4f6af4b8acd80b494d0bc0b2bf12416a8f513221992
+ size 47
model/model-20/model_weights-15.data-00000-of-00001 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb6aa6a6153cb29d04510d07dba14c5fea247cc799e25b4f59bc581bac609c01
+ size 224434664
model/model-20/model_weights-15.index ADDED
Binary file (11.4 kB).
 
model/model-20/training_validation_accuracy.png ADDED
model/model-20/training_validation_loss.png ADDED
model/saved_model.pb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6361b491875593db4e3fd758462d6ebbe9f1ffd17facc1b5568b028932d17df3
+ size 57028385
model/variables/variables.data-00000-of-00001 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:227114d56ffd12d39a4f84e2e5aa9cd0485dc255925c32ff6d081a4011af0dbd
+ size 224513816
model/variables/variables.index ADDED
Binary file (9.91 kB).
 
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ Flask==2.0.2
+ opencv-python
+ keras==2.15.0
+ numpy>=1.23.5
+ keras-applications==1.0.8
+ tqdm
+ flask-cors
+ Werkzeug==2.0.2
+ tensorflow
+ keras_preprocessing==1.1.2
+ nltk==3.6.5
transformer/tokenizer.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:56ce18abc08dcf20877edbb48701616635801a1c69b8673db5605101f04e623a
+ size 1368089