chhetri123 committed
Commit 340d736
Parent(s): 2d85d8d
Upload 27 files
- .gitattributes +2 -0
- app.py +182 -0
- library/Multihead_attention.py +57 -0
- library/__pycache__/Multihead_attention.cpython-310.pyc +0 -0
- library/__pycache__/customSchedule.cpython-310.pyc +0 -0
- library/__pycache__/encoder_decoder.cpython-310.pyc +0 -0
- library/__pycache__/imageLoad.cpython-310.pyc +0 -0
- library/__pycache__/prediction.cpython-310.pyc +0 -0
- library/__pycache__/self_attention.cpython-310.pyc +0 -0
- library/__pycache__/transformer.cpython-310.pyc +0 -0
- library/customSchedule.py +19 -0
- library/encoder_decoder.py +143 -0
- library/imageLoad.py +23 -0
- library/prediction.py +63 -0
- library/self_attention.py +101 -0
- library/transformer.py +60 -0
- model/fingerprint.pb +3 -0
- model/model-20/checkpoint +2 -0
- model/model-20/custom_objects-15.pkl +3 -0
- model/model-20/model_weights-15.data-00000-of-00001 +3 -0
- model/model-20/model_weights-15.index +0 -0
- model/model-20/training_validation_accuracy.png +0 -0
- model/model-20/training_validation_loss.png +0 -0
- model/saved_model.pb +3 -0
- model/variables/variables.data-00000-of-00001 +3 -0
- model/variables/variables.index +0 -0
- requirements.txt +11 -0
- transformer/tokenizer.pickle +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model/model-20/model_weights-15.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+model/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
@@ -0,0 +1,182 @@
from flask import Flask, request, jsonify
import cv2
import numpy as np
from keras.applications import ResNet152
from keras.optimizers import Adam
from keras.models import Sequential, Model, load_model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import add
from keras.utils import to_categorical

from tensorflow.keras.applications.resnet import preprocess_input
from keras.preprocessing import image, sequence
from keras_preprocessing.sequence import pad_sequences
from tqdm import tqdm
import pickle
import tensorflow as tf
# from keras.applications.Resnet50 import preprocess_input
from flask_cors import CORS

# Transformer
from library.prediction import evaluate_single_image
from library.transformer import Transformer
from library.customSchedule import learning_rate

top_k = 25000
num_layer = 4
d_model = 512
dff = 2048
num_heads = 8
row_size = 8
col_size = 8
target_vocab_size = top_k + 1
dropout_rate = 0.1


loaded_transformer = Transformer(num_layer, d_model, num_heads, dff, row_size, col_size,
                                 target_vocab_size, max_pos_encoding=target_vocab_size,
                                 rate=dropout_rate)

# Load the weights into the model
loaded_transformer.load_weights('models/Transformer/model')
loaded_transformer.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))
print("Transformer model loaded successfully")
# loaded_transformer.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss=train_loss.result(), metrics=[train_accuracy])

global tokenizer
with open('pickle_files/transformer/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

print("Tokenizer loaded successfully")


# LSTM Model
# incept_model = ResNet152(weights='imagenet', include_top=False)
# last = incept_model.layers[-2].output
# ResNet152Model = Model(inputs=incept_model.input, outputs=last)
ResNet152Model = ResNet152(include_top=False, weights='imagenet', input_shape=(224, 224, 3), pooling='avg')
with open("pickle_files/lstm/words_dict_nepali_sc.pkl", "rb") as f:
    words_dict = pickle.load(f)

# vocab_size = len(words_dict) + 1
vocab_size = 5521
# MAX_LEN = 192
MAX_LEN = 210
inv_dict = {v: k for k, v in words_dict.items()}


# model = tf.keras.models.load_model('models/LSTM/cultural_nepali_50.h5')

# image feature model
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# language sequence model
inputs2 = Input(shape=(MAX_LEN,))
se1 = Embedding(vocab_size, MAX_LEN, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# decoder model
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# tie it together: [image, seq] -> [word]
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.load_weights("models/LSTM/resnet152_lstm_model_weights_50epoch.h5")
print("LSTM model loaded successfully")


app = Flask(__name__)
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 1
cors = CORS(app, resources={r"/*": {"origins": "*"}})
# @app.route('/')
# def index():
#     return render_template('index.html')


@app.route('/tranformer', methods=['POST'])
def tranformer():
    if 'file' not in request.files:
        return 'No file part'

    file = request.files['file']

    if file.filename == '':
        return 'No selected file'

    # Save the file
    file.save('static/file.jpg')
    caption = evaluate_single_image("static/file.jpg", tokenizer, loaded_transformer)
    print(caption)
    return jsonify({'caption': caption})


@app.route('/lstm', methods=['POST'])
def after():
    if 'file' not in request.files:
        return 'No file part'

    file = request.files['file']

    if file.filename == '':
        return 'No selected file'

    # Save the file
    file.save('static/file.jpg')

    # Read the saved file and extract ResNet152 features
    img = cv2.imread('static/file.jpg')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = np.expand_dims(img, axis=0)  # Add batch dimension
    img = preprocess_input(img)
    # img = img.reshape(1, 224, 224, 3)
    test_img_resized = ResNet152Model.predict(img).reshape(1, 2048)

    # Greedy decoding with the LSTM decoder
    text_inp = ['startofseq']
    count = 0
    caption = ''
    while count < MAX_LEN:
        count += 1
        # Convert words to indices, using the index for '<end>' for unknown words
        encoded = [words_dict.get(word, len(words_dict) - 1) for word in text_inp]
        # Pad the sequence to MAX_LEN
        encoded = pad_sequences([encoded], padding='post', truncating='post', maxlen=MAX_LEN)[0]

        data_list = [test_img_resized.reshape(1, -1), encoded.reshape(1, -1)]
        prediction = np.argmax(model.predict(data_list))
        sampled_word = inv_dict[prediction]
        caption = caption + ' ' + sampled_word

        if sampled_word == 'endofseq':
            break
        text_inp.append(sampled_word)

    caption = caption.replace('endofseq', '')
    print(caption.replace(' .', '.'))

    return jsonify({'caption': caption.replace(' .', '.')})


if __name__ == "__main__":
    app.run(debug=True)
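Both routes accept a multipart file upload and return the caption as JSON. A minimal client sketch for quick manual testing; the host, port, and test image name below are assumptions for illustration, not part of this upload:

import requests

# Hypothetical local test against the two routes defined in app.py.
with open("test.jpg", "rb") as f:
    resp = requests.post("http://127.0.0.1:5000/tranformer", files={"file": f})
print(resp.json()["caption"])

with open("test.jpg", "rb") as f:
    resp = requests.post("http://127.0.0.1:5000/lstm", files={"file": f})
print(resp.json()["caption"])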
library/Multihead_attention.py
ADDED
@@ -0,0 +1,57 @@
import tensorflow as tf
from library.self_attention import scaled_dot_product_attention

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth).
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights


def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])
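The layer follows the standard TensorFlow-tutorial formulation of multi-head attention. A minimal shape-check sketch with random tensors, sized to match the d_model=512, num_heads=8 configuration used in app.py (batch size and sequence length are arbitrary):

import tensorflow as tf
from library.Multihead_attention import MultiHeadAttention

# Hypothetical shape check: batch=2, seq_len=10, d_model=512.
mha = MultiHeadAttention(d_model=512, num_heads=8)
x = tf.random.uniform((2, 10, 512))
out, attn = mha(x, x, x, mask=None)
print(out.shape)   # (2, 10, 512)
print(attn.shape)  # (2, 8, 10, 10)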
library/__pycache__/Multihead_attention.cpython-310.pyc
ADDED
Binary file (1.93 kB)

library/__pycache__/customSchedule.cpython-310.pyc
ADDED
Binary file (994 Bytes)

library/__pycache__/encoder_decoder.cpython-310.pyc
ADDED
Binary file (4.59 kB)

library/__pycache__/imageLoad.cpython-310.pyc
ADDED
Binary file (711 Bytes)

library/__pycache__/prediction.cpython-310.pyc
ADDED
Binary file (1.79 kB)

library/__pycache__/self_attention.cpython-310.pyc
ADDED
Binary file (3.18 kB)

library/__pycache__/transformer.cpython-310.pyc
ADDED
Binary file (2.08 kB)
library/customSchedule.py
ADDED
@@ -0,0 +1,19 @@
import tensorflow as tf

d_model = 512

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)  # Ensure d_model is a float32
        self.warmup_steps = tf.cast(warmup_steps, tf.float32)  # Ensure warmup_steps is a float32

    def __call__(self, step):
        step = tf.cast(step, tf.float32)  # Ensure step is a float32

        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

learning_rate = CustomSchedule(d_model)
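This is the warmup schedule from "Attention Is All You Need": lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5), i.e. a linear warmup for the first 4000 steps followed by inverse-square-root decay. A minimal sketch of how such a schedule is typically plugged into Adam; the beta/epsilon values below follow the original paper and are an assumption, not something configured in this repository (app.py uses Adam's defaults):

import tensorflow as tf
from library.customSchedule import learning_rate

# Sketch only: pass the schedule object directly to the optimizer.
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# The schedule can be inspected by calling it with a step number.
print(float(learning_rate(tf.constant(1000.0))))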
library/encoder_decoder.py
ADDED
@@ -0,0 +1,143 @@
import tensorflow as tf
from library.Multihead_attention import MultiHeadAttention, point_wise_feed_forward_network
from library.self_attention import positional_encoding_1d, positional_encoding_2d

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask=None):
        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

        return out2


class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training,
             look_ahead_mask=None, padding_mask=None):
        # enc_output.shape == (batch_size, input_seq_len, d_model)

        # use the look-ahead mask so that during self-attention the current query
        # does not attend to future tokens
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        # use the padding mask to ignore padded positions in both enc_output and dec_input
        attn2, attn_weights_block2 = self.mha2(
            enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2


class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff,
                 row_size, col_size, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Dense(self.d_model, activation='relu')
        self.pos_encoding = positional_encoding_2d(row_size, col_size, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask=None):
        # shape(x) = (batch_size, seq_len(H*W), features)
        seq_len = tf.shape(x)[1]

        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len(H*W), d_model)
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)


class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding_1d(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training,
             look_ahead_mask=None, padding_mask=None):

        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                   look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights
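The Encoder consumes a flattened grid of image features (its "embedding" is a Dense projection rather than a lookup table, since the inputs are continuous CNN features and the positions form a 2-D grid), while the Decoder embeds target tokens and cross-attends to the encoder output. A minimal shape-check sketch using the same sizes as app.py; batch size and target length below are arbitrary assumptions:

import tensorflow as tf
from library.encoder_decoder import Encoder, Decoder

# Hypothetical shape check: an 8x8 grid of 2048-d image features and a 12-token target sequence.
encoder = Encoder(num_layers=4, d_model=512, num_heads=8, dff=2048, row_size=8, col_size=8)
decoder = Decoder(num_layers=4, d_model=512, num_heads=8, dff=2048,
                  target_vocab_size=25001, maximum_position_encoding=25001)

img_features = tf.random.uniform((2, 64, 2048))                    # (batch, H*W, features)
tokens = tf.random.uniform((2, 12), maxval=25001, dtype=tf.int32)  # (batch, target_seq_len)

enc_out = encoder(img_features, training=False)            # (2, 64, 512)
dec_out, attn = decoder(tokens, enc_out, training=False)   # (2, 12, 512)
print(enc_out.shape, dec_out.shape)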
library/imageLoad.py
ADDED
@@ -0,0 +1,23 @@
import tensorflow as tf

def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img, image_path


# Feature extraction

image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                weights='imagenet')
new_input = image_model.input
hidden_layer = image_model.layers[-1].output

image_features_extract_model = tf.keras.Model(new_input, hidden_layer)


# Tokenizer
library/prediction.py
ADDED
@@ -0,0 +1,63 @@
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
import tensorflow as tf
import numpy as np
import pickle

from library.transformer import Transformer
from library.imageLoad import image_features_extract_model, load_image
from library.self_attention import create_masks_decoder, scaled_dot_product_attention


def evaluate(image, tokenizer, loaded_transformer):
    temp_input = tf.expand_dims(load_image(image)[0], 0)
    img_tensor_val = image_features_extract_model(temp_input)
    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))

    start_token = tokenizer.word_index['<start>']
    end_token = tokenizer.word_index['<end>']

    # The decoder input starts with the start token.
    decoder_input = [start_token]
    output = tf.expand_dims(decoder_input, 0)  # tokens
    result = []  # word list

    for i in range(100):
        dec_mask = create_masks_decoder(output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = loaded_transformer(img_tensor_val, output, False, dec_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        # return the result if the predicted_id is equal to the end token
        if predicted_id == end_token:
            return result, tf.squeeze(output, axis=0), attention_weights
        # concatenate the predicted_id to the output, which is fed back
        # to the decoder as its input.
        result.append(tokenizer.index_word[int(predicted_id)])
        output = tf.concat([output, predicted_id], axis=-1)

    return result, tf.squeeze(output, axis=0), attention_weights


# Assuming the evaluate function above is used to generate captions
def evaluate_single_image(image_path, tokenizer, loaded_transformer):
    start_token = tokenizer.word_index['<start>']
    end_token = tokenizer.word_index['<end>']

    # Evaluate the caption for the given image
    caption, _, _ = evaluate(image_path, tokenizer, loaded_transformer)

    # Remove "<unk>" from the result
    caption = [word for word in caption if word != "<unk>"]

    # Remove the trailing <end> token from the result
    result_join = ' '.join(caption)
    result_final = result_join.rsplit(' ', 1)[0]

    return result_final
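evaluate runs greedy decoding: starting from the '<start>' token, it feeds the growing token sequence back into the transformer and stops at '<end>' or after 100 steps; evaluate_single_image then filters '<unk>' tokens and drops the last word. A hypothetical one-off call, assuming tokenizer and loaded_transformer are already set up exactly as in app.py and 'sample.jpg' is a local test image:

from library.prediction import evaluate_single_image

# 'tokenizer' and 'loaded_transformer' must already exist (see app.py);
# 'sample.jpg' is a hypothetical test image.
caption = evaluate_single_image("sample.jpg", tokenizer, loaded_transformer)
print(caption)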
library/self_attention.py
ADDED
@@ -0,0 +1,101 @@
import tensorflow as tf
import numpy as np


def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates


def positional_encoding_1d(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)


def positional_encoding_2d(row, col, d_model):
    assert d_model % 2 == 0
    # the first d_model/2 dimensions encode the row position, the second d_model/2 encode the column position
    row_pos = np.repeat(np.arange(row), col)[:, np.newaxis]
    col_pos = np.repeat(np.expand_dims(np.arange(col), 0), row, axis=0).reshape(-1, 1)
    angle_rads_row = get_angles(row_pos, np.arange(d_model // 2)[np.newaxis, :], d_model // 2)
    angle_rads_col = get_angles(col_pos, np.arange(d_model // 2)[np.newaxis, :], d_model // 2)
    # apply sin and cos to even and odd indices respectively
    angle_rads_row[:, 0::2] = np.sin(angle_rads_row[:, 0::2])
    angle_rads_row[:, 1::2] = np.cos(angle_rads_row[:, 1::2])
    angle_rads_col[:, 0::2] = np.sin(angle_rads_col[:, 0::2])
    angle_rads_col[:, 1::2] = np.cos(angle_rads_col[:, 1::2])
    pos_encoding = np.concatenate([angle_rads_row, angle_rads_col], axis=1)[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)


def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    # add extra dimensions to add the padding
    # to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)


def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)


def create_masks_decoder(tar):
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return combined_mask


def scaled_dot_product_attention(q, k, v, mask):
    """Calculate the attention weights.

    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type (padding or look ahead)
    but it must be broadcastable for addition.

    Args:
      q: query shape == (..., seq_len_q, depth)
      k: key shape == (..., seq_len_k, depth)
      v: value shape == (..., seq_len_v, depth_v)
      mask: Float tensor with shape broadcastable
            to (..., seq_len_q, seq_len_k). Defaults to None.

    Returns:
      output, attention_weights
    """

    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  # add a large negative value where mask is 1 so those positions are ignored by the softmax

    # softmax is normalized on the last axis (seq_len_k) so that the scores add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights
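A small toy example of the attention primitive and the look-ahead mask; the tensor values below are illustrative assumptions only:

import tensorflow as tf
from library.self_attention import scaled_dot_product_attention, create_look_ahead_mask

# Hypothetical toy example: one query attending over three keys/values.
q = tf.constant([[[0.0, 10.0, 0.0]]])                        # (1, 1, 3)
k = tf.constant([[[10.0, 0.0, 0.0],
                  [0.0, 10.0, 0.0],
                  [0.0, 0.0, 10.0]]])                        # (1, 3, 3)
v = tf.constant([[[1.0], [2.0], [3.0]]])                     # (1, 3, 1)

out, weights = scaled_dot_product_attention(q, k, v, mask=None)
print(weights.numpy())  # nearly all weight on the second key
print(out.numpy())      # close to [[2.0]]

# The look-ahead mask is 1 above the diagonal, so position i cannot attend to j > i.
print(create_look_ahead_mask(4).numpy())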
library/transformer.py
ADDED
@@ -0,0 +1,60 @@
import tensorflow as tf
from library.self_attention import create_padding_mask, create_masks_decoder, scaled_dot_product_attention

from library.Multihead_attention import MultiHeadAttention, point_wise_feed_forward_network
from library.customSchedule import learning_rate
from library.encoder_decoder import Encoder, Decoder, EncoderLayer, DecoderLayer
import pickle


def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img, image_path


# Feature extraction

image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                weights='imagenet')
new_input = image_model.input
hidden_layer = image_model.layers[-1].output

image_features_extract_model = tf.keras.Model(new_input, hidden_layer)


class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, row_size, col_size,
                 target_vocab_size, max_pos_encoding, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, row_size, col_size, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               target_vocab_size, max_pos_encoding, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, look_ahead_mask=None, dec_padding_mask=None, enc_padding_mask=None):

        enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(
            tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights


# # Load the custom objects
# with open('models/Transformer/custom_objects-80.pkl', 'rb') as f:
#     custom_objects = pickle.load(f)

# Assuming you have the same model architecture defined in the 'Transformer' class,
# create an instance of the Transformer model (without loading weights).
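A sketch of how these pieces are wired together at inference time, mirroring app.py and library/prediction.py; the weight path is the one app.py references, and 'example.jpg' plus the start-token id of 1 are assumptions for illustration:

import tensorflow as tf
from library.transformer import Transformer, image_features_extract_model, load_image
from library.self_attention import create_masks_decoder

# Sketch only: same hyperparameters as app.py.
transformer = Transformer(num_layers=4, d_model=512, num_heads=8, dff=2048,
                          row_size=8, col_size=8, target_vocab_size=25001,
                          max_pos_encoding=25001, rate=0.1)
transformer.load_weights('models/Transformer/model')  # path taken from app.py

img = tf.expand_dims(load_image('example.jpg')[0], 0)         # 'example.jpg' is hypothetical
features = image_features_extract_model(img)                   # (1, 8, 8, 2048)
features = tf.reshape(features, (1, -1, features.shape[3]))    # (1, 64, 2048)

tokens = tf.constant([[1]])                                     # assumed <start> token id
mask = create_masks_decoder(tokens)
logits, attn = transformer(features, tokens, False, mask)
print(logits.shape)                                             # (1, 1, 25001)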
model/fingerprint.pb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f8a25f566874573317d02eb3d331f81f46a4d188159c889c2624db8a36a343ee
size 58

model/model-20/checkpoint
ADDED
@@ -0,0 +1,2 @@
model_checkpoint_path: "model_weights-15"
all_model_checkpoint_paths: "model_weights-15"

model/model-20/custom_objects-15.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:107c764c4d70f7416ba7b4f6af4b8acd80b494d0bc0b2bf12416a8f513221992
size 47

model/model-20/model_weights-15.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bb6aa6a6153cb29d04510d07dba14c5fea247cc799e25b4f59bc581bac609c01
size 224434664

model/model-20/model_weights-15.index
ADDED
Binary file (11.4 kB)

model/model-20/training_validation_accuracy.png
ADDED

model/model-20/training_validation_loss.png
ADDED

model/saved_model.pb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6361b491875593db4e3fd758462d6ebbe9f1ffd17facc1b5568b028932d17df3
size 57028385

model/variables/variables.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:227114d56ffd12d39a4f84e2e5aa9cd0485dc255925c32ff6d081a4011af0dbd
size 224513816

model/variables/variables.index
ADDED
Binary file (9.91 kB)
requirements.txt
ADDED
@@ -0,0 +1,11 @@
Flask==2.0.2
opencv-python
keras==2.15.0
numpy>=1.23.5
keras-applications==1.0.8
tqdm
flask-cors
Werkzeug==2.0.2
tensorflow
keras_preprocessing==1.1.2
nltk==3.6.5
transformer/tokenizer.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:56ce18abc08dcf20877edbb48701616635801a1c69b8673db5605101f04e623a
size 1368089