Spaces:

Tayaba171
/

CALText-TextRecognizer2

Runtime error

File size: 29,802 Bytes

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Model
from matplotlib import pyplot as plt
from PIL import Image,ImageFont, ImageDraw
from skimage.transform import rescale, resize
import numpy as np
import re
import math
import copy
import random
import time
import data
import cv2

#import data
# Module-wide NumPy random generator used by norm_weight() and
# conv_norm_weight(); it is seeded from the wall clock, so initial weights
# differ from run to run.
rng = np.random.RandomState(int(time.time()))

#### training setup parameters ####
lambda_val=1e-4  # weight of the squared-weight (L2) penalty added in get_loss()
gamma_val=1  # passed as alpha_reg (attention regularisation weight) to CALText.get_cost()
num_classes=130  # passed as target_dim to CALText; get_sample() asserts the vocabulary has this size

##  Utility functions used to initialize variables
def norm_weight(fan_in, fan_out):
    """Draw a (fan_in, fan_out) float32 matrix from a Glorot-style uniform range.

    Entries are sampled with the module-level ``rng`` from the interval
    [-b, b) where b = sqrt(6 / (fan_in + fan_out)).
    """
    limit = np.sqrt(6.0 / (fan_in + fan_out))
    samples = rng.uniform(low=-limit, high=limit, size=(fan_in, fan_out))
    return np.asarray(samples, dtype=np.float32)

def conv_norm_weight(nin, nout, kernel_size):
    """Draw a float32 convolution filter bank of shape (kh, kw, nin, nout).

    Entries are sampled with the module-level ``rng`` from [-b, b) where
    b = sqrt(6 / (fan_in + fan_out)), fan_in = kh * kw * nin and
    fan_out = kh * kw * nout.
    """
    k_h, k_w = kernel_size[0], kernel_size[1]
    taps = k_h * k_w
    limit = np.sqrt(6. / (taps * nin + taps * nout))
    kernel = rng.uniform(low=-limit, high=limit, size=(k_h, k_w, nin, nout))
    return np.asarray(kernel, dtype=np.float32).astype('float32')

def ortho_weight(ndim):
    """Return a random orthogonal (ndim, ndim) float32 matrix.

    A square standard-normal matrix is drawn from NumPy's global random state
    and the left singular vectors of its SVD are returned.
    """
    gaussian = np.random.randn(ndim, ndim)
    left_vectors = np.linalg.svd(gaussian)[0]
    return left_vectors.astype('float32')

#####
class DenseEncoder(layers.Layer):
    """DenseNet-style convolutional encoder.

    The encoder applies one input convolution (followed by batch norm, ReLU and
    2x2 max pooling) and then ``blocks`` dense blocks of ``level`` levels each.
    A level is a 1x1 bottleneck convolution followed by a 3x3 convolution, each
    with batch norm, ReLU and dropout; its output is concatenated onto the
    running feature map.  Every block except the last is followed by a
    transition layer (1x1 convolution, batch norm, ReLU, dropout, 2x2 average
    pooling) that compresses the channel count by ``transition``.

    After construction ``self.dense_channels`` holds the channel count of the
    feature map returned by :meth:`dense_net`.

    NOTE: layers are created in a fixed order and stored under the attribute
    names ``conv1``, ``batch_norm``, ``conv``, ``batchnorm``, ``conv2`` and
    ``batchnorm2``.  Keep both stable: automatically generated layer names and
    previously saved weights may depend on them.
    """

    def __init__(self, blocks,                        # number of dense blocks
                 level,                               # number of levels in each block
                 growth_rate,                         # growth rate in DenseNet paper: k
                 istraining,                          # used as BatchNormalization(trainable=...) and Dropout(training=...)
                 dropout_rate=0.2,                    # drop rate handed to the Dropout layer
                 dense_channels=0,                    # starting channel count; input_conv_filters is added to it
                 transition=0.5,                      # compression rate of the transition layers
                 input_conv_filters=48,               # filter count of the conv2d before the dense blocks
                 input_conv_stride=(2,2),             # stride of the conv2d before the dense blocks
                 input_conv_kernel=(7,7), **kwargs):  # kernel size of the conv2d before the dense blocks
        super(DenseEncoder, self).__init__(**kwargs)
        self.blocks = blocks
        self.growth_rate = growth_rate
        self.training = istraining
        self.dense_channels = dense_channels
        self.level = level
        self.dropout_rate = dropout_rate
        self.transition = transition
        self.input_conv_kernel = input_conv_kernel
        self.input_conv_stride = input_conv_stride
        self.input_conv_filters = input_conv_filters

        #### stem: conv -> batch norm -> relu -> max pool ####
        # The stem convolution sees a single input channel (dense_net() adds it).
        self.limit = self.bound(1, self.input_conv_filters, self.input_conv_kernel)
        self.conv1 = self._conv(self.input_conv_filters, self.input_conv_kernel,
                                self.input_conv_stride, 'same', self.limit)
        self.batch_norm = self._batch_norm(self.input_conv_filters)
        self.relu = tf.keras.layers.ReLU()
        self.maxpool = tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2), padding='same')

        self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
        self.avgpool = tf.keras.layers.AveragePooling2D(pool_size=(2, 2), strides=(2, 2), padding='same')

        self.conv = []          # [1x1, 3x3] convolution pairs, one pair per level, in call order
        self.conv2 = []         # transition convolutions, one per block except the last
        self.batchnorm = []     # batch norms matching self.conv index by index
        self.batchnorm2 = []    # batch norms matching self.conv2 index by index
        self.dense_channels += self.input_conv_filters
        bottleneck_channels = 4 * self.growth_rate
        for i in range(self.blocks):
            for _ in range(self.level):
                #### [1, 1] bottleneck convolution ####
                limit = self.bound(self.dense_channels, bottleneck_channels, [1, 1])
                self.conv.append(self._conv(bottleneck_channels, (1, 1), (1, 1), 'valid', limit))
                self.batchnorm.append(self._batch_norm(bottleneck_channels))

                #### [3, 3] convolution producing growth_rate new channels ####
                limit = self.bound(bottleneck_channels, self.growth_rate, [3, 3])
                self.conv.append(self._conv(self.growth_rate, (3, 3), (1, 1), 'same', limit))
                self.batchnorm.append(self._batch_norm(self.growth_rate))
                self.dense_channels += self.growth_rate

            if i < self.blocks - 1:
                #### transition layer: compress the block's output channels ####
                block_channels = self.dense_channels
                compressed_channels = int(block_channels * self.transition)
                # The bound must use the uncompressed channel count as fan-in:
                # that is what the transition convolution actually receives.
                limit = self.bound(block_channels, compressed_channels, [1, 1])
                self.conv2.append(self._conv(compressed_channels, (1, 1), (1, 1), 'valid', limit))
                self.batchnorm2.append(self._batch_norm(compressed_channels))

                #### new dense channels for new dense block ####
                self.dense_channels = compressed_channels

    @staticmethod
    def _conv(filters, kernel_size, strides, padding, limit):
        """Build a bias-free channels-last Conv2D initialised uniformly in [-limit, limit]."""
        return tf.keras.layers.Conv2D(
            filters=filters, kernel_size=kernel_size, strides=strides, padding=padding,
            data_format='channels_last', use_bias=False,
            kernel_initializer=tf.random_uniform_initializer(-limit, limit))

    def _batch_norm(self, channels):
        """Build a BatchNormalization whose gamma starts uniform in +-1/sqrt(channels)."""
        gamma_bound = 1.0 / math.sqrt(channels)
        return tf.keras.layers.BatchNormalization(
            trainable=self.training, momentum=0.9, scale=True,
            gamma_initializer=tf.random_uniform_initializer(-gamma_bound, gamma_bound),
            epsilon=0.0001)

    def bound(self, nin, nout, kernel):
        """Return sqrt(6 / (fan_in + fan_out)) for a kernel[0] x kernel[1] convolution."""
        fin = nin * kernel[0] * kernel[1]
        fout = nout * kernel[0] * kernel[1]
        return np.sqrt(6. / (fin + fout))

    def dense_net(self, input_x, mask_x):
        """Encode ``input_x`` and subsample ``mask_x`` alongside it.

        Args:
            input_x: image batch without a channel axis; a channel axis is
                appended at position 3 (so presumably [batch, h, w] -- confirm
                against the caller).
            mask_x: mask indexed as [batch, h, w]; every second row and column
                is kept each time the feature map is downsampled.  The mask
                step is fixed at 2, which matches the default
                ``input_conv_stride`` only.

        Returns:
            Tuple ``(dense_out, mask_x)``: the final feature map and the mask
            subsampled by the same number of stages.
        """
        #### before flowing into dense blocks ####
        x = tf.expand_dims(input=input_x, axis=3)
        x = self.conv1(x)
        mask_x = mask_x[:, 0::2, 0::2]
        x = self.batch_norm(x)
        x = self.relu(x)
        x = self.maxpool(x)
        mask_x = mask_x[:, 0::2, 0::2]
        dense_out = x

        #### flowing into dense blocks and transition_layer ####
        layer_idx = 0   # position in self.conv / self.batchnorm (two entries per level)
        for i in range(self.blocks):
            for _ in range(self.level):
                #### [1, 1] convolution part for bottleneck ####
                x = self.conv[layer_idx](x)
                x = self.batchnorm[layer_idx](x)
                x = self.relu(x)
                x = self.dropout(x, training=self.training)

                #### [3, 3] convolution part for regular convolve operation ####
                x = self.conv[layer_idx + 1](x)
                x = self.batchnorm[layer_idx + 1](x)
                x = self.relu(x)
                x = self.dropout(x, training=self.training)
                layer_idx += 2

                # Dense connectivity: the next level sees everything produced so far.
                dense_out = tf.concat([dense_out, x], axis=3)
                x = dense_out

            if i < self.blocks - 1:
                #### transition layer between dense blocks ####
                x = self.conv2[i](x)
                x = self.batchnorm2[i](x)
                x = self.relu(x)
                x = self.dropout(x, training=self.training)
                x = self.avgpool(x)
                dense_out = x
                mask_x = mask_x[:, 0::2, 0::2]

        return dense_out, mask_x

'''
ContextualAttention class implements contextual attention mechanism.
'''
class ContextualAttention(layers.Layer):
    """Attention over the encoder feature grid with a coverage term.

    The score of every grid cell combines three projections into a common
    ``dim_attend`` space: the convolved sum of past attention maps (coverage),
    the encoder annotation of the cell and the current decoder state.
    """

    def __init__(self, channels,                 # channel count of the DenseEncoder output | [batch, h, w, channels]
                 dim_decoder, dim_attend, **kwargs):   # decoder hidden size / shared attention size
        super(ContextualAttention, self).__init__(**kwargs)
        self.channels = channels

        self.coverage_kernel = [11,11]           # kernel size of the coverage filter $Q$
        self.coverage_filters = dim_attend       # number of coverage filters

        self.dim_decoder = dim_decoder
        self.dim_attend = dim_attend             # shared size of $Q*beta_t$, $U_a a_i$ and $W_a h_{t-1}$

        def zero_bias(size):
            return np.zeros((size,)).astype('float32')

        # $U_f f_i$ | [cov_filters, dim_attend]
        self.U_f = tf.Variable(norm_weight(self.coverage_filters, self.dim_attend), name='U_f')
        self.U_f_b = tf.Variable(zero_bias(self.dim_attend), name='U_f_b')

        # $U_a a_i$ | [annotation_channels, dim_attend]
        self.U_a = tf.Variable(norm_weight(self.channels, self.dim_attend), name='U_a')
        self.U_a_b = tf.Variable(zero_bias(self.dim_attend), name='U_a_b')

        # $W_a h_{t-1}$ | [dec_dim, dim_attend]
        self.W_a = tf.Variable(norm_weight(self.dim_decoder, self.dim_attend), name='W_a')
        self.W_a_b = tf.Variable(zero_bias(self.dim_attend), name='W_a_b')

        # $V_a tanh(.)$ | [dim_attend, 1]
        self.V_a = tf.Variable(norm_weight(self.dim_attend, 1), name='V_a')
        self.V_a_b = tf.Variable(zero_bias(1), name='V_a_b')

        self.alpha_past_filter = tf.Variable(
            conv_norm_weight(1, self.dim_attend, self.coverage_kernel), name='alpha_past_filter')

    def get_context(self, annotation4ctx, h_t_1, alpha_past4ctx, a_mask):
        """Compute the context vector and attention map for one decoding step.

        Args:
            annotation4ctx: encoder annotations | [batch, h, w, channels]
            h_t_1: decoder state used to query the annotations | [batch, dim_decoder]
            alpha_past4ctx: accumulated past attention maps | [batch, h, w]
            a_mask: optional annotation mask multiplied into the weights, or None.

        Returns:
            ``(context, alpha, alpha_past)``: the attention-weighted sum of the
            annotations, the normalised weights of this step and the
            accumulated weights including this step.
        """
        #### coverage: convolve the accumulated attention, then project ####
        coverage_maps = tf.nn.conv2d(
            alpha_past4ctx[:, :, :, None], filters=self.alpha_past_filter,
            strides=[1, 1, 1, 1], padding='SAME')
        coverage_term = tf.tensordot(coverage_maps, self.U_f, axes=1)      # [batch, h, w, dim_attend]

        #### annotation and decoder-state projections ####
        annotation_term = tf.tensordot(annotation4ctx, self.U_a, axes=1)   # [batch, h, w, dim_attend]
        state_term = tf.tensordot(h_t_1, self.W_a, axes=1)[:, None, None, :]   # broadcast over h, w

        # Only U_f_b is added as a bias; U_a_b and W_a_b are not used here.
        activated = tf.tanh(coverage_term + annotation_term + state_term + self.U_f_b)
        scores = tf.tensordot(activated, self.V_a, axes=1) + self.V_a_b    # [batch, h, w, 1]
        alpha = tf.squeeze(tf.exp(scores), axis=3)

        if a_mask is not None:
            alpha = alpha * a_mask

        # Normalise over the spatial grid so each map sums to one.
        alpha = alpha / tf.reduce_sum(alpha, axis=[1, 2], keepdims=True)
        alpha_past4ctx += alpha
        context = tf.reduce_sum(annotation4ctx * alpha[:, :, :, None], axis=[1, 2])   # [batch, channels]
        return context, alpha, alpha_past4ctx

'''
Decoder class implements a 2-layer GRU decoder which decodes an input image
and outputs a sequence of characters using the attention mechanism.
'''
class Decoder(layers.Layer):
    """Two stacked GRU updates with an attention lookup in between.

    The first update consumes the previous-symbol embedding, the attention
    module is then queried with the intermediate state, and the second update
    consumes the resulting context vector.
    """

    def __init__(self, hidden_dim, word_dim, contextual_attention, context_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.contextual_attention = contextual_attention   # attention module that supplies the context vector
        self.context_dim = context_dim                     # size of the context vector
        self.hidden_dim = hidden_dim                       # size of the hidden state
        self.word_dim = word_dim                           # size of the symbol embedding

        def zeros(size):
            return np.zeros((size,)).astype('float32')

        def side_by_side(make_block):
            # Two independently drawn blocks, concatenated along the columns.
            return np.concatenate([make_block(), make_block()], axis=1)

        #### GRU 1 weights ####
        self.W_yz_yr = tf.Variable(
            side_by_side(lambda: norm_weight(self.word_dim, self.hidden_dim)),
            name='W_yz_yr')                                # [dim_word, 2 * dim_hidden]
        self.b_yz_yr = tf.Variable(zeros(2 * self.hidden_dim), name='b_yz_yr')
        self.U_hz_hr = tf.Variable(
            side_by_side(lambda: ortho_weight(self.hidden_dim)),
            name='U_hz_hr')                                # [dim_hidden, 2 * dim_hidden]
        self.W_yh = tf.Variable(norm_weight(self.word_dim, self.hidden_dim), name='W_yh')
        self.b_yh = tf.Variable(zeros(self.hidden_dim), name='b_yh')
        self.U_rh = tf.Variable(ortho_weight(self.hidden_dim), name='U_rh')   # [dim_hidden, dim_hidden]

        #### GRU 2 weights ####
        self.U_hz_hr_nl = tf.Variable(
            side_by_side(lambda: ortho_weight(self.hidden_dim)),
            name='U_hz_hr_nl')                             # [dim_hidden, 2 * dim_hidden]
        self.b_hz_hr_nl = tf.Variable(zeros(2 * self.hidden_dim), name='b_hz_hr_nl')
        self.W_c_z_r = tf.Variable(
            norm_weight(self.context_dim, 2 * self.hidden_dim), name='W_c_z_r')
        self.U_rh_nl = tf.Variable(ortho_weight(self.hidden_dim), name='U_rh_nl')
        self.b_rh_nl = tf.Variable(zeros(self.hidden_dim), name='b_rh_nl')
        self.W_c_h_nl = tf.Variable(
            norm_weight(self.context_dim, self.hidden_dim), name='W_c_h_nl')

    def get_ht_ctx(self, emb_y, target_hidden_state_0, annotations, a_m, y_m):
        """Run :meth:`one_time_step` over all time steps with ``tf.scan``.

        The scan state is ``(h, context, alpha, alpha_past, annotations,
        a_mask)``; context and both attention maps start at zero.  The result
        of ``tf.scan`` over that state is returned unchanged.
        """
        annotation_shape = tf.shape(annotations)
        batch, height, width = annotation_shape[0], annotation_shape[1], annotation_shape[2]
        initial_state = (
            target_hidden_state_0,
            tf.zeros([batch, self.context_dim]),
            tf.zeros([batch, height, width]),
            tf.zeros([batch, height, width]),
            annotations,
            a_m,
        )
        return tf.scan(self.one_time_step, elems=(emb_y, y_m), initializer=initial_state)

    def one_time_step(self, tuple_h0_ctx_alpha_alpha_past_annotation, tuple_emb_mask):
        """Advance the decoder by one symbol.

        Args:
            tuple_h0_ctx_alpha_alpha_past_annotation: previous scan state
                ``(h, context, alpha, alpha_past, annotations, a_mask)``; only
                h, alpha_past, annotations and a_mask are read.
            tuple_emb_mask: ``(emb_y, y_mask)`` for this step; ``y_mask`` may be
                None, in which case no masking is applied.

        Returns:
            The next scan state, in the same order as the input state.
        """
        h_prev, _, _, alpha_past, annotations, a_mask = tuple_h0_ctx_alpha_alpha_past_annotation
        emb_y, y_mask = tuple_emb_mask
        split = self.hidden_dim

        #### GRU 1: gates from the embedding and the previous state ####
        gates_from_emb = tf.tensordot(emb_y, self.W_yz_yr, axes=1) + self.b_yz_yr   # [batch, 2 * dim_hidden]
        gates_from_h = tf.tensordot(h_prev, self.U_hz_hr, axes=1)                   # [batch, 2 * dim_hidden]
        gates1 = tf.sigmoid(gates_from_emb + gates_from_h)
        r1, z1 = gates1[:, :split], gates1[:, split:]                               # [batch, dim_hidden] each

        cand_from_emb = tf.tensordot(emb_y, self.W_yh, axes=1) + self.b_yh          # [batch, dim_hidden]
        cand_from_h = tf.tensordot(h_prev, self.U_rh, axes=1) * r1
        pre_h_proposal = tf.tanh(cand_from_h + cand_from_emb)

        pre_h = z1 * h_prev + (1. - z1) * pre_h_proposal
        if y_mask is not None:
            # Masked-out sequences carry their previous state forward.
            pre_h = y_mask[:, None] * pre_h + (1. - y_mask)[:, None] * h_prev

        #### attention lookup with the intermediate state ####
        context, alpha, alpha_past = self.contextual_attention.get_context(
            annotations, pre_h, alpha_past, a_mask)                                 # context: [batch, dim_ctx]

        #### GRU 2: gates from the intermediate state and the context ####
        gates_from_pre_h = tf.tensordot(pre_h, self.U_hz_hr_nl, axes=1) + self.b_hz_hr_nl
        gates_from_ctx = tf.tensordot(context, self.W_c_z_r, axes=1)
        gates2 = tf.sigmoid(gates_from_pre_h + gates_from_ctx)
        r2, z2 = gates2[:, :split], gates2[:, split:]

        # The bias is added after the point-wise multiplication with r2.
        cand_from_pre_h = tf.tensordot(pre_h, self.U_rh_nl, axes=1) * r2
        cand_from_pre_h = cand_from_pre_h + self.b_rh_nl
        cand_from_ctx = tf.tensordot(context, self.W_c_h_nl, axes=1)
        h_proposal = tf.tanh(cand_from_pre_h + cand_from_ctx)
        h = z2 * pre_h + (1. - z2) * h_proposal
        if y_mask is not None:
            h = y_mask[:, None] * h + (1. - y_mask)[:, None] * pre_h

        return h, context, alpha, alpha_past, annotations, a_mask


'''
CALText class is the main class. This class uses the three classes below:
1) DenseEncoder (Encoder)
2) ContextualAttention (Contextual attention mechanism)
3) Decoder (2-layer GRU Decoder)
CALText class implements two functions, get_cost and get_word, which are used for cost calculation and decoding respectively.
'''
class CALText(layers.Layer):
    """Training cost and single-step decoding on top of encoder, attention and decoder.

    ``get_cost`` computes the training loss for a batch of label sequences and
    ``get_word`` performs one decoding step at inference time.  Both share the
    output head: the GRU state, the context vector and the previous-symbol
    embedding are projected to ``word_dim``, summed, reduced by a pair-wise max
    (maxout) to ``word_dim // 2`` and projected to ``target_dim`` logits.
    """

    def __init__(self, dense_encoder, contextual_attention, decoder, hidden_dim, word_dim, context_dim, target_dim, istraining,**kwargs):
        super(CALText, self).__init__(**kwargs)
        self.hidden_dim = hidden_dim
        self.word_dim = word_dim
        self.context_dim = context_dim
        self.target_dim = target_dim
        self.embed_matrix = tf.Variable(norm_weight(self.target_dim, self.word_dim), name='embed')

        self.dense_encoder = dense_encoder
        self.contextual_attention = contextual_attention
        self.decoder = decoder
        self.Wa2h = tf.Variable(norm_weight(self.context_dim, self.hidden_dim), name='Wa2h')
        self.ba2h = tf.Variable(np.zeros((self.hidden_dim,)).astype('float32'), name='ba2h')
        self.Wc = tf.Variable(norm_weight(self.context_dim, self.word_dim), name='Wc')
        self.bc = tf.Variable(np.zeros((self.word_dim,)).astype('float32'), name='bc')
        self.Wh = tf.Variable(norm_weight(self.hidden_dim, self.word_dim), name='Wh')
        self.bh = tf.Variable(np.zeros((self.word_dim,)).astype('float32'), name='bh')
        self.Wy = tf.Variable(norm_weight(self.word_dim, self.word_dim), name='Wy')
        self.by = tf.Variable(np.zeros((self.word_dim,)).astype('float32'), name='by')
        # Wo consumes the maxout output, which has half of word_dim features.
        self.Wo = tf.Variable(norm_weight(self.word_dim//2, self.target_dim), name='Wo')
        self.bo = tf.Variable(np.zeros((self.target_dim,)).astype('float32'), name='bo')
        self.training = istraining
        self.dropout = tf.keras.layers.Dropout(rate=0.2)

    def get_cost(self, cost_annotation, cost_y, a_m, y_m,alpha_reg):
        """Return the scalar training cost of a batch.

        Args:
            cost_annotation: encoder annotations | [batch, h, w, context_dim]
            cost_y: label ids | [timesteps, batch]
            a_m: annotation mask | [batch, h, w]
            y_m: label mask, reshaped to match ``cost_y`` element-wise
            alpha_reg: weight of the attention L1 term; the term is added only
                when this is greater than zero.

        Returns:
            Masked cross entropy summed over time and averaged over the batch,
            plus ``alpha_reg`` times the batch mean of the summed absolute
            attention weights when ``alpha_reg > 0``.
        """
        #### step: 1 preparation of embedding of labels sequences ####
        timesteps = tf.shape(cost_y)[0]
        batch_size = tf.shape(cost_y)[1]

        emb_y = tf.nn.embedding_lookup(self.embed_matrix, tf.reshape(cost_y, [-1]))
        emb_y = tf.reshape(emb_y, [timesteps, batch_size, self.word_dim])
        # Shift the embeddings by one step: step t is fed the symbol of step
        # t - 1, and the first step is fed an all-zero embedding.
        emb_pad = tf.fill((1, batch_size, self.word_dim), 0.0)
        emb_shift = tf.concat(
            [emb_pad,
             tf.strided_slice(emb_y, [0, 0, 0], [-1, batch_size, self.word_dim], [1, 1, 1])],
            axis=0)
        new_emb_y = emb_shift

        #### step: 2 calculation of h_0 from the masked mean annotation ####
        anno_mean = tf.reduce_sum(cost_annotation * a_m[:, :, :, None], axis=[1, 2]) / tf.reduce_sum(a_m, axis=[1, 2])[:, None]
        h_0 = tf.tensordot(anno_mean, self.Wa2h, axes=1) + self.ba2h  # [batch, hidden_dim]
        h_0 = tf.tanh(h_0)

        #### step: 3 calculation of h_t and c_t at all time steps ####
        ret = self.decoder.get_ht_ctx(new_emb_y, h_0, cost_annotation, a_m, y_m)
        h_t = ret[0]                      # [timesteps, batch, hidden_dim]
        c_t = ret[1]                      # [timesteps, batch, context_dim]
        alpha = ret[2]                    # [timesteps, batch, h, w]

        #### step: 4 calculation of cost using h_t, c_t and y_t_1 ####
        y_t_1 = new_emb_y                 # shifted y | [1:] = [:-1]
        logit_gru = tf.tensordot(h_t, self.Wh, axes=1)
        logit_ctx = tf.tensordot(c_t, self.Wc, axes=1)
        logit_pre = tf.tensordot(y_t_1, self.Wy, axes=1)
        # Only bh is used as a bias of the summed projection (bc and by are not).
        logit = logit_pre + logit_ctx + logit_gru + self.bh

        #### max pooling over pairs of features (maxout): word_dim -> word_dim // 2 ####
        shape = tf.shape(logit)
        logit = tf.reshape(logit, [shape[0], -1, shape[2]//2, 2])
        logit = tf.reduce_max(logit, axis=3)
        logit = self.dropout(logit, training=self.training)

        logit = tf.tensordot(logit, self.Wo, axes=1) + self.bo
        logit_shape = tf.shape(logit)
        logit = tf.reshape(logit, [-1, logit_shape[2]])
        cost = tf.nn.softmax_cross_entropy_with_logits(
            logits=logit,
            labels=tf.one_hot(tf.reshape(cost_y, [-1]), depth=self.target_dim))

        # Zero the cost of masked positions, sum over time, average over batch.
        cost = tf.multiply(cost, tf.reshape(y_m, [-1]))
        cost = tf.reshape(cost, [shape[0], shape[1]])
        cost = tf.reduce_sum(cost, axis=0)
        cost = tf.reduce_mean(cost)

        #### alpha  L1 regularization ####
        alpha_sum = tf.reduce_mean(tf.reduce_sum(tf.reduce_sum(tf.abs(alpha), axis=[2, 3]), axis=0))
        cost = tf.cond(tf.cast(alpha_reg > 0, tf.bool), lambda: cost + (alpha_reg * alpha_sum), lambda: cost)

        return cost

    def get_word(self, sample_y, sample_h_pre, alpha_past_pre, sample_annotation,training_mode):
        """Perform one decoding step.

        Args:
            sample_y: ids of the previously emitted symbols | [batch]; a
                negative first entry marks the first step and selects an
                all-zero embedding of shape [1, word_dim].
            sample_h_pre: previous decoder state | [batch, hidden_dim]
            alpha_past_pre: accumulated attention maps | [batch, h, w]
            sample_annotation: encoder annotations | [batch, h, w, context_dim]
            training_mode: forwarded as ``training`` to the dropout layer.

        Returns:
            ``(next_probs, next_word, h_t, alpha_past_t, alpha)``: softmax
            probabilities, one sampled symbol id per row, the new decoder
            state, the updated accumulated attention and this step's attention
            map.
        """
        emb = tf.cond(pred=sample_y[0] < 0,
            true_fn=lambda: tf.fill((1, self.word_dim), 0.0),
            false_fn=lambda: tf.nn.embedding_lookup(params=self.embed_matrix, ids=sample_y)
            )

        # Reuse the training-time recurrence instead of a hand-copied variant.
        # The previous inline copy added b_rh_nl *before* gating with r2 while
        # Decoder.one_time_step (used by get_cost) adds it *after*, so the
        # inference step did not match the function the weights were trained
        # with.  No masks are used at inference time, hence the None entries.
        h_t, c_t, alphacc, alpha_past_t, _, _ = self.decoder.one_time_step(
            (sample_h_pre, None, None, alpha_past_pre, sample_annotation, None),
            (emb, None))

        y_t_1 = emb
        logit_gru = tf.tensordot(h_t, self.Wh, axes=1)
        logit_ctx = tf.tensordot(c_t, self.Wc, axes=1)
        logit_pre = tf.tensordot(y_t_1, self.Wy, axes=1)
        logit = logit_pre + logit_ctx + logit_gru + self.bh   # [batch, word_dim]

        # Maxout over pairs of features, as in get_cost.
        shape = tf.shape(input=logit)
        logit = tf.reshape(logit, [-1, shape[1]//2, 2])
        logit = tf.reduce_max(input_tensor=logit, axis=2)

        logit = self.dropout(logit, training=training_mode)

        logit = tf.tensordot(logit, self.Wo, axes=1) + self.bo

        next_probs = tf.nn.softmax(logits=logit)
        # tf.random.categorical expects unnormalised log-probabilities, so it
        # must be given the logits rather than the softmax output.
        next_word = tf.reduce_max(
            input_tensor=tf.random.categorical(logits=logit, num_samples=1), axis=1)
        return next_probs, next_word, h_t, alpha_past_t, alphacc



class CALText_Model(tf.keras.Model):
    """Keras model wiring DenseEncoder, ContextualAttention, Decoder and CALText together."""

    def __init__(self,training):
        """Build all sub-modules.

        Args:
            training: forwarded as ``istraining`` to the encoder and to CALText.
        """
        super(CALText_Model, self).__init__()
        #### encoder setup parameters ####
        self.dense_blocks=3
        self.levels_count=16
        self.growth=24

        #### decoder setup parameters ####
        self.hidden_dim=256
        self.word_dim=256
        self.dim_attend=512
        self.dense_encoder = DenseEncoder(blocks=self.dense_blocks,level=self.levels_count, growth_rate=self.growth, istraining=training)
        # Channel count of the encoder output; 684 for the configuration above.
        context_dim = self.dense_encoder.dense_channels
        self.contextual_attention = ContextualAttention(context_dim, self.hidden_dim, self.dim_attend)
        self.decoder = Decoder(self.hidden_dim, self.word_dim, self.contextual_attention, context_dim)
        self.caltext = CALText(self.dense_encoder, self.contextual_attention, self.decoder, self.hidden_dim, self.word_dim, context_dim ,num_classes ,istraining=training)

    def call(self, x, x_mask, y=None, y_mask=None, training=True):
        """Encode ``x`` and, when labels are given, also compute the training cost.

        Args:
            x, x_mask: image batch and its mask, passed to the encoder.
            y, y_mask: label ids and their mask; when ``y`` is None only the
                annotations are returned.
            training: accepted for Keras compatibility; not read by this method.

        Returns:
            The annotations when ``y`` is None, otherwise ``(cost, annotations)``.
        """
        annotation, anno_mask = self.dense_encoder.dense_net(x, x_mask)
        # Identity test: `y == None` is element-wise for array-valued labels.
        if y is None:
            return annotation
        cost = self.caltext.get_cost(annotation, y, anno_mask, y_mask, gamma_val)
        return cost,annotation

    def get_hidden_state_0(self, anno):
        """Return the initial decoder state from the unmasked spatial mean of ``anno``."""
        hidden_state_0 = tf.tanh(tf.tensordot(tf.reduce_mean(input_tensor=anno, axis=[1, 2]), self.caltext.Wa2h, axes=1) + self.caltext.ba2h)  # [batch, hidden_dim]
        return hidden_state_0

##########apply L2 regularization on weights
def get_loss(loss,model):
    """Add an L2 penalty on the model's trainable weights to ``loss``.

    Every trainable weight contributes ``lambda_val * sum(w ** 2)`` except
    weights that live under a scope whose name starts with ``conv2d``.

    The scope test inspects every ``/``-separated component of the variable
    name.  Testing only the last component (e.g. ``kernel:0``) can never match
    ``conv2d``, which made the exclusion a no-op.

    NOTE(review): the intent to exclude convolution kernels is inferred from
    the ``conv2d`` prefix test -- confirm before retraining.
    """
    for weight in model.trainable_weights:
        scopes = weight.name.split("/")
        if any(scope.startswith('conv2d') for scope in scopes):
            continue
        loss += lambda_val * tf.reduce_sum(input_tensor=tf.pow(weight, 2))
    return loss

###################

# NOTE(review): newer TensorFlow releases appear to name this option
# `reduce_retracing` -- confirm against the pinned TensorFlow version.
@tf.function(experimental_relax_shapes=True)
def get_word(next_w, next_state, next_alpha_past, ctx, model,training):
   """Run one decoding step of ``model.caltext`` as a compiled tf.function.

   All arguments are forwarded unchanged to ``model.caltext.get_word`` and its
   return value is returned as is.
   """
   return model.caltext.get_word(next_w, next_state, next_alpha_past, ctx,training)

def get_sample(ctx0, h_0, k , maxlen, stochastic, training, model):
    """Decode one image either by sampling or by beam search.

    Args:
        ctx0: encoder annotations of a single image | [1, h, w, channels]
        h_0: initial decoder state.
        k: beam width (used only when ``stochastic`` is false).
        maxlen: maximum number of decoding steps.
        stochastic: if true, follow the symbol sampled by the model at every
            step; otherwise run a beam search over costs (negative log
            probabilities).
        training: forwarded to ``get_word`` (dropout mode).
        model: model whose ``caltext.get_word`` performs a decoding step.

    Returns:
        ``(sample, sample_score, sample_att)``.
        With beam search, ``sample`` is a list of hypotheses (lists of symbol
        ids) and ``sample_score`` the list of their accumulated costs; finished
        hypotheses come first, followed by those still alive when decoding
        stopped.
        With ``stochastic``, ``sample`` is the list of sampled ids and
        ``sample_score`` a float, the sum of their probabilities.
        ``sample_att`` holds, per decoding step, the attention map of the first
        live hypothesis at that step.
    """
    sample = []
    # Sampling accumulates a single scalar; beam search collects one score per
    # hypothesis.  (Accumulating into a list in the sampling branch raised a
    # TypeError.)
    sample_score = 0.0 if stochastic else []
    sample_att = []
    live_k = 1
    dead_k = 0

    hyp_samples = [[]]
    hyp_scores = np.zeros(live_k).astype('float32')

    next_alpha_past = np.zeros((ctx0.shape[0], ctx0.shape[1], ctx0.shape[2])).astype('float32')
    # A negative id tells get_word to use an all-zero embedding for the first step.
    next_w = -1 * np.ones((1,)).astype('int64')
    next_state = h_0

    for _ in range(maxlen):
        # One copy of the annotations per live hypothesis.
        ctx = np.tile(ctx0, [live_k, 1, 1, 1])
        next_p, next_w, next_state, next_alpha_past, step_alpha = get_word(
            next_w, next_state, next_alpha_past, ctx, model, training)
        sample_att.append(step_alpha[0,:,:])

        if stochastic:
            nw = int(next_w[0])
            sample.append(nw)
            sample_score += float(next_p[0, nw])
            if nw == 0:   # <eol>
                break
        else:
            # Cost of extending every live hypothesis with every symbol.
            cand_scores = hyp_scores[:, None] - np.log(next_p)
            cand_flat = cand_scores.flatten()
            ranks_flat = cand_flat.argsort()[:(k-dead_k)]
            voc_size = next_p.shape[1]

            assert voc_size==num_classes

            trans_indices = ranks_flat // voc_size    # which hypothesis is extended
            word_indices = ranks_flat % voc_size      # with which symbol
            costs = cand_flat[ranks_flat]
            new_hyp_samples = []
            new_hyp_scores = np.zeros(k-dead_k).astype('float32')
            new_hyp_states = []
            new_hyp_alpha_past = []

            for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)):
                new_hyp_samples.append(hyp_samples[ti]+[wi])
                new_hyp_scores[idx] = copy.copy(costs[idx])
                new_hyp_states.append(copy.copy(next_state[ti]))
                new_hyp_alpha_past.append(copy.copy(next_alpha_past[ti]))

            # Split the candidates into finished and still-live hypotheses.
            new_live_k = 0
            hyp_samples = []
            hyp_scores = []
            hyp_states = []
            hyp_alpha_past = []

            for idx in range(len(new_hyp_samples)):
                if new_hyp_samples[idx][-1] == 0: # <eol>
                    sample.append(new_hyp_samples[idx])
                    sample_score.append(new_hyp_scores[idx])
                    dead_k += 1
                else:
                    new_live_k += 1
                    hyp_samples.append(new_hyp_samples[idx])
                    hyp_scores.append(new_hyp_scores[idx])
                    hyp_states.append(new_hyp_states[idx])
                    hyp_alpha_past.append(new_hyp_alpha_past[idx])
            hyp_scores = np.array(hyp_scores)
            live_k = new_live_k

            if new_live_k < 1:
                break
            if dead_k >= k:
                break

            next_w = np.array([w1[-1] for w1 in hyp_samples])
            next_state = np.array(hyp_states)
            next_alpha_past = np.array(hyp_alpha_past)

    if not stochastic:
        # dump every remaining one
        if live_k > 0:
            for idx in range(live_k):
                sample.append(hyp_samples[idx])
                sample_score.append(hyp_scores[idx])

    return sample, sample_score,sample_att


#######################Predict
# NOTE(review): newer TensorFlow releases appear to name this option
# `reduce_retracing` -- confirm against the pinned TensorFlow version.
@tf.function(experimental_relax_shapes=True)
def execute_model(xx,xx_mask,CALTEXT):
  """Encode one input and derive the initial decoder state.

  Calls ``CALTEXT`` without labels and with ``training=False`` to obtain the
  annotations, then ``CALTEXT.get_hidden_state_0`` on them.

  Returns:
    ``(anno, hidden_state_0)``.
  """

  anno = CALTEXT(xx,xx_mask, training=False)
  hidden_state_0 = CALTEXT.get_hidden_state_0(anno)
  return anno,hidden_state_0


def predict(CALTEXT, images, x_mask):
    """Recognise the text in ``images`` and render an attention animation.

    For every image the encoder is run, a beam search (width 10, at most 130
    steps) produces candidate sequences, and the candidate with the lowest
    length-normalised cost is decoded through the vocabulary loaded from
    ``vocabulary.pkl``.  While the output string is built, one frame per symbol
    is rendered to ``res.png`` (text so far on top, input image with the
    attention map overlaid below) and the frames are written to ``vis.gif``.

    Args:
        CALTEXT: the recognition model.
        images: indexable collection of input images.
        x_mask: masks matching ``images`` index by index.

    Returns:
        ``(outstr, gif_image)``: the decoded string of the last image (empty
        when ``images`` is empty) and the GIF file name ``"vis.gif"``.  The GIF
        is only (re)written for an image that produced at least one frame.
    """
    outstr = u''
    gif_image = "vis.gif"

    # Loop-invariant resources: load them once, not once per image.
    font = ImageFont.truetype("Jameel Noori Nastaleeq.ttf",60)
    worddicts_r = data.load_dict_picklefile("vocabulary.pkl")

    for img_ind in range(len(images)):
        xx = images[img_ind][tf.newaxis, ... ]
        xx_mask = x_mask[img_ind][tf.newaxis, ... ]
        anno, hidden_state_0 = execute_model(xx, xx_mask, CALTEXT)

        sample, score, hypalpha = get_sample(anno, hidden_state_0, 10, 130, False, False, CALTEXT)

        # Normalise each hypothesis cost by its length and keep the cheapest.
        score = np.asarray(score) / np.array([len(s) for s in sample])
        ss = sample[score.argmin()]

        ####   output string
        outstr = u''
        frames = []
        # The last token is skipped (the end-of-line marker of a finished
        # hypothesis); the remaining tokens are emitted last to first.
        for k in range(len(ss) - 2, -1, -1):
            outstr = outstr + worddicts_r[int(ss[k])]
            textimg = Image.new('RGB', (1400,100),(255,255,255))
            drawtext = ImageDraw.Draw(textimg)
            drawtext.text((20, 20), outstr ,(0,0,0),font=font)

            fig, axes = plt.subplots(2,1)
            axes[0].imshow(textimg)
            axes[0].axis('off')
            axes[1].axis('off')
            axes[1].imshow(xx[0,:,:],cmap='gray')
            visualization = resize(hypalpha[k], (100,800),anti_aliasing=True)
            axes[1].imshow(255-(255 * visualization), alpha=0.2)
            plt.axis('off')

            plt.savefig('res.png')
            # Release the figure; otherwise one figure per symbol stays alive.
            plt.close(fig)
            # Read the frame back with Pillow so the channel order is RGB
            # (cv2.imread yields BGR, which was then mislabelled as RGB).
            with Image.open('res.png') as rendered:
                frames.append(rendered.convert('RGB'))

        if frames:
            # append_images must not repeat the first frame.
            frames[0].save("vis.gif", format="GIF", append_images=frames[1:],
                           save_all=True, duration=300, loop=0)

    return outstr,gif_image