Spaces:

chhetri123
/

image_captioning_lstm

Sleeping

App Files Files Community

image_captioning_lstm / app.py

chhetri123

Update app.py

351c492 verified 4 months ago

raw history blame

No virus

3.28 kB


	import cv2
	import numpy as np
	from keras.applications import ResNet152
	from keras.optimizers import Adam
	from keras.models import Sequential, Model,load_model
	from keras.layers import Input
	from keras.layers import Dense
	from keras.layers import LSTM
	from keras.layers import Embedding
	from keras.layers import Dropout
	from keras.layers import add
	from keras.utils import to_categorical
	import gradio as gr
	from keras.preprocessing import image, sequence
	from keras_preprocessing.sequence import pad_sequences
	from tqdm import tqdm
	import pickle
	import tensorflow as tf
	from tensorflow.keras.applications.resnet import preprocess_input


	def _build_caption_model(vocab_size, max_len):
	    """Wire up the two-input caption decoder and return it (uncompiled).

	    The graph has an image branch fed with a 2048-wide feature vector and a
	    text branch fed with ``max_len`` token indices. The two 256-wide branch
	    outputs are summed and mapped to a softmax over ``vocab_size`` words.

	    Layers are created in the same order as the saved weight file expects,
	    so do not reorder the statements below.
	    """
	    # Image branch: feature vector -> dropout -> 256-unit projection.
	    image_features = Input(shape=(2048,))
	    image_dropped = Dropout(0.5)(image_features)
	    image_projected = Dense(256, activation='relu')(image_dropped)

	    # Text branch: token indices -> embedding -> dropout -> LSTM state.
	    # The embedding output width is max_len, exactly as in the original
	    # layer definition (presumably what the stored weights were trained
	    # with -- confirm before changing). Index 0 is masked as padding.
	    token_ids = Input(shape=(max_len,))
	    embedded = Embedding(vocab_size, max_len, mask_zero=True)(token_ids)
	    embedded_dropped = Dropout(0.4)(embedded)
	    sequence_state = LSTM(256)(embedded_dropped)

	    # Decoder head: sum both branches, then predict the next word.
	    merged = add([image_projected, sequence_state])
	    hidden = Dense(256, activation='relu')(merged)
	    word_probabilities = Dense(vocab_size, activation='softmax')(hidden)

	    # Tie it together: [image features, token sequence] -> next word.
	    return Model(inputs=[image_features, token_ids], outputs=word_probabilities)


	# Headless ResNet152 with global average pooling, used as the image encoder.
	ResNet152Model = ResNet152(
	    include_top=False,
	    weights='imagenet',
	    input_shape=(224, 224, 3),
	    pooling='avg',
	)

	# Vocabulary mapping (word -> index) saved alongside the model.
	with open("pickle_files/words_dict_nepali_sc_40.pkl", "rb") as vocab_file:
	    words_dict = pickle.load(vocab_file)

	# Values the author labelled "40 epoch all data"; earlier commented-out
	# alternatives were vocab_size = 5611 and MAX_LEN = 192.
	vocab_size = 6039
	MAX_LEN = 211

	# Reverse lookup (index -> word) for decoding predictions.
	inv_dict = {index: word for word, index in words_dict.items()}

	model = _build_caption_model(vocab_size, MAX_LEN)
	model.compile(
	    loss='categorical_crossentropy',
	    optimizer='adam',
	    metrics=['accuracy'],
	)
	model.load_weights("LSTM/resnet152_lstm_model_weights_40epoch.h5")
	print("LSTM model loaded successfully")

	def after(image):
	    """Caption an image by greedy word-by-word decoding.

	    The image is resized to 224x224, encoded by ``ResNet152Model`` and the
	    resulting feature vector is fed, together with the words generated so
	    far, to ``model`` until it emits ``endofseq`` or ``MAX_LEN`` words have
	    been produced.

	    Args:
	        image: The picture supplied by the Gradio "image" input; anything
	            ``np.array`` can turn into an image array. May be ``None`` when
	            the user submits without an image.

	    Returns:
	        The generated caption (``endofseq`` removed, `` .`` collapsed to
	        ``.``), or ``"Error: Empty image received."`` when no image data
	        was supplied.
	    """
	    # np.array(None) is a 0-d object array, never None, so the emptiness
	    # check must look at the raw argument before conversion.
	    if image is None:
	        return "Error: Empty image received."

	    img_array = np.array(image)
	    if img_array.size == 0:
	        return "Error: Empty image received."

	    # NOTE(review): Gradio's image input presumably already delivers RGB
	    # arrays, in which case this swap yields BGR. Left unchanged because it
	    # has to match how the training features were extracted -- confirm.
	    img = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
	    img = cv2.resize(img, (224, 224))
	    img = np.expand_dims(img, axis=0)  # add batch dimension
	    img = preprocess_input(img)
	    image_features = ResNet152Model.predict(img).reshape(1, 2048)

	    # Words missing from the vocabulary fall back to the last index (the
	    # original author's comment describes it as the '<end>' entry).
	    unknown_index = len(words_dict) - 1

	    text_inp = ['startofseq']
	    generated = []
	    for _ in range(MAX_LEN):
	        encoded = [words_dict.get(word, unknown_index) for word in text_inp]
	        encoded = pad_sequences(
	            [encoded], padding='post', truncating='post', maxlen=MAX_LEN
	        )[0]

	        # One forward pass per step; pick the most probable next word.
	        data_list = [image_features, encoded.reshape(1, -1)]
	        prediction = np.argmax(model.predict(data_list))
	        sampled_word = inv_dict[prediction]
	        generated.append(sampled_word)

	        if sampled_word == 'endofseq':
	            break
	        text_inp.append(sampled_word)

	    # Every word is prefixed with a space, matching the original output.
	    caption = ''.join(' ' + word for word in generated)
	    caption = caption.replace('endofseq', '').replace(' .', '.')
	    print(caption)
	    return caption



	# Expose the captioning function through a Gradio UI: one image input,
	# one text output. The server is started as soon as this script runs.
	iface = gr.Interface(
	    fn=after,
	    inputs="image",
	    outputs="text",
	)
	iface.launch()