# ImageCaptioner / app.py
import gradio as gr
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB7
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.models import model_from_json
from tensorflow.keras.optimizers import Adam
from PIL import Image
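# Pipeline overview: a frozen EfficientNetB7 backbone turns the uploaded
# image into a feature vector, and a separately trained decoder (loaded
# below from a JSON architecture plus H5 weights) generates the caption
# one word at a time from those features.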
# load the EfficientNetB7 backbone (ImageNet weights, no classification head)
pre_trained = EfficientNetB7(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
# Freeze the base model
pre_trained.trainable = False
x = tf.keras.layers.GlobalAveragePooling2D()(pre_trained.output)
# restructure the model
pre_trained_model = Model(inputs=pre_trained.input, outputs=x)
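# With include_top=False and global average pooling, EfficientNetB7 yields
# a 2560-dimensional feature vector per image; the decoder loaded below is
# assumed to have been trained on features of this shape.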
###########################################################################################################
# model = tf.keras.models.load_model("image_captioning_30k_model.h5")
# Load model architecture
with open("30k_model_architecture.json", "r") as json_file:
loaded_model_json = json_file.read()
# Create the optimizer without specifying the learning rate
optimizer = Adam()
# Set the learning rate separately
optimizer.learning_rate.assign(0.001)
# Load weights
model = model_from_json(loaded_model_json)
model.load_weights("30k_model_weights.h5")
# Load optimizer state
model.compile(optimizer=optimizer, loss='categorical_crossentropy')
###########################################################################################################
# load the tokenizer fitted during training
with open("Image_Captioner_tokenizer_30k.pkl", "rb") as f:
    tokenizer = pickle.load(f)
def idx_to_word(integer, tokenizer):
    # map a predicted index back to its word in the tokenizer's vocabulary
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
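# A faster alternative (a sketch, not used above): build the reverse
# vocabulary once instead of scanning word_index on every decoding step.
# index_to_word = {index: word for word, index in tokenizer.word_index.items()}
# word = index_to_word.get(integer)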
# generate a caption for an image with greedy decoding
def predict_caption(model, image, tokenizer, max_length):
    # add the start tag to seed the generation process
    in_text = 'startseq'
    # iterate up to the maximum caption length
    for i in range(max_length):
        # encode the input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad the sequence to the decoder's expected length
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict the next word's probability distribution
        yhat = model.predict([image, sequence], verbose=0)
        # take the index with the highest probability (greedy choice)
        yhat = np.argmax(yhat)
        # convert the index back to a word
        word = idx_to_word(yhat, tokenizer)
        # stop if the index is not in the vocabulary
        if word is None:
            break
        # append the word as input for generating the next one
        in_text += " " + word
        # stop when the end tag is reached
        if word == 'endseq':
            break
    # strip the start tag, and the end tag if it was generated
    words = in_text.split()[1:]
    if words and words[-1] == 'endseq':
        words = words[:-1]
    return ' '.join(words)
def google_image_testing(inp):
    # Gradio delivers the upload as a numpy array; save it as a jpg file
    input_image = Image.fromarray(inp)
    input_image.save("input_image.jpg")
    # load the input image at the size the backbone expects
    image_path = 'input_image.jpg'
    image = load_img(image_path, target_size=(224, 224))
    # convert image pixels to a numpy array
    image = img_to_array(image)
    # add a batch dimension for the model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # apply EfficientNet preprocessing
    image = preprocess_input(image)
    # extract features with the frozen backbone
    img_feature = pre_trained_model.predict(image, verbose=0)
    # predict the caption (max_length must match the value used in training)
    predicted = predict_caption(model, img_feature, tokenizer, max_length=74)
    return predicted
demo = gr.Interface(fn=google_image_testing, inputs='image', outputs='text', title='Image Captioner')
demo.launch(debug=True, share=True)
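# Note: share=True only creates a public tunnel when the app runs locally;
# on Hugging Face Spaces the app is served directly and the flag has no effect.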