import pickle

import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB7
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, model_from_json
from tensorflow.keras.optimizers import Adam
from PIL import Image

# Load EfficientNetB7 as the image feature extractor
pre_trained = EfficientNetB7(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the base model
pre_trained.trainable = False
x = tf.keras.layers.GlobalAveragePooling2D()(pre_trained.output)

# Restructure the model so it outputs a single pooled feature vector per image
pre_trained_model = Model(inputs=pre_trained.input, outputs=x)

###########################################################################################################

# model = tf.keras.models.load_model("image_captioning_30k_model.h5")

# Load the model architecture
with open("30k_model_architecture.json", "r") as json_file:
    loaded_model_json = json_file.read()

# Create the optimizer without specifying the learning rate
optimizer = Adam()
# Set the learning rate separately
optimizer.learning_rate.assign(0.001)

# Rebuild the model from JSON and load the trained weights
model = model_from_json(loaded_model_json)
model.load_weights("30k_model_weights.h5")

# Compile the restored model (note: this creates fresh optimizer state;
# it does not restore the optimizer state from training)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

###########################################################################################################

# Load the fitted tokenizer
with open("Image_Captioner_tokenizer_30k.pkl", "rb") as f:
    tokenizer = pickle.load(f)


def idx_to_word(integer, tokenizer):
    # Linear scan over the vocabulary; returns None for out-of-vocabulary indices
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None


# Generate a caption for an image, one word at a time
def predict_caption(model, image, tokenizer, max_length):
    # Add start tag for the generation process
    in_text = 'startseq'
    # Iterate up to the maximum sequence length
    for i in range(max_length):
        # Encode the input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # Pad the sequence
        sequence = pad_sequences([sequence], maxlen=max_length)
        # Predict the next word
        yhat = model.predict([image, sequence], verbose=0)
        # Take the index with the highest probability (greedy decoding)
        yhat = np.argmax(yhat)
        # Convert the index back to a word
        word = idx_to_word(yhat, tokenizer)
        # Stop if the word is not found
        if word is None:
            break
        # Append the word so it becomes input for the next step
        in_text += " " + word
        # Stop if we reach the end tag
        if word == 'endseq':
            break
    # Strip the start tag and, if present, the end tag
    tokens = in_text.split()[1:]
    if tokens and tokens[-1] == 'endseq':
        tokens = tokens[:-1]
    return ' '.join(tokens)


def google_image_testing(inp):
    # Save the input array as a JPEG file
    input_image = Image.fromarray(inp)
    input_image.save("input_image.jpg")
    # Load the input image at the size EfficientNetB7 expects
    image_path = 'input_image.jpg'
    image = load_img(image_path, target_size=(224, 224))
    # Convert image pixels to a numpy array
    image = img_to_array(image)
    # Reshape data for the model: (batch, height, width, channels)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # Preprocess the image for EfficientNet
    image = preprocess_input(image)
    # Extract features
    img_feature = pre_trained_model.predict(image, verbose=0)
    # Predict the caption
    predicted = predict_caption(model, img_feature, tokenizer, max_length=74)
    return predicted


demo = gr.Interface(fn=google_image_testing, inputs='image', outputs='text', title='Image Captioner')
demo.launch(debug=True, share=True)
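
# --- Optional local smoke test (a minimal sketch; "sample.jpg" is a hypothetical path) ---
# Uncomment to caption a local image file directly, without going through the Gradio UI.
# Note that demo.launch() above blocks, so run this with the launch call commented out.
#
# test_img = np.array(Image.open("sample.jpg").convert("RGB"))
# print(google_image_testing(test_img))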