"""Image-captioning demo: ResNet152 image features feed an LSTM decoder;
served through a Gradio web interface."""

import pickle

import cv2
import numpy as np
import gradio as gr
from tensorflow.keras.applications import ResNet152
from tensorflow.keras.applications.resnet import preprocess_input
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences


# ResNet152 backbone (ImageNet weights, no classifier head); global average
# pooling yields a single 2048-d feature vector per image.
ResNet152Model = ResNet152(include_top=False, weights='imagenet',
                           input_shape=(224, 224, 3), pooling='avg')
with open("pickle_files/words_dict_nepali_sc_40.pkl", "rb") as f:
    words_dict = pickle.load(f)
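# words_dict maps token -> integer index (built during training); the special
# tokens 'startofseq' and 'endofseq' delimit captions during decoding.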


# Values from the 40-epoch run on the full dataset (an earlier run used a
# vocabulary of 5611 words and MAX_LEN = 192).
vocab_size = 6039
MAX_LEN = 211

inv_dict = {v: k for k, v in words_dict.items()}  # index -> word, for decoding


# Image-feature branch: 2048-d ResNet vector -> 256-d representation.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Language-sequence branch: partial caption -> LSTM state.
# (Note: the embedding dimension happens to equal MAX_LEN here.)
inputs2 = Input(shape=(MAX_LEN,))
se1 = Embedding(vocab_size, MAX_LEN, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# Decoder: merge both branches and predict the next word.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# Tie it together: [image features, sequence so far] -> next word.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.load_weights("LSTM/resnet152_lstm_model_weights_40epoch.h5")
print("LSTM model loaded successfully")

def after(image):
    # Gradio delivers the uploaded image as an RGB numpy array.
    if image is None:
        return "Error: empty image received."
    img_array = np.array(image)

    # NOTE: the input is already RGB, so this swap produces BGR channel order,
    # presumably matching the preprocessing used when the model was trained.
    img = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = np.expand_dims(img, axis=0)  # add batch dimension
    img = preprocess_input(img)
    features = ResNet152Model.predict(img).reshape(1, 2048)

    # Greedy decoding: repeatedly feed the caption generated so far back into
    # the model and append the most probable next word, stopping at 'endofseq'
    # or after MAX_LEN words.
    text_inp = ['startofseq']
    caption = ''
    for _ in range(MAX_LEN):
        # Words to indices; unknown words fall back to the last vocabulary index.
        encoded = [words_dict.get(word, len(words_dict) - 1) for word in text_inp]
        encoded = pad_sequences([encoded], padding='post', truncating='post',
                                maxlen=MAX_LEN)[0]

        data_list = [features, encoded.reshape(1, -1)]
        prediction = np.argmax(model.predict(data_list))
        sampled_word = inv_dict[prediction]
        caption = caption + ' ' + sampled_word

        if sampled_word == 'endofseq':
            break
        text_inp.append(sampled_word)

    caption = caption.replace('endofseq', '').replace(' .', '.')
    print(caption)
    return caption
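
# Offline sanity check (hypothetical image path; cv2.imread returns BGR, while
# after() expects the RGB array Gradio would supply):
#   bgr = cv2.imread("sample.jpg")
#   print(after(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)))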



# Minimal web UI: upload an image, get the generated caption back as text.
iface = gr.Interface(fn=after, inputs="image", outputs="text")
iface.launch()
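# iface.launch() starts a local server; iface.launch(share=True) would also
# create a temporary public link for the demo.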