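"""Gradio demo: Nepali image captioning with a ResNet152 encoder and an
LSTM decoder (weights from a 40-epoch run on the full dataset)."""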
import pickle

import cv2
import numpy as np
import gradio as gr
from tensorflow.keras.applications import ResNet152
from tensorflow.keras.applications.resnet import preprocess_input
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
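# Convolutional encoder: ResNet152 with global average pooling yields a
# 2048-dim feature vector per image.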
ResNet152Model = ResNet152(include_top=False, weights='imagenet',
                           input_shape=(224, 224, 3), pooling='avg')
# Word-to-index vocabulary learned during training.
with open("pickle_files/words_dict_nepali_sc_40.pkl", "rb") as f:
    words_dict = pickle.load(f)

vocab_size = 6039  # vocabulary size (40-epoch, full-data model)
MAX_LEN = 211      # maximum caption length (40-epoch, full-data model)

# Inverse mapping: predicted index -> word.
inv_dict = {v: k for k, v in words_dict.items()}
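# Caption model: a merge-style architecture that combines the image feature
# branch with an LSTM over the partial caption to predict the next word.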
# Image feature branch: project the 2048-dim ResNet feature into the
# 256-dim space shared with the language model.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# Language sequence branch: embed the padded token sequence and summarize it
# with an LSTM. (The embedding width is set to MAX_LEN, as required by the
# saved weights.)
inputs2 = Input(shape=(MAX_LEN,))
se1 = Embedding(vocab_size, MAX_LEN, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)
# Decoder: merge the two branches additively and predict the next word.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# Tie it together: [image features, partial caption] -> next word.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.load_weights("LSTM/resnet152_lstm_model_weights_40epoch.h5")
print("LSTM model loaded successfully")
def after(image):
    # Guard against an empty input from the Gradio widget (np.array(image)
    # is never None, so check the input itself).
    if image is None:
        return "Error: Empty image received."
    img_array = np.array(image)
    # Preprocess for ResNet152: channel swap (matching the cv2-based training
    # pipeline), resize to 224x224, add a batch dimension, normalize.
    img = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = np.expand_dims(img, axis=0)  # add batch dimension
    img = preprocess_input(img)
    # One forward pass through the encoder gives the (1, 2048) image feature.
    img_features = ResNet152Model.predict(img, verbose=0).reshape(1, 2048)
text_inp = ['startofseq']
count = 0
caption = ''
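    # Greedy decoding: at each step, feed the image features and the words
    # generated so far, then take the argmax of the softmax output.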
while count < MAX_LEN:
count += 1
        # Map words to indices; unknown words fall back to the last
        # vocabulary index.
        encoded = [words_dict.get(word, len(words_dict) - 1) for word in text_inp]
        encoded = pad_sequences([encoded], padding='post', truncating='post',
                                maxlen=MAX_LEN)[0]
        data_list = [img_features, encoded.reshape(1, -1)]
        prediction = np.argmax(model.predict(data_list, verbose=0))
sampled_word = inv_dict[prediction]
caption = caption + ' ' + sampled_word
if sampled_word == 'endofseq':
break
text_inp.append(sampled_word)
    # Strip the end token and tidy spacing before the final period.
    caption = caption.replace('endofseq', '').replace(' .', '.').strip()
    print(caption)
    return caption
iface = gr.Interface(fn=after, inputs="image", outputs="text")
iface.launch()