# chhetri123's picture
# Update app.py
# 351c492 verified
# raw history blame
# No virus
# 3.28 kB
import cv2
import numpy as np
from keras.applications import ResNet152
from keras.optimizers import Adam
from keras.models import Sequential, Model,load_model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import add
from keras.utils import to_categorical
import gradio as gr
from keras.preprocessing import image, sequence
from keras_preprocessing.sequence import pad_sequences
from tqdm import tqdm
import pickle
import tensorflow as tf
from tensorflow.keras.applications.resnet import preprocess_input
# ImageNet-pretrained ResNet152 without its classification head; with
# average pooling it emits one pooled feature vector per 224x224x3 image.
ResNet152Model = ResNet152(
    weights='imagenet',
    include_top=False,
    pooling='avg',
    input_shape=(224, 224, 3),
)

# Vocabulary mapping loaded from disk.
# NOTE(review): pickle.load can execute arbitrary code, so this file must
# only ever come from a trusted source.
with open("pickle_files/words_dict_nepali_sc_40.pkl", "rb") as vocab_file:
    words_dict = pickle.load(vocab_file)

# Settings for the "40 epoch, all data" checkpoint. The values previously
# noted here were vocab_size = 5611 and MAX_LEN = 192.
vocab_size = 6039
MAX_LEN = 211

# Reverse lookup of words_dict (value -> key).
inv_dict = dict(zip(words_dict.values(), words_dict.keys()))
# Image branch: 2048 image features -> dropout -> 256-unit ReLU projection.
image_input = Input(shape=(2048,))
image_dropped = Dropout(0.5)(image_input)
image_projection = Dense(256, activation='relu')(image_dropped)

# Language branch: padded word-index sequence -> embedding -> LSTM state.
# The embedding width is MAX_LEN, exactly as in the saved architecture.
caption_input = Input(shape=(MAX_LEN,))
caption_embedding = Embedding(vocab_size, MAX_LEN, mask_zero=True)(caption_input)
caption_dropped = Dropout(0.4)(caption_embedding)
caption_state = LSTM(256)(caption_dropped)

# Decoder: sum both 256-wide branches, then score every vocabulary entry.
merged = add([image_projection, caption_state])
decoder_hidden = Dense(256, activation='relu')(merged)
next_word_scores = Dense(vocab_size, activation='softmax')(decoder_hidden)

# Assemble the [image features, partial caption] -> next-word model.
model = Model(
    inputs=[image_input, caption_input],
    outputs=next_word_scores,
)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.load_weights("LSTM/resnet152_lstm_model_weights_40epoch.h5")

print("LSTM model loaded successfully")
def after(image):
    """Generate a caption for an image with the ResNet152 + LSTM captioner.

    The image is resized to 224x224, passed through ``preprocess_input`` and
    ``ResNet152Model`` to obtain a (1, 2048) feature array, and then decoded
    greedily: starting from ``'startofseq'``, the highest-scoring next word
    from ``model`` is appended until ``'endofseq'`` is predicted or
    ``MAX_LEN`` words have been generated.

    Args:
        image: The picture delivered by the Gradio "image" input, or ``None``
            when nothing was submitted.

    Returns:
        The generated caption with every ``' .'`` collapsed to ``'.'`` and
        without the end-of-sequence token, or the string
        ``"Error: Empty image received."`` when no image data is available.
    """
    # np.array(None) yields a 0-d object array, never None, so the emptiness
    # check must look at the raw input before it is converted.
    if image is None:
        return "Error: Empty image received."
    img_array = np.array(image)
    if img_array.size == 0:
        return "Error: Empty image received."

    # NOTE(review): Gradio normally hands over RGB arrays, in which case this
    # conversion actually produces BGR. Confirm it matches how the training
    # features were extracted before changing it.
    img = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = np.expand_dims(img, axis=0)  # add batch dimension
    img = preprocess_input(img)
    image_features = ResNet152Model.predict(img).reshape(1, 2048)

    # Words missing from the vocabulary fall back to len(words_dict) - 1.
    fallback_index = len(words_dict) - 1
    text_inp = ['startofseq']
    caption_words = []
    for _ in range(MAX_LEN):
        encoded = [words_dict.get(word, fallback_index) for word in text_inp]
        encoded = pad_sequences(
            [encoded], padding='post', truncating='post', maxlen=MAX_LEN
        )[0]
        data_list = [image_features, encoded.reshape(1, -1)]
        # One forward pass per generated word (greedy arg-max decoding).
        prediction = np.argmax(model.predict(data_list))
        sampled_word = inv_dict[prediction]
        if sampled_word == 'endofseq':
            break
        caption_words.append(sampled_word)
        text_inp.append(sampled_word)

    # Joining the collected words avoids the stray leading/trailing space
    # and never emits the end token, so no post-hoc scrubbing is needed.
    caption = ' '.join(caption_words).replace(' .', '.')
    print(caption)
    return caption
# Gradio UI: a single image input whose caption, produced by `after`,
# is shown as text.
iface = gr.Interface(
    fn=after,
    inputs="image",
    outputs="text",
)
iface.launch()