import pickle

import cv2
import gradio as gr
import numpy as np
from tensorflow.keras.applications import ResNet152
from tensorflow.keras.applications.resnet import preprocess_input
from tensorflow.keras.layers import Dense, Dropout, Embedding, Input, LSTM, add
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ResNet152 with average pooling produces a 2048-d feature vector per image.
ResNet152Model = ResNet152(include_top=False, weights='imagenet',
                           input_shape=(224, 224, 3), pooling='avg')

with open("pickle_files/words_dict_nepali_sc_40.pkl", "rb") as f:
    words_dict = pickle.load(f)

# Values for the 40-epoch, all-data checkpoint.
# (An earlier checkpoint used vocab_size = 5611 and MAX_LEN = 192.)
vocab_size = 6039
MAX_LEN = 211

inv_dict = {v: k for k, v in words_dict.items()}

# Image feature branch: 2048-d ResNet features -> 256-d representation.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Language sequence model. The embedding dimension is MAX_LEN only
# because that is what the trained checkpoint expects.
inputs2 = Input(shape=(MAX_LEN,))
se1 = Embedding(vocab_size, MAX_LEN, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# Decoder: merge both branches and predict the next word.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# Tie it together: [image features, partial sequence] -> next word.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.load_weights("LSTM/resnet152_lstm_model_weights_40epoch.h5")


def after(input_image):
    # Gradio's "image" input delivers an RGB numpy array, or None if nothing was uploaded.
    if input_image is None:
        return "Error: Empty image received."

    # Preprocess for ResNet152. The array is already RGB, which is what
    # preprocess_input expects, so no BGR->RGB conversion is needed.
    img = cv2.resize(np.array(input_image), (224, 224))
    img = np.expand_dims(img, axis=0)  # add batch dimension
    img = preprocess_input(img)
    img_features = ResNet152Model.predict(img, verbose=0).reshape(1, 2048)

    # Greedy decoding: repeatedly feed the caption generated so far back into
    # the model and append the most probable next word until 'endofseq' or MAX_LEN.
    text_inp = ['startofseq']
    caption = ''
    for _ in range(MAX_LEN):
        # Map words to indices, falling back to the last index for out-of-vocabulary words.
        encoded = [words_dict.get(word, len(words_dict) - 1) for word in text_inp]
        encoded = pad_sequences([encoded], padding='post', truncating='post', maxlen=MAX_LEN)
        prediction = np.argmax(model.predict([img_features, encoded], verbose=0))
        sampled_word = inv_dict[prediction]
        caption = caption + ' ' + sampled_word
        if sampled_word == 'endofseq':
            break
        text_inp.append(sampled_word)

    caption = caption.replace('endofseq', '').replace(' .', '.').strip()
    print(caption)
    return caption


iface = gr.Interface(fn=after, inputs="image", outputs="text")

if __name__ == "__main__":
    iface.launch()
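
# A minimal smoke test that bypasses the Gradio UI; the filename "sample.jpg"
# is hypothetical -- uncomment and point it at a real image to try the pipeline.
#
# from PIL import Image
# print(after(np.array(Image.open("sample.jpg").convert("RGB"))))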