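# Spaces: Sleeping
# (Apparently a hosting-page status banner captured along with the source;
# it is not part of the program.)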
import cv2 | |
import numpy as np | |
from keras.applications import ResNet152 | |
from keras.optimizers import Adam | |
from keras.models import Sequential, Model,load_model | |
from keras.layers import Input | |
from keras.layers import Dense | |
from keras.layers import LSTM | |
from keras.layers import Embedding | |
from keras.layers import Dropout | |
from keras.layers import add | |
from keras.utils import to_categorical | |
import gradio as gr | |
from keras.preprocessing import image, sequence | |
from keras_preprocessing.sequence import pad_sequences | |
from tqdm import tqdm | |
import pickle | |
import tensorflow as tf | |
from tensorflow.keras.applications.resnet import preprocess_input | |
# Image encoder: ImageNet-pretrained ResNet152 without its classification
# head. With average pooling its output is reshaped to (1, 2048) by the
# caption function below.
ResNet152Model = ResNet152(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
    pooling='avg',
)

# Word -> index vocabulary saved at training time.
# NOTE(review): pickle.load can execute arbitrary code from a tampered file;
# this presumes the pickle ships with the app and is trusted -- confirm.
with open("pickle_files/words_dict_nepali_sc_40.pkl", "rb") as f:
    words_dict = pickle.load(f)

# Values for the 40-epoch, all-data checkpoint. Earlier values that were left
# commented out in this file: vocab_size 5611, MAX_LEN 192.
vocab_size = 6039
MAX_LEN = 211

# Index -> word lookup used to turn predictions back into text.
inv_dict = {index: word for word, index in words_dict.items()}

# Image-feature branch: 2048-d feature vector -> 256-d representation.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(rate=0.5)(inputs1)
fe2 = Dense(units=256, activation='relu')(fe1)

# Language branch: padded index sequence -> embedding -> LSTM state.
# The embedding width is MAX_LEN, exactly as in the original definition; it
# has to stay that way for the saved weights to fit.
inputs2 = Input(shape=(MAX_LEN,))
se1 = Embedding(input_dim=vocab_size, output_dim=MAX_LEN, mask_zero=True)(inputs2)
se2 = Dropout(rate=0.4)(se1)
se3 = LSTM(units=256)(se2)

# Decoder: sum both branches, then predict a distribution over the vocabulary.
decoder1 = add([fe2, se3])
decoder2 = Dense(units=256, activation='relu')(decoder1)
outputs = Dense(units=vocab_size, activation='softmax')(decoder2)

# Tie it together: [image features, partial caption] -> next word.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.load_weights("LSTM/resnet152_lstm_model_weights_40epoch.h5")

print("LSTM model loaded successfully")
def after(image):
    """Generate a caption for ``image`` by greedy word-by-word decoding.

    The image is resized to 224x224, run through ``ResNet152Model`` to get a
    (1, 2048) feature vector, and the caption model is then queried
    repeatedly, each time appending the highest-scoring word, until it emits
    ``'endofseq'`` or ``MAX_LEN`` words have been produced.

    Args:
        image: The value delivered by the Gradio ``"image"`` input; it is
            converted with ``np.array``. ``None`` or an empty array is
            rejected.

    Returns:
        The caption text, or the string ``"Error: Empty image received."``
        when no usable image was supplied.
    """
    # np.array(None) is a 0-d object array, never None, so the emptiness
    # check must look at the raw input before conversion.
    if image is None:
        return "Error: Empty image received."
    img_array = np.array(image)
    if img_array.size == 0:
        return "Error: Empty image received."

    # NOTE(review): Gradio's "image" input is documented as RGB, so
    # COLOR_BGR2RGB swaps the channels here. Kept unchanged because the saved
    # weights may depend on features extracted this way -- confirm against
    # the training pipeline before removing.
    img = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = np.expand_dims(img, axis=0)  # add batch dimension
    img = preprocess_input(img)
    test_img_resized = ResNet152Model.predict(img).reshape(1, 2048)

    # Words missing from the vocabulary map to the last index.
    unknown_index = len(words_dict) - 1
    text_inp = ['startofseq']
    sampled_words = []
    for _ in range(MAX_LEN):
        encoded = [words_dict.get(word, unknown_index) for word in text_inp]
        encoded = pad_sequences(
            [encoded], padding='post', truncating='post', maxlen=MAX_LEN
        )[0]
        data_list = [test_img_resized, encoded.reshape(1, -1)]
        # One forward pass per step (the original ran the same call twice).
        prediction = int(np.argmax(model.predict(data_list)))
        sampled_word = inv_dict.get(prediction)
        if sampled_word is None:
            # The arg-max index has no vocabulary entry; stop decoding
            # instead of raising KeyError.
            break
        sampled_words.append(sampled_word)
        if sampled_word == 'endofseq':
            break
        text_inp.append(sampled_word)

    # Same text the original built by repeated concatenation: every word is
    # preceded by a single space, then the end marker is removed and the
    # space before a full stop is dropped.
    caption = ''.join(' ' + word for word in sampled_words)
    caption = caption.replace('endofseq', '').replace(' .', '.')
    print(caption)
    return caption
# Gradio UI: one image input wired to the caption function, plain-text output.
# NOTE: the interface is launched unconditionally when this module runs; the
# original kept a commented-out `app.run(debug=True)` entry point under an
# `if __name__ == "__main__":` guard, which is not used.
iface = gr.Interface(
    fn=after,
    inputs="image",
    outputs="text",
)
iface.launch()