"""Image-captioning demo: ResNet152 image features feed an LSTM decoder;
served through a Gradio web interface."""

import pickle

import cv2
import numpy as np
import gradio as gr
from tensorflow.keras.applications import ResNet152
from tensorflow.keras.applications.resnet import preprocess_input
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences


# ResNet152 backbone (ImageNet weights, no classifier head); global average
# pooling yields a single 2048-d feature vector per image.
ResNet152Model = ResNet152(include_top=False, weights='imagenet',
                           input_shape=(224, 224, 3), pooling='avg')
with open("pickle_files/words_dict_nepali_sc_40.pkl", "rb") as f:
    words_dict = pickle.load(f)
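# words_dict maps token -> integer index (built during training); the special
# tokens 'startofseq' and 'endofseq' delimit captions during decoding.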


# Values from the 40-epoch run on the full dataset (an earlier run used a
# vocabulary of 5611 words and MAX_LEN = 192).
vocab_size = 6039
MAX_LEN = 211

inv_dict = {v: k for k, v in words_dict.items()}  # index -> word, for decoding


# Image-feature branch: 2048-d ResNet vector -> 256-d representation.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Language-sequence branch: partial caption -> LSTM state.
# (Note: the embedding dimension happens to equal MAX_LEN here.)
inputs2 = Input(shape=(MAX_LEN,))
se1 = Embedding(vocab_size, MAX_LEN, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# Decoder: merge both branches and predict the next word.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# Tie it together: [image features, sequence so far] -> next word.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.load_weights("LSTM/resnet152_lstm_model_weights_40epoch.h5")
print("LSTM model loaded successfully")

def after(image):
    # Gradio delivers the uploaded image as an RGB numpy array.
    if image is None:
        return "Error: empty image received."
    img_array = np.array(image)

    # NOTE: the input is already RGB, so this swap produces BGR channel order,
    # presumably matching the preprocessing used when the model was trained.
    img = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = np.expand_dims(img, axis=0)  # add batch dimension
    img = preprocess_input(img)
    features = ResNet152Model.predict(img).reshape(1, 2048)

    # Greedy decoding: repeatedly feed the caption generated so far back into
    # the model and append the most probable next word, stopping at 'endofseq'
    # or after MAX_LEN words.
    text_inp = ['startofseq']
    caption = ''
    for _ in range(MAX_LEN):
        # Words to indices; unknown words fall back to the last vocabulary index.
        encoded = [words_dict.get(word, len(words_dict) - 1) for word in text_inp]
        encoded = pad_sequences([encoded], padding='post', truncating='post',
                                maxlen=MAX_LEN)[0]

        data_list = [features, encoded.reshape(1, -1)]
        prediction = np.argmax(model.predict(data_list))
        sampled_word = inv_dict[prediction]
        caption = caption + ' ' + sampled_word

        if sampled_word == 'endofseq':
            break
        text_inp.append(sampled_word)

    caption = caption.replace('endofseq', '').replace(' .', '.')
    print(caption)
    return caption
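
# Offline sanity check (hypothetical image path; cv2.imread returns BGR, while
# after() expects the RGB array Gradio would supply):
#   bgr = cv2.imread("sample.jpg")
#   print(after(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)))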



# Minimal web UI: upload an image, get the generated caption back as text.
iface = gr.Interface(fn=after, inputs="image", outputs="text")
iface.launch()
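# iface.launch() starts a local server; iface.launch(share=True) would also
# create a temporary public link for the demo.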