saichandrapandraju committed on
Commit
322d8bb
1 Parent(s): 68870d7

initial model

Browse files
Files changed (4) hide show
  1. app.py +97 -0
  2. best_model.h5 +3 -0
  3. requirements.txt +6 -0
  4. tokenizer.txt +0 -0
app.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import streamlit as st
3
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
4
+ from tensorflow.keras.models import load_model, Model
5
+ from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
6
+ from tensorflow.keras.preprocessing.text import tokenizer_from_json
7
+ from tensorflow.keras.preprocessing.image import load_img, img_to_array
8
+ from PIL import Image
9
+
10
+
11
@st.cache_resource
def init_lstm_model():
    """Load the Keras model stored in ``./best_model.h5``.

    The function is wrapped in ``st.cache_resource`` so the loaded model
    object is created once and shared across Streamlit script reruns.

    Returns:
        The model returned by ``load_model``.
    """
    model = load_model("./best_model.h5")
    return model
14
+
15
@st.cache_resource
def init_vgg16_model():
    """Build the VGG16-based image feature extractor.

    Instantiates VGG16 with its default arguments and returns a new model
    that shares VGG16's inputs but emits the output of its second-to-last
    layer, i.e. the network with the final layer dropped.

    Returns:
        A ``Model`` mapping VGG16 inputs to the penultimate layer's output.
    """
    base = VGG16()
    penultimate_layer = base.layers[-2]
    return Model(inputs=base.inputs, outputs=penultimate_layer.output)
19
+
20
@st.cache_resource
def init_lstm_tokenizer():
    """Rebuild the caption tokenizer from the JSON stored in ``./tokenizer.txt``.

    The function is wrapped in ``st.cache_resource`` so the file is parsed
    once and the tokenizer is shared across Streamlit script reruns.

    Returns:
        The tokenizer produced by ``tokenizer_from_json``.

    Raises:
        OSError: If ``./tokenizer.txt`` cannot be opened.
    """
    # Pin the encoding so decoding does not depend on the host's locale.
    with open("./tokenizer.txt", encoding="utf-8") as rf:
        return tokenizer_from_json(rf.read())
24
+
25
+
26
+
27
# Shared resources, created at import time. The init_* helpers are wrapped in
# st.cache_resource, so Streamlit reruns reuse the same objects.
vgg16_model = init_vgg16_model()
lstm_model = init_lstm_model()
lstm_tokenizer = init_lstm_tokenizer()

# Used both as the padded input length and as the cap on decoding steps.
max_length = 35
31
+
32
def idx_to_word(integer, tokenizer=None):
    """Return the vocabulary word mapped to index ``integer``.

    Args:
        integer: Token index to look up (a Python or NumPy integer, such as
            the result of ``np.argmax``).
        tokenizer: Optional tokenizer exposing an ``index_word`` mapping.
            Defaults to the module-level ``lstm_tokenizer``. Accepting it
            keeps both the one-argument and two-argument call forms valid.

    Returns:
        The matching word, or ``None`` when the index is not in the vocabulary.
    """
    if tokenizer is None:
        tokenizer = lstm_tokenizer
    # index_word is the inverse of word_index, so this is a single dict lookup
    # instead of a scan over the whole vocabulary on every decoding step.
    return tokenizer.index_word.get(int(integer))
37
+
38
+
39
def predict_caption(image, max_length):
    """Greedily generate a caption from extracted image features.

    Starting from ``'startseq'``, the text generated so far is tokenized,
    padded to ``max_length`` and fed to ``lstm_model`` together with
    ``image``. The highest-scoring index is converted back to a word and
    appended. This repeats until ``'endseq'`` is produced, an index has no
    word, or ``max_length`` steps have run.

    Args:
        image: Image feature array passed to ``lstm_model`` as its first input.
        max_length: Padded sequence length and maximum number of decoding steps.

    Returns:
        The generated text, including the leading ``'startseq'`` and, if it
        was produced, the trailing ``'endseq'``.
    """
    # add start tag for generation process
    in_text = 'startseq'
    for _ in range(max_length):
        # encode and pad the text generated so far
        sequence = lstm_tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict next word and take the index with the highest probability
        yhat = lstm_model.predict([image, sequence], verbose=0)
        # idx_to_word is defined with a single argument; passing the
        # tokenizer as a second one raised TypeError on the first step.
        word = idx_to_word(np.argmax(yhat))
        # stop if word not found
        if word is None:
            break
        # append word as input for generating next word
        in_text += " " + word
        # stop if we reach end tag
        if word == 'endseq':
            break
    return in_text
63
+
64
+
65
+
66
def generate_caption(image_name):
    """Generate a caption for one image.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis, preprocessed for VGG16 and passed through ``vgg16_model`` to
    obtain features. ``predict_caption`` then decodes those features.

    Args:
        image_name: Image source accepted by ``load_img``. The caller in this
            file passes the object returned by ``st.file_uploader``.

    Returns:
        The caption with the ``startseq`` / ``endseq`` markers removed and
        surrounding whitespace stripped.
    """
    # load the image at the input size used for VGG16
    image = load_img(image_name, target_size=(224, 224))
    # convert image pixels to numpy array and add the batch axis
    image = np.expand_dims(img_to_array(image), axis=0)
    # preprocess image for vgg
    image = preprocess_input(image)
    # verbose=0 matches predict_caption and keeps progress bars out of the logs
    feature = vgg16_model.predict(image, verbose=0)
    # predict the caption
    y_pred = predict_caption(feature, max_length)
    # the misspelled `repalce` raised AttributeError on every call
    return y_pred.replace("startseq", "").replace("endseq", "").strip()
79
+
80
+
81
# Page header: the app name followed by a short description.
st.title("""
Image Captioner.

This app generates a caption for the input image. The results will be predicted from the basic cnn-rnn to advanced transformer based encoder-decoder models.""")

# st.file_uploader returns None until the user has provided a file.
file_name = st.file_uploader("Upload an image to generate caption...")

if file_name is not None:
    # Two columns: the uploaded picture on the left, the caption on the right.
    col1, col2 = st.columns(2)

    image = Image.open(file_name)
    col1.image(image, use_column_width=True)
    prediction = generate_caption(file_name)

    col2.header("Predictions")
    col2.subheader(f"VGG16-LSTM : {prediction}")
best_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f232b494b3e1fa7f720a6e508adfc8145ac8df339cde02ac5f650e0ad909cf7f
3
+ size 71314248
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ keras==2.12.0
2
+ Pillow
3
+ tensorflow==2.12.0
4
+ tensorflow-text
5
+ numpy
6
+ streamlit
tokenizer.txt ADDED
The diff for this file is too large to render. See raw diff