File size: 2,897 Bytes
5e2bbb8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7873a48
 
 
5e2bbb8
 
 
 
ff01337
5e2bbb8
 
 
 
 
7873a48
5e2bbb8
 
 
 
 
 
 
7873a48
 
 
202c930
7873a48
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import streamlit as st
import pickle
# --- Page header and one-time resource loading -------------------------------
# NOTE(review): Streamlit re-executes this script on each interaction, so the
# loading below is repeated per rerun; caching it may be worthwhile — confirm
# against the Streamlit version in use.
st.header("Image Captioner")
st.markdown("Building the framework may take upto a minute. Please be patient. Thank you!")

# Use context managers so the pickle file handles are closed deterministically
# instead of being left to the garbage collector.
with open("features.pkl", "rb") as features_file:
    features = pickle.load(features_file)
with open("all_captions.pkl", "rb") as captions_file:
    all_captions = pickle.load(captions_file)

from tensorflow.keras.preprocessing.text import Tokenizer
# Build the vocabulary from the stored captions.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# +1 on top of the number of distinct words in the tokenizer's word_index.
vocab_size = len(tokenizer.word_index) + 1
# Longest caption, measured in whitespace-separated tokens; used later as the
# padding length and as the cap on generated caption length.
max_length = max(len(caption.split()) for caption in all_captions)
from tensorflow import keras
# Captioning model loaded from the HDF5 file shipped next to this script.
model = keras.models.load_model("best_model.h5")
from tensorflow.keras.preprocessing.sequence import pad_sequences
def idx_to_word(integer, tokenizer):
    """Return the vocabulary word whose index equals ``integer``.

    Args:
        integer: Word index to look up (a plain or NumPy integer).
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.
            If it also exposes a populated ``index_word`` mapping of
            index -> word, that mapping is used for a direct lookup.

    Returns:
        The matching word, or ``None`` when no word has that index.
    """
    # Prefer the ready-made reverse mapping: one dict lookup instead of a scan
    # over the whole vocabulary for every generated token.
    reverse_index = getattr(tokenizer, "index_word", None)
    if reverse_index:
        return reverse_index.get(integer)
    # Fallback for tokenizer-like objects that only provide ``word_index``.
    return next(
        (word for word, index in tokenizer.word_index.items() if index == integer),
        None,
    )
# generate caption for an image
import numpy as np
def predict_caption(model, image, tokenizer, max_length):
    """Generate a caption one word at a time, always taking the top-scoring word.

    Args:
        model: Object whose ``predict([image, sequence], verbose=0)`` returns
            scores over word indices.
        image: First input handed to ``model.predict`` on every step.
        tokenizer: Tokenizer used to encode the partial caption and to map the
            predicted index back to a word.
        max_length: Padding length for the encoded caption and the maximum
            number of words to generate.

    Returns:
        The generated text, beginning with ``'startseq'`` and ending with
        ``'endseq'`` if that token was produced before generation stopped.
    """
    # The caption is seeded with the start tag and grown word by word.
    caption_words = ['startseq']
    for _ in range(max_length):
        # Encode the caption so far and pad it to the fixed input length.
        encoded = tokenizer.texts_to_sequences([" ".join(caption_words)])[0]
        padded = pad_sequences([encoded], max_length)
        # Score the next word and keep the index with the largest value.
        scores = model.predict([image, padded], verbose=0)
        next_word = idx_to_word(np.argmax(scores), tokenizer)
        # An index with no vocabulary entry ends generation.
        if next_word is None:
            break
        caption_words.append(next_word)
        # The end tag is kept in the output and terminates generation.
        if next_word == 'endseq':
            break
    return " ".join(caption_words)
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from gtts import gTTS
from io import BytesIO
# In-memory buffer that receives the synthesized speech.
sound_file = BytesIO()
vgg_model = VGG16()
# Re-wire the network so its output is the second-to-last layer rather than
# the final layer; that output is what gets fed to the captioning model.
vgg_model = Model(inputs=vgg_model.inputs, outputs=vgg_model.layers[-2].output)
from tensorflow.keras.preprocessing.image import img_to_array
from PIL import Image
uploaded_image = st.file_uploader("Upload image to be captioned", type=["jpg", "png", "jpeg", "webp"])
# NOTE(review): image_path is not referenced anywhere in the visible code.
image_path = "bushman.jpeg"
if uploaded_image is not None:
    display_image = Image.open(uploaded_image)
    st.image(display_image)
    if st.button("Caption"):
        st.text("Please be patient...")
        # PNG/WebP uploads can be RGBA, palette or grayscale; force three
        # channels so the array matches what VGG16 accepts.
        model_input = display_image.convert("RGB").resize((224, 224))
        image = img_to_array(model_input)
        # Add a leading batch dimension of size 1.
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)
        feature = vgg_model.predict(image, verbose=0)
        final = predict_caption(model, feature, tokenizer, max_length)
        # Strip the start/end markers only when they are actually present, so a
        # caption that stopped without 'endseq' does not lose its last word.
        caption_words = final.split()
        if caption_words and caption_words[0] == "startseq":
            caption_words = caption_words[1:]
        if caption_words and caption_words[-1] == "endseq":
            caption_words = caption_words[:-1]
        final_output = " ".join(caption_words)
        if not final_output:
            # Nothing to display or speak; skip text-to-speech on empty text.
            st.warning("No caption could be generated for this image.")
        else:
            tts = gTTS(final_output, lang='en')
            tts.write_to_fp(sound_file)
            st.text("Output:")
            st.markdown(final_output)
            # gTTS writes MP3 data, so declare the matching MIME type.
            st.audio(sound_file, format="audio/mpeg")