import streamlit as st from transformers import VisionEncoderDecoderModel,ViTImageProcessor,RobertaTokenizerFast from torchvision import transforms from PIL import Image def image_captioner(inp): model = VisionEncoderDecoderModel.from_pretrained('ViT_Roberta_Image_Captioning') image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k') tokenizer = RobertaTokenizerFast.from_pretrained('Byte_tokenizer') cap = tokenizer.decode(model.generate(image_processor(images = inp, return_tensors="pt").pixel_values)[0]) return f'{cap.replace("","").replace("","")}' import streamlit as st import streamlit.components.v1 as components st.markdown( ''' # Hellooooo !! ### Welcome to my application for generating captions for an image ''' ) st.logo(image="https://kaggle.com/static/images/open-in-kaggle.svg",link="https://www.kaggle.com/code/mightywarrior001/anicygan/") transform = transforms.Compose([ transforms.PILToTensor() ]) columns = st.columns([0.5,0.5]) with columns[0]: with st.container(border=True): option = st.selectbox( label='\t', options =['select an option','camera','file upload'] ) input_photo = None match option: case 'select an option': st.markdown(' I wonder, which method would you like to use to upload images 🤔?') case 'camera': st.markdown('#### Say cheeseeeeeeee... ✌️') input_photo = st.camera_input(label='\t') # st.write(type(input_photo)) case 'file upload': st.markdown('Please Upload your image in png, jpeg, or jpg format 😊') input_photo = st.file_uploader(label='\t', type=['png','jpg','jpeg']) # st.write(type(input_photo)) if input_photo is not None: st.session_state.input_photo = Image.open(input_photo).convert('RGB') input_photo = transform(st.session_state.input_photo) st.write('the image taken as input is, please check how it looks 🥹🥹') st.image((input_photo).permute(1,2,0).numpy(),clamp=True) with columns[1]: with st.container(border=True): if st.button(label='Generate captions'): if "input_photo" in st.session_state: with st.status(label="loading models...", expanded=True): model = VisionEncoderDecoderModel.from_pretrained('ViT_Roberta_Image_Captioning') image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k') tokenizer = RobertaTokenizerFast.from_pretrained('Byte_tokenizer') with st.status(label='Generating Captions...', expanded=True): cap = tokenizer.decode(model.generate(image_processor(images = st.session_state.input_photo, return_tensors="pt").pixel_values)[0]) st.markdown('The caption is....😎:') with st.container(border=True): f'#### {cap.replace("","").replace("","")}' else: st.error( ''' ### Please upload an image ''' )