Spaces:
Sleeping
Sleeping
File size: 3,266 Bytes
27b3aaf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import streamlit as st
from transformers import VisionEncoderDecoderModel,ViTImageProcessor,RobertaTokenizerFast
from torchvision import transforms
from PIL import Image
@st.cache_resource
def _load_captioning_pipeline():
    """Load the caption model, image processor and tokenizer a single time.

    ``st.cache_resource`` stores the returned objects, so repeated calls
    (including Streamlit reruns) reuse them instead of calling
    ``from_pretrained`` again.

    Returns:
        A ``(model, image_processor, tokenizer)`` tuple.
    """
    model = VisionEncoderDecoderModel.from_pretrained('ViT_Roberta_Image_Captioning')
    image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
    tokenizer = RobertaTokenizerFast.from_pretrained('Byte_tokenizer')
    return model, image_processor, tokenizer


def image_captioner(inp):
    """Return a generated caption for ``inp``.

    Args:
        inp: Image input handed to ``ViTImageProcessor`` as ``images=``
            (presumably a PIL image, as in the page code further down;
            confirm against callers).

    Returns:
        The decoded text of the first generated sequence with the literal
        ``<s>`` and ``</s>`` markers removed.
    """
    model, image_processor, tokenizer = _load_captioning_pipeline()
    pixel_values = image_processor(images=inp, return_tensors="pt").pixel_values
    # Only the first generated sequence is decoded.
    generated_ids = model.generate(pixel_values)[0]
    cap = tokenizer.decode(generated_ids)
    return cap.replace("<s>", "").replace("</s>", "")
import streamlit as st
import streamlit.components.v1 as components
# Page heading, rendered as Markdown (a title line plus a sub-heading).
st.markdown(
    '\n'
    '# Hellooooo !!\n'
    '### Welcome to my application for generating captions for an image\n'
)

# App logo: a remote SVG image that links to the given Kaggle notebook URL.
st.logo(
    image="https://kaggle.com/static/images/open-in-kaggle.svg",
    link="https://www.kaggle.com/code/mightywarrior001/anicygan/",
)

# Single-step torchvision pipeline; the input column applies it to the
# uploaded PIL image before displaying it.
transform = transforms.Compose([transforms.PILToTensor()])
# Two equal-width columns: the left one collects the image, the right one
# (built further down from ``columns[1]``) shows the generated caption.
columns = st.columns([0.5, 0.5])
with columns[0]:
    with st.container(border=True):
        option = st.selectbox(
            label='\t',
            options=['select an option', 'camera', 'file upload'],
        )
        # NOTE(review): the trailing symbols in the messages below look like
        # mis-encoded emoji; confirm the file's encoding before changing them.
        input_photo = None
        if option == 'select an option':
            st.markdown(' I wonder, which method would you like to use to upload images π€?')
        elif option == 'camera':
            st.markdown('#### Say cheeseeeeeeee... βοΈ')
            input_photo = st.camera_input(label='\t')
        elif option == 'file upload':
            st.markdown('Please Upload your image in png, jpeg, or jpg format π')
            input_photo = st.file_uploader(label='\t', type=['png', 'jpg', 'jpeg'])

        if input_photo is not None:
            # Keep the RGB PIL image in session state; the caption step reads
            # it from there.
            st.session_state.input_photo = Image.open(input_photo).convert('RGB')
            photo_tensor = transform(st.session_state.input_photo)
            st.write('the image taken as input is, please check how it looks π₯Ήπ₯Ή')
            # Move the first tensor axis last before handing the array to
            # st.image (assumes a channels-first tensor; TODO confirm).
            st.image(photo_tensor.permute(1, 2, 0).numpy(), clamp=True)
        elif "input_photo" in st.session_state:
            # No image is currently supplied: drop the previously stored one
            # so a caption is never generated for an image that is no longer
            # shown on the page.
            del st.session_state["input_photo"]
# Right column: on button click, caption the image stored in session state.
with columns[1]:
    with st.container(border=True):
        if st.button(label='Generate captions'):
            if "input_photo" in st.session_state:
                with st.status(label="loading models...", expanded=True):
                    model = VisionEncoderDecoderModel.from_pretrained('ViT_Roberta_Image_Captioning')
                    image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
                    tokenizer = RobertaTokenizerFast.from_pretrained('Byte_tokenizer')
                with st.status(label='Generating Captions...', expanded=True):
                    pixel_values = image_processor(
                        images=st.session_state.input_photo,
                        return_tensors="pt",
                    ).pixel_values
                    # Only the first generated sequence is decoded.
                    cap = tokenizer.decode(model.generate(pixel_values)[0])
                # NOTE(review): the symbol before ':' looks like a mis-encoded
                # emoji; confirm the file's encoding before changing it.
                st.markdown('The caption is....π:')
                with st.container(border=True):
                    # Render explicitly instead of relying on a bare expression
                    # being picked up by Streamlit's "magic" display, which can
                    # be switched off in the app configuration.
                    st.markdown(f'#### {cap.replace("<s>","").replace("</s>","")}')
            else:
                st.error('\n### Please upload an image\n')
|