clip-italian-demo / image2text.py
import streamlit as st
from text2image import get_model, get_tokenizer, get_image_transform
from utils import text_encoder, image_encoder
from PIL import Image
from jax import numpy as jnp
import pandas as pd
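
# Streamlit page for zero-shot image classification: the user uploads an image
# and types a few Italian labels, and the page ranks the labels by how well
# they match the image according to the CLIP-Italian encoders.
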
def app():
    st.title("From Image to Text")
    st.markdown(
        """

        ### 👋 Ciao!

        Here you can find the captions or labels that are most related to a given image. It is a zero-shot
        image classification task!

        🤌 Italian mode on! 🤌
        """
    )
    filename = st.file_uploader(
        "Choose an image from your computer", type=["jpg", "jpeg", "png"]
    )

    MAX_CAP = 4

    col1, col2 = st.beta_columns([3, 1])

    with col2:
        captions_count = st.selectbox(
            "Number of labels", options=range(1, MAX_CAP + 1)
        )
        compute = st.button("Compute")

    with col1:
        captions = list()
        for idx in range(min(MAX_CAP, captions_count)):
            captions.append(st.text_input(f"Insert label {idx+1}"))
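
    # When the user clicks "Compute", embed the labels and the image and rank
    # the labels by their similarity to the image.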
    if compute:
        captions = [c for c in captions if c != ""]

        if not captions or not filename:
            st.error("Please choose one image and at least one label")
        else:
            with st.spinner("Computing..."):
                model = get_model()
                tokenizer = get_tokenizer()

                # Embed each label with the text encoder and stack the results.
                text_embeds = list()
                for c in captions:
                    text_embeds.extend(text_encoder(c, model, tokenizer))

                text_embeds = jnp.array(text_embeds)

                # Embed the uploaded image with the vision encoder.
                image = Image.open(filename).convert("RGB")
                transform = get_image_transform(model.config.vision_config.image_size)
                image_embed = image_encoder(transform(image), model)

                # Score each label against the image with a dot product
                # (cosine similarity, assuming the encoders return normalized
                # embeddings); we could have a softmax here.
                cos_similarities = jnp.matmul(image_embed, text_embeds.T)

                chart_data = pd.Series(cos_similarities[0], index=captions)
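
                # Sketch of the "softmax here" idea (not enabled; the 100.0
                # logit scale is an illustrative assumption and it would need
                # `import jax` at the top of the file):
                #   probs = jax.nn.softmax(100.0 * cos_similarities, axis=-1)
                #   chart_data = pd.Series(probs[0], index=captions)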
                col1, col2 = st.beta_columns(2)
                with col1:
                    st.bar_chart(chart_data)
                with col2:
                    st.image(image)
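
# app() only builds the page; presumably a top-level launcher script imports
# this module and calls app() when the "From Image to Text" page is selected.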