Spaces:
Running
Running
import streamlit as st | |
from text2image import get_model, get_tokenizer, get_image_transform | |
from utils import text_encoder, image_encoder | |
from PIL import Image | |
from jax import numpy as jnp | |
import pandas as pd | |
def app():
    """Render the "From Image to Text" page of the Streamlit demo.

    The user uploads an image and types up to ``MAX_CAP`` candidate labels.
    When "Compute" is pressed, every non-blank label is embedded with
    ``text_encoder``, the image is embedded with ``image_encoder``, and the
    image/label scores are shown as a bar chart next to the uploaded image.

    Takes no arguments and returns ``None``; all output goes through
    Streamlit widgets.
    """
    # `st.beta_columns` was removed from Streamlit once `st.columns` left
    # beta; resolve whichever one this installation provides so the page
    # works on both old and current releases.
    make_columns = st.columns if hasattr(st, "columns") else st.beta_columns

    st.title("From Image to Text")
    st.markdown(
        """
        ### 👋 Ciao!
        Here you can find the captions or the labels that are most related to a given image. It is a zero-shot
        image classification task!
        🤌 Italian mode on! 🤌
        """
    )

    filename = st.file_uploader(
        "Choose an image from your computer", type=["jpg", "jpeg", "png"]
    )

    MAX_CAP = 4  # maximum number of label inputs offered to the user

    # Wide column for the label inputs, narrow one for the controls.
    labels_col, controls_col = make_columns([3, 1])

    with controls_col:
        captions_count = st.selectbox(
            "Number of labels", options=range(1, MAX_CAP + 1)
        )
        compute = st.button("Compute")

    with labels_col:
        # The selectbox only offers 1..MAX_CAP, so no extra clamping is needed.
        captions = [
            st.text_input(f"Insert label {idx+1}")
            for idx in range(captions_count)
        ]

    if not compute:
        return

    # Ignore inputs left empty or containing only whitespace.
    captions = [c for c in captions if c.strip()]
    if not captions or not filename:
        st.error("Please choose one image and at least one label")
        return

    with st.spinner("Computing..."):
        model = get_model()
        tokenizer = get_tokenizer()

        # `text_encoder` output is iterated row by row; presumably it returns
        # one embedding per caption with a leading batch axis — TODO confirm.
        text_embeds = []
        for caption in captions:
            text_embeds.extend(text_encoder(caption, model, tokenizer))
        text_embeds = jnp.array(text_embeds)

        image = Image.open(filename).convert("RGB")
        transform = get_image_transform(model.config.vision_config.image_size)
        image_embed = image_encoder(transform(image), model)

        # Dot product of the image embedding with every caption embedding.
        # These are cosine similarities only if both encoders return
        # unit-normalised vectors (not visible from here).
        # we could have a softmax here
        cos_similarities = jnp.matmul(image_embed, text_embeds.T)

        # Row 0 holds the scores for the single uploaded image.
        chart_data = pd.Series(cos_similarities[0], index=captions)

        chart_col, image_col = make_columns(2)
        with chart_col:
            st.bar_chart(chart_data)
        with image_col:
            st.image(image)