Spaces:

clip-italian
/

clip-italian-demo

Running

App Files Files Community

clip-italian-demo / image2text.py

vinid

fixing a few things

6c67d85 over 3 years ago

raw

history blame

2.15 kB

	import streamlit as st
	from text2image import get_model, get_tokenizer, get_image_transform
	from utils import text_encoder, image_encoder
	from PIL import Image
	from jax import numpy as jnp
	import pandas as pd


	def app():
	    """Render the "From Image to Text" page.

	    The user uploads an image and types up to ``MAX_CAP`` candidate labels.
	    When "Compute" is pressed, every non-blank label and the image are
	    embedded with the model returned by ``get_model()``; the image/label
	    similarity scores are shown as a bar chart next to the uploaded image.
	    """
	    st.title("From Image to Text")
	    st.markdown(
	        """

	### 👋 Ciao!

	Here you can find the captions or the labels that are most related to a given image. It is a zero-shot
	image classification task!

	🤌 Italian mode on! 🤌

	"""
	    )

	    filename = st.file_uploader(
	        "Choose an image from your computer", type=["jpg", "jpeg", "png"]
	    )

	    MAX_CAP = 4

	    # ``beta_columns`` was a beta alias that newer Streamlit releases dropped
	    # in favour of ``columns``. Prefer the stable name and fall back to the
	    # beta one so the page works on both old and new Streamlit versions.
	    make_columns = getattr(st, "columns", None) or st.beta_columns

	    labels_col, controls_col = make_columns([3, 1])

	    with controls_col:
	        captions_count = st.selectbox(
	            "Number of labels", options=range(1, MAX_CAP + 1)
	        )
	        compute = st.button("Compute")

	    with labels_col:
	        # One text box per requested label; the clamp keeps the widget count
	        # bounded by MAX_CAP.
	        captions = [
	            st.text_input(f"Insert label {idx+1}")
	            for idx in range(min(MAX_CAP, captions_count))
	        ]

	    if not compute:
	        return

	    # Drop labels that are empty or contain only whitespace.
	    captions = [c.strip() for c in captions if c.strip()]

	    if not captions or not filename:
	        st.error("Please choose one image and at least one label")
	        return

	    with st.spinner("Computing..."):
	        model = get_model()
	        tokenizer = get_tokenizer()

	        # NOTE(review): text_encoder presumably returns a batch with a single
	        # embedding per caption, hence ``extend`` -- confirm against utils.
	        text_embeds = []
	        for caption in captions:
	            text_embeds.extend(text_encoder(caption, model, tokenizer))

	        text_embeds = jnp.array(text_embeds)

	        image = Image.open(filename).convert("RGB")
	        transform = get_image_transform(model.config.vision_config.image_size)
	        image_embed = image_encoder(transform(image), model)

	        # we could have a softmax here
	        # NOTE(review): these dot products are cosine similarities only if
	        # both encoders return L2-normalised embeddings -- confirm in utils.
	        cos_similarities = jnp.matmul(image_embed, text_embeds.T)

	        # Row 0 holds the scores for the single uploaded image.
	        chart_data = pd.Series(cos_similarities[0], index=captions)

	        chart_col, image_col = make_columns(2)
	        with chart_col:
	            st.bar_chart(chart_data)

	        with image_col:
	            st.image(image)