Spaces:

flax-community
/

koclip

Build error

App Files Files Community

koclip / image2text.py

jaketae

style: run linter

bf9c2d9 almost 3 years ago

raw history blame

No virus

3.08 kB

	import jax
	import jax.numpy as jnp
	import pandas as pd
	import requests
	import streamlit as st
	from PIL import Image

	from utils import load_model


	def app(model_name):
	model, processor = load_model(f"koclip/{model_name}")

	st.title("Zero-shot Image Classification")
	st.markdown(
	"""
	This demonstration explores capability of KoCLIP in the field of Zero-Shot Prediction. This demo takes a set of image and captions from the user, and predicts the most likely label among the different captions given.

	KoCLIP is a retraining of OpenAI's CLIP model using 82,783 images from [MSCOCO](https://cocodataset.org/#home) dataset and Korean caption annotations. Korean translation of caption annotations were obtained from [AI Hub](https://aihub.or.kr/keti_data_board/visual_intelligence). Base model `koclip` uses `klue/roberta` as text encoder and `openai/clip-vit-base-patch32` as image encoder. Larger model `koclip-large` uses `klue/roberta` as text encoder and bigger `google/vit-large-patch16-224` as image encoder.
	"""
	)

	query1 = st.text_input(
	"Enter a URL to an image...",
	value="http://images.cocodataset.org/val2017/000000039769.jpg",
	)
	query2 = st.file_uploader("or upload an image...", type=["jpg", "jpeg", "png"])

	col1, col2 = st.beta_columns([3, 1])

	with col2:
	captions_count = st.selectbox("Number of labels", options=range(1, 6), index=2)
	compute = st.button("Classify")

	with col1:
	captions = []
	defaults = ["귀여운 고양이", "멋있는 강아지", "포동포동한 햄스터"]
	for idx in range(captions_count):
	value = defaults[idx] if idx < len(defaults) else ""
	captions.append(st.text_input(f"Insert label {idx+1}", value=value))

	if compute:
	if not any([query1, query2]):
	st.error("Please upload an image or paste an image URL.")
	else:
	st.markdown("""---""")
	with st.spinner("Computing..."):
	image_data = (
	query2
	if query2 is not None
	else requests.get(query1, stream=True).raw
	)
	image = Image.open(image_data)

	# captions = [caption.strip() for caption in captions.split(",")]
	captions = [f"이것은 {caption.strip()}이다." for caption in captions]
	inputs = processor(
	text=captions, images=image, return_tensors="jax", padding=True
	)
	inputs["pixel_values"] = jnp.transpose(
	inputs["pixel_values"], axes=[0, 2, 3, 1]
	)
	outputs = model(**inputs)
	probs = jax.nn.softmax(outputs.logits_per_image, axis=1)
	chart_data = pd.Series(probs[0], index=captions)

	col1, col2 = st.beta_columns(2)
	with col1:
	st.image(image)
	with col2:
	st.bar_chart(chart_data)