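# Gradio demo for CapDec image captioning (text-only training with
# noise-injected CLIP), serving checkpoints trained at several noise levels.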
import gradio as gr
import clip
from model import ClipCaptionModel
from transformers import GPT2Tokenizer
import numpy as np
import torch
import PIL.Image  # plain `import PIL` does not expose the Image submodule
from predict import generate2, generate_beam
from huggingface_hub import hf_hub_download
# Inference runs on CPU.
CPU = torch.device('cpu')
device = "cpu"

# Load the CLIP image encoder (RN50x4) and the GPT-2 tokenizer for the caption decoder.
clip_model, preprocess = clip.load("RN50x4", device=device, jit=False)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
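# Pre-download a checkpoint for every noise level so that switching levels in
# the UI only swaps weights (hf_hub_download caches files locally).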
model_0 = hf_hub_download('johko/capdec_0', 'model.pt')
model_001 = hf_hub_download('johko/capdec_001', 'model.pt')
model_005 = hf_hub_download('johko/capdec_005', 'model.pt')
model_015 = hf_hub_download('johko/capdec_015', 'model.pt')
model_025 = hf_hub_download('johko/capdec_025', 'model.pt')
model_05 = hf_hub_download('johko/capdec_05', 'model.pt')
def load_noise_level_model(noise_level):
    # Map the dropdown value to the matching pre-downloaded checkpoint.
    model_paths = {
        "0.0": model_0,
        "0.001": model_001,
        "0.005": model_005,
        "0.015": model_015,
        "0.025": model_025,
        "0.05": model_05,
    }
    if noise_level not in model_paths:
        raise ValueError("Unknown Noise Level")
    model = ClipCaptionModel()
    model.load_state_dict(torch.load(model_paths[noise_level], map_location=CPU))
    model = model.eval()
    model = model.to(device)
    return model
def infer(input_image: np.ndarray, noise_level: str):
    use_beam_search = True
    model = load_noise_level_model(noise_level)
    pil_image = PIL.Image.fromarray(input_image)
    image = preprocess(pil_image).unsqueeze(0).to(device)
    with torch.no_grad():
        # Encode the image with CLIP, then project the embedding into the
        # GPT-2 input space as a prefix of 40 token embeddings.
        prefix = clip_model.encode_image(image).to(device, dtype=torch.float32)
        prefix_embed = model.clip_project(prefix).reshape(1, 40, -1)
    if use_beam_search:
        generated_text_prefix = generate_beam(model, tokenizer, embed=prefix_embed)[0]
    else:
        generated_text_prefix = generate2(model, tokenizer, embed=prefix_embed)
    return input_image, generated_text_prefix
description = """This space is a demo for the paper [*Text-Only Training for Image Captioning using Noise-Injected CLIP*](https://arxiv.org/pdf/2211.00575.pdf)
by David Nukrai, Ron Mokady and Amir Globerson.
The paper shows how to train an image captioning model on text alone, by injecting noise into the CLIP text embeddings during training.
You can experiment with the different noise levels in this demo; the generated caption changes depending on the noise level you choose."""
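# For intuition, the training-time noise injection described in the paper can
# be sketched as adding zero-mean Gaussian noise of a chosen variance to the
# CLIP text embedding before decoding (a rough sketch, not the authors' code):
#   noise_variance = 0.015
#   noisy_prefix = text_embedding + torch.randn_like(text_embedding) * noise_variance ** 0.5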
dropdown = gr.components.Dropdown(["0.0", "0.001", "0.005", "0.015", "0.025", "0.05"], value="0.015", label="Noise Level")
input_image = gr.components.Image(label="Input Image")
output_image = gr.components.Image(label="Image")
output_text = gr.components.Textbox(label="Generated Caption")
iface = gr.Interface(
    title="CapDec Image Captioning",
    description=description,
    fn=infer,
    inputs=[input_image, dropdown],
    outputs=[output_image, output_text],
    examples=[["examples/flickr_ex2.jpg", "0.015"], ["examples/web_ex3.jpeg", "0.015"]])
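# launch() starts the web server; on a Hugging Face Space this is the app entry point.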
iface.launch()