import gradio as gr
import clip
import numpy as np
import torch
import PIL.Image
from transformers import GPT2Tokenizer
from huggingface_hub import hf_hub_download

from model import ClipCaptionModel
from predict import generate2, generate_beam

CPU = torch.device('cpu')
device = "cpu"

# CLIP image encoder and GPT-2 tokenizer shared by all noise-level checkpoints
clip_model, preprocess = clip.load("RN50x4", device=device, jit=False)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Download one CapDec checkpoint per training noise level
model_0 = hf_hub_download('johko/capdec_0', 'model.pt')
model_001 = hf_hub_download('johko/capdec_001', 'model.pt')
model_005 = hf_hub_download('johko/capdec_005', 'model.pt')
model_015 = hf_hub_download('johko/capdec_015', 'model.pt')
model_025 = hf_hub_download('johko/capdec_025', 'model.pt')
model_05 = hf_hub_download('johko/capdec_05', 'model.pt')


def load_noise_level_model(noise_level: str) -> ClipCaptionModel:
    """Load the CapDec checkpoint trained with the selected noise level."""
    if noise_level == "0.0":
        model_path = model_0
    elif noise_level == "0.001":
        model_path = model_001
    elif noise_level == "0.005":
        model_path = model_005
    elif noise_level == "0.015":
        model_path = model_015
    elif noise_level == "0.025":
        model_path = model_025
    elif noise_level == "0.05":
        model_path = model_05
    else:
        raise ValueError("Unknown Noise Level")

    model = ClipCaptionModel()
    model.load_state_dict(torch.load(model_path, map_location=CPU))
    model = model.eval()
    model = model.to(device)
    return model


def infer(input_image: np.ndarray, noise_level: str):
    use_beam_search = True
    model = load_noise_level_model(noise_level)

    pil_image = PIL.Image.fromarray(input_image)
    image = preprocess(pil_image).unsqueeze(0).to(device)

    with torch.no_grad():
        # Encode the image with CLIP and project it to a prefix of GPT-2 embeddings
        prefix = clip_model.encode_image(image).to(device, dtype=torch.float32)
        prefix_embed = model.clip_project(prefix).reshape(1, 40, -1)

    if use_beam_search:
        generated_text_prefix = generate_beam(model, tokenizer, embed=prefix_embed)[0]
    else:
        generated_text_prefix = generate2(model, tokenizer, embed=prefix_embed)

    return input_image, generated_text_prefix


description = """This space is a demo for the paper [*Text-Only Training for Image Captioning using Noise-Injected CLIP*](https://arxiv.org/pdf/2211.00575.pdf) by David Nukrai, Ron Mokady and Amir Globerson.

The paper shows how to train an image captioning model using text data only, by injecting noise into the CLIP text embeddings during training. Each noise level corresponds to a separately trained checkpoint, which you can select and experiment with in this demo. The generated caption will change depending on the noise level you choose."""

dropdown = gr.components.Dropdown(
    ["0.0", "0.001", "0.005", "0.015", "0.025", "0.05"],
    value="0.015",
    label="Noise Level")
input_image = gr.components.Image(label="Input Image")
output_image = gr.components.Image(label="Image")
output_text = gr.components.Textbox(label="Generated Caption")

iface = gr.Interface(
    title="CapDec Image Captioning",
    description=description,
    fn=infer,
    inputs=[input_image, dropdown],
    outputs=[output_image, output_text],
    examples=[["examples/flickr_ex2.jpg", "0.015"],
              ["examples/web_ex3.jpeg", "0.015"]])

iface.launch()