"""Gradio demo for BLIP image captioning (Salesforce/blip-image-captioning-large)."""

import gradio as gr
import torch
from transformers import BlipForConditionalGeneration, BlipProcessor

# Run on the GPU when one is available; the model and every input tensor
# must live on this same device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model_image_captioning = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
).to(device)

# Keyword arguments forwarded to `generate()` for each decoding strategy
# offered in the UI. A strategy that is not listed uses the model defaults.
_GENERATION_KWARGS = {
    "Beam search": {
        "max_length": 20,
        "num_beams": 5,
    },
    "Nucleus sampling": {
        "max_length": 20,
        "num_beams": 1,
        "do_sample": True,
        "top_k": 50,
        "top_p": 0.95,
    },
    "Contrastive search": {
        "penalty_alpha": 0.6,
        "top_k": 4,
        "max_length": 512,
    },
}


def inference(raw_image, question: str, decoding_strategy: str) -> str:
    """Generate a caption for an image with the BLIP captioning model.

    Args:
        raw_image: The image to caption (the UI supplies a PIL image).
        question: Text passed to the processor alongside the image; the UI
            labels this field "Context (optional)".
        decoding_strategy: One of the keys of ``_GENERATION_KWARGS``. Any
            other value makes ``generate()`` run with its default settings.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    # The model was moved to `device`, so the encoded inputs must follow it;
    # leaving them on the CPU fails on CUDA hosts with a device mismatch.
    encoded = processor(images=raw_image, text=question, return_tensors="pt").to(device)

    # Generation settings are kept apart from the encoded tensors rather than
    # being written into the processor output.
    generation_kwargs = _GENERATION_KWARGS.get(decoding_strategy, {})

    out = model_image_captioning.generate(**encoded, **generation_kwargs)
    return processor.batch_decode(out, skip_special_tokens=True)[0]


# NOTE(review): `gr.inputs` / `gr.outputs` and `launch(enable_queue=...)` look
# like the legacy Gradio component API; they are kept unchanged because the
# installed Gradio version is not visible from this file -- confirm before
# upgrading Gradio.
inputs = [
    gr.inputs.Image(type='pil'),
    gr.inputs.Textbox(lines=2, label="Context (optional)"),
    gr.inputs.Radio(
        choices=["Beam search", "Nucleus sampling", "Contrastive search"],
        type="value",
        default="Nucleus sampling",
        label="Caption Decoding Strategy",
    ),
]
outputs = gr.outputs.Textbox(label="Output")

title = "BLIP"

description = (
    "Gradio demo for BLIP: Bootstrapping Language-Image Pre-training for "
    "Unified Vision-Language Understanding and Generation (Salesforce Research). "
    "To use it, simply upload your image, or click one of the examples to load "
    "them. Read more at the links below."
)

# Triple-quoted because the text spans several lines; a plain double-quoted
# literal cannot contain raw newlines.
article = """
BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation | Github Repo
"""

gr.Interface(
    inference,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
).launch(enable_queue=True)