import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering

# The same checkpoints can also be used through the high-level pipeline API:
#   pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
#   pipeline("visual-question-answering", model="Salesforce/blip-vqa-base")
# Here the processor and model classes are loaded directly instead, so each
# checkpoint is downloaded and kept in memory only once.
caption_model_path = "Salesforce/blip-image-captioning-base"
vqa_model_path = "Salesforce/blip-vqa-base"

caption_processor = BlipProcessor.from_pretrained(caption_model_path)
caption_model = BlipForConditionalGeneration.from_pretrained(caption_model_path)
vqa_processor = BlipProcessor.from_pretrained(vqa_model_path)
vqa_model = BlipForQuestionAnswering.from_pretrained(vqa_model_path)


def captioner(image):
    """Generate a caption for a PIL image with the BLIP captioning model."""
    inputs = caption_processor(image, return_tensors="pt")
    out = caption_model.generate(**inputs)
    caption = caption_processor.decode(out[0], skip_special_tokens=True)
    return caption


def answer_question(image, question):
    """Answer a free-form question about a PIL image with the BLIP VQA model."""
    inputs = vqa_processor(image, question, return_tensors="pt")
    out = vqa_model.generate(**inputs)
    answer = vqa_processor.decode(out[0], skip_special_tokens=True)
    return answer


# Two-tab Gradio app: one tab for image captioning, one for visual question answering.
with gr.Blocks() as demo:
    with gr.Tab("Image Captioning"):
        with gr.Column():
            img_caption = gr.Image(label="Upload Image", type="pil")
            caption_out = gr.Textbox(label="Caption")
            caption_btn = gr.Button("Generate Caption")
            caption_btn.click(captioner, inputs=img_caption, outputs=caption_out)

    with gr.Tab("Visual Question Answering"):
        with gr.Column():
            img_vqa = gr.Image(label="Upload Image for VQA", type="pil")
            question = gr.Textbox(label="Question")
            answer_out = gr.Textbox(label="Answer")
            answer_btn = gr.Button("Get Answer")
            answer_btn.click(answer_question, inputs=[img_vqa, question], outputs=answer_out)

demo.launch()
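
# Optional: a minimal sketch for trying the helpers without the web UI.
# "example.jpg" is a placeholder path (an assumption, not part of the original
# example); uncomment the lines below and point them at a real image to run.
#
# from PIL import Image
# test_image = Image.open("example.jpg").convert("RGB")
# print("Caption:", captioner(test_image))
# print("Answer:", answer_question(test_image, "What is in the picture?"))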