Spaces:
Runtime error
Runtime error
File size: 6,297 Bytes
1d42b83 9a59d7a 1d42b83 c431f44 9d3974d c431f44 8cf99f3 c431f44 d65dd81 6193207 d65dd81 f2977fa 5b067c5 a2468a2 8cf99f3 5b067c5 c431f44 5b067c5 c411dc2 c431f44 a3b4f26 c431f44 9a59d7a 6ee92d0 1fb1f41 1d42b83 26889b2 9a59d7a 26889b2 a3b4f26 a3d1262 1d42b83 a3d1262 1d42b83 6801c63 1d42b83 a3d1262 1d42b83 2cdead4 1d42b83 26889b2 c431f44 51aabb2 c431f44 5b067c5 51aabb2 5b067c5 26889b2 1d42b83 26889b2 7fa027b a3d1262 7fa027b a70df5d 1d42b83 a00a3a2 1d42b83 a00a3a2 1d42b83 a00a3a2 1d42b83 a3b4f26 1d42b83 a3b4f26 1d42b83 26889b2 c431f44 5b067c5 26889b2 8c02c7c fb169a0 8c02c7c 05b9609 3b1ade6 05b9609 1d42b83 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
import string
import gradio as gr
import requests
import torch
from models.VLE import VLEForVQA, VLEProcessor, VLEForVQAPipeline
from PIL import Image
# Load the VLE VQA checkpoint and its processor from the same model name,
# then wrap both in a pipeline that is explicitly placed on the CPU.
model_name="hfl/vle-base-for-vqa"
model = VLEForVQA.from_pretrained(model_name)
vle_processor = VLEProcessor.from_pretrained(model_name)
vqa_pipeline = VLEForVQAPipeline(model=model, device='cpu', vle_processor=vle_processor)
from transformers import BlipForQuestionAnswering, BlipProcessor
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# BLIP VQA processor and model; the model is moved to `device`.
# NOTE(review): within this view `processor` and `model_vqa` are not
# referenced by any active code -- confirm they are still needed before
# paying the download and memory cost at startup.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
model_vqa = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large").to(device)
from transformers import BlipProcessor, BlipForConditionalGeneration
# BLIP image-captioning processor and model, used by caption().
# Unlike `model_vqa`, this model is not moved to `device`; it stays wherever
# from_pretrained places it (presumably the CPU -- TODO confirm).
cap_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
cap_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
def caption(input_image):
    """Return a caption for ``input_image`` produced by the captioning model.

    The image is encoded into PyTorch tensors by ``cap_processor`` and passed
    to ``cap_model.generate`` with one beam and one returned sequence. The
    decoded sequences (special tokens skipped) are joined with newlines.
    """
    encoded = cap_processor(input_image, return_tensors="pt")
    generated_ids = cap_model.generate(
        **encoded,
        num_beams=1,
        num_return_sequences=1,
    )
    decoded = cap_processor.batch_decode(generated_ids, skip_special_tokens=True)
    return "\n".join(decoded)
import openai
import os
# Read the OpenAI API key from the `openai_appkey` environment variable;
# os.getenv returns None when the variable is not set.
openai.api_key= os.getenv('openai_appkey')
def gpt3(question,vqa_answer,caption):
    """Ask the OpenAI completion endpoint to pick an answer.

    The prompt consists of the caption, the question and the VQA answer(s),
    one per line, followed by the instruction "Tell me the right answer.".

    Args:
        question: The question text.
        vqa_answer: Candidate answer text (may be an empty string).
        caption: Caption text describing the image.

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace.
    """
    prompt = "\n".join((caption, question, vqa_answer)) + "\n Tell me the right answer."
    request_options = {
        "engine": "text-davinci-003",
        "prompt": prompt,
        "max_tokens": 30,
        "n": 1,
        "stop": None,
        "temperature": 0.7,
    }
    completion = openai.Completion.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.text.strip()
def vle(input_image,input_text):
    """Return the 'answer' field of the top four VLE pipeline results.

    Args:
        input_image: Image passed to the pipeline under the "image" key.
        input_text: Question passed to the pipeline under the "question" key.

    Returns:
        A list with the ``'answer'`` value of each result, in the order the
        pipeline returned them.
    """
    query = {"image": input_image, "question": input_text}
    ranked = vqa_pipeline(query, top_k=4)
    answers = []
    for candidate in ranked:
        answers.append(candidate['answer'])
    return answers
def inference_chat(input_image,input_text):
    """Answer a question about an image with VLE alone and with GPT-3 help.

    Args:
        input_image: Image to query, or ``None`` when nothing was supplied.
        input_text: The question to ask about the image.

    Returns:
        A 3-tuple of strings:
        the first VLE answer, the GPT-3 answer given the caption plus all
        VLE candidate answers, and the GPT-3 answer given the caption alone.
        When the image is missing or the question is blank, the first item
        is a short request to supply both and the other two are empty.
    """
    # Bail out before running any model or making any OpenAI call: a missing
    # image or blank question cannot produce a meaningful answer and would
    # otherwise fail deep inside the processors.
    if input_image is None or not input_text or not input_text.strip():
        return "Please upload an image and enter a question.", "", ""

    cap = caption(input_image)
    out = vle(input_image, input_text)
    vqa = "\n".join(out)
    # Two completions: one sees the VLE candidates, the other only the caption.
    gpt3_out = gpt3(input_text, vqa, cap)
    gpt3_out1 = gpt3(input_text, '', cap)
    # An empty candidate list must not raise IndexError.
    top_answer = out[0] if out else ""
    return top_answer, gpt3_out, gpt3_out1
# Page heading, rendered as raw HTML at the top of the demo.
title = """<h1 align="center">VQA</h1>"""

# NOTE(review): the nesting of the Row/Column containers below was
# reconstructed -- confirm the layout matches the intended design.
with gr.Blocks(
    css="""
.message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
#component-21 > div.wrap.svelte-w6rprc {height: 600px;}
"""
) as iface:
    state = gr.State([])
    #caption_output = None
    gr.Markdown(title)
    # gr.Markdown(description)
    #gr.Markdown(article)

    with gr.Row():
        # Left side: image, question and the action buttons.
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil",label="VQA Image Input")
            with gr.Row():
                with gr.Column(scale=1):
                    chat_input = gr.Textbox(lines=1, label="VQA Question Input")
                    with gr.Row():
                        clear_button = gr.Button(value="Clear", interactive=True)
                        submit_button = gr.Button(
                            value="Submit", interactive=True, variant="primary"
                        )
                        # Disabled buttons kept for reference:
                        # cap_submit_button = gr.Button(
                        #     value="Submit_CAP", interactive=True, variant="primary"
                        # )
                        # gpt3_submit_button = gr.Button(
                        #     value="Submit_GPT3", interactive=True, variant="primary"
                        # )
        # Right side: the three answer boxes.
        with gr.Column():
            caption_output = gr.Textbox(lines=0, label="VQA ")
            caption_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (short answer)")
            gpt3_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (long answer)")

    # A new image invalidates earlier answers: reset the state and clear all
    # three answer boxes. Each output component is listed exactly once.
    image_input.change(
        lambda: ([], "", "", ""),
        [],
        [state, caption_output, gpt3_output_v1, caption_output_v1],
        queue=False,
    )

    # Pressing Enter in the question box behaves exactly like the Submit
    # button: inference_chat returns three values, so all three answer boxes
    # must be listed as outputs.
    chat_input.submit(
        inference_chat,
        [
            image_input,
            chat_input,
        ],
        [caption_output, gpt3_output_v1, caption_output_v1],
    )

    # Clear empties the question, the state and every answer box.
    clear_button.click(
        lambda: ("", [], "", "", ""),
        [],
        [chat_input, state, caption_output, gpt3_output_v1, caption_output_v1],
        queue=False,
    )

    submit_button.click(
        inference_chat,
        [
            image_input,
            chat_input,
        ],
        [caption_output, gpt3_output_v1, caption_output_v1],
    )

    # Disabled event wiring for the disabled buttons, kept for reference:
    # cap_submit_button.click(
    #     caption,
    #     [
    #         image_input,
    #     ],
    #     [caption_output_v1],
    # )
    # gpt3_submit_button.click(
    #     gpt3,
    #     [
    #         chat_input,
    #         caption_output ,
    #         caption_output_v1,
    #     ],
    #     [gpt3_output_v1],
    # )

    # Each example row supplies: image file, question, then values for
    # caption_output_v1, caption_output and gpt3_output_v1 (in that order).
    examples=[['bird.jpeg',"How many birds are there in the tree?","There are two birds in the tree.","2","2"],
              ['qa9.jpg',"What type of vehicle is being pulled by the horses ?",'The vehicle being pulled by the horses is likely a sleigh.','carriage','Sled'],
              ['upload4.jpg',"What is this old man doing?","The old man is fishing.","fishing","Fishing"]]
    examples = gr.Examples(
        examples=examples,inputs=[image_input, chat_input,caption_output_v1,caption_output,gpt3_output_v1],
    )

# Queue requests (one at a time, at most 10 waiting) and start the app.
iface.queue(concurrency_count=1, api_open=False, max_size=10)
iface.launch(enable_queue=True)