Spaces:
Running
on
Zero
Running
on
Zero
File size: 7,047 Bytes
140a766 411cee3 140a766 411cee3 140a766 8e85e31 140a766 411cee3 140a766 7062ece 140a766 967ff26 411cee3 82e8993 411cee3 140a766 411cee3 140a766 411cee3 140a766 411cee3 140a766 411cee3 140a766 411cee3 82e8993 411cee3 82e8993 140a766 82e8993 140a766 82e8993 140a766 411cee3 82e8993 411cee3 1e870d6 a8611ac 411cee3 c7149fd 411cee3 82e8993 55dc83f 1e300d8 e57a9e9 a8611ac 82e8993 1e300d8 82e8993 1e300d8 82e8993 1e300d8 82e8993 1e300d8 140a766 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
import os
import re
import subprocess
import sys
import time

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoProcessor, Idefics2ForConditionalGeneration
# Best-effort install of flash-attn at startup. The return code is deliberately
# not checked, so a failed install does not stop the app from starting.
#
# `env=` REPLACES the child's environment instead of extending it, so the
# current environment is merged in; otherwise PATH, HOME, proxy settings, etc.
# would be missing for pip. `sys.executable -m pip` targets the interpreter
# that is running this app, and the argument-list form avoids going through a
# shell.
# NOTE(review): FLASH_ATTENTION_SKIP_CUDA_BUILD presumably tells the flash-attn
# build to skip compiling its CUDA kernels -- confirm against the package docs.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
)

# Processor that builds the chat prompt and tensors for the model.
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")

# Load the model in bfloat16 and move it to the GPU once, at import time.
model = Idefics2ForConditionalGeneration.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    torch_dtype=torch.bfloat16,
    # Flash attention is currently disabled; uncomment to enable it.
    #_attn_implementation="flash_attention_2",
    trust_remote_code=True,
).to("cuda")
@spaces.GPU(duration=180)
def model_inference(
    image, text, decoding_strategy, temperature,
    max_new_tokens, repetition_penalty, top_p
):
    """Answer a single user query, optionally about an image, with the model.

    Args:
        image: Image to ask about, or ``None`` for a text-only query.
        text: The user's prompt. Must contain non-whitespace characters.
        decoding_strategy: Either ``"Greedy"`` or ``"Top P Sampling"``.
        temperature: Sampling temperature; only used for ``"Top P Sampling"``.
        max_new_tokens: Upper bound on the number of generated tokens.
        repetition_penalty: Repetition penalty passed to ``model.generate``
            for both decoding strategies.
        top_p: Nucleus-sampling mass; only used for ``"Top P Sampling"``.

    Returns:
        The decoded text of the first (and only) generated sequence, without
        the prompt tokens.

    Raises:
        gr.Error: If ``text`` is empty or whitespace-only.
        ValueError: If ``decoding_strategy`` is not a supported value.
    """
    has_image = image is not None

    # gr.Error only reaches the user when it is *raised*; merely constructing
    # it is a no-op and lets an empty query fall through to the model.
    if not text or not text.strip():
        if has_image:
            raise gr.Error("Please input a text query along the image(s).")
        raise gr.Error("Please input a query and optionally image(s).")

    # Validate the strategy up front (explicit raise instead of `assert`,
    # which is stripped under `python -O`) before doing any expensive work.
    generation_args = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
    }
    if decoding_strategy == "Greedy":
        generation_args["do_sample"] = False
    elif decoding_strategy == "Top P Sampling":
        generation_args["do_sample"] = True
        generation_args["temperature"] = temperature
        generation_args["top_p"] = top_p
    else:
        raise ValueError(f"Unsupported decoding strategy: {decoding_strategy!r}")

    # Only emit an image placeholder when an image was actually provided, so
    # that text-only queries do not send `images=[None]` to the processor.
    content = [{"type": "text", "text": text}]
    if has_image:
        content.insert(0, {"type": "image"})
    resulting_messages = [{"role": "user", "content": content}]

    prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
    inputs = processor(
        text=prompt,
        images=[image] if has_image else None,
        return_tensors="pt",
    )
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].size(1)

    # Generate
    generated_ids = model.generate(**inputs, **generation_args)

    # Drop the prompt tokens so that only the newly generated text is decoded.
    generated_texts = processor.batch_decode(
        generated_ids[:, prompt_length:], skip_special_tokens=True
    )
    print("INPUT:", prompt, "|OUTPUT:", generated_texts)
    return generated_texts[0]
# Gradio UI: an image + prompt form, generation hyper-parameters, and examples,
# all wired to `model_inference`.
with gr.Blocks(fill_height=True) as demo:
    gr.Markdown("## IDEFICS2 Instruction 🐶")
    gr.Markdown("Play with [IDEFICS2-8B](https://huggingface.co/HuggingFaceM4/idefics2-8b) in this demo. To get started, upload an image and text or try one of the examples.")
    gr.Markdown("**Important note**: This model is not made for chatting, the chatty IDEFICS2 will be released in the upcoming days. **This model is very strong on various tasks, including visual question answering, document retrieval and more, you can see it through the examples.**")
    gr.Markdown("Learn more about IDEFICS2 in this [blog post](https://huggingface.co/blog/idefics2).")

    with gr.Column():
        image_input = gr.Image(label="Upload your Image", type="pil")
        query_input = gr.Textbox(label="Prompt")
        submit_btn = gr.Button("Submit")
        output = gr.Textbox(label="Output")

    with gr.Accordion(label="Example Inputs and Advanced Generation Parameters"):
        # Each row follows the order of `inference_inputs` below:
        # image path, prompt, decoding strategy, temperature,
        # max_new_tokens, repetition_penalty, top_p.
        examples = [
            ["./example_images/docvqa_example.png", "How many items are sold?", "Greedy", 0.4, 512, 1.2, 0.8],
            ["./example_images/example_images_travel_tips.jpg", "I want to go somewhere similar to the one in the photo. Give me destinations and travel tips.", "Greedy", 0.4, 512, 1.2, 0.8],
            ["./example_images/baklava.png", "Where is this pastry from?", "Greedy", 0.4, 512, 1.2, 0.8],
            ["./example_images/dummy_pdf.png", "How much percent is the order status?", "Greedy", 0.4, 512, 1.2, 0.8],
            ["./example_images/art_critic.png", "As an art critic AI assistant, could you describe this painting in details and make a thorough critic?.", "Greedy", 0.4, 512, 1.2, 0.8],
            ["./example_images/s2w_example.png", "What is this UI about?", "Greedy", 0.4, 512, 1.2, 0.8],
        ]

        # Hyper-parameters for generation
        max_new_tokens = gr.Slider(
            minimum=8,
            maximum=1024,
            value=512,
            step=1,
            interactive=True,
            label="Maximum number of new tokens to generate",
        )
        repetition_penalty = gr.Slider(
            minimum=0.01,
            maximum=5.0,
            value=1.2,
            step=0.01,
            interactive=True,
            label="Repetition penalty",
            info="1.0 is equivalent to no penalty",
        )
        temperature = gr.Slider(
            # Sampling needs a strictly positive temperature; a value of 0.0
            # makes generation fail instead of behaving like greedy decoding.
            minimum=0.1,
            maximum=5.0,
            value=0.4,
            step=0.1,
            interactive=True,
            label="Sampling temperature",
            info="Higher values will produce more diverse outputs.",
        )
        top_p = gr.Slider(
            minimum=0.01,
            maximum=0.99,
            value=0.8,
            step=0.01,
            interactive=True,
            label="Top P",
            info="Higher values is equivalent to sampling more low-probability tokens.",
        )
        decoding_strategy = gr.Radio(
            [
                "Greedy",
                "Top P Sampling",
            ],
            value="Greedy",
            label="Decoding strategy",
            interactive=True,
            # The previous help text was copied from the Top P slider and did
            # not describe this control.
            info="Greedy picks the most likely token at each step; Top P Sampling samples using the temperature and Top P settings.",
        )

        def _toggle_sampling_controls(selection):
            """Show the temperature and Top P sliders only for Top P Sampling."""
            show = selection == "Top P Sampling"
            return gr.Slider(visible=show), gr.Slider(visible=show)

        # Only temperature and Top P are sampling-specific. The repetition
        # penalty is applied for both strategies, so its slider is never hidden.
        decoding_strategy.change(
            fn=_toggle_sampling_controls,
            inputs=decoding_strategy,
            outputs=[temperature, top_p],
        )

        # Shared by the examples and the submit button so the argument order
        # passed to `model_inference` cannot drift between the two.
        inference_inputs = [
            image_input, query_input, decoding_strategy, temperature,
            max_new_tokens, repetition_penalty, top_p,
        ]

        gr.Examples(
            examples=examples,
            inputs=inference_inputs,
            outputs=output,
            fn=model_inference,
        )

    submit_btn.click(model_inference, inputs=inference_inputs, outputs=output)
# Start the Gradio server; debug=True is passed through to `launch` unchanged.
demo.launch(debug=True)