Spaces:

HuggingFaceM4
/

idefics-8b

Running on Zero

App Files Files Community

idefics-8b / app.py

merve HF staff

Update app.py

411cee3 verified 3 months ago

raw

history blame

No virus

6.61 kB

	import os
	import re
	import subprocess
	import sys
	import time

	import gradio as gr
	import spaces
	import torch
	from PIL import Image
	from transformers import AutoProcessor, Idefics2ForConditionalGeneration
	# Install flash-attn at startup, before the model below requests it.
	# FLASH_ATTENTION_SKIP_CUDA_BUILD presumably tells the flash-attn build to skip
	# compiling CUDA kernels -- TODO confirm against the flash-attn setup script.
	#
	# The variable is layered on top of the inherited environment instead of
	# replacing it, so PATH, HOME and proxy settings stay visible to pip. pip is
	# run through the current interpreter with an argument list (no shell), so the
	# package is installed into the environment this app imports from.
	subprocess.run(
	    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
	    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
	    check=False,  # best effort, as before: a failed install does not abort here
	)


	# Single checkpoint id shared by the processor and the model so the two
	# cannot drift apart.
	MODEL_ID = "HuggingFaceM4/idefics2-8b"

	# Processor used by model_inference for chat templating, input preparation
	# and decoding of generated ids.
	processor = AutoProcessor.from_pretrained(MODEL_ID)

	# Load the weights in bfloat16 with FlashAttention-2 and move them to the GPU
	# once, at import time.
	model = Idefics2ForConditionalGeneration.from_pretrained(
	    MODEL_ID,
	    torch_dtype=torch.bfloat16,
	    # `attn_implementation` is the documented from_pretrained keyword; the
	    # underscore-prefixed spelling used previously is a private config field.
	    attn_implementation="flash_attention_2",
	    trust_remote_code=True,
	).to("cuda")

	@spaces.GPU
	def model_inference(
	    image, text, decoding_strategy, temperature,
	    max_new_tokens, repetition_penalty, top_p
	):
	    """Answer a text query, optionally about an image, with the loaded model.

	    Args:
	        image: Image to condition on, or None for a text-only query.
	        text: The user's prompt. Must be non-empty.
	        decoding_strategy: Either "Greedy" or "Top P Sampling".
	        temperature: Sampling temperature; only used for "Top P Sampling".
	        max_new_tokens: Upper bound on the number of generated tokens.
	        repetition_penalty: Repetition penalty passed to ``generate``.
	        top_p: Nucleus sampling threshold; only used for "Top P Sampling".

	    Returns:
	        The generated answer as a string, without the prompt.

	    Raises:
	        gr.Error: If the text query is empty or the decoding strategy is
	            not one of the supported values.
	    """
	    # Validate up front. gr.Error must be *raised* to reach the user; merely
	    # constructing it (as before) let invalid requests fall through.
	    if not text:
	        if image is None:
	            raise gr.Error("Please input a query and optionally image(s).")
	        raise gr.Error("Please input a text query along the image(s).")
	    if decoding_strategy not in ("Greedy", "Top P Sampling"):
	        raise gr.Error(f"Unknown decoding strategy: {decoding_strategy!r}")

	    # Only emit an image placeholder when an image was actually supplied, so
	    # the template and the list of images handed to the processor agree.
	    content = [{"type": "text", "text": text}]
	    images = None
	    if image is not None:
	        content.insert(0, {"type": "image"})
	        images = [image]
	    resulting_messages = [{"role": "user", "content": content}]

	    prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
	    inputs = processor(text=prompt, images=images, return_tensors="pt")
	    inputs = {k: v.to("cuda") for k, v in inputs.items()}

	    generation_args = {
	        "max_new_tokens": max_new_tokens,
	        "repetition_penalty": repetition_penalty,
	    }
	    # Sampling needs a strictly positive temperature; the UI slider allows
	    # 0.0, so treat that as a request for deterministic (greedy) decoding.
	    if decoding_strategy == "Top P Sampling" and temperature > 0:
	        generation_args["do_sample"] = True
	        generation_args["temperature"] = temperature
	        generation_args["top_p"] = top_p
	    else:
	        generation_args["do_sample"] = False

	    # Generate
	    generated_ids = model.generate(**inputs, **generation_args)

	    # Decode only the newly generated tokens. Slicing off the prompt keeps
	    # multi-line answers intact, cannot fail when no "Assistant: " marker is
	    # found, and does not cut off the answer's final character.
	    prompt_length = inputs["input_ids"].shape[1]
	    generated_texts = processor.batch_decode(
	        generated_ids[:, prompt_length:], skip_special_tokens=True
	    )
	    print(generated_texts)

	    return generated_texts[0].strip()


	# Gradio UI: image + prompt inputs, an output box, generation settings,
	# clickable examples, and the wiring to model_inference.
	with gr.Blocks(fill_height=True) as demo:
	    gr.Markdown("## IDEFICS2 Instruction 🐶")
	    gr.Markdown("Play with fine-tuned [IDEFICS2](https://huggingface.co/HuggingFaceM4/idefics2-8b) in this demo. To get started, upload an image and text or try one of the examples.")
	    gr.Markdown("Important note: This model is not made for chatting, the chatty IDEFICS2 will be released in the upcoming days. This model is very strong on various tasks, including visual question answering, document retrieval and more.")
	    gr.Markdown("Learn more about IDEFICS2 in this [blog post](https://huggingface.co/blog/idefics2).")

	    with gr.Row():
	        with gr.Column():
	            image_input = gr.Image(label="Upload your Image", type="pil")
	            query_input = gr.Textbox(label="Prompt")
	            submit_btn = gr.Button("Submit")

	        with gr.Column():
	            output = gr.Textbox(label="Output")

	    with gr.Accordion():
	        # Hyper-parameters for generation
	        max_new_tokens = gr.Slider(
	            minimum=8,
	            maximum=1024,
	            value=512,
	            step=1,
	            interactive=True,
	            label="Maximum number of new tokens to generate",
	        )
	        repetition_penalty = gr.Slider(
	            minimum=0.01,
	            maximum=5.0,
	            value=1.2,
	            step=0.01,
	            interactive=True,
	            label="Repetition penalty",
	            info="1.0 is equivalent to no penalty",
	        )
	        temperature = gr.Slider(
	            minimum=0.0,
	            maximum=5.0,
	            value=0.4,
	            step=0.1,
	            interactive=True,
	            label="Sampling temperature",
	            info="Higher values will produce more diverse outputs.",
	        )
	        top_p = gr.Slider(
	            minimum=0.01,
	            maximum=0.99,
	            value=0.8,
	            step=0.01,
	            interactive=True,
	            label="Top P",
	            info="Higher values is equivalent to sampling more low-probability tokens.",
	        )
	        decoding_strategy = gr.Radio(
	            [
	                "Greedy",
	                "Top P Sampling",
	            ],
	            value="Greedy",
	            label="Decoding strategy",
	            interactive=True,
	            # The previous info text was a copy of the Top P slider's and did
	            # not describe this control.
	            info="Greedy always picks the most likely token; Top P Sampling samples from the most probable tokens.",
	        )

	        def _sampling_only(selection):
	            """Show a slider only while "Top P Sampling" is selected."""
	            return gr.Slider(visible=(selection == "Top P Sampling"))

	        # One handler per slider, all driven by the same visibility rule.
	        # NOTE(review): the sliders start visible even though "Greedy" is the
	        # default, and the repetition penalty is hidden under "Greedy" although
	        # model_inference passes it to generate for both strategies -- confirm
	        # whether that is intended.
	        for sampling_slider in (temperature, repetition_penalty, top_p):
	            decoding_strategy.change(
	                fn=_sampling_only,
	                inputs=decoding_strategy,
	                outputs=sampling_slider,
	            )

	    # Input components in the positional order model_inference expects; shared
	    # by the examples and the submit button so they cannot get out of sync.
	    inference_inputs = [
	        image_input, query_input, decoding_strategy, temperature,
	        max_new_tokens, repetition_penalty, top_p,
	    ]

	    # Every example row uses the same generation settings. Rows must have one
	    # value per entry of inference_inputs (several rows previously omitted the
	    # decoding strategy, shifting the remaining values by one column).
	    example_settings = ["Greedy", 0.4, 512, 1.2, 0.8]
	    example_queries = [
	        ("./example_images/docvqa_example.png", "How many items are sold?"),
	        ("./example_images/s2w_example.png", "What is this UI about?"),
	        ("./example_images/example_images_travel_tips.jpg", "I want to go somewhere similar to the one in the photo. Give me destinations and travel tips."),
	        ("./example_images/chicken_on_money.png", "Can you tell me a very short story based on this image?"),
	        ("./example_images/baklava.png", "Where is this pastry from?"),
	        ("./example_images/dummy_pdf.png", "How much percent is the order status?"),
	        ("./example_images/art_critic.png", "As an art critic AI assistant, could you describe this painting in details and make a thorough critic?."),
	    ]
	    examples = [[path, query, *example_settings] for path, query in example_queries]

	    # Expose the examples in the UI (the list was previously built but never
	    # used). Caching is disabled so startup does not run the model on each
	    # example. NOTE(review): assumes the files under ./example_images exist
	    # next to this script -- verify.
	    gr.Examples(examples=examples, inputs=inference_inputs, cache_examples=False)

	    submit_btn.click(model_inference, inputs=inference_inputs, outputs=output)


	demo.launch(debug=True)