llava-4bit

Paused

App Files Files Community

llava-4bit / app.py

merve HF staff

Update app.py

780b5ff about 1 year ago

raw

history blame

6.55 kB

	import os
	import string
	import copy
	import gradio as gr
	import PIL.Image
	import torch
	from transformers import BitsAndBytesConfig, pipeline
	import re
	import time

	DESCRIPTION = "# LLaVA 🌋"

	model_id = "llava-hf/llava-1.5-7b-hf"
	quantization_config = BitsAndBytesConfig(
	load_in_4bit=True,
	bnb_4bit_compute_dtype=torch.float16
	)
	pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})



	def extract_response_pairs(text):
	turns = re.split(r'(USER:\|ASSISTANT:)', text)[1:]
	turns = [turn.strip() for turn in turns if turn.strip()]
	print(f"conv turns are {turns[1::2]}")
	conv_list = []
	for i in range(0, len(turns[1::2]), 2):
	if i + 1 < len(turns[1::2]):
	conv_list.append([turns[1::2][i].lstrip(":"), turns[1::2][i + 1].lstrip(":")])

	return conv_list



	def add_text(history, text):
	history = history.append([text, None])
	return history, text

	def infer(image, prompt,
	temperature,
	length_penalty,
	repetition_penalty,
	max_length,
	min_length,
	top_p):

	outputs = pipe(images=image, prompt=prompt,
	generate_kwargs={"temperature":temperature,
	"length_penalty":length_penalty,
	"repetition_penalty":repetition_penalty,
	"max_length":max_length,
	"min_length":min_length,
	"top_p":top_p})
	inference_output = outputs[0]["generated_text"]
	return inference_output



	def bot(history_chat, text_input, image,
	temperature,
	length_penalty,
	repetition_penalty,
	max_length,
	min_length,
	top_p):
	chat_history = " ".join(history_chat) # history as a str to be passed to model
	chat_history = chat_history + f"USER: <image>\n{text_input}\nASSISTANT:" # add text input for prompting


	inference_result = infer(image, chat_history,
	temperature,
	length_penalty,
	repetition_penalty,
	max_length,
	min_length,
	top_p)
	# return inference and parse for new history
	chat_val = extract_response_pairs(inference_result)

	# create history list for yielding the last inference response
	chat_state_list = copy.deepcopy(chat_val)
	chat_state_list[-1][1] = "" # empty last response

	# add characters iteratively

	for character in chat_val[-1][1]:
	chat_state_list[-1][1] += character
	time.sleep(0.05)
	# yield history but with last response being streamed
	print(chat_state_list)
	yield chat_state_list


	css = """
	#mkd {
	height: 500px;
	overflow: auto;
	border: 1px solid #ccc;
	}
	"""
	with gr.Blocks(css="style.css") as demo:
	gr.Markdown(DESCRIPTION)
	gr.Markdown("""## LLaVA, one of the greatest multimodal chat models is now available in Transformers with 4-bit quantization! ⚡️
	See the docs here: https://huggingface.co/docs/transformers/main/en/model_doc/llava.""")
	chatbot = gr.Chatbot(label="Chat", show_label=False)
	gr.Markdown("Input image and text and start chatting 👇")
	with gr.Row():

	image = gr.Image(type="pil")
	text_input = gr.Text(label="Chat Input", show_label=False, max_lines=3, container=False)

	history_chat = gr.State(value=[])

	with gr.Accordion(label="Advanced settings", open=False):
	temperature = gr.Slider(
	label="Temperature",
	info="Used with nucleus sampling.",
	minimum=0.5,
	maximum=1.0,
	step=0.1,
	value=1.0,
	)
	length_penalty = gr.Slider(
	label="Length Penalty",
	info="Set to larger for longer sequence, used with beam search.",
	minimum=-1.0,
	maximum=2.0,
	step=0.2,
	value=1.0,
	)
	repetition_penalty = gr.Slider(
	label="Repetition Penalty",
	info="Larger value prevents repetition.",
	minimum=1.0,
	maximum=5.0,
	step=0.5,
	value=1.5,
	)
	max_length = gr.Slider(
	label="Max Length",
	minimum=1,
	maximum=500,
	step=1,
	value=200,
	)
	min_length = gr.Slider(
	label="Minimum Length",
	minimum=1,
	maximum=100,
	step=1,
	value=1,
	)
	top_p = gr.Slider(
	label="Top P",
	info="Used with nucleus sampling.",
	minimum=0.5,
	maximum=1.0,
	step=0.1,
	value=0.9,
	)
	chat_output = [
	chatbot,
	history_chat
	]


	chat_inputs = [
	image,
	text_input,
	temperature,
	length_penalty,
	repetition_penalty,
	max_length,
	min_length,
	top_p,
	history_chat
	]
	with gr.Row():
	clear_chat_button = gr.Button("Clear")
	cancel_btn = gr.Button("Stop Generation")
	chat_button = gr.Button("Submit", variant="primary")

	chat_event1 = chat_button.click(add_text, [chatbot, text_input], [chatbot, text_input]).then(bot, [chatbot, text_input,
	image, temperature,
	length_penalty,
	repetition_penalty,
	max_length,
	min_length,
	top_p], chatbot)

	chat_event2 = text_input.submit(
	add_text,
	[chatbot, text_input],
	[chatbot, text_input]
	).then(
	fn=bot,
	inputs=[chatbot, text_input, image, temperature,
	length_penalty,
	repetition_penalty,
	max_length,
	min_length,
	top_p],
	outputs=chatbot
	)
	clear_chat_button.click(
	fn=lambda: ([], []),
	inputs=None,
	outputs=[
	chatbot,
	history_chat
	],
	queue=False,
	api_name="clear",
	)
	image.change(
	fn=lambda: ([], []),
	inputs=None,
	outputs=[
	chatbot,
	history_chat
	],
	queue=False)
	cancel_btn.click(
	None, [], [],
	cancels=[chat_event1, chat_event2]
	)
	examples = [["./examples/baklava.png", "How to make this pastry?"],["./examples/bee.png","Describe this image."]]
	gr.Examples(examples=examples, inputs=[image, text_input, chat_inputs])




	if __name__ == "__main__":
	demo.queue(max_size=10).launch(debug=True)