Spaces:

csuhan
/

OneLLM

Sleeping

App Files Files Community

OneLLM / app.py

csuhan

Update app.py

5b036b0 10 months ago

raw

history blame contribute delete

19 kB

	import sys
	import os
	import argparse
	import multiprocessing as mp
	from typing import List, Optional
	from dataclasses import dataclass, field
	from functools import partial
	import glob

	import numpy as np
	import torch
	import torch.distributed as dist
	from fairscale.nn.model_parallel import initialize as fs_init
	import gradio as gr
	from PIL import Image
	import torchvision.transforms as transforms
	from huggingface_hub import hf_hub_download
	import plotly.graph_objects as go
	import torchvision.transforms.functional as F

	from util.misc import setup_for_distributed
	from util.misc import default_tensor_type
	from model.meta import MetaModel
	from data.conversation_lib import conv_templates, SeparatorStyle
	from data.fintune_dataset import make_audio_features
	from data import video_utils
	from data.fintune_dataset import pc_norm

	# Single-image preprocessing pipeline: random resized crop to 224x224,
	# conversion to a tensor, then per-channel normalization.
	T_random_resized_crop = transforms.Compose(
	    [
	        transforms.RandomResizedCrop(
	            size=(224, 224),
	            scale=(0.9, 1.0),
	            ratio=(0.75, 1.3333),
	            interpolation=3,  # 3 is bicubic
	            antialias=None,
	        ),
	        transforms.ToTensor(),
	        transforms.Normalize(
	            mean=[0.48145466, 0.4578275, 0.40821073],
	            std=[0.26862954, 0.26130258, 0.27577711],
	        ),
	    ]
	)

	class PairRandomResizedCrop(transforms.RandomResizedCrop):
	    """RandomResizedCrop variant that applies one shared crop to several images."""

	    def forward(self, imgs):
	        """Crop and resize every image in ``imgs`` with the same window.

	        The crop parameters are sampled once, using the first image, and
	        then reused for each image in the sequence.

	        Returns:
	            A list with one cropped-and-resized image per input image.
	        """
	        top, left, height, width = self.get_params(imgs[0], self.scale, self.ratio)
	        cropped = []
	        for img in imgs:
	            cropped.append(
	                F.resized_crop(
	                    img, top, left, height, width,
	                    self.size, self.interpolation, antialias=self.antialias,
	                )
	            )
	        return cropped

	class PairToTensor(transforms.ToTensor):
	    """ToTensor variant that converts a sequence of pictures."""

	    def __call__(self, pics):
	        """Return a list holding ``F.to_tensor(pic)`` for every picture in ``pics``."""
	        return list(map(F.to_tensor, pics))

	class PairNormalize(transforms.Normalize):
	    """Normalize variant that normalizes a sequence of tensors."""

	    def forward(self, tensors):
	        """Normalize each tensor with this transform's mean/std/inplace settings.

	        Returns:
	            A list with one normalized tensor per input tensor.
	        """
	        normalized = []
	        for tensor in tensors:
	            normalized.append(
	                F.normalize(tensor, self.mean, self.std, self.inplace)
	            )
	        return normalized

	# Paired-image preprocessing pipeline: the same crop is applied to every
	# image in the list, followed by tensor conversion and normalization.
	transform_pairimg_train = transforms.Compose(
	    [
	        PairRandomResizedCrop(
	            size=(224, 224),
	            scale=(0.99, 1.0),
	            ratio=(0.75, 1.3333),
	            interpolation=3,  # 3 is bicubic
	            antialias=None,
	        ),
	        PairToTensor(),
	        PairNormalize(
	            mean=[0.48145466, 0.4578275, 0.40821073],
	            std=[0.26862954, 0.26130258, 0.27577711],
	        ),
	    ]
	)

	def load_audio(audio_path):
	    """Build the audio model input for the file at ``audio_path``.

	    Features are computed by ``make_audio_features`` with 128 mel bins; the
	    first two axes are then swapped and a leading singleton axis is added.
	    """
	    features = make_audio_features(audio_path, mel_bins=128)
	    # Original author's shape note for the result: [1, 128, 1024]
	    return features.transpose(0, 1)[None]

	def load_video(video_path):
	    """Build the video model input for the file at ``video_path``.

	    Delegates to ``video_utils.load_and_transform_video_data`` (5 clips of
	    duration 1) and keeps only index 0 along the third axis of the result.
	    """
	    clips = video_utils.load_and_transform_video_data(
	        video_path, video_path, clip_duration=1, clips_per_video=5,
	    )
	    return clips[:, :, 0]

	def load_point(point_path):
	    """Load a point cloud saved as a NumPy file and normalize it.

	    The array read from ``point_path`` is converted to a torch tensor and
	    passed through ``pc_norm``; the normalized tensor is returned.
	    """
	    raw_points = np.load(point_path)
	    return pc_norm(torch.tensor(raw_points))

	def load_fmri(fmri_path):
	    """Load an fMRI recording saved as a NumPy file.

	    The array is averaged over its first axis and returned as a torch
	    tensor with a new leading singleton axis.
	    """
	    averaged = np.load(fmri_path).mean(axis=0)
	    return torch.tensor(averaged[None])

	def load_rgbx(image_path, x_image_path):
	    """Load an RGB image together with a companion image (e.g. depth or normal map).

	    Both files are opened as RGB, the companion image is resized to the RGB
	    image's size, and both go through ``transform_pairimg_train``.

	    Returns:
	        The two transformed images stacked along a new first axis
	        (original author's shape note: [2, 3, H, W]).
	    """
	    # trick: replace path if 'depth_scaled' in path
	    companion_path = x_image_path.replace('depth_scaled', 'depth')

	    rgb = Image.open(image_path).convert('RGB')
	    companion = Image.open(companion_path).convert('RGB')
	    companion = companion.resize(rgb.size[-2:])

	    rgb_tensor, companion_tensor = transform_pairimg_train([rgb, companion])
	    return torch.stack([rgb_tensor, companion_tensor], dim=0)


	class Ready:
	    """Marker object a model worker puts on the response queue before waiting for a request."""


	def model_worker(
	    rank: int, args: argparse.Namespace, barrier: mp.Barrier,
	    request_queue: mp.Queue, response_queue: Optional[mp.Queue] = None,
	) -> None:
	    """
	    The worker function that manipulates the GPU to run the inference.
	    Exact n_gpu workers are started, with each one operating on a separate GPU.

	    After the model is loaded the worker loops forever: it announces
	    readiness (a ``Ready`` object on ``response_queue``), takes one request
	    tuple from ``request_queue``, and streams the generated response dicts
	    back through ``response_queue``.

	    Args:
	        rank (int): Distributed rank of the worker.
	        args (argparse.Namespace): All command line arguments.
	        barrier (multiprocessing.Barrier): A barrier used to delay the start
	            of Web UI to be after the start of the model.
	        request_queue (mp.Queue): Queue of request tuples (nine input paths,
	            chat history, max_gen_len, temperature, top_p, modality).
	        response_queue (Optional[mp.Queue]): Queue for ``Ready`` markers and
	            streamed responses; when None nothing is sent back.
	    """

	    world_size = len(args.gpu_ids)
	    gpu_id = args.gpu_ids[rank]
	    dist.init_process_group(
	        backend="nccl", rank=rank, world_size=world_size,
	        init_method=f"tcp://{args.master_addr}:{args.master_port}",
	    )
	    print(f"| distributed init on worker {rank}/{world_size}. "
	          f"using gpu: {gpu_id}")
	    fs_init.initialize_model_parallel(world_size)
	    torch.cuda.set_device(gpu_id)

	    torch.manual_seed(1)
	    np.random.seed(1)

	    # set the print behavior.
	    setup_for_distributed(rank == 0)

	    target_dtype = {
	        "bf16": torch.bfloat16,
	        "fp16": torch.float16
	    }[args.dtype]
	    with default_tensor_type(dtype=target_dtype, device="cuda"):
	        model = MetaModel(args.llama_type, args.llama_config, tokenizer_path=args.tokenizer_path)
	    for ckpt_id in range(args.num_ckpts):
	        ckpt_path = hf_hub_download(repo_id=args.pretrained_path, filename=args.ckpt_format.format(str(ckpt_id)))
	        print(f"Loading pretrained weights {ckpt_path}")
	        # NOTE(review): torch.load unpickles the downloaded file; only point
	        # args.pretrained_path at a trusted repository.
	        checkpoint = torch.load(ckpt_path, map_location='cpu')
	        model.load_state_dict(checkpoint, strict=False)
	    model.cuda()
	    model.eval()
	    print(f"Model = {str(model)}")

	    barrier.wait()

	    while True:
	        # Tell the Web UI this worker is idle before blocking on a request.
	        if response_queue is not None:
	            response_queue.put(Ready())
	        (
	            img_path, audio_path, video_path, point_path, fmri_path,
	            depth_path, depth_rgb_path, normal_path, normal_rgb_path,
	            chatbot, max_gen_len, temperature, top_p, modality,
	        ) = request_queue.get()

	        try:
	            inputs = _load_modality_inputs(
	                modality, img_path, audio_path, video_path, point_path,
	                fmri_path, depth_path, depth_rgb_path, normal_path,
	                normal_rgb_path,
	            )
	        except Exception as exc:
	            # Best effort: an unreadable input falls back to a text-only
	            # turn, but the failure is reported instead of being hidden.
	            print(f"Failed to load {modality} input: {exc!r}")
	            inputs = None

	        if inputs is not None:
	            inputs = inputs[None].cuda().to(target_dtype)

	        conv = conv_templates["v1"].copy()
	        for user, bot in chatbot:
	            conv.append_message(conv.roles[0], user)
	            conv.append_message(conv.roles[1], bot)

	        # The separator and the prompt do not change while streaming.
	        conv_sep = (
	            conv.sep
	            if conv.sep_style == SeparatorStyle.SINGLE
	            else conv.sep2
	        )
	        prompt = conv.get_prompt()

	        with torch.cuda.amp.autocast(dtype=target_dtype):
	            print(prompt)
	            for stream_response in model.stream_generate(
	                prompt, inputs,
	                max_gen_len=max_gen_len, temperature=temperature, top_p=top_p,
	                modal=modality,
	            ):
	                if not _trim_stream_response(stream_response, conv_sep):
	                    continue

	                if response_queue is not None:
	                    response_queue.put(stream_response)

	                if stream_response["end_of_content"]:
	                    break


	def _load_modality_inputs(
	    modality, img_path, audio_path, video_path, point_path, fmri_path,
	    depth_path, depth_rgb_path, normal_path, normal_rgb_path,
	):
	    """Load the input for the selected modality, or return None when its file(s) are missing."""
	    if 'image' in modality and img_path is not None:
	        image = Image.open(img_path).convert('RGB')
	        return T_random_resized_crop(image)
	    if 'video' in modality and video_path is not None:
	        return load_video(video_path)
	    if 'audio' in modality and audio_path is not None:
	        return load_audio(audio_path)
	    if 'point' in modality and point_path is not None:
	        return load_point(point_path)
	    if 'fmri' in modality and fmri_path is not None:
	        return load_fmri(fmri_path)
	    if 'rgbd' in modality and depth_path is not None and depth_rgb_path is not None:
	        return load_rgbx(depth_rgb_path, depth_path)
	    if 'rgbn' in modality and normal_path is not None and normal_rgb_path is not None:
	        return load_rgbx(normal_rgb_path, normal_path)
	    return None


	def _trim_stream_response(stream_response, conv_sep):
	    """Cut the streamed text at ``conv_sep`` in place; return False if the piece must be withheld."""
	    end_pos = stream_response["text"].find(conv_sep)
	    if end_pos != -1:
	        stream_response["text"] = (
	            stream_response["text"][:end_pos].rstrip() + "\n"
	        )
	        stream_response["end_of_content"] = True

	    # keep a few characters if not end_of_content to avoid sending
	    # part of conv_sep before all of it is generated.
	    if not stream_response["end_of_content"]:
	        if len(stream_response["text"]) < len(conv_sep):
	            return False
	        stream_response["text"] = (
	            stream_response["text"][:-len(conv_sep)]
	        )
	    return True


	def gradio_worker(
	    request_queues: List[mp.Queue], response_queue: mp.Queue,
	    args: argparse.Namespace, barrier: mp.Barrier,
	) -> None:
	    """
	    The gradio worker is responsible for displaying the WebUI and relay the
	    requests to model workers. It should be launched only once.

	    Args:
	        request_queues (List[mp.Queue]): A list of request queues (one for
	            each model worker).
	        response_queue (mp.Queue): Queue from which ``Ready`` markers and
	            streamed response dicts (keys ``"text"`` and
	            ``"end_of_content"``) are read.
	        args (argparse.Namespace): All command line arguments.
	        barrier (multiprocessing.Barrier): A barrier used to delay the start
	            of Web UI to be after the start of the model.
	    """

	    def show_user_input(msg, chatbot):
	        """Clear the textbox and append the user message as a new, unanswered turn."""
	        return "", chatbot + [[msg, None]]

	    def stream_model_output(img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, gen_t, top_p, modality):
	        """Send one request to every model worker and yield the chat history as the answer streams in."""
	        # Discard queued items until a worker reports it is ready.
	        while True:
	            content_piece = response_queue.get()
	            if isinstance(content_piece, Ready):
	                break
	        request = (img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, gen_t, top_p, modality)
	        for request_queue in request_queues:
	            request_queue.put(request)
	        while True:
	            content_piece = response_queue.get()
	            chatbot[-1][1] = content_piece["text"]
	            yield chatbot
	            if content_piece["end_of_content"]:
	                break

	    def undo(chatbot):
	        """Drop the most recent turn from the chat history."""
	        if len(chatbot) > 0:
	            chatbot = chatbot[:-1]
	        return chatbot

	    def clear():
	        """Return an empty chat history and an empty message."""
	        return [], ""

	    def show_point_cloud(file):
	        """Render the point cloud file as a 3D scatter plot coloured by columns 3-5."""
	        point = load_point(file).numpy()
	        colors = [
	            f'rgb({r},{g},{b})'
	            for r, g, b in zip(point[:, 3], point[:, 4], point[:, 5])
	        ]
	        hidden_axis = dict(visible=False)
	        fig = go.Figure(
	            data=[
	                go.Scatter3d(
	                    x=point[:, 0], y=point[:, 1], z=point[:, 2],
	                    mode='markers',
	                    marker=dict(size=1.2, color=colors),
	                )
	            ],
	            layout=dict(
	                scene=dict(
	                    xaxis=hidden_axis,
	                    yaxis=hidden_axis,
	                    zaxis=hidden_axis,
	                )
	            ),
	        )
	        return fig

	    def change_modality(modal):
	        """Return ``modal`` so a tab selection can write it into the hidden modality box."""
	        return modal

	    CSS = """
	.contain { display: flex; flex-direction: column; }
	#component-0 { height: 100%; }
	#chatbot { flex-grow: 1; overflow: auto;}
	"""

	    header = """
	## OneLLM: One Framework to Align All Modalities with Language
	[[Project Page](https://onellm.csuhan.com)] [[Paper](https://arxiv.org/abs/2312.03700)] [[Code](https://github.com/csuhan/OneLLM)]
	"""

	    # glob returns files in arbitrary order; sort once so the depth/normal
	    # split below is the same on every machine.
	    rgb_examples = sorted(glob.glob("examples/depth_normal/rgb/*.png"))

	    with gr.Blocks(css=CSS, theme=gr.themes.Base()) as demo:
	        gr.Markdown(header)
	        with gr.Row(equal_height=True):
	            # Hidden field holding the modality of the currently selected tab.
	            modality = gr.Textbox(value='image', visible=False)
	            with gr.Column(scale=1):
	                with gr.Tab('Image') as img_tab:
	                    img_path = gr.Image(label='Image Input', type='filepath')
	                    gr.Examples(
	                        examples=[
	                            "examples/new_york.jpg",
	                            "examples/food_menu.png",
	                        ],
	                        inputs=[img_path],
	                    )
	                with gr.Tab('Video') as video_tab:
	                    video_path = gr.Video(label='Video Input', max_length=180)
	                    gr.Examples(
	                        examples=[
	                            "examples/flower.mp4",
	                            "examples/star_kun.mp4",
	                        ],
	                        inputs=[video_path],
	                    )
	                with gr.Tab('Audio') as audio_tab:
	                    audio_path = gr.Audio(label='Audio Input', type='filepath', sources=['upload'])
	                    gr.Examples(
	                        examples=[
	                            "examples/bell_ring.wav",
	                            "examples/bird_audio.wav",
	                        ],
	                        inputs=[audio_path],
	                    )
	                with gr.Tab('Point Cloud') as point_tab:
	                    point_path = gr.File(label='Point Cloud Input', elem_id="pointpath", elem_classes="")
	                    point_vis = gr.Plot()
	                    btn = gr.Button(value="Show Point Cloud")
	                    btn.click(show_point_cloud, point_path, point_vis)
	                    gr.Examples(
	                        examples=glob.glob("examples/point/*.npy"),
	                        inputs=[point_path],
	                        examples_per_page=5,
	                    )
	                with gr.Tab('IMU'):
	                    gr.Markdown('Coming soon🤗')
	                with gr.Tab('fMRI') as fmri_tab:
	                    fmri_path = gr.File(label='fMRI Input', elem_id="fmripath", elem_classes="")
	                    fmri_image_path = gr.Image(label='Reference Image', interactive=False)
	                    gr.Examples(
	                        examples=[
	                            [file.replace('.jpg', '.npy'), file]
	                            for file in glob.glob("examples/fmri/*.jpg")
	                        ],
	                        inputs=[fmri_path, fmri_image_path],
	                        examples_per_page=3,
	                    )
	                with gr.Tab('Depth Map') as depth_tab:
	                    depth_path = gr.Image(label='Depth Map', type='filepath')
	                    depth_rgb_path = gr.Image(label='RGB Image', type='filepath')
	                    gr.Examples(
	                        examples=[
	                            [rgb_image.replace('rgb', 'depth_scaled'), rgb_image]
	                            for rgb_image in rgb_examples[:9]
	                        ],
	                        inputs=[depth_path, depth_rgb_path],
	                        examples_per_page=3,
	                    )
	                with gr.Tab('Normal Map') as normal_tab:
	                    normal_path = gr.Image(label='Normal Map', type='filepath')
	                    normal_rgb_path = gr.Image(label='RGB Image', type='filepath')
	                    gr.Examples(
	                        examples=[
	                            [rgb_image.replace('rgb', 'normal'), rgb_image]
	                            for rgb_image in rgb_examples[9:]
	                        ],
	                        inputs=[normal_path, normal_rgb_path],
	                        examples_per_page=3,
	                    )
	            with gr.Column(scale=2):
	                chatbot = gr.Chatbot(elem_id="chatbot")
	                msg = gr.Textbox()

	        with gr.Row():
	            submit_button = gr.Button("Submit", variant="primary")
	            undo_button = gr.Button("Undo")
	            gr.ClearButton([chatbot, msg, img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, point_vis])
	        with gr.Row():
	            max_gen_len = gr.Slider(
	                minimum=1, maximum=args.model_max_seq_len // 2,
	                value=args.model_max_seq_len // 2, interactive=True,
	                label="Single-turn max response length",
	            )
	            gen_t = gr.Slider(
	                minimum=0, maximum=1, value=0.1, interactive=True,
	                label="Temperature",
	            )
	            top_p = gr.Slider(
	                minimum=0, maximum=1, value=0.75, interactive=True,
	                label="Top-p",
	            )

	        # Selecting a tab records its modality name in the hidden textbox.
	        for tab, modality_name in (
	            (img_tab, 'image'),
	            (video_tab, 'video'),
	            (audio_tab, 'audio'),
	            (point_tab, 'point'),
	            (fmri_tab, 'fmri'),
	            (depth_tab, 'rgbd'),
	            (normal_tab, 'rgbn'),
	        ):
	            tab.select(partial(change_modality, modality_name), [], [modality])

	        # Changing any of these inputs resets the conversation.
	        for file_input in (
	            img_path, audio_path, video_path, point_path,
	            fmri_path, depth_path, normal_path,
	        ):
	            file_input.change(clear, [], [chatbot, msg])

	        # Both the Enter key and the Submit button run the same two steps;
	        # the order of this list matches stream_model_output's parameters.
	        generation_inputs = [img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, gen_t, top_p, modality]
	        for trigger in (msg.submit, submit_button.click):
	            trigger(
	                show_user_input, [msg, chatbot], [msg, chatbot],
	            ).then(
	                stream_model_output, generation_inputs, chatbot,
	            )
	        undo_button.click(undo, chatbot, chatbot)
	    barrier.wait()
	    demo.queue(api_open=True).launch(share=True, max_threads=1)


	@dataclass
	class DemoConfig:
	    """Settings for the demo, used in place of parsed command line arguments.

	    Every setting is an annotated dataclass field, so ``DemoConfig()`` yields
	    the defaults below and individual values can be overridden by keyword.
	    """

	    # GPU ids to run on; one model worker is started per entry.
	    gpu_ids: List[int] = field(default_factory=lambda: [0])
	    tokenizer_path: str = "config/llama2/tokenizer.model"
	    llama_type: str = "onellm"
	    llama_config: str = "config/llama2/7B.json"
	    model_max_seq_len: int = 2048
	    # Passed as repo_id to hf_hub_download when fetching checkpoints.
	    pretrained_path: str = "csuhan/OneLLM-7B-hf"
	    # pretrained_path = "/home/pgao/jiaming/weights/7B_v20_splits/"
	    # Checkpoint filename pattern; "{}" is filled with the shard index.
	    ckpt_format: str = "consolidated.00-of-01.s{}.pth"
	    num_ckpts: int = 10
	    master_port: int = 23863
	    master_addr: str = "127.0.0.1"
	    # Key into the worker's dtype table: "bf16" or "fp16".
	    dtype: str = "fp16"

	if __name__ == "__main__":
	    args = DemoConfig()

	    # using the default "fork" method messes up some imported libs (e.g.,
	    # pandas)
	    # mp.set_start_method("spawn")

	    # setup the queues and start the model workers
	    request_queues = []
	    response_queue = mp.Queue()
	    worker_processes = []
	    # One party per model worker plus the Web UI process itself.
	    barrier = mp.Barrier(len(args.gpu_ids) + 1)
	    try:
	        for rank in range(len(args.gpu_ids)):
	            request_queue = mp.Queue()
	            # Only rank 0 reports back to the Web UI.
	            rank_response_queue = response_queue if rank == 0 else None
	            process = mp.Process(
	                target=model_worker,
	                args=(rank, args, barrier, request_queue, rank_response_queue),
	            )
	            process.start()
	            worker_processes.append(process)
	            request_queues.append(request_queue)

	        gradio_worker(request_queues, response_queue, args, barrier)
	    finally:
	        # The workers loop forever; without this, interpreter shutdown would
	        # block joining them once the Web UI stops or fails.
	        for process in worker_processes:
	            if process.is_alive():
	                process.terminate()
	        for process in worker_processes:
	            process.join()