# Page-status residue captured together with this file ("Spaces: Runtime error",
# reported twice); it is not part of the program.
import sys | |
import os | |
import argparse | |
import multiprocessing as mp | |
import numpy as np | |
from typing import List, Optional | |
import torch | |
import torch.distributed as dist | |
from fairscale.nn.model_parallel import initialize as fs_init | |
import gradio as gr | |
from util.misc import setup_for_distributed | |
from util.misc import default_tensor_type | |
from model.meta import MetaModel | |
from data.conversation_lib import conv_templates, SeparatorStyle | |
from PIL import Image | |
import torchvision.transforms as transforms | |
from data.fintune_dataset import make_audio_features | |
from data import video_utils | |
from dataclasses import dataclass | |
from huggingface_hub import hf_hub_download | |
import plotly.graph_objects as go | |
from data.fintune_dataset import pc_norm | |
from functools import partial | |
import glob | |
import torchvision.transforms.functional as F | |
# Preprocessing pipeline for a single image: random resized crop to 224x224,
# conversion to a tensor, then per-channel normalization.
T_random_resized_crop = transforms.Compose(
    [
        transforms.RandomResizedCrop(
            size=(224, 224),
            scale=(0.9, 1.0),
            ratio=(0.75, 1.3333),
            interpolation=3,  # 3 is bicubic
            antialias=None,
        ),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.48145466, 0.4578275, 0.40821073],
            std=[0.26862954, 0.26130258, 0.27577711],
        ),
    ]
)
class PairRandomResizedCrop(transforms.RandomResizedCrop):
    """Random resized crop that applies one shared crop window to several images."""

    def forward(self, imgs):
        """Crop and resize every image in ``imgs`` with the same parameters.

        The crop parameters are sampled once, from the first image only, and
        then reused for each element.

        Args:
            imgs: Sequence of images; ``imgs[0]`` is used for sampling.

        Returns:
            list: One cropped and resized image per input, in input order.
        """
        top, left, height, width = self.get_params(imgs[0], self.scale, self.ratio)
        cropped = []
        for img in imgs:
            cropped.append(
                F.resized_crop(
                    img, top, left, height, width,
                    self.size, self.interpolation, antialias=self.antialias,
                )
            )
        return cropped
class PairToTensor(transforms.ToTensor):
    """``ToTensor`` variant that converts a sequence of pictures element-wise."""

    def __call__(self, pics):
        """Return a list holding ``F.to_tensor(pic)`` for each picture in ``pics``."""
        return list(map(F.to_tensor, pics))
class PairNormalize(transforms.Normalize):
    """``Normalize`` variant that normalizes a sequence of tensors element-wise."""

    def forward(self, tensors):
        """Normalize each tensor with this transform's mean, std and inplace flag.

        Returns:
            list: One normalized tensor per input, in input order.
        """
        normalized = []
        for tensor in tensors:
            normalized.append(F.normalize(tensor, self.mean, self.std, self.inplace))
        return normalized
# Preprocessing pipeline for a list of images that must share one crop window:
# paired random resized crop to 224x224, tensor conversion, normalization.
transform_pairimg_train = transforms.Compose(
    [
        PairRandomResizedCrop(
            size=(224, 224),
            scale=(0.99, 1.0),
            ratio=(0.75, 1.3333),
            interpolation=3,  # 3 is bicubic
            antialias=None,
        ),
        PairToTensor(),
        PairNormalize(
            mean=[0.48145466, 0.4578275, 0.40821073],
            std=[0.26862954, 0.26130258, 0.27577711],
        ),
    ]
)
def load_audio(audio_path):
    """Build the model input for an audio file.

    Features are computed by ``make_audio_features`` with 128 mel bins, the
    first two axes are swapped and a leading singleton axis is added.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        The feature tensor; the original author's note gives its shape as
        [1, 128, 1024].
    """
    features = make_audio_features(audio_path, mel_bins=128)
    return features.transpose(0, 1)[None]
def load_video(video_path):
    """Build the model input for a video file.

    The video is processed by ``video_utils.load_and_transform_video_data``
    (1-second clips, 5 clips per video); only index 0 along the third axis of
    the result is kept.

    Args:
        video_path: Path of the video file; passed as both path arguments of
            the helper.

    Returns:
        The selected slice of the transformed video data.
    """
    clips = video_utils.load_and_transform_video_data(
        video_path, video_path, clip_duration=1, clips_per_video=5,
    )
    return clips[:, :, 0]
def load_point(point_path):
    """Load a point cloud stored with NumPy and normalize it with ``pc_norm``.

    Args:
        point_path: Path (or file object accepted by ``np.load``) of the array.

    Returns:
        The result of ``pc_norm`` applied to the loaded data as a torch tensor.
    """
    raw_points = np.load(point_path)
    return pc_norm(torch.tensor(raw_points))
def load_fmri(fmri_path):
    """Load an fMRI array, average it over axis 0 and add a leading axis.

    Args:
        fmri_path: Path of the ``.npy`` file to load.

    Returns:
        torch.Tensor: The axis-0 mean of the stored array with one extra
        leading dimension of size 1.
    """
    averaged = np.load(fmri_path).mean(axis=0)
    return torch.tensor(averaged[None])
def load_rgbx(image_path, x_image_path):
    """Load an RGB image and a companion image and stack them into one tensor.

    Both files are opened as RGB, the companion image is resized to the RGB
    image's size, and the pair goes through ``transform_pairimg_train`` so
    that both receive the same crop.

    Args:
        image_path: Path of the RGB image.
        x_image_path: Path of the companion image (depth or normal map in
            this file's callers).

    Returns:
        torch.Tensor: The two transformed images stacked along a new first
        axis, i.e. [2, 3, H, W].
    """
    # trick: replace path if 'depth_scaled' in path
    companion_path = x_image_path.replace('depth_scaled', 'depth')
    rgb = Image.open(image_path).convert('RGB')
    companion = Image.open(companion_path).convert('RGB')
    # PIL's ``size`` is already a 2-tuple, so it can be passed on directly.
    companion = companion.resize(rgb.size)
    rgb_tensor, companion_tensor = transform_pairimg_train([rgb, companion])
    # [2, 3, H, W]
    return torch.stack([rgb_tensor, companion_tensor], dim=0)
class Ready:
    """Marker object placed on the response queue by a model worker.

    The worker puts one instance on the queue before it waits for the next
    request; the web UI side reads the queue until it sees an instance.
    """
def _load_modal_inputs(
    modality, img_path, audio_path, video_path, point_path, fmri_path,
    depth_path, depth_rgb_path, normal_path, normal_rgb_path,
):
    """Load the input for the requested modality; None if no usable file was given.

    The checks run in a fixed order (image, video, audio, point, fmri, rgbd,
    rgbn) and each is a substring test on ``modality``, so the first match wins.
    """
    if 'image' in modality and img_path is not None:
        return T_random_resized_crop(Image.open(img_path).convert('RGB'))
    if 'video' in modality and video_path is not None:
        return load_video(video_path)
    if 'audio' in modality and audio_path is not None:
        return load_audio(audio_path)
    if 'point' in modality and point_path is not None:
        return load_point(point_path)
    if 'fmri' in modality and fmri_path is not None:
        return load_fmri(fmri_path)
    if 'rgbd' in modality and depth_path is not None and depth_rgb_path is not None:
        return load_rgbx(depth_rgb_path, depth_path)
    if 'rgbn' in modality and normal_path is not None and normal_rgb_path is not None:
        return load_rgbx(normal_rgb_path, normal_path)
    return None


def _trim_stream_response(stream_response, conv_sep):
    """Cut ``conv_sep`` out of a streamed chunk in place; return False to withhold it.

    If the separator occurs in the text, the text is truncated there and the
    chunk is marked as the end of the content. Otherwise the last
    ``len(conv_sep)`` characters are held back so that a partially generated
    separator is never sent; chunks shorter than the separator are withheld.
    """
    text = stream_response["text"]
    end_pos = text.find(conv_sep)
    if end_pos != -1:
        stream_response["text"] = text[:end_pos].rstrip() + "\n"
        stream_response["end_of_content"] = True
    # keep a few characters if not end_of_content to avoid sending
    # part of conv_sep before all of it is generated.
    if not stream_response["end_of_content"]:
        if len(stream_response["text"]) < len(conv_sep):
            return False
        stream_response["text"] = stream_response["text"][:-len(conv_sep)]
    return True


def model_worker(
    rank: int, args: argparse.Namespace, barrier: mp.Barrier,
    request_queue: mp.Queue, response_queue: Optional[mp.Queue] = None,
) -> None:
    """
    The worker function that manipulates the GPU to run the inference.
    Exact n_gpu workers are started, with each one operating on a separate GPU.

    Args:
        rank (int): Distributed rank of the worker.
        args (argparse.Namespace): All command line arguments.
        barrier (multiprocessing.Barrier): A barrier used to delay the start
            of Web UI to be after the start of the model.
        request_queue (multiprocessing.Queue): Queue from which this worker
            reads one 14-element request tuple per generation.
        response_queue (Optional[multiprocessing.Queue]): Queue that receives
            a ``Ready`` marker before each request is read and then the
            streamed responses. When None, nothing is reported.

    This function loops forever and never returns.
    """
    world_size = len(args.gpu_ids)
    gpu_id = args.gpu_ids[rank]
    dist.init_process_group(
        backend="nccl", rank=rank, world_size=world_size,
        init_method=f"tcp://{args.master_addr}:{args.master_port}",
    )
    print(f"| distributed init on worker {rank}/{world_size}. "
          f"using gpu: {gpu_id}")
    fs_init.initialize_model_parallel(world_size)
    torch.cuda.set_device(gpu_id)
    torch.manual_seed(1)
    np.random.seed(1)
    # set the print behavior.
    setup_for_distributed(rank == 0)
    target_dtype = {
        "bf16": torch.bfloat16,
        "fp16": torch.float16
    }[args.dtype]
    with default_tensor_type(dtype=target_dtype, device="cuda"):
        model = MetaModel(args.llama_type, args.llama_config, tokenizer_path=args.tokenizer_path)
    for ckpt_id in range(args.num_ckpts):
        ckpt_path = hf_hub_download(repo_id=args.pretrained_path, filename=args.ckpt_format.format(str(ckpt_id)))
        # ckpt_path = os.path.join(args.pretrained_path, args.ckpt_format.format(str(ckpt_id)))
        print(f"Loading pretrained weights {ckpt_path}")
        # NOTE(review): torch.load unpickles the downloaded file; only point
        # ``pretrained_path`` at a repository you trust.
        checkpoint = torch.load(ckpt_path, map_location='cpu')
        # strict=False: each shard is loaded as a partial state dict.
        model.load_state_dict(checkpoint, strict=False)
    model.cuda()
    model.eval()
    print(f"Model = {str(model)}")
    barrier.wait()
    while True:
        # Tell the UI side that this worker is idle and will read a request.
        if response_queue is not None:
            response_queue.put(Ready())
        (
            img_path, audio_path, video_path, point_path, fmri_path,
            depth_path, depth_rgb_path, normal_path, normal_rgb_path,
            chatbot, max_gen_len, temperature, top_p, modality,
        ) = request_queue.get()
        inputs = _load_modal_inputs(
            modality, img_path, audio_path, video_path, point_path, fmri_path,
            depth_path, depth_rgb_path, normal_path, normal_rgb_path,
        )
        if inputs is not None:
            # Add a leading batch axis and move to the GPU in the target dtype.
            inputs = inputs[None].cuda().to(target_dtype)
        conv = conv_templates["v1"].copy()
        for user, bot in chatbot:
            conv.append_message(conv.roles[0], user)
            conv.append_message(conv.roles[1], bot)
        # The separator does not change while generating, so pick it once.
        conv_sep = (
            conv.sep
            if conv.sep_style == SeparatorStyle.SINGLE
            else conv.sep2
        )
        with torch.cuda.amp.autocast(dtype=target_dtype):
            prompt = conv.get_prompt()
            print(prompt)
            for stream_response in model.stream_generate(
                prompt, inputs,
                max_gen_len=max_gen_len, temperature=temperature, top_p=top_p,
                modal=modality,
            ):
                if not _trim_stream_response(stream_response, conv_sep):
                    continue
                if response_queue is not None:
                    response_queue.put(stream_response)
                if stream_response["end_of_content"]:
                    break
def gradio_worker(
    request_queues: List[mp.Queue], response_queue: mp.Queue,
    args: argparse.Namespace, barrier: mp.Barrier,
) -> None:
    """
    The gradio worker is responsible for displaying the WebUI and relay the
    requests to model workers. It should be launched only once.

    Args:
        request_queues (List[mp.Queue]): A list of request queues (one for
            each model worker).
        response_queue (mp.Queue): Queue from which ``Ready`` markers and
            streamed responses are read.
        args (argparse.Namespace): All command line arguments.
        barrier (multiprocessing.Barrier): A barrier used to delay the start
            of Web UI to be after the start of the model.
    """

    def show_user_input(msg, chatbot):
        # Clear the textbox and append the new user turn with an empty reply.
        return "", chatbot + [[msg, None]]

    def stream_model_output(img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, gen_t, top_p, modality):
        # Read the response queue until a Ready marker shows up; anything
        # read before it is discarded.
        while True:
            content_piece = response_queue.get()
            if isinstance(content_piece, Ready):
                break
        request = (img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, gen_t, top_p, modality)
        # Every model worker receives the same request.
        for queue in request_queues:
            queue.put(request)
        while True:
            content_piece = response_queue.get()
            chatbot[-1][1] = content_piece["text"]
            yield chatbot
            if content_piece["end_of_content"]:
                break

    def undo(chatbot):
        if len(chatbot) > 0:
            chatbot = chatbot[:-1]
        return chatbot

    def clear():
        chatbot = []
        msg = ""
        return chatbot, msg

    def show_point_cloud(file):
        # Nothing uploaded yet: return no figure instead of failing in np.load.
        if file is None:
            return None
        point = load_point(file).numpy()
        fig = go.Figure(
            data=[
                go.Scatter3d(
                    x=point[:, 0], y=point[:, 1], z=point[:, 2],
                    mode='markers',
                    marker=dict(
                        size=1.2,
                        color=['rgb({},{},{})'.format(r, g, b) for r, g, b in zip(point[:, 3], point[:, 4], point[:, 5])]
                    ))],
            layout=dict(
                scene=dict(
                    xaxis=dict(visible=False),
                    yaxis=dict(visible=False),
                    zaxis=dict(visible=False)
                )),)
        return fig

    def change_modality(modal):
        return modal

    CSS = (
        "\n"
        ".contain { display: flex; flex-direction: column; }\n"
        "#component-0 { height: 100%; }\n"
        "#chatbot { flex-grow: 1; overflow: auto;}\n"
    )
    header = (
        "\n"
        "## OneLLM: One Framework to Align All Modalities with Language\n"
        "[[Project Page](https://onellm.csuhan.com)] [[Paper](https://arxiv.org/abs/2312.03700)] [[Code](https://github.com/csuhan/OneLLM)]\n"
    )

    # glob order is arbitrary; list the RGB examples once and sort them so
    # the depth ([:9]) and normal ([9:]) tabs split the same, stable listing.
    rgb_examples = sorted(glob.glob("examples/depth_normal/rgb/*.png"))

    # NOTE(review): the layout nesting below was reconstructed from the
    # statement order; confirm it against the intended page layout.
    with gr.Blocks(css=CSS, theme=gr.themes.Base()) as demo:
        gr.Markdown(header)
        with gr.Row(equal_height=True):
            # Hidden field that tracks which input tab is selected.
            modality = gr.Textbox(value='image', visible=False)
            with gr.Column(scale=1):
                with gr.Tab('Image') as img_tab:
                    img_path = gr.Image(label='Image Input', type='filepath')
                    gr.Examples(
                        examples=[
                            "examples/new_york.jpg",
                            "examples/food_menu.png",
                        ],
                        inputs=[img_path],
                    )
                with gr.Tab('Video') as video_tab:
                    video_path = gr.Video(label='Video Input', max_length=180)
                    gr.Examples(
                        examples=[
                            "examples/flower.mp4",
                            "examples/star_kun.mp4",
                        ],
                        inputs=[video_path],
                    )
                with gr.Tab('Audio') as audio_tab:
                    audio_path = gr.Audio(label='Audio Input', type='filepath', sources=['upload'])
                    gr.Examples(
                        examples=[
                            "examples/bell_ring.wav",
                            "examples/bird_audio.wav",
                        ],
                        inputs=[audio_path],
                    )
                with gr.Tab('Point Cloud') as point_tab:
                    point_path = gr.File(label='Point Cloud Input', elem_id="pointpath", elem_classes="")
                    point_vis = gr.Plot()
                    btn = gr.Button(value="Show Point Cloud")
                    btn.click(show_point_cloud, point_path, point_vis)
                    gr.Examples(
                        examples=glob.glob("examples/point/*.npy"),
                        inputs=[point_path],
                        examples_per_page=5,
                    )
                with gr.Tab('IMU') as imu_tab:
                    gr.Markdown('Coming soon🤗')
                with gr.Tab('fMRI') as fmri_tab:
                    fmri_path = gr.File(label='fMRI Input', elem_id="fmripath", elem_classes="")
                    fmri_image_path = gr.Image(label='Reference Image', interactive=False)
                    gr.Examples(
                        examples=[
                            [file.replace('.jpg', '.npy'), file]
                            for file in glob.glob("examples/fmri/*.jpg")
                        ],
                        inputs=[fmri_path, fmri_image_path],
                        examples_per_page=3,
                    )
                with gr.Tab('Depth Map') as depth_tab:
                    depth_path = gr.Image(label='Depth Map', type='filepath')
                    depth_rgb_path = gr.Image(label='RGB Image', type='filepath')
                    gr.Examples(
                        examples=[
                            [rgb_image.replace('rgb', 'depth_scaled'), rgb_image]
                            for rgb_image in rgb_examples[:9]
                        ],
                        inputs=[depth_path, depth_rgb_path],
                        examples_per_page=3,
                    )
                with gr.Tab('Normal Map') as normal_tab:
                    normal_path = gr.Image(label='Normal Map', type='filepath')
                    normal_rgb_path = gr.Image(label='RGB Image', type='filepath')
                    gr.Examples(
                        examples=[
                            [rgb_image.replace('rgb', 'normal'), rgb_image]
                            for rgb_image in rgb_examples[9:]
                        ],
                        inputs=[normal_path, normal_rgb_path],
                        examples_per_page=3,
                    )
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(elem_id="chatbot")
                msg = gr.Textbox()
        with gr.Row():
            submit_button = gr.Button("Submit", variant="primary")
            undo_button = gr.Button("Undo")
            clear_button = gr.ClearButton([chatbot, msg, img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, point_vis])
        with gr.Row():
            max_gen_len = gr.Slider(
                minimum=1, maximum=args.model_max_seq_len // 2,
                value=args.model_max_seq_len // 2, interactive=True,
                label="Single-turn max response length",
            )
            gen_t = gr.Slider(
                minimum=0, maximum=1, value=0.1, interactive=True,
                label="Temperature",
            )
            top_p = gr.Slider(
                minimum=0, maximum=1, value=0.75, interactive=True,
                label="Top-p",
            )

        # Selecting a tab stores its modality tag in the hidden textbox.
        img_tab.select(partial(change_modality, 'image'), [], [modality])
        video_tab.select(partial(change_modality, 'video'), [], [modality])
        audio_tab.select(partial(change_modality, 'audio'), [], [modality])
        point_tab.select(partial(change_modality, 'point'), [], [modality])
        fmri_tab.select(partial(change_modality, 'fmri'), [], [modality])
        depth_tab.select(partial(change_modality, 'rgbd'), [], [modality])
        normal_tab.select(partial(change_modality, 'rgbn'), [], [modality])

        # Components fed to stream_model_output, in its parameter order;
        # shared by the Enter-key and Submit-button handlers.
        generation_inputs = [img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, gen_t, top_p, modality]
        msg.submit(
            show_user_input, [msg, chatbot], [msg, chatbot],
        ).then(
            stream_model_output, generation_inputs, chatbot,
        )
        submit_button.click(
            show_user_input, [msg, chatbot], [msg, chatbot],
        ).then(
            stream_model_output, generation_inputs, chatbot,
        )
        undo_button.click(undo, chatbot, chatbot)
        # img_path.change(clear, [], [chatbot, msg])
    # Wait until every model worker has finished loading before serving.
    barrier.wait()
    demo.queue(api_open=True).launch(share=True, max_threads=1)
class DemoConfig:
    """Fixed settings for the demo, used in place of parsed command-line arguments."""

    # --- devices and distributed setup ---
    gpu_ids = [0]  # one model worker is started per entry
    master_addr = "127.0.0.1"
    master_port = 23863
    dtype = "fp16"  # key into the worker's dtype table ("bf16" or "fp16")

    # --- model definition ---
    llama_type = "onellm"
    llama_config = "config/llama2/7B.json"
    tokenizer_path = "config/llama2/tokenizer.model"
    model_max_seq_len = 2048

    # --- pretrained weights ---
    # Repository id handed to hf_hub_download; the weights are split into
    # ``num_ckpts`` files named by ``ckpt_format`` with the shard index filled in.
    pretrained_path = "csuhan/OneLLM-7B-hf"
    ckpt_format = "consolidated.00-of-01.s{}.pth"
    num_ckpts = 10
if __name__ == "__main__":
    args = DemoConfig()
    # using the default "fork" method messes up some imported libs (e.g.,
    # pandas)
    # mp.set_start_method("spawn")
    # setup the queues and start the model workers
    request_queues = []
    response_queue = mp.Queue()
    worker_processes = []
    # One party per model worker plus one for the web UI.
    barrier = mp.Barrier(len(args.gpu_ids) + 1)
    for rank in range(len(args.gpu_ids)):
        request_queue = mp.Queue()
        # Only rank 0 reports back to the UI.
        rank_response_queue = response_queue if rank == 0 else None
        process = mp.Process(
            target=model_worker,
            args=(rank, args, barrier, request_queue, rank_response_queue),
        )
        process.start()
        worker_processes.append(process)
        request_queues.append(request_queue)
    try:
        gradio_worker(request_queues, response_queue, args, barrier)
    finally:
        # The workers loop forever; stop them when the UI exits or fails so
        # they do not linger and keep holding their GPUs.
        for process in worker_processes:
            if process.is_alive():
                process.terminate()
        for process in worker_processes:
            process.join(timeout=5)