Spaces:

DAMO-NLP-SG
/

Video-LLaMA

Sleeping

Video-LLaMA / video_llama /models /__init__.py

舟勤

45d16e9 about 1 year ago

No virus

5.87 kB

	"""
	Adapted from salesforce@LAVIS Vision-CAIR@MiniGPT-4. Below is the original copyright:
	Copyright (c) 2022, salesforce.com, inc.
	All rights reserved.
	SPDX-License-Identifier: BSD-3-Clause
	For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
	"""

	import logging
	import torch
	from omegaconf import OmegaConf

	from video_llama.common.registry import registry
	from video_llama.models.base_model import BaseModel
	from video_llama.models.blip2 import Blip2Base
	from video_llama.models.video_llama import VideoLLAMA
	from video_llama.processors.base_processor import BaseProcessor


	__all__ = [
	"load_model",
	"BaseModel",
	"Blip2Base",
	"VideoLLAMA"
	]


	def load_model(name, model_type, is_eval=False, device="cpu", checkpoint=None):
	"""
	Load supported models.

	To list all available models and types in registry:
	>>> from video_llama.models import model_zoo
	>>> print(model_zoo)

	Args:
	name (str): name of the model.
	model_type (str): type of the model.
	is_eval (bool): whether the model is in eval mode. Default: False.
	device (str): device to use. Default: "cpu".
	checkpoint (str): path or to checkpoint. Default: None.
	Note that expecting the checkpoint to have the same keys in state_dict as the model.

	Returns:
	model (torch.nn.Module): model.
	"""

	model = registry.get_model_class(name).from_pretrained(model_type=model_type)

	if checkpoint is not None:
	model.load_checkpoint(checkpoint)

	if is_eval:
	model.eval()

	if device == "cpu":
	model = model.float()

	return model.to(device)


	def load_preprocess(config):
	"""
	Load preprocessor configs and construct preprocessors.

	If no preprocessor is specified, return BaseProcessor, which does not do any preprocessing.

	Args:
	config (dict): preprocessor configs.

	Returns:
	vis_processors (dict): preprocessors for visual inputs.
	txt_processors (dict): preprocessors for text inputs.

	Key is "train" or "eval" for processors used in training and evaluation respectively.
	"""

	def _build_proc_from_cfg(cfg):
	return (
	registry.get_processor_class(cfg.name).from_config(cfg)
	if cfg is not None
	else BaseProcessor()
	)

	vis_processors = dict()
	txt_processors = dict()

	vis_proc_cfg = config.get("vis_processor")
	txt_proc_cfg = config.get("text_processor")

	if vis_proc_cfg is not None:
	vis_train_cfg = vis_proc_cfg.get("train")
	vis_eval_cfg = vis_proc_cfg.get("eval")
	else:
	vis_train_cfg = None
	vis_eval_cfg = None

	vis_processors["train"] = _build_proc_from_cfg(vis_train_cfg)
	vis_processors["eval"] = _build_proc_from_cfg(vis_eval_cfg)

	if txt_proc_cfg is not None:
	txt_train_cfg = txt_proc_cfg.get("train")
	txt_eval_cfg = txt_proc_cfg.get("eval")
	else:
	txt_train_cfg = None
	txt_eval_cfg = None

	txt_processors["train"] = _build_proc_from_cfg(txt_train_cfg)
	txt_processors["eval"] = _build_proc_from_cfg(txt_eval_cfg)

	return vis_processors, txt_processors


	def load_model_and_preprocess(name, model_type, is_eval=False, device="cpu"):
	"""
	Load model and its related preprocessors.

	List all available models and types in registry:
	>>> from video_llama.models import model_zoo
	>>> print(model_zoo)

	Args:
	name (str): name of the model.
	model_type (str): type of the model.
	is_eval (bool): whether the model is in eval mode. Default: False.
	device (str): device to use. Default: "cpu".

	Returns:
	model (torch.nn.Module): model.
	vis_processors (dict): preprocessors for visual inputs.
	txt_processors (dict): preprocessors for text inputs.
	"""
	model_cls = registry.get_model_class(name)

	# load model
	model = model_cls.from_pretrained(model_type=model_type)

	if is_eval:
	model.eval()

	# load preprocess
	cfg = OmegaConf.load(model_cls.default_config_path(model_type))
	if cfg is not None:
	preprocess_cfg = cfg.preprocess

	vis_processors, txt_processors = load_preprocess(preprocess_cfg)
	else:
	vis_processors, txt_processors = None, None
	logging.info(
	f"""No default preprocess for model {name} ({model_type}).
	This can happen if the model is not finetuned on downstream datasets,
	or it is not intended for direct use without finetuning.
	"""
	)

	if device == "cpu" or device == torch.device("cpu"):
	model = model.float()

	return model.to(device), vis_processors, txt_processors


	class ModelZoo:
	"""
	A utility class to create string representation of available model architectures and types.

	>>> from video_llama.models import model_zoo
	>>> # list all available models
	>>> print(model_zoo)
	>>> # show total number of models
	>>> print(len(model_zoo))
	"""

	def __init__(self) -> None:
	self.model_zoo = {
	k: list(v.PRETRAINED_MODEL_CONFIG_DICT.keys())
	for k, v in registry.mapping["model_name_mapping"].items()
	}

	def __str__(self) -> str:
	return (
	"=" * 50
	+ "\n"
	+ f"{'Architectures':<30} {'Types'}\n"
	+ "=" * 50
	+ "\n"
	+ "\n".join(
	[
	f"{name:<30} {', '.join(types)}"
	for name, types in self.model_zoo.items()
	]
	)
	)

	def __iter__(self):
	return iter(self.model_zoo.items())

	def __len__(self):
	return sum([len(v) for v in self.model_zoo.values()])


	model_zoo = ModelZoo()