# easyanimate/video_caption/utils/image_captioner_awq.py
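"""Image captioners that wrap multimodal chat models (Qwen-VL-Chat and InternLM-XComposer2-VL)."""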
from pathlib import Path
from typing import Tuple

import auto_gptq
import torch
from auto_gptq.modeling import BaseGPTQForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer

class QwenVLChat:
    def __init__(self, device: str = "cuda:0", quantized: bool = False) -> None:
        if quantized:
            # 4-bit (GPTQ) checkpoint.
            self.model = AutoModelForCausalLM.from_pretrained(
                "Qwen/Qwen-VL-Chat-Int4", device_map=device, trust_remote_code=True
            ).eval()
            self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat-Int4", trust_remote_code=True)
        else:
            # Full checkpoint loaded in fp16.
            self.model = AutoModelForCausalLM.from_pretrained(
                "Qwen/Qwen-VL-Chat", device_map=device, trust_remote_code=True, fp16=True
            ).eval()
            self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True)

    def __call__(self, prompt: str, image: str) -> Tuple[str, str]:
        # Qwen-VL-Chat's tokenizer builds a multimodal query from an (image, text) list.
        query = self.tokenizer.from_list_format([{"image": image}, {"text": prompt}])
        response, history = self.model.chat(self.tokenizer, query=query, history=[])
        return response, history
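
# auto_gptq wrapper describing the InternLM-XComposer2 architecture: modules listed in
# outside_layer_modules stay unquantized, while inside_layer_modules names the linear
# layers inside each transformer block that GPTQ quantizes.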
class InternLMXComposer2QForCausalLM(BaseGPTQForCausalLM):
    layers_block_name = "model.layers"
    outside_layer_modules = [
        "vit",
        "vision_proj",
        "model.tok_embeddings",
        "model.norm",
        "output",
    ]
    inside_layer_modules = [
        ["attention.wqkv.linear"],
        ["attention.wo.linear"],
        ["feed_forward.w1.linear", "feed_forward.w3.linear"],
        ["feed_forward.w2.linear"],
    ]
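
# Captions images with InternLM-XComposer2-VL-7B, using either the 4-bit GPTQ
# checkpoint or the full model cast to fp16.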
class InternLMXComposer2:
    def __init__(self, device: str = "cuda:0", quantized: bool = True):
        if quantized:
            # Allow-list the internlm architecture so auto_gptq accepts the checkpoint.
            auto_gptq.modeling._base.SUPPORTED_MODELS = ["internlm"]
            self.model = InternLMXComposer2QForCausalLM.from_quantized(
                "internlm/internlm-xcomposer2-vl-7b-4bit", trust_remote_code=True, device=device
            ).eval()
            self.tokenizer = AutoTokenizer.from_pretrained(
                "internlm/internlm-xcomposer2-vl-7b-4bit", trust_remote_code=True
            )
        else:
            # Setting fp16=True does not work. See https://huggingface.co/internlm/internlm-xcomposer2-vl-7b/discussions/1.
            self.model = (
                AutoModelForCausalLM.from_pretrained(
                    "internlm/internlm-xcomposer2-vl-7b", device_map=device, trust_remote_code=True
                )
                .eval()
                .to(torch.float16)
            )
            self.tokenizer = AutoTokenizer.from_pretrained(
                "internlm/internlm-xcomposer2-vl-7b", trust_remote_code=True
            )

    def __call__(self, prompt: str, image: str):
        # InternLM-XComposer2 expects the image placeholder token at the start of the query.
        if not prompt.startswith("<ImageHere>"):
            prompt = "<ImageHere>" + prompt
        with torch.cuda.amp.autocast(), torch.no_grad():
            response, history = self.model.chat(self.tokenizer, query=prompt, image=image, history=[], do_sample=False)
        return response, history
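
# Demo: caption every *.jpg / *.png image under demo/ with both captioners.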
if __name__ == "__main__":
    image_folder = "demo/"
    wildcard_list = ["*.jpg", "*.png"]
    image_list = []
    for wildcard in wildcard_list:
        image_list.extend([str(image_path) for image_path in Path(image_folder).glob(wildcard)])

    qwen_vl_chat = QwenVLChat(device="cuda:0", quantized=True)
    qwen_vl_prompt = "Please describe this image in detail."
    for image in image_list:
        response, _ = qwen_vl_chat(qwen_vl_prompt, image)
        print(image, response)

    internlm2_vl = InternLMXComposer2(device="cuda:0", quantized=False)
    internlm2_vl_prompt = "Please describe this image in detail."
    for image in image_list:
        response, _ = internlm2_vl(internlm2_vl_prompt, image)
        print(image, response)