import time

from typing import List, Optional, Tuple, Union

import numpy as np
from loguru import logger as eval_logger
from PIL import Image
from tqdm import tqdm

from lmms_eval import utils
from lmms_eval.api.instance import Instance
from lmms_eval.api.registry import register_model
from lmms_eval.models.model_utils.gen_metrics import log_metrics
from lmms_eval.models.model_utils.reasoning_model_utils import (
    parse_reasoning_model_answer,
)
from lmms_eval.models.simple.qwen2_5_vl import Qwen2_5_VL as Qwen2_5_VLSimple
from lmms_eval.protocol import ChatMessages

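# qwen_vl_utils provides the vision preprocessing helper used below; it is an
# optional dependency, so only warn at import time if it is missing.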
try:
    from qwen_vl_utils import process_vision_info
except ImportError:
    eval_logger.warning("Failed to import qwen_vl_utils; Please install it via `pip install qwen-vl-utils`")


@register_model("qwen2_5_vl_chat")
class Qwen2_5_VL(Qwen2_5_VLSimple):
    is_simple = False

    def generate_until(self, requests: List[Instance]) -> List[str]:
        res = []

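        # The Collator regroups requests so that each batch shares one set of
        # generation kwargs (args[2]); _collate simply keys a request by its
        # context.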
        def _collate(x):
            return x[0], x[0]

        re_ords = utils.Collator([reg.args for reg in requests], _collate, group_fn=lambda x: x[2], grouping=True)
        chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
        num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1
        pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding")
        e2e_latency = 0
        total_tokens = 0
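        # Each chunk is one batch: resolve every doc into chat messages, then
        # split out the image and video attachments.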
        for chunk in chunks:
            ctx, doc_to_messages, all_gen_kwargs, doc_id, task, split = zip(*chunk)
            chat_messages = [doc_to_messages[idx](self.task_dict[task][split][ids]) for idx, (ids, task, split) in enumerate(zip(doc_id, task, split))]
            chat_messages: List[ChatMessages] = [ChatMessages(**{"messages": message}) for message in chat_messages]
            visuals = []
            videos = []
            for messages in chat_messages:
                visual, video, _ = messages.extract_media()
                visuals.append(visual)
                videos.append(video)
            visuals = self.flatten(visuals)
            videos = self.flatten(videos)
            gen_kwargs = all_gen_kwargs[0]

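            # Video frame sampling: prefer a fixed fps when configured, otherwise
            # cap the number of frames at max_num_frames.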
            video_kwargs = {
                "max_pixels": self.max_pixels,
                "min_pixels": self.min_pixels,
            }
            if self.fps is not None:
                video_kwargs["fps"] = self.fps
            else:
                video_kwargs["nframes"] = self.max_num_frames
            batched_messages = [chat_message.to_hf_messages(video_kwargs=video_kwargs) for chat_message in chat_messages]
            texts = [self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batched_messages]
            image_inputs, video_inputs = process_vision_info(batched_messages)
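            # If a video is present, uniformly subsample it down to max_num_frames
            # and make sure the last frame is included.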
            if video_inputs is not None:
                total_frames = video_inputs[0].shape[0]
                indices = np.linspace(0, total_frames - 1, self.max_num_frames, dtype=int)
                if total_frames - 1 not in indices:
                    indices = np.append(indices, total_frames - 1)
                video_inputs[0] = video_inputs[0][indices]
            inputs = self.processor(text=texts, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")

            if self.device_map == "auto":
                inputs = inputs.to("cuda")
            else:
                inputs = inputs.to(self.device)

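            # Merge per-task generation kwargs over the defaults. Greedy decoding
            # is used unless a positive temperature is explicitly requested.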
            default_gen_kwargs = {
                "max_new_tokens": 128,
                "temperature": 0.0,
                "top_p": None,
                "num_beams": 1,
            }
            current_gen_kwargs = {**default_gen_kwargs, **gen_kwargs}
            pad_token_id = self.tokenizer.pad_token_id

            if current_gen_kwargs["temperature"] > 0:
                current_gen_kwargs["do_sample"] = True
            else:
                current_gen_kwargs["do_sample"] = False
                current_gen_kwargs["temperature"] = None
                current_gen_kwargs["top_p"] = None
                current_gen_kwargs["top_k"] = None

            start_time = time.time()
            cont = self.model.generate(
                **inputs,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=pad_token_id,
                do_sample=current_gen_kwargs["do_sample"],
                temperature=current_gen_kwargs["temperature"],
                top_p=current_gen_kwargs["top_p"],
                num_beams=current_gen_kwargs["num_beams"],
                max_new_tokens=current_gen_kwargs["max_new_tokens"],
                top_k=current_gen_kwargs.get("top_k", None),
                use_cache=self.use_cache,
            )
            end_time = time.time()

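            # Drop the prompt tokens from each output sequence so only the newly
            # generated continuation is decoded and counted.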
            generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, cont)]
            answers = self.processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)

            e2e_latency += end_time - start_time
            total_tokens += sum(len(ids) for ids in generated_ids_trimmed)

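            # Extract the final answer from each raw output (e.g. dropping any
            # reasoning/"thinking" trace) before caching and collecting it.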
            for ans, context in zip(answers, texts):
                clean_ans = parse_reasoning_model_answer(ans)
                res.append(clean_ans)
                self.cache_hook.add_partial("generate_until", (context, gen_kwargs), clean_ans)
                pbar.update(1)

                eval_logger.debug(f"Question: {context}")
                eval_logger.debug(f"Model Raw Response: {ans}")
                eval_logger.debug(f"Model Clean Response: {clean_ans}")

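        # Restore the original request order, then report end-to-end throughput
        # for this rank.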
        res = re_ords.get_original(res)

        avg_speed = total_tokens / e2e_latency if e2e_latency > 0 else 0

        metric_dict = {
            "total_tokens": total_tokens,
            "e2e_latency": e2e_latency,
            "avg_speed": avg_speed,
            "additional_metrics": {
                "rank": self.rank,
            },
        }
        log_metrics(**metric_dict)

        pbar.close()
        return res