| | from transformers import GenerationConfig, AutoProcessor, AutoTokenizer, AutoModelForImageTextToText, Qwen2_5_VLForConditionalGeneration |
| | from qwen_vl_utils import process_vision_info |
| |
|
# Hugging Face Hub identifier of the checkpoint loaded by EndpointHandler.
model_name = "Qwen/Qwen2.5-VL-7B-Instruct"


class EndpointHandler:
    """Callable wrapper that runs chat-style generation with Qwen2.5-VL.

    The model and its processor are loaded once at construction time; each
    call turns a ``{"messages": [...]}`` payload into a single generated
    reply string.
    """

    def __init__(self, path=""):
        """Load the model, the processor and the fixed generation settings.

        Args:
            path: Accepted (and ignored) so the handler can also be built as
                ``EndpointHandler(path)``; weights are always loaded from
                ``model_name``.
                NOTE(review): presumably the serving framework passes the
                repository directory here -- confirm against the deployer.
        """
        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_name, torch_dtype="auto", device_map="cuda"
        )
        self.processor = AutoProcessor.from_pretrained(model_name)

        # Built once: the settings do not depend on the request.
        # `repetition_penalty` is the GenerationConfig field name; the former
        # `repeat_penalty` keyword was not a recognised option, so the penalty
        # was never applied.  `early_stopping` was dropped because it only
        # affects beam search and `num_beams` is left at its default of 1.
        self.generation_config = GenerationConfig(
            max_new_tokens=2048,
            no_repeat_ngram_size=3,
            repetition_penalty=1.2,
        )

    async def __call__(self, data):
        """Generate a reply for the conversation in ``data["messages"]``.

        Args:
            data: Mapping with a ``"messages"`` entry that is passed to the
                processor's chat template and to ``process_vision_info``.

        Returns:
            The decoded text of the first (and only) generated sequence, with
            the prompt tokens stripped.

        Raises:
            ValueError: If ``"messages"`` is missing or empty.
        """
        # NOTE(review): this coroutine runs blocking model code and must be
        # awaited by the caller -- confirm the serving framework does so.
        messages = data.get("messages")
        if not messages:
            raise ValueError('request payload must contain a non-empty "messages" entry')

        prompt = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[prompt],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # The processor returns CPU tensors; generation needs them on the
        # same device as the model weights.
        inputs = inputs.to(self.model.device)

        generated_ids = self.model.generate(
            **inputs, generation_config=self.generation_config
        )
        # generate() returns prompt + completion; keep only the completion.
        completions = [
            output_ids[len(prompt_ids):]
            for prompt_ids, output_ids in zip(inputs.input_ids, generated_ids)
        ]
        decoded = self.processor.batch_decode(
            completions,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

        return decoded[0]