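"""Batch inference with the vLLM engine for LLaMA-Factory models.

Supports LoRA adapters, multimodal chat templates, and JSON-schema guided
decoding via pydantic models.
"""
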
import json
from typing import List, Optional, Union

from pydantic import BaseModel
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest
from vllm.sampling_params import GuidedDecodingParams

from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
from llamafactory.extras.constants import IGNORE_INDEX
from llamafactory.extras.misc import get_device_count
from llamafactory.extras.packages import is_vllm_available
from llamafactory.hparams import get_infer_args
from llamafactory.model import load_tokenizer


# This does not need much polish; a later release of the transformers library is expected to ship built-in vLLM support.


def vllm_infer(
    inputs: Union[str, List[str]],
    model_name_or_path: str,
    adapter_name_or_path: Optional[str] = None,
    dataset: str = "alpaca_en_demo",
    dataset_dir: str = "data",
    template: str = "default",
    cutoff_len: int = 2048,
    max_samples: Optional[int] = None,
    vllm_config: str = "{}",
    save_name: str = "generated_predictions.jsonl",
    temperature: float = 0.95,
    top_p: float = 0.7,
    top_k: int = 50,
    guided_decoding_class: Optional[type[BaseModel]] = None,
    bad_words: Optional[List[str]] = None,
    logprobs: Optional[int] = None,
    max_new_tokens: int = 1024,
    repetition_penalty: float = 1.0,
    skip_special_tokens: bool = True,
    seed: Optional[int] = None,
    pipeline_parallel_size: int = 1,
    image_max_pixels: int = 768 * 768,
    image_min_pixels: int = 32 * 32,
):
    r"""Perform batch generation using vLLM engine, which supports tensor parallelism."""
    if pipeline_parallel_size > get_device_count():
        raise ValueError("Pipeline parallel size must not exceed the number of GPUs.")

    model_args, data_args, _, generating_args = get_infer_args(
        dict(
            model_name_or_path=model_name_or_path,
            adapter_name_or_path=adapter_name_or_path,
            dataset=dataset,
            dataset_dir=dataset_dir,
            template=template,
            cutoff_len=cutoff_len,
            max_samples=max_samples,
            preprocessing_num_workers=16,
            vllm_config=vllm_config,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
        )
    )

    tokenizer_module = load_tokenizer(model_args)
    tokenizer = tokenizer_module["tokenizer"]
    template_obj = get_template_and_fix_tokenizer(tokenizer, data_args)
    template_obj.mm_plugin.expand_mm_tokens = False  # vLLM expands multimodal tokens itself during generation

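    # Optionally constrain outputs to the JSON schema of the given pydantic model via guided decoding.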
    if guided_decoding_class:
        json_schema = guided_decoding_class.model_json_schema()
        guided_decoding_params = GuidedDecodingParams(json=json_schema)
    else:
        guided_decoding_params = None

    sampling_params = SamplingParams(
        repetition_penalty=generating_args.repetition_penalty or 1.0,  # repetition_penalty must be > 0
        temperature=generating_args.temperature,
        top_p=generating_args.top_p or 1.0,  # top_p must be > 0
        top_k=generating_args.top_k or -1,  # top_k must be > 0 (-1 disables top-k filtering)
        stop_token_ids=template_obj.get_stop_token_ids(tokenizer),
        max_tokens=generating_args.max_new_tokens,
        skip_special_tokens=skip_special_tokens,
        seed=seed,
        logprobs=logprobs,  # number of log probabilities to return per output token
        guided_decoding=guided_decoding_params,
        bad_words=bad_words,
    )
    if model_args.adapter_name_or_path is not None:
        lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0])
    else:
        lora_request = None

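    # Build the vLLM engine arguments; user-provided vllm_config entries override these defaults below.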
    engine_args = {
        "model": model_args.model_name_or_path,
        "trust_remote_code": True,
        "dtype": model_args.infer_dtype,
        "max_model_len": cutoff_len + max_new_tokens,
        # "tensor_parallel_size": 1,
        # "pipeline_parallel_size": pipeline_parallel_size,
        # "data_parallel_size": get_device_count(),  # data parallelism is supported since vLLM 0.8.5
        "disable_log_stats": True,
        "enable_lora": model_args.adapter_name_or_path is not None,
        "enable_prefix_caching": True,  # whether to enable prefix caching
        "gpu_memory_utilization": 0.95,
        # "quantization": "bitsandbytes",  # whether to load the model with vLLM's bitsandbytes quantization
        # "load_format": "bitsandbytes",
    }
    if template_obj.mm_plugin.__class__.__name__ != "BasePlugin":
        engine_args["limit_mm_per_prompt"] = {"image": 4, "video": 2, "audio": 2}

    if isinstance(model_args.vllm_config, dict):
        engine_args.update(model_args.vllm_config)

    results = LLM(**engine_args).generate(inputs, sampling_params, lora_request=lora_request)
    return results
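

# Minimal usage sketch. The model name, template, prompt, and the `Answer` schema
# below are illustrative assumptions; substitute your own model and data.
if __name__ == "__main__":

    class Answer(BaseModel):
        # hypothetical schema, used only to demonstrate guided (JSON-schema) decoding
        answer: str
        confidence: float

    outputs = vllm_infer(
        inputs=["Explain what prefix caching does in one sentence."],
        model_name_or_path="Qwen/Qwen2.5-7B-Instruct",  # assumed example checkpoint
        template="qwen",  # assumed chat template registered in LLaMA-Factory
        guided_decoding_class=Answer,
        max_new_tokens=256,
    )
    for output in outputs:
        print(output.outputs[0].text)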