import functools
import json

from src.enums import t5_type
from src.utils import have_optimum


def get_loaders(model_name, reward_type, llama_type=None,
                load_gptq='',
                use_autogptq=False,
                load_awq='',
                load_exllama=False,
                config=None,
                rope_scaling=None, max_seq_len=None, model_name_exllama_if_no_config='',
                exllama_dict=None, gptq_dict=None,
                hf_model_dict=None,
                ):
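    """Choose how to load the model and tokenizer for the given model name and backend flags.

    Returns a 3-tuple (model_loader, tokenizer_loader, is_encoder_decoder):
    - exllama path: an already-built generator and tokenizer instance,
    - samsum summarization path: (pipeline, "summarization", False),
    - otherwise: a functools.partial around from_pretrained/from_quantized plus the
      tokenizer class, for the caller to invoke.
    """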
    if hf_model_dict is None:
        hf_model_dict = {}
    # NOTE: Some models need a specific new prompt_type.
    # E.g. t5_xxl_true_nli_mixture has input format: "premise: PREMISE_TEXT hypothesis: HYPOTHESIS_TEXT"
    if load_exllama:
        if exllama_dict is None:
            exllama_dict = {}
        from src.llm_exllama import H2OExLlamaTokenizer, H2OExLlamaGenerator
        from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
        import os, glob

        if config:
            # then use HF cache path
            from transformers import TRANSFORMERS_CACHE
            model_directory = os.path.join(TRANSFORMERS_CACHE, 'models--' + config.name_or_path.replace('/', '--'),
                                           'snapshots', config._commit_hash)
            # download the snapshot so the files located below exist locally
            from huggingface_hub import snapshot_download
            snapshot_download(repo_id=model_name, revision=config._commit_hash)
        else:
            # then use the path from the env file
            # Directory containing model, tokenizer, generator
            model_directory = model_name_exllama_if_no_config

        # Locate files we need within that directory
        tokenizer_path = os.path.join(model_directory, "tokenizer.model")
        assert os.path.isfile(tokenizer_path), "Missing %s" % tokenizer_path
        model_config_path = os.path.join(model_directory, "config.json")
        assert os.path.isfile(model_config_path), "Missing %s" % model_config_path
        st_pattern = os.path.join(model_directory, "*.safetensors")
        model_path = glob.glob(st_pattern)[0]
        assert os.path.isfile(model_path), "Missing %s" % model_path

        # Create config, model, tokenizer and generator
        exconfig = ExLlamaConfig(model_config_path)  # create config from config.json
        rope_scaling = rope_scaling or {}
        exconfig.alpha_value = rope_scaling.get('alpha_value', 1)  # rope
        exconfig.compress_pos_emb = rope_scaling.get('compress_pos_emb', 1)  # related rope
        # update max_seq_len based on the HF config
        assert hasattr(config, 'max_position_embeddings') or hasattr(config, 'max_sequence_length'), \
            "config must define max_position_embeddings or max_sequence_length"
        if hasattr(config, 'max_position_embeddings'):
            exconfig.max_seq_len = int(config.max_position_embeddings * exconfig.alpha_value)
        else:
            exconfig.max_seq_len = int(config.max_sequence_length * exconfig.alpha_value)
        if 'llama-2' in model_name.lower():
            # override bad defaults
            exconfig.max_seq_len = int(4096 * exconfig.alpha_value)
        if max_seq_len is not None:
            exconfig.max_seq_len = max_seq_len

        exconfig.model_path = model_path  # supply path to model weights file
        for k, v in exllama_dict.items():
            setattr(exconfig, k, v)
        if 'set_auto_map' in exllama_dict:
            # translate comma-separated per-GPU VRAM allocations into the auto_map list
            exconfig.auto_map = [float(alloc) for alloc in exllama_dict['set_auto_map'].split(",")]

        model = ExLlama(exconfig)  # create ExLlama instance and load the weights
        tokenizer = H2OExLlamaTokenizer(tokenizer_path)  # create tokenizer from tokenizer model file
        tokenizer.model_max_length = exconfig.max_seq_len

        cache = ExLlamaCache(model)  # create cache for inference
        generator = H2OExLlamaGenerator(model, tokenizer, cache)  # create generator
        return generator, tokenizer, False
    if load_gptq and use_autogptq:
        if gptq_dict is None:
            gptq_dict = {}
        from transformers import AutoTokenizer
        from auto_gptq import AutoGPTQForCausalLM
        if 'use_triton' not in gptq_dict:
            gptq_dict['use_triton'] = False
        if 'llama-2-70b-chat-gptq' in model_name.lower() and 'inject_fused_attention' not in gptq_dict:
            gptq_dict.update(dict(inject_fused_attention=False))
        model_loader = functools.partial(AutoGPTQForCausalLM.from_quantized,
                                         quantize_config=None,
                                         **gptq_dict,
                                         )
        return model_loader, AutoTokenizer, False
    if load_gptq and not use_autogptq:
        assert have_optimum, "To use GPTQ via HF transformers, install optimum: pip install optimum"
    if load_awq:
        from transformers import AutoTokenizer
        from awq import AutoAWQForCausalLM
        model_loader = functools.partial(AutoAWQForCausalLM.from_quantized,
                                         fuse_layers=True,
                                         )
        return model_loader, AutoTokenizer, False
    if llama_type is None:
        llama_type = "llama" in model_name.lower()
    if llama_type and not load_gptq:
        from transformers import LlamaForCausalLM, LlamaTokenizer
        return functools.partial(LlamaForCausalLM.from_pretrained, **hf_model_dict), LlamaTokenizer, False
    elif 'distilgpt2' in model_name.lower():
        from transformers import AutoModelForCausalLM, AutoTokenizer
        return functools.partial(AutoModelForCausalLM.from_pretrained, **hf_model_dict), AutoTokenizer, False
    elif 'gpt2' in model_name.lower():
        from transformers import GPT2LMHeadModel, GPT2Tokenizer
        return functools.partial(GPT2LMHeadModel.from_pretrained, **hf_model_dict), GPT2Tokenizer, False
    elif 'mbart-' in model_name.lower():
        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
        return functools.partial(MBartForConditionalGeneration.from_pretrained, **hf_model_dict), MBart50TokenizerFast, True
    elif t5_type(model_name):
        from transformers import AutoTokenizer, T5ForConditionalGeneration
        return functools.partial(T5ForConditionalGeneration.from_pretrained, **hf_model_dict), AutoTokenizer, True
    elif 'bigbird' in model_name:
        from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer
        return functools.partial(BigBirdPegasusForConditionalGeneration.from_pretrained, **hf_model_dict), AutoTokenizer, True
    elif 'bart-large-cnn-samsum' in model_name or 'flan-t5-base-samsum' in model_name:
        from transformers import pipeline
        return pipeline, "summarization", False
    elif reward_type or 'OpenAssistant/reward-model'.lower() in model_name.lower():
        from transformers import AutoModelForSequenceClassification, AutoTokenizer
        return functools.partial(AutoModelForSequenceClassification.from_pretrained, **hf_model_dict), AutoTokenizer, False
    else:
        from transformers import AutoTokenizer, AutoModelForCausalLM
        model_loader = functools.partial(AutoModelForCausalLM.from_pretrained, **hf_model_dict)
        tokenizer_loader = AutoTokenizer
        return model_loader, tokenizer_loader, False


def get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token):
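    """Instantiate a tokenizer configured for left padding so batched generation works correctly."""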
    tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
                                                 local_files_only=local_files_only,
                                                 resume_download=resume_download,
                                                 token=use_auth_token,
                                                 padding_side='left')

    tokenizer.pad_token_id = 0  # different from the eos token
    # when generating, we will use the logits of right-most token to predict the next token
    # so the padding should be on the left,
    # e.g. see: https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference
    tokenizer.padding_side = "left"  # Allow batched inference

    return tokenizer
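

# Illustrative usage sketch (not part of the module API; the model name below is just an example):
#   model_loader, tokenizer_loader, is_encoder_decoder = get_loaders('gpt2', reward_type=False)
#   tokenizer = get_tokenizer(tokenizer_loader, 'gpt2',
#                             local_files_only=False, resume_download=True, use_auth_token=None)
#   model = model_loader('gpt2')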