unsubscribe committed on
Commit
ec14ead
1 Parent(s): d5d7f70
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +54 -0
  2. service_docker_up.sh +87 -0
  3. triton_models/interactive/1/placeholder +0 -0
  4. triton_models/interactive/config.pbtxt +281 -0
  5. triton_models/postprocessing/1/model.py +129 -0
  6. triton_models/postprocessing/config.pbtxt +36 -0
  7. triton_models/preprocessing/1/model.py +151 -0
  8. triton_models/preprocessing/config.pbtxt +74 -0
  9. triton_models/tokenizer/config.json +29 -0
  10. triton_models/tokenizer/configuration_internlm.py +120 -0
  11. triton_models/tokenizer/generation_config.json +7 -0
  12. triton_models/tokenizer/modeling_internlm.py +966 -0
  13. triton_models/tokenizer/placeholder +0 -0
  14. triton_models/tokenizer/special_tokens_map.json +6 -0
  15. triton_models/tokenizer/tokenization_internlm.py +242 -0
  16. triton_models/tokenizer/tokenizer.model +3 -0
  17. triton_models/tokenizer/tokenizer.py +290 -0
  18. triton_models/tokenizer/tokenizer_config.json +15 -0
  19. triton_models/weights/layers.0.attention.w_qkv.0.qweight +3 -0
  20. triton_models/weights/layers.0.attention.w_qkv.0.scales_zeros +3 -0
  21. triton_models/weights/layers.0.attention.wo.0.bias +0 -0
  22. triton_models/weights/layers.0.attention.wo.0.qweight +3 -0
  23. triton_models/weights/layers.0.attention.wo.0.scales_zeros +0 -0
  24. triton_models/weights/layers.0.ffn_norm.weight +0 -0
  25. triton_models/weights/layers.1.attention.w_qkv.0.qweight +3 -0
  26. triton_models/weights/layers.1.attention.wo.0.scales_zeros +0 -0
  27. triton_models/weights/layers.1.feed_forward.w2.0.scales_zeros +3 -0
  28. triton_models/weights/layers.1.ffn_norm.weight +0 -0
  29. triton_models/weights/layers.10.attention.w_qkv.0.scales_zeros +3 -0
  30. triton_models/weights/layers.10.attention.wo.0.scales_zeros +0 -0
  31. triton_models/weights/layers.10.feed_forward.w13.0.qweight +3 -0
  32. triton_models/weights/layers.11.attention.wo.0.scales_zeros +0 -0
  33. triton_models/weights/layers.11.feed_forward.w13.0.qweight +3 -0
  34. triton_models/weights/layers.12.attention.w_qkv.0.bias +0 -0
  35. triton_models/weights/layers.12.attention.wo.0.qweight +3 -0
  36. triton_models/weights/layers.13.attention.w_qkv.0.bias +0 -0
  37. triton_models/weights/layers.13.attention.wo.0.scales_zeros +0 -0
  38. triton_models/weights/layers.13.feed_forward.w13.0.qweight +3 -0
  39. triton_models/weights/layers.13.feed_forward.w13.0.scales_zeros +3 -0
  40. triton_models/weights/layers.13.feed_forward.w2.0.qweight +3 -0
  41. triton_models/weights/layers.14.attention.w_qkv.0.scales_zeros +3 -0
  42. triton_models/weights/layers.14.attention.wo.0.qweight +3 -0
  43. triton_models/weights/layers.14.attention.wo.0.scales_zeros +0 -0
  44. triton_models/weights/layers.14.feed_forward.w2.0.qweight +3 -0
  45. triton_models/weights/layers.14.ffn_norm.weight +0 -0
  46. triton_models/weights/layers.15.attention.w_qkv.0.qweight +3 -0
  47. triton_models/weights/layers.15.attention.w_qkv.0.scales_zeros +3 -0
  48. triton_models/weights/layers.15.attention_norm.weight +0 -0
  49. triton_models/weights/layers.15.feed_forward.w2.0.scales_zeros +3 -0
  50. triton_models/weights/layers.16.attention.w_qkv.0.qweight +3 -0
.gitattributes CHANGED
@@ -33,3 +33,57 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ triton_models/weights/layers.0.attention.w_qkv.0.qweight filter=lfs diff=lfs merge=lfs -text
37
+ triton_models/weights/layers.0.attention.w_qkv.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
38
+ triton_models/weights/layers.0.attention.wo.0.qweight filter=lfs diff=lfs merge=lfs -text
39
+ triton_models/weights/layers.1.attention.w_qkv.0.qweight filter=lfs diff=lfs merge=lfs -text
40
+ triton_models/weights/layers.1.feed_forward.w2.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
41
+ triton_models/weights/layers.10.attention.w_qkv.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
42
+ triton_models/weights/layers.10.feed_forward.w13.0.qweight filter=lfs diff=lfs merge=lfs -text
43
+ triton_models/weights/layers.11.feed_forward.w13.0.qweight filter=lfs diff=lfs merge=lfs -text
44
+ triton_models/weights/layers.12.attention.wo.0.qweight filter=lfs diff=lfs merge=lfs -text
45
+ triton_models/weights/layers.13.feed_forward.w13.0.qweight filter=lfs diff=lfs merge=lfs -text
46
+ triton_models/weights/layers.13.feed_forward.w13.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
47
+ triton_models/weights/layers.13.feed_forward.w2.0.qweight filter=lfs diff=lfs merge=lfs -text
48
+ triton_models/weights/layers.14.attention.w_qkv.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
49
+ triton_models/weights/layers.14.attention.wo.0.qweight filter=lfs diff=lfs merge=lfs -text
50
+ triton_models/weights/layers.14.feed_forward.w2.0.qweight filter=lfs diff=lfs merge=lfs -text
51
+ triton_models/weights/layers.15.attention.w_qkv.0.qweight filter=lfs diff=lfs merge=lfs -text
52
+ triton_models/weights/layers.15.attention.w_qkv.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
53
+ triton_models/weights/layers.15.feed_forward.w2.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
54
+ triton_models/weights/layers.16.attention.w_qkv.0.qweight filter=lfs diff=lfs merge=lfs -text
55
+ triton_models/weights/layers.16.feed_forward.w13.0.qweight filter=lfs diff=lfs merge=lfs -text
56
+ triton_models/weights/layers.16.feed_forward.w13.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
57
+ triton_models/weights/layers.16.feed_forward.w2.0.qweight filter=lfs diff=lfs merge=lfs -text
58
+ triton_models/weights/layers.18.attention.w_qkv.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
59
+ triton_models/weights/layers.18.attention.wo.0.qweight filter=lfs diff=lfs merge=lfs -text
60
+ triton_models/weights/layers.18.feed_forward.w2.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
61
+ triton_models/weights/layers.19.attention.w_qkv.0.qweight filter=lfs diff=lfs merge=lfs -text
62
+ triton_models/weights/layers.19.attention.w_qkv.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
63
+ triton_models/weights/layers.2.attention.wo.0.qweight filter=lfs diff=lfs merge=lfs -text
64
+ triton_models/weights/layers.2.feed_forward.w13.0.qweight filter=lfs diff=lfs merge=lfs -text
65
+ triton_models/weights/layers.20.feed_forward.w13.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
66
+ triton_models/weights/layers.21.feed_forward.w13.0.qweight filter=lfs diff=lfs merge=lfs -text
67
+ triton_models/weights/layers.21.feed_forward.w2.0.qweight filter=lfs diff=lfs merge=lfs -text
68
+ triton_models/weights/layers.21.feed_forward.w2.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
69
+ triton_models/weights/layers.22.attention.w_qkv.0.qweight filter=lfs diff=lfs merge=lfs -text
70
+ triton_models/weights/layers.23.feed_forward.w13.0.qweight filter=lfs diff=lfs merge=lfs -text
71
+ triton_models/weights/layers.24.attention.w_qkv.0.qweight filter=lfs diff=lfs merge=lfs -text
72
+ triton_models/weights/layers.24.feed_forward.w2.0.qweight filter=lfs diff=lfs merge=lfs -text
73
+ triton_models/weights/layers.25.attention.w_qkv.0.qweight filter=lfs diff=lfs merge=lfs -text
74
+ triton_models/weights/layers.26.attention.wo.0.qweight filter=lfs diff=lfs merge=lfs -text
75
+ triton_models/weights/layers.26.feed_forward.w13.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
76
+ triton_models/weights/layers.27.feed_forward.w13.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
77
+ triton_models/weights/layers.28.attention.w_qkv.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
78
+ triton_models/weights/layers.29.attention.w_qkv.0.qweight filter=lfs diff=lfs merge=lfs -text
79
+ triton_models/weights/layers.29.attention.wo.0.qweight filter=lfs diff=lfs merge=lfs -text
80
+ triton_models/weights/layers.3.feed_forward.w13.0.qweight filter=lfs diff=lfs merge=lfs -text
81
+ triton_models/weights/layers.30.feed_forward.w13.0.qweight filter=lfs diff=lfs merge=lfs -text
82
+ triton_models/weights/layers.31.feed_forward.w2.0.qweight filter=lfs diff=lfs merge=lfs -text
83
+ triton_models/weights/layers.31.feed_forward.w2.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
84
+ triton_models/weights/layers.4.feed_forward.w13.0.qweight filter=lfs diff=lfs merge=lfs -text
85
+ triton_models/weights/layers.5.attention.w_qkv.0.qweight filter=lfs diff=lfs merge=lfs -text
86
+ triton_models/weights/layers.5.feed_forward.w2.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
87
+ triton_models/weights/layers.7.feed_forward.w2.0.scales_zeros filter=lfs diff=lfs merge=lfs -text
88
+ triton_models/weights/layers.8.feed_forward.w2.0.qweight filter=lfs diff=lfs merge=lfs -text
89
+ triton_models/weights/layers.9.feed_forward.w2.0.qweight filter=lfs diff=lfs merge=lfs -text
service_docker_up.sh ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/sh
2
+
3
+ show_help() {
4
+ echo "Usage: $0 [-h] [--help] [-l] [--lib-dir]"
5
+ echo
6
+ echo "Options:"
7
+ echo " -h, --help Show this help message and exit"
8
+ echo " --lib-dir Specify the directory of turbomind libraries"
9
+ }
10
+
11
+ # check if '-h' or '--help' in the arguments
12
+ for arg in "$@"
13
+ do
14
+ if [ "$arg" == "-h" ] || [ "$arg" == "--help" ]; then
15
+ show_help
16
+ exit 0
17
+ fi
18
+ done
19
+
20
+
21
+ TP=1
22
+ DEVICES="0"
23
+ for ((i = 1; i < ${TP}; ++i)); do
24
+ DEVICES="${DEVICES},$i"
25
+ done
26
+ DEVICES="\"device=${DEVICES}\""
27
+
28
+
29
+ SCRIPT_DIR="$(dirname "$0")"
30
+ SCRIPT_ABS_DIR="$(realpath "$SCRIPT_DIR")"
31
+
32
+
33
+ if [ -z "$1" ]; then
34
+ docker run \
35
+ --gpus $DEVICES \
36
+ --rm \
37
+ -v "${SCRIPT_ABS_DIR}":/workspace/models \
38
+ --shm-size 16g \
39
+ -p 33336:22 \
40
+ -p 33337-33400:33337-33400 \
41
+ --cap-add=SYS_PTRACE \
42
+ --cap-add=SYS_ADMIN \
43
+ --security-opt seccomp=unconfined \
44
+ --name lmdeploy \
45
+ -it --env NCCL_LAUNCH_MODE=GROUP openmmlab/lmdeploy:latest \
46
+ tritonserver \
47
+ --model-repository=/workspace/models/model_repository \
48
+ --allow-http=0 \
49
+ --allow-grpc=1 \
50
+ --grpc-port=33337 \
51
+ --log-verbose=0 \
52
+ --allow-metrics=1
53
+ fi
54
+
55
+ for ((i = 1; i <= $#; i++)); do
56
+ arg=${!i}
57
+ case "$arg" in
58
+ --lib-dir)
59
+ if [ "$i" -eq "$#" ]; then
60
+ show_help
61
+ exit -1
62
+ fi
63
+ LIB_PATH=${@:i+1:1}
64
+ docker run \
65
+ --gpus $DEVICES \
66
+ --rm \
67
+ -v "${LIB_PATH}":/opt/tritonserver/backends/turbomind \
68
+ -v ""${SCRIPT_ABS_DIR}"":/workspace/models \
69
+ --shm-size 16g \
70
+ -p 33336:22 \
71
+ -p 33337-33400:33337-33400 \
72
+ --cap-add=SYS_PTRACE \
73
+ --cap-add=SYS_ADMIN \
74
+ --security-opt seccomp=unconfined \
75
+ --name lmdeploy \
76
+ -it --env NCCL_LAUNCH_MODE=GROUP openmmlab/lmdeploy:latest \
77
+ tritonserver \
78
+ --model-repository=/workspace/models/model_repository \
79
+ --allow-http=0 \
80
+ --allow-grpc=1 \
81
+ --grpc-port=33337 \
82
+ --log-verbose=0 \
83
+ --allow-metrics=1
84
+ break
85
+ ;;
86
+ esac
87
+ done
triton_models/interactive/1/placeholder ADDED
File without changes
triton_models/interactive/config.pbtxt ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "turbomind"
28
+ backend: "turbomind"
29
+ default_model_filename: "weights"
30
+ max_batch_size: 1
31
+
32
+ model_transaction_policy {
33
+ decoupled: True
34
+ }
35
+
36
+ instance_group [
37
+ {
38
+ # max concurrent instances
39
+ count: 48
40
+ kind: KIND_CPU
41
+ }
42
+ ]
43
+
44
+ input [
45
+ {
46
+ name: "input_ids"
47
+ data_type: TYPE_UINT32
48
+ dims: [ -1 ]
49
+ # allow_ragged_batch: true
50
+ },
51
+ {
52
+ name: "input_lengths"
53
+ data_type: TYPE_UINT32
54
+ dims: [ 1 ]
55
+ reshape: { shape: [ ] }
56
+ },
57
+ {
58
+ name: "request_output_len"
59
+ data_type: TYPE_UINT32
60
+ dims: [ -1 ]
61
+ },
62
+ {
63
+ name: "step"
64
+ data_type: TYPE_INT32
65
+ dims: [ 1 ]
66
+ reshape: { shape: [ ] }
67
+ optional: true
68
+ },
69
+ {
70
+ name: "session_len"
71
+ data_type: TYPE_UINT32
72
+ dims: [ 1 ]
73
+ reshape: { shape: [ ] }
74
+ optional: true
75
+ },
76
+ {
77
+ name: "runtime_top_k"
78
+ data_type: TYPE_UINT32
79
+ dims: [ 1 ]
80
+ reshape: { shape: [ ] }
81
+ optional: true
82
+ },
83
+ {
84
+ name: "runtime_top_p"
85
+ data_type: TYPE_FP32
86
+ dims: [ 1 ]
87
+ reshape: { shape: [ ] }
88
+ optional: true
89
+ },
90
+ {
91
+ name: "beam_search_diversity_rate"
92
+ data_type: TYPE_FP32
93
+ dims: [ 1 ]
94
+ reshape: { shape: [ ] }
95
+ optional: true
96
+ },
97
+ {
98
+ name: "temperature"
99
+ data_type: TYPE_FP32
100
+ dims: [ 1 ]
101
+ reshape: { shape: [ ] }
102
+ optional: true
103
+ },
104
+ {
105
+ name: "len_penalty"
106
+ data_type: TYPE_FP32
107
+ dims: [ 1 ]
108
+ reshape: { shape: [ ] }
109
+ optional: true
110
+ },
111
+ {
112
+ name: "repetition_penalty"
113
+ data_type: TYPE_FP32
114
+ dims: [ 1 ]
115
+ reshape: { shape: [ ] }
116
+ optional: true
117
+ },
118
+ {
119
+ name: "random_seed"
120
+ data_type: TYPE_UINT64
121
+ dims: [ 1 ]
122
+ reshape: { shape: [ ] }
123
+ optional: true
124
+ },
125
+ {
126
+ name: "is_return_log_probs"
127
+ data_type: TYPE_BOOL
128
+ dims: [ 1 ]
129
+ reshape: { shape: [ ] }
130
+ optional: true
131
+ },
132
+ {
133
+ name: "beam_width"
134
+ data_type: TYPE_UINT32
135
+ dims: [ 1 ]
136
+ reshape: { shape: [ ] }
137
+ optional: true
138
+ },
139
+ {
140
+ name: "start_id"
141
+ data_type: TYPE_UINT32
142
+ dims: [ 1 ]
143
+ reshape: { shape: [ ] }
144
+ optional: true
145
+ },
146
+ {
147
+ name: "end_id"
148
+ data_type: TYPE_UINT32
149
+ dims: [ 1 ]
150
+ reshape: { shape: [ ] }
151
+ optional: true
152
+ },
153
+ {
154
+ name: "bad_words_list"
155
+ data_type: TYPE_INT32
156
+ dims: [ 2, -1 ]
157
+ optional: true
158
+ },
159
+ {
160
+ name: "stop_words_list"
161
+ data_type: TYPE_INT32
162
+ dims: [ 2, -1 ]
163
+ optional: true
164
+ },
165
+ {
166
+ name: "prompt_learning_task_name_ids"
167
+ data_type: TYPE_UINT32
168
+ dims: [ 1 ]
169
+ reshape: { shape: [ ] }
170
+ optional: true
171
+ },
172
+ {
173
+ name: "top_p_decay"
174
+ data_type: TYPE_FP32
175
+ dims: [ 1 ]
176
+ reshape: { shape: [ ] }
177
+ optional: true
178
+ },
179
+ {
180
+ name: "top_p_min"
181
+ data_type: TYPE_FP32
182
+ dims: [ 1 ]
183
+ reshape: { shape: [ ] }
184
+ optional: true
185
+ },
186
+ {
187
+ name: "top_p_reset_ids"
188
+ data_type: TYPE_UINT32
189
+ dims: [ 1 ]
190
+ reshape: { shape: [ ] }
191
+ optional: true
192
+ },
193
+ {
194
+ name: "START"
195
+ data_type: TYPE_INT32
196
+ dims: [ 1 ]
197
+ reshape: { shape: [ ] }
198
+ optional: true
199
+ },
200
+ {
201
+ name: "END"
202
+ data_type: TYPE_INT32
203
+ dims: [ 1 ]
204
+ reshape: { shape: [ ] }
205
+ optional: true
206
+ },
207
+ {
208
+ name: "STOP"
209
+ data_type: TYPE_INT32
210
+ dims: [ 1 ]
211
+ reshape: { shape: [ ] }
212
+ optional: true
213
+ },
214
+ {
215
+ name: "CORRID"
216
+ data_type: TYPE_UINT64
217
+ dims: [ 1 ]
218
+ reshape: { shape: [ ] }
219
+ optional: true
220
+ }
221
+ ]
222
+ output [
223
+ {
224
+ name: "output_ids"
225
+ data_type: TYPE_UINT32
226
+ dims: [ -1, -1 ]
227
+ },
228
+ {
229
+ name: "sequence_length"
230
+ data_type: TYPE_UINT32
231
+ dims: [ -1 ]
232
+ },
233
+ {
234
+ name: "cum_log_probs"
235
+ data_type: TYPE_FP32
236
+ dims: [ -1 ]
237
+ },
238
+ {
239
+ name: "output_log_probs"
240
+ data_type: TYPE_FP32
241
+ dims: [ -1, -1 ]
242
+ }
243
+ ]
244
+
245
+ parameters {
246
+ key: "pipeline_para_size"
247
+ value: {
248
+ string_value: "1"
249
+ }
250
+ }
251
+ parameters {
252
+ key: "data_type"
253
+ value: {
254
+ string_value: "fp16"
255
+ }
256
+ }
257
+ parameters {
258
+ key: "model_type"
259
+ value: {
260
+ string_value: "Llama"
261
+ }
262
+ }
263
+
264
+ parameters {
265
+ key: "enable_custom_all_reduce"
266
+ value: {
267
+ string_value: "0"
268
+ }
269
+ }
270
+ parameters {
271
+ key: "tensor_para_size"
272
+ value: {
273
+ string_value: "1"
274
+ }
275
+ }
276
+ parameters {
277
+ key: "model_name"
278
+ value: {
279
+ string_value: "internlm-chat-7b"
280
+ }
281
+ }
triton_models/postprocessing/1/model.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import json
3
+ import os.path as osp
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import triton_python_backend_utils as pb_utils
8
+
9
+ # This tokenizer is `lmdeploy/turbomind/tokenizer.py`. When an LLM is served
10
+ # by triton inference server, it has to be converted first by running
11
+ # `python lmdeploy/serve/turbomind/deploy.py`. Then
12
+ # `lmdeploy/turbomind/tokenizer.py` will be copied to `tokenizer/tokenizer.py`
13
+ from .tokenizer.tokenizer import Tokenizer
14
+
15
+
16
+ class TritonPythonModel:
17
+ """Your Python model must use the same class name.
18
+
19
+ Every Python model that is created must have "TritonPythonModel" as the
20
+ class name.
21
+ """
22
+
23
+ def initialize(self, args):
24
+ """`initialize` is called only once when the model is being loaded.
25
+ Implementing `initialize` function is optional. This function allows
26
+ the model to initialize any state associated with this model.
27
+ Parameters
28
+ ----------
29
+ args : dict
30
+ Both keys and values are strings. The dictionary keys and values are:
31
+ * model_config: A JSON string containing the model configuration
32
+ * model_instance_kind: A string containing model instance kind
33
+ * model_instance_device_id: A string containing model instance device
34
+ ID
35
+ * model_repository: Model repository path
36
+ * model_version: Model version
37
+ * model_name: Model name
38
+ """
39
+ # Parse model configs
40
+ self.model_config = model_config = json.loads(args['model_config'])
41
+
42
+ # Parse model output configs
43
+ output_config = pb_utils.get_output_config_by_name(
44
+ model_config, 'OUTPUT')
45
+
46
+ # Convert Triton types to numpy types
47
+ self.output_dtype = pb_utils.triton_string_to_numpy(
48
+ output_config['data_type'])
49
+
50
+ cur_folder = Path(__file__).parent
51
+
52
+ self.tokenizer = Tokenizer(
53
+ osp.join(
54
+ cur_folder, self.model_config['parameters']['tokenizer_path']
55
+ ['string_value']))
56
+
57
+ def execute(self, requests):
58
+ """`execute` must be implemented in every Python model. `execute`
59
+ function receives a list of pb_utils.InferenceRequest as the only
60
+ argument. This function is called when an inference is requested
61
+ for this model. Depending on the batching configuration (e.g. Dynamic
62
+ Batching) used, `requests` may contain multiple requests. Every
63
+ Python model must create one pb_utils.InferenceResponse for every
64
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
65
+ set the error argument when creating a pb_utils.InferenceResponse.
66
+ Parameters
67
+ ----------
68
+ requests : list
69
+ A list of pb_utils.InferenceRequest
70
+ Returns
71
+ -------
72
+ list
73
+ A list of pb_utils.InferenceResponse. The length of this list must
74
+ be the same as `requests`
75
+ """
76
+
77
+ responses = []
78
+
79
+ # Every Python backend must iterate over every one of the requests
80
+ # and create a pb_utils.InferenceResponse for each of them.
81
+ for idx, request in enumerate(requests):
82
+ # Get input tensors
83
+ tokens_batch = pb_utils.get_input_tensor_by_name(
84
+ request, 'TOKENS_BATCH').as_numpy()
85
+ sequence_length = pb_utils.get_input_tensor_by_name(
86
+ request, 'sequence_length').as_numpy()
87
+
88
+ # Postprocessing output data.
89
+ outputs = self._postprocessing(tokens_batch.tolist(),
90
+ sequence_length)
91
+
92
+ # Create output tensors. You need pb_utils.Tensor
93
+ # objects to create pb_utils.InferenceResponse.
94
+ output_tensor = pb_utils.Tensor(
95
+ 'OUTPUT',
96
+ np.array(outputs).astype(self.output_dtype))
97
+
98
+ # Create InferenceResponse. You can set an error here in case
99
+ # there was a problem with handling this inference request.
100
+ # Below is an example of how you can set errors in inference
101
+ # response:
102
+ #
103
+ # pb_utils.InferenceResponse(
104
+ # output_tensors=..., TritonError("An error occurred"))
105
+ inference_response = pb_utils.InferenceResponse(
106
+ output_tensors=[output_tensor])
107
+ responses.append(inference_response)
108
+
109
+ # You should return a list of pb_utils.InferenceResponse. Length
110
+ # of this list must match the length of `requests` list.
111
+ return responses
112
+
113
+ def finalize(self):
114
+ """`finalize` is called only once when the model is being unloaded.
115
+
116
+ Implementing `finalize` function is optional. This function allows the
117
+ model to perform any necessary clean ups before exit.
118
+ """
119
+ print('Cleaning up...')
120
+
121
+ def _postprocessing(self, tokens_batch, sequence_length):
122
+ """decode token ids into texts."""
123
+ outputs = []
124
+ for beam_tokens, beam_len in zip(tokens_batch, sequence_length):
125
+ for tokens, _len in zip(beam_tokens, beam_len):
126
+ output = self.tokenizer.decode(tokens, _len)
127
+ output = output.encode('utf8')
128
+ outputs.append(output)
129
+ return outputs
triton_models/postprocessing/config.pbtxt ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "postprocessing"
2
+ backend: "python"
3
+ max_batch_size: 1
4
+ input [
5
+ {
6
+ name: "TOKENS_BATCH"
7
+ data_type: TYPE_UINT32
8
+ dims: [ -1, -1 ]
9
+ },
10
+ {
11
+ name: "sequence_length"
12
+ data_type: TYPE_UINT32
13
+ dims: [ -1 ]
14
+ }
15
+ ]
16
+ output [
17
+ {
18
+ name: "OUTPUT"
19
+ data_type: TYPE_STRING
20
+ dims: [ -1, -1 ]
21
+ }
22
+ ]
23
+
24
+ instance_group [
25
+ {
26
+ count: 16
27
+ kind: KIND_CPU
28
+ }
29
+ ]
30
+
31
+ parameters {
32
+ key: "tokenizer_path"
33
+ value: {
34
+ string_value: "tokenizer/tokenizer.model"
35
+ }
36
+ }
triton_models/preprocessing/1/model.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import json
3
+ import os.path as osp
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import torch
8
+ import triton_python_backend_utils as pb_utils
9
+ from torch.nn.utils.rnn import pad_sequence
10
+
11
+ # This tokenizer is `lmdeploy/turbomind/tokenizer.py`. When an LLM is served
12
+ # by triton inference server, it has to be converted first by running
13
+ # `python lmdeploy/serve/turbomind/deploy.py`. Then
14
+ # `lmdeploy/turbomind/tokenizer.py` will be copied to `tokenizer/tokenizer.py`
15
+ from .tokenizer.tokenizer import Tokenizer
16
+
17
+
18
+ class TritonPythonModel:
19
+ """Your Python model must use the same class name.
20
+
21
+ Every Python model that is created must have "TritonPythonModel" as the
22
+ class name.
23
+ """
24
+
25
+ def initialize(self, args):
26
+ """`initialize` is called only once when the model is being loaded.
27
+ Implementing `initialize` function is optional. This function allows
28
+ the model to initialize any state associated with this model.
29
+ Parameters
30
+ ----------
31
+ args : dict
32
+ Both keys and values are strings. The dictionary keys and values are:
33
+ * model_config: A JSON string containing the model configuration
34
+ * model_instance_kind: A string containing model instance kind
35
+ * model_instance_device_id: A string containing model instance device
36
+ ID
37
+ * model_repository: Model repository path
38
+ * model_version: Model version
39
+ * model_name: Model name
40
+ """
41
+ # Parse model configs
42
+ self.model_config = model_config = json.loads(args['model_config'])
43
+
44
+ # Parse model output configs and convert Triton types to numpy types
45
+ input_names = [
46
+ 'INPUT_ID', 'REQUEST_INPUT_LEN', 'BAD_WORDS_IDS', 'STOP_WORDS_IDS'
47
+ ]
48
+ for input_name in input_names:
49
+ setattr(
50
+ self,
51
+ input_name.lower() + '_dtype',
52
+ pb_utils.triton_string_to_numpy(
53
+ pb_utils.get_output_config_by_name(
54
+ model_config, input_name)['data_type']))
55
+
56
+ cur_folder = Path(__file__).parent
57
+ self.tokenizer = Tokenizer(
58
+ osp.join(
59
+ cur_folder, self.model_config['parameters']['tokenizer_path']
60
+ ['string_value']))
61
+ self.start_id = self.tokenizer.bos_token_id
62
+ self.end_id = self.tokenizer.eos_token_id
63
+
64
+ def execute(self, requests):
65
+ """`execute` must be implemented in every Python model. `execute`
66
+ function receives a list of pb_utils.InferenceRequest as the only
67
+ argument. This function is called when an inference is requested
68
+ for this model. Depending on the batching configuration (e.g. Dynamic
69
+ Batching) used, `requests` may contain multiple requests. Every
70
+ Python model must create one pb_utils.InferenceResponse for every
71
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
72
+ set the error argument when creating a pb_utils.InferenceResponse.
73
+ Parameters
74
+ ----------
75
+ requests : list
76
+ A list of pb_utils.InferenceRequest
77
+ Returns
78
+ -------
79
+ list
80
+ A list of pb_utils.InferenceResponse. The length of this list must
81
+ be the same as `requests`
82
+ """
83
+
84
+ responses = []
85
+
86
+ # Every Python backend must iterate over every one of the requests
87
+ # and create a pb_utils.InferenceResponse for each of them.
88
+ for idx, request in enumerate(requests):
89
+ # Get input tensors
90
+ query = pb_utils.get_input_tensor_by_name(request,
91
+ 'QUERY').as_numpy()
92
+ request_output_len = pb_utils.get_input_tensor_by_name(
93
+ request, 'REQUEST_OUTPUT_LEN').as_numpy()
94
+
95
+ # Preprocessing input data.
96
+ input_id, request_input_len = self._create_request(query)
97
+
98
+ # Create output tensors. You need pb_utils.Tensor
99
+ # objects to create pb_utils.InferenceResponse.
100
+ input_id_tensor = pb_utils.Tensor(
101
+ 'INPUT_ID',
102
+ np.array(input_id).astype(self.input_id_dtype))
103
+ request_input_len_tensor = pb_utils.Tensor(
104
+ 'REQUEST_INPUT_LEN',
105
+ np.array(request_input_len).astype(
106
+ self.request_input_len_dtype))
107
+ request_output_len_tensor = pb_utils.Tensor(
108
+ 'REQUEST_OUTPUT_LEN', request_output_len)
109
+
110
+ # Create InferenceResponse. You can set an error here in case
111
+ # there was a problem with handling this inference request.
112
+ # Below is an example of how you can set errors in inference
113
+ # response:
114
+ #
115
+ # pb_utils.InferenceResponse(
116
+ # output_tensors=..., TritonError("An error occurred"))
117
+ inference_response = pb_utils.InferenceResponse(output_tensors=[
118
+ input_id_tensor, request_input_len_tensor,
119
+ request_output_len_tensor
120
+ ])
121
+ responses.append(inference_response)
122
+
123
+ # You should return a list of pb_utils.InferenceResponse. Length
124
+ # of this list must match the length of `requests` list.
125
+ return responses
126
+
127
+ def finalize(self):
128
+ """`finalize` is called only once when the model is being unloaded.
129
+
130
+ Implementing `finalize` function is optional. This function allows the
131
+ model to perform any necessary clean ups before exit.
132
+ """
133
+ print('Cleaning up...')
134
+
135
+ def _create_request(self, query):
136
+ """Tokenize prompts and return the token ids and their length.
137
+
138
+ Args:
139
+ query (List[str]): a list of prompts
140
+ Returns:
141
+ tuple: token ids and their length
142
+ """
143
+ start_ids = [
144
+ torch.IntTensor(self.tokenizer.encode(s[0].decode()))
145
+ for s in query
146
+ ]
147
+ start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
148
+ start_ids = pad_sequence(start_ids,
149
+ batch_first=True,
150
+ padding_value=self.end_id)
151
+ return start_ids, start_lengths
triton_models/preprocessing/config.pbtxt ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "preprocessing"
2
+ backend: "python"
3
+ max_batch_size: 1
4
+
5
+ input [
6
+ {
7
+ name: "QUERY"
8
+ data_type: TYPE_STRING
9
+ dims: [ -1 ]
10
+ },
11
+ {
12
+ name: "BAD_WORDS_DICT"
13
+ data_type: TYPE_STRING
14
+ dims: [ -1 ]
15
+ optional: true
16
+ },
17
+ {
18
+ name: "STOP_WORDS_DICT"
19
+ data_type: TYPE_STRING
20
+ dims: [ -1 ]
21
+ optional: true
22
+ },
23
+ {
24
+ name: "REQUEST_OUTPUT_LEN"
25
+ data_type: TYPE_UINT32
26
+ dims: [ -1 ]
27
+ }
28
+ ]
29
+ output [
30
+ {
31
+ name: "INPUT_ID"
32
+ data_type: TYPE_UINT32
33
+ dims: [ -1 ]
34
+ },
35
+ {
36
+ name: "REQUEST_INPUT_LEN"
37
+ data_type: TYPE_UINT32
38
+ dims: [ 1 ]
39
+ },
40
+ {
41
+ name: "BAD_WORDS_IDS"
42
+ data_type: TYPE_INT32
43
+ dims: [ 2, -1 ]
44
+ },
45
+ {
46
+ name: "STOP_WORDS_IDS"
47
+ data_type: TYPE_INT32
48
+ dims: [ 2, -1 ]
49
+ },
50
+ {
51
+ name: "REQUEST_OUTPUT_LEN"
52
+ data_type: TYPE_UINT32
53
+ dims: [ -1 ]
54
+ },
55
+ {
56
+ name: "PROMPT_LEARNING_TASK_NAME_IDS"
57
+ data_type: TYPE_UINT32
58
+ dims: [ 1 ]
59
+ }
60
+ ]
61
+
62
+ instance_group [
63
+ {
64
+ count: 4
65
+ kind: KIND_CPU
66
+ }
67
+ ]
68
+
69
+ parameters {
70
+ key: "tokenizer_path"
71
+ value: {
72
+ string_value: "tokenizer/tokenizer.model"
73
+ }
74
+ }
triton_models/tokenizer/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/nvme/shared_data/InternLM/internlm-chat-7b",
3
+ "architectures": [
4
+ "InternLMForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_internlm.InternLMConfig",
8
+ "AutoModel": "modeling_internlm.InternLMForCausalLM",
9
+ "AutoModelForCausalLM": "modeling_internlm.InternLMForCausalLM"
10
+ },
11
+ "bias": true,
12
+ "bos_token_id": 1,
13
+ "eos_token_id": 2,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 4096,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 11008,
18
+ "max_position_embeddings": 2048,
19
+ "model_type": "internlm",
20
+ "num_attention_heads": 32,
21
+ "num_hidden_layers": 32,
22
+ "pad_token_id": 0,
23
+ "rms_norm_eps": 1e-06,
24
+ "tie_word_embeddings": false,
25
+ "torch_dtype": "float16",
26
+ "transformers_version": "4.33.1",
27
+ "use_cache": false,
28
+ "vocab_size": 103168
29
+ }
triton_models/tokenizer/configuration_internlm.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ InternLM model configuration"""
21
+
22
+ from transformers.utils import logging
23
+ from transformers.configuration_utils import PretrainedConfig
24
+
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+ INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
29
+
30
+
31
class InternLMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate an
    InternLM model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the InternLM-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 103168):
            Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`InternLMModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Typically set this to something
            large just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Id of the padding token; forwarded to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*, defaults to 1):
            Id of the beginning-of-sequence token; forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 2):
            Id of the end-of-sequence token; forwarded to [`PretrainedConfig`].
        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        bias (`bool`, *optional*, defaults to `True`):
            Stored as `config.bias`; the attention layers of the accompanying modeling code pass it as the `bias`
            argument of their query/key/value/output linear projections.
        kwargs:
            Any additional keyword arguments are forwarded unchanged to [`PretrainedConfig`].
        Example:

    ```python
    >>> from transformers import InternLMModel, InternLMConfig

    >>> # Initializing a InternLM internlm-7b style configuration
    >>> configuration = InternLMConfig()

    >>> # Initializing a model from the internlm-7b style configuration
    >>> model = InternLMModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "internlm"
    _auto_class = "AutoConfig"

    def __init__(
        self,
        vocab_size=103168,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        bias=True,
        **kwargs,
    ):
        # Architecture hyper-parameters are stored directly on the config object.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.bias = bias
        # Special-token ids and weight tying are handled by the base class, which
        # also receives any extra keyword arguments.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
triton_models/tokenizer/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.33.1"
7
+ }
triton_models/tokenizer/modeling_internlm.py ADDED
@@ -0,0 +1,966 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ PyTorch InternLM model."""
21
+ import math
22
+ from typing import List, Optional, Tuple, Union
23
+
24
+ import torch
25
+ import torch.utils.checkpoint
26
+ from torch import nn
27
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
28
+
29
+ from transformers.activations import ACT2FN
30
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
31
+ from transformers.modeling_utils import PreTrainedModel
32
+ from transformers.generation.streamers import BaseStreamer
33
+ from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
34
+ from .configuration_internlm import InternLMConfig
35
+
36
+
37
+ logger = logging.get_logger(__name__)
38
+
39
+ _CONFIG_FOR_DOC = "InternLMConfig"
40
+
41
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
42
+ def _make_causal_mask(
43
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
44
+ ):
45
+ """
46
+ Make causal mask used for bi-directional self-attention.
47
+ """
48
+ bsz, tgt_len = input_ids_shape
49
+ mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
50
+ mask_cond = torch.arange(mask.size(-1), device=device)
51
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
52
+ mask = mask.to(dtype)
53
+
54
+ if past_key_values_length > 0:
55
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
56
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
57
+
58
+
59
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
60
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
61
+ """
62
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
63
+ """
64
+ bsz, src_len = mask.size()
65
+ tgt_len = tgt_len if tgt_len is not None else src_len
66
+
67
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
68
+
69
+ inverted_mask = 1.0 - expanded_mask
70
+
71
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
72
+
73
+
74
class InternLMRMSNorm(nn.Module):
    """Root-mean-square layer normalisation with a learned per-feature scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        InternLMRMSNorm is equivalent to T5LayerNorm.

        `hidden_size` sets the length of the learned scale vector (initialised to ones); `eps` is added to the mean
        square before taking the reciprocal square root.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # The mean of squares over the last dimension is computed in float32.
        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        inverse_rms = torch.rsqrt(mean_square + self.variance_epsilon)
        normalised = hidden_states * inverse_rms

        # Cast back down only when the scale itself is stored in half precision.
        scale_dtype = self.weight.dtype
        if scale_dtype == torch.float16 or scale_dtype == torch.bfloat16:
            normalised = normalised.to(scale_dtype)

        return self.weight * normalised
92
+
93
+
94
class InternLMRotaryEmbedding(torch.nn.Module):
    """Cached cos/sin tables for rotary position embeddings.

    The tables are built once for `max_position_embeddings` positions and are rebuilt (larger) the first time a
    longer sequence is requested.

    Args:
        dim: size of the last dimension of the returned tables; `dim / 2` inverse frequencies are generated.
        max_position_embeddings: number of positions pre-computed at construction time.
        base: base of the geometric progression used for the inverse frequencies.
        device: device on which the inverse frequencies (and the initial tables) are created.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
        self.register_buffer("inv_freq", inv_freq)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(max_position_embeddings, device=self.inv_freq.device)

    def _set_cos_sin_cache(self, seq_len, device):
        """(Re)build the cos/sin tables so that they cover positions `[0, seq_len)`."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1).to(device)
        # Non-persistent: the tables are derived data and are kept out of the state dict.
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)

    def forward(self, x, seq_len=None):
        """Return `(cos, sin)` tables of shape `[1, 1, seq_len, dim]` cast to `x.dtype`.

        Args:
            x: tensor laid out as `[bs, num_attention_heads, seq_len, head_size]`; only its dtype, device and
                (when `seq_len` is omitted) its second-to-last dimension are used.
            seq_len: number of positions required. Defaults to `x.shape[-2]`.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # The signature advertises `None` as the default; comparing it with an int below would raise.
            seq_len = x.shape[-2]
        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len, device=x.device)
        return (
            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
        )
124
+
125
+
126
def rotate_half(x):
    """Split the last dimension in two halves `(a, b)` and return `(-b, a)` concatenated."""
    midpoint = x.shape[-1] // 2
    front, back = x[..., :midpoint], x[..., midpoint:]
    return torch.cat((-back, front), dim=-1)
131
+
132
+
133
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    """Rotate the query and key tensors with the cos/sin rows selected by `position_ids`.

    Returns the pair `(q_embed, k_embed)`, each computed as `t * cos + rotate_half(t) * sin`.
    """

    def _rows_for_positions(table):
        # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
        flat = table.squeeze(1).squeeze(0)  # [seq_len, dim]
        return flat[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]

    cos_rows = _rows_for_positions(cos)
    sin_rows = _rows_for_positions(sin)

    q_embed = (q * cos_rows) + (rotate_half(q) * sin_rows)
    k_embed = (k * cos_rows) + (rotate_half(k) * sin_rows)
    return q_embed, k_embed
142
+
143
+
144
class InternLMMLP(nn.Module):
    """Gated feed-forward block computing `down_proj(act_fn(gate_proj(x)) * up_proj(x))`."""

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
    ):
        super().__init__()
        # All three projections are bias-free; `hidden_act` is looked up in ACT2FN.
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.act_fn = ACT2FN[hidden_act]

    def forward(self, x):
        gate = self.act_fn(self.gate_proj(x))
        lifted = self.up_proj(x)
        return self.down_proj(gate * lifted)
159
+
160
+
161
class InternLMAttention(nn.Module):
    """Multi-head self-attention with rotary position embeddings and an optional key/value cache."""

    def __init__(self, config: InternLMConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.max_position_embeddings = config.max_position_embeddings

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        # Query/key/value/output projections all share the config's `bias` switch.
        projected_width = self.num_heads * self.head_dim
        use_bias = config.bias
        self.q_proj = nn.Linear(self.hidden_size, projected_width, bias=use_bias)
        self.k_proj = nn.Linear(self.hidden_size, projected_width, bias=use_bias)
        self.v_proj = nn.Linear(self.hidden_size, projected_width, bias=use_bias)
        self.o_proj = nn.Linear(projected_width, self.hidden_size, bias=use_bias)
        self.rotary_emb = InternLMRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape `[bsz, seq_len, hidden]` into a contiguous `[bsz, num_heads, seq_len, head_dim]` tensor."""
        per_head = tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Run self-attention over `hidden_states` (`[bsz, q_len, hidden_size]`).

        Returns a triple `(attn_output, attn_weights, past_key_value)`: the projected attention output, the attention
        probabilities (or `None` unless `output_attentions` is set) and the updated `(key, value)` cache (or `None`
        unless `use_cache` is set).
        """
        bsz, q_len, _ = hidden_states.size()

        def _to_heads(projected):
            # [bsz, q_len, hidden] -> [bsz, num_heads, q_len, head_dim]
            return projected.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        query_states = _to_heads(self.q_proj(hidden_states))
        key_states = _to_heads(self.k_proj(hidden_states))
        value_states = _to_heads(self.v_proj(hidden_states))

        # Total key/value length = new tokens + whatever is already cached.
        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value[0].shape[-2]

        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
        # [bsz, nh, t, hd]

        if past_key_value is not None:
            # Prepend the cached keys/values along the sequence dimension.
            cached_keys, cached_values = past_key_value[0], past_key_value[1]
            key_states = torch.cat([cached_keys, key_states], dim=2)
            value_states = torch.cat([cached_values, value_states], dim=2)

        if use_cache:
            past_key_value = (key_states, value_states)
        else:
            past_key_value = None

        scores = torch.matmul(query_states, key_states.transpose(2, 3))
        attn_weights = scores / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask
            # Clamp from below so masked scores never drop under the dtype's minimum.
            floor = torch.tensor(torch.finfo(attn_weights.dtype).min)
            attn_weights = torch.max(attn_weights, floor)

        # upcast attention to fp32
        probabilities = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32)
        attn_weights = probabilities.to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Merge the heads back: [bsz, nh, q_len, hd] -> [bsz, q_len, hidden]
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
250
+
251
+
252
class InternLMDecoderLayer(nn.Module):
    """One decoder layer: pre-norm self-attention and pre-norm MLP, each wrapped in a residual connection."""

    def __init__(self, config: InternLMConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = InternLMAttention(config=config)
        self.mlp = InternLMMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
        )
        self.input_layernorm = InternLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = InternLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): layer input of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): additive mask of size
                `(batch, 1, tgt_len, src_len)`; padding positions carry very large negative values.
            position_ids (`torch.LongTensor`, *optional*): passed through unchanged to the attention module.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                When truthy, the attention weights are appended to the returned tuple.
            use_cache (`bool`, *optional*):
                When truthy, the updated key/value cache is appended to the returned tuple.

        Returns:
            A tuple whose first element is the layer output, followed by the attention weights (if requested) and
            the present key/value cache (if requested), in that order.
        """
        # --- Self-attention sub-block (norm -> attention -> residual add) ---
        attn_input = self.input_layernorm(hidden_states)
        attn_output, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        after_attention = hidden_states + attn_output

        # --- Feed-forward sub-block (norm -> MLP -> residual add) ---
        mlp_output = self.mlp(self.post_attention_layernorm(after_attention))
        layer_output = after_attention + mlp_output

        results = [layer_output]
        if output_attentions:
            results.append(self_attn_weights)
        if use_cache:
            results.append(present_key_value)
        return tuple(results)
318
+
319
+
320
+ INTERNLM_START_DOCSTRING = r"""
321
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
322
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
323
+ etc.)
324
+
325
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
326
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
327
+ and behavior.
328
+
329
+ Parameters:
330
+ config ([`InternLMConfig`]):
331
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
332
+ load the weights associated with the model, only the configuration. Check out the
333
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
334
+ """
335
+
336
+
337
# Shared base class for the InternLM models: declares the config class and
# loading hints, and provides weight initialisation and the gradient
# checkpointing toggle. (Kept as comments because the decorator below
# concatenates its text onto the class docstring.)
@add_start_docstrings(
    "The bare InternLM Model outputting raw hidden-states without any specific head on top.",
    INTERNLM_START_DOCSTRING,
)
class InternLMPreTrainedModel(PreTrainedModel):
    config_class = InternLMConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["InternLMDecoderLayer"]
    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]

    def _init_weights(self, module):
        """Initialise `nn.Linear` / `nn.Embedding` weights from N(0, initializer_range); other modules are skipped.

        Linear biases and the embedding row at `padding_idx` (when present) are zeroed.
        """
        init_std = self.config.initializer_range
        is_linear = isinstance(module, nn.Linear)
        if not is_linear and not isinstance(module, nn.Embedding):
            return

        module.weight.data.normal_(mean=0.0, std=init_std)
        if is_linear:
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        """Set the `gradient_checkpointing` flag on `module` when it is an `InternLMModel`."""
        if not isinstance(module, InternLMModel):
            return
        module.gradient_checkpointing = value
362
+
363
+
364
+ INTERNLM_INPUTS_DOCSTRING = r"""
365
+ Args:
366
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
367
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
368
+ it.
369
+
370
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
371
+ [`PreTrainedTokenizer.__call__`] for details.
372
+
373
+ [What are input IDs?](../glossary#input-ids)
374
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
375
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
376
+
377
+ - 1 for tokens that are **not masked**,
378
+ - 0 for tokens that are **masked**.
379
+
380
+ [What are attention masks?](../glossary#attention-mask)
381
+
382
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
383
+ [`PreTrainedTokenizer.__call__`] for details.
384
+
385
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
386
+ `past_key_values`).
387
+
388
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
389
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
390
+ information on the default strategy.
391
+
392
+ - 1 indicates the head is **not masked**,
393
+ - 0 indicates the head is **masked**.
394
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
395
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
396
+ config.n_positions - 1]`.
397
+
398
+ [What are position IDs?](../glossary#position-ids)
399
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
400
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
401
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
402
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
403
+
404
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
405
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
406
+
407
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
408
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
409
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
410
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
411
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
412
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
413
+ model's internal embedding lookup matrix.
414
+ use_cache (`bool`, *optional*):
415
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
416
+ `past_key_values`).
417
+ output_attentions (`bool`, *optional*):
418
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
419
+ tensors for more detail.
420
+ output_hidden_states (`bool`, *optional*):
421
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
422
+ more detail.
423
+ return_dict (`bool`, *optional*):
424
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
425
+ """
426
+
427
+
428
+ @add_start_docstrings(
429
+ "The bare InternLM Model outputting raw hidden-states without any specific head on top.",
430
+ INTERNLM_START_DOCSTRING,
431
+ )
432
+ class InternLMModel(InternLMPreTrainedModel):
433
+ """
434
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`InternLMDecoderLayer`]
435
+
436
+ Args:
437
+ config: InternLMConfig
438
+ """
439
+ _auto_class = "AutoModel"
440
+
441
+ def __init__(self, config: InternLMConfig):
442
+ super().__init__(config)
443
+ self.padding_idx = config.pad_token_id
444
+ self.vocab_size = config.vocab_size
445
+
446
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
447
+ self.layers = nn.ModuleList([InternLMDecoderLayer(config) for _ in range(config.num_hidden_layers)])
448
+ self.norm = InternLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
449
+
450
+ self.gradient_checkpointing = False
451
+ # Initialize weights and apply final processing
452
+ self.post_init()
453
+
454
+ def get_input_embeddings(self):
455
+ return self.embed_tokens
456
+
457
+ def set_input_embeddings(self, value):
458
+ self.embed_tokens = value
459
+
460
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
461
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
462
+ # create causal mask
463
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
464
+ combined_attention_mask = None
465
+ if input_shape[-1] > 1:
466
+ combined_attention_mask = _make_causal_mask(
467
+ input_shape,
468
+ inputs_embeds.dtype,
469
+ device=inputs_embeds.device,
470
+ past_key_values_length=past_key_values_length,
471
+ )
472
+
473
+ if attention_mask is not None:
474
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
475
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
476
+ inputs_embeds.device
477
+ )
478
+ combined_attention_mask = (
479
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
480
+ )
481
+
482
+ return combined_attention_mask
483
+
484
@add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING)
def forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
    # Run the decoder stack: embed the inputs (unless embeddings are given),
    # build the attention mask, apply every layer in `self.layers`, then the
    # final norm. Returns a tuple or a `BaseModelOutputWithPast` depending on
    # `return_dict`.

    # Per-call flags left as None fall back to the model configuration.
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if use_cache is None:
        use_cache = self.config.use_cache
    if return_dict is None:
        return_dict = self.config.use_return_dict

    # Exactly one of `input_ids` / `inputs_embeds` must be supplied.
    have_ids = input_ids is not None
    have_embeds = inputs_embeds is not None
    if have_ids and have_embeds:
        raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
    if not have_ids and not have_embeds:
        raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
    if have_ids:
        batch_size, seq_length = input_ids.shape
    else:
        batch_size, seq_length, _ = inputs_embeds.shape

    # Length of the cached prefix, read from dim 2 of the first cached tensor
    # of the first layer.
    past_key_values_length = 0
    if past_key_values is not None:
        past_key_values_length = past_key_values[0][0].shape[2]
    seq_length_with_past = seq_length + past_key_values_length

    if position_ids is not None:
        position_ids = position_ids.view(-1, seq_length).long()
    else:
        # Default positions continue right after the cached prefix.
        device = input_ids.device if have_ids else inputs_embeds.device
        position_ids = torch.arange(
            past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
        )
        position_ids = position_ids.unsqueeze(0).view(-1, seq_length)

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)

    if attention_mask is None:
        # No mask given: attend to every position, cached ones included.
        attention_mask = torch.ones(
            (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
        )
    attention_mask = self._prepare_decoder_attention_mask(
        attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
    )

    hidden_states = inputs_embeds

    checkpointing = self.gradient_checkpointing and self.training
    if checkpointing and use_cache:
        logger.warning_once(
            "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
        )
        use_cache = False

    # Optional accumulators; they stay None when the matching flag is off.
    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = () if use_cache else None

    for layer_idx, decoder_layer in enumerate(self.layers):
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        past_key_value = None if past_key_values is None else past_key_values[layer_idx]

        if checkpointing:

            def make_layer_runner(module):
                def run_layer(*inputs):
                    # trailing None is passed positionally after output_attentions
                    return module(*inputs, output_attentions, None)

                return run_layer

            layer_outputs = torch.utils.checkpoint.checkpoint(
                make_layer_runner(decoder_layer),
                hidden_states,
                attention_mask,
                position_ids,
                None,
            )
        else:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

        hidden_states = layer_outputs[0]

        if use_cache:
            # The cache entry follows the attention weights when those are returned.
            cache_slot = 2 if output_attentions else 1
            next_decoder_cache += (layer_outputs[cache_slot],)

        if output_attentions:
            all_self_attns += (layer_outputs[1],)

    hidden_states = self.norm(hidden_states)

    # The normalized output of the last layer closes the hidden-state list.
    if output_hidden_states:
        all_hidden_states += (hidden_states,)

    next_cache = next_decoder_cache if use_cache else None
    if return_dict:
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
    return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
611
+
612
+
613
class InternLMForCausalLM(InternLMPreTrainedModel):
    """InternLM decoder (`InternLMModel`) with a linear language-modeling head.

    Besides the standard forward/generation hooks, the class offers
    `build_inputs`, `chat` and `stream_chat` helpers that wrap a user query
    (plus optional history) in the `<|User|>` / `<|Bot|>` prompt format.
    """

    _auto_class = "AutoModelForCausalLM"

    def __init__(self, config):
        super().__init__(config)
        self.model = InternLMModel(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, InternLMForCausalLM

        >>> model = InternLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        """Assemble the keyword arguments for one generation step.

        With a non-empty cache only the last token of `input_ids` (and the
        last derived position) is kept. `position_ids` are derived from
        `attention_mask` when the caller did not pass them in `kwargs`.
        """
        if past_key_values:
            input_ids = input_ids[:, -1:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Return the cache with every tensor re-indexed along dim 0 by `beam_idx`."""
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
        return reordered_past

    def build_inputs(self, tokenizer, query: str, history: Optional[List[Tuple[str, str]]] = None):
        """Format `history` and `query` as a chat prompt and tokenize it.

        Args:
            tokenizer: callable invoked as `tokenizer([prompt], return_tensors="pt")`.
            query: the new user message.
            history: earlier `(user, bot)` turns; `None` means no history.

        Returns:
            Whatever `tokenizer` returns for the single assembled prompt.
        """
        # `None` replaces the former shared `[]` default (mutable default argument).
        turns = [] if history is None else history
        prompt = ""
        for record in turns:
            prompt += f"""<s><|User|>:{record[0]}<eoh>\n<|Bot|>:{record[1]}<eoa>\n"""
        if len(prompt) == 0:
            prompt += "<s>"
        prompt += f"""<|User|>:{query}<eoh>\n<|Bot|>:"""
        return tokenizer([prompt], return_tensors="pt")

    @torch.no_grad()
    def chat(self,
             tokenizer,
             query: str,
             history: Optional[List[Tuple[str, str]]] = None,
             streamer: Optional[BaseStreamer] = None,
             max_new_tokens: int = 1024,
             do_sample: bool = True,
             temperature: float = 0.8,
             top_p: float = 0.8,
             eos_token_id = (2, 103028),
             **kwargs):
        """Generate a reply to `query` given the previous `history`.

        Args:
            tokenizer: tokenizer used to build the prompt and decode the reply.
            query: the new user message.
            history: earlier `(user, bot)` turns; `None` means no history.
                The passed list is never modified.
            streamer: optional streamer forwarded to `generate`.
            max_new_tokens, do_sample, temperature, top_p: forwarded to `generate`.
            eos_token_id: token ids forwarded to `generate` as a list.
            **kwargs: extra keyword arguments forwarded to `generate`.

        Returns:
            `(response, history)` where `history` is a new list with
            `(query, response)` appended.
        """
        # `None` replaces the former shared `[]` default (mutable default argument).
        history = [] if history is None else history
        inputs = self.build_inputs(tokenizer, query, history)
        inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
        outputs = self.generate(**inputs,
                                streamer=streamer,
                                max_new_tokens=max_new_tokens,
                                do_sample=do_sample,
                                temperature=temperature,
                                top_p=top_p,
                                eos_token_id=list(eos_token_id),
                                **kwargs)
        # Drop the prompt tokens; only the newly generated ids are decoded.
        outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]):]
        response = tokenizer.decode(outputs, skip_special_tokens=True)
        response = response.split("<eoa>")[0]
        # Build a new list so the caller's history object is left untouched.
        history = history + [(query, response)]
        return response, history

    @torch.no_grad()
    def stream_chat(self,
                    tokenizer,
                    query: str,
                    history: Optional[List[Tuple[str, str]]] = None,
                    max_new_tokens: int = 1024,
                    do_sample: bool = True,
                    temperature: float = 0.8,
                    top_p: float = 0.8,
                    eos_token_id = (2, 103028),
                    **kwargs):
        """Same as `chat`, but prints each decoded token to stdout while generating.

        Takes the same arguments as `chat` (except `streamer`, which is
        created internally) and returns the same `(response, history)` pair.
        """
        class ChatStreamer(BaseStreamer):
            """Streamer that prints the last token of every chunk it receives."""

            def __init__(self, tokenizer) -> None:
                super().__init__()
                self.tokenizer = tokenizer

            def put(self, value):
                # Only single-sequence generation is supported.
                if len(value.shape) > 1 and value.shape[0] > 1:
                    raise ValueError("ChatStreamer only supports batch size 1")
                elif len(value.shape) > 1:
                    value = value[0]
                token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
                if token.strip() != "<eoa>":
                    print(token, end="")

            def end(self):
                print("")

        return self.chat(
            tokenizer=tokenizer,
            query=query,
            streamer=ChatStreamer(tokenizer=tokenizer),
            history=history,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            eos_token_id=eos_token_id,
            **kwargs
        )
845
+
846
+
847
@add_start_docstrings(
    """
    The InternLM Model transformer with a sequence classification head on top (linear layer).

    [`InternLMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    INTERNLM_START_DOCSTRING,
)
class InternLMForSequenceClassification(InternLMPreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = InternLMModel(config)
        # Bias-free projection from hidden states to per-label scores.
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = self.score(transformer_outputs[0])

        batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]

        pad_token_id = self.config.pad_token_id
        if pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

        if pad_token_id is not None and input_ids is not None:
            # Per row: (number of non-pad tokens) - 1, used as the pooling index.
            sequence_lengths = (torch.ne(input_ids, pad_token_id).sum(-1) - 1).to(logits.device)
        else:
            # Without a pad id or without input_ids, pool the last position.
            sequence_lengths = -1

        row_index = torch.arange(batch_size, device=logits.device)
        pooled_logits = logits[row_index, sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)

            # Pick the problem type once and store it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(pooled_logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(pooled_logits, labels)

        if return_dict:
            return SequenceClassifierOutputWithPast(
                loss=loss,
                logits=pooled_logits,
                past_key_values=transformer_outputs.past_key_values,
                hidden_states=transformer_outputs.hidden_states,
                attentions=transformer_outputs.attentions,
            )

        output = (pooled_logits,) + transformer_outputs[1:]
        if loss is None:
            return output
        return (loss,) + output
triton_models/tokenizer/placeholder ADDED
File without changes
triton_models/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "</s>",
5
+ "unk_token": "<unk>"
6
+ }
triton_models/tokenizer/tokenization_internlm.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ """Tokenization classes for InternLM."""
22
+ import os
23
+ from shutil import copyfile
24
+ from typing import Any, Dict, List, Optional, Tuple
25
+
26
+ import sentencepiece as spm
27
+
28
+ from transformers.tokenization_utils import PreTrainedTokenizer
29
+ from transformers.utils import logging
30
+
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+ VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
35
+
36
+ PRETRAINED_VOCAB_FILES_MAP = {}
37
+
38
+
39
class InternLMTokenizer(PreTrainedTokenizer):
    """
    Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    model_input_names = ["input_ids", "attention_mask"]
    _auto_class = "AutoTokenizer"

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="</s>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        decode_with_prefix_space=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        """Load the sentencepiece model from `vocab_file` and set up special tokens.

        Args:
            vocab_file: path to the sentencepiece model file.
            unk_token, bos_token, eos_token, pad_token: special token strings
                forwarded to the base class.
            sp_model_kwargs: keyword arguments for `SentencePieceProcessor`;
                `None` means no extra arguments.
            add_bos_token: prepend BOS in `build_inputs_with_special_tokens`.
            add_eos_token: append EOS in `build_inputs_with_special_tokens`.
            decode_with_prefix_space: stored on the instance; not read
                elsewhere in this class.
            clean_up_tokenization_spaces: forwarded to the base class.
        """
        # Everything the properties/methods below read (notably `sp_model`)
        # must exist BEFORE the base constructor runs: some transformers
        # releases query the vocabulary (`vocab_size` / `get_vocab`) from
        # inside `super().__init__()`, which would otherwise fail with an
        # AttributeError on `sp_model`.
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.decode_with_prefix_space = decode_with_prefix_space
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        self._no_prefix_space_tokens = None
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    @property
    def no_prefix_space_tokens(self):
        """Set of vocabulary indices whose piece does not start with "▁" (computed lazily)."""
        if self._no_prefix_space_tokens is None:
            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
        return self._no_prefix_space_tokens

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    @property
    def bos_token_id(self) -> Optional[int]:
        """BOS id as reported by the sentencepiece model."""
        return self.sp_model.bos_id()

    @property
    def eos_token_id(self) -> Optional[int]:
        """EOS id as reported by the sentencepiece model."""
        return self.sp_model.eos_id()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def _maybe_add_prefix_space(self, tokens, decoded):
        """Prepend a space to `decoded` unless `tokens[0]` is in `no_prefix_space_tokens`."""
        if tokens and tokens[0] not in self.no_prefix_space_tokens:
            return " " + decoded
        else:
            return decoded

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        out_string = self.clean_up_tokenization(out_string)
        # NOTE(review): `tokens` holds strings here while `no_prefix_space_tokens`
        # holds integer indices, so for non-empty input the helper always
        # prepends a space, which the `[1:]` slice then removes again.
        # Behaviour is intentionally left unchanged.
        out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
        return out_string[1:]

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                Prefix joined to the vocabulary file name with a "-".

        Returns:
            `Tuple(str)`: Paths to the files saved. `None` is returned (after logging an error) when
            `save_directory` is not a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        # The configured name is "./tokenizer.model"; take its basename so that a
        # prefix yields "<prefix>-tokenizer.model" rather than a path inside a
        # non-existent "<prefix>-." directory.
        vocab_name = os.path.basename(VOCAB_FILES_NAMES["vocab_file"])
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + vocab_name
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # Original file is gone: serialize the in-memory model instead.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Return `[BOS?] + token_ids_0 (+ token_ids_1) + [EOS?]` per `add_bos_token` / `add_eos_token`."""
        if self.add_bos_token:
            bos_token_ids = [self.bos_token_id]
        else:
            bos_token_ids = []

        output = bos_token_ids + token_ids_0

        if token_ids_1 is not None:
            output = output + token_ids_1

        if self.add_eos_token:
            output = output + [self.eos_token_id]

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Mirror `build_inputs_with_special_tokens`: BOS/EOS only appear when
        # the matching flag is set, and nothing is inserted between a pair.
        bos_mask = [1] if self.add_bos_token else []
        eos_mask = [1] if self.add_eos_token else []
        body_mask = [0] * len(token_ids_0)
        if token_ids_1 is not None:
            body_mask = body_mask + [0] * len(token_ids_1)
        return bos_mask + body_mask + eos_mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create the token type ids for a sequence or a sequence pair. No distinct token types are produced, so a
        list of zeros is returned, one per token emitted by `build_inputs_with_special_tokens`.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        # Derive the length from the real builder so both always agree.
        return [0] * len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1))
triton_models/tokenizer/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aab622d98c98677a1a51f969e25765154487bf3e85c7819db105db2fcacba83f
3
+ size 1658691
triton_models/tokenizer/tokenizer.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import json
3
+ import os.path as osp
4
+ from typing import Optional, Sequence, Union
5
+
6
+ import torch
7
+
8
+
9
class SentencePieceTokenizer:
    """Wrapper exposing a sentencepiece model through a small tokenizer API.

    Args:
        model_file (str): the path of the tokenizer model
    """

    def __init__(self, model_file: str):
        from sentencepiece import SentencePieceProcessor
        self._no_prefix_space_tokens = None
        self.model = SentencePieceProcessor(model_file=model_file)

    @property
    def vocab_size(self):
        """Number of pieces in the vocabulary."""
        return self.model.vocab_size()

    @property
    def bos_token_id(self):
        """Id of the begin-of-sentence token."""
        return self.model.bos_id()

    @property
    def eos_token_id(self):
        """Id of the end-of-sentence token."""
        return self.model.eos_id()

    @property
    def no_prefix_space_tokens(self):
        """Ids of the pieces that do not start with '▁' (built on first use)."""
        cached = self._no_prefix_space_tokens
        if cached is None:
            pieces = self.model.IdToPiece(list(range(self.vocab_size)))
            cached = set()
            for piece_id, piece in enumerate(pieces):
                if not piece.startswith('▁'):
                    cached.add(piece_id)
            self._no_prefix_space_tokens = cached
        return cached

    def _maybe_add_prefix_space(self, tokens, decoded):
        """Prefix `decoded` with a space when the first id is a '▁' piece.

        Used for incremental decoding.
        """
        if len(tokens) and tokens[0] not in self.no_prefix_space_tokens:
            return ' ' + decoded
        return decoded

    def encode(self, s: str):
        """Tokenize a prompt.

        A '<BOS>' marker anywhere in the prompt is stripped and turns on
        `add_bos`; a prompt that is exactly '<EOS>' (after that stripping)
        becomes an empty string encoded with `add_eos`.

        Args:
            s (str): a prompt
        Returns:
            list[int]: token ids
        """
        add_bos = '<BOS>' in s
        if add_bos:
            s = s.replace('<BOS>', '')
        add_eos = s == '<EOS>'
        if add_eos:
            s = ''
        return self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)

    def decode(self, t: Sequence[int], offset: Optional[int] = None):
        """De-tokenize.

        Args:
            t (List[int]): a list of token ids (a `torch.Tensor` is converted
                to a list first)
            offset (int): for incrementally decoding; only `t[offset:]` is
                decoded. Default to None, which means not applied.
        Returns:
            str: text of decoding tokens
        """
        ids = t.tolist() if isinstance(t, torch.Tensor) else t
        ids = ids[offset:]
        text = self.model.Decode(ids)
        if not offset:
            return text
        return self._maybe_add_prefix_space(ids, text)

    def __call__(self, s: Union[str, Sequence[str]]):
        """Tokenize prompts without adding BOS/EOS.

        Args:
            s (str): prompts
        Returns:
            addict.Addict: object whose `input_ids` holds the encoded ids
        """
        import addict

        encoded = self.model.Encode(s, add_bos=False, add_eos=False)
        return addict.Addict(input_ids=encoded)
104
+
105
+
106
class HuggingFaceTokenizer:
    """Tokenizer backed by ``transformers.AutoTokenizer``.

    Args:
        model_dir (str): the directory of the tokenizer model
    """

    def __init__(self, model_dir: str):
        from transformers import (AutoTokenizer, CodeLlamaTokenizerFast,
                                  LlamaTokenizerFast)
        model_file = osp.join(model_dir, 'tokenizer.model')
        backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
        # ``tokenizer.model`` is present but ``tokenizer.json`` is not: warn
        # now and, once loaded, save the backend tokenizer for later reuse.
        export_backend = (not osp.exists(backend_tokenizer_file)
                          and osp.exists(model_file))
        if export_backend:
            print('WARNING: Can not find tokenizer.json. '
                  'It may take long time to initialize the tokenizer.')
        # NOTE(review): trust_remote_code=True lets transformers run Python
        # code shipped in ``model_dir``; only point this at trusted models.
        self.model = AutoTokenizer.from_pretrained(model_dir,
                                                   trust_remote_code=True)
        self.need_padding = isinstance(
            self.model, (LlamaTokenizerFast, CodeLlamaTokenizerFast))
        self._no_prefix_space_tokens = None
        # save tokenizer.json to reuse
        if export_backend and hasattr(self.model, 'backend_tokenizer'):
            self.model.backend_tokenizer.save(backend_tokenizer_file)

        if self.model.eos_token_id is None:
            # Fall back to the id recorded in generation_config.json.
            generation_config_file = osp.join(model_dir,
                                              'generation_config.json')
            with open(generation_config_file, 'r', encoding='utf-8') as f:
                cfg = json.load(f)
            self.model.eos_token_id = cfg['eos_token_id']

    @property
    def vocab_size(self):
        """vocabulary size."""
        return self.model.vocab_size

    @property
    def bos_token_id(self):
        """beginning of the sentence token id."""
        return self.model.bos_token_id

    @property
    def eos_token_id(self):
        """end of the sentence token id."""
        return self.model.eos_token_id

    @property
    def no_prefix_space_tokens(self):
        """ids of the tokens that do not start with the '▁' marker.

        The set is built on first access and cached afterwards.
        """
        if self._no_prefix_space_tokens is None:
            vocab = self.model.convert_ids_to_tokens(
                list(range(self.vocab_size)))
            # Entries that are not ``str`` (e.g. ``None``) cannot carry the
            # marker, so they are counted as "no prefix space" rather than
            # raising on ``.startswith``.
            self._no_prefix_space_tokens = {
                i
                for i, tok in enumerate(vocab)
                if not (isinstance(tok, str) and tok.startswith('▁'))
            }
        return self._no_prefix_space_tokens

    def _maybe_add_prefix_space(self, tokens, decoded):
        """maybe add prefix space for incremental decoding.

        A space is prepended only when ``need_padding`` is set, ``tokens``
        is non-empty and its first id is not in ``no_prefix_space_tokens``.
        """
        if self.need_padding and len(
                tokens) and tokens[0] not in self.no_prefix_space_tokens:
            return ' ' + decoded
        return decoded

    def encode(self, s: str):
        """Tokenize a prompt.

        Every ``<BOS>`` in the prompt is rewritten to ``<s>``, and a prompt
        that is exactly ``<EOS>`` is rewritten to ``</s>``. Special tokens
        are requested from the wrapped tokenizer only for an empty prompt.

        Args:
            s (str): a prompt
        Returns:
            list[int]: token ids
        """
        s = s.replace('<BOS>', '<s>')
        if s == '<EOS>':
            s = '</s>'
        return self.model.encode(s, add_special_tokens=(len(s) == 0))

    def decode(self, t: Sequence[int], offset: Optional[int] = None):
        """De-tokenize.

        Args:
            t (List[int]): a list of token ids
            offset (int): for incrementally decoding. Default to None, which
                means not applied.
        Returns:
            str: text of decoding tokens
        """
        t = t[offset:]
        out_string = self.model.decode(t, skip_special_tokens=True)
        if offset:
            # Only ``t[offset:]`` was decoded, so a leading space may have
            # to be put back in front of the first token.
            out_string = self._maybe_add_prefix_space(t, out_string)
        return out_string

    def __call__(self, s: Union[str, Sequence[str]]):
        """Tokenize prompts.

        Args:
            s (str | Sequence[str]): a prompt or a sequence of prompts
        Returns:
            the object returned by calling the wrapped tokenizer with
            ``add_special_tokens=False``
        """
        return self.model(s, add_special_tokens=False)
219
+
220
+
221
class Tokenizer:
    """Tokenize prompts or de-tokenize tokens into texts.

    Wraps either a ``SentencePieceTokenizer`` or a ``HuggingFaceTokenizer``
    and forwards every call to it.

    Args:
        model_file (str): the path of the tokenizer model. Either a file
            whose name ends with ``.model`` or the directory that holds the
            tokenizer files.
    """

    def __init__(self, model_file: str):
        if model_file.endswith('.model'):
            # A bare file name has an empty dirname; use the current
            # directory so the folder is still a usable path.
            model_folder = osp.dirname(model_file) or '.'
        else:
            model_folder = model_file
            model_file = osp.join(model_folder, 'tokenizer.model')
        tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json')

        # The sentencepiece wrapper is chosen only when ``tokenizer.model``
        # exists and there is no ``tokenizer_config.json`` next to it.
        if osp.exists(tokenizer_config_file) or not osp.exists(model_file):
            self.model = HuggingFaceTokenizer(model_folder)
        else:
            self.model = SentencePieceTokenizer(model_file)

    @property
    def vocab_size(self):
        """vocabulary size."""
        return self.model.vocab_size

    @property
    def bos_token_id(self):
        """beginning of the sentence token id."""
        return self.model.bos_token_id

    @property
    def eos_token_id(self):
        """end of the sentence token id."""
        return self.model.eos_token_id

    def encode(self, s: str):
        """Tokenize a prompt.

        Args:
            s (str): a prompt
        Returns:
            list[int]: token ids
        """
        return self.model.encode(s)

    def decode(self, t: Sequence[int], offset: Optional[int] = None):
        """De-tokenize.

        Args:
            t (List[int]): a list of token ids
            offset (int): for incrementally decoding. Default to None, which
                means not applied.
        Returns:
            str: text of decoding tokens
        """
        return self.model.decode(t, offset)

    def __call__(self, s: Union[str, Sequence[str]]):
        """Tokenize prompts.

        Args:
            s (str | Sequence[str]): a prompt or a sequence of prompts
        Returns:
            the object returned by calling the wrapped tokenizer on ``s``
        """
        return self.model(s)
triton_models/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_internlm.InternLMTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "bos_token": "<s>",
9
+ "clean_up_tokenization_spaces": false,
10
+ "eos_token": "</s>",
11
+ "model_max_length": 1000000000000000019884624838656,
12
+ "pad_token": "</s>",
13
+ "tokenizer_class": "InternLMTokenizer",
14
+ "unk_token": "<unk>"
15
+ }
triton_models/weights/layers.0.attention.w_qkv.0.qweight ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3827a6ab0149c9e641de42919a51a1f208586eb7a27260eb345107bf7c7c411a
3
+ size 25165824
triton_models/weights/layers.0.attention.w_qkv.0.scales_zeros ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cd215d34b2aca208dc6ee86a4df064ed7fdf3ff7c52079262f7f23cb3aa30b9
3
+ size 1572864
triton_models/weights/layers.0.attention.wo.0.bias ADDED
Binary file (8.19 kB). View file
 
triton_models/weights/layers.0.attention.wo.0.qweight ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c23a5d415c07c7c0dd13800e41becec9010f42c7d031e7095afd0031f5ad906a
3
+ size 8388608
triton_models/weights/layers.0.attention.wo.0.scales_zeros ADDED
Binary file (524 kB). View file
 
triton_models/weights/layers.0.ffn_norm.weight ADDED
Binary file (8.19 kB). View file
 
triton_models/weights/layers.1.attention.w_qkv.0.qweight ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50f824cf40b3ca3adc59b6ebcd218a953c1a84d0386025e4dd1fd29908b24e96
3
+ size 25165824
triton_models/weights/layers.1.attention.wo.0.scales_zeros ADDED
Binary file (524 kB). View file
 
triton_models/weights/layers.1.feed_forward.w2.0.scales_zeros ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70e37d60fbc0641121371b2eaa517b2ab00257a70c8fc36494b444fc90bcd1f0
3
+ size 1409024
triton_models/weights/layers.1.ffn_norm.weight ADDED
Binary file (8.19 kB). View file
 
triton_models/weights/layers.10.attention.w_qkv.0.scales_zeros ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:077aa0912226ff7fb9acc7c4b40b4d13725861e23ec09cb231b776e57e2d678e
3
+ size 1572864
triton_models/weights/layers.10.attention.wo.0.scales_zeros ADDED
Binary file (524 kB). View file
 
triton_models/weights/layers.10.feed_forward.w13.0.qweight ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8821f0c6176d9ec59cdeb94793cd0160383e04a6279558cdf3b8b94d87539c56
3
+ size 45088768
triton_models/weights/layers.11.attention.wo.0.scales_zeros ADDED
Binary file (524 kB). View file
 
triton_models/weights/layers.11.feed_forward.w13.0.qweight ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acbf7a75466069cbdf40e474371e4b752d4225c915cc4da1e3064aca817c9b25
3
+ size 45088768
triton_models/weights/layers.12.attention.w_qkv.0.bias ADDED
Binary file (24.6 kB). View file
 
triton_models/weights/layers.12.attention.wo.0.qweight ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85efcdf14534ce13cb5efeff1e29a7ffba9d9eb9d62f9aba733a7c46a5ed6c13
3
+ size 8388608
triton_models/weights/layers.13.attention.w_qkv.0.bias ADDED
Binary file (24.6 kB). View file
 
triton_models/weights/layers.13.attention.wo.0.scales_zeros ADDED
Binary file (524 kB). View file
 
triton_models/weights/layers.13.feed_forward.w13.0.qweight ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f87466cf9734d8896ba1f62405ec89e221432ac04fe6e4cc3cf947d9d4cdb008
3
+ size 45088768
triton_models/weights/layers.13.feed_forward.w13.0.scales_zeros ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e381ca5b8410eb0f4e35baafa862a25bfbf6b4f7ac6d58535f1517dd6ed7ee56
3
+ size 2818048
triton_models/weights/layers.13.feed_forward.w2.0.qweight ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d86c606b9f58c9a1d1fe4c62166a848b7346451f109bc4f92110344c8f6e5e14
3
+ size 22544384
triton_models/weights/layers.14.attention.w_qkv.0.scales_zeros ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb8979d01c3adb98f40baaa31c64aab65a2b2f20217418f40eeef8c9cbb479c2
3
+ size 1572864
triton_models/weights/layers.14.attention.wo.0.qweight ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ed60486dcf2b9a2f6353ffc3d9770fefcb7b1536d1a0d6bbe0e73f1314b509a
3
+ size 8388608
triton_models/weights/layers.14.attention.wo.0.scales_zeros ADDED
Binary file (524 kB). View file
 
triton_models/weights/layers.14.feed_forward.w2.0.qweight ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5283e8d9834135c3a91261f7f5a23a33b12bdcdfa4da64701855761d41b7e2f1
3
+ size 22544384
triton_models/weights/layers.14.ffn_norm.weight ADDED
Binary file (8.19 kB). View file
 
triton_models/weights/layers.15.attention.w_qkv.0.qweight ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9da979d36ca2cf6d7fa23886a477609f88e03dc6ae4b27689cc37c9aec2d9a0e
3
+ size 25165824
triton_models/weights/layers.15.attention.w_qkv.0.scales_zeros ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:932a4c5302ddc985c3ad90e697ad2b2c554db1c99b6a72635c55c82f8b161d7d
3
+ size 1572864
triton_models/weights/layers.15.attention_norm.weight ADDED
Binary file (8.19 kB). View file
 
triton_models/weights/layers.15.feed_forward.w2.0.scales_zeros ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1c9cd36454fdb977e3fdd95b6a56f68cabe3fdb631c79f4f99594f73be47d76
3
+ size 1409024
triton_models/weights/layers.16.attention.w_qkv.0.qweight ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8b4ed1cf80e4810ff8e4be00484a0f4d7f0373d42f3959510b60c9ac683c643
3
+ size 25165824