chua committed on
Commit
435ac83
·
verified ·
1 Parent(s): 52562da

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -35,4 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  figures/GPA.png filter=lfs diff=lfs merge=lfs -text
37
  figures/GPA_intro.png filter=lfs diff=lfs merge=lfs -text
38
- gpa_0_3b/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  figures/GPA.png filter=lfs diff=lfs merge=lfs -text
37
  figures/GPA_intro.png filter=lfs diff=lfs merge=lfs -text
38
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
BiCodec/README.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: multilingual
3
+ datasets:
4
+ - common_voice
5
+ tags:
6
+ - speech
7
+ license: apache-2.0
8
+ ---
9
+
10
+ # Wav2Vec2-XLSR-53
11
+
12
+ [Facebook's XLSR-Wav2Vec2](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/)
13
+
14
+ This is the base model, pretrained on speech audio sampled at 16 kHz. When using the model, make sure that your speech input is also sampled at 16 kHz. Note that this model should be fine-tuned on a downstream task, such as Automatic Speech Recognition. Check out [this blog](https://huggingface.co/blog/fine-tune-wav2vec2-english) for more information.
15
+
16
+ [Paper](https://arxiv.org/abs/2006.13979)
17
+
18
+ Authors: Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli
19
+
20
+ **Abstract**
21
+ This paper presents XLSR which learns cross-lingual speech representations by pretraining a single model from the raw waveform of speech in multiple languages. We build on wav2vec 2.0 which is trained by solving a contrastive task over masked latent speech representations and jointly learns a quantization of the latents shared across languages. The resulting model is fine-tuned on labeled data and experiments show that cross-lingual pretraining significantly outperforms monolingual pretraining. On the CommonVoice benchmark, XLSR shows a relative phoneme error rate reduction of 72% compared to the best known results. On BABEL, our approach improves word error rate by 16% relative compared to a comparable system. Our approach enables a single multilingual speech recognition model which is competitive to strong individual models. Analysis shows that the latent discrete speech representations are shared across languages with increased sharing for related languages. We hope to catalyze research in low-resource speech understanding by releasing XLSR-53, a large model pretrained in 53 languages.
22
+
23
+ The original model can be found under https://github.com/pytorch/fairseq/tree/master/examples/wav2vec#wav2vec-20.
24
+
25
+ # Usage
26
+
27
+ See [this notebook](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_Tune_XLSR_Wav2Vec2_on_Turkish_ASR_with_%F0%9F%A4%97_Transformers.ipynb) for more information on how to fine-tune the model.
28
+
29
+ ![model image](https://raw.githubusercontent.com/patrickvonplaten/scientific_images/master/xlsr_wav2vec2.png)
BiCodec/config.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "Wav2Vec2ForPreTraining"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 1,
9
+ "codevector_dim": 768,
10
+ "contrastive_logits_temperature": 0.1,
11
+ "conv_bias": true,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "diversity_loss_weight": 0.1,
42
+ "do_stable_layer_norm": true,
43
+ "eos_token_id": 2,
44
+ "feat_extract_activation": "gelu",
45
+ "feat_extract_dropout": 0.0,
46
+ "feat_extract_norm": "layer",
47
+ "feat_proj_dropout": 0.1,
48
+ "feat_quantizer_dropout": 0.0,
49
+ "final_dropout": 0.0,
50
+ "gradient_checkpointing": false,
51
+ "hidden_act": "gelu",
52
+ "hidden_dropout": 0.1,
53
+ "hidden_size": 1024,
54
+ "initializer_range": 0.02,
55
+ "intermediate_size": 4096,
56
+ "layer_norm_eps": 1e-05,
57
+ "layerdrop": 0.1,
58
+ "mask_channel_length": 10,
59
+ "mask_channel_min_space": 1,
60
+ "mask_channel_other": 0.0,
61
+ "mask_channel_prob": 0.0,
62
+ "mask_channel_selection": "static",
63
+ "mask_feature_length": 10,
64
+ "mask_feature_prob": 0.0,
65
+ "mask_time_length": 10,
66
+ "mask_time_min_space": 1,
67
+ "mask_time_other": 0.0,
68
+ "mask_time_prob": 0.075,
69
+ "mask_time_selection": "static",
70
+ "model_type": "wav2vec2",
71
+ "num_attention_heads": 16,
72
+ "num_codevector_groups": 2,
73
+ "num_codevectors_per_group": 320,
74
+ "num_conv_pos_embedding_groups": 16,
75
+ "num_conv_pos_embeddings": 128,
76
+ "num_feat_extract_layers": 7,
77
+ "num_hidden_layers": 24,
78
+ "num_negatives": 100,
79
+ "pad_token_id": 0,
80
+ "proj_codevector_dim": 768,
81
+ "transformers_version": "4.7.0.dev0",
82
+ "vocab_size": 32
83
+ }
BiCodec/config.yaml ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio_tokenizer:
2
+ mel_params:
3
+ sample_rate: 16000
4
+ n_fft: 1024
5
+ win_length: 640
6
+ hop_length: 320
7
+ mel_fmin: 10
8
+ mel_fmax: null
9
+ num_mels: 128
10
+
11
+ encoder:
12
+ input_channels: 1024
13
+ vocos_dim: 384
14
+ vocos_intermediate_dim: 2048
15
+ vocos_num_layers: 12
16
+ out_channels: 1024
17
+ sample_ratios: [1,1]
18
+
19
+ decoder:
20
+ input_channel: 1024
21
+ channels: 1536
22
+ rates: [8, 5, 4, 2]
23
+ kernel_sizes: [16,11,8,4]
24
+
25
+ quantizer:
26
+ input_dim: 1024
27
+ codebook_size: 8192
28
+ codebook_dim: 8
29
+ commitment: 0.25
30
+ codebook_loss_weight: 2.0
31
+ use_l2_normlize: True
32
+ threshold_ema_dead_code: 0.2
33
+
34
+ speaker_encoder:
35
+ input_dim: 128
36
+ out_dim: 1024
37
+ latent_dim: 128
38
+ token_num: 32
39
+ fsq_levels: [4, 4, 4, 4, 4, 4]
40
+ fsq_num_quantizers: 1
41
+
42
+ prenet:
43
+ input_channels: 1024
44
+ vocos_dim: 384
45
+ vocos_intermediate_dim: 2048
46
+ vocos_num_layers: 12
47
+ out_channels: 1024
48
+ condition_dim: 1024
49
+ sample_ratios: [1,1]
50
+ use_tanh_at_final: False
51
+
52
+ postnet:
53
+ input_channels: 1024
54
+ vocos_dim: 384
55
+ vocos_intermediate_dim: 2048
56
+ vocos_num_layers: 6
57
+ out_channels: 1024
58
+ use_tanh_at_final: False
59
+ highpass_cutoff_freq: 40
60
+ sample_rate: 16000
61
+ segment_duration: 2.4 # (s)
62
+ max_val_duration: 12 # (s)
63
+ latent_hop_length: 320
64
+ ref_segment_duration: 6
65
+ volume_normalize: true
66
+
BiCodec/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9940cd48d4446e4340ced82d234bf5618350dd9f5db900ebe47a4fdb03867ec
3
+ size 625518756
BiCodec/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
BiCodec/wav2vec2-large-xlsr-53/README.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: multilingual
3
+ datasets:
4
+ - common_voice
5
+ tags:
6
+ - speech
7
+ license: apache-2.0
8
+ ---
9
+
10
+ # Wav2Vec2-XLSR-53
11
+
12
+ [Facebook's XLSR-Wav2Vec2](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/)
13
+
14
+ This is the base model, pretrained on speech audio sampled at 16 kHz. When using the model, make sure that your speech input is also sampled at 16 kHz. Note that this model should be fine-tuned on a downstream task, such as Automatic Speech Recognition. Check out [this blog](https://huggingface.co/blog/fine-tune-wav2vec2-english) for more information.
15
+
16
+ [Paper](https://arxiv.org/abs/2006.13979)
17
+
18
+ Authors: Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli
19
+
20
+ **Abstract**
21
+ This paper presents XLSR which learns cross-lingual speech representations by pretraining a single model from the raw waveform of speech in multiple languages. We build on wav2vec 2.0 which is trained by solving a contrastive task over masked latent speech representations and jointly learns a quantization of the latents shared across languages. The resulting model is fine-tuned on labeled data and experiments show that cross-lingual pretraining significantly outperforms monolingual pretraining. On the CommonVoice benchmark, XLSR shows a relative phoneme error rate reduction of 72% compared to the best known results. On BABEL, our approach improves word error rate by 16% relative compared to a comparable system. Our approach enables a single multilingual speech recognition model which is competitive to strong individual models. Analysis shows that the latent discrete speech representations are shared across languages with increased sharing for related languages. We hope to catalyze research in low-resource speech understanding by releasing XLSR-53, a large model pretrained in 53 languages.
22
+
23
+ The original model can be found under https://github.com/pytorch/fairseq/tree/master/examples/wav2vec#wav2vec-20.
24
+
25
+ # Usage
26
+
27
+ See [this notebook](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_Tune_XLSR_Wav2Vec2_on_Turkish_ASR_with_%F0%9F%A4%97_Transformers.ipynb) for more information on how to fine-tune the model.
28
+
29
+ ![model image](https://raw.githubusercontent.com/patrickvonplaten/scientific_images/master/xlsr_wav2vec2.png)
BiCodec/wav2vec2-large-xlsr-53/config.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "Wav2Vec2ForPreTraining"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 1,
9
+ "codevector_dim": 768,
10
+ "contrastive_logits_temperature": 0.1,
11
+ "conv_bias": true,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "diversity_loss_weight": 0.1,
42
+ "do_stable_layer_norm": true,
43
+ "eos_token_id": 2,
44
+ "feat_extract_activation": "gelu",
45
+ "feat_extract_dropout": 0.0,
46
+ "feat_extract_norm": "layer",
47
+ "feat_proj_dropout": 0.1,
48
+ "feat_quantizer_dropout": 0.0,
49
+ "final_dropout": 0.0,
50
+ "gradient_checkpointing": false,
51
+ "hidden_act": "gelu",
52
+ "hidden_dropout": 0.1,
53
+ "hidden_size": 1024,
54
+ "initializer_range": 0.02,
55
+ "intermediate_size": 4096,
56
+ "layer_norm_eps": 1e-05,
57
+ "layerdrop": 0.1,
58
+ "mask_channel_length": 10,
59
+ "mask_channel_min_space": 1,
60
+ "mask_channel_other": 0.0,
61
+ "mask_channel_prob": 0.0,
62
+ "mask_channel_selection": "static",
63
+ "mask_feature_length": 10,
64
+ "mask_feature_prob": 0.0,
65
+ "mask_time_length": 10,
66
+ "mask_time_min_space": 1,
67
+ "mask_time_other": 0.0,
68
+ "mask_time_prob": 0.075,
69
+ "mask_time_selection": "static",
70
+ "model_type": "wav2vec2",
71
+ "num_attention_heads": 16,
72
+ "num_codevector_groups": 2,
73
+ "num_codevectors_per_group": 320,
74
+ "num_conv_pos_embedding_groups": 16,
75
+ "num_conv_pos_embeddings": 128,
76
+ "num_feat_extract_layers": 7,
77
+ "num_hidden_layers": 24,
78
+ "num_negatives": 100,
79
+ "pad_token_id": 0,
80
+ "proj_codevector_dim": 768,
81
+ "transformers_version": "4.7.0.dev0",
82
+ "vocab_size": 32
83
+ }
BiCodec/wav2vec2-large-xlsr-53/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
BiCodec/wav2vec2-large-xlsr-53/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:314340227371a608f71adcd5f0de5933824fe77e55822aa4b24dba9c1c364dcb
3
+ size 1269737156
added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
chat_template.jinja ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
27
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
28
+ {%- elif message.role == "assistant" %}
29
+ {%- set content = message.content %}
30
+ {%- set reasoning_content = '' %}
31
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
32
+ {%- set reasoning_content = message.reasoning_content %}
33
+ {%- else %}
34
+ {%- if '</think>' in message.content %}
35
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
36
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
37
+ {%- endif %}
38
+ {%- endif %}
39
+ {%- if loop.index0 > ns.last_query_index %}
40
+ {%- if loop.last or (not loop.last and reasoning_content) %}
41
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
42
+ {%- else %}
43
+ {{- '<|im_start|>' + message.role + '\n' + content }}
44
+ {%- endif %}
45
+ {%- else %}
46
+ {{- '<|im_start|>' + message.role + '\n' + content }}
47
+ {%- endif %}
48
+ {%- if message.tool_calls %}
49
+ {%- for tool_call in message.tool_calls %}
50
+ {%- if (loop.first and content) or (not loop.first) %}
51
+ {{- '\n' }}
52
+ {%- endif %}
53
+ {%- if tool_call.function %}
54
+ {%- set tool_call = tool_call.function %}
55
+ {%- endif %}
56
+ {{- '<tool_call>\n{"name": "' }}
57
+ {{- tool_call.name }}
58
+ {{- '", "arguments": ' }}
59
+ {%- if tool_call.arguments is string %}
60
+ {{- tool_call.arguments }}
61
+ {%- else %}
62
+ {{- tool_call.arguments | tojson }}
63
+ {%- endif %}
64
+ {{- '}\n</tool_call>' }}
65
+ {%- endfor %}
66
+ {%- endif %}
67
+ {{- '<|im_end|>\n' }}
68
+ {%- elif message.role == "tool" %}
69
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
70
+ {{- '<|im_start|>user' }}
71
+ {%- endif %}
72
+ {{- '\n<tool_response>\n' }}
73
+ {{- message.content }}
74
+ {{- '\n</tool_response>' }}
75
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
76
+ {{- '<|im_end|>\n' }}
77
+ {%- endif %}
78
+ {%- endif %}
79
+ {%- endfor %}
80
+ {%- if add_generation_prompt %}
81
+ {{- '<|im_start|>assistant\n' }}
82
+ {%- if enable_thinking is defined and enable_thinking is false %}
83
+ {{- '<think>\n\n</think>\n\n' }}
84
+ {%- endif %}
85
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_qwen3.Qwen3Config",
7
+ "AutoModelForCausalLM": "modeling_qwen3.Qwen3ForCausalLM"
8
+ },
9
+ "attention_bias": false,
10
+ "attention_dropout": 0.0,
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 151643,
13
+ "head_dim": 128,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 512,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 3072,
18
+ "layer_types": [
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention"
47
+ ],
48
+ "max_position_embeddings": 32768,
49
+ "max_window_layers": 28,
50
+ "model_type": "qwen3",
51
+ "num_attention_heads": 16,
52
+ "num_hidden_layers": 28,
53
+ "num_key_value_heads": 8,
54
+ "pad_token_id": 151643,
55
+ "rms_norm_eps": 1e-06,
56
+ "rope_scaling": null,
57
+ "rope_theta": 1000000,
58
+ "sliding_window": null,
59
+ "tie_word_embeddings": true,
60
+ "transformers_version": "4.57.3",
61
+ "use_cache": false,
62
+ "use_sliding_window": false,
63
+ "vocab_size": 180445
64
+ }
config.json.bak ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_qwen3.Qwen3Config",
7
+ "AutoModelForCausalLM": "modeling_qwen3.Qwen3ForCausalLM"
8
+ },
9
+ "attention_bias": false,
10
+ "attention_dropout": 0.0,
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 151643,
13
+ "head_dim": 128,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 512,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 3072,
18
+ "layer_types": [
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention"
47
+ ],
48
+ "max_position_embeddings": 32768,
49
+ "max_window_layers": 28,
50
+ "model_type": "qwen3",
51
+ "num_attention_heads": 16,
52
+ "num_hidden_layers": 28,
53
+ "num_key_value_heads": 8,
54
+ "pad_token_id": 151643,
55
+ "rms_norm_eps": 1e-06,
56
+ "rope_scaling": null,
57
+ "rope_theta": 1000000,
58
+ "sliding_window": null,
59
+ "tie_word_embeddings": true,
60
+ "transformers_version": "4.57.3",
61
+ "use_cache": false,
62
+ "use_sliding_window": false,
63
+ "vocab_size": 180445
64
+ }
configuration_qwen3.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Qwen3 model configuration"""
16
+
17
+ from transformers.configuration_utils import PretrainedConfig, layer_type_validation
18
+ from transformers.modeling_rope_utils import rope_config_validation
19
+ from transformers.utils import logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ class Qwen3Config(PretrainedConfig):
26
+ r"""
27
+ This is the configuration class to store the configuration of a [`Qwen3Model`]. It is used to instantiate a
28
+ Qwen3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
29
+ with the defaults will yield a similar configuration to that of
30
+ Qwen3-8B [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B).
31
+
32
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
33
+ documentation from [`PretrainedConfig`] for more information.
34
+
35
+
36
+ Args:
37
+ vocab_size (`int`, *optional*, defaults to 151936):
38
+ Vocabulary size of the Qwen3 model. Defines the number of different tokens that can be represented by the
39
+ `input_ids` passed when calling [`Qwen3Model`]
40
+ hidden_size (`int`, *optional*, defaults to 4096):
41
+ Dimension of the hidden representations.
42
+ intermediate_size (`int`, *optional*, defaults to 22016):
43
+ Dimension of the MLP representations.
44
+ num_hidden_layers (`int`, *optional*, defaults to 32):
45
+ Number of hidden layers in the Transformer encoder.
46
+ num_attention_heads (`int`, *optional*, defaults to 32):
47
+ Number of attention heads for each attention layer in the Transformer encoder.
48
+ num_key_value_heads (`int`, *optional*, defaults to 32):
49
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
50
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
51
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
52
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
53
+ by meanpooling all the original heads within that group. For more details, check out [this
54
+ paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
55
+ head_dim (`int`, *optional*, defaults to 128):
56
+ The attention head dimension.
57
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
58
+ The non-linear activation function (function or string) in the decoder.
59
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
60
+ The maximum sequence length that this model might ever be used with.
61
+ initializer_range (`float`, *optional*, defaults to 0.02):
62
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
63
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
64
+ The epsilon used by the rms normalization layers.
65
+ use_cache (`bool`, *optional*, defaults to `True`):
66
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
67
+ relevant if `config.is_decoder=True`.
68
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
69
+ Whether the model's input and output word embeddings should be tied.
70
+ rope_theta (`float`, *optional*, defaults to 10000.0):
71
+ The base period of the RoPE embeddings.
72
+ rope_scaling (`Dict`, *optional*):
73
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
74
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
75
+ accordingly.
76
+ Expected contents:
77
+ `rope_type` (`str`):
78
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
79
+ 'llama3'], with 'default' being the original RoPE implementation.
80
+ `factor` (`float`, *optional*):
81
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
82
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
83
+ original maximum pre-trained length.
84
+ `original_max_position_embeddings` (`int`, *optional*):
85
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
86
+ pretraining.
87
+ `attention_factor` (`float`, *optional*):
88
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
89
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
90
+ `factor` field to infer the suggested value.
91
+ `beta_fast` (`float`, *optional*):
92
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
93
+ ramp function. If unspecified, it defaults to 32.
94
+ `beta_slow` (`float`, *optional*):
95
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
96
+ ramp function. If unspecified, it defaults to 1.
97
+ `short_factor` (`list[float]`, *optional*):
98
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
99
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
100
+ size divided by the number of attention heads divided by 2
101
+ `long_factor` (`list[float]`, *optional*):
102
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (>
103
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
104
+ size divided by the number of attention heads divided by 2
105
+ `low_freq_factor` (`float`, *optional*):
106
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
107
+ `high_freq_factor` (`float`, *optional*):
108
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
109
+ attention_bias (`bool`, *optional*, defaults to `False`):
110
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
111
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
112
+ Whether to use sliding window attention.
113
+ sliding_window (`int`, *optional*, defaults to 4096):
114
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
115
+ max_window_layers (`int`, *optional*, defaults to 28):
116
+ The number of layers using full attention. The first `max_window_layers` layers will use full attention, while any
117
+ additional layer afterwards will use SWA (Sliding Window Attention).
118
+ layer_types (`list`, *optional*):
119
+ Attention pattern for each layer.
120
+ attention_dropout (`float`, *optional*, defaults to 0.0):
121
+ The dropout ratio for the attention probabilities.
122
+
123
+ ```python
124
+ >>> from transformers import Qwen3Model, Qwen3Config
125
+
126
+ >>> # Initializing a Qwen3 style configuration
127
+ >>> configuration = Qwen3Config()
128
+
129
+ >>> # Initializing a model from the Qwen3-8B style configuration
130
+ >>> model = Qwen3Model(configuration)
131
+
132
+ >>> # Accessing the model configuration
133
+ >>> configuration = model.config
134
+ ```"""
135
+
136
+ model_type = "qwen3"
137
+ keys_to_ignore_at_inference = ["past_key_values"]
138
+
139
+ # Default tensor parallel plan for base model `Qwen3`
140
+ base_model_tp_plan = {
141
+ "layers.*.self_attn.q_proj": "colwise",
142
+ "layers.*.self_attn.k_proj": "colwise",
143
+ "layers.*.self_attn.v_proj": "colwise",
144
+ "layers.*.self_attn.o_proj": "rowwise",
145
+ "layers.*.mlp.gate_proj": "colwise",
146
+ "layers.*.mlp.up_proj": "colwise",
147
+ "layers.*.mlp.down_proj": "rowwise",
148
+ }
149
+ base_model_pp_plan = {
150
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
151
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
152
+ "norm": (["hidden_states"], ["hidden_states"]),
153
+ }
154
+
155
+ def __init__(
156
+ self,
157
+ vocab_size=151936,
158
+ hidden_size=4096,
159
+ intermediate_size=22016,
160
+ num_hidden_layers=32,
161
+ num_attention_heads=32,
162
+ num_key_value_heads=32,
163
+ head_dim=128,
164
+ hidden_act="silu",
165
+ max_position_embeddings=32768,
166
+ initializer_range=0.02,
167
+ rms_norm_eps=1e-6,
168
+ use_cache=True,
169
+ tie_word_embeddings=False,
170
+ rope_theta=10000.0,
171
+ rope_scaling=None,
172
+ attention_bias=False,
173
+ use_sliding_window=False,
174
+ sliding_window=4096,
175
+ max_window_layers=28,
176
+ layer_types=None,
177
+ attention_dropout=0.0,
178
+ **kwargs,
179
+ ):
180
+ self.vocab_size = vocab_size
181
+ self.max_position_embeddings = max_position_embeddings
182
+ self.hidden_size = hidden_size
183
+ self.intermediate_size = intermediate_size
184
+ self.num_hidden_layers = num_hidden_layers
185
+ self.num_attention_heads = num_attention_heads
186
+ self.use_sliding_window = use_sliding_window
187
+ self.sliding_window = sliding_window if self.use_sliding_window else None
188
+ self.max_window_layers = max_window_layers
189
+
190
+ # for backward compatibility
191
+ if num_key_value_heads is None:
192
+ num_key_value_heads = num_attention_heads
193
+
194
+ self.num_key_value_heads = num_key_value_heads
195
+ self.head_dim = head_dim
196
+ self.hidden_act = hidden_act
197
+ self.initializer_range = initializer_range
198
+ self.rms_norm_eps = rms_norm_eps
199
+ self.use_cache = use_cache
200
+ self.rope_theta = rope_theta
201
+ self.rope_scaling = rope_scaling
202
+ self.attention_bias = attention_bias
203
+ self.attention_dropout = attention_dropout
204
+ # Validate the correctness of rotary position embeddings parameters
205
+ # BC: if there is a 'type' field, move it to 'rope_type'.
206
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
207
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
208
+ rope_config_validation(self)
209
+
210
+ self.layer_types = layer_types
211
+ if self.layer_types is None:
212
+ self.layer_types = [
213
+ "sliding_attention"
214
+ if self.sliding_window is not None and i >= self.max_window_layers
215
+ else "full_attention"
216
+ for i in range(self.num_hidden_layers)
217
+ ]
218
+ layer_type_validation(self.layer_types, self.num_hidden_layers)
219
+
220
+ super().__init__(
221
+ tie_word_embeddings=tie_word_embeddings,
222
+ **kwargs,
223
+ )
224
+
225
+
226
+ __all__ = ["Qwen3Config"]
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": [
4
+ 151643
5
+ ],
6
+ "pad_token_id": 151643,
7
+ "transformers_version": "4.57.3"
8
+ }
glm-4-voice-tokenizer/.mdl ADDED
Binary file (52 Bytes). View file
 
glm-4-voice-tokenizer/.msc ADDED
Binary file (440 Bytes). View file
 
glm-4-voice-tokenizer/.mv ADDED
@@ -0,0 +1 @@
 
 
1
+ Revision:master,CreatedAt:1729826962
glm-4-voice-tokenizer/LICENSE ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The glm-4-voice License
2
+
3
+ 1. 定义
4
+
5
+ “许可方”是指分发其软件的 glm-4-voice 模型团队。
6
+ “软件”是指根据本许可提供的 glm-4-voice 模型参数。
7
+
8
+ 2. 许可授予
9
+
10
+ 根据本许可的条款和条件,许可方特此授予您非排他性、全球性、不可转让、不可再许可、可撤销、免版税的版权许可。
11
+ 本许可允许您免费使用本仓库中的所有开源模型进行学术研究,对于希望将模型用于商业目的的用户,需在[这里](https://open.bigmodel.cn/mla/form)完成登记。经过登记的用户可以免费使用本模型进行商业活动,但必须遵守本许可的所有条款和条件。
12
+ 上述版权声明和本许可声明应包含在本软件的所有副本或重要部分中。
13
+ 如果您分发或提供 THUDM / 智谱AI 关于 glm-4 开源模型的材料(或其任何衍生作品),或使用其中任何材料(包括 glm-4 系列的所有开源模型)的产品或服务,您应:
14
+
15
+ (A) 随任何此类 THUDM / 智谱AI 材料提供本协议的副本;
16
+ (B) 在相关网站、用户界面、博客文章、关于页面或产品文档上突出显示 “Built with glm-4”。
17
+ 如果您使用 THUDM / 智谱AI的 glm-4 开源模型的材料来创建、训练、微调或以其他方式改进已分发或可用的 AI 模型,您还应在任何此类 AI 模型名称的开头添加 “glm-4”。
18
+
19
+ 3. 限制
20
+
21
+ 您不得出于任何军事或非法目的使用、复制、修改、合并、发布、分发、复制或创建本软件的全部或部分衍生作品。
22
+ 您不得利用本软件从事任何危害国家安全和国家统一,危害社会公共利益及公序良俗,侵犯他人商业秘密、知识产权、名誉权、肖像权、财产权等权益的行为。
23
+ 您在使用中应遵循使用地所适用的法律法规政策、道德规范等要求。
24
+
25
+ 4. 免责声明
26
+
27
+ 本软件“按原样”提供,不提供任何明示或暗示的保证,包括但不限于对适销性、特定用途的适用性和非侵权性的保证。
28
+ 在任何情况下,作者或版权持有人均不对任何索赔、损害或其他责任负责,无论是在合同诉讼、侵权行为还是其他方面,由软件或软件的使用或其他交易引起、由软件引起或与之相关
29
+ 软件。
30
+
31
+ 5. 责任限制
32
+
33
+ 除适用法律禁止的范围外,在任何情况下且根据任何法律理论,无论是基于侵权行为、疏忽、合同、责任或其他原因,任何许可方均不对您承担任何直接、间接、特殊、偶然、示范性、
34
+ 或间接损害,或任何其他商业损失,即使许可人已被告知此类损害的可能性。
35
+
36
+ 6. 争议解决
37
+
38
+ 本许可受中华人民共和国法律管辖并按其解释。 因本许可引起的或与本许可有关的任何争议应提交北京市海淀区人民法院。
39
+ 请注意,许可证可能会更新到更全面的版本。 有关许可和版权的任何问题,请通过 license@zhipuai.cn 与我们联系。
40
+ 1. Definitions
41
+ “Licensor” means the glm-4-voice Model Team that distributes its Software.
42
+ “Software” means the glm-4-voice model parameters made available under this license.
43
+ 2. License
44
+ Under the terms and conditions of this license, the Licensor hereby grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license.
45
+ This license allows you to use all open source models in this repository for free for academic research. For users who wish to use the models for commercial purposes, please do so [here](https://open.bigmodel.cn/mla/form)
46
+ Complete registration. Registered users are free to use this model for commercial activities, but must comply with all terms and conditions of this license.
47
+ The copyright notice and this license notice shall be included in all copies or substantial portions of the Software.
48
+ If you distribute or provide THUDM / Zhipu AI materials on the glm-4 open source model (or any derivative works thereof), or products or services that use any materials therein (including all open source models of the glm-4 series), you should:
49
+ (A) Provide a copy of this Agreement with any such THUDM/Zhipu AI Materials;
50
+ (B) Prominently display "Built with glm-4" on the relevant website, user interface, blog post, related page or product documentation.
51
+ If you use materials from THUDM/Zhipu AI's glm-4 model to create, train, operate, or otherwise improve assigned or available AI models, you should also add "glm-4" to the beginning of any such AI model name.
52
+ 3. Restrictions
53
+ You are not allowed to use, copy, modify, merge, publish, distribute, copy or create all or part of the derivative works of this software for any military or illegal purposes.
54
+ You are not allowed to use this software to engage in any behavior that endangers national security and unity, endangers social public interests and public order, infringes on the rights and interests of others such as trade secrets, intellectual property rights, reputation rights, portrait rights, and property rights.
55
+ You should comply with the applicable laws, regulations, policies, ethical standards, and other requirements in the place of use during use.
56
+ 4. Disclaimer
57
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
58
+ WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
59
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
60
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
61
+ 5. Limitation of Liability
62
+ EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT,
63
+ NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL,
64
+ INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED
65
+ OF THE POSSIBILITY OF SUCH DAMAGES.
66
+ 6. Dispute Resolution
67
+ This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute
68
+ arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.
69
+ Note that the license is subject to update to a more comprehensive version. For any questions related to the license and
70
+ copyright, please contact us at license@zhipuai.cn.
glm-4-voice-tokenizer/README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLM-4-Voice-Tokenizer
2
+
3
+ GLM-4-Voice 是智谱 AI 推出的端到端语音模型。GLM-4-Voice 能够直接理解和生成中英文语音,进行实时语音对话,并且能够根据用户的指令改变语音的情感、语调、语速、方言等属性。
4
+
5
+ GLM-4-Voice is an end-to-end voice model launched by Zhipu AI. GLM-4-Voice can directly understand and generate Chinese and English speech, engage in real-time voice conversations, and change attributes such as emotion, intonation, speech rate, and dialect based on user instructions.
6
+
7
+ 本仓库是 GLM-4-Voice 的 speech tokenizer 部分。通过在 [Whisper](https://github.com/openai/whisper) 的 encoder 部分增加 vector quantization 进行训练,将连续的语音输入转化为离散的 token。每秒音频转化为 12.5 个离散 token。
8
+
9
+ The repo provides the speech tokenizer of GLM-4-Voice, which is trained by adding vector quantization to the encoder part of [Whisper](https://github.com/openai/whisper) and converts continuous speech input into discrete tokens. Each second of audio is converted into 12.5 discrete tokens.
10
+
11
+ 更多信息请参考我们的仓库 [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
12
+
13
+ For more information please refer to our repo [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
glm-4-voice-tokenizer/config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "THUDM/glm-4-voice-tokenizer",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "apply_spec_augment": false,
6
+ "architectures": [
7
+ "WhisperVQEncoder"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "begin_suppress_tokens": [
11
+ 220,
12
+ 50257
13
+ ],
14
+ "bos_token_id": 50257,
15
+ "classifier_proj_size": 256,
16
+ "d_model": 1280,
17
+ "decoder_attention_heads": 20,
18
+ "decoder_ffn_dim": 5120,
19
+ "decoder_layerdrop": 0.0,
20
+ "decoder_layers": 32,
21
+ "decoder_start_token_id": 50258,
22
+ "dropout": 0.0,
23
+ "encoder_attention_heads": 20,
24
+ "encoder_causal_attention": false,
25
+ "encoder_causal_convolution": true,
26
+ "encoder_ffn_dim": 5120,
27
+ "encoder_layerdrop": 0.0,
28
+ "encoder_layers": 32,
29
+ "eos_token_id": 50257,
30
+ "init_std": 0.02,
31
+ "is_encoder_decoder": true,
32
+ "mask_feature_length": 10,
33
+ "mask_feature_min_masks": 0,
34
+ "mask_feature_prob": 0.0,
35
+ "mask_time_length": 10,
36
+ "mask_time_min_masks": 2,
37
+ "mask_time_prob": 0.05,
38
+ "max_length": 448,
39
+ "max_source_positions": 1500,
40
+ "max_target_positions": 448,
41
+ "median_filter_width": 7,
42
+ "model_type": "whisper",
43
+ "num_hidden_layers": 32,
44
+ "num_mel_bins": 128,
45
+ "pad_token_id": 50256,
46
+ "pooling_kernel_size": 4,
47
+ "pooling_position": 16,
48
+ "pooling_type": "avg",
49
+ "quantize_causal_block_size": 200,
50
+ "quantize_causal_encoder": false,
51
+ "quantize_commit_coefficient": 0.25,
52
+ "quantize_ema_decay": 0.99,
53
+ "quantize_encoder_only": true,
54
+ "quantize_loss_scale": 10.0,
55
+ "quantize_position": 16,
56
+ "quantize_restart_interval": 100,
57
+ "quantize_vocab_size": 16384,
58
+ "scale_embedding": false,
59
+ "skip_language_detection": true,
60
+ "torch_dtype": "float32",
61
+ "transformers_version": "4.44.1",
62
+ "use_cache": true,
63
+ "use_weighted_layer_sum": false,
64
+ "vocab_size": 51866
65
+ }
glm-4-voice-tokenizer/configuration.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"framework":"Pytorch","task":"auto-speech-recognition"}
glm-4-voice-tokenizer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2800bd503f52b51e45f0c53cfd5c31dcfe8ef7f13d22b396aa3d53e0280dd1e4
3
+ size 1458374480
glm-4-voice-tokenizer/preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "feature_extractor_type": "WhisperFeatureExtractor",
4
+ "feature_size": 128,
5
+ "hop_length": 160,
6
+ "n_fft": 400,
7
+ "n_samples": 480000,
8
+ "nb_max_frames": 3000,
9
+ "padding_side": "right",
10
+ "padding_value": 0.0,
11
+ "processor_class": "WhisperProcessor",
12
+ "return_attention_mask": false,
13
+ "sampling_rate": 16000
14
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b7b2c9ab7547c6ba9688d1fc3c03e587ee248a0a5a6fe441e9d687a12605a99
3
+ size 625285168
modeling_qwen3.py ADDED
@@ -0,0 +1,650 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/qwen3/modular_qwen3.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_qwen3.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # coding=utf-8
8
+ # Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+
22
+ from typing import Callable, Optional, Union
23
+
24
+ import torch
25
+ from torch import nn
26
+
27
+ from transformers.activations import ACT2FN
28
+ from transformers.cache_utils import Cache, DynamicCache
29
+ from transformers.generation import GenerationMixin
30
+ from transformers.integrations import use_kernel_forward_from_hub
31
+ from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
32
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
33
+ from transformers.modeling_layers import (
34
+ GenericForQuestionAnswering,
35
+ GenericForSequenceClassification,
36
+ GenericForTokenClassification,
37
+ GradientCheckpointingLayer,
38
+ )
39
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
40
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
41
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
42
+ from transformers.processing_utils import Unpack
43
+ from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple
44
+ from transformers.utils.deprecation import deprecate_kwarg
45
+ from transformers.utils.generic import check_model_inputs
46
+ from .configuration_qwen3 import Qwen3Config
47
+
48
+
49
+ @use_kernel_forward_from_hub("RMSNorm")
50
+ class Qwen3RMSNorm(nn.Module):
51
+ def __init__(self, hidden_size, eps: float = 1e-6) -> None:
52
+ """
53
+ Qwen3RMSNorm is equivalent to T5LayerNorm
54
+ """
55
+ super().__init__()
56
+ self.weight = nn.Parameter(torch.ones(hidden_size))
57
+ self.variance_epsilon = eps
58
+
59
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
60
+ input_dtype = hidden_states.dtype
61
+ hidden_states = hidden_states.to(torch.float32)
62
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
63
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
64
+ return self.weight * hidden_states.to(input_dtype)
65
+
66
+ def extra_repr(self):
67
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
68
+
69
+
70
+ class Qwen3MLP(nn.Module):
71
+ def __init__(self, config):
72
+ super().__init__()
73
+ self.config = config
74
+ self.hidden_size = config.hidden_size
75
+ self.intermediate_size = config.intermediate_size
76
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
77
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
78
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
79
+ self.act_fn = ACT2FN[config.hidden_act]
80
+
81
+ def forward(self, x):
82
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
83
+ return down_proj
84
+
85
+
86
+ def rotate_half(x):
87
+ """Rotates half the hidden dims of the input."""
88
+ x1 = x[..., : x.shape[-1] // 2]
89
+ x2 = x[..., x.shape[-1] // 2 :]
90
+ return torch.cat((-x2, x1), dim=-1)
91
+
92
+
93
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
94
+ """Applies Rotary Position Embedding to the query and key tensors.
95
+
96
+ Args:
97
+ q (`torch.Tensor`): The query tensor.
98
+ k (`torch.Tensor`): The key tensor.
99
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
100
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
101
+ position_ids (`torch.Tensor`, *optional*):
102
+ Deprecated and unused.
103
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
104
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
105
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
106
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
107
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
108
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
109
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
110
+ Returns:
111
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
112
+ """
113
+ cos = cos.unsqueeze(unsqueeze_dim)
114
+ sin = sin.unsqueeze(unsqueeze_dim)
115
+ q_embed = (q * cos) + (rotate_half(q) * sin)
116
+ k_embed = (k * cos) + (rotate_half(k) * sin)
117
+ return q_embed, k_embed
118
+
119
+
120
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
121
+ """
122
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
123
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
124
+ """
125
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
126
+ if n_rep == 1:
127
+ return hidden_states
128
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
129
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
130
+
131
+
132
+ def eager_attention_forward(
133
+ module: nn.Module,
134
+ query: torch.Tensor,
135
+ key: torch.Tensor,
136
+ value: torch.Tensor,
137
+ attention_mask: Optional[torch.Tensor],
138
+ scaling: float,
139
+ dropout: float = 0.0,
140
+ **kwargs: Unpack[TransformersKwargs],
141
+ ):
142
+ key_states = repeat_kv(key, module.num_key_value_groups)
143
+ value_states = repeat_kv(value, module.num_key_value_groups)
144
+
145
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
146
+ if attention_mask is not None:
147
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
148
+ attn_weights = attn_weights + causal_mask
149
+
150
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
151
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
152
+ attn_output = torch.matmul(attn_weights, value_states)
153
+ attn_output = attn_output.transpose(1, 2).contiguous()
154
+
155
+ return attn_output, attn_weights
156
+
157
+
158
+ class Qwen3Attention(nn.Module):
159
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
160
+
161
+ def __init__(self, config: Qwen3Config, layer_idx: int):
162
+ super().__init__()
163
+ self.config = config
164
+ self.layer_idx = layer_idx
165
+ self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
166
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
167
+ self.scaling = self.head_dim**-0.5
168
+ self.attention_dropout = config.attention_dropout
169
+ self.is_causal = True
170
+
171
+ self.q_proj = nn.Linear(
172
+ config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
173
+ )
174
+ self.k_proj = nn.Linear(
175
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
176
+ )
177
+ self.v_proj = nn.Linear(
178
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
179
+ )
180
+ self.o_proj = nn.Linear(
181
+ config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
182
+ )
183
+ self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps) # unlike olmo, only on the head dim!
184
+ self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps) # thus post q_norm does not need reshape
185
+ self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None
186
+
187
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
188
+ def forward(
189
+ self,
190
+ hidden_states: torch.Tensor,
191
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
192
+ attention_mask: Optional[torch.Tensor],
193
+ past_key_values: Optional[Cache] = None,
194
+ cache_position: Optional[torch.LongTensor] = None,
195
+ **kwargs: Unpack[FlashAttentionKwargs],
196
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
197
+ input_shape = hidden_states.shape[:-1]
198
+ hidden_shape = (*input_shape, -1, self.head_dim)
199
+
200
+ query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
201
+ key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
202
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
203
+
204
+ cos, sin = position_embeddings
205
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
206
+
207
+ if past_key_values is not None:
208
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
209
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
210
+ key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
211
+
212
+ attention_interface: Callable = eager_attention_forward
213
+ if self.config._attn_implementation != "eager":
214
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
215
+
216
+ attn_output, attn_weights = attention_interface(
217
+ self,
218
+ query_states,
219
+ key_states,
220
+ value_states,
221
+ attention_mask,
222
+ dropout=0.0 if not self.training else self.attention_dropout,
223
+ scaling=self.scaling,
224
+ sliding_window=self.sliding_window, # diff with Llama
225
+ **kwargs,
226
+ )
227
+
228
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
229
+ attn_output = self.o_proj(attn_output)
230
+ return attn_output, attn_weights
231
+
232
+
233
+ class Qwen3DecoderLayer(GradientCheckpointingLayer):
234
+ def __init__(self, config: Qwen3Config, layer_idx: int):
235
+ super().__init__()
236
+ self.hidden_size = config.hidden_size
237
+
238
+ self.self_attn = Qwen3Attention(config=config, layer_idx=layer_idx)
239
+
240
+ self.mlp = Qwen3MLP(config)
241
+ self.input_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
242
+ self.post_attention_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
243
+ self.attention_type = config.layer_types[layer_idx]
244
+
245
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
246
+ def forward(
247
+ self,
248
+ hidden_states: torch.Tensor,
249
+ attention_mask: Optional[torch.Tensor] = None,
250
+ position_ids: Optional[torch.LongTensor] = None,
251
+ past_key_values: Optional[Cache] = None,
252
+ use_cache: Optional[bool] = False,
253
+ cache_position: Optional[torch.LongTensor] = None,
254
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
255
+ **kwargs: Unpack[TransformersKwargs],
256
+ ) -> torch.Tensor:
257
+ residual = hidden_states
258
+ hidden_states = self.input_layernorm(hidden_states)
259
+ # Self Attention
260
+ hidden_states, _ = self.self_attn(
261
+ hidden_states=hidden_states,
262
+ attention_mask=attention_mask,
263
+ position_ids=position_ids,
264
+ past_key_values=past_key_values,
265
+ use_cache=use_cache,
266
+ cache_position=cache_position,
267
+ position_embeddings=position_embeddings,
268
+ **kwargs,
269
+ )
270
+ hidden_states = residual + hidden_states
271
+
272
+ # Fully Connected
273
+ residual = hidden_states
274
+ hidden_states = self.post_attention_layernorm(hidden_states)
275
+ hidden_states = self.mlp(hidden_states)
276
+ hidden_states = residual + hidden_states
277
+ return hidden_states
278
+
279
+
280
+ @auto_docstring
281
+ class Qwen3PreTrainedModel(PreTrainedModel):
282
+ config: Qwen3Config
283
+ base_model_prefix = "model"
284
+ supports_gradient_checkpointing = True
285
+ _no_split_modules = ["Qwen3DecoderLayer"]
286
+ _skip_keys_device_placement = ["past_key_values"]
287
+ _supports_flash_attn = True
288
+ _supports_sdpa = True
289
+ _supports_flex_attn = True
290
+
291
+ _can_compile_fullgraph = True
292
+ _supports_attention_backend = True
293
+ _can_record_outputs = {
294
+ "hidden_states": Qwen3DecoderLayer,
295
+ "attentions": Qwen3Attention,
296
+ }
297
+
298
+
299
+ class Qwen3RotaryEmbedding(nn.Module):
300
+ inv_freq: torch.Tensor # fix linting for `register_buffer`
301
+
302
+ def __init__(self, config: Qwen3Config, device=None):
303
+ super().__init__()
304
+ # BC: "rope_type" was originally "type"
305
+ if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
306
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
307
+ else:
308
+ self.rope_type = "default"
309
+ self.max_seq_len_cached = config.max_position_embeddings
310
+ self.original_max_seq_len = config.max_position_embeddings
311
+
312
+ self.config = config
313
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
314
+
315
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
316
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
317
+ self.original_inv_freq = self.inv_freq
318
+
319
+ @torch.no_grad()
320
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
321
+ def forward(self, x, position_ids):
322
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
323
+ position_ids_expanded = position_ids[:, None, :].float()
324
+
325
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
326
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
327
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
328
+ emb = torch.cat((freqs, freqs), dim=-1)
329
+ cos = emb.cos() * self.attention_scaling
330
+ sin = emb.sin() * self.attention_scaling
331
+
332
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
333
+
334
+
335
+ @auto_docstring
336
+ class Qwen3Model(Qwen3PreTrainedModel):
337
+ def __init__(self, config: Qwen3Config):
338
+ super().__init__(config)
339
+ self.padding_idx = config.pad_token_id
340
+ self.vocab_size = config.vocab_size
341
+ # print(f"load from locally")
342
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
343
+ self.layers = nn.ModuleList(
344
+ [Qwen3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
345
+ )
346
+ self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
347
+ self.rotary_emb = Qwen3RotaryEmbedding(config=config)
348
+ self.gradient_checkpointing = False
349
+ self.has_sliding_layers = "sliding_attention" in self.config.layer_types
350
+
351
+ # Initialize weights and apply final processing
352
+ self.post_init()
353
+
354
+ @check_model_inputs()
355
+ @auto_docstring
356
+ def forward(
357
+ self,
358
+ input_ids: Optional[torch.LongTensor] = None,
359
+ attention_mask: Optional[torch.Tensor] = None,
360
+ position_ids: Optional[torch.LongTensor] = None,
361
+ past_key_values: Optional[Cache] = None,
362
+ inputs_embeds: Optional[torch.FloatTensor] = None,
363
+ use_cache: Optional[bool] = None,
364
+ cache_position: Optional[torch.LongTensor] = None,
365
+ **kwargs: Unpack[TransformersKwargs],
366
+ ) -> BaseModelOutputWithPast:
367
+ if (input_ids is None) ^ (inputs_embeds is not None):
368
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
369
+
370
+ if inputs_embeds is None:
371
+ inputs_embeds = self.embed_tokens(input_ids)
372
+
373
+ if use_cache and past_key_values is None:
374
+ past_key_values = DynamicCache(config=self.config)
375
+
376
+ if cache_position is None:
377
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
378
+ cache_position = torch.arange(
379
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
380
+ )
381
+
382
+ if position_ids is None:
383
+ position_ids = cache_position.unsqueeze(0)
384
+
385
+ # It may already have been prepared by e.g. `generate`
386
+ if not isinstance(causal_mask_mapping := attention_mask, dict):
387
+ # Prepare mask arguments
388
+ mask_kwargs = {
389
+ "config": self.config,
390
+ "input_embeds": inputs_embeds,
391
+ "attention_mask": attention_mask,
392
+ "cache_position": cache_position,
393
+ "past_key_values": past_key_values,
394
+ "position_ids": position_ids,
395
+ }
396
+ # Create the masks
397
+ causal_mask_mapping = {
398
+ "full_attention": create_causal_mask(**mask_kwargs),
399
+ }
400
+ # The sliding window alternating layers are not always activated depending on the config
401
+ if self.has_sliding_layers:
402
+ causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
403
+
404
+ hidden_states = inputs_embeds
405
+
406
+ # create position embeddings to be shared across the decoder layers
407
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
408
+
409
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
410
+ hidden_states = decoder_layer(
411
+ hidden_states,
412
+ attention_mask=causal_mask_mapping[decoder_layer.attention_type],
413
+ position_ids=position_ids,
414
+ past_key_values=past_key_values,
415
+ use_cache=use_cache,
416
+ cache_position=cache_position,
417
+ position_embeddings=position_embeddings,
418
+ **kwargs,
419
+ )
420
+
421
+ hidden_states = self.norm(hidden_states)
422
+ return BaseModelOutputWithPast(
423
+ last_hidden_state=hidden_states,
424
+ past_key_values=past_key_values if use_cache else None,
425
+ )
426
+
427
+
428
+ # @auto_docstring
429
+ # class Qwen3ForCausalLM(Qwen3PreTrainedModel, GenerationMixin):
430
+ # _tied_weights_keys = ["lm_head.weight"]
431
+ # _tp_plan = {"lm_head": "colwise_rep"}
432
+ # _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
433
+
434
+ # def __init__(self, config):
435
+ # super().__init__(config)
436
+ # self.model = Qwen3Model(config)
437
+ # self.vocab_size = config.vocab_size
438
+ # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
439
+
440
+ # # Initialize weights and apply final processing
441
+ # self.post_init()
442
+
443
+ # @can_return_tuple
444
+ # @auto_docstring
445
+ # def forward(
446
+ # self,
447
+ # input_ids: Optional[torch.LongTensor] = None,
448
+ # attention_mask: Optional[torch.Tensor] = None,
449
+ # position_ids: Optional[torch.LongTensor] = None,
450
+ # past_key_values: Optional[Cache] = None,
451
+ # inputs_embeds: Optional[torch.FloatTensor] = None,
452
+ # labels: Optional[torch.LongTensor] = None,
453
+ # use_cache: Optional[bool] = None,
454
+ # cache_position: Optional[torch.LongTensor] = None,
455
+ # logits_to_keep: Union[int, torch.Tensor] = 0,
456
+ # **kwargs: Unpack[TransformersKwargs],
457
+ # ) -> CausalLMOutputWithPast:
458
+ # r"""
459
+ # labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
460
+ # Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
461
+ # config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
462
+ # (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
463
+
464
+ # Example:
465
+
466
+ # ```python
467
+ # >>> from transformers import AutoTokenizer, Qwen3ForCausalLM
468
+
469
+ # >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
470
+ # >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
471
+
472
+ # >>> prompt = "Hey, are you conscious? Can you talk to me?"
473
+ # >>> inputs = tokenizer(prompt, return_tensors="pt")
474
+
475
+ # >>> # Generate
476
+ # >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
477
+ # >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
478
+ # "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
479
+ # ```"""
480
+ # outputs: BaseModelOutputWithPast = self.model(
481
+ # input_ids=input_ids,
482
+ # attention_mask=attention_mask,
483
+ # position_ids=position_ids,
484
+ # past_key_values=past_key_values,
485
+ # inputs_embeds=inputs_embeds,
486
+ # use_cache=use_cache,
487
+ # cache_position=cache_position,
488
+ # **kwargs,
489
+ # )
490
+
491
+ # hidden_states = outputs.last_hidden_state
492
+ # # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
493
+ # slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
494
+ # logits = self.lm_head(hidden_states[:, slice_indices, :])
495
+
496
+ # loss = None
497
+ # if labels is not None:
498
+ # loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
499
+
500
+ # return CausalLMOutputWithPast(
501
+ # loss=loss,
502
+ # logits=logits,
503
+ # past_key_values=outputs.past_key_values,
504
+ # hidden_states=outputs.hidden_states,
505
+ # attentions=outputs.attentions,
506
+ # )
507
+
508
+
509
@auto_docstring
class Qwen3ForCausalLM(Qwen3PreTrainedModel, GenerationMixin):
    # Causal-LM head on top of `Qwen3Model`, extended with an eval-mode logit mask that
    # suppresses a fixed set of vocabulary ids for selected batch rows (see `forward`).
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    # --- Eval-mode logit masking configuration (checkpoint-specific token ids) ---
    # A row of `input_ids` containing this id anywhere has its logits masked.
    _mask_trigger_token_id = 151691
    # Inclusive id range: a single-token call whose id lies inside it is NOT masked
    # (unless the trigger rule above applies).
    _mask_exempt_id_range = (172207, 180398)
    # Individual token ids suppressed in masked rows.
    _masked_token_ids = (151691, 151692, 151693, 151695, 151696, 151697, 151698)
    # Half-open `[start, stop)` id range suppressed in masked rows.
    # NOTE(review): the original inline comment described this range as
    # [172206, 180398] while the code sliced 151727:180399. The code's bounds are
    # kept unchanged here -- confirm which lower bound is intended.
    _masked_token_id_span = (151727, 180399)

    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen3Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @classmethod
    def _rows_to_mask(cls, input_ids: torch.Tensor) -> torch.Tensor:
        """Return a `(batch_size,)` bool tensor flagging the rows whose logits must be masked.

        A row is flagged when it contains the trigger token anywhere, or when the call
        carries exactly one token per row and that token lies outside the exempt id range.
        `input_ids` is expected to be `(batch_size, seq_len)`.
        """
        flagged = (input_ids == cls._mask_trigger_token_id).any(dim=-1)
        if input_ids.shape[1] == 1:
            low, high = cls._mask_exempt_id_range
            step_ids = input_ids[:, 0]
            flagged = flagged | (step_ids < low) | (step_ids > high)
        return flagged

    @classmethod
    def _suppress_blocked_tokens(cls, logits: torch.Tensor, rows_to_mask: torch.Tensor) -> torch.Tensor:
        """Set the blocked vocabulary entries of every flagged row to the dtype minimum, in place.

        The row flags and the vocabulary mask are combined by broadcasting, so every
        blocked id is suppressed at every position of every flagged row regardless of
        how many rows are flagged. (Indexing `logits[rows, :, ids]` instead would try to
        broadcast the row indices against the id list and fail or mis-pair them.)
        """
        vocab_blocked = torch.zeros(logits.shape[-1], dtype=torch.bool, device=logits.device)
        vocab_blocked[torch.tensor(cls._masked_token_ids, device=logits.device)] = True
        start, stop = cls._masked_token_id_span
        vocab_blocked[start:stop] = True

        # `input_ids` may live on a different device than `logits`; align before combining.
        row_selector = rows_to_mask.to(logits.device).view(-1, 1, 1)
        return logits.masked_fill_(row_selector & vocab_blocked, torch.finfo(logits.dtype).min)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM

        >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        # Custom logit masking: runs only outside training mode and only when token ids
        # are available (it cannot be evaluated from `inputs_embeds` alone).
        # NOTE(review): this also runs when `labels` are passed in eval mode, so the loss
        # below is then computed on the masked logits -- confirm that is intended.
        if not self.training and input_ids is not None:
            rows_to_mask = self._rows_to_mask(input_ids)
            if rows_to_mask.any():
                logits = self._suppress_blocked_tokens(logits, rows_to_mask)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
630
+
631
# Sequence-classification variant of Qwen3. It defines nothing of its own: all
# behavior comes from combining the two base classes.
class Qwen3ForSequenceClassification(GenericForSequenceClassification, Qwen3PreTrainedModel):
    pass
633
+
634
+
635
# Token-classification variant of Qwen3. It defines nothing of its own: all
# behavior comes from combining the two base classes.
class Qwen3ForTokenClassification(GenericForTokenClassification, Qwen3PreTrainedModel):
    pass
637
+
638
+
639
# Question-answering variant of Qwen3. Apart from the prefix override below, all
# behavior comes from combining the two base classes.
class Qwen3ForQuestionAnswering(GenericForQuestionAnswering, Qwen3PreTrainedModel):
    base_model_prefix = "transformer"  # For BC, where `transformer` was used instead of `model`
641
+
642
+
643
# Public names exported by a star-import of this module.
__all__ = [
    "Qwen3ForCausalLM",
    "Qwen3ForQuestionAnswering",
    "Qwen3PreTrainedModel",
    "Qwen3Model",
    "Qwen3ForSequenceClassification",
    "Qwen3ForTokenClassification",
]
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01db113d2ddc9192eaed1145c5caced436dbac3e60ee7238b662caf426ecc9f3
3
+ size 17114145
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff