xiongwang committed on
Commit a9d7ce0 · verified · 1 Parent(s): 7e3f6ed

Upload 9 files
README.md CHANGED
@@ -1,3 +1,32 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ ---
+
+
+ ## Quick Start
+ Please refer to this [Repo](https://github.com/Freeze-Omni-MLLM/Freeze-Omni).
+
+ ## ACCEPTABLE USE POLICY
+
+ Any license on the model is subject to your compliance with the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of the Acceptable Use Policy. Tencent reserves the right to update this Acceptable Use Policy from time to time.
+
+ Tencent endeavors to promote safe and fair use of its tools and features, including Freeze-Omni. You agree not to use Freeze-Omni or any of its derivatives:
+ 1. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
+ 2. To harm Yourself or others;
+ 3. To repurpose or distribute output from Freeze-Omni or any of its derivatives to harm Yourself or others;
+ 4. To override or circumvent the safety guardrails and safeguards We have put in place;
+ 5. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
+ 6. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
+ 7. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
+ 8. To intentionally defame, disparage or otherwise harass others;
+ 9. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
+ 10. To generate or disseminate personally identifiable information with the purpose of harming others;
+ 11. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including through the use of bot-generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
+ 12. To impersonate another individual without consent, authorization, or legal right;
+ 13. To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
+ 14. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
+ 15. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
+ 16. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
+ 17. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
+ 18. For military purposes;
+ 19. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.
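
Since this "Upload 9 files" commit ships the weights as Git LFS objects, a plain clone without LFS yields only pointer stubs. A minimal download sketch, assuming the `huggingface_hub` client; the repo id below is a placeholder, substitute the id shown on this model page:

```python
from huggingface_hub import snapshot_download

# "<org>/Freeze-Omni" is a placeholder -- use this model page's actual repo id.
local_dir = snapshot_download(repo_id="<org>/Freeze-Omni")
print(local_dir)  # should contain checkpoints/{audiollm,codec,decoder}/ and server.json
```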
checkpoints/audiollm/final.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6d90043f43772c1c9593e818d42cf1b7a151eb6407492f6bcf940d0f0105054
+ size 134
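
The `final.pt` entries in this diff are Git LFS pointer files, not the tensors themselves: each records the spec URL, a sha256 oid, and the byte size of the tracked blob. A small sketch (my own helper, not repo code) that parses such a pointer, e.g. to verify a downloaded blob against its oid:

```python
import hashlib

def read_lfs_pointer(path):
    # Git LFS pointers are "key value" lines: version, oid sha256:<hex>, size <bytes>.
    with open(path) as f:
        fields = dict(line.split(" ", 1) for line in f.read().splitlines() if line)
    return fields["oid"].removeprefix("sha256:"), int(fields["size"])

def verify(blob_path, pointer_path):
    # Compare a downloaded blob's length and sha256 digest to the pointer.
    oid, size = read_lfs_pointer(pointer_path)
    with open(blob_path, "rb") as f:
        data = f.read()
    return len(data) == size and hashlib.sha256(data).hexdigest() == oid
```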
checkpoints/audiollm/global_cmvn ADDED
@@ -0,0 +1 @@
+ {"mean_stat": [42290933760.0, 44562989056.0, 47934054400.0, 50294329344.0, 52492623872.0, 54071517184.0, 55109054464.0, 55449354240.0, 55966240768.0, 56030208000.0, 56560594944.0, 56809668608.0, 57317167104.0, 57670230016.0, 57685004288.0, 57780109312.0, 57635872768.0, 57397317632.0, 57772961792.0, 57156792320.0, 56690987008.0, 57452654592.0, 56838529024.0, 57277726720.0, 56979447808.0, 57289629696.0, 56877248512.0, 57206501376.0, 56933548032.0, 56858759168.0, 56982441984.0, 56856772608.0, 56879472640.0, 57088770048.0, 57116323840.0, 57327616000.0, 57559212032.0, 57580515328.0, 57627168768.0, 57469554688.0, 57644437504.0, 57513947136.0, 57888034816.0, 57862254592.0, 58113445888.0, 58480062464.0, 58896785408.0, 59059277824.0, 59257237504.0, 59599949824.0, 59462504448.0, 59389997056.0, 59443486720.0, 59678318592.0, 59872899072.0, 60087185408.0, 60333211648.0, 60344242176.0, 60080939008.0, 59774820352.0, 59574546432.0, 59173933056.0, 58974326784.0, 58689245184.0, 58367008768.0, 57964703744.0, 57601159168.0, 57429585920.0, 57327013888.0, 57154965504.0, 57004187648.0, 56897101824.0, 56751833088.0, 56619937792.0, 56493240320.0, 56417366016.0, 56087306240.0, 55201955840.0, 53744488448.0, 51637952512.0], "var_stat": [532830093312.0, 594386157568.0, 684045107200.0, 747270963200.0, 810570416128.0, 858795933696.0, 892044771328.0, 904934653952.0, 920622006272.0, 921166610432.0, 936514224128.0, 944255991808.0, 960265256960.0, 971787075584.0, 972522848256.0, 975147433984.0, 970122919936.0, 961723301888.0, 973371146240.0, 953644548096.0, 939215552512.0, 962715451392.0, 943020900352.0, 956426420224.0, 946717458432.0, 956588097536.0, 943218753536.0, 953962921984.0, 945552752640.0, 942825799680.0, 946685935616.0, 942276673536.0, 942942650368.0, 949440217088.0, 950289104896.0, 957144694784.0, 964650074112.0, 965387616256.0, 966771015680.0, 961970044928.0, 967419625472.0, 963353182208.0, 975578660864.0, 974970945536.0, 983098916864.0, 995290775552.0, 1009610391552.0, 1015431233536.0, 1022162108416.0, 1033565700096.0, 1029262868480.0, 1027019636736.0, 1028913233920.0, 1037071613952.0, 1044418330624.0, 1052328067072.0, 1060522426368.0, 1060893556736.0, 1052188213248.0, 1041587306496.0, 1035020271616.0, 1021935616000.0, 1015461838848.0, 1006602878976.0, 996370087936.0, 983805853696.0, 972873465856.0, 967858782208.0, 965317427200.0, 960507281408.0, 956465217536.0, 953962856448.0, 950124937216.0, 946753961984.0, 942988001280.0, 939914231808.0, 929753333760.0, 902898647040.0, 858106888192.0, 795279425536.0], "frame_num": 3572697669}
checkpoints/audiollm/train.yaml ADDED
@@ -0,0 +1,125 @@
+ accum_grad: 1
+ cmvn_file:
+ dataset_conf:
+   batch_conf:
+     batch_size: 4
+     batch_type: chat
+   fbank_conf:
+     chunk_size: 4
+     dither: 0.0
+     frame_length: 25
+     frame_shift: 10
+     num_mel_bins: 80
+     pad_rate: 0.1
+   filter_conf:
+     max_length: 2000
+     max_output_input_ratio: 102400
+     max_total_num: 1000
+     min_length: 10
+     min_output_input_ratio: 0
+     token_max_length: 102400
+     token_min_length: 0
+   parse_multi_rounds: true
+   resample_conf:
+     resample_rate: 16000
+   shuf_list: true
+   shuffle: false
+   sort: false
+   spec_aug: true
+   spec_aug_conf:
+     max_f: 10
+     max_t: 20
+     num_f_mask: 1
+     num_t_mask: 1
+   spec_sub: false
+   spec_sub_conf:
+     max_t: 30
+     num_t_sub: 3
+   speed_perturb: false
+   split_num: 1
+   tokenize_char: false
+   tokenize_conf:
+     eod_id: 151645
+     tokenize_type: Qwen
+     tokenizer_path:
+ ds_dtype: bf16
+ encoder_conf:
+   overview_conf:
+     encoder-input-dim: 80
+     encoder-layer-config: subsampling-transformer
+     encoder-output-dim: 1024
+   para_conf:
+     subsampling:
+       subsampling-dropout-rate: 0.1
+       subsampling-input-dim: 80
+       subsampling-output-dim: 1024
+       subsampling-rate: 4
+     transformer:
+       transformer-attention-dim: 1024
+       transformer-attention-dropout-rate: 0.0
+       transformer-attention-heads: 16
+       transformer-chunk_size: 4
+       transformer-concat-after: false
+       transformer-dropout-rate: 0.1
+       transformer-dynamic-chunks: false
+       transformer-input-dim: 1024
+       transformer-input-layer: linear
+       transformer-left_chunks: 16
+       transformer-linear-units: 4096
+       transformer-normalize-before: true
+       transformer-num-blocks: 24
+       transformer-output-dim: 1024
+       transformer-pos-enc-class: rel-enc
+       transformer-positional-dropout-rate: 0.1
+       transformer-positionwise-layer-type: linear
+ grad_clip: 5.0
+ input_dim: 80
+ is_json_cmvn: true
+ lang_dict:
+ lfmmi_dir: ''
+ log_interval: 100
+ max_epoch: 100
+ model_conf:
+   activation_func: gelu
+   add_audio_bos_eos: true
+   add_prompt_before: true
+   adpter_type: subsampling
+   chat_template: '<|im_start|>system
+
+     You are a helpful assistant.<|im_end|>
+
+     <|im_start|>user
+
+     <audio><|im_end|>
+
+     <|im_start|>assistant
+
+     '
+   chunk_size: 2
+   enc_out_dim: 1024
+   freeze_adpter: true
+   freeze_encoder: true
+   freeze_llm: true
+   kernel_size: 5
+   llm_embed_dim: 3584
+   llm_head_num: 28
+   llm_path:
+   norm: layer
+   num_key_value_heads: 4
+   predict_usr_state: 4
+   prompt_finetune: true
+   prompt_num: 25
+   task_num: 20
+   task_type: prompt_finetune
+ optim: adamw
+ optim_conf:
+   betas:
+   - 0.9
+   - 0.99
+   eps: 1.0e-06
+   lr: 0.0006
+   weight_decay: 0.01
+ output_dim: 5538
+ scheduler: warmuplr
+ scheduler_conf:
+   warmup_steps: 200
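
A quick way to sanity-check this config is to load it and derive the encoder's streaming granularity. A minimal sketch, assuming PyYAML and the usual WeNet-style reading of the fbank/subsampling fields:

```python
import yaml

with open("checkpoints/audiollm/train.yaml") as f:
    cfg = yaml.safe_load(f)

fbank = cfg["dataset_conf"]["fbank_conf"]
sub = cfg["encoder_conf"]["para_conf"]["subsampling"]

# 10 ms fbank shift * 4x subsampling = 40 ms per encoder frame;
# a chunk_size of 4 encoder frames then spans 160 ms of audio.
ms_per_frame = fbank["frame_shift"] * sub["subsampling-rate"]
print(ms_per_frame, "ms per encoder frame,",
      ms_per_frame * fbank["chunk_size"], "ms per streaming chunk")
```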
checkpoints/codec/final.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b3835f23940dee376066b65bac0fc22004605f0c7f9900da7ba3381af4c03913
+ size 134
checkpoints/codec/model.json ADDED
@@ -0,0 +1,40 @@
+ {
+     "resblock": "1",
+     "num_gpus": 8,
+     "batch_size": 160,
+     "learning_rate": 0.0002,
+     "adam_b1": 0.5,
+     "adam_b2": 0.9,
+     "lr_decay": 0.98,
+     "seed": 1234,
+
+     "upsample_rates": [8,5,5,3],
+     "upsample_kernel_sizes": [16,11,11,5],
+     "upsample_initial_channel": 512,
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+
+     "segment_size": 24000,
+     "num_mels": 80,
+     "num_freq": 1025,
+     "n_fft": 1024,
+     "hop_size": 240,
+     "win_size": 1024,
+
+     "sampling_rate": 24000,
+
+     "n_code_groups": 1,
+     "residul_layer": 1,
+     "n_codes": 1024,
+     "codebook_loss_lambda": 1.0,
+     "commitment_loss_lambda": 0.25,
+     "global_code_num": 8,
+     "global_feature_conv": [128, 64, 128, 3, 1],
+     "global_tokens": [473,975,419,219,565,121,550,616],
+
+     "fmin": 0,
+     "fmax": 8000,
+     "fmax_for_loss": null,
+
+     "num_workers": 12
+ }
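
The upsampling chain fixes the codec's token rate, which is worth a quick arithmetic check. A sketch, assuming the standard HiFi-GAN-style reading of these fields:

```python
import json
from math import prod

with open("checkpoints/codec/model.json") as f:
    h = json.load(f)

# prod([8, 5, 5, 3]) = 600 output samples per code frame, so at a
# 24 kHz sampling rate the codec runs at 24000 / 600 = 40 frames/s.
samples_per_frame = prod(h["upsample_rates"])
print(h["sampling_rate"] / samples_per_frame, "codec frames per second")
```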
checkpoints/decoder/final.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a4f98e64988171c3cd85386a0f63b43a710e7a40eb154cf3b21d365b79e4ae69
+ size 135
checkpoints/decoder/model.json ADDED
@@ -0,0 +1,72 @@
+ [
+     896,
+     1024,
+     {
+         "accum_grad": 3,
+         "char_list": [],
+         "debugmode": 0,
+         "encoder_criterion": "ce",
+         "encoder_drop_rate": 0.1,
+         "encoder_input_dim": 896,
+         "encoder_layer_config": "transformer",
+         "encoder_output_dim": 896,
+         "encoder_pre_norm_type": "ln",
+         "encoder_upsample_rate": 9,
+         "kv_cache_prefix_finetune": 1,
+         "epochs": 100,
+         "eps": 1e-08,
+         "eps_decay": 0.8,
+         "gpu_id": null,
+         "gpu_num": 1,
+         "grad_clip": 5,
+         "grad_noise": false,
+         "idim": 896,
+         "init_lr": 0.0005,
+         "lsm_weight": 0.0,
+         "max_batch_size": 25,
+         "max_duration": 256,
+         "max_mem": 20000,
+         "mtlalpha": 0.5,
+         "n_iter_processes": 8,
+         "noam_warmup_steps": 4000,
+         "odim": 1024,
+         "opt": "noamw",
+         "rank": 0,
+         "report_interval_iters": 100,
+         "resume_trainer": false,
+         "save_interval_iters": 2000,
+         "seed": 19832,
+         "sort_duration": true,
+         "start_decay_epoch": 5,
+         "stop_learning_rate": 1e-05,
+         "sycn_batchnorm": false,
+         "tensorboard_dir": null,
+         "train_dtype": "bfloat16",
+         "transformer_attention_dim": 896,
+         "transformer_attention_dropout_rate": 0.1,
+         "transformer_attention_heads": 14,
+         "transformer_chunk_size": [
+             1
+         ],
+         "transformer_concat_after": false,
+         "transformer_dropout_rate": 0.1,
+         "transformer_dynamic_chunks": false,
+         "transformer_input_dim": 896,
+         "transformer_input_layer": "linear",
+         "transformer_left_chunks": [
+             -1
+         ],
+         "transformer_linear_units": 4864,
+         "transformer_normalize_before": true,
+         "transformer_num_blocks": 4,
+         "transformer_output_dim": 896,
+         "transformer_pos_enc_class": "rel-enc",
+         "transformer_positional_dropout_rate": 0.1,
+         "transformer_positionwise_conv_kernel_size": 1,
+         "transformer_positionwise_layer_type": "linear",
+         "use_zero_redun_opt": false,
+         "verbose": 0,
+         "weight_decay": 0.05,
+         "world_size": 1
+     }
+ ]
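
Unlike the codec config, the decoder config is stored as a 3-element JSON list, not a plain object. A minimal loading sketch; the variable names are my own:

```python
import json
from types import SimpleNamespace

with open("checkpoints/decoder/model.json") as f:
    idim, odim, args = json.load(f)  # the file is a list: [896, 1024, {...}]

hp = SimpleNamespace(**args)  # attribute-style access to the hyperparameter dict
print(idim, odim, hp.transformer_num_blocks, hp.transformer_attention_heads)
```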
checkpoints/server.json ADDED
@@ -0,0 +1,11 @@
+ {
+     "decoder_first_chunk_size": 20,
+     "decoder_chunk_size": 40,
+     "decoder_chunk_overlap_size": 10,
+     "decoder_top_k": 2,
+     "decoder_penalty_window_size": -1,
+     "decoder_penalty": 1.1,
+     "decoder_N": 2401,
+     "decoder_seg_threshold_first_pack": 0.1,
+     "decoder_seg_threshold": 0.015
+ }
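
These `decoder_*` fields read like a streaming schedule: a short first chunk of 20 steps for low first-packet latency, then 40-step chunks that each overlap the previous one by 10 steps. A sketch of the boundaries that reading implies (my interpretation of the fields, not code from the repo):

```python
import json

with open("checkpoints/server.json") as f:
    s = json.load(f)

first = s["decoder_first_chunk_size"]      # 20
size = s["decoder_chunk_size"]             # 40
overlap = s["decoder_chunk_overlap_size"]  # 10

total = 200  # hypothetical utterance length in decoder steps
start, end = 0, min(first, total)
while True:
    print((start, end))  # e.g. (0, 20), (10, 50), (40, 80), ...
    if end >= total:
        break
    start = end - overlap        # re-decode `overlap` steps for smooth joins
    end = min(start + size, total)
```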