---
language: multilingual
datasets: ISML datasets (80,000 hours of unlabeled data) + Babel datasets (2,000 hours of unlabeled data)
---

# Chinese W2v-conformer

## Model description

This is a speech W2v-conformer model pre-trained with UER-py. You can download the model from the [UER-py GitHub page](https://github.com/dbiir/UER-py/).

## How to use

You can use the model for speech recognition. The snippet below builds the model with the [WeNet](https://github.com/wenet-e2e/wenet) toolkit and loads the pre-trained checkpoint:

```python
import yaml

from wenet.dataset.dataset import CollateFunc, AudioDataset  # feature pipeline for real audio
from wenet.transformer.asr_model import ASRModel
from wenet.transformer.encoder import ConformerEncoder
from wenet.transformer.decoder import TransformerDecoder
from wenet.transformer.ctc import CTC
from wenet.utils.executor import Executor  # only needed for fine-tuning loops
from wenet.utils.checkpoint import save_checkpoint, load_checkpoint

# Load the training configuration first: the encoder, decoder and model
# constructors below all read their hyper-parameters from it.
with open(args.config, 'r') as fin:
    configs = yaml.load(fin, Loader=yaml.FullLoader)

# input_dim is the acoustic feature dimension and vocab_size the size of
# the output token vocabulary; both come from your data preparation.
encoder = ConformerEncoder(input_dim, **configs['encoder_conf'])
decoder = TransformerDecoder(vocab_size, encoder.output_size(),
                             **configs['decoder_conf'])
ctc = CTC(vocab_size, encoder.output_size())

model = ASRModel(
    vocab_size=vocab_size,
    encoder=encoder,
    decoder=decoder,
    ctc=ctc,
    **configs['model_conf'],
)

# Load the pre-trained checkpoint into the freshly built model.
infos = load_checkpoint(model, args.checkpoint)
```
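The `encoder_conf`, `decoder_conf`, and `model_conf` dictionaries come from the YAML file passed as `args.config`. As a rough orientation, a hedged skeleton of that layout is shown below; the field names follow WeNet's usual train.yaml conventions, but the values are illustrative placeholders, not this model's published hyper-parameters.

```yaml
# Hedged skeleton of the expected config layout; the values below are
# placeholders, not the released model's hyper-parameters.
encoder_conf:
  output_size: 256        # also what encoder.output_size() returns
  attention_heads: 4
  linear_units: 2048
  num_blocks: 12
decoder_conf:
  attention_heads: 4
  linear_units: 2048
  num_blocks: 6
model_conf:
  ctc_weight: 0.3         # interpolation between CTC and attention losses
  lsm_weight: 0.1         # label-smoothing weight
```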
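Once the checkpoint is loaded, the model can run recognition. The sketch below is a minimal illustration under stated assumptions: the feature tensor is random placeholder input (a real pipeline would produce features via `AudioDataset`/`CollateFunc`), and `ctc_greedy_search` follows the `ASRModel` API of the 2021-era WeNet releases, which may differ in later versions.

```python
import torch

model.eval()

# Placeholder batch: one utterance of 200 frames with input_dim features.
# Real features would come from the AudioDataset/CollateFunc pipeline.
feats = torch.randn(1, 200, input_dim)
feats_lengths = torch.tensor([200], dtype=torch.long)

with torch.no_grad():
    # Greedy CTC decoding: returns one list of token ids per utterance.
    # Mapping the ids back to characters requires the unit dictionary
    # used during fine-tuning.
    hyps = model.ctc_greedy_search(feats, feats_lengths)
print(hyps)
```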

## Training data

The ISML datasets (80,000 hours of unlabeled data) and the Babel datasets (2,000 hours of unlabeled data) are used as training data.

## Training procedure

The model is pre-trained with the wav2vec 2.0 objective using [UER-py](https://github.com/dbiir/UER-py/) on [Tencent Cloud](https://cloud.tencent.com/). We pre-train for 70 epochs with a batch size of 128, and use the same hyper-parameters across the different model sizes.
The downstream models are fine-tuned with WeNet:

Stage 1:

```
python wenet/bin/train.py --gpu 0,1,2,3,4,5,6,7 \
    --config $train_config \
    --train_data train.data \
    --cv_data dev.data \
    ${checkpoint:+--checkpoint $checkpoint} \
    --model_dir $dir \
    --ddp.init_method $init_method \
    --ddp.world_size 7 \
    --ddp.dist_backend nccl \
    --num_workers 2
```
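The recipe above stops at stage 1. In a typical WeNet recipe the next steps are to average the best checkpoints and decode a test set; the hedged sketch below follows the 2021-era WeNet scripts, and `$dict` (the token dictionary from fine-tuning) and the output paths are hypothetical, not defined in this README:

```
# Average the 5 best checkpoints by validation loss.
python wenet/bin/average_model.py \
    --dst_model $dir/avg_5.pt \
    --src_path $dir \
    --num 5 \
    --val_best

# Decode with attention rescoring ($dict is a hypothetical variable).
python wenet/bin/recognize.py --gpu 0 \
    --mode attention_rescoring \
    --config $train_config \
    --test_data test.data \
    --checkpoint $dir/avg_5.pt \
    --beam_size 10 \
    --batch_size 1 \
    --dict $dict \
    --result_file $dir/text
```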

### BibTeX entry and citation info

```
@article{baevski2020wav2vec,
  title={wav2vec 2.0: A framework for self-supervised learning of speech representations},
  author={Baevski, Alexei and Zhou, Henry and Mohamed, Abdelrahman and Auli, Michael},
  journal={arXiv preprint arXiv:2006.11477},
  year={2020}
}

@article{zhang2020pushing,
  title={Pushing the limits of semi-supervised learning for automatic speech recognition},
  author={Zhang, Yu and Qin, James and Park, Daniel S and Han, Wei and Chiu, Chung-Cheng and Pang, Ruoming and Le, Quoc V and Wu, Yonghui},
  journal={arXiv preprint arXiv:2010.10504},
  year={2020}
}

@article{zhang2021wenet,
  title={WeNet: Production First and Production Ready End-to-End Speech Recognition Toolkit},
  author={Zhang, Binbin and Wu, Di and Yang, Chao and Chen, Xiaoyu and Peng, Zhendong and Wang, Xiangming and Yao, Zhuoyuan and Wang, Xiong and Yu, Fan and Xie, Lei and others},
  journal={arXiv preprint arXiv:2102.01547},
  year={2021}
}
```