yky-h committed on
Commit
53a48d2
1 Parent(s): 556bd3e
Files changed (6) hide show
  1. README.md +93 -0
  2. config.json +106 -0
  3. fairseq/model.pt +3 -0
  4. preprocessor_config.json +9 -0
  5. pytorch_model.bin +3 -0
  6. rinna.png +0 -0
README.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ thumbnail: https://github.com/rinnakk/japanese-pretrained-models/blob/master/rinna.png
3
+ language: ja
4
+ license: apache-2.0
5
+ datasets: reazon-research/reazonspeech
6
+ pipeline_tag: feature-extraction
7
+ inference: false
8
+ tags:
9
+ - wav2vec2
10
+ - speech
11
+ ---
12
+
13
+ # `rinna/japanese-wav2vec2-base`
14
+
15
+ ![rinna-icon](./rinna.png)
16
+
17
+ # Overview
18
+
19
+ This is a Japanese wav2vec 2.0 Base model trained by [rinna Co., Ltd.](https://rinna.co.jp/)
20
+
21
+ * **Model summary**
22
+
23
+ The model architecture is the same as the [original wav2vec 2.0 Base model](https://huggingface.co/facebook/wav2vec2-base), which contains 12 transformer layers with 12 attention heads.
24
+ The model was trained using code from the [official repository](https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec), and the detailed training configuration can be found in the same repository and the [original paper](https://proceedings.neurips.cc/paper/2020/hash/92d1e1eb1cd6f9fba3227870bb6d7f07-Abstract.html).
25
+
26
+
27
+ * **Training**
28
+
29
+ The model was trained on approximately 19,000 hours of the following Japanese speech corpus, ReazonSpeech v1.
30
+ - [ReazonSpeech](https://huggingface.co/datasets/reazon-research/reazonspeech)
31
+
32
+ * **Contributors**
33
+
34
+ - [Yukiya Hono](https://huggingface.co/yky-h)
35
+ - [Kentaro Mitsui](https://huggingface.co/Kentaro321)
36
+ - [Kei Sawada](https://huggingface.co/keisawada)
37
+
38
+ ---
39
+
40
+ # How to use the model
41
+
42
+ ```python
43
+ import soundfile as sf
44
+ from transformers import AutoFeatureExtractor, AutoModel
45
+
46
+ model_name = "rinna/japanese-wav2vec2-base"
47
+ feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
48
+ model = AutoModel.from_pretrained(model_name)
49
+ model.eval()
50
+
51
+ raw_speech_16kHz, sr = sf.read(audio_file)
52
+ inputs = feature_extractor(
53
+ raw_speech_16kHz,
54
+ return_tensors="pt",
55
+ sampling_rate=sr,
56
+ )
57
+ outputs = model(**inputs)
58
+
59
+ print(f"Input: {inputs.input_values.size()}") # [1, #samples]
60
+ print(f"Output: {outputs.last_hidden_state.size()}") # [1, #frames, 768]
61
+ ```
62
+
63
+ A fairseq checkpoint file is also available [here](https://huggingface.co/rinna/japanese-wav2vec2-base/tree/main/fairseq).
64
+
65
+ ---
66
+
67
+ # How to cite
68
+ ```bibtex
69
+ @misc{rinna-japanese-wav2vec2-base,
70
+ title={rinna/japanese-wav2vec2-base},
71
+ author={Hono, Yukiya and Mitsui, Kentaro and Sawada, Kei},
72
+ url={https://huggingface.co/rinna/japanese-wav2vec2-base}
73
+ }
74
+ ```
75
+
76
+ ---
77
+
78
+ # Citations
79
+ ```bibtex
80
+ @inproceedings{baevski2020wav2vec,
81
+ title={wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
82
+ author={Baevski, Alexei and Zhou, Yuhao and Mohamed, Abdelrahman and Auli, Michael},
83
+ booktitle={Advances in Neural Information Processing Systems},
84
+ volume={33},
85
+ pages={12449--12460},
86
+ year={2020},
87
+ url={https://proceedings.neurips.cc/paper/2020/hash/92d1e1eb1cd6f9fba3227870bb6d7f07-Abstract.html}
88
+ }
89
+ ```
90
+ ---
91
+
92
+ # License
93
+ [The Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0)
config.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "rinna/japanese-wav2vec2-base",
3
+ "activation_dropout": 0.1,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForPreTraining"
10
+ ],
11
+ "attention_dropout": 0.1,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 256,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": false,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "sum",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": false,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_norm": "group",
51
+ "feat_proj_dropout": 0.0,
52
+ "feat_quantizer_dropout": 0.0,
53
+ "final_dropout": 0.1,
54
+ "hidden_act": "gelu",
55
+ "hidden_dropout": 0.1,
56
+ "hidden_size": 768,
57
+ "initializer_range": 0.02,
58
+ "intermediate_size": 3072,
59
+ "layer_norm_eps": 1e-05,
60
+ "layerdrop": 0.1,
61
+ "mask_feature_length": 10,
62
+ "mask_feature_min_masks": 0,
63
+ "mask_feature_prob": 0.0,
64
+ "mask_time_length": 10,
65
+ "mask_time_min_masks": 2,
66
+ "mask_time_prob": 0.05,
67
+ "model_type": "wav2vec2",
68
+ "num_adapter_layers": 3,
69
+ "num_attention_heads": 12,
70
+ "num_codevector_groups": 2,
71
+ "num_codevectors_per_group": 320,
72
+ "num_conv_pos_embedding_groups": 16,
73
+ "num_conv_pos_embeddings": 128,
74
+ "num_feat_extract_layers": 7,
75
+ "num_hidden_layers": 12,
76
+ "num_negatives": 100,
77
+ "output_hidden_size": 768,
78
+ "pad_token_id": 0,
79
+ "proj_codevector_dim": 256,
80
+ "tdnn_dilation": [
81
+ 1,
82
+ 2,
83
+ 3,
84
+ 1,
85
+ 1
86
+ ],
87
+ "tdnn_dim": [
88
+ 512,
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 1500
93
+ ],
94
+ "tdnn_kernel": [
95
+ 5,
96
+ 3,
97
+ 3,
98
+ 1,
99
+ 1
100
+ ],
101
+ "torch_dtype": "float32",
102
+ "transformers_version": "4.28.1",
103
+ "use_weighted_layer_sum": false,
104
+ "vocab_size": 32,
105
+ "xvector_output_dim": 512
106
+ }
fairseq/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acea47a3380a25d90ead7a4706849b08779937d1113fd517d92c34255d308f54
3
+ size 380266381
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": false,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f256c3ad63fafda4ccd06d9ce1830096304547d23fa0f4833592907293018c1
3
+ size 380250485
rinna.png ADDED