Feature Extraction · Transformers · PyTorch · Safetensors · Japanese · hubert · speech
yky-h committed
Commit 29f8087
1 Parent(s): b30124e
Files changed (6)
  1. README.md +92 -0
  2. config.json +83 -0
  3. fairseq/model.pt +3 -0
  4. preprocessor_config.json +9 -0
  5. pytorch_model.bin +3 -0
  6. rinna.png +0 -0
README.md ADDED
@@ -0,0 +1,92 @@
+ ---
+ thumbnail: https://github.com/rinnakk/japanese-pretrained-models/blob/master/rinna.png
+ language: ja
+ license: apache-2.0
+ datasets: reazon-research/reazonspeech
+ inference: false
+ tags:
+ - hubert
+ - speech
+ ---
+
+ # `rinna/japanese-hubert-large`
+
+ ![rinna-icon](./rinna.png)
+
+ # Overview
+
+ This is a Japanese HuBERT Large model trained by [rinna Co., Ltd.](https://rinna.co.jp/)
+
+ * **Model summary**
+
+     The model architecture is the same as the [original HuBERT Large model](https://huggingface.co/facebook/hubert-large-ll60k), which contains 24 transformer layers with 16 attention heads.
+     The model was trained using code from the [official repository](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert), and the detailed training configuration can be found in the same repository and the [original paper](https://ieeexplore.ieee.org/document/9585401).
+
+ * **Training**
+
+     The model was trained on approximately 19,000 hours of the following Japanese speech corpus, ReazonSpeech v1:
+     - [ReazonSpeech](https://huggingface.co/datasets/reazon-research/reazonspeech)
+
+ * **Contributors**
+
+     - [Yukiya Hono](https://huggingface.co/yky-h)
+     - [Kentaro Mitsui](https://huggingface.co/Kentaro321)
+     - [Kei Sawada](https://huggingface.co/keisawada)
+
+ ---
+
+ # How to use the model
+
+ ```python
+ import soundfile as sf
+ import torch
+ from transformers import AutoFeatureExtractor, AutoModel
+
+ model_name = "rinna/japanese-hubert-large"
+ feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
+ model = AutoModel.from_pretrained(model_name)
+ model.eval()
+
+ audio_file = "sample.wav"  # placeholder: any 16 kHz mono audio file
+ raw_speech_16kHz, sr = sf.read(audio_file)
+ inputs = feature_extractor(
+     raw_speech_16kHz,
+     return_tensors="pt",
+     sampling_rate=sr,  # the feature extractor expects 16000 Hz
+ )
+ with torch.no_grad():
+     outputs = model(**inputs)
+
+ print(f"Input:  {inputs.input_values.size()}")  # [1, #samples]
+ print(f"Output: {outputs.last_hidden_state.size()}")  # [1, #frames, 1024]
+ ```
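+
+ For downstream tasks, features from an intermediate transformer layer often work better than the final layer. Below is a minimal sketch using the generic `output_hidden_states` flag of `transformers`, continuing from the snippet above; the layer index is illustrative, not a recommendation from the model authors.
+
+ ```python
+ with torch.no_grad():
+     outputs = model(**inputs, output_hidden_states=True)
+
+ # hidden_states holds 25 tensors: the CNN encoder output plus all 24 transformer layers.
+ features = outputs.hidden_states[12]  # a middle layer; the best choice is task-dependent
+ print(features.size())  # [1, #frames, 1024]
+ ```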
+
+ A fairseq checkpoint file is also available [here](https://huggingface.co/rinna/japanese-hubert-large/tree/main/fairseq).
+
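+ The sketch below shows one way to load that checkpoint with fairseq's standard checkpoint utilities; it assumes `fairseq` and `huggingface_hub` are installed, and the dummy waveform is for illustration only.
+
+ ```python
+ import torch
+ from fairseq import checkpoint_utils
+ from huggingface_hub import hf_hub_download
+
+ # Fetch fairseq/model.pt from this repository.
+ ckpt_path = hf_hub_download("rinna/japanese-hubert-large", "fairseq/model.pt")
+
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
+ hubert = models[0].eval()
+
+ # extract_features takes a [batch, samples] float tensor of 16 kHz audio.
+ wav = torch.randn(1, 16000)  # dummy one-second waveform
+ with torch.no_grad():
+     features, _ = hubert.extract_features(source=wav, padding_mask=None)
+ print(features.size())  # [1, #frames, 1024]
+ ```
+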
+ ---
+
+ # How to cite
+ ```bibtex
+ @misc{rinna-japanese-hubert-large,
+     title={rinna/japanese-hubert-large},
+     author={Hono, Yukiya and Mitsui, Kentaro and Sawada, Kei},
+     url={https://huggingface.co/rinna/japanese-hubert-large}
+ }
+ ```
+
+ ---
+
+ # Citations
+ ```bibtex
+ @article{hsu2021hubert,
+     author={Hsu, Wei-Ning and Bolte, Benjamin and Tsai, Yao-Hung Hubert and Lakhotia, Kushal and Salakhutdinov, Ruslan and Mohamed, Abdelrahman},
+     journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
+     title={HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units},
+     year={2021},
+     volume={29},
+     pages={3451-3460},
+     doi={10.1109/TASLP.2021.3122291}
+ }
+ ```
+
+ ---
+
+ # License
+ [The Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0)
config.json ADDED
@@ -0,0 +1,83 @@
+ {
+   "_name_or_path": "rinna/japanese-hubert-large",
+   "activation_dropout": 0.0,
+   "apply_spec_augment": true,
+   "architectures": [
+     "HubertModel"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "sum",
+   "ctc_zero_infinity": false,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.1,
+   "feat_proj_layer_norm": true,
+   "final_dropout": 0.0,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_channel_length": 10,
+   "mask_channel_min_space": 1,
+   "mask_channel_other": 0.0,
+   "mask_channel_prob": 0.0,
+   "mask_channel_selection": "static",
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_min_space": 1,
+   "mask_time_other": 0.0,
+   "mask_time_prob": 0.075,
+   "mask_time_selection": "static",
+   "model_type": "hubert",
+   "num_attention_heads": 16,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "pad_token_id": 0,
+   "tokenizer_class": "Wav2Vec2CTCTokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.28.1",
+   "use_weighted_layer_sum": false,
+   "vocab_size": 32
+ }
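
Of note, the `conv_stride` values above imply the convolutional feature encoder downsamples the waveform by a factor of 5 × 2 × 2 × 2 × 2 × 2 × 2 = 320, i.e. one output frame per 20 ms of 16 kHz audio, which is where the `#frames` dimension in the README example comes from. A small sketch that derives this from the published config:

```python
import math
from transformers import AutoConfig

config = AutoConfig.from_pretrained("rinna/japanese-hubert-large")
factor = math.prod(config.conv_stride)        # 5*2*2*2*2*2*2 = 320
print(factor, "samples per frame")            # 320
print(1000 * factor / 16000, "ms per frame")  # 20.0
```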
fairseq/model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9f1046daff2169846e024d0dab6214a67e768d0c3116e4be94cabd7bfb645889
+ size 1266606909
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
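
These settings mean the "feature extraction" is simply per-utterance normalization of the raw 16 kHz waveform (`feature_size` is 1; no spectral features are computed). A rough sketch of what `do_normalize: true` does, assuming the small epsilon conventionally used by Wav2Vec2-style extractors:

```python
import numpy as np

def normalize(waveform: np.ndarray) -> np.ndarray:
    # Per-utterance zero mean, unit variance (epsilon assumed, for numerical safety).
    return (waveform - waveform.mean()) / np.sqrt(waveform.var() + 1e-7)
```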
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6319cee367d17923b8dee987b9813bc8c9c70c7572bf828c59b4d628f177aaee
+ size 1261891557
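
Both `fairseq/model.pt` and `pytorch_model.bin` are Git LFS pointer files: the repository itself stores only the `oid sha256` and `size`, while the actual weights live in LFS storage. A minimal sketch for checking a downloaded file against the pointer's checksum (the path is a placeholder):

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream the file so large checkpoints do not need to fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

print(sha256_of("pytorch_model.bin"))  # should match the oid recorded above
```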
rinna.png ADDED