valhalla committed on
Commit
34ecec7
2 Parent(s): e73aa2c c584395

update weights

Browse files
.ipynb_checkpoints/config-checkpoint.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.15,
3
+ "activation_function": "relu",
4
+ "architectures": [
5
+ "Speech2TextTransformerForConditionalGeneration"
6
+ ],
7
+ "attention_dropout": 0.15,
8
+ "bos_token_id": 0,
9
+ "classifier_dropout": 0.0,
10
+ "conv_channels": 1024,
11
+ "conv_kernel_sizes": [
12
+ 5,
13
+ 5
14
+ ],
15
+ "d_model": 512,
16
+ "decoder_attention_heads": 8,
17
+ "decoder_ffn_dim": 2048,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 6,
20
+ "decoder_start_token_id": 2,
21
+ "dropout": 0.15,
22
+ "early_stopping": true,
23
+ "encoder_attention_heads": 8,
24
+ "encoder_ffn_dim": 2048,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 12,
27
+ "eos_token_id": 2,
28
+ "gradient_checkpointing": false,
29
+ "init_std": 0.02,
30
+ "input_channels": 1,
31
+ "input_feat_per_channel": 80,
32
+ "is_encoder_decoder": true,
33
+ "max_length": 200,
34
+ "max_source_positions": 6000,
35
+ "max_target_positions": 1024,
36
+ "model_type": "speech_to_text_transformer",
37
+ "num_beams": 5,
38
+ "num_conv_layers": 2,
39
+ "num_hidden_layers": 12,
40
+ "pad_token_id": 1,
41
+ "scale_embedding": true,
42
+ "transformers_version": "4.4.0.dev0",
43
+ "use_cache": true,
44
+ "vocab_size": 10000
45
+ }
README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ datasets:
4
+ - librispeech_asr
5
+ tags:
6
+ - audio
7
+ - automatic-speech-recognition
8
+ license: apache-2.0
9
+ ---
10
+
11
+ TODO: [To be filled]
12
+
13
+
14
+ ## Evaluation on LibriSpeech Test
15
+
16
+ The following script shows how to evaluate this model on the [LibriSpeech](https://huggingface.co/datasets/librispeech_asr) *"clean"* and *"other"* test datasets.
17
+
18
+ ```python
19
+ from datasets import load_dataset
20
+ from transformers import Speech2TextTransformerForConditionalGeneration, Speech2TextTransformerTokenizer
21
+ import soundfile as sf
22
+ from jiwer import wer
23
+
24
+ librispeech_eval = load_dataset("librispeech_asr", "clean", split="test") # change to "other" for other test dataset
25
+
26
+ model = Speech2TextTransformerForConditionalGeneration.from_pretrained("valhalla/s2t_librispeech_medium").to("cuda")
27
+ tokenizer = Speech2TextTransformerTokenizer.from_pretrained("valhalla/s2t_librispeech_medium", do_upper_case=True)
28
+
29
+ def map_to_array(batch):
30
+ speech, _ = sf.read(batch["file"])
31
+ batch["speech"] = speech
32
+ return batch
33
+
34
+ librispeech_eval = librispeech_eval.map(map_to_array)
35
+
36
+ def map_to_pred(batch):
37
+ features = tokenizer(batch["speech"], sample_rate=16000, padding=True, return_tensors="pt")
38
+ input_features = features.input_features.to("cuda")
39
+ attention_mask = features.attention_mask.to("cuda")
40
+
41
+ gen_tokens = model.generate(input_ids=input_features, attention_mask=attention_mask)
42
+ batch["transcription"] = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
43
+ return batch
44
+
45
+ result = librispeech_eval.map(map_to_pred, batched=True, batch_size=8, remove_columns=["speech"])
46
+
47
+ print("WER:", wer(result["text"], result["transcription"]))
48
+ ```
49
+
50
+ *Result (WER)*:
51
+
52
+ | "clean" | "other" |
53
+ |---|---|
54
+ | 3.5 | 7.8 |