valhalla committed on
Commit
517dc90
1 Parent(s): 762d8d7

update processor and readme

Browse files
README.md CHANGED
@@ -24,7 +24,7 @@ autoregressive cross-entropy loss and generates the transcripts autoregressively
24
  ## Intended uses & limitations
25
 
26
  This model can be used for end-to-end speech recognition (ASR).
27
- See the [model hub](https://huggingface.co/models?filter=speech_to_text_transformer) to look for other S2T checkpoints.
28
 
29
 
30
  ### How to use
@@ -35,16 +35,18 @@ transcripts by passing the speech features to the model.
35
  *Note: The `Speech2TextProcessor` object uses [torchaudio](https://github.com/pytorch/audio) to extract the
36
  filter bank features. Make sure to install the `torchaudio` package before running this example.*
37
 
38
- To install `torchaudio` run `pip install torchaudio`
 
 
39
 
40
 
41
  ```python
42
  import torch
43
- from transformers import Speech2TextProcessor, Speech2TextTransformerForConditionalGeneration
44
  from datasets import load_dataset
45
  import soundfile as sf
46
 
47
- model = Speech2TextTransformerForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr")
48
  processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr")
49
 
50
  def map_to_array(batch):
@@ -76,13 +78,13 @@ The following script shows how to evaluate this model on the [LibriSpeech](https
76
 
77
  ```python
78
  from datasets import load_dataset, load_metric
79
- from transformers import Speech2TextTransformerForConditionalGeneration, Speech2TextProcessor
80
  import soundfile as sf
81
 
82
  librispeech_eval = load_dataset("librispeech_asr", "clean", split="test") # change to "other" for other test dataset
83
  wer = load_metric("wer")
84
 
85
- model = Speech2TextTransformerForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr").to("cuda")
86
  processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr", do_upper_case=True)
87
 
88
  def map_to_array(batch):
 
24
  ## Intended uses & limitations
25
 
26
  This model can be used for end-to-end speech recognition (ASR).
27
+ See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look for other S2T checkpoints.
28
 
29
 
30
  ### How to use
 
35
  *Note: The `Speech2TextProcessor` object uses [torchaudio](https://github.com/pytorch/audio) to extract the
36
  filter bank features. Make sure to install the `torchaudio` package before running this example.*
37
 
38
+ You can either install those as extra speech dependencies with
39
+ `pip install transformers"[speech, sentencepiece]"` or install the packages separately
40
+ with `pip install torchaudio sentencepiece`.
41
 
42
 
43
  ```python
44
  import torch
45
+ from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
46
  from datasets import load_dataset
47
  import soundfile as sf
48
 
49
+ model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr")
50
  processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr")
51
 
52
  def map_to_array(batch):
 
78
 
79
  ```python
80
  from datasets import load_dataset, load_metric
81
+ from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
82
  import soundfile as sf
83
 
84
  librispeech_eval = load_dataset("librispeech_asr", "clean", split="test") # change to "other" for other test dataset
85
  wer = load_metric("wer")
86
 
87
+ model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr").to("cuda")
88
  processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr", do_upper_case=True)
89
 
90
  def map_to_array(batch):
config.json CHANGED
@@ -1,8 +1,9 @@
1
  {
 
2
  "activation_dropout": 0.15,
3
  "activation_function": "relu",
4
  "architectures": [
5
- "Speech2TextTransformerForConditionalGeneration"
6
  ],
7
  "attention_dropout": 0.15,
8
  "bos_token_id": 0,
@@ -33,7 +34,7 @@
33
  "max_length": 200,
34
  "max_source_positions": 6000,
35
  "max_target_positions": 1024,
36
- "model_type": "speech_to_text_transformer",
37
  "num_beams": 5,
38
  "num_conv_layers": 2,
39
  "num_hidden_layers": 12,
 
1
  {
2
+ "_name_or_path": "hf_models_fb/s2t-medium-librispeech-asr/",
3
  "activation_dropout": 0.15,
4
  "activation_function": "relu",
5
  "architectures": [
6
+ "Speech2TextForConditionalGeneration"
7
  ],
8
  "attention_dropout": 0.15,
9
  "bos_token_id": 0,
 
34
  "max_length": 200,
35
  "max_source_positions": 6000,
36
  "max_target_positions": 1024,
37
+ "model_type": "speech_to_text",
38
  "num_beams": 5,
39
  "num_conv_layers": 2,
40
  "num_hidden_layers": 12,
preprocessor_config.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "do_normalize": true,
3
  "feature_size": 80,
4
- "norm_means": true,
5
- "norm_vars": true,
6
  "num_mel_bins": 80,
7
  "padding_side": "right",
8
  "padding_value": 0.0,
 
1
  {
2
+ "do_ceptral_normalize": true,
3
  "feature_size": 80,
4
+ "normalize_means": true,
5
+ "normalize_vars": true,
6
  "num_mel_bins": 80,
7
  "padding_side": "right",
8
  "padding_value": 0.0,
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b016d9dde06a7d3f73855d19ef597b61055a5ef6d9ec0d12132c7f4077e2aea
3
- size 284968270
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57cc7b2911f0849f3d63bedffb8837e68ce407a2b2b7843c54ec0dae7037813f
3
+ size 284953916
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "do_upper_case": false, "do_lower_case": true, "tgt_lang": null, "lang_codes": null, "special_tokens_map_file": "/home/suraj/.cache/huggingface/transformers/f39f1499e9c4d2b3e803e3cad8a31c4cf3e626e1c69197d4cd6921e5c07007f9.9d6cd81ef646692fb1c169a880161ea1cb95f49694f220aced9b704b457e51dd", "tokenizer_file": null, "name_or_path": "hf_models_fb/s2t-small-librispeech-asr/"}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "do_upper_case": false, "do_lower_case": true, "tgt_lang": null, "lang_codes": null, "special_tokens_map_file": "/home/suraj/.cache/huggingface/transformers/f39f1499e9c4d2b3e803e3cad8a31c4cf3e626e1c69197d4cd6921e5c07007f9.9d6cd81ef646692fb1c169a880161ea1cb95f49694f220aced9b704b457e51dd", "tokenizer_file": null, "name_or_path": "hf_models_fb/s2t-medium-librispeech-asr/"}