facebook
/

s2t-large-librispeech-asr

@@ -35,16 +35,18 @@ transcripts by passing the speech features to the model.
 *Note: The `Speech2TextProcessor` object uses [torchaudio](https://github.com/pytorch/audio)  to extract the
 filter bank features. Make sure to install the `torchaudio` package before running this example.*
-To install `torchaudio` run `pip install torchaudio`
 ```python
 import torch
-from transformers import Speech2TextProcessor, Speech2TextTransformerForConditionalGeneration
 from datasets import load_dataset
 import soundfile as sf
-model = Speech2TextTransformerForConditionalGeneration.from_pretrained("facebook/s2t-large-librispeech-asr")
 processor = Speech2TextProcessor.from_pretrained("facebook/s2t-large-librispeech-asr")
 def map_to_array(batch):
@@ -76,13 +78,13 @@ The following script shows how to evaluate this model on the [LibriSpeech](https
 ```python
 from datasets import load_dataset, load_metric
-from transformers import Speech2TextTransformerForConditionalGeneration, Speech2TextProcessor
 import soundfile as sf
 librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")  # change to "other" for other test dataset
 wer = load_metric("wer")
-model = Speech2TextTransformerForConditionalGeneration.from_pretrained("facebook/s2t-large-librispeech-asr").to("cuda")
 processor = Speech2TextProcessor.from_pretrained("facebook/s2t-large-librispeech-asr", do_upper_case=True)
 def map_to_array(batch):

 *Note: The `Speech2TextProcessor` object uses [torchaudio](https://github.com/pytorch/audio)  to extract the
 filter bank features. Make sure to install the `torchaudio` package before running this example.*
+You can either install those as extra speech dependencies with
+`pip install "transformers[speech, sentencepiece]"` or install the packages separately
+with `pip install torchaudio sentencepiece`.
 ```python
 import torch
+from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
 from datasets import load_dataset
 import soundfile as sf
+model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-large-librispeech-asr")
 processor = Speech2TextProcessor.from_pretrained("facebook/s2t-large-librispeech-asr")
 def map_to_array(batch):
 ```python
 from datasets import load_dataset, load_metric
+from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
 import soundfile as sf
 librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")  # change to "other" for other test dataset
 wer = load_metric("wer")
+model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-large-librispeech-asr").to("cuda")
 processor = Speech2TextProcessor.from_pretrained("facebook/s2t-large-librispeech-asr", do_upper_case=True)
 def map_to_array(batch):

config.json CHANGED Viewed

@@ -1,8 +1,9 @@
 {
   "activation_dropout": 0.2,
   "activation_function": "relu",
   "architectures": [
-    "Speech2TextTransformerForConditionalGeneration"
   ],
   "attention_dropout": 0.2,
   "bos_token_id": 0,
@@ -33,7 +34,7 @@
   "max_length": 200,
   "max_source_positions": 6000,
   "max_target_positions": 1024,
-  "model_type": "speech_to_text_transformer",
   "num_beams": 5,
   "num_conv_layers": 2,
   "num_hidden_layers": 12,

 {
+  "_name_or_path": "hf_models_fb/s2t-large-librispeech-asr/",
   "activation_dropout": 0.2,
   "activation_function": "relu",
   "architectures": [
+    "Speech2TextForConditionalGeneration"
   ],
   "attention_dropout": 0.2,
   "bos_token_id": 0,
   "max_length": 200,
   "max_source_positions": 6000,
   "max_target_positions": 1024,
+  "model_type": "speech_to_text",
   "num_beams": 5,
   "num_conv_layers": 2,
   "num_hidden_layers": 12,

preprocessor_config.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-  "do_normalize": true,
   "feature_size": 80,
-  "norm_means": true,
-  "norm_vars": true,
   "num_mel_bins": 80,
   "padding_side": "right",
   "padding_value": 0.0,

 {
+  "do_ceptral_normalize": true,
   "feature_size": 80,
+  "normalize_means": true,
+  "normalize_vars": true,
   "num_mel_bins": 80,
   "padding_side": "right",
   "padding_value": 0.0,

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:02f8a1dd9fbebf969266e155a5a9df274f24bf82f544d000ec4cc55ca3ebda9a
-size 1071473998

 version https://git-lfs.github.com/spec/v1
+oid sha256:55e3aa76d71b3792f1a0a055316c0c205e0697a44778d4a6e6af4fc9994fd93c
+size 1071459644