patrickvonplaten committed
Commit e7617d1 (parent 301ef17)

Update README.md

Files changed (1): README.md (+16, -13)
README.md CHANGED
@@ -13,7 +13,7 @@ widget:
 - example_title: Librispeech sample 2
   src: https://cdn-media.huggingface.co/speech_samples/sample2.flac
 model-index:
-- name: wav2vec2-base-960h
+- name: patrickvonplaten/wav2vec2-base-960h-4-gram
   results:
   - task:
       name: Automatic Speech Recognition
@@ -25,7 +25,7 @@ model-index:
     metrics:
     - name: Test WER
       type: wer
-      value: 3.4
+      value: 2.59
 ---
 
 # Wav2Vec2-Base-960h + 4-gram
@@ -39,33 +39,36 @@ augmented with an English 4-gram. The `4-gram.arpa.gz` of [Librispeech's officia
 
 ```python
 from datasets import load_dataset
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+from transformers import AutoModelForCTC, AutoProcessor
 import torch
 from jiwer import wer
 
+model_id = "patrickvonplaten/wav2vec2-base-960h-4-gram"
 
-librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
+librispeech_eval = load_dataset("librispeech_asr", "other", split="test")
 
-model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda")
-processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+model = AutoModelForCTC.from_pretrained(model_id).to("cuda")
+processor = AutoProcessor.from_pretrained(model_id)
 
 def map_to_pred(batch):
-    input_values = processor(batch["audio"]["array"], return_tensors="pt", padding="longest").input_values
+    inputs = processor(batch["audio"]["array"], sampling_rate=16_000, return_tensors="pt")
+
+    inputs = {k: v.to("cuda") for k,v in inputs.items()}
+
     with torch.no_grad():
-        logits = model(input_values.to("cuda")).logits
+        logits = model(**inputs).logits
 
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.batch_decode(predicted_ids)
+    transcription = processor.batch_decode(logits.cpu().numpy()).text[0]
     batch["transcription"] = transcription
     return batch
 
-result = librispeech_eval.map(map_to_pred, batched=True, batch_size=1, remove_columns=["speech"])
+result = librispeech_eval.map(map_to_pred, remove_columns=["audio"])
 
-print("WER:", wer(result["text"], result["transcription"]))
+print(wer(result["text"], result["transcription"]))
 ```
 
 *Result (WER)*:
 
 | "clean" | "other" |
 |---|---|
-| 3.4 | 8.6 |
+| 2.59 | 6.46 |
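
For readers applying the updated snippet: because this repo ships a 4-gram language model, `AutoProcessor` loads a `Wav2Vec2ProcessorWithLM`, whose `batch_decode` runs a pyctcdecode beam search over the raw logits (hence the commit drops the `torch.argmax` step) and returns an output object whose `.text` field holds the decoded strings. Below is a minimal CPU sketch under that assumption; the dummy dataset and single-sample flow are illustrative choices, not part of the commit:

```python
# Minimal sketch of LM-boosted CTC decoding. Assumptions: the repo ships a
# 4-gram, so AutoProcessor resolves to Wav2Vec2ProcessorWithLM; the dummy
# dataset below is only for illustration.
import torch
from datasets import load_dataset
from transformers import AutoModelForCTC, AutoProcessor

model_id = "patrickvonplaten/wav2vec2-base-960h-4-gram"
model = AutoModelForCTC.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

# One 16 kHz sample; hf-internal-testing/librispeech_asr_dummy is the small
# split commonly used in transformers examples.
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
audio = ds[0]["audio"]["array"]

inputs = processor(audio, sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Wav2Vec2ProcessorWithLM.batch_decode expects numpy logits and returns an
# output object; .text is the list of decoded transcriptions.
print(processor.batch_decode(logits.numpy()).text[0])
```

Decoding the full logit distribution instead of argmax ids is what lets the 4-gram rescore candidate transcriptions, which is what the drop from 3.4 to 2.59 WER on test-clean in this commit reflects.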