patrickvonplaten committed on
Commit
e517d24
·
1 Parent(s): 78add66

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +6 -49
README.md CHANGED
@@ -6,6 +6,11 @@ tags:
6
  - speech
7
  - audio
8
  - automatic-speech-recognition
 
 
 
 
 
9
  license: apache-2.0
10
  ---
11
 
@@ -59,52 +64,4 @@ To transcribe audio files the model can be used as a standalone acoustic model a
59
  # take argmax and decode
60
  predicted_ids = torch.argmax(logits, dim=-1)
61
  transcription = processor.batch_decode(predicted_ids)
62
- ```
63
-
64
- ## Evaluation
65
-
66
- This code snippet shows how to evaluate **facebook/wav2vec2-large-960h-lv60-self** on LibriSpeech's "clean" and "other" test data.
67
-
68
- ```python
69
- from datasets import load_dataset
70
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
71
- import soundfile as sf
72
- import torch
73
- from jiwer import wer
74
-
75
-
76
- librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
77
-
78
- model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").to("cuda")
79
- processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
80
-
81
- def map_to_array(batch):
82
- speech, _ = sf.read(batch["file"])
83
- batch["speech"] = speech
84
- return batch
85
-
86
- librispeech_eval = librispeech_eval.map(map_to_array)
87
-
88
- def map_to_pred(batch):
89
- inputs = processor(batch["speech"], return_tensors="pt", padding="longest")
90
- input_values = inputs.input_values.to("cuda")
91
- attention_mask = inputs.attention_mask.to("cuda")
92
-
93
- with torch.no_grad():
94
- logits = model(input_values, attention_mask=attention_mask).logits
95
-
96
- predicted_ids = torch.argmax(logits, dim=-1)
97
- transcription = processor.batch_decode(predicted_ids)
98
- batch["transcription"] = transcription
99
- return batch
100
-
101
- result = librispeech_eval.map(map_to_pred, batched=True, batch_size=16, remove_columns=["speech"])
102
-
103
- print("WER:", wer(result["text"], result["transcription"]))
104
- ```
105
-
106
- *Result (WER)*:
107
-
108
- | "clean" | "other" |
109
- |---|---|
110
- | 1.9 | 3.9 |
 
6
  - speech
7
  - audio
8
  - automatic-speech-recognition
9
+ widget:
10
+ - label: Librispeech sample 1
11
+ src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
12
+ - label: Librispeech sample 2
13
+ src: https://cdn-media.huggingface.co/speech_samples/sample2.flac
14
  license: apache-2.0
15
  ---
16
 
 
64
  # take argmax and decode
65
  predicted_ids = torch.argmax(logits, dim=-1)
66
  transcription = processor.batch_decode(predicted_ids)
67
+ ```