patrickvonplaten committed
Commit e7617d1 (parent 301ef17)

Update README.md

Files changed (1): README.md (+16, -13)
README.md CHANGED
@@ -13,7 +13,7 @@ widget:
 - example_title: Librispeech sample 2
   src: https://cdn-media.huggingface.co/speech_samples/sample2.flac
 model-index:
-- name: wav2vec2-base-960h
+- name: patrickvonplaten/wav2vec2-base-960h-4-gram
   results:
   - task:
       name: Automatic Speech Recognition
@@ -25,7 +25,7 @@ model-index:
     metrics:
     - name: Test WER
       type: wer
-      value: 3.4
+      value: 2.59
 ---
 
 # Wav2Vec2-Base-960h + 4-gram
@@ -39,33 +39,36 @@ augmented with an English 4-gram. The `4-gram.arpa.gz` of [Librispeech's officia
 
 ```python
 from datasets import load_dataset
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+from transformers import AutoModelForCTC, AutoProcessor
 import torch
 from jiwer import wer
 
+model_id = "patrickvonplaten/wav2vec2-base-960h-4-gram"
 
-librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
+librispeech_eval = load_dataset("librispeech_asr", "other", split="test")
 
-model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda")
-processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+model = AutoModelForCTC.from_pretrained(model_id).to("cuda")
+processor = AutoProcessor.from_pretrained(model_id)
 
 def map_to_pred(batch):
-    input_values = processor(batch["audio"]["array"], return_tensors="pt", padding="longest").input_values
+    inputs = processor(batch["audio"]["array"], sampling_rate=16_000, return_tensors="pt")
+
+    inputs = {k: v.to("cuda") for k,v in inputs.items()}
+
     with torch.no_grad():
-        logits = model(input_values.to("cuda")).logits
+        logits = model(**inputs).logits
 
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.batch_decode(predicted_ids)
+    transcription = processor.batch_decode(logits.cpu().numpy()).text[0]
     batch["transcription"] = transcription
     return batch
 
-result = librispeech_eval.map(map_to_pred, batched=True, batch_size=1, remove_columns=["speech"])
+result = librispeech_eval.map(map_to_pred, remove_columns=["audio"])
 
-print("WER:", wer(result["text"], result["transcription"]))
+print(wer(result["text"], result["transcription"]))
 ```
 
 *Result (WER)*:
 
 | "clean" | "other" |
 |---|---|
-| 3.4 | 8.6 |
+| 2.59 | 6.46 |
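
For readers applying the updated snippet: because this repo ships a 4-gram language model, `AutoProcessor` loads a `Wav2Vec2ProcessorWithLM`, whose `batch_decode` runs a pyctcdecode beam search over the raw logits (hence the commit drops the `torch.argmax` step) and returns an output object whose `.text` field holds the decoded strings. Below is a minimal CPU sketch under that assumption; the dummy dataset and single-sample flow are illustrative choices, not part of the commit:

```python
# Minimal sketch of LM-boosted CTC decoding. Assumptions: the repo ships a
# 4-gram, so AutoProcessor resolves to Wav2Vec2ProcessorWithLM; the dummy
# dataset below is only for illustration.
import torch
from datasets import load_dataset
from transformers import AutoModelForCTC, AutoProcessor

model_id = "patrickvonplaten/wav2vec2-base-960h-4-gram"
model = AutoModelForCTC.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

# One 16 kHz sample; hf-internal-testing/librispeech_asr_dummy is the small
# split commonly used in transformers examples.
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
audio = ds[0]["audio"]["array"]

inputs = processor(audio, sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Wav2Vec2ProcessorWithLM.batch_decode expects numpy logits and returns an
# output object; .text is the list of decoded transcriptions.
print(processor.batch_decode(logits.numpy()).text[0])
```

Decoding the full logit distribution instead of argmax ids is what lets the 4-gram rescore candidate transcriptions, which is what the drop from 3.4 to 2.59 WER on test-clean in this commit reflects.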