patrickvonplaten committed
Commit • e7617d1
1 Parent(s): 301ef17
Update README.md

README.md CHANGED
@@ -13,7 +13,7 @@ widget:
 - example_title: Librispeech sample 2
   src: https://cdn-media.huggingface.co/speech_samples/sample2.flac
 model-index:
-- name: wav2vec2-base-960h
+- name: patrickvonplaten/wav2vec2-base-960h-4-gram
   results:
   - task:
       name: Automatic Speech Recognition
@@ -25,7 +25,7 @@ model-index:
     metrics:
     - name: Test WER
       type: wer
-      value:
+      value: 2.59
 ---
 
 # Wav2Vec2-Base-960h + 4-gram
@@ -39,33 +39,36 @@ augmented with an English 4-gram. The `4-gram.arpa.gz` of [Librispeech's officia
 
 ```python
 from datasets import load_dataset
-from transformers import
+from transformers import AutoModelForCTC, AutoProcessor
 import torch
 from jiwer import wer
 
+model_id = "patrickvonplaten/wav2vec2-base-960h-4-gram"
+
-librispeech_eval = load_dataset("librispeech_asr", "
+librispeech_eval = load_dataset("librispeech_asr", "other", split="test")
 
-model =
-processor =
+model = AutoModelForCTC.from_pretrained(model_id).to("cuda")
+processor = AutoProcessor.from_pretrained(model_id)
 
 def map_to_pred(batch):
-
+    inputs = processor(batch["audio"]["array"], sampling_rate=16_000, return_tensors="pt")
+
+    inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
     with torch.no_grad():
-        logits = model(
+        logits = model(**inputs).logits
 
-
-    transcription = processor.batch_decode(predicted_ids)
+    transcription = processor.batch_decode(logits.cpu().numpy()).text[0]
     batch["transcription"] = transcription
     return batch
 
-result = librispeech_eval.map(map_to_pred,
+result = librispeech_eval.map(map_to_pred, remove_columns=["audio"])
 
-print(
+print(wer(result["text"], result["transcription"]))
 ```
 
 *Result (WER)*:
 
 | "clean" | "other" |
 |---|---|
-
+| 2.59 | 6.46 |
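The substantive change in the hunk above is the decoding path: the old card took an argmax over the CTC logits and decoded token ids, while the updated card hands the raw logits to `processor.batch_decode`, which, for a checkpoint that ships a `Wav2Vec2ProcessorWithLM`, runs a pyctcdecode beam search rescored by the 4-gram LM. A minimal sketch contrasting the two paths, assuming the checkpoint loads as a `Wav2Vec2ProcessorWithLM` (the `transcribe` helper is illustrative, not part of the commit):

```python
import torch
from transformers import AutoModelForCTC, AutoProcessor

model_id = "patrickvonplaten/wav2vec2-base-960h-4-gram"
model = AutoModelForCTC.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)  # resolves to Wav2Vec2ProcessorWithLM

def transcribe(waveform):
    # waveform: 1-D float array sampled at 16 kHz
    inputs = processor(waveform, sampling_rate=16_000, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits

    # Old path: greedy argmax over the vocabulary, no language model.
    greedy_ids = torch.argmax(logits, dim=-1)
    greedy_text = processor.tokenizer.batch_decode(greedy_ids)[0]

    # New path: beam search over the raw logits, rescored by the 4-gram LM.
    lm_text = processor.batch_decode(logits.cpu().numpy()).text[0]
    return greedy_text, lm_text
```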
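The truncated hunk header references the `4-gram.arpa.gz` from Librispeech's official LM resources. The commit itself only edits the README, but for context, a checkpoint like this is typically assembled by attaching that ARPA file to the base model's CTC vocabulary with pyctcdecode. A hedged sketch under that assumption; the local paths are illustrative, and `4-gram.arpa` stands for the unpacked archive:

```python
from pyctcdecode import build_ctcdecoder
from transformers import AutoFeatureExtractor, AutoTokenizer, Wav2Vec2ProcessorWithLM

# Vocabulary of the base CTC model, ordered by token id.
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
vocab = [tok for tok, _ in sorted(tokenizer.get_vocab().items(), key=lambda kv: kv[1])]

# Beam-search decoder backed by the KenLM 4-gram (path is a placeholder).
decoder = build_ctcdecoder(labels=vocab, kenlm_model_path="4-gram.arpa")

# Bundle feature extractor, tokenizer, and LM decoder into one processor.
processor = Wav2Vec2ProcessorWithLM(
    feature_extractor=AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h"),
    tokenizer=tokenizer,
    decoder=decoder,
)
processor.save_pretrained("wav2vec2-base-960h-4-gram")
```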
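For a quick spot check of the updated card, the high-level ASR pipeline can drive the same checkpoint end to end; it fetches and resamples the audio and applies the LM-boosted decoder automatically. A usage sketch, not part of the diff, reusing the sample URL from the card's widget metadata:

```python
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="patrickvonplaten/wav2vec2-base-960h-4-gram",
)

# Accepts a local path or URL to an audio file.
print(asr("https://cdn-media.huggingface.co/speech_samples/sample2.flac")["text"])
```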