update model card
Browse files
README.md
CHANGED
@@ -5,10 +5,8 @@ datasets:
|
|
5 |
tags:
|
6 |
- audio
|
7 |
- automatic-speech-recognition
|
|
|
8 |
license: apache-2.0
|
9 |
-
widget:
|
10 |
-
- label: Sample 1 (from LibriSpeech)
|
11 |
-
src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
|
12 |
---
|
13 |
|
14 |
# Wav2Vec2-Base-TIMIT
|
@@ -22,18 +20,24 @@ When using this model, make sure that your speech input is sampled at 16kHz.
|
|
22 |
The model can be used directly (without a language model) as follows:
|
23 |
|
24 |
```python
|
|
|
25 |
import torch
|
26 |
from datasets import load_dataset
|
27 |
-
import soundfile as sf
|
28 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
29 |
|
30 |
-
model_name = "elgeish/wav2vec2-base-timit"
|
31 |
processor = Wav2Vec2Processor.from_pretrained(model_name, do_lower_case=True)
|
32 |
model = Wav2Vec2ForCTC.from_pretrained(model_name)
|
33 |
-
|
|
|
|
|
|
|
34 |
|
35 |
def prepare_example(example):
|
36 |
example["speech"], _ = sf.read(example["file"])
|
|
|
|
|
|
|
37 |
return example
|
38 |
|
39 |
dataset = dataset.map(prepare_example, remove_columns=["file"])
|
@@ -41,6 +45,7 @@ inputs = processor(dataset["speech"], sampling_rate=16000, return_tensors="pt",
|
|
41 |
|
42 |
with torch.no_grad():
|
43 |
predicted_ids = torch.argmax(model(inputs.input_values).logits, dim=-1)
|
|
|
44 |
predicted_transcripts = processor.tokenizer.batch_decode(predicted_ids)
|
45 |
for reference, predicted in zip(dataset["text"], predicted_transcripts):
|
46 |
print("reference:", reference)
|
@@ -51,39 +56,38 @@ for reference, predicted in zip(dataset["text"], predicted_transcripts):
|
|
51 |
Here's the output:
|
52 |
|
53 |
```
|
54 |
-
reference:
|
55 |
-
predicted:
|
56 |
-
--
|
57 |
-
reference: Don't ask me to carry an oily rag like that.
|
58 |
-
predicted: don't ask me to carry an oily rag like that
|
59 |
--
|
60 |
-
reference:
|
61 |
-
predicted:
|
62 |
--
|
63 |
-
reference:
|
64 |
-
predicted:
|
65 |
--
|
66 |
-
reference:
|
67 |
-
predicted:
|
68 |
--
|
69 |
-
reference:
|
70 |
-
predicted:
|
71 |
--
|
72 |
-
reference:
|
73 |
-
predicted:
|
74 |
--
|
75 |
-
reference:
|
76 |
-
predicted:
|
77 |
--
|
78 |
-
reference:
|
79 |
-
predicted:
|
80 |
--
|
81 |
-
reference:
|
82 |
-
predicted:
|
83 |
--
|
|
|
|
|
84 |
```
|
85 |
|
86 |
## Fine-Tuning Script
|
87 |
|
88 |
You can find the script used to produce this model
|
89 |
-
[here](https://github.com/elgeish/transformers/blob/
|
|
|
5 |
tags:
|
6 |
- audio
|
7 |
- automatic-speech-recognition
|
8 |
+
- speech
|
9 |
license: apache-2.0
|
|
|
|
|
|
|
10 |
---
|
11 |
|
12 |
# Wav2Vec2-Base-TIMIT
|
|
|
20 |
The model can be used directly (without a language model) as follows:
|
21 |
|
22 |
```python
|
23 |
+
import soundfile as sf
|
24 |
import torch
|
25 |
from datasets import load_dataset
|
|
|
26 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
27 |
|
28 |
+
model_name = "elgeish/wav2vec2-base-timit-asr"
|
29 |
processor = Wav2Vec2Processor.from_pretrained(model_name, do_lower_case=True)
|
30 |
model = Wav2Vec2ForCTC.from_pretrained(model_name)
|
31 |
+
model.eval()
|
32 |
+
|
33 |
+
dataset = load_dataset("timit_asr", split="test").shuffle().select(range(10))
|
34 |
+
char_translations = str.maketrans({"-": " ", ".": "", "?": ""})
|
35 |
|
36 |
def prepare_example(example):
|
37 |
example["speech"], _ = sf.read(example["file"])
|
38 |
+
example["text"] = example["text"].translate(char_translations)
|
39 |
+
example["text"] = " ".join(example["text"].split()) # collapse runs of whitespace into single spaces
|
40 |
+
example["text"] = example["text"].lower()
|
41 |
return example
|
42 |
|
43 |
dataset = dataset.map(prepare_example, remove_columns=["file"])
|
|
|
45 |
|
46 |
with torch.no_grad():
|
47 |
predicted_ids = torch.argmax(model(inputs.input_values).logits, dim=-1)
|
48 |
+
predicted_ids[predicted_ids == -100] = processor.tokenizer.pad_token_id # see fine-tuning script
|
49 |
predicted_transcripts = processor.tokenizer.batch_decode(predicted_ids)
|
50 |
for reference, predicted in zip(dataset["text"], predicted_transcripts):
|
51 |
print("reference:", reference)
|
|
|
56 |
Here's the output:
|
57 |
|
58 |
```
|
59 |
+
reference: she had your dark suit in greasy wash water all year
|
60 |
+
predicted: she had your dark suit in greasy wash water all year
|
|
|
|
|
|
|
61 |
--
|
62 |
+
reference: where were you while we were away
|
63 |
+
predicted: where were you while we were away
|
64 |
--
|
65 |
+
reference: cory and trish played tag with beach balls for hours
|
66 |
+
predicted: tcory and trish played tag with beach balls for hours
|
67 |
--
|
68 |
+
reference: tradition requires parental approval for under age marriage
|
69 |
+
predicted: tradition requires parrental proval for under age marrage
|
70 |
--
|
71 |
+
reference: objects made of pewter are beautiful
|
72 |
+
predicted: objects made of puder are bautiful
|
73 |
--
|
74 |
+
reference: don't ask me to carry an oily rag like that
|
75 |
+
predicted: don't o ask me to carry an oily rag like that
|
76 |
--
|
77 |
+
reference: cory and trish played tag with beach balls for hours
|
78 |
+
predicted: cory and trish played tag with beach balls for ours
|
79 |
--
|
80 |
+
reference: don't ask me to carry an oily rag like that
|
81 |
+
predicted: don't ask me to carry an oily rag like that
|
82 |
--
|
83 |
+
reference: don't do charlie's dirty dishes
|
84 |
+
predicted: don't do chawly's tirty dishes
|
85 |
--
|
86 |
+
reference: only those story tellers will remain who can imitate the style of the virtuous
|
87 |
+
predicted: only those story tillaers will remain who can imvitate the style the virtuous
|
88 |
```
|
89 |
|
90 |
## Fine-Tuning Script
|
91 |
|
92 |
You can find the script used to produce this model
|
93 |
+
[here](https://github.com/elgeish/transformers/blob/cfc0bd01f2ac2ea3a5acc578ef2e204bf4304de7/examples/research_projects/wav2vec2/finetune_base_timit_asr.sh).
|