patrickvonplaten committed on
Commit
d72d8ff
1 Parent(s): 63fb1ed

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +3 -19
README.md CHANGED
@@ -48,25 +48,17 @@ To transcribe audio files the model can be used as a standalone acoustic model a
48
  ```python
49
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
50
  from datasets import load_dataset
51
- import soundfile as sf
52
  import torch
53
 
54
  # load model and processor
55
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
56
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
57
-
58
- # define function to read in sound file
59
- def map_to_array(batch):
60
- speech, _ = sf.read(batch["file"])
61
- batch["speech"] = speech
62
- return batch
63
 
64
  # load dummy dataset and read soundfiles
65
  ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
66
- ds = ds.map(map_to_array)
67
 
68
  # tokenize
69
- input_values = processor(ds["speech"][:2], return_tensors="pt", padding="longest").input_values # Batch size 1
70
 
71
  # retrieve logits
72
  logits = model(input_values).logits
@@ -83,7 +75,6 @@ To transcribe audio files the model can be used as a standalone acoustic model a
83
  ```python
84
  from datasets import load_dataset
85
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
86
- import soundfile as sf
87
  import torch
88
  from jiwer import wer
89
 
@@ -93,15 +84,8 @@ librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
93
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").to("cuda")
94
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
95
 
96
- def map_to_array(batch):
97
- speech, _ = sf.read(batch["file"])
98
- batch["speech"] = speech
99
- return batch
100
-
101
- librispeech_eval = librispeech_eval.map(map_to_array)
102
-
103
  def map_to_pred(batch):
104
- inputs = processor(batch["speech"], return_tensors="pt", padding="longest")
105
  input_values = inputs.input_values.to("cuda")
106
  attention_mask = inputs.attention_mask.to("cuda")
107
 
@@ -113,7 +97,7 @@ def map_to_pred(batch):
113
  batch["transcription"] = transcription
114
  return batch
115
 
116
- result = librispeech_eval.map(map_to_pred, batched=True, batch_size=16, remove_columns=["speech"])
117
 
118
  print("WER:", wer(result["text"], result["transcription"]))
119
  ```
 
48
  ```python
49
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
50
  from datasets import load_dataset
 
51
  import torch
52
 
53
  # load model and processor
54
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
55
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
 
 
 
 
 
 
56
 
57
  # load dummy dataset and read soundfiles
58
  ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
 
59
 
60
  # tokenize
61
+ input_values = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest").input_values
62
 
63
  # retrieve logits
64
  logits = model(input_values).logits
 
75
  ```python
76
  from datasets import load_dataset
77
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 
78
  import torch
79
  from jiwer import wer
80
 
 
84
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").to("cuda")
85
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
86
 
 
 
 
 
 
 
 
87
  def map_to_pred(batch):
88
+ inputs = processor(batch["audio"]["array"], return_tensors="pt", padding="longest")
89
  input_values = inputs.input_values.to("cuda")
90
  attention_mask = inputs.attention_mask.to("cuda")
91
 
 
97
  batch["transcription"] = transcription
98
  return batch
99
 
100
+ result = librispeech_eval.map(map_to_pred, remove_columns=["audio"])
101
 
102
  print("WER:", wer(result["text"], result["transcription"]))
103
  ```