patrickvonplaten committed on
Commit
8e7d147
1 Parent(s): 976e11d

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +2 -18
README.md CHANGED
@@ -47,25 +47,17 @@ To transcribe audio files the model can be used as a standalone acoustic model a
47
  ```python
48
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
49
  from datasets import load_dataset
50
- import soundfile as sf
51
  import torch
52
 
53
  # load model and processor
54
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60")
55
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60")
56
 
57
- # define function to read in sound file
58
- def map_to_array(batch):
59
- speech, _ = sf.read(batch["file"])
60
- batch["speech"] = speech
61
- return batch
62
-
63
  # load dummy dataset and read soundfiles
64
  ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
65
- ds = ds.map(map_to_array)
66
 
67
  # tokenize
68
- input_values = processor(ds["speech"][:2], return_tensors="pt", padding="longest").input_values # Batch size 1
69
 
70
  # retrieve logits
71
  logits = model(input_values).logits
@@ -82,7 +74,6 @@ This code snippet shows how to evaluate **facebook/wav2vec2-large-960h-lv60** on
82
  ```python
83
  from datasets import load_dataset
84
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
85
- import soundfile as sf
86
  import torch
87
  from jiwer import wer
88
 
@@ -92,15 +83,8 @@ librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
92
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60").to("cuda")
93
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60")
94
 
95
- def map_to_array(batch):
96
- speech, _ = sf.read(batch["file"])
97
- batch["speech"] = speech
98
- return batch
99
-
100
- librispeech_eval = librispeech_eval.map(map_to_array)
101
-
102
  def map_to_pred(batch):
103
- inputs = processor(batch["speech"], return_tensors="pt", padding="longest")
104
  input_values = inputs.input_values.to("cuda")
105
  attention_mask = inputs.attention_mask.to("cuda")
106
 
47
  ```python
48
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
49
  from datasets import load_dataset
 
50
  import torch
51
 
52
  # load model and processor
53
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60")
54
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60")
55
 
 
 
 
 
 
 
56
  # load dummy dataset and read soundfiles
57
  ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
 
58
 
59
  # tokenize
60
+ input_values = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest").input_values # Batch size 1
61
 
62
  # retrieve logits
63
  logits = model(input_values).logits
74
  ```python
75
  from datasets import load_dataset
76
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 
77
  import torch
78
  from jiwer import wer
79
 
83
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60").to("cuda")
84
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60")
85
 
 
 
 
 
 
 
 
86
  def map_to_pred(batch):
87
+ inputs = processor(batch["audio"]["array"], return_tensors="pt", padding="longest")
88
  input_values = inputs.input_values.to("cuda")
89
  attention_mask = inputs.attention_mask.to("cuda")
90