patrickvonplaten committed on
Commit
bdeaacd
1 Parent(s): 6c9a717

Update README.md

Files changed (1)
  1. README.md +2 -17
README.md CHANGED
@@ -33,25 +33,17 @@ To transcribe audio files the model can be used as a standalone acoustic model a
  ```python
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
  from datasets import load_dataset
- import soundfile as sf
  import torch

  # load model and processor
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
-
- # define function to read in sound file
- def map_to_array(batch):
-     speech, _ = sf.read(batch["file"])
-     batch["speech"] = speech
-     return batch

  # load dummy dataset and read soundfiles
  ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
- ds = ds.map(map_to_array)

  # tokenize
- input_values = processor(ds["speech"][:2], return_tensors="pt", padding="longest").input_values # Batch size 1
+ input_values = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest").input_values # Batch size 1

  # retrieve logits
  logits = model(input_values).logits
@@ -78,15 +70,8 @@ librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h").to("cuda")
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

- def map_to_array(batch):
-     speech, _ = sf.read(batch["file"])
-     batch["speech"] = speech
-     return batch
-
- librispeech_eval = librispeech_eval.map(map_to_array)
-
  def map_to_pred(batch):
-     input_values = processor(batch["speech"], return_tensors="pt", padding="longest").input_values
+     input_values = processor(batch["audio"]["array"], return_tensors="pt", padding="longest").input_values
      with torch.no_grad():
          logits = model(input_values.to("cuda")).logits
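
For reference, the first README snippet after this change reads roughly as follows: the audio is taken straight from the dataset's decoded `audio` column instead of being read from disk with `soundfile`. This is a sketch; the final decoding step (`torch.argmax` plus `processor.batch_decode`) is not part of this diff and is assumed from standard `transformers` usage.

```python
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torch

# load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# load dummy dataset; `datasets` decodes the "audio" column to a float array
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

# tokenize the raw waveform of the first sample (batch size 1)
input_values = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest").input_values

# retrieve logits
logits = model(input_values).logits

# greedy CTC decoding (assumption: not shown in the diff above)
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
```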
 
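Likewise, a sketch of the evaluation snippet after this change. Only the `map_to_pred` input line is taken from the diff; the greedy decoding, the `.map(..., remove_columns=["audio"])` call, and the WER computation via `jiwer` are assumptions based on common evaluation code for this model.

```python
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
from jiwer import wer  # assumption: WER computed with jiwer
import torch

librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h").to("cuda")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

def map_to_pred(batch):
    # feed the decoded waveform directly, as introduced by this commit
    input_values = processor(batch["audio"]["array"], return_tensors="pt", padding="longest").input_values
    with torch.no_grad():
        logits = model(input_values.to("cuda")).logits

    # greedy CTC decoding (assumption: not part of this diff)
    predicted_ids = torch.argmax(logits, dim=-1)
    batch["transcription"] = processor.batch_decode(predicted_ids)[0]
    return batch

# one example per call; drop the decoded audio so it is not written back into the dataset
result = librispeech_eval.map(map_to_pred, remove_columns=["audio"])

print("WER:", wer(result["text"], result["transcription"]))
```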