aapot committed
Commit: 6157892
Parent: 2d8ee3e

Update README.md

Files changed (1): README.md (+7, -6)
README.md CHANGED
````diff
@@ -36,6 +36,7 @@ When using this model, make sure that your speech input is sampled at 16kHz.
 The model can be used directly (without a language model) as follows:
 
 ```python
+import librosa
 import torch
 import torchaudio
 from datasets import load_dataset
@@ -46,20 +47,20 @@ test_dataset = load_dataset("common_voice", "fi", split="test[:2%]")
 processor = Wav2Vec2Processor.from_pretrained("aapot/wav2vec2-large-xlsr-53-finnish")
 model = Wav2Vec2ForCTC.from_pretrained("aapot/wav2vec2-large-xlsr-53-finnish")
 
-resampler = torchaudio.transforms.Resample(48_000, 16_000)
+resampler = lambda sr, y: librosa.resample(y.squeeze(), sr, 16_000)
 
 # Preprocessing the datasets.
-# We need to read the aduio files as arrays
+# We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-    speech_array, sampling_rate = torchaudio.load(batch["path"])
-    batch["speech"] = resampler(speech_array).squeeze().numpy()
-    return batch
+    speech_array, sampling_rate = torchaudio.load(batch["path"])
+    batch["speech"] = resampler(sampling_rate, speech_array.numpy()).squeeze()
+    return batch
 
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
 
 with torch.no_grad():
-    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 
 predicted_ids = torch.argmax(logits, dim=-1)
 
````
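
The substance of the change: the old snippet hard-coded a 48 kHz→16 kHz `torchaudio` resampler, while the new one resamples with `librosa` from whatever rate `torchaudio.load` actually reports, so clips not recorded at 48 kHz are handled correctly. A consolidated, runnable version of the post-commit snippet is sketched below. The `from transformers import ...` line falls between the two hunks and is not visible in this diff, so it is an assumption; the version caveat in the comments is likewise mine (the positional call `librosa.resample(y, sr, 16_000)` matches librosa releases before 0.10, which made those arguments keyword-only).

```python
import librosa
import torch
import torchaudio
from datasets import load_dataset
# Assumed import, not shown in the diff context:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

test_dataset = load_dataset("common_voice", "fi", split="test[:2%]")

processor = Wav2Vec2Processor.from_pretrained("aapot/wav2vec2-large-xlsr-53-finnish")
model = Wav2Vec2ForCTC.from_pretrained("aapot/wav2vec2-large-xlsr-53-finnish")

# Resample from the file's native rate to the 16 kHz the model expects.
# Positional arguments work on librosa < 0.10; newer releases require
# librosa.resample(y, orig_sr=sr, target_sr=16_000).
resampler = lambda sr, y: librosa.resample(y.squeeze(), sr, 16_000)

# Preprocessing the datasets.
# We need to read the audio files as arrays.
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(sampling_rate, speech_array.numpy()).squeeze()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
```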