johntsi committed on
Commit
d7e71fb
1 Parent(s): fcd47e1

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +9 -3
README.md CHANGED
@@ -257,7 +257,14 @@ This version of ZeroSwot is trained with ASR data from CommonVoice, and adapting
257
 
258
  ```python
259
  from transformers import Wav2Vec2Processor, NllbTokenizer, AutoModel, AutoModelForSeq2SeqLM
260
- import soundfile as sf
 
 
 
 
 
 
 
261
 
262
  # Load processors and tokenizers
263
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
@@ -277,8 +284,7 @@ nllb_model.eval()
277
  nllb_model.to("cuda")
278
 
279
  # Load sample .wav
280
- audio, sr = sf.read("sample.wav")
281
- assert sr == 16000, "Input of wav2vec2.0 is expected to have sampling rate of 16,000"
282
  input_values = processor(audio, sampling_rate=16000, return_tensors="pt").cuda()
283
 
284
  # translation to German
 
257
 
258
  ```python
259
  from transformers import Wav2Vec2Processor, NllbTokenizer, AutoModel, AutoModelForSeq2SeqLM
260
+ import torchaudio
261
+
262
+ def load_and_resample_audio(audio_path, target_sr=16000):
263
+ audio, orig_freq = torchaudio.load(audio_path)
264
+ if orig_freq != target_sr:
265
+ audio = torchaudio.functional.resample(audio, orig_freq=orig_freq, new_freq=target_sr)
266
+ audio = audio.squeeze(0).numpy()
267
+ return audio
268
 
269
  # Load processors and tokenizers
270
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
 
284
  nllb_model.to("cuda")
285
 
286
  # Load sample .wav
287
+ audio = load_and_resample_audio("sample.wav")
 
288
  input_values = processor(audio, sampling_rate=16000, return_tensors="pt").cuda()
289
 
290
  # translation to German