johntsi
/

ZeroSwot-Medium_asr-cv_en-to-200

Automatic Speech Recognition

zero_swot_encoder

feature-extraction

speech translation

Model card Files Files and versions Community

johntsi committed on Jun 25

Commit

d7e71fb

•

1 Parent(s): fcd47e1

Update README.md

Files changed (1) hide show

README.md +9 -3

README.md CHANGED Viewed

@@ -257,7 +257,14 @@ This version of ZeroSwot is trained with ASR data from CommonVoice, and adapting
 ```python
 from transformers import Wav2Vec2Processor, NllbTokenizer, AutoModel, AutoModelForSeq2SeqLM
-import soundfile as sf
 # Load processors and tokenizers
 processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
@@ -277,8 +284,7 @@ nllb_model.eval()
 nllb_model.to("cuda")
 # Load sample .wav
-audio, sr = sf.read("sample.wav")
-assert sr == 16000, "Input of wav2vec2.0 is expected to have sampling rate of 16,000"
 input_values = processor(audio, sampling_rate=16000, return_tensors="pt").cuda()
 # translation to German

 ```python
 from transformers import Wav2Vec2Processor, NllbTokenizer, AutoModel, AutoModelForSeq2SeqLM
+import torchaudio
+def load_and_resample_audio(audio_path, target_sr=16000):
+    audio, orig_freq = torchaudio.load(audio_path)
+    if orig_freq != target_sr:
+        audio = torchaudio.functional.resample(audio, orig_freq=orig_freq, new_freq=target_sr)
+    audio = audio.squeeze(0).numpy()
+    return audio
 # Load processors and tokenizers
 processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
 nllb_model.to("cuda")
 # Load sample .wav
+audio = load_and_resample_audio("sample.wav")
 input_values = processor(audio, sampling_rate=16000, return_tensors="pt").cuda()
 # translation to German