# import
import librosa, torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# load the tokenizer and model
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# load the audio data (use your own wav file here!)
input_audio, sr = librosa.load('my_wav_file.wav', sr=16000)

# tokenize
input_values = tokenizer(input_audio, return_tensors="pt", padding="longest").input_values

# retrieve logits
logits = model(input_values).logits

# take argmax and decode
transcription = tokenizer.batch_decode(torch.argmax(logits, dim=-1))

# print the output
print(transcription)
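
# --- hedged variant, assuming a recent transformers release ---
# In newer versions of transformers, Wav2Vec2Tokenizer is deprecated in
# favor of Wav2Vec2Processor (which wraps the feature extractor and the
# CTC tokenizer), and inference is usually wrapped in torch.no_grad()
# to skip gradient tracking. This sketch reproduces the same pipeline
# under those assumptions, reusing input_audio, sr, and model from above.
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

# the processor wants the sampling rate so it can check it matches the model
input_values = processor(input_audio, sampling_rate=sr, return_tensors="pt", padding="longest").input_values

# forward pass without building a computation graph
with torch.no_grad():
    logits = model(input_values).logits

# greedy CTC decode: argmax over the vocabulary, then collapse repeats/blanks
transcription = processor.batch_decode(torch.argmax(logits, dim=-1))
print(transcription)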