"""Smoke-test: verify the Whisper ASR environment end-to-end.

Loads openai/whisper-tiny, streams one Russian Common Voice 16.1
validation sample, transcribes it, and prints whether the generated
sequence has the expected length (20 tokens) as an environment check.
"""
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset, Audio

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny", low_cpu_mem_usage=True)
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

# Fall back to CPU when no GPU is available instead of crashing on .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# streaming=True: download only the single sample we consume, not the split.
common_voice = load_dataset("mozilla-foundation/common_voice_16_1", "ru", split="validation", streaming=True)
# Resample on the fly to the rate the Whisper feature extractor expects (16 kHz).
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

# Take the first streamed sample and turn it into log-mel input features.
sample = next(iter(common_voice))
inputs = processor(
    sample["audio"]["array"],
    sampling_rate=processor.feature_extractor.sampling_rate,  # keep consistent with the cast above
    return_tensors="pt",
)
input_features = inputs.input_features

generated_ids = model.generate(input_features.to(device), max_new_tokens=128)
pred_text = processor.decode(generated_ids[0], skip_special_tokens=True)
print("Pred text:", pred_text)
# NOTE(review): 20 is the expected token count for this specific first sample —
# brittle by design (it is the environment check); confirm if the dataset revision changes.
print("Environment set up successful?", generated_ids.shape[-1] == 20)