# Example: transcribe one utterance from the dummy LibriSpeech validation split
# with the TensorFlow Speech2Text model and its matching processor.

import tensorflow as tf
from transformers import Speech2TextProcessor, TFSpeech2TextForConditionalGeneration
from datasets import load_dataset
import soundfile as sf

# from_pt=True asks Transformers to load the checkpoint's PyTorch weights into
# the TensorFlow model class.
model = TFSpeech2TextForConditionalGeneration.from_pretrained(
    "facebook/s2t-small-librispeech-asr", from_pt=True
)
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")


def map_to_array(batch):
    """Load the audio referenced by ``batch["file"]`` into ``batch["speech"]``.

    Args:
        batch: A mapping with a ``"file"`` entry that ``soundfile.read`` can
            open (used below as the per-example callback of ``ds.map``).

    Returns:
        The same ``batch`` object, mutated in place with a new ``"speech"``
        entry holding the decoded audio samples.
    """
    # soundfile.read returns (samples, sample_rate); the sample rate is
    # intentionally discarded here.
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch


ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.map(map_to_array)
ds.set_format(type="tf")

# NOTE(review): the sampling rate is hard-coded because map_to_array drops the
# rate read from the file; this assumes the dataset audio is 16 kHz - confirm
# before reusing this script with other data.
input_features = processor(
    ds["speech"][0], sampling_rate=16000, return_tensors="tf"
).input_features  # Batch size 1
generated_ids = model.generate(input_features)

# skip_special_tokens=True keeps tokenizer special tokens (e.g. "</s>") out of
# the decoded text so `transcription` contains only the recognized words.
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)