# minutes / app.py
# adirsingh96's picture
# first app
# 8318150
import tensorflow as tf
from transformers import Speech2TextProcessor, TFSpeech2TextForConditionalGeneration
from datasets import load_dataset
import soundfile as sf
# Single checkpoint id shared by the model and its processor so the two
# cannot drift apart.
_CHECKPOINT = "facebook/s2t-small-librispeech-asr"

# from_pt=True: have Transformers load the TensorFlow model from the
# checkpoint's PyTorch weights.
model = TFSpeech2TextForConditionalGeneration.from_pretrained(_CHECKPOINT, from_pt=True)
processor = Speech2TextProcessor.from_pretrained(_CHECKPOINT)
def map_to_array(batch):
    """Load the audio referenced by ``batch["file"]`` into ``batch["speech"]``.

    The file is read with ``soundfile.read``; only the first returned value
    (the sample data) is kept, the second one is discarded.

    Args:
        batch: Mapping with a ``"file"`` entry that ``soundfile.read`` can open.

    Returns:
        The same ``batch`` object, with the ``"speech"`` entry set.
    """
    waveform, _unused_rate = sf.read(batch["file"])
    batch["speech"] = waveform
    return batch
# Load the validation split of the dummy LibriSpeech dataset and run
# map_to_array over it so each row gains a "speech" entry.
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
).map(map_to_array)
ds.set_format(type="tf")

# Featurize only the first utterance (batch size 1).
# NOTE(review): sampling_rate is a fixed 16000; the rate actually stored in
# the audio files is dropped by map_to_array and never checked -- confirm.
_first_utterance = ds["speech"][0]
_encoded = processor(_first_utterance, sampling_rate=16000, return_tensors="tf")
input_features = _encoded.input_features

# Generate token ids from the features, then decode them to text.
generated_ids = model.generate(input_features)
transcription = processor.batch_decode(generated_ids)