"""Gradio demo: translate spoken English audio into German/French/Italian text
using facebook/wav2vec2-xls-r-300m-en-to-15 (speech encoder + mBART-style decoder).
"""
import os

import gradio as gr
import librosa
from transformers import AutoFeatureExtractor, AutoTokenizer, SpeechEncoderDecoderModel

MODEL_ID = "facebook/wav2vec2-xls-r-300m-en-to-15"

# SECURITY FIX: the original hard-coded a live org API token in source.
# Read it from the environment instead; None falls back to anonymous access,
# which works for public models.
AUTH_TOKEN = os.environ.get("HF_TOKEN")

feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID, use_auth_token=AUTH_TOKEN)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_auth_token=AUTH_TOKEN, use_fast=False)
model = SpeechEncoderDecoderModel.from_pretrained(MODEL_ID, use_auth_token=AUTH_TOKEN)

# wav2vec2 models are trained on 16 kHz audio.
TARGET_SAMPLE_RATE = 16_000

# Display name -> mBART-50 language code for the decoder's forced BOS token.
# NOTE(review): codes assumed from mBART-50 conventions — confirm against the model card.
LANG_CODES = {"German": "de_DE", "French": "fr_XX", "Italian": "it_IT"}


def process_audio_file(file):
    """Load an audio file from *file* (a filepath), resample to 16 kHz if needed,
    and return the feature extractor's ``input_values`` tensor (batch of 1).
    """
    data, sr = librosa.load(file)
    if sr != TARGET_SAMPLE_RATE:
        # API FIX: librosa >= 0.10 requires keyword arguments here; the original
        # positional call `librosa.resample(data, sr, 16000)` raises TypeError.
        data = librosa.resample(data, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)
    # Pass sampling_rate explicitly so the extractor can validate it.
    input_values = feature_extractor(
        data, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
    ).input_values
    return input_values


def transcribe(target_language, file):
    """Translate spoken English audio in *file* into *target_language* text.

    Returns the decoded translation string for the single input clip.
    """
    input_values = process_audio_file(file)
    # BUG FIX: the original accepted target_language but never used it, so the
    # dropdown had no effect. The multilingual decoder selects its output
    # language via the forced beginning-of-sequence language token.
    forced_bos = tokenizer.lang_code_to_id[LANG_CODES[target_language]]
    sequences = model.generate(input_values, forced_bos_token_id=forced_bos)
    transcription = tokenizer.batch_decode(sequences, skip_special_tokens=True)
    return transcription[0]


target_languages = list(LANG_CODES)

iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Dropdown(target_languages),
        gr.inputs.Audio(source="microphone", type="filepath"),
    ],
    outputs="text",
)
iface.launch()