import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import gradio as gr
import sox
import numpy as np
import yaml
import tensorflow as tf
from tensorflow_tts.inference import TFAutoModel
from tensorflow_tts.inference import AutoProcessor
import scipy.signal as sps

# initialize fastspeech2 model.
fastspeech2 = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en")

# initialize mb_melgan model.
mb_melgan = TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-ljspeech-en")

# text processor for the tts model.
processor_tts = AutoProcessor.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en")


def tts(text):
    input_ids = processor_tts.text_to_sequence(text)

    # fastspeech inference: text ids -> mel spectrograms
    mel_before, mel_after, duration_outputs, _, _ = fastspeech2.inference(
        input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
        speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
        speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    )

    # melgan inference: mel spectrograms -> waveform
    audio_before = mb_melgan.inference(mel_before)[0, :, 0]
    audio_after = mb_melgan.inference(mel_after)[0, :, 0]

    # save to file at the 22.05 kHz rate the ljspeech models were trained on
    sf.write('./audio_before.wav', audio_before, 22050, "PCM_16")
    sf.write('./audio_after.wav', audio_after, 22050, "PCM_16")
    return './audio_after.wav'


def convert(inputfile, outfile):
    # convert an input file to 16 kHz mono 16-bit wav (kept for file-based input)
    sox_tfm = sox.Transformer()
    sox_tfm.set_output_format(
        file_type="wav", channels=1, encoding="signed-integer", rate=16000, bits=16
    )
    sox_tfm.build(inputfile, outfile)


# translation model: Hindi -> English with M2M100
model_translate = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer_translate = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
inlang = 'hi'
outlang = 'en'
tokenizer_translate.src_lang = inlang


def translate(text):
    encoded_hi = tokenizer_translate(text, return_tensors="pt")
    generated_tokens = model_translate.generate(
        **encoded_hi, forced_bos_token_id=tokenizer_translate.get_lang_id(outlang)
    )
    return tokenizer_translate.batch_decode(generated_tokens, skip_special_tokens=True)[0]


# asr model: Hindi wav2vec2 (Vakyansh)
processor = Wav2Vec2Processor.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
model = Wav2Vec2ForCTC.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")


def read_file(wav):
    # gradio's numpy audio input is a (sample_rate, samples) tuple
    sample_rate, signal = wav
    # average channels to mono; mono microphone recordings are already 1-D
    if signal.ndim > 1:
        signal = signal.mean(-1)
    # resample to the 16 kHz rate expected by wav2vec2
    number_of_samples = round(len(signal) * float(16000) / sample_rate)
    resampled_signal = sps.resample(signal, number_of_samples)
    return resampled_signal


def parse_transcription(wav_file):
    # file-based alternative:
    # filename = wav_file.name.split('.')[0]
    # convert(wav_file.name, filename + "16k.wav")
    # speech, _ = sf.read(filename + "16k.wav")
    speech = read_file(wav_file)

    # asr: audio -> Hindi text
    input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)

    # translation + tts: Hindi text -> English text -> English audio
    translation = translate(transcription)
    return transcription, translation, tts(translation)


output1 = gr.outputs.Textbox(label="Hindi Output from ASR")
output2 = gr.outputs.Textbox(label="English Translated Output")
input_ = gr.inputs.Audio(source="microphone", type="numpy")
output_audio = gr.outputs.Audio(type="file", label="Output Audio")

gr.Interface(
    parse_transcription,
    inputs=input_,
    outputs=[output1, output2, output_audio],
    analytics_enabled=False,
    show_tips=False,
    theme='huggingface',
    layout='vertical',
    title="Vakyansh: Speech to Text for Indic Languages",
    description="This is a live demo for Speech to Speech Translation. Speak in Hindi and get output in English",
    enable_queue=True,
).launch(inline=False)
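
# A minimal, optional smoke test (not part of the original demo): it exercises the
# translation and tts stages directly so the pipeline can be checked without the
# browser UI. The sample sentence and the __main__ guard are assumptions; the block
# is left commented out so it never interferes with the Gradio launch above.
# if __name__ == "__main__":
#     sample_hi = "नमस्ते, आप कैसे हैं?"  # Hindi: "Hello, how are you?"
#     sample_en = translate(sample_hi)
#     print("Translation:", sample_en)
#     print("TTS output written to:", tts(sample_en))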