# speech2speech / app.py
# Hugging Face Space by Harveenchadha
# Update app.py
# commit 46f5923
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import gradio as gr
import sox
import numpy as np
import yaml
import tensorflow as tf
from tensorflow_tts.inference import TFAutoModel
from tensorflow_tts.inference import AutoProcessor
import scipy.signal as sps
# TTS components (English, trained on LJSpeech):
# FastSpeech2 turns a text/phoneme-id sequence into a mel spectrogram.
fastspeech2 = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en")
# MB-MelGAN vocoder turns a mel spectrogram into a raw waveform.
mb_melgan = TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-ljspeech-en")
# Text front-end that maps raw text to the id sequence FastSpeech2 expects.
processor_tts = AutoProcessor.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en")
def tts(text):
    """Synthesize English speech for *text* with FastSpeech2 + MB-MelGAN.

    Renders both the pre-postnet and post-postnet mel spectrograms to
    22.05 kHz PCM-16 wav files and returns the path of the post-postnet
    (higher quality) rendering.
    """
    sequence = processor_tts.text_to_sequence(text)
    # FastSpeech2 inference: batch of one, default speaker/speed/pitch/energy.
    mel_before, mel_after, durations, _, _ = fastspeech2.inference(
        input_ids=tf.expand_dims(tf.convert_to_tensor(sequence, dtype=tf.int32), 0),
        speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
        speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    )
    # Vocode each spectrogram and write it to disk.
    for mel, path in ((mel_before, './audio_before.wav'),
                      (mel_after, './audio_after.wav')):
        waveform = mb_melgan.inference(mel)[0, :, 0]
        sf.write(path, waveform, 22050, "PCM_16")
    return './audio_after.wav'
def convert(inputfile, outfile):
    """Transcode *inputfile* to *outfile* as 16 kHz mono signed 16-bit wav.

    Uses pysox; the output format matches what the wav2vec2 ASR model expects.
    """
    transformer = sox.Transformer()
    transformer.set_output_format(
        file_type="wav",
        channels=1,
        encoding="signed-integer",
        rate=16000,
        bits=16,
    )
    transformer.build(inputfile, outfile)
# Machine-translation components: M2M-100 (418M) configured for
# Hindi ('hi') source and English ('en') target.
model_translate = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer_translate = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
# Source / target language codes used by translate() below.
inlang='hi'
outlang='en'
# M2M-100 requires the tokenizer to know the source language up front.
tokenizer_translate.src_lang = inlang
def translate(text):
    """Translate Hindi *text* to English using the module-level M2M-100 model.

    Returns the single decoded translation string.
    """
    batch = tokenizer_translate(text, return_tensors="pt")
    # Forcing the BOS token to the target language id selects English output.
    target_id = tokenizer_translate.get_lang_id(outlang)
    generated = model_translate.generate(**batch, forced_bos_token_id=target_id)
    decoded = tokenizer_translate.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]
# ASR components: Vakyansh wav2vec2 model fine-tuned for Hindi, with a
# CTC head; the processor handles feature extraction and CTC decoding.
processor = Wav2Vec2Processor.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
model = Wav2Vec2ForCTC.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
def read_file(wav):
    """Convert a Gradio microphone capture to a 16 kHz mono signal.

    Parameters
    ----------
    wav : tuple[int, numpy.ndarray]
        ``(sample_rate, signal)`` as produced by
        ``gr.inputs.Audio(source="microphone", type="numpy")``. The signal
        may be mono (1-D) or multi-channel (2-D, samples x channels).

    Returns
    -------
    numpy.ndarray
        1-D float signal resampled to 16 kHz, ready for the ASR processor.
    """
    sample_rate, signal = wav
    signal = np.asarray(signal, dtype=np.float64)
    # Downmix to mono only when a channel axis is present. The original code
    # called .mean(-1) unconditionally, which collapses an already-mono 1-D
    # array to a 0-d scalar and makes len(signal) below raise TypeError.
    if signal.ndim > 1:
        signal = signal.mean(-1)
    # Fourier-method resampling to the 16 kHz rate the wav2vec2 model expects.
    number_of_samples = round(len(signal) * 16000.0 / sample_rate)
    return sps.resample(signal, number_of_samples)
def parse_transcription(wav_file):
    """Full pipeline for one recording: Hindi ASR -> MT -> English TTS.

    Parameters
    ----------
    wav_file : tuple[int, numpy.ndarray]
        Microphone capture from Gradio, ``(sample_rate, signal)``.

    Returns
    -------
    tuple[str, str, str]
        (Hindi transcription, English translation, path to synthesized wav).
    """
    speech = read_file(wav_file)
    # Wav2Vec2 CTC decoding of the 16 kHz signal.
    features = processor(speech, sampling_rate=16_000, return_tensors="pt")
    logits = model(features.input_values).logits
    best_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(best_ids[0], skip_special_tokens=True)
    # Hindi -> English translation, then speech synthesis of the result.
    translation = translate(transcription)
    return transcription, translation, tts(translation)
# Gradio UI wiring. NOTE(review): this uses the legacy pre-3.x gradio API
# (gr.inputs / gr.outputs, show_tips, enable_queue) — confirm the pinned
# gradio version before modernizing.
output1 = gr.outputs.Textbox(label="Hindi Output from ASR")
output2 = gr.outputs.Textbox(label="English Translated Output")
# Microphone capture arrives as (sample_rate, numpy_array) — see read_file().
input_ = gr.inputs.Audio(source="microphone", type="numpy")
output_audio = gr.outputs.Audio(type="file", label="Output Audio")
# One interface: Hindi speech in -> (Hindi text, English text, English speech).
gr.Interface(parse_transcription, inputs = input_, outputs=[output1, output2, output_audio], analytics_enabled=False,
             show_tips=False,
             theme='huggingface',
             layout='vertical',
             title="Vakyansh: Speech To text for Indic Languages",
             description="This is a live demo for Speech to Speech Translation. Speak in Hindi and get output in English", enable_queue=True).launch( inline=False)