import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import gradio as gr
import sox
import numpy as np
import yaml
import tensorflow as tf
from tensorflow_tts.inference import TFAutoModel
from tensorflow_tts.inference import AutoProcessor
import scipy.signal as sps
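# Assumed dependencies: transformers, tensorflow-tts, soundfile, scipy, and a
# pre-3.x Gradio (this file uses the legacy gr.inputs/gr.outputs API); the sox
# binary is only needed for the optional convert() helper below.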



# initialize the FastSpeech2 text-to-mel model
fastspeech2 = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en")
# initialize the MB-MelGAN vocoder
mb_melgan = TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-ljspeech-en")
# text processor (text -> token id sequence) for FastSpeech2
processor_tts = AutoProcessor.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en")
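# Note: FastSpeech2 and its text processor come from the same checkpoint id;
# weights are fetched from the Hugging Face Hub on first run.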

def tts(text):
    """Synthesize English speech from text with FastSpeech2 + MB-MelGAN."""
    input_ids = processor_tts.text_to_sequence(text)

    # FastSpeech2 inference: token ids -> mel spectrograms (before/after postnet)
    mel_before, mel_after, duration_outputs, _, _ = fastspeech2.inference(
        input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
        speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
        speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    )

    # MB-MelGAN inference: mel spectrogram -> waveform
    audio_before = mb_melgan.inference(mel_before)[0, :, 0]
    audio_after = mb_melgan.inference(mel_after)[0, :, 0]

    # save both variants; the postnet-refined audio is returned to the UI
    sf.write('./audio_before.wav', audio_before, 22050, "PCM_16")
    sf.write('./audio_after.wav', audio_after, 22050, "PCM_16")
    return './audio_after.wav'
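# Usage sketch (hypothetical input): tts("Hello world") synthesizes speech and
# returns './audio_after.wav' at 22050 Hz, the LJSpeech training rate.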


def convert(inputfile, outfile):
    """Convert an audio file to 16 kHz mono 16-bit WAV with sox.

    Currently unused: parse_transcription resamples in memory instead.
    """
    sox_tfm = sox.Transformer()
    sox_tfm.set_output_format(
        file_type="wav", channels=1, encoding="signed-integer", rate=16000, bits=16
    )
    sox_tfm.build(inputfile, outfile)
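# Usage sketch (hypothetical filenames): convert('in.wav', 'in_16k.wav') writes
# a 16 kHz mono 16-bit WAV via sox; the live pipeline resamples in memory instead.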
    
    
# M2M100 many-to-many translation model, configured here for Hindi -> English
model_translate = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer_translate = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
inlang = 'hi'
outlang = 'en'
tokenizer_translate.src_lang = inlang

def translate(text):
    """Translate Hindi text to English with M2M100."""
    encoded_hi = tokenizer_translate(text, return_tensors="pt")
    generated_tokens = model_translate.generate(
        **encoded_hi, forced_bos_token_id=tokenizer_translate.get_lang_id(outlang)
    )
    return tokenizer_translate.batch_decode(generated_tokens, skip_special_tokens=True)[0]
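# Usage sketch: translate() returns the single decoded hypothesis;
# forced_bos_token_id pins M2M100's decoder to the target language ('en').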
    
    
# Vakyansh wav2vec2 CTC model for Hindi ASR (expects 16 kHz mono input)
processor = Wav2Vec2Processor.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
model = Wav2Vec2ForCTC.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")

def read_file(wav):
    """Downmix Gradio's (sample_rate, numpy array) audio to mono at 16 kHz."""
    sample_rate, signal = wav
    if signal.ndim > 1:  # guard: Gradio may deliver 1-D mono or 2-D multi-channel audio
        signal = signal.mean(-1)
    number_of_samples = round(len(signal) * 16000 / sample_rate)
    resampled_signal = sps.resample(signal, number_of_samples)
    return resampled_signal
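# Gradio's type="numpy" microphone input arrives as (sample_rate, numpy array);
# resampling to 16 kHz matches the rate the wav2vec2 checkpoint was trained on.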


def parse_transcription(wav_file):
    """Full pipeline: Hindi speech -> Hindi text -> English text -> English audio."""
    # legacy sox-based conversion, superseded by in-memory resampling:
    # filename = wav_file.name.split('.')[0]
    # convert(wav_file.name, filename + "16k.wav")
    # speech, _ = sf.read(filename + "16k.wav")
    speech = read_file(wav_file)
    input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
    with torch.no_grad():  # inference only; skip gradient tracking
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    translation = translate(transcription)
    return transcription, translation, tts(translation)
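# End-to-end flow: wav2vec2 ASR (Hindi audio -> Hindi text), M2M100 (hi -> en),
# then FastSpeech2 + MB-MelGAN (English text -> English audio).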



  
  
# Gradio UI: microphone in, (Hindi transcript, English translation, English audio) out.
output1 = gr.outputs.Textbox(label="Hindi Output from ASR")
output2 = gr.outputs.Textbox(label="English Translated Output")
output_audio = gr.outputs.Audio(type="file", label="Output Audio")

input_ = gr.inputs.Audio(source="microphone", type="numpy")

gr.Interface(
    parse_transcription,
    inputs=input_,
    outputs=[output1, output2, output_audio],
    analytics_enabled=False,
    show_tips=False,
    theme='huggingface',
    layout='vertical',
    title="Vakyansh: Speech To Text for Indic Languages",
    description="This is a live demo for speech-to-speech translation. Speak in Hindi and get output in English.",
    enable_queue=True,
).launch(inline=False)