Kartheesh committed on
Commit
d27f30a
1 Parent(s): e872bc1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -0
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import soundfile as sf
3
+ import numpy as np
4
+ import gradio as gr
5
+ from transformers import VitsModel, MBartForConditionalGeneration, AutoTokenizer, pipeline
6
+
7
# Hugging Face Hub checkpoints used by the three pipeline stages. Each
# tokenizer/model pair is loaded from the same identifier.
ASR_CHECKPOINT = "openai/whisper-base.en"
TRANSLATION_CHECKPOINT = "facebook/mbart-large-50-one-to-many-mmt"
TTS_CHECKPOINT = "facebook/mms-tts-hin"

# Stage 1: speech recognition pipeline.
transcriber = pipeline(task="automatic-speech-recognition", model=ASR_CHECKPOINT)

# Stage 2: translation model and its tokenizer (slow tokenizer requested
# explicitly via use_fast=False).
translation_tokenizer = AutoTokenizer.from_pretrained(
    TRANSLATION_CHECKPOINT,
    use_fast=False,
)
translation_model = MBartForConditionalGeneration.from_pretrained(
    TRANSLATION_CHECKPOINT
)

# Stage 3: text-to-speech model and its tokenizer.
tts_tokenizer = AutoTokenizer.from_pretrained(TTS_CHECKPOINT)
tts_model = VitsModel.from_pretrained(TTS_CHECKPOINT)
13
+
14
def process_audio(audio):
    """Transcribe speech, translate it to Hindi, and synthesize Hindi audio.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by a Gradio
            ``Audio`` input with ``type="numpy"``, or ``None`` when the user
            submitted nothing.

    Returns:
        Path of the WAV file holding the synthesized Hindi speech.

    Raises:
        gr.Error: If no audio was provided, nothing could be transcribed, or
            the text-to-speech model failed. The interface has a single audio
            output, so failures are raised (Gradio displays the message)
            instead of being returned as an extra value.
    """
    if audio is None:
        raise gr.Error("No audio provided.")

    sr, y = audio
    y = np.asarray(y, dtype=np.float32)

    # Multi-channel recordings arrive as (samples, channels); the recognizer
    # expects a single channel, so average the channels down to mono.
    if y.ndim > 1:
        y = y.mean(axis=1)

    if y.size == 0:
        raise gr.Error("No audio provided.")

    # Peak-normalize, but leave all-zero (silent) input untouched so we do not
    # divide by zero and feed NaNs to the recognizer.
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak

    # Transcribe the audio
    transcription = transcriber({"sampling_rate": sr, "raw": y})["text"]
    if not transcription.strip():
        raise gr.Error("No speech detected in the audio.")

    # Translate from English to Hindi
    model_inputs = translation_tokenizer(
        transcription, return_tensors="pt", padding=True, truncation=True
    )
    generated_tokens = translation_model.generate(
        **model_inputs,
        forced_bos_token_id=translation_tokenizer.lang_code_to_id["hi_IN"],
    )
    translated_text = translation_tokenizer.batch_decode(
        generated_tokens, skip_special_tokens=True
    )[0]

    # Generate Hindi speech from translated text
    tts_inputs = tts_tokenizer(translated_text, return_tensors="pt")
    try:
        with torch.no_grad():
            tts_output = tts_model(**tts_inputs)
    except RuntimeError as e:
        raise gr.Error(f"Runtime Error: {e}") from e
    waveform = tts_output.waveform.squeeze().cpu().numpy()

    # Save the waveform to an audio file. Use the sampling rate the TTS model
    # was configured with; a mismatched hard-coded rate would make the speech
    # play back at the wrong speed and pitch.
    audio_path = 'output.wav'
    sf.write(audio_path, waveform, tts_model.config.sampling_rate)

    return audio_path
47
+
48
# Create the Gradio interface
demo = gr.Interface(
    fn=process_audio,
    # The description offers both recording and uploading, so enable both
    # input sources rather than the microphone alone.
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy"),
    outputs="audio",
    title="Speech-to-Hindi",
    description="Record your speech or upload an audio file to transcribe, translate to Hindi, and convert to speech.",
)

# Launch the Gradio app
demo.launch(debug=True)