frogcho123 committed on
Commit
8d7bec1
1 Parent(s): 400111e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -59
app.py CHANGED
@@ -1,65 +1,68 @@
 
1
  import gradio as gr
2
- from gradio import components
3
  import whisper
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
  from gtts import gTTS
 
 
6
  import soundfile as sf
7
- import os
8
- import numpy as np
9
-
10
- def translate_speech_to_speech(input_tuple):
11
- input_audio, sample_rate = input_tuple
12
-
13
- # Save the input audio to a temporary file
14
- input_file = "input_audio.wav"
15
- sf.write(input_file, input_audio, sample_rate) # use the sample rate from Gradio
16
-
17
-
18
- # Language detection and translation code from the first code snippet
19
- model = whisper.load_model("base")
20
- audio = whisper.load_audio(input_file)
21
- audio = whisper.pad_or_trim(audio)
22
- mel = whisper.log_mel_spectrogram(audio).to(model.device)
23
- _, probs = model.detect_language(mel)
24
-
25
- options = whisper.DecodingOptions()
26
- result = whisper.decode(model, mel, options)
27
-
28
- text = result.text
29
- lang = max(probs, key=probs.get)
30
-
31
- # Translation code from the first code snippet
32
- to_lang = 'ru'
33
- tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
34
- model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
35
-
36
- tokenizer.src_lang = lang
37
- encoded_bg = tokenizer(text, return_tensors="pt")
38
- generated_tokens = model.generate(**encoded_bg)
39
- translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
40
-
41
- # Text-to-speech (TTS) code from the first code snippet
42
- tts = gTTS(text=translated_text, lang=to_lang)
43
- output_file = "translated_speech.wav"
44
- tts.save(output_file)
45
-
46
- # Load the translated audio and return as an output
47
- translated_audio, _ = sf.read(output_file, dtype="int16")
48
-
49
- return translated_audio
50
-
51
- title = "Speech-to-Speech Translator"
52
-
53
- input_audio = gr.inputs.Audio(source="microphone")
54
- output_audio = gr.outputs.Audio(type="numpy")
55
-
56
- stt_demo = gr.Interface(
57
- fn=translate_speech_to_speech,
58
- inputs=input_audio,
59
- outputs=output_audio,
60
- title=title,
61
- description="Speak in any language, and the translator will convert it to speech in the target language.",
62
- )
63
 
64
- if __name__ == "__main__":
65
- stt_demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
  import gradio as gr
 
3
  import whisper
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
  from gtts import gTTS
6
+ import sentencepiece
7
+ import sounddevice as sd
8
  import soundfile as sf
9
+ import tempfile
10
+
11
+
12
+ def translate_voice(audio, target_lang):
13
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
14
+ temp_filename = temp_audio.name
15
+ sf.write(temp_filename, audio, 16000)
16
+
17
+ model = whisper.load_model("base").float()
18
+
19
+ audio = whisper.load_audio(temp_filename)
20
+
21
+ audio = whisper.pad_or_trim(audio)
22
+
23
+ mel = whisper.log_mel_spectrogram(audio).to(model.device).float()
24
+
25
+ _, probs = model.detect_language(mel)
26
+ options = whisper.DecodingOptions(fp16=False)
27
+ result = whisper.decode(model, mel, options)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
+ text = result.text
30
+ lang = max(probs, key=probs.get)
31
+
32
+ tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
33
+ model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
34
+
35
+ tokenizer.src_lang = target_lang
36
+ encoded_bg = tokenizer(text, return_tensors="pt")
37
+ generated_tokens = model.generate(**encoded_bg)
38
+ translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
39
+
40
+ tts = gTTS(text=translated_text, lang=target_lang)
41
+ filename = "to_speech.mp3"
42
+ tts.save(filename)
43
+
44
+ return filename, text, translated_text, target_lang
45
+
46
+
47
+ def record_audio():
48
+ fs = 16000
49
+ duration = 5 # Record audio for 5 seconds, you can adjust the duration as needed
50
+ audio = sd.rec(int(duration * fs), samplerate=fs, channels=1)
51
+ sd.wait()
52
+ return audio.flatten()
53
+
54
+
55
+ iface = gr.Interface(
56
+ fn=translate_voice,
57
+ inputs=[
58
+ gr.inputs.Audio(type="microphone", label="Speak"),
59
+ gr.inputs.Dropdown(choices=['en', 'ru', 'de', 'fr'], label="Target Language")
60
+ ],
61
+ outputs=[
62
+ gr.outputs.Audio(type="filepath", label="Translated Audio"),
63
+ gr.outputs.Textbox(label="Original Text"),
64
+ gr.outputs.Textbox(label="Translated Text"),
65
+ gr.outputs.Textbox(label="Target Language"),
66
+ ]
67
+ )
68
+ iface.launch()