frogcho123 committed on
Commit
c89a3ea
1 Parent(s): 19f9d93

Update app.py

Files changed (1)
  1. app.py +17 -12
app.py CHANGED
@@ -4,20 +4,25 @@ import whisper
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from gtts import gTTS
 
-def translate_voice(file, target_lang):
+def translate_voice(file_obj, target_lang):
+    # Save the temporary file to disk
+    temp_file_path = "temp_audio_file.wav"
+    with open(temp_file_path, "wb") as out_file:
+        out_file.write(file_obj.read())
+
     # Load the model and switch to float32
     model = whisper.load_model("base").float()
 
-    # Load the audio
-    audio = whisper.load_audio(file)
+    # Load the audio
+    audio = whisper.load_audio(temp_file_path)
 
-    # Pad or trim the audio
+    # Pad or trim the audio
     audio = whisper.pad_or_trim(audio)
 
-    # Convert the audio to a log Mel spectrogram and move it to the same device as the model (CPU in your case)
+    # Convert the audio to a log Mel spectrogram and move it to the same device as the model (CPU in your case)
     mel = whisper.log_mel_spectrogram(audio).to(model.device).float()  # convert to full-precision float32
 
-    # Proceed with your language detection and decoding
+    # Proceed with your language detection and decoding
     _, probs = model.detect_language(mel)
     options = whisper.DecodingOptions()
     result = whisper.decode(model, mel, options)
@@ -44,14 +49,14 @@ def translate_voice(file, target_lang):
 iface = gr.Interface(
     fn=translate_voice,
     inputs=[
-        gr.components.File(type="file", label="Your Audio"),
-        gr.components.Dropdown(choices=['en', 'ru', 'de', 'fr'], label="Target Language")
+        gr.inputs.File(type="file", label="Your Audio"),
+        gr.inputs.Dropdown(choices=['en', 'ru', 'de', 'fr'], label="Target Language")
     ],
     outputs=[
-        gr.components.Audio(type="filepath", label="Translated Audio"),
-        gr.components.Textbox(label="Original Text"),
-        gr.components.Textbox(label="Translated Text"),
-        gr.components.Textbox(label="Target Language"),
+        gr.outputs.Audio(type="file", label="Translated Audio"),
+        gr.outputs.Textbox(label="Original Text"),
+        gr.outputs.Textbox(label="Translated Text"),
+        gr.outputs.Textbox(label="Target Language"),
     ]
 )
 iface.launch()
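One caveat with the new upload handling: every request is written to the same fixed path, temp_audio_file.wav, so two simultaneous users could overwrite each other's audio. Below is a minimal sketch (not part of this commit) of the same save-then-transcribe step using the standard-library tempfile module to get a unique path per request; transcribe_upload and file_obj are illustrative names, and the Whisper calls mirror the diff above.

import tempfile

import whisper

def transcribe_upload(file_obj):
    # Write the uploaded bytes to a unique temporary file on disk;
    # delete=False keeps the file around for whisper.load_audio below.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(file_obj.read())
        temp_file_path = tmp.name

    # Same pipeline as the commit: load model in float32, preprocess, decode.
    model = whisper.load_model("base").float()
    audio = whisper.load_audio(temp_file_path)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device).float()
    _, probs = model.detect_language(mel)
    result = whisper.decode(model, mel, whisper.DecodingOptions())

    # Return the transcription and the most probable language code.
    return result.text, max(probs, key=probs.get)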
 
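The interface change swaps gr.components for the gr.inputs / gr.outputs namespaces, which belong to the legacy Gradio 2.x API and are deprecated as of Gradio 3.x. A rough equivalent on Gradio 3.x or later uses top-level components instead (a sketch under that assumption, reusing the translate_voice function from the diff):

import gradio as gr

iface = gr.Interface(
    fn=translate_voice,
    inputs=[
        # Top-level components replace the gr.inputs.* / gr.outputs.* namespaces
        gr.File(label="Your Audio"),
        gr.Dropdown(choices=['en', 'ru', 'de', 'fr'], label="Target Language"),
    ],
    outputs=[
        gr.Audio(type="filepath", label="Translated Audio"),
        gr.Textbox(label="Original Text"),
        gr.Textbox(label="Translated Text"),
        gr.Textbox(label="Target Language"),
    ],
)
iface.launch()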