cotxetj committed on
Commit
a7a78fa
1 Parent(s): 6af5660

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -8
app.py CHANGED
@@ -11,13 +11,15 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
11
 
12
  def inference(audio):
13
  audio = whisper.load_audio(audio)
 
14
  audio = whisper.pad_or_trim(audio)
15
-
16
  mel = whisper.log_mel_spectrogram(audio).to(model.device)
17
-
18
  _, probs = model.detect_language(mel)
19
-
20
  options = whisper.DecodingOptions(fp16 = False)
 
21
  result = whisper.decode(model, mel, options)
22
 
23
  print(result.text)
@@ -40,9 +42,9 @@ pipe = pipeline("automatic-speech-recognition",
40
  # Define a function to translate an audio, in english here
41
  def translate(audio):
42
  return inference(audio)
43
- outputs = pipe(audio, max_new_tokens=256,
44
- generate_kwargs={"task": "transcribe", "language": "english"})
45
- return outputs["text"]
46
 
47
 
48
  # Define function to generate the waveform output
@@ -62,7 +64,7 @@ def speech_to_speech_translation(audio):
62
  synthesised_speech = synthesise(translated_text)
63
  synthesised_speech = (
64
  synthesised_speech.numpy() * 32767).astype(np.int16)
65
- return (16000, synthesised_speech)
66
 
67
  def predict(transType, language, audio, audio_mic = None):
68
  print("debug1:", audio,"debug2", audio_mic)
@@ -72,7 +74,7 @@ def predict(transType, language, audio, audio_mic = None):
72
  if transType == "Text":
73
  return translate(audio), None
74
  if transType == "Audio":
75
- return "",speech_to_speech_translation(audio)
76
 
77
  # Define the title etc
78
  title = "Swedish STSOT (Speech To Speech Or Text)"
 
11
 
12
  def inference(audio):
13
  audio = whisper.load_audio(audio)
14
+ print("loading finished")
15
  audio = whisper.pad_or_trim(audio)
16
+ print("audio trimed")
17
  mel = whisper.log_mel_spectrogram(audio).to(model.device)
18
+ print("spectro finished")
19
  _, probs = model.detect_language(mel)
20
+ print("lang detected")
21
  options = whisper.DecodingOptions(fp16 = False)
22
+ print("options decoded")
23
  result = whisper.decode(model, mel, options)
24
 
25
  print(result.text)
 
42
  # Define a function to translate an audio, in english here
43
  def translate(audio):
44
  return inference(audio)
45
+ # outputs = pipe(audio, max_new_tokens=256,
46
+ # generate_kwargs={"task": "transcribe", "language": "english"})
47
+ # return outputs["text"]
48
 
49
 
50
  # Define function to generate the waveform output
 
64
  synthesised_speech = synthesise(translated_text)
65
  synthesised_speech = (
66
  synthesised_speech.numpy() * 32767).astype(np.int16)
67
+ return [translated_text, (16000, synthesised_speech)]
68
 
69
  def predict(transType, language, audio, audio_mic = None):
70
  print("debug1:", audio,"debug2", audio_mic)
 
74
  if transType == "Text":
75
  return translate(audio), None
76
  if transType == "Audio":
77
+ return speech_to_speech_translation(audio)
78
 
79
  # Define the title etc
80
  title = "Swedish STSOT (Speech To Speech Or Text)"