ysharma (HF staff) committed
Commit 71471a7
1 Parent(s): 7298d79
Files changed (1)
  1. app.py +25 -3
app.py CHANGED
@@ -13,17 +13,39 @@ model = whisper.load_model("base")
 API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
 HF_TOKEN = os.environ["HF_TOKEN"]
 headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+#en, fr, esp, arb, hn, portu, Indonesian, Vietnamese, Chinese, tamil, telugu, bengali
 
 # Text-to-Speech
 LANGUAGES = list(CoquiTTS.langs.keys())
 coquiTTS = CoquiTTS()
 
 
+def whisper_stt(audio):
+    # load audio and pad/trim it to fit 30 seconds
+    audio = whisper.load_audio(audio)
+    audio = whisper.pad_or_trim(audio)
+
+    # make log-Mel spectrogram and move to the same device as the model
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+    # detect the spoken language
+    _, probs = model.detect_language(mel)
+    print(f"Detected language: {max(probs, key=probs.get)}")
+
+    # decode the audio
+    options = whisper.DecodingOptions()
+    result = whisper.decode(model, mel, options)
+
+    # print the recognized text
+    print(f"transcript is : {result.text}")
+    return result.text
+
 # Processing input Audio
-def fun(audio) :
+def fun_engine(audio) :
+    #text1 = whisper_stt(audio)
     text1 = model.transcribe(audio)["text"]
     text2 = lang_model_response(text1)
-    speech = tts(text, language)
+    speech = tts(text, 'en')
     return text1, text2, speech
 
 def lang_model_response(prompt):
@@ -69,7 +91,7 @@ demo = gr.Interface(fn=tts, inputs=inputs, outputs=outputs)
 demo.launch()
 gr.Interface(
     title = 'Testing Whisper',
-    fn=fun,
+    fn=fun_engine,
     inputs=[
         gr.Audio(source="microphone", type="filepath"), #streaming = True,
         # "state"