KingNish committed
Commit 4413825
1 Parent(s): 1c0af73

modified: app.py

Files changed (1)
app.py (+22 −7)
app.py CHANGED
@@ -24,7 +24,6 @@ import gradio as gr
 from transformers import TextIteratorStreamer
 from transformers import Idefics2ForConditionalGeneration
 import tempfile
-from streaming_stt_nemo import Model
 from huggingface_hub import InferenceClient
 import edge_tts
 import asyncio
@@ -59,15 +58,25 @@ theme = gr.themes.Base(
     font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
 )
 
-default_lang = "en"
+MODEL_NAME = "openai/whisper-medium"
+BATCH_SIZE = 10
 
-engines = { default_lang: Model(default_lang) }
+device = 0 if torch.cuda.is_available() else "cpu"
 
+pipe = pipeline(
+    task="automatic-speech-recognition",
+    model=MODEL_NAME,
+    chunk_length_s=30,
+    device=device,
+)
+
+@spaces.GPU(queue=False)
 def transcribe(audio):
-    lang = "en"
-    model = engines[lang]
-    text = model.stt_file(audio)[0]
-    return text
+    if audio is None:
+        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+
+    text = pipe(audio, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"})["text"]
+    return text
 
 
 client1 = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
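This hunk swaps the streaming_stt_nemo engine dict for a transformers Whisper pipeline and gates the function on Spaces GPU hardware via @spaces.GPU; it assumes torch, pipeline, and spaces are already imported elsewhere in app.py. A minimal standalone sketch of the same transcription path (model name, chunk length, and generate_kwargs taken from the diff; the audio file path is a placeholder):

```python
import torch
from transformers import pipeline

# Mirrors the commit's configuration: Whisper medium with 30-second
# chunking, on GPU (device=0) when available, otherwise CPU.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-medium",
    chunk_length_s=30,
    device=0 if torch.cuda.is_available() else "cpu",
)

# The pipeline accepts a file path or a raw audio array; "sample.wav" is
# a placeholder. generate_kwargs={"task": "transcribe"} keeps Whisper in
# transcription mode rather than translation.
result = pipe("sample.wav", batch_size=10, generate_kwargs={"task": "transcribe"})
print(result["text"])
```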
 
@@ -489,6 +498,8 @@ with gr.Blocks(
 )
 
 gr.ChatInterface(
+    batch=True,
+    max_batch_size=10,
     fn=model_inference,
     chatbot=chatbot,
     examples=EXAMPLES,
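The batch=True flag changes Gradio's calling convention: up to max_batch_size queued requests are grouped into a single call, with each input component contributing a list of values and each output component expected back as a list of equal length. A toy sketch of that contract (echo_upper is illustrative, not the app's model_inference):

```python
import gradio as gr

# Under batch=True, the function receives one list per input component
# and must return a list of the same length per output component.
def echo_upper(messages: list[str]) -> list[str]:
    return [m.upper() for m in messages]

demo = gr.Interface(
    fn=echo_upper,
    inputs="text",
    outputs="text",
    batch=True,
    max_batch_size=10,
)
# demo.launch()
```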
@@ -512,12 +523,16 @@ with gr.Blocks() as voice:
         autoplay=True,
         elem_classes="audio")
 gr.Interface(
+    batch=True,
+    max_batch_size=10,
     fn=respond,
     inputs=[input],
     outputs=[output], live=True)
 
 with gr.Blocks() as livechat:
     gr.Interface(
+        batch=True,
+        max_batch_size=10,
         fn=videochat,
         inputs=[gr.Image(type="pil",sources="webcam", label="Upload Image"), gr.Textbox(label="Prompt", value="what he is doing")],
         outputs=gr.Textbox(label="Answer")
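The same batch=True / max_batch_size=10 pair is applied uniformly to the chat, voice, and live-video interfaces. For the batching to take effect, the wrapped handlers (model_inference, respond, videochat) would each need to follow the list-in/list-out convention sketched above, since Gradio will start passing them lists of grouped requests.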
 