kotoba_whisper

Sleeping

aka7774 committed on Apr 30

Commit

7935c8d

•

1 Parent(s): 3faeacc

Upload 3 files

Files changed (3) hide show

app.py CHANGED Viewed

@@ -3,12 +3,26 @@ import gradio as gr
 fn.load_model()
-demo = gr.Interface(
-    fn=fn.speech_to_text,
-    inputs=[
-        gr.Audio(sources="upload", type="filepath"),
-        ],
-    outputs=["text", "text"])
 if __name__ == '__main__':
     demo.launch()

 fn.load_model()
+# Build the Gradio UI: an audio upload, a model-size dropdown, a prompt box,
+# and two text outputs, wired to the functions in `fn`.
+with gr.Blocks() as demo:
+    audio = gr.Audio(sources="upload", type="filepath")
+    # NOTE(review): this value is passed as the second argument of
+    # fn.speech_to_text, whose parameter is named `_model_size` (default None);
+    # whether it actually selects a model is not visible here - confirm.
+    model = gr.Dropdown(value='large-v3', choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"])
+    run_button = gr.Button(value='Run')
+    prompt = gr.Textbox(label='prompt')
+    set_button = gr.Button(value='Set Prompt')
+    text_only = gr.Textbox(label='output')
+    text_with_timestamps = gr.Textbox(label='timestamps')
+    # "Run" calls fn.speech_to_text with the audio and the dropdown value, and
+    # shows its two return values in the output boxes. The prompt textbox is
+    # not an input here.
+    run_button.click(
+        fn=fn.speech_to_text,
+        inputs=[audio, model],
+        outputs=[text_only, text_with_timestamps],
+    )
+    # The prompt text is only handed to fn.set_prompt when "Set Prompt" is
+    # clicked; this handler updates no UI component.
+    set_button.click(
+        fn=fn.set_prompt,
+        inputs=[prompt],
+        outputs=[],
+    )
 if __name__ == '__main__':
     demo.launch()

fn.py CHANGED Viewed

@@ -10,6 +10,7 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 model = None
 pipe = None
 def load_model():
     global model, pipe
@@ -28,14 +29,21 @@ def load_model():
         device=device,
     )
 def speech_to_text(audio_file, _model_size = None):
-    global model, pipe
     if not model:
         load_model()
     # run inference
-    result = pipe(audio_file)
     try:
         res = json.dumps(result)

 model = None
 pipe = None
+initial_prompt = None
 def load_model():
     global model, pipe
         device=device,
     )
+def set_prompt(prompt):
+    """Store ``prompt`` in the module-level ``initial_prompt``.
+
+    ``speech_to_text`` reads ``initial_prompt`` on each call and only builds
+    prompt ids when the stored value is truthy.
+    """
+    global initial_prompt
+    initial_prompt = prompt
 def speech_to_text(audio_file, _model_size = None):
+    global model, pipe, initial_prompt
     if not model:
         load_model()
     # run inference
+    # Extra generation arguments; left empty when no prompt has been stored,
+    # so the pipeline call behaves as it did before prompts were supported.
+    generate_kwargs = {}
+    if initial_prompt:
+        # Tokenize the stored module-level prompt. `prompt` is not a name in
+        # this function's scope; the value set by set_prompt() lives in
+        # `initial_prompt`.
+        generate_kwargs['prompt_ids'] = pipe.tokenizer.get_prompt_ids(initial_prompt, return_tensors="pt").to(device)
+    result = pipe(audio_file, generate_kwargs=generate_kwargs)
     try:
         res = json.dumps(result)

main.py CHANGED Viewed

@@ -40,3 +40,12 @@ async def transcribe_audio(file: UploadFile = Form(...)):
         return {"transcription": text_only, "text_with_timestamps": text_with_timestamps}
     except Exception as e:
         return {"error": str(e)}

         return {"transcription": text_only, "text_with_timestamps": text_with_timestamps}
     except Exception as e:
         return {"error": str(e)}
+@app.post("/set_prompt")
+async def set_prompt(prompt: str):
+    """Handle POST /set_prompt by forwarding ``prompt`` to ``fn.set_prompt``.
+
+    Returns ``{"status": 0}`` on success, or ``{"error": <message>}`` when
+    ``fn.set_prompt`` raises.
+    """
+    try:
+        fn.set_prompt(prompt)
+        return {"status": 0}
+    except Exception as e:
+        # Report the failure in the response body instead of raising.
+        return {"error": str(e)}