Spaces:

kotoba-speech
/

kotoba-whisper-demo

Running on Zero

App Files Community

asahi417 committed on Apr 21, 2024

Commit

fc18a2b

verified ·

1 Parent(s): da4f293

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -5

app.py CHANGED Viewed

@@ -34,10 +34,12 @@ pipe = pipeline(
-def transcribe(inputs):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     generate_kwargs = {"language": "japanese", "task": "transcribe"}
     return pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs=generate_kwargs)["text"]
@@ -71,7 +73,7 @@ def download_yt_audio(yt_url, filename):
             raise gr.Error(str(err))
-def yt_transcribe(yt_url, max_filesize=75.0):
     html_embed_str = _return_yt_html_embed(yt_url)
     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
@@ -81,6 +83,8 @@ def yt_transcribe(yt_url, max_filesize=75.0):
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
     generate_kwargs = {"language": "japanese", "task": "transcribe"}
     text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs=generate_kwargs)["text"]
     return html_embed_str, text
@@ -88,7 +92,10 @@ def yt_transcribe(yt_url, max_filesize=75.0):
 demo = gr.Blocks()
 mf_transcribe = gr.Interface(
     fn=transcribe,
-    inputs=[gr.inputs.Audio(source="microphone", type="filepath", optional=True)],
     outputs="text",
     layout="horizontal",
     theme="huggingface",
@@ -99,7 +106,10 @@ mf_transcribe = gr.Interface(
 file_transcribe = gr.Interface(
     fn=transcribe,
-    inputs=[gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file")],
     outputs="text",
     layout="horizontal",
     theme="huggingface",
@@ -109,7 +119,10 @@ file_transcribe = gr.Interface(
 )
 yt_transcribe = gr.Interface(
     fn=yt_transcribe,
-    inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
     outputs=["html", "text"],
     layout="horizontal",
     theme="huggingface",

+def transcribe(inputs, prompt):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     generate_kwargs = {"language": "japanese", "task": "transcribe"}
+    if prompt:
+        generate_kwargs['prompt_ids'] = pipe.tokenizer.get_prompt_ids(prompt, return_tensors='pt').to(device)
     return pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs=generate_kwargs)["text"]
             raise gr.Error(str(err))
+def yt_transcribe(yt_url, prompt, max_filesize=75.0):
     html_embed_str = _return_yt_html_embed(yt_url)
     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
     generate_kwargs = {"language": "japanese", "task": "transcribe"}
+    if prompt:
+        generate_kwargs['prompt_ids'] = pipe.tokenizer.get_prompt_ids(prompt, return_tensors='pt').to(device)
     text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs=generate_kwargs)["text"]
     return html_embed_str, text
 demo = gr.Blocks()
 mf_transcribe = gr.Interface(
     fn=transcribe,
+    inputs=[
+        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
+        gr.inputs.Textbox(lines=1, placeholder="Prompt", value="")
+    ],
     outputs="text",
     layout="horizontal",
     theme="huggingface",
 file_transcribe = gr.Interface(
     fn=transcribe,
+    inputs=[
+        gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file"),
+        gr.inputs.Textbox(lines=1, placeholder="Prompt", value="")
+    ],
     outputs="text",
     layout="horizontal",
     theme="huggingface",
 )
 yt_transcribe = gr.Interface(
     fn=yt_transcribe,
+    inputs=[
+        gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
+        gr.inputs.Textbox(lines=1, placeholder="Prompt", value="")
+    ],
     outputs=["html", "text"],
     layout="horizontal",
     theme="huggingface",