Spaces:

kotoba-speech
/

kotoba-whisper-demo

Running on Zero

App Files Files Community

asahi417 committed on Apr 26, 2024

Commit

c962a1e

1 Parent(s): 0f5d4d0

add punctuator and timestamped output

Browse files

Files changed (2) hide show

app.py +54 -18
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -1,20 +1,28 @@
 import re
 import torch
 import gradio as gr
 import yt_dlp as youtube_dl
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
-import tempfile
-import os
 MODEL_NAME = "kotoba-tech/kotoba-whisper-v1.0"
 BATCH_SIZE = 16
 CHUNK_LENGTH_S = 15
 FILE_LIMIT_MB = 1000
 YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files
 if torch.cuda.is_available():
     torch_dtype = torch.bfloat16
     device = "cuda:0"
@@ -24,6 +32,7 @@ else:
     device = "cpu"
     model_kwargs = {}
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
@@ -35,21 +44,52 @@ pipe = pipeline(
 )
-def transcribe(inputs, prompt):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
-    generate_kwargs = {"language": "japanese", "task": "transcribe"}
-    prompt = "。" if not prompt else prompt
-    generate_kwargs['prompt_ids'] = pipe.tokenizer.get_prompt_ids(prompt, return_tensors='pt').to(device)
-    text = pipe(inputs, generate_kwargs=generate_kwargs)['text']
-    # currently the pipeline for ASR appends the prompt at the beginning of the transcription, so remove it
-    return re.sub(rf"\A\s*{prompt}\s*", "", text)
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     return f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe> </center>'
 def download_yt_audio(yt_url, filename):
     info_loader = youtube_dl.YoutubeDL()
     try:
@@ -76,7 +116,7 @@ def download_yt_audio(yt_url, filename):
             raise gr.Error(str(err))
-def yt_transcribe(yt_url, prompt, max_filesize=75.0):
     html_embed_str = _return_yt_html_embed(yt_url)
     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
@@ -85,12 +125,8 @@ def yt_transcribe(yt_url, prompt, max_filesize=75.0):
             inputs = f.read()
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
-    generate_kwargs = {"language": "japanese", "task": "transcribe"}
-    prompt = "。" if not prompt else prompt
-    generate_kwargs['prompt_ids'] = pipe.tokenizer.get_prompt_ids(prompt, return_tensors='pt').to(device)
-    text = pipe(inputs, generate_kwargs=generate_kwargs)['text']
-    # currently the pipeline for ASR appends the prompt at the beginning of the transcription, so remove it
-    return html_embed_str, re.sub(rf"\A\s*{prompt}\s*", "", text)
 demo = gr.Blocks()
@@ -100,7 +136,7 @@ mf_transcribe = gr.Interface(
         gr.inputs.Audio(source="microphone", type="filepath", optional=True),
         gr.inputs.Textbox(lines=1, placeholder="Prompt", optional=True)
     ],
-    outputs="text",
     layout="horizontal",
     theme="huggingface",
     title=f"Transcribe Audio with {os.path.basename(MODEL_NAME)}",
@@ -114,7 +150,7 @@ file_transcribe = gr.Interface(
         gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file"),
         gr.inputs.Textbox(lines=1, placeholder="Prompt", optional=True)
     ],
-    outputs="text",
     layout="horizontal",
     theme="huggingface",
     title=f"Transcribe Audio with {os.path.basename(MODEL_NAME)}",

+import os
+import time
+import tempfile
 import re
+from math import floor
+from typing import Optional
 import torch
 import gradio as gr
 import yt_dlp as youtube_dl
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
+from punctuators.models import PunctCapSegModelONNX
+# configuration
 MODEL_NAME = "kotoba-tech/kotoba-whisper-v1.0"
 BATCH_SIZE = 16
 CHUNK_LENGTH_S = 15
 FILE_LIMIT_MB = 1000
 YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files
+PUNCTUATOR = PunctCapSegModelONNX.from_pretrained("pcs_47lang")
+# device setting
 if torch.cuda.is_available():
     torch_dtype = torch.bfloat16
     device = "cuda:0"
     device = "cpu"
     model_kwargs = {}
+# define the pipeline
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
 )
def format_time(start: Optional[float], end: Optional[float]):
    """Render a chunk's (start, end) timestamps as a bracketed range label.

    Args:
        start: Chunk start offset in seconds, or ``None`` when unknown.
        end: Chunk end offset in seconds, or ``None`` when unknown.

    Returns:
        A string of the form ``"[HH:MM:SS.mmm-> HH:MM:SS.mmm]:"``. A ``None``
        bound is rendered as the fixed-width placeholder ``"complete    "``.
    """
    def _format_time(seconds: Optional[float]):
        # Format one offset; ``None`` maps to a placeholder padded to the
        # same width as a full ``HH:MM:SS.mmm`` stamp.
        if seconds is None:
            return "complete    "
        # Work in whole milliseconds so that rounding can carry cleanly into
        # the seconds/minutes/hours fields instead of producing ".1000".
        total_ms = int(round(seconds * 1000))
        total_seconds, m_seconds = divmod(total_ms, 1000)
        total_minutes, secs = divmod(total_seconds, 60)
        # Minutes must be reduced modulo 60; otherwise offsets of one hour or
        # more yield minutes > 59 and a negative seconds field.
        hours, minutes = divmod(total_minutes, 60)
        return f'{hours:02}:{minutes:02}:{secs:02}.{m_seconds:03}'
    return f"[{_format_time(start)}-> {_format_time(end)}]:"
def get_prediction(inputs, prompt: Optional[str], punctuate_text: bool = True):
    """Transcribe ``inputs`` and return plain and timestamped transcripts.

    Args:
        inputs: Audio input forwarded unchanged to the module-level ``pipe``.
        prompt: Optional text prompt; when truthy it is converted to prompt
            ids with the pipeline tokenizer and moved to ``device``.
        punctuate_text: When true, every chunk's text is passed through
            ``PUNCTUATOR`` before the transcripts are assembled.

    Returns:
        A ``(text, text_timestamped)`` tuple: the chunk texts concatenated
        without a separator, and one ``format_time``-prefixed line per chunk
        joined with newlines.
    """
    decoding_options = {"language": "japanese", "task": "transcribe"}
    if prompt:
        prompt_ids = pipe.tokenizer.get_prompt_ids(prompt, return_tensors='pt')
        decoding_options['prompt_ids'] = prompt_ids.to(device)
    segments = pipe(inputs, return_timestamps=True, generate_kwargs=decoding_options)['chunks']
    if punctuate_text:
        raw_texts = [segment['text'] for segment in segments]
        restored = []
        for segment, pieces in zip(segments, PUNCTUATOR.infer(raw_texts)):
            candidate = "".join(pieces)
            # Keep the untouched transcription whenever the punctuated text
            # contains "unk" (case-insensitive) -- presumably an unknown-token
            # marker emitted by the punctuator; confirm against its docs.
            if 'unk' in candidate.lower():
                candidate = segment['text']
            restored.append({'timestamp': segment['timestamp'], 'text': candidate})
        segments = restored
    plain = "".join(segment['text'] for segment in segments)
    stamped = "\n".join(
        f"{format_time(*segment['timestamp'])} {segment['text']}" for segment in segments
    )
    return plain, stamped
def transcribe(inputs, prompt, punctuate_text: bool = True):
    """Transcribe submitted audio via ``get_prediction``.

    Args:
        inputs: The submitted audio, or ``None`` when nothing was provided.
        prompt: Optional prompt forwarded to ``get_prediction``.
        punctuate_text: Forwarded to ``get_prediction``.

    Returns:
        Whatever ``get_prediction`` returns for the given arguments.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is not None:
        return get_prediction(inputs, prompt, punctuate_text)
    raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     return f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe> </center>'
 def download_yt_audio(yt_url, filename):
     info_loader = youtube_dl.YoutubeDL()
     try:
             raise gr.Error(str(err))
+def yt_transcribe(yt_url, prompt, punctuate_text: bool = True):
     html_embed_str = _return_yt_html_embed(yt_url)
     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
             inputs = f.read()
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+    text, text_timestamped = get_prediction(inputs, prompt, punctuate_text)
+    return html_embed_str, text, text_timestamped
 demo = gr.Blocks()
         gr.inputs.Audio(source="microphone", type="filepath", optional=True),
         gr.inputs.Textbox(lines=1, placeholder="Prompt", optional=True)
     ],
+    outputs=["text", "text"],
     layout="horizontal",
     theme="huggingface",
     title=f"Transcribe Audio with {os.path.basename(MODEL_NAME)}",
         gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file"),
         gr.inputs.Textbox(lines=1, placeholder="Prompt", optional=True)
     ],
+    outputs=["text", "text"],
     layout="horizontal",
     theme="huggingface",
     title=f"Transcribe Audio with {os.path.basename(MODEL_NAME)}",

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 git+https://github.com/huggingface/transformers
 torch
 yt-dlp

 git+https://github.com/huggingface/transformers
 torch
 yt-dlp
+punctuators