whisper-large-v3-srt

Runtime error

App Files Files Community

datxy committed on Aug 27

Commit

caa6c38

verified ·

1 Parent(s): 616414f

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -40

app.py CHANGED Viewed

@@ -2,11 +2,9 @@ import spaces
 import torch
 import gradio as gr
 from transformers import pipeline
-from transformers.pipelines.audio_utils import ffmpeg_read
 import tempfile
 import os
-import time
 from datetime import timedelta
 # ===== 配置 =====
@@ -23,44 +21,103 @@ pipe = pipeline(
     chunk_length_s=30,
     device=device,
     torch_dtype=dtype,
-    return_timestamps=True,
 )
 # ===== 工具函数：时间戳/SRT =====
-def _srt_timestamp(seconds: float | None) -> str:
-    """秒 -> SRT 时间戳 00:00:00,000。None 时用 0."""
-    if seconds is None:
         seconds = 0.0
-    if seconds < 0:
-        seconds = 0.0
-    td = timedelta(seconds=float(seconds))
-    total_ms = int(td.total_seconds() * 1000)
-    hours = total_ms // 3_600_000
-    minutes = (total_ms % 3_600_000) // 60_000
-    secs = (total_ms % 60_000) // 1000
-    ms = total_ms % 1000
-    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
-def chunks_to_srt(chunks: list[dict]) -> str:
-    """将 Whisper 返回的 chunks 转为 SRT 字符串。"""
-    lines = []
-    idx = 1
-    for ch in chunks:
-        ts = ch.get("timestamp") or ch.get("timestamps")
-        text = (ch.get("text") or "").strip()
-        if not text:
             continue
-        if isinstance(ts, (list, tuple)) and len(ts) == 2:
-            start, end = ts
-        else:
-            start, end = 0.0, 2.0
-        start_srt = _srt_timestamp(start)
-        end_srt = _srt_timestamp(end)
-        lines.append(str(idx))
-        lines.append(f"{start_srt} --> {end_srt}")
-        lines.append(text)
         lines.append("")
-        idx += 1
     return "\n".join(lines).strip() + ("\n" if lines else "")
 # ===== 上传音频 -> SRT 导出 =====
@@ -76,20 +133,21 @@ def transcribe_file_to_srt(audio_path: str, task: str):
         pass
     result = pipe(audio_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task})
-    text = result.get("text", "")
     chunks = result.get("chunks") or []
-    srt_str = chunks_to_srt(chunks)
-    if not srt_str and text.strip():
-        srt_str = "1\n00:00:00,000 --> 00:00:02,000\n" + text.strip() + "\n"
     tmpdir = tempfile.mkdtemp(prefix="srt_")
     base = os.path.splitext(os.path.basename(audio_path))[0] or "subtitle"
     srt_path = os.path.join(tmpdir, f"{base}.srt")
     with open(srt_path, "w", encoding="utf-8") as f:
         f.write(srt_str)
-    return text, srt_path
 # ===== Gradio 界面 =====
 demo = gr.Interface(
@@ -99,7 +157,7 @@ demo = gr.Interface(
         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
     ],
     outputs=[
-        gr.Textbox(label="Transcript (preview)"),
         gr.File(label="Download SRT"),
     ],
     title="Upload Audio → SRT Subtitle",

 import torch
 import gradio as gr
 from transformers import pipeline
 import tempfile
 import os
 from datetime import timedelta
 # ===== 配置 =====
     chunk_length_s=30,
     device=device,
     torch_dtype=dtype,
+    return_timestamps="word",   # 关键：逐词时间戳，便于细分
 )
 # ===== 工具函数：时间戳/SRT =====
# Punctuation that ends a sentence; a token ending in one closes the cue.
_SENTENCE_END = ("。", "！", "？", ".", "!", "?")
# Duration given to text whose timing is unknown (same 2 s default that is
# used for chunks arriving without a usable timestamp pair).
_DEFAULT_CUE_SECONDS = 2.0


def _srt_timestamp(seconds) -> str:
    """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Time offset in seconds. ``None``, negative, non-finite or
            non-numeric values are rendered as ``00:00:00,000``.

    Returns:
        The timestamp, rounded to the nearest millisecond.
    """
    try:
        total_ms = int(float(seconds) * 1000 + 0.5)
    except (TypeError, ValueError, OverflowError):
        # None / non-numeric (TypeError, ValueError), NaN (ValueError),
        # infinity (OverflowError): fall back to zero instead of crashing.
        total_ms = 0
    total_ms = max(total_ms, 0)
    hours, total_ms = divmod(total_ms, 3600000)
    minutes, total_ms = divmod(total_ms, 60000)
    secs, millis = divmod(total_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def _to_seconds(value):
    """Return ``value`` as a finite float, or ``None`` if missing/unusable."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    if number != number or abs(number) == float("inf"):
        return None
    return number


def _collect_words(chunks, text_fallback, fallback_dur):
    """Flatten pipeline chunks into a list of ``(token, start, end)`` triples.

    A chunk either carries a nested ``"words"`` list (entries with ``word``
    plus ``start``/``end`` or a ``timestamp`` pair) or is itself one timed
    piece of text (``text`` plus ``timestamp``/``timestamps``). Unknown times
    are stored as ``None``.
    """
    collected = []
    for chunk in chunks or []:
        nested = chunk.get("words") or []
        if nested:
            for item in nested:
                token = (item.get("word") or item.get("text") or "")
                token = token.replace("\n", " ")
                start, end = item.get("start"), item.get("end")
                stamp = item.get("timestamp")
                if (
                    (start is None or end is None)
                    and isinstance(stamp, (list, tuple))
                    and len(stamp) == 2
                ):
                    start, end = stamp
                collected.append((token, _to_seconds(start), _to_seconds(end)))
            continue

        text = chunk.get("text")
        if not text:
            continue
        stamp = chunk.get("timestamp") or chunk.get("timestamps")
        if isinstance(stamp, (list, tuple)) and len(stamp) == 2:
            start, end = stamp
        else:
            start, end = 0.0, _DEFAULT_CUE_SECONDS
        collected.append((text, _to_seconds(start), _to_seconds(end)))

    # No timed pieces at all: emit the whole transcript as a single cue.
    if not collected and text_fallback.strip():
        collected.append((text_fallback.strip(), 0.0, float(fallback_dur)))
    return collected


def _group_words(words, max_seg_dur, max_seg_chars):
    """Group ``(token, start, end)`` triples into ``(start, end, text)`` cues."""
    cues = []
    tokens = []        # raw tokens of the cue being built
    n_chars = 0        # total token length of the cue being built
    cue_start = None   # first known start time in the cue
    latest = None      # latest known time (start or end) seen in the cue
    end_known = False  # whether the most recent token had an end time
    prev_end = 0.0     # end of the previously emitted cue

    def close_cue():
        nonlocal tokens, n_chars, cue_start, latest, end_known, prev_end
        # Join the raw tokens first and only then normalise whitespace:
        # space-delimited languages keep their word gaps (tokens arrive as
        # " Hello", " world"), while CJK tokens stay contiguous.
        text = " ".join("".join(tokens).split())
        if text:
            begin = prev_end if cue_start is None else cue_start
            finish = begin if latest is None else max(latest, begin)
            if not end_known and finish <= begin:
                # The last token has no end time and nothing later is known;
                # give the cue a default length rather than an end of zero.
                finish = begin + _DEFAULT_CUE_SECONDS
            cues.append((begin, finish, text))
            prev_end = finish
        tokens = []
        n_chars = 0
        cue_start = None
        latest = None
        end_known = False

    for token, start, end in words:
        if not token:
            continue
        tokens.append(token)
        n_chars += len(token)
        if cue_start is None:
            cue_start = start
        latest = max(
            (t for t in (latest, start, end) if t is not None), default=None
        )
        end_known = end is not None

        if latest is not None and cue_start is not None:
            elapsed = latest - cue_start
        else:
            elapsed = 0.0
        # The limits are checked after adding the token, so a cue may exceed
        # them by one token; they are approximate targets, not hard caps.
        if (
            token.rstrip().endswith(_SENTENCE_END)
            or elapsed >= max_seg_dur
            or n_chars >= max_seg_chars
        ):
            close_cue()

    close_cue()
    return cues


def chunks_to_srt(chunks, text_fallback: str = "", max_seg_dur: float = 6.0,
                  max_seg_chars: int = 42) -> str:
    """Convert timestamped transcription chunks into SRT subtitle text.

    Word-level timestamps are regrouped into short cues. A cue is closed as
    soon as, after adding a token, any of the following holds:

    - the token ends with sentence punctuation (。！？.!?),
    - the cue spans at least ``max_seg_dur`` seconds,
    - the cue holds at least ``max_seg_chars`` characters.

    Args:
        chunks: Iterable of chunk dicts (or ``None``). Each chunk is either a
            timed piece of text (``text`` + ``timestamp``) or a container with
            a nested ``words`` list.
        text_fallback: Full transcript, used as one cue from 0 to
            ``max_seg_dur`` seconds when ``chunks`` yields no text.
        max_seg_dur: Target maximum cue duration in seconds.
        max_seg_chars: Target maximum cue length in characters.

    Returns:
        The SRT document, ending with a single newline, or ``""`` when there
        is nothing to subtitle.
    """
    words = _collect_words(chunks, text_fallback, max_seg_dur)
    cues = _group_words(words, max_seg_dur, max_seg_chars)

    lines = []
    for index, (begin, finish, text) in enumerate(cues, 1):
        lines.append(str(index))
        lines.append(f"{_srt_timestamp(begin)} --> {_srt_timestamp(finish)}")
        lines.append(text)
        lines.append("")
    return "\n".join(lines).strip() + ("\n" if lines else "")
 # ===== 上传音频 -> SRT 导出 =====
         pass
     result = pipe(audio_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task})
+    text = result.get("text", "") or ""
     chunks = result.get("chunks") or []
+    # 转 SRT（预览即为 SRT）
+    srt_str = chunks_to_srt(chunks, text_fallback=text)
+    # 写入临时文件供下载
     tmpdir = tempfile.mkdtemp(prefix="srt_")
     base = os.path.splitext(os.path.basename(audio_path))[0] or "subtitle"
     srt_path = os.path.join(tmpdir, f"{base}.srt")
     with open(srt_path, "w", encoding="utf-8") as f:
         f.write(srt_str)
+    # 第一个输出显示 SRT 字符串，第二个输出提供下载
+    return srt_str, srt_path
 # ===== Gradio 界面 =====
 demo = gr.Interface(
         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
     ],
     outputs=[
+        gr.Textbox(label="Transcript (SRT Preview)", lines=18),
         gr.File(label="Download SRT"),
     ],
     title="Upload Audio → SRT Subtitle",