Spaces:
Running
laubonghaudoi committed
Commit • 1d7163f
1 Parent(s): c5d7b1a
Initial commit
Browse files
- .gitignore +8 -0
- app.py +79 -0
- corrector/Corrector.py +62 -0
- corrector/__init__.py +4 -0
- requirements.txt +18 -0
- transcriber/AutoTranscriber.py +145 -0
- transcriber/TranscribeResult.py +15 -0
- transcriber/__init__.py +7 -0
- utils.py +69 -0
.gitignore
ADDED
@@ -0,0 +1,8 @@
+models/*
+!models/denoiser.onnx
+.venv
+__pycache__
+.DS_Store
+*.mp3
+output
+.aider*
app.py
ADDED
@@ -0,0 +1,79 @@
+import logging
+import tempfile
+
+import gradio as gr
+
+from transcriber import AutoTranscriber
+from utils import to_srt
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    force=True,
+)
+logger = logging.getLogger(__name__)
+
+
+def transcribe_audio(audio_path):
+    """Process an audio file and return the SRT file path and preview text."""
+    try:
+        transcriber = AutoTranscriber(
+            corrector="opencc",
+            use_denoiser=False,
+            with_punct=False,
+        )
+
+        transcribe_results = transcriber.transcribe(audio_path)
+
+        if not transcribe_results:
+            # "No subtitles generated; possibly no speech was detected."
+            return None, "無字幕生成, 可能係檢測唔到語音。"
+
+        # Generate SRT text for both preview and download
+        srt_text = to_srt(transcribe_results)
+
+        # Create a temporary file for download
+        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.srt', encoding='utf-8') as tmp:
+            tmp.write(srt_text)
+            return tmp.name, srt_text
+
+    except Exception as e:
+        logger.error(f"Error during transcription: {str(e)}")
+        return None, f"Error: {str(e)}"
+
+
+def create_ui():
+    with gr.Blocks() as demo:
+        gr.Markdown("# 粵文字幕生成器")  # "Cantonese subtitle generator"
+        # "Upload an audio file, click 'Generate subtitles', and you will get
+        # an SRT file shortly. Currently supported formats: ..."
+        gr.Markdown(
+            "上傳一個音頻文件,撳「生成字幕」,過一陣就會得到 SRT 文件。目前支援格式:.mp3、.wav、.flac、.m4a、.ogg、.opus、.webm")
+
+        with gr.Row():
+            audio_input = gr.Audio(type="filepath", label="上傳音頻文件或者錄音")  # "Upload an audio file or record"
+
+        with gr.Row():
+            generate_btn = gr.Button("生成字幕 SRT 文件", variant="primary", scale=2)  # "Generate SRT subtitle file"
+
+        with gr.Row():
+            with gr.Column():
+                preview = gr.Textbox(label="預覽生成字幕", lines=10)  # "Preview generated subtitles"
+
+            with gr.Column():
+                output = gr.File(label="下載 SRT")  # "Download SRT"
+
+        generate_btn.click(
+            fn=transcribe_audio,
+            inputs=[audio_input],
+            outputs=[output, preview],
+        )
+
+    return demo
+
+
+def main():
+    demo = create_ui()
+    demo.launch(server_name="0.0.0.0", server_port=8081)
+
+
+if __name__ == "__main__":
+    main()
corrector/Corrector.py
ADDED
@@ -0,0 +1,62 @@
+import opencc
+from typing import Literal
+import re
+
+
+class Corrector:
+    """
+    The SenseVoice model outputs Simplified Chinese only; this class converts
+    the output to Traditional Chinese and fixes common Cantonese spelling errors.
+    """
+
+    def __init__(self, corrector: Literal["opencc"] = "opencc"):
+        self.corrector = corrector
+        self.converter = None
+        self.bert_model = None
+
+        if corrector == "opencc":
+            self.converter = opencc.OpenCC("s2hk")
+            self.regular_errors: list[tuple[re.Pattern, str]] = [
+                (re.compile(r"俾(?!(?:路支|斯麥|益))"), r"畀"),
+                (re.compile(r"(?<!(?:聯))[系繫](?!(?:統))"), r"係"),
+                (re.compile(r"噶"), r"㗎"),
+                (re.compile(r"咁(?=[我你佢就樣就話係啊呀嘅,。])"), r"噉"),
+                (re.compile(r"(?<![曝晾])曬(?:[衣太衫褲被命嘢相])"), r"晒"),
+                (re.compile(r"(?<=[好])翻(?=[去到嚟])"), r"返"),
+                (re.compile(r"<\|\w+\|>"), r""),  # strip special tokens like <|zh|>
+            ]
+
+    def correct(self, text: str) -> str:
+        """
+        Correct the output text using the configured corrector.
+
+        Args:
+            text: Input text to correct
+
+        Returns:
+            Corrected text string
+        """
+        text = text.strip()
+        if not text:  # Early return for empty string
+            return text
+
+        if self.corrector == "opencc":
+            return self.opencc_correct(text)
+        else:
+            raise ValueError("only the 'opencc' corrector is currently supported")
+
+    def opencc_correct(self, text: str) -> str:
+        """
+        Convert text using OpenCC, then apply the regex spelling fixes.
+
+        Args:
+            text: Input text to convert
+
+        Returns:
+            Converted text string
+        """
+        opencc_text = self.converter.convert(text)
+        for pattern, replacement in self.regular_errors:
+            opencc_text = pattern.sub(replacement, opencc_text)
+
+        return opencc_text
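A quick usage sketch of the corrector (not part of this commit; the output shown in the comment is approximate):

    from corrector import Corrector

    corrector = Corrector("opencc")
    print(corrector.correct("我系学生"))  # roughly "我係學生": s2hk conversion plus the 系→係 regex fix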
corrector/__init__.py
ADDED
@@ -0,0 +1,4 @@
+from .Corrector import Corrector
+
+# Re-export at package level
+__all__ = ['Corrector']
requirements.txt
ADDED
@@ -0,0 +1,18 @@
+OpenCC
+datasets
+flask
+funasr
+funasr_onnx
+librosa
+modelscope
+onnxruntime
+onnxruntime-gpu; sys_platform != 'darwin' and platform_machine != 'arm64' and platform_machine != 'aarch64'
+optimum[onnxruntime]
+psutil
+pysrt
+pytest
+pytubefix
+resampy
+torch
+torchaudio
+transformers[onnx]
transcriber/AutoTranscriber.py
ADDED
@@ -0,0 +1,145 @@
+import logging
+import time
+from typing import List, Literal
+
+import librosa
+import numpy as np
+from funasr import AutoModel
+from resampy.core import resample
+from tqdm.auto import tqdm
+
+from corrector.Corrector import Corrector
+from transcriber.TranscribeResult import TranscribeResult
+
+logger = logging.getLogger(__name__)
+
+
+class AutoTranscriber:
+    """
+    Transcriber class that uses FunASR's AutoModel for VAD and ASR.
+    """
+
+    def __init__(
+        self,
+        corrector: Literal["opencc", "bert", None] = None,
+        use_denoiser=False,
+        with_punct=True,
+        offset_in_seconds=-0.25,
+        sr=16000,
+    ):
+        self.corrector = corrector
+        self.use_denoiser = use_denoiser
+        self.with_punct = with_punct
+        self.sr = sr
+        self.offset_in_seconds = offset_in_seconds
+
+        # Initialize models
+        self.vad_model = AutoModel(model="fsmn-vad")
+        self.asr_model = AutoModel(
+            model="iic/SenseVoiceSmall",
+            vad_model=None,  # We'll handle VAD separately
+            punc_model="ct-punc" if with_punct else None,
+            ban_emo_unks=True,
+        )
+
+    def transcribe(
+        self,
+        audio_file: str,
+    ) -> List[TranscribeResult]:
+        """
+        Transcribe audio file to text with timestamps.
+
+        Args:
+            audio_file (str): Path to audio file
+
+        Returns:
+            List[TranscribeResult]: List of transcription results
+        """
+        # Load and preprocess audio
+        speech, sr = librosa.load(audio_file, sr=self.sr)
+
+        # if self.use_denoiser:
+        #     logger.info("Denoising speech...")
+        #     speech, _ = denoiser(speech, sr)
+
+        if sr != 16_000:
+            speech = resample(speech, sr, 16_000,
+                              filter="kaiser_best", parallel=True)
+
+        # Get VAD segments
+        logger.info("Segmenting speech...")
+
+        start_time = time.time()
+        vad_results = self.vad_model.generate(input=speech)
+        logger.info("VAD took %.2f seconds", time.time() - start_time)
+
+        if not vad_results or not vad_results[0]["value"]:
+            return []
+
+        vad_segments = vad_results[0]["value"]
+
+        # Process each segment
+        results = []
+
+        asr_start = time.time()
+        for segment in tqdm(vad_segments, desc="Transcribing"):
+            start_sample = int(segment[0] * 16)  # Convert ms to samples at 16 kHz
+            end_sample = int(segment[1] * 16)
+            segment_audio = speech[start_sample:end_sample]
+
+            # Get ASR results for segment
+            asr_result = self.asr_model.generate(
+                input=segment_audio, language="yue", use_itn=True
+            )
+
+            if not asr_result:
+                continue
+
+            # Convert ms to seconds and apply the subtitle offset
+            seg_start = max(0, segment[0] / 1000.0 + self.offset_in_seconds)
+            seg_end = segment[1] / 1000.0 + self.offset_in_seconds
+
+            # Convert ASR result to TranscribeResult format
+            segment_result = TranscribeResult(
+                text=asr_result[0]["text"],
+                start_time=seg_start,
+                end_time=seg_end,
+            )
+            results.append(segment_result)
+
+        logger.info("ASR took %.2f seconds", time.time() - asr_start)
+
+        # Apply Chinese conversion if needed
+        start_time = time.time()
+        results = self._convert_to_traditional_chinese(results)
+        logger.info("Conversion took %.2f seconds", time.time() - start_time)
+
+        return results
+
+    def _convert_to_traditional_chinese(
+        self, results: List[TranscribeResult]
+    ) -> List[TranscribeResult]:
+        """Convert simplified Chinese to traditional Chinese"""
+        if not results or not self.corrector:
+            return results
+
+        corrector = Corrector(self.corrector)
+        if self.corrector == "bert":
+            for result in tqdm(
+                results, total=len(results), desc="Converting to Traditional Chinese"
+            ):
+                result.text = corrector.correct(result.text)
+        elif self.corrector == "opencc":
+            # Use a special delimiter that won't appear in Chinese text
+            delimiter = "|||"
+            # Concatenate all texts with delimiter
+            combined_text = delimiter.join(result.text for result in results)
+            # Convert all text at once
+            converted_text = corrector.correct(combined_text)
+            # Split back into individual results
+            converted_parts = converted_text.split(delimiter)
+
+            # Update results with converted text
+            for result, converted in zip(results, converted_parts):
+                result.text = converted
+
+        return results
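A minimal standalone usage sketch (not part of this commit; "audio.mp3" is a placeholder path, and the FunASR models are downloaded on first use):

    from transcriber import AutoTranscriber

    transcriber = AutoTranscriber(corrector="opencc", with_punct=True)
    for line in transcriber.transcribe("audio.mp3"):
        print(line.start_time, line.end_time, line.text)

Note the design choice in _convert_to_traditional_chinese: with "opencc", the segment texts are joined with a "|||" delimiter and converted in a single pass, trading many small OpenCC calls for one large one.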
transcriber/TranscribeResult.py
ADDED
@@ -0,0 +1,15 @@
+class TranscribeResult:
+    """
+    Each TranscribeResult object represents one SRT line.
+    """
+
+    def __init__(self, text: str, start_time: float, end_time: float):
+        self.text = text
+        self.start_time = start_time
+        self.end_time = end_time
+
+    def __str__(self):
+        return f"TranscribeResult(text={self.text}, start_time={self.start_time}, end_time={self.end_time})"
+
+    def __repr__(self):
+        return str(self)
transcriber/__init__.py
ADDED
@@ -0,0 +1,7 @@
+from .AutoTranscriber import AutoTranscriber
+from .TranscribeResult import TranscribeResult
+
+__all__ = [
+    "AutoTranscriber",
+    "TranscribeResult",
+]
utils.py
ADDED
@@ -0,0 +1,69 @@
+import logging
+import os
+import tempfile
+from typing import Iterator, Optional
+
+from pysrt import SubRipFile, SubRipItem, SubRipTime
+from pytubefix import YouTube
+
+from transcriber import TranscribeResult
+
+logger = logging.getLogger(__name__)
+
+
+def download_youtube_audio(video_id: str) -> Optional[str]:
+    """
+    Download audio from a YouTube video.
+
+    Args:
+        video_id (str): YouTube video ID.
+
+    Returns:
+        Optional[str]: Path to the downloaded audio file, or None on failure.
+    """
+    url = f"https://www.youtube.com/watch?v={video_id}"
+
+    try:
+        # https://github.com/JuanBindez/pytubefix/issues/242#issuecomment-2369067929
+        vid = YouTube(url, "MWEB")
+
+        if vid.title is None:
+            return None
+
+        audio_download = vid.streams.get_audio_only()
+        audio_download.download(
+            mp3=True,
+            filename=video_id,
+            output_path=tempfile.gettempdir(),
+            skip_existing=True,
+        )
+        audio_file = os.path.join(tempfile.gettempdir(), f"{video_id}.mp3")
+
+        return audio_file
+
+    except Exception as e:
+        logger.error(e)
+        return None
+
+
+def to_srt(results: Iterator["TranscribeResult"]) -> str:
+    """
+    Convert TranscribeResult objects into SRT-formatted text.
+    """
+    srt = SubRipFile()
+
+    for i, t in enumerate(results):
+        start = SubRipTime(seconds=t.start_time)
+        end = SubRipTime(seconds=t.end_time)
+        item = SubRipItem(index=i, start=start, end=end, text=t.text)
+        srt.append(item)
+
+    # Round-trip through a temp file to get the serialized SRT text
+    temp_file = os.path.join(tempfile.gettempdir(), "output.srt")
+    srt.save(temp_file)
+
+    with open(temp_file, "r", encoding="utf-8") as f:
+        srt_text = f.read()
+
+    os.remove(temp_file)
+
+    return srt_text
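An end-to-end sketch combining these helpers (not part of this commit; "some_video_id" is a placeholder, and the download can return None on failure):

    from transcriber import AutoTranscriber
    from utils import download_youtube_audio, to_srt

    audio_file = download_youtube_audio("some_video_id")
    if audio_file:
        results = AutoTranscriber(corrector="opencc").transcribe(audio_file)
        print(to_srt(results))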