StevenChen16 committed
Commit
550cf61
1 Parent(s): d6c72bf

Update app.py

Files changed (1)
  1. app.py +102 -54
app.py CHANGED
@@ -1,66 +1,114 @@
- import spaces
- import whisperx
  import torch
  import gradio as gr
  import tempfile
- import os
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
- batch_size = 4  # reduce if GPU memory is insufficient
- compute_type = "float32"  # change to "int8" if GPU memory is insufficient (may reduce accuracy)
-
- @spaces.GPU
- def transcribe_whisperx(audio_file, task):
-     # Load the WhisperX model
-     model = whisperx.load_model("large-v3", device=device, compute_type=compute_type)
-
-     if audio_file is None:
-         raise gr.Error("Please upload or record an audio file before submitting your request!")
-
-     # Load the audio file
-     audio = whisperx.load_audio(audio_file)
-
-     # Run the initial transcription
-     result = model.transcribe(audio, batch_size=batch_size)
-
-     # Release model resources to avoid running out of GPU memory
-     torch.cuda.empty_cache()
-
-     # Load the alignment model and align the transcription
-     model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-     result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
-
-     # Run speaker diarization
-     hf_token = os.getenv("HF_TOKEN")
-     diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
-     diarize_segments = diarize_model(audio_file)
-     result = whisperx.assign_word_speakers(diarize_segments, result)
-
-     # Format the output text
-     output_text = ""
-     for segment in result["segments"]:
-         speaker = segment.get("speaker", "Unknown")
-         text = segment["text"]
-         output_text += f"{speaker}: {text}\n"
-
-     return output_text
-
- # Gradio interface
  demo = gr.Blocks(theme=gr.themes.Ocean())
-
- transcribe_interface = gr.Interface(
-     fn=transcribe_whisperx,
-     inputs=[
-         gr.Audio(sources=["microphone", "upload"], type="filepath"),
-         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
-     ],
-     outputs="text",
-     title="WhisperX: Transcribe and Diarize Audio",
-     description="Transcribe and diarize audio files or microphone input using WhisperX."
- )
-
  with demo:
-     transcribe_interface
-
- demo.queue().launch(ssr_mode=False)
 
+ import os
  import torch
  import gradio as gr
+ import whisperx
+ from transformers.pipelines.audio_utils import ffmpeg_read
  import tempfile
+ import gc

+ # Constants
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ BATCH_SIZE = 4
+ COMPUTE_TYPE = "float32"
+ FILE_LIMIT_MB = 1000

+ def transcribe_audio(inputs, task):
+     if inputs is None:
+         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+
+     try:
+         # Load audio
+         if isinstance(inputs, str):
+             # For file path input
+             audio = whisperx.load_audio(inputs)
+         else:
+             # For microphone input (Gradio also delivers this as a file path)
+             audio = whisperx.load_audio(inputs)
+
+         # 1. Transcribe with base Whisper model
+         model = whisperx.load_model("large-v3", device=DEVICE, compute_type=COMPUTE_TYPE)
+         result = model.transcribe(audio, batch_size=BATCH_SIZE)
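+         # NOTE: the `task` value ("transcribe" vs "translate") is accepted by this
+         # function but never forwarded to the model, so selecting "translate" has no
+         # effect here; forwarding it would presumably go through whisperx.load_model,
+         # which exposes a task parameter (an assumption about the installed build).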
+
+         # Clear GPU memory
+         del model
+         gc.collect()
+         torch.cuda.empty_cache()
+
+         # 2. Align whisper output
+         model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=DEVICE)
+         result = whisperx.align(result["segments"], model_a, metadata, audio, DEVICE, return_char_alignments=False)
+
+         # Clear GPU memory again
+         del model_a
+         gc.collect()
+         torch.cuda.empty_cache()
+
+         # 3. Diarize audio
+         diarize_model = whisperx.DiarizationPipeline(use_auth_token="YOUR_HF_TOKEN", device=DEVICE)
+         diarize_segments = diarize_model(audio)
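+         # NOTE: "YOUR_HF_TOKEN" is a literal placeholder; the gated pyannote
+         # diarization models require a real token. The previous version read it
+         # from the environment via os.getenv("HF_TOKEN") (see the sketch after
+         # this listing).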
+
+         # 4. Assign speaker labels
+         result = whisperx.assign_word_speakers(diarize_segments, result)
+
+         # Format output
+         output_text = ""
+         for segment in result['segments']:
+             speaker = segment.get('speaker', 'Unknown Speaker')
+             text = segment['text']
+             output_text += f"{speaker}: {text}\n"
+
+         return output_text
+
+     except Exception as e:
+         raise gr.Error(f"Error processing audio: {str(e)}")
+
+     finally:
+         # Final cleanup
+         gc.collect()
+         torch.cuda.empty_cache()
+
+ # Create Gradio interface
  demo = gr.Blocks(theme=gr.themes.Ocean())

  with demo:
+     gr.Markdown("# WhisperX: Advanced Speech Recognition with Speaker Diarization")
+
+     with gr.Row():
+         with gr.Column():
+             audio_input = gr.Audio(
+                 sources=["microphone", "upload"],
+                 type="filepath",
+                 label="Audio Input (Microphone or File Upload)"
+             )
+             task = gr.Radio(
+                 ["transcribe", "translate"],
+                 label="Task",
+                 value="transcribe"
+             )
+             submit_button = gr.Button("Process Audio")
+
+         with gr.Column():
+             output_text = gr.Textbox(
+                 label="Transcription with Speaker Diarization",
+                 lines=10,
+                 placeholder="Transcribed text will appear here..."
+             )
+
+     gr.Markdown("""
+     ### Features:
+     - High-accuracy transcription using WhisperX
+     - Automatic speaker diarization
+     - Support for both microphone recording and file upload
+     - GPU-accelerated processing
+
+     ### Note:
+     Processing may take a few moments depending on the audio length and system resources.
+     """)
+
+     submit_button.click(
+         fn=transcribe_audio,
+         inputs=[audio_input, task],
+         outputs=output_text
+     )

+ demo.queue().launch(share=True)
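
One deployment caveat: the new code passes the literal string "YOUR_HF_TOKEN" to whisperx.DiarizationPipeline, which will not authenticate against the gated pyannote models. A minimal sketch of the environment-based lookup that the removed version of app.py used, assuming an HF_TOKEN secret is configured for the Space:

import os
import whisperx

# Read the Hugging Face access token from the environment instead of
# hardcoding a placeholder, matching the removed version's approach.
hf_token = os.getenv("HF_TOKEN")
diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device="cuda")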