Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

finetuned_segmentation_model_improved.ckpt +3 -0
generate_script.bat +41 -0
transcribe_japanese_with_diarization.py +94 -0

finetuned_segmentation_model_improved.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d745e069675a76fff20df3875e235b2f4b916a76b207b92ab741dd11a1685deb
+size 17732680

generate_script.bat ADDED Viewed

	@@ -0,0 +1,41 @@

@echo off
REM Pipeline driver for a single audio/video file:
REM   1. (.avi only) extract the audio track to MP3 with ffmpeg
REM   2. isolate the vocals with Demucs (htdemucs_ft, two-stems)
REM   3. resample/enhance the vocals to 16 kHz mono PCM with ffmpeg
REM   4. run transcribe_japanese_with_diarization.py (located next to this file)
REM   5. delete the intermediate files created above
REM Exits with 0 on success and 1 if any step fails.
REM
REM Delayed expansion is disabled on purpose: the script never uses !var!
REM syntax, and with it enabled any "!" in the input path would be swallowed.
setlocal disabledelayedexpansion
if "%~1"=="" (
    echo Usage: %0 ^<input_file^>
    exit /b 1
)
if not exist "%~1" (
    echo Error: input file "%~1" not found.
    exit /b 1
)
set "input_file=%~1"
set "file_extension=%~x1"
set "file_name=%~n1"
set "file_dir=%~dp1"
set "vocals_dir=%file_dir%vocal_extracted"
set "enhanced_file=%file_dir%%file_name%_vocals_16k_mono_enhanced.wav"
set "exit_code=0"
REM Extract audio if input is a video file
if /i "%file_extension%"==".avi" (
    echo Extracting audio from video...
    ffmpeg -i "%input_file%" -q:a 0 "%file_dir%%file_name%.mp3"
    if errorlevel 1 (
        echo Error: audio extraction failed.
        set "exit_code=1"
        goto cleanup
    )
    set "audio_file=%file_dir%%file_name%.mp3"
) else (
    set "audio_file=%input_file%"
)
REM Run demucs for vocal extraction
echo Running Demucs for vocal extraction...
demucs -n htdemucs_ft "%audio_file%" --two-stems vocals -o "%vocals_dir%" --filename "{track}-{stem}.{ext}"
if errorlevel 1 (
    echo Error: Demucs vocal extraction failed.
    set "exit_code=1"
    goto cleanup
)
REM Process the extracted vocals
echo Processing extracted vocals...
ffmpeg -i "%vocals_dir%\htdemucs_ft\%file_name%-vocals.wav" -ar 16000 -ac 1 -acodec pcm_s16le -af "aresample=resampler=soxr, lowpass=f=7500, acompressor=threshold=-12dB:ratio=2:attack=5:release=50, equalizer=f=1000:width_type=o:width=1:g=2, equalizer=f=3000:width_type=o:width=1:g=3" -b:a 128k "%enhanced_file%"
if errorlevel 1 (
    echo Error: vocal post-processing failed.
    set "exit_code=1"
    goto cleanup
)
REM Run transcription script
echo Running transcription...
python "%~dp0transcribe_japanese_with_diarization.py" "%enhanced_file%"
if errorlevel 1 (
    echo Error: transcription failed.
    set "exit_code=1"
)
REM Cleanup runs on both the success and the failure path so that partial
REM intermediates are never left behind; "if exist" avoids spurious errors
REM when an earlier step did not get far enough to create the file.
:cleanup
echo Cleaning up temporary files...
if /i "%file_extension%"==".avi" if exist "%file_dir%%file_name%.mp3" del "%file_dir%%file_name%.mp3"
if exist "%vocals_dir%" rmdir /s /q "%vocals_dir%"
if exist "%enhanced_file%" del "%enhanced_file%"
if "%exit_code%"=="0" echo Processing complete!
exit /b %exit_code%

transcribe_japanese_with_diarization.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import os
+import sys
+import torch
+import whisper
+import re
+from pyannote.audio import Pipeline, Audio, Model
+from pyannote.audio.pipelines import SpeakerDiarization
# Phrases that cause a transcript line to be dropped when they appear
# anywhere in it.
_DEFAULT_SKIP_PHRASES = ("ご視聴ありがとうございました", "by H.")


def should_skip_line(s: str, phrases_to_skip=_DEFAULT_SKIP_PHRASES) -> bool:
    """Return True if the transcript line *s* should be left out of the output.

    A line is skipped when either:

    * it contains a ``:`` and nothing but whitespace follows the first
      ``:`` (i.e. a ``"<prefix>: "`` line with no text), or
    * it contains any of *phrases_to_skip* as a substring.

    Args:
        s: One formatted transcript line.
        phrases_to_skip: Iterable of substrings that mark a line as
            unwanted. Defaults to the two phrases this script has always
            filtered.

    Returns:
        True if the line should be skipped, False otherwise.
    """
    # Only the first ':' is a separator; later colons belong to the text.
    _, separator, text = s.partition(":")
    if separator and not text.strip():
        return True
    return any(phrase in s for phrase in phrases_to_skip)
def main(audio_file):
    """Diarize *audio_file*, transcribe every speaker turn, and save the result.

    Steps:
      1. Load the pretrained pyannote diarization pipeline (needs the
         ``HF_TOKEN_NOT_LOGIN`` environment variable).
      2. Build a second pipeline around the fine-tuned segmentation
         checkpoint and run it on the audio.
      3. Transcribe each diarized segment with Whisper ``large-v2``
         (language fixed to Japanese).
      4. Write the lines that pass ``should_skip_line`` to
         ``timed_script.txt`` in the current working directory.

    Args:
        audio_file: Path to the audio file to process.

    Exits the process with status 1 if the token is not set or the
    fine-tuned checkpoint cannot be found.
    """
    # Get HuggingFace token from environment variable
    hf_token = os.environ.get("HF_TOKEN_NOT_LOGIN")
    if not hf_token:
        print("Error: HF_TOKEN_NOT_LOGIN environment variable is not set.")
        sys.exit(1)

    # Locate the fine-tuned checkpoint. The current directory is tried first
    # (original behaviour); if it is not there, fall back to the directory
    # containing this script, so the script also works when launched from
    # another working directory (as the accompanying batch file does).
    checkpoint_name = "finetuned_segmentation_model_improved.ckpt"
    finetuned_model_path = checkpoint_name
    if not os.path.exists(finetuned_model_path):
        script_dir = os.path.dirname(os.path.abspath(__file__))
        finetuned_model_path = os.path.join(script_dir, checkpoint_name)
    if not os.path.exists(finetuned_model_path):
        print(f"Error: The checkpoint '{checkpoint_name}' was not found in the "
              f"current directory or next to this script.")
        sys.exit(1)

    # Load pyannote.audio speaker diarization
    pretrained_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1", use_auth_token=hf_token
    )

    # Generate Finetune Pipeline
    finetuned_model = Model.from_pretrained(finetuned_model_path)
    # NOTE(review): presumably the result of a hyper-parameter search; the
    # values are kept exactly as given.
    best_segmentation_threshold = 0.6455219392347773
    best_clustering_threshold = 0.6425210602903073
    finetuned_pipeline = SpeakerDiarization(
        segmentation=finetuned_model,
        embedding=r"pyannote/wespeaker-voxceleb-resnet34-LM",
        # NOTE(review): `klustering` (with a "k") looks like a typo but appears
        # to be the attribute name pyannote uses on the pretrained pipeline —
        # confirm against the pyannote docs before "correcting" it.
        clustering=pretrained_pipeline.klustering,
    )
    finetuned_pipeline.instantiate({
        "segmentation": {
            "threshold": best_segmentation_threshold,
            "min_duration_off": 0.0,
        },
        "clustering": {
            "method": "centroid",
            "min_cluster_size": 15,
            "threshold": best_clustering_threshold,
        },
    })

    # Pick the device once and use it for both the pipeline and Whisper.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    finetuned_pipeline.to(torch.device(device_name))

    # Apply speaker diarization
    who_speaks_when = finetuned_pipeline(audio_file)

    # Load OpenAI Whisper automatic speech transcription
    print("Loading whisper model...")
    model = whisper.load_model("large-v2", device=device_name)
    print("Whisper model loaded.")

    # Transcribe audio, one diarized segment at a time
    print("Importing Audio!")
    audio = Audio(sample_rate=16000, mono=True)
    transcribed_lines = []
    for segment, _, speaker in who_speaks_when.itertracks(yield_label=True):
        waveform, _sample_rate = audio.crop(audio_file, segment)
        text = model.transcribe(waveform.squeeze().numpy(), language="Japanese")["text"]
        timed_line = f"{segment.start:06.1f}s {segment.end:06.1f}s {speaker}: {text}"
        print(timed_line)
        transcribed_lines.append(timed_line)

    # Write transcription to file, dropping lines rejected by should_skip_line
    # (every line was still echoed to the console above).
    output_file = 'timed_script.txt'
    with open(output_file, 'w', encoding='UTF-8') as f:
        f.writelines(
            line + '\n' for line in transcribed_lines if not should_skip_line(line)
        )
    print(f"Transcription completed. Output saved to {output_file}")
if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the audio file.
    if len(sys.argv) != 2:
        # Take the script name from argv so the usage line always names the
        # file actually being run (it previously named a different script).
        print(f"Usage: python {os.path.basename(sys.argv[0])} <audio_file>")
        sys.exit(1)
    audio_file = sys.argv[1]
    # Fail early with a clear message instead of deep inside the pipeline.
    if not os.path.exists(audio_file):
        print(f"Error: The file '{audio_file}' does not exist.")
        sys.exit(1)
    main(audio_file)