NekoMikoReimu committed on
Commit
0ed2cac
•
1 Parent(s): 1622815

Upload folder using huggingface_hub

finetuned_segmentation_model_improved.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d745e069675a76fff20df3875e235b2f4b916a76b207b92ab741dd11a1685deb
+ size 17732680
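
The checkpoint itself lives in Git LFS, so the diff above records only a pointer: the object's SHA-256 and its size (17,732,680 bytes), not the weights. As a minimal sketch (not part of the commit), the file can be fetched with huggingface_hub and checked against those pointer values; the repo_id below is a placeholder, since the commit page does not name the repository.

import hashlib
from huggingface_hub import hf_hub_download

# Values copied from the LFS pointer in this commit.
EXPECTED_SHA256 = "d745e069675a76fff20df3875e235b2f4b916a76b207b92ab741dd11a1685deb"
EXPECTED_SIZE = 17732680  # bytes

# Placeholder repo_id: substitute the repository this commit belongs to.
path = hf_hub_download(
    repo_id="NekoMikoReimu/<repo-name>",
    filename="finetuned_segmentation_model_improved.ckpt",
)

with open(path, "rb") as f:
    data = f.read()
assert len(data) == EXPECTED_SIZE, "size does not match the LFS pointer"
assert hashlib.sha256(data).hexdigest() == EXPECTED_SHA256, "hash does not match the LFS pointer"
print("Checkpoint matches the LFS pointer.")
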
generate_script.bat ADDED
@@ -0,0 +1,41 @@
+ @echo off
+ setlocal enabledelayedexpansion
+
+ if "%~1"=="" (
+     echo Usage: %0 ^<input_file^>
+     exit /b 1
+ )
+
+ set "input_file=%~1"
+ set "file_extension=%~x1"
+ set "file_name=%~n1"
+ set "file_dir=%~dp1"
+
+ REM Extract audio if input is a video file
+ if /i "%file_extension%"==".avi" (
+     echo Extracting audio from video...
+     ffmpeg -i "%input_file%" -q:a 0 "%file_dir%%file_name%.mp3"
+     set "audio_file=%file_dir%%file_name%.mp3"
+ ) else (
+     set "audio_file=%input_file%"
+ )
+
+ REM Run demucs for vocal extraction
+ echo Running Demucs for vocal extraction...
+ demucs -n htdemucs_ft "%audio_file%" --two-stems vocals -o "%file_dir%vocal_extracted" --filename "{track}-{stem}.{ext}"
+
+ REM Process the extracted vocals
+ echo Processing extracted vocals...
+ ffmpeg -i "%file_dir%vocal_extracted\htdemucs_ft\%file_name%-vocals.wav" -ar 16000 -ac 1 -acodec pcm_s16le -af "aresample=resampler=soxr, lowpass=f=7500, acompressor=threshold=-12dB:ratio=2:attack=5:release=50, equalizer=f=1000:width_type=o:width=1:g=2, equalizer=f=3000:width_type=o:width=1:g=3" -b:a 128k "%file_dir%%file_name%_vocals_16k_mono_enhanced.wav"
+
+ REM Run transcription script
+ echo Running transcription...
+ python "%~dp0transcribe_japanese_with_diarization.py" "%file_dir%%file_name%_vocals_16k_mono_enhanced.wav"
+
+ REM Cleanup
+ echo Cleaning up temporary files...
+ if /i "%file_extension%"==".avi" del "%file_dir%%file_name%.mp3"
+ rmdir /s /q "%file_dir%vocal_extracted"
+ del "%file_dir%%file_name%_vocals_16k_mono_enhanced.wav"
+
+ echo Processing complete!
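
generate_script.bat chains four steps: optionally extract audio from an .avi with ffmpeg, separate vocals with Demucs (htdemucs_ft, two-stem mode), downmix and enhance the vocals to 16 kHz mono PCM, then hand the result to the transcription script, cleaning up intermediates at the end. For readers not on Windows, here is a hedged Python sketch of the same chain using subprocess and the exact commands from the batch file; it is not part of the commit and assumes ffmpeg and demucs are on PATH and that transcribe_japanese_with_diarization.py sits in the current directory. Cleanup of intermediates is omitted for brevity.

import subprocess
import sys
from pathlib import Path

def run_pipeline(input_file: str) -> None:
    src = Path(input_file)
    workdir = src.parent
    # Extract audio first if the input is an .avi video (mirrors the batch logic).
    if src.suffix.lower() == ".avi":
        audio = workdir / f"{src.stem}.mp3"
        subprocess.run(["ffmpeg", "-i", str(src), "-q:a", "0", str(audio)], check=True)
    else:
        audio = src
    # Two-stem vocal separation with the fine-tuned hybrid transformer Demucs model.
    out_dir = workdir / "vocal_extracted"
    subprocess.run(
        ["demucs", "-n", "htdemucs_ft", str(audio), "--two-stems", "vocals",
         "-o", str(out_dir), "--filename", "{track}-{stem}.{ext}"],
        check=True,
    )
    vocals = out_dir / "htdemucs_ft" / f"{audio.stem}-vocals.wav"
    enhanced = workdir / f"{src.stem}_vocals_16k_mono_enhanced.wav"
    # Same enhancement chain as the batch file: 16 kHz mono PCM, low-pass, compression, EQ.
    subprocess.run(
        ["ffmpeg", "-i", str(vocals), "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le",
         "-af", "aresample=resampler=soxr, lowpass=f=7500, "
                "acompressor=threshold=-12dB:ratio=2:attack=5:release=50, "
                "equalizer=f=1000:width_type=o:width=1:g=2, "
                "equalizer=f=3000:width_type=o:width=1:g=3",
         "-b:a", "128k", str(enhanced)],
        check=True,
    )
    # Hand off to the diarization-aware transcription script from this commit.
    subprocess.run(
        [sys.executable, "transcribe_japanese_with_diarization.py", str(enhanced)],
        check=True,
    )

if __name__ == "__main__":
    run_pipeline(sys.argv[1])
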
transcribe_japanese_with_diarization.py ADDED
@@ -0,0 +1,94 @@
+ import os
+ import sys
+ import torch
+ import whisper
+ import re
+ from pyannote.audio import Pipeline, Audio, Model
+ from pyannote.audio.pipelines import SpeakerDiarization
+
+ def should_skip_line(s: str) -> bool:
+     parts = s.split(':', 1)
+     if len(parts) > 1 and parts[1].strip() == '':
+         return True
+
+     phrases_to_skip = ["ご視聴ありがとうございました", "by H."]
+     for phrase in phrases_to_skip:
+         if phrase in s:
+             return True
+
+     return False
+
+ def main(audio_file):
+     # Get HuggingFace token from environment variable
+     HF_TOKEN = os.environ.get("HF_TOKEN_NOT_LOGIN")
+     if not HF_TOKEN:
+         print("Error: HF_TOKEN_NOT_LOGIN environment variable is not set.")
+         sys.exit(1)
+
+     # Load pyannote.audio speaker diarization
+     pretrained_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=HF_TOKEN)
+
+     # Generate Finetune Pipeline
+     finetuned_model_path = "finetuned_segmentation_model_improved.ckpt"
+     finetuned_model = Model.from_pretrained(finetuned_model_path)
+     best_segmentation_threshold = 0.6455219392347773
+     best_clustering_threshold = 0.6425210602903073
+
+     finetuned_pipeline = SpeakerDiarization(
+         segmentation=finetuned_model,
+         embedding=r"pyannote/wespeaker-voxceleb-resnet34-LM",
+         clustering=pretrained_pipeline.klustering,
+     )
+
+     finetuned_pipeline.instantiate({
+         "segmentation": {
+             "threshold": best_segmentation_threshold,
+             "min_duration_off": 0.0,
+         },
+         "clustering": {
+             "method": "centroid",
+             "min_cluster_size": 15,
+             "threshold": best_clustering_threshold,
+         },
+     })
+     finetuned_pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+
+     # Apply speaker diarization
+     who_speaks_when = finetuned_pipeline(audio_file)
+
+     # Load OpenAI Whisper automatic speech transcription
+     print("Loading whisper model...")
+     model = whisper.load_model("large-v2", device="cuda" if torch.cuda.is_available() else "cpu")
+     print("Whisper model loaded.")
+
+     # Transcribe audio
+     print("Importing Audio!")
+     audio = Audio(sample_rate=16000, mono=True)
+     transcribed_lines = []
+
+     for segment, _, speaker in who_speaks_when.itertracks(yield_label=True):
+         waveform, sample_rate = audio.crop(audio_file, segment)
+         text = model.transcribe(waveform.squeeze().numpy(), language="Japanese")["text"]
+         timed_line = f"{segment.start:06.1f}s {segment.end:06.1f}s {speaker}: {text}"
+         print(timed_line)
+         transcribed_lines.append(timed_line)
+
+     # Write transcription to file
+     output_file = 'timed_script.txt'
+     with open(output_file, 'w', encoding='UTF-8') as f:
+         for line in transcribed_lines:
+             if not should_skip_line(line):
+                 f.write(line + '\n')
+     print(f"Transcription completed. Output saved to {output_file}")
+
+ if __name__ == "__main__":
+     if len(sys.argv) != 2:
+ print("Usage: python rich_transcription.py <audio_file>")
87
+ sys.exit(1)
88
+
89
+ audio_file = sys.argv[1]
90
+ if not os.path.exists(audio_file):
91
+ print(f"Error: The file '{audio_file}' does not exist.")
92
+ sys.exit(1)
93
+
94
+ main(audio_file)
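
The transcription script can also be run on its own, outside the batch wrapper. A small usage sketch follows (token value and audio file name are placeholders): it expects HF_TOKEN_NOT_LOGIN to hold a Hugging Face token with access to the gated pyannote models, loads finetuned_segmentation_model_improved.ckpt from the working directory, and writes the filtered transcript to timed_script.txt, one "start end speaker: text" line per diarized segment.

import os
import subprocess
import sys

# Placeholder token and file name; the script reads the token from this variable.
os.environ["HF_TOKEN_NOT_LOGIN"] = "hf_xxxxxxxx"
subprocess.run(
    [sys.executable, "transcribe_japanese_with_diarization.py", "episode_vocals_16k_mono_enhanced.wav"],
    check=True,
)

# Each kept line of timed_script.txt is formatted like (speaker labels come from pyannote):
# 0012.3s 0015.8s SPEAKER_00: <transcribed Japanese text>
with open("timed_script.txt", encoding="UTF-8") as f:
    print(f.read())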