import soundfile as sf
import torch
from kotoba_whisper import SpeakerDiarization

# Run the speaker-diarization pipeline on CPU.
pipeline = SpeakerDiarization(device=torch.device("cpu"))

# Load the audio and its sampling rate, then diarize.
audio, sr = sf.read("sample_diarization_japanese.mp3")
output = pipeline(audio.T, sampling_rate=sr)

# Collapse the annotation into {speaker_label: [[start, end], ...]}.
output = {s: [[i.start, i.end] for i in output.label_timeline(s)] for s in output.labels()}
print(output)
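
# A minimal follow-up sketch (not part of the library API): it only reshapes the
# `output` dict built above into a single chronologically sorted list of turns,
# which can be easier to read than the per-speaker mapping.
segments = sorted(
    (start, end, speaker)
    for speaker, spans in output.items()
    for start, end in spans
)
for start, end, speaker in segments:
    print(f"{start:7.2f} - {end:7.2f}  {speaker}")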