|
import torch
|
|
from speechbrain.inference.interfaces import Pretrained
|
|
import torchaudio
|
|
|
|
|
|
def merge_overlapping_segments(segments):
    """Collapse touching or overlapping same-label segments into one.

    Args:
        segments (list of tuples): (start, end, label) triples, assumed
            ordered by start time.

    Returns:
        list of tuples: Merged segment list; empty list for empty input.
    """
    if not segments:
        return []

    merged = [segments[0]]
    for seg_start, seg_end, seg_label in segments[1:]:
        last_start, last_end, last_label = merged[-1]
        if seg_start <= last_end and seg_label == last_label:
            # Same speaker continues: stretch the open segment to cover both.
            merged[-1] = (last_start, max(last_end, seg_end), last_label)
        else:
            # Either a time gap or a different speaker: keep it separate.
            merged.append((seg_start, seg_end, seg_label))
    return merged
|
|
|
|
|
|
def refine_transitions(aggregated_predictions):
    """Snap near-adjacent segment starts onto the previous segment's end.

    Whenever a segment begins no more than one second after its predecessor
    (as given in the input list) ends, its start is pulled back to that end
    time so small gaps disappear from the timeline.

    Args:
        aggregated_predictions (list of tuples): (start, end, label) triples.

    Returns:
        list of tuples: Same number of segments with adjusted start times.
    """
    if not aggregated_predictions:
        return []

    refined = [aggregated_predictions[0]]
    # Pair each segment with its predecessor from the ORIGINAL list
    # (not the refined one) — matches the index-based original logic.
    for previous, current in zip(aggregated_predictions, aggregated_predictions[1:]):
        prev_end = previous[1]
        start, end, label = current
        adjusted_start = prev_end if start - prev_end <= 1.0 else start
        refined.append((adjusted_start, end, label))
    return refined
|
|
|
|
|
|
def refine_transitions_with_confidence(aggregated_predictions, segment_confidences):
    """Place segment boundaries according to which side is more confident.

    At a label change, the boundary goes to the current segment's start when
    it is the more confident side, otherwise to the previous segment's end.
    Consecutive same-label segments are merged when the later one is more
    confident, and kept separate otherwise.

    Args:
        aggregated_predictions (list of tuples): (start, end, label) triples.
        segment_confidences (list of float): One confidence per input segment;
            index i-1 is compared against index i, mirroring the input order.

    Returns:
        list of tuples: Refined (start, end, label) segments.
    """
    if not aggregated_predictions:
        return []

    refined = [aggregated_predictions[0]]
    for idx in range(1, len(aggregated_predictions)):
        cur_start, cur_end, cur_label = aggregated_predictions[idx]
        # The "previous" segment is the last one already emitted, but its
        # confidence is looked up by input position, as in the original.
        prev_start, prev_end, prev_label = refined[-1]
        current_wins = segment_confidences[idx - 1] < segment_confidences[idx]

        if cur_label == prev_label:
            if current_wins:
                # Later, more confident reading absorbs the previous span.
                refined[-1] = (prev_start, cur_end, cur_label)
            else:
                refined.append((cur_start, cur_end, cur_label))
        else:
            boundary = cur_start if current_wins else prev_end
            refined[-1] = (prev_start, boundary, prev_label)
            refined.append((boundary, cur_end, cur_label))
    return refined
|
|
|
|
|
|
def aggregate_segments_with_overlap(segment_predictions):
    """Aggregate overlapping same-label segments, then merge residual overlaps.

    Args:
        segment_predictions (list of tuples): (start, end, label) triples,
            assumed ordered by start time.

    Returns:
        list of tuples: Aggregated segments; empty list for empty input.
    """
    # BUG FIX: the original indexed segment_predictions[0] unconditionally and
    # raised IndexError on empty input (e.g. audio shorter than one window).
    if not segment_predictions:
        return []

    aggregated_predictions = []
    last_start, last_end, last_label = segment_predictions[0]

    for start, end, label in segment_predictions[1:]:
        if label == last_label and start <= last_end:
            # Same speaker and overlapping/contiguous: extend the open run.
            last_end = max(last_end, end)
        else:
            aggregated_predictions.append((last_start, last_end, last_label))
            last_start, last_end, last_label = start, end, label

    # Flush the final open run.
    aggregated_predictions.append((last_start, last_end, last_label))

    # Second pass collapses any remaining same-label overlaps between runs.
    return merge_overlapping_segments(aggregated_predictions)
|
|
|
|
|
|
class SpeakerCounter(Pretrained):
    """Count speakers in an audio file, built on SpeechBrain's Pretrained.

    Pipeline: load audio -> resample to the model rate -> slice into
    overlapping fixed-length windows -> classify each window (the class index
    is the speaker count) -> aggregate and refine the per-window predictions
    into labeled time spans written to a text file.
    """

    def __init__(self, *args, **kwargs):
        """Initialize via Pretrained and cache the target sample rate.

        Args:
            *args: Forwarded unchanged to Pretrained.
            **kwargs: Forwarded unchanged to Pretrained.
        """
        super().__init__(*args, **kwargs)
        # Rate every input waveform is resampled to before feature extraction.
        self.sample_rate = self.hparams.sample_rate

    # Modules the loaded hparams must provide; accessed through self.mods.
    MODULES_NEEDED = [
        "compute_features",
        "mean_var_norm",
        "embedding_model",
        "classifier",
    ]

    def resample_waveform(self, waveform, orig_sample_rate):
        """Resample a waveform to self.sample_rate when rates differ.

        Args:
            waveform (Tensor): Input waveform, shape (channels, samples).
            orig_sample_rate (int): Sample rate the waveform was loaded at.

        Returns:
            Tensor: The (possibly resampled) waveform.
        """
        if orig_sample_rate != self.sample_rate:
            resample_transform = torchaudio.transforms.Resample(
                orig_freq=orig_sample_rate, new_freq=self.sample_rate
            )
            waveform = resample_transform(waveform)
        return waveform

    def encode_batch(self, wavs, wav_lens=None):
        """Encode a batch of waveforms into speaker embeddings.

        Args:
            wavs (Tensor): Batch of waveforms, (batch, samples) or (samples,).
            wav_lens (Tensor, optional): Relative lengths in [0, 1] used by
                the normalizer; defaults to full length for every item.

        Returns:
            Tensor: Batch of embeddings from the embedding model.
        """
        # Accept a single un-batched waveform for convenience.
        if len(wavs.shape) == 1:
            wavs = wavs.unsqueeze(0)

        if wav_lens is None:
            wav_lens = torch.ones(wavs.shape[0], device=self.device)

        wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)
        wavs = wavs.float()

        feats = self.mods.compute_features(wavs)
        feats = self.mods.mean_var_norm(feats, wav_lens)
        embeddings = self.mods.embedding_model(feats, wav_lens)
        return embeddings

    def create_segments(self, waveform, segment_length, overlap):
        """Slice a waveform into overlapping fixed-length windows.

        Args:
            waveform (Tensor): Input waveform, shape (channels, samples).
            segment_length (float): Window length in seconds.
            overlap (float): Overlap between consecutive windows in seconds.

        Returns:
            tuple: (segments, segment_times) where segments is a list of
            tensors and segment_times is a list of (start, end) in seconds.

        NOTE(review): audio shorter than one window yields zero segments, and
        any tail shorter than a full window is dropped — confirm intended.
        """
        num_samples = waveform.shape[1]
        segment_samples = int(segment_length * self.sample_rate)
        overlap_samples = int(overlap * self.sample_rate)
        step_samples = segment_samples - overlap_samples
        segments = []
        segment_times = []

        for start in range(0, num_samples - segment_samples + 1, step_samples):
            end = start + segment_samples
            segments.append(waveform[:, start:end])
            segment_times.append((start / self.sample_rate, end / self.sample_rate))

        return segments, segment_times

    @staticmethod
    def _mean_confidence(segment, window_predictions, window_confidences):
        """Mean confidence of the raw windows overlapping *segment*.

        Args:
            segment (tuple): (start, end, label) span to score.
            window_predictions (list of tuples): Raw (start, end, label) windows.
            window_confidences (list of float): One score per raw window.

        Returns:
            float: Average overlapping-window confidence, or 0.0 if none overlap.
        """
        start, end, _ = segment
        scores = [
            conf
            for (w_start, w_end, _label), conf in zip(window_predictions, window_confidences)
            if w_start < end and w_end > start
        ]
        return sum(scores) / len(scores) if scores else 0.0

    def classify_file(self, path, segment_length=2.0, overlap=1.47,
                      output_path="sample_segment_predictions.txt"):
        """Classify speaker counts over time for an audio file.

        Args:
            path (str): Path to the audio file.
            segment_length (float): Analysis window length in seconds.
            overlap (float): Overlap between windows in seconds.
            output_path (str): Destination text file for the predictions
                (default preserves the original hard-coded filename).

        Outputs:
            Prints and writes "start-end has N speakers" lines to output_path.
        """
        waveform, orig_sr = torchaudio.load(path)
        waveform = self.resample_waveform(waveform, orig_sr)

        segments, segment_times = self.create_segments(waveform, segment_length, overlap)
        segment_predictions = []
        segment_confidences = []

        for segment, (start_time, end_time) in zip(segments, segment_times):
            rel_length = torch.tensor([1.0])
            emb = self.encode_batch(segment, rel_length)
            out_prob = self.mods.classifier(emb).squeeze(1)
            score, index = torch.max(out_prob, dim=-1)
            # Class index is the predicted speaker count for this window.
            segment_predictions.append((start_time, end_time, index.item()))
            # BUG FIX: the winning score was previously discarded; keep it as
            # this window's confidence for the refinement stage below.
            segment_confidences.append(score.item())

        aggregated_predictions = aggregate_segments_with_overlap(segment_predictions)
        refined_predictions = refine_transitions(aggregated_predictions)
        # BUG FIX: the original passed refined_predictions (a list of tuples)
        # as the *confidences* argument. Build one real confidence per refined
        # segment by averaging the raw window scores that overlap it;
        # refine_transitions preserves segment count, so the lists align 1:1.
        confidences = [
            self._mean_confidence(seg, segment_predictions, segment_confidences)
            for seg in refined_predictions
        ]
        preds = refine_transitions_with_confidence(refined_predictions, confidences)

        with open(output_path, "w") as file:
            for start_time, end_time, prediction in preds:
                speaker_text = "no speech" if str(prediction) == "0" else (
                    "1 speaker" if str(prediction) == "1" else f"{prediction} speakers")
                print(f"{start_time:.2f}-{end_time:.2f} has {speaker_text}")
                file.write(f"{start_time:.2f}-{end_time:.2f} has {speaker_text}\n")

    def forward(self, wavs, wav_lens=None):
        """Run the full file-classification pipeline.

        Args:
            wavs (str): Path to the audio file (name kept for interface
                compatibility with the Pretrained API).
            wav_lens: Unused; retained for signature compatibility.

        Returns:
            Output from classify_file (None; results are written to disk).
        """
        # BUG FIX: the original forwarded wav_lens positionally into
        # classify_file's segment_length parameter, which crashes for any
        # non-None value and disables the default window length for None.
        return self.classify_file(wavs)
|
|
|