# Page-scrape residue (not part of the module): Spaces: Running Running
# Diarization_Lib.py | |
######################################### | |
# Diarization Library | |
# This library is used to perform speaker diarization of audio files.
# Diarization runs a locally stored pyannote pipeline; transcription is delegated to speech_to_text.
# | |
#################### | |
#################### | |
# Function List | |
# | |
# 1. load_pipeline_from_pretrained(path_to_config)
# 2. audio_diarization(audio_file_path)
# 3. combine_transcription_and_diarization(audio_file_path)
# | |
#################### | |
# Import necessary libraries | |
import configparser | |
import json | |
import logging | |
import os | |
from pathlib import Path | |
import time | |
# Import Local | |
from App_Function_Libraries.Audio_Transcription_Lib import speech_to_text | |
# | |
# Import 3rd Party | |
from pyannote.audio import Model | |
from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization | |
import torch | |
import yaml | |
# | |
####################################################################################################################### | |
# Function Definitions | |
# | |
def load_pipeline_from_pretrained(path_to_config: str | Path) -> SpeakerDiarization:
    """Build a pyannote speaker-diarization pipeline from a local YAML config.

    The working directory is temporarily switched to the directory holding
    the config file, so relative model paths inside the config are resolved
    against that directory. The original working directory is always
    restored, even when loading fails.

    Args:
        path_to_config: Path to the pipeline's YAML configuration file.

    Returns:
        The instantiated ``SpeakerDiarization`` pipeline.

    Raises:
        FileNotFoundError: If the config file, the embedding model file or
            the segmentation model file does not exist.
        KeyError: If an expected key is missing from the configuration.
    """
    path_to_config = Path(path_to_config).resolve()
    logging.info("Loading pyannote pipeline from %s...", path_to_config)

    if not path_to_config.exists():
        raise FileNotFoundError(f"Config file not found: {path_to_config}")

    # Explicit encoding keeps parsing independent of the platform locale.
    with open(path_to_config, 'r', encoding='utf-8') as config_file:
        config = yaml.safe_load(config_file)

    # Remember where we are so the caller's working directory can be restored.
    cwd = Path.cwd().resolve()
    cd_to = path_to_config.parent.resolve()
    logging.info("Changing working directory to %s", cd_to)
    os.chdir(cd_to)

    try:
        # NOTE(review): the pipeline is constructed with the library defaults
        # before the local models are assigned — confirm this does not try to
        # fetch default checkpoints when running offline.
        pipeline = SpeakerDiarization()

        params = config['pipeline']['params']

        # Resolved while the cwd is the config directory, so relative entries
        # in the YAML point next to the config file.
        embedding_path = Path(params['embedding']).resolve()
        segmentation_path = Path(params['segmentation']).resolve()

        if not embedding_path.exists():
            raise FileNotFoundError(f"Embedding model file not found: {embedding_path}")
        if not segmentation_path.exists():
            raise FileNotFoundError(f"Segmentation model file not found: {segmentation_path}")

        # Load both models from local checkpoints onto the CPU.
        cpu = torch.device('cpu')
        pipeline.embedding = Model.from_pretrained(str(embedding_path), map_location=cpu)
        pipeline.segmentation = Model.from_pretrained(str(segmentation_path), map_location=cpu)

        # Copy the remaining pipeline settings straight from the config.
        pipeline.clustering = params['clustering']
        pipeline.embedding_batch_size = params['embedding_batch_size']
        pipeline.embedding_exclude_overlap = params['embedding_exclude_overlap']
        pipeline.segmentation_batch_size = params['segmentation_batch_size']

        # Apply the hyper-parameters stored under the top-level 'params' key.
        pipeline.instantiate(config['params'])
    finally:
        logging.info("Changing working directory back to %s", cwd)
        os.chdir(cwd)

    return pipeline
def _read_cached_diarization(out_file):
    """Return the segment list stored in a previously written diarization file."""
    with open(out_file, encoding="utf-8") as f:
        cached = json.load(f)
    # Cache files are written as {'segments': [...]}; unwrap them so cached
    # and freshly computed results have the same shape.
    if isinstance(cached, dict):
        return cached['segments']
    return cached


def audio_diarization(audio_file_path):
    """Run speaker diarization on an audio file and cache the result as JSON.

    Two files are written next to the audio file, with the audio extension
    replaced: ``<name>.diarization.json`` (compact) and
    ``<name>.diarization_pretty.json`` (indented). If the compact file already
    exists it is read back and the pipeline is not loaded at all.

    Args:
        audio_file_path: Path to the audio file (``str`` or ``os.PathLike``).

    Returns:
        A list of dicts with the keys ``Time_Start``, ``Time_End`` and
        ``Speaker``, one per speaker turn.

    Raises:
        ValueError: If ``audio_file_path`` is ``None``.
        RuntimeError: If reading the cache, running the pipeline or writing
            the output files fails; the original error is chained as cause.
    """
    if audio_file_path is None:
        raise ValueError("audio-diarization: No audio file provided")

    audio_file_path = os.fspath(audio_file_path)
    logging.info("audio-diarization: Audio file path: %s", audio_file_path)

    # Swap only the final extension. A plain str.replace() would also rewrite
    # matching text earlier in the path and misbehaves when there is no
    # extension at all.
    stem, _ = os.path.splitext(audio_file_path)
    out_file = stem + ".diarization.json"
    prettified_out_file = stem + ".diarization_pretty.json"

    if os.path.exists(out_file):
        logging.info("audio-diarization: Diarization file already exists: %s", out_file)
        try:
            return _read_cached_diarization(out_file)
        except Exception as e:
            logging.error("audio-diarization: Error performing diarization: %s", str(e))
            raise RuntimeError("audio-diarization: Error performing diarization") from e

    # Only pay for loading the models when there is no cached result.
    logging.info('audio-diarization: Loading pyannote pipeline')
    base_dir = Path(__file__).parent.resolve()
    config_path = base_dir / 'models' / 'config.yaml'
    pipeline = load_pipeline_from_pretrained(config_path)

    try:
        logging.info('audio-diarization: Starting diarization...')
        diarization_result = pipeline(audio_file_path)

        segments = []
        for turn, _, speaker in diarization_result.itertracks(yield_label=True):
            chunk = {
                "Time_Start": turn.start,
                "Time_End": turn.end,
                "Speaker": speaker,
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)
        logging.info("audio-diarization: Diarization completed with pyannote")

        output_data = {'segments': segments}

        logging.info("audio-diarization: Saving prettified JSON to %s", prettified_out_file)
        with open(prettified_out_file, 'w', encoding="utf-8") as f:
            json.dump(output_data, f, indent=2)

        logging.info("audio-diarization: Saving JSON to %s", out_file)
        with open(out_file, 'w', encoding="utf-8") as f:
            json.dump(output_data, f)
    except Exception as e:
        logging.error("audio-diarization: Error performing diarization: %s", str(e))
        raise RuntimeError("audio-diarization: Error performing diarization") from e

    return segments
def _speaker_for_segment(transcription_segment, diarization_segments):
    """Pick the speaker label for one transcription segment, or None if no turn overlaps it."""
    start = transcription_segment['Time_Start']
    end = transcription_segment['Time_End']
    best_speaker = None
    best_overlap = 0
    for diarization_segment in diarization_segments:
        turn_start = diarization_segment['Time_Start']
        turn_end = diarization_segment['Time_End']
        # First turn that fully contains the segment wins outright.
        if start >= turn_start and end <= turn_end:
            return diarization_segment['Speaker']
        # Otherwise remember the turn sharing the most time with the segment.
        overlap = min(end, turn_end) - max(start, turn_start)
        if overlap > best_overlap:
            best_speaker = diarization_segment['Speaker']
            best_overlap = overlap
    return best_speaker


def combine_transcription_and_diarization(audio_file_path):
    """Transcribe and diarize an audio file, then attach a speaker to each transcript segment.

    Each transcription segment is labelled with the speaker of the first
    diarization turn that fully contains it. A segment that straddles several
    turns gets the speaker of the turn it overlaps most. A segment that
    overlaps no turn at all is skipped with a warning.

    The result is written next to the audio file, with the audio extension
    replaced, as ``<name>.combined.json`` (compact) and
    ``<name>.combined_pretty.json`` (indented).

    Args:
        audio_file_path: Path to the audio file.

    Returns:
        A list of dicts with the keys ``Time_Start``, ``Time_End``,
        ``Speaker`` and ``Text``.
    """
    logging.info('combine-transcription-and-diarization: Starting transcription and diarization...')

    transcription_result = speech_to_text(audio_file_path)
    diarization_result = audio_diarization(audio_file_path)

    # A cached diarization may arrive wrapped as {'segments': [...]}; unwrap it
    # so the matching below always iterates over segment dicts.
    if isinstance(diarization_result, dict):
        diarization_result = diarization_result.get('segments', [])

    combined_result = []
    for transcription_segment in transcription_result:
        speaker = _speaker_for_segment(transcription_segment, diarization_result)
        if speaker is None:
            logging.warning(
                "combine-transcription-and-diarization: No speaker turn overlaps segment %s-%s; skipping",
                transcription_segment['Time_Start'],
                transcription_segment['Time_End'],
            )
            continue
        combined_result.append({
            "Time_Start": transcription_segment['Time_Start'],
            "Time_End": transcription_segment['Time_End'],
            "Speaker": speaker,
            "Text": transcription_segment['Text'],
        })

    # Swap only the final extension. A plain str.replace() would also rewrite
    # matching text earlier in the path and misbehaves when there is no
    # extension at all.
    stem, _ = os.path.splitext(os.fspath(audio_file_path))
    out_file = stem + ".combined.json"
    prettified_out_file = stem + ".combined_pretty.json"

    logging.info("combine-transcription-and-diarization: Saving prettified JSON to %s", prettified_out_file)
    with open(prettified_out_file, 'w', encoding="utf-8") as f:
        json.dump(combined_result, f, indent=2)

    logging.info("combine-transcription-and-diarization: Saving JSON to %s", out_file)
    with open(out_file, 'w', encoding="utf-8") as f:
        json.dump(combined_result, f)

    return combined_result
# | |
# | |
####################################################################################################################### |