import logging
import os
import tempfile
from io import BytesIO
from typing import Any, Dict, List

import requests
from pydub import AudioSegment
from speechbrain.pretrained import EncoderClassifier


def _remove_files(paths):
    """Delete every path in *paths*, best-effort.

    Used for cleanup, so a file that is already gone (or cannot be removed)
    is logged and skipped instead of masking the error that triggered the
    cleanup.
    """
    for path in paths:
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
        except OSError:
            logging.warning("could not remove temporary file %s", path)


def save_chunks_to_temp_files(url, chunk_length=5000, timeout=30):
    """Split an audio file into fixed-length chunks saved as temporary MP3 files.

    Args:
        url: Location of the audio. A ``file://`` prefix is stripped and the
            remainder is opened as a local path; anything else is downloaded
            with ``requests``.
        chunk_length: Length of each chunk in milliseconds (default 5000,
            i.e. 5 seconds).
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            server cannot block the caller forever.

    Returns:
        A list of paths to the temporary MP3 files, in chunk order. The caller
        owns the files and is responsible for deleting them.

    Raises:
        ValueError: If ``chunk_length`` is not positive, or the downloaded
            resource does not advertise an audio ``Content-Type``.
        requests.RequestException: If the download fails or returns an HTTP
            error status.
    """
    if chunk_length <= 0:
        raise ValueError("chunk_length must be a positive number of milliseconds")

    if not url.startswith("file://"):
        # Download the audio file from the URL.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Ensure the content type is audio; a missing header is treated the
        # same as a non-audio one instead of surfacing as a KeyError.
        content_type = response.headers.get("Content-Type", "")
        if "audio" not in content_type:
            raise ValueError("URL does not seem to be an audio file")

        # Wrap the downloaded bytes in a file-like object for pydub.
        audio_segment = AudioSegment.from_file(BytesIO(response.content))
    else:
        audio_segment = AudioSegment.from_file(url[len("file://"):])

    # Split the audio into consecutive chunks of `chunk_length` milliseconds.
    chunks = [
        audio_segment[start : start + chunk_length]
        for start in range(0, len(audio_segment), chunk_length)
    ]
    if len(chunks) > 1:
        # The trailing chunk is usually shorter than the rest; replace it with
        # the final `chunk_length` ms of the audio so every chunk has full
        # length (it overlaps the previous chunk).
        chunks[-1] = audio_segment[-chunk_length:]

    # Save each chunk to a temporary file and collect the file paths.
    temp_files = []
    try:
        for idx, chunk in enumerate(chunks):
            with tempfile.NamedTemporaryFile(
                delete=False, suffix=f"_chunk{idx}.mp3"
            ) as temp_file:
                temp_path = temp_file.name
            # Record the path before exporting so a failed export is cleaned up.
            temp_files.append(temp_path)
            # Export after our own handle is closed (re-opening an open
            # temporary file by name fails on Windows), and close the handle
            # that export() returns so it is not leaked.
            chunk.export(temp_path, format="mp3").close()
    except Exception:
        # Do not leave partially written chunk files behind.
        _remove_files(temp_files)
        raise

    return temp_files


class EndpointHandler:
    """Inference handler that identifies the spoken language of an audio URL.

    The audio is split into short chunks and each chunk is classified
    separately with the ``speechbrain/lang-id-voxlingua107-ecapa`` model.
    """

    def __init__(self, path=""):
        """Load the language-identification model.

        Args:
            path: Accepted for interface compatibility; not used, the model is
                always loaded from ``speechbrain/lang-id-voxlingua107-ecapa``.
        """
        self.model = EncoderClassifier.from_hparams(
            "speechbrain/lang-id-voxlingua107-ecapa"
        )
        print("model loaded")
        logging.info("model loaded")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Classify the language of each chunk of the requested audio.

        Args:
            data: Request payload. The audio URL is taken (and removed) from
                the ``"inputs"`` key; if the key is absent, ``data`` itself is
                used as the URL value.

        Returns:
            One dict per chunk, in order, with keys ``"prediction"`` (a float
            derived from ``exp()`` of the model's second output) and
            ``"language"`` (first element of the model's fourth output).
        """
        url = data.pop("inputs", data)
        print("audio_url", url)
        logging.info("audio_url %s", url)

        response = []
        temp_filepaths = save_chunks_to_temp_files(url)
        total = len(temp_filepaths)
        try:
            for i, path in enumerate(temp_filepaths):
                logging.info("processing chunk %s / %s", i, total)
                # NOTE(review): per the SpeechBrain model card, the output is
                # presumably (out_prob, score, index, text_lab) with `score`
                # a log-probability — confirm against the installed version.
                output = self.model.classify_file(path)
                response.append(
                    {
                        "prediction": float(output[1].exp()[0]),
                        "language": output[3][0],
                    }
                )
        finally:
            # Always remove the chunk files, including the unprocessed ones
            # when classification fails partway through.
            _remove_files(temp_filepaths)

        return response