apenasissso's picture
process from file and fix short audios
7274bc5
import logging
from speechbrain.pretrained import EncoderClassifier
from typing import Dict, List, Any
import requests
from pydub import AudioSegment
from io import BytesIO
import tempfile
import os
def save_chunks_to_temp_files(url, chunk_length=5000, timeout=30):
    """Split an audio source into fixed-length MP3 chunks saved as temp files.

    Args:
        url: Location of the audio. A value starting with ``file://`` is
            read from the local filesystem (the prefix is stripped and the
            remainder is used as the path); anything else is downloaded
            with an HTTP GET.
        chunk_length: Length of each chunk in milliseconds.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the caller forever. Ignored for ``file://`` input.

    Returns:
        A list of paths to the temporary MP3 files, in chunk order. The
        files are created with ``delete=False``; the caller is responsible
        for removing them.

    Raises:
        ValueError: If ``chunk_length`` is not positive, or if the HTTP
            response does not advertise an audio ``Content-Type``.
        requests.HTTPError: If the download returns an error status.
    """
    if chunk_length <= 0:
        raise ValueError("chunk_length must be a positive number of milliseconds")

    file_scheme = "file://"
    if url.startswith(file_scheme):
        audio_segment = AudioSegment.from_file(url[len(file_scheme):])
    else:
        # Download the audio file from the URL.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Ensure the content type is audio. A missing header is treated the
        # same as a non-audio one instead of surfacing as a KeyError.
        content_type = response.headers.get("Content-Type", "")
        if "audio" not in content_type:
            raise ValueError("URL does not seem to be an audio file")

        # Wrap the downloaded bytes in a file-like object for the decoder.
        audio_segment = AudioSegment.from_file(BytesIO(response.content))

    # Split audio into chunk_length-millisecond pieces.
    chunks = [
        audio_segment[start : start + chunk_length]
        for start in range(0, len(audio_segment), chunk_length)
    ]

    # The final slice is usually shorter than the others. Replace it with
    # the last chunk_length milliseconds of the recording so every chunk
    # has full length (it overlaps the previous chunk).
    if len(chunks) > 1:
        chunks[-1] = audio_segment[-chunk_length:]

    # Save each chunk to a temporary file and collect the file paths.
    temp_files = []
    try:
        for idx, chunk in enumerate(chunks):
            with tempfile.NamedTemporaryFile(
                delete=False, suffix=f"_chunk{idx}.mp3"
            ) as temp_file:
                chunk_path = temp_file.name
            # Record the path before exporting so a failed export is still
            # cleaned up below. The handle is closed first because some
            # platforms refuse to reopen a file that is still open.
            temp_files.append(chunk_path)
            chunk.export(chunk_path, format="mp3")
    except Exception:
        # Do not leave partially written chunk files behind.
        for leftover in temp_files:
            try:
                os.remove(leftover)
            except OSError:
                pass
        raise

    return temp_files
class EndpointHandler:
    """Callable handler that runs spoken-language identification on audio.

    The audio is split into chunks by ``save_chunks_to_temp_files`` and each
    chunk is classified separately with the SpeechBrain
    ``speechbrain/lang-id-voxlingua107-ecapa`` model.
    """

    def __init__(self, path=""):
        """Load the classifier.

        Args:
            path: Accepted for interface compatibility; not used here.
        """
        self.model = EncoderClassifier.from_hparams(
            "speechbrain/lang-id-voxlingua107-ecapa"
        )
        print("model loaded")
        logging.info("model loaded")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Classify the language of every chunk of the requested audio.

        Args:
            data: Request payload. The ``"inputs"`` entry is popped from it
                and used as the audio URL (``http(s)://`` or ``file://``);
                if the key is absent, ``data`` itself is used.

        Returns:
            One dict per chunk, in order, with keys ``"prediction"`` (float)
            and ``"language"``.
        """
        url = data.pop("inputs", data)
        print("audio_url", url)
        logging.info(f"audio_url {url}")

        response = []
        temp_filepaths = save_chunks_to_temp_files(url)
        try:
            for i, path in enumerate(temp_filepaths):
                logging.info(f"processing chunk {i} / {len(temp_filepaths)}")
                output = self.model.classify_file(path)
                response.append(
                    {
                        # NOTE(review): output[1] is presumably the log-score
                        # of the best class, hence the exp() - confirm
                        # against the SpeechBrain API.
                        "prediction": float(output[1].exp()[0]),
                        # NOTE(review): output[3] presumably holds the text
                        # labels - confirm.
                        "language": output[3][0],
                    }
                )
        finally:
            # Remove every chunk file, including when classification fails
            # part-way through, so temp files do not pile up on disk.
            for path in temp_filepaths:
                try:
                    os.remove(path)
                except OSError:
                    logging.warning(f"could not remove temp file {path}")

        return response