# Gradio Space: split uploaded audio on silence and stream MP3 chunks with timestamps.
| import gradio as gr | |
| import numpy as np | |
| from pydub import AudioSegment | |
| from pydub.silence import detect_nonsilent | |
| import io | |
| import csv | |
def numpy_to_audiosegment(audio_array, sampling_rate):
    """Convert a mono NumPy audio array into a pydub AudioSegment.

    Float arrays are peak-normalized into the signed 16-bit range before
    conversion; integer arrays are cast to int16 as-is.

    Args:
        audio_array: NumPy array of samples (assumes mono / 1-D — TODO
            confirm callers never pass a 2-D stereo array, since
            ``channels=1`` is hard-coded below).
        sampling_rate: sample rate in Hz.

    Returns:
        A single-channel, 16-bit AudioSegment.
    """
    if np.issubdtype(audio_array.dtype, np.floating):
        max_val = np.max(np.abs(audio_array))
        # Fix: guard against all-zero (silent) input — dividing by 0 would
        # produce NaNs that corrupt the int16 cast below.
        if max_val > 0:
            audio_array = (audio_array / max_val) * 32767  # scale to 16-bit range
    audio_array = audio_array.astype(np.int16)
    return AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,  # 2 bytes for int16
        channels=1,
    )
| def audiosegment_to_numpy(audio_segment): | |
| """Converts a Pydub AudioSegment back into a NumPy array.""" | |
| samples = np.array(audio_segment.get_array_of_samples()) | |
| return samples | |
def split_audio_on_silence(audio_segment, chunk_length_s, silence_thresh=-40, min_silence_len=500):
    """Split audio into (start_ms, end_ms) chunks bounded by silence.

    Nonsilent regions are detected with pydub's ``detect_nonsilent``; any
    region longer than ``chunk_length_s`` is further cut into fixed-size
    pieces so every returned chunk is at most ``chunk_length_s`` seconds.

    Args:
        audio_segment: pydub AudioSegment to analyze.
        chunk_length_s: maximum chunk length in seconds.
        silence_thresh: silence threshold in dBFS (passed through).
        min_silence_len: minimum silence length in ms (passed through).

    Returns:
        List of (start_ms, end_ms) tuples in chronological order.
    """
    max_length = chunk_length_s * 1000  # pydub works in milliseconds
    nonsilent_ranges = detect_nonsilent(
        audio_segment, min_silence_len=min_silence_len, silence_thresh=silence_thresh
    )
    chunks = []
    for start, end in nonsilent_ranges:
        # Cut over-long nonsilent regions into max_length-sized pieces.
        while end - start > max_length:
            chunks.append((start, start + max_length))
            start += max_length
        # Fix: only append a non-empty remainder. The original appended
        # unconditionally, producing an empty (start, start) chunk whenever
        # a region's length was an exact multiple of max_length. The unused
        # start_time accumulator was also dropped.
        if end > start:
            chunks.append((start, end))
    return chunks
def format_time(milliseconds):
    """Format a duration given in milliseconds as an MM:SS string."""
    total_seconds = int(milliseconds // 1000)
    minutes, secs = divmod(total_seconds, 60)
    return f"{minutes:02}:{secs:02}"
def numpy_to_mp3(audio_array, sampling_rate):
    """Encode a mono NumPy audio array as MP3 bytes at 320 kbps.

    Float input is peak-normalized to the int16 range first, mirroring
    the conversion in ``numpy_to_audiosegment``.

    Args:
        audio_array: NumPy array of samples (mono assumed; channels=1).
        sampling_rate: sample rate in Hz.

    Returns:
        bytes containing the MP3-encoded audio.
    """
    if np.issubdtype(audio_array.dtype, np.floating):
        max_val = np.max(np.abs(audio_array))
        # Fix: guard all-zero input — dividing by 0 would yield NaNs before
        # the int16 cast (same fix as numpy_to_audiosegment).
        if max_val > 0:
            audio_array = (audio_array / max_val) * 32767  # scale to 16-bit range
    audio_array = audio_array.astype(np.int16)
    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )
    # Export through an in-memory buffer; the with-block replaces the
    # original manual close(). NOTE(review): MP3 export requires ffmpeg.
    with io.BytesIO() as mp3_io:
        audio_segment.export(mp3_io, format="mp3", bitrate="320k")
        return mp3_io.getvalue()
def stream(audio, chunk_length_s):
    """Gradio generator: stream silence-split MP3 chunks plus timestamps.

    Args:
        audio: ``(sampling_rate, numpy_array)`` tuple from a numpy-typed
            ``gr.Audio`` input.
        chunk_length_s: maximum chunk length in seconds (slider value).

    Yields:
        ``(mp3_bytes, timestamps)`` where ``timestamps`` is the cumulative
        list of ``("MM:SS", "MM:SS")`` tuples for all chunks so far (the
        same list object, grown on each iteration).

    Side effects:
        After the last chunk, writes every timestamp pair to
        'silence_based_timestamps.csv' and prints them to stdout.
    """
    sampling_rate, array = audio
    audio_segment = numpy_to_audiosegment(array, sampling_rate)
    # Split the audio based on silence
    chunks = split_audio_on_silence(audio_segment, chunk_length_s)
    formatted_timestamps = []
    # Fix: dropped the unused enumerate() index from the original loop.
    for start, end in chunks:
        chunk_segment = audio_segment[start:end]  # pydub slices in ms
        chunk_numpy = audiosegment_to_numpy(chunk_segment)
        chunk_mp3 = numpy_to_mp3(chunk_numpy, sampling_rate)
        formatted_timestamps.append((format_time(start), format_time(end)))
        yield chunk_mp3, formatted_timestamps
    # Persist the full timestamp table once streaming is done.
    with open("silence_based_timestamps.csv", mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Start Time", "End Time"])
        writer.writerows(formatted_timestamps)
    # Fix: was an f-string with no placeholders; output is unchanged.
    print("Timestamps saved to 'silence_based_timestamps.csv'")
    print("Formatted timestamps:")
    for start, end in formatted_timestamps:
        print(f"{start} to {end}")
# Gradio Interface: input column (upload + chunk-size slider + button) on the
# left, streamed MP3 player and timestamp table on the right.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # type="numpy" makes audio_in deliver a (rate, array) tuple to stream().
            audio_in = gr.Audio(value="librispeech.wav", sources=["upload"], type="numpy", label="Input Audio")
            chunk_length = gr.Slider(minimum=10, maximum=30, value=30, step=5, label="Max Chunk Length (s)")
            run_button = gr.Button("Split on Silence")
        with gr.Column():
            # streaming=True lets the generator's successive yields play back-to-back.
            audio_out = gr.Audio(streaming=True, autoplay=True, format="mp3", label="Streamed MP3 Audio")
            timestamps_output = gr.Dataframe(
                headers=["Start Time", "End Time"],
                label="Silence-Based Audio Chunk Timestamps",
                interactive=False
            )
    # Updated function outputs with the silence-based timestamps.
    # stream() is a generator, so each yield updates both outputs live.
    run_button.click(
        fn=stream,
        inputs=[audio_in, chunk_length],
        outputs=[audio_out, timestamps_output]
    )
demo.launch()