import moviepy.editor as mp
import librosa
import numpy as np
import gradio as gr

# Audio sample rate used for both extraction and sample-to-second conversion.
SAMPLE_RATE = 22000


def buffer_n_merge(intervals, buffer=0.1):
    """Pad each [start, end] interval by `buffer` seconds and merge any overlaps."""
    if not intervals:
        return []
    new_intervals = [[intervals[0][0] - buffer, intervals[0][1] + buffer]]
    for start, end in intervals[1:]:
        start -= buffer
        end += buffer
        if new_intervals[-1][1] >= start:
            # Overlaps the previous padded interval: extend it instead of appending.
            new_intervals[-1][1] = max(new_intervals[-1][1], end)
        else:
            new_intervals.append([start, end])
    return new_intervals


def download_and_process_video(in_f, threshold_db, buffer_sec):
    vidpath = in_f.name

    # Load the video.
    video = mp.VideoFileClip(vidpath)

    # Extract the audio track as a (samples, channels) array at SAMPLE_RATE Hz.
    audio = video.audio.to_soundarray(fps=SAMPLE_RATE)

    # Use librosa to find non-silent intervals on the first channel.
    non_silent_intervals = librosa.effects.split(audio[:, 0], top_db=threshold_db)

    # Convert the intervals from sample indices to seconds, since librosa works in samples.
    non_silent_intervals_sec = np.array(non_silent_intervals) / SAMPLE_RATE

    # Pad each interval and merge any that now overlap.
    non_silent_intervals_sec = buffer_n_merge(non_silent_intervals_sec.tolist(), buffer=buffer_sec)

    # Cut the video into one subclip per non-silent interval, clamped to the video bounds.
    clips = [
        video.subclip(max(0, start_time), min(end_time, video.duration))
        for start_time, end_time in non_silent_intervals_sec
    ]

    # Concatenate the subclips and write the result.
    output_file = 'my_concatenation.mp4'
    final_clip = mp.concatenate_videoclips(clips)
    final_clip.write_videofile(
        output_file,
        codec='libx264',
        audio_codec='aac',
        temp_audiofile='temp-audio.m4a',
        remove_temp=True,
    )
    return output_file


iface = gr.Interface(
    fn=download_and_process_video,
    inputs=[
        gr.inputs.File(label="Video File (.mp4 only)", file_count='single', type='file'),
        gr.inputs.Slider(minimum=1, maximum=70, step=1, default=30, label="Threshold (dB)"),
        gr.inputs.Slider(minimum=0, maximum=2, step=0.01, default=0.1, label="Buffer (sec)"),
    ],
    outputs=gr.outputs.Video(label="Processed Video"),
    title="Video Silence Remover",
)

iface.launch()