# Code Author: Jonathan Whitaker 😎
import numpy as np
import torch
import librosa
import soundfile as sf
from scipy.signal import savgol_filter
from tqdm import tqdm

# The driving audio file
audio_file = './sounds/bensound-cute.wav' #@param

# How many points in the base latent walk loop
n_points = 6 #@param

# Smooths the animation effect: smaller = jerkier; must be odd
filter_window_size = 301 #@param

# How much should we scale position based on music vs the base path?
chr_scale = 0.5 #@param
base_scale = 0.3 #@param

# Load the file and keep only the first half of the track
X, sample_rate = sf.read(audio_file, dtype='float32')
X = X[:int(len(X) * 0.5)]

# Remove percussive elements (just one channel)
harmonic = librosa.effects.harmonic(X[:, 0])

# Get chroma_stft (power in each of the 12 pitch classes over time)
chroma = librosa.feature.chroma_stft(y=harmonic, sr=sample_rate)

# Smooth the chroma curves along the time axis
chroma = savgol_filter(chroma, filter_window_size, 3)

# Calculate how many frames we want
fps = 25
duration = X.shape[0] / sample_rate
print('Duration:', duration)
n_steps = int(fps * duration)
print('N frames:', n_steps, fps * duration)

# Anchor latents for the base loop, plus one latent direction per pitch class
latents = torch.randn(n_points, 256) * base_scale
chroma_latents = torch.randn(12, 256) * chr_scale

frames = []
for i in tqdm(range(n_steps)):
    # Which pair of anchor latents are we between, and how far along?
    p1 = max(0, int(n_points * i / n_steps))
    p2 = min(n_points, int(n_points * i / n_steps) + 1) % n_points  # so it wraps back to 0
    frac = (i - p1 * (n_steps / n_points)) / (n_steps / n_points)
    l = latents[p1] * (1 - frac) + latents[p2] * frac

    # HERE: add the music influence to the latent
    # (note chr_scale is applied twice: once when creating chroma_latents, once here)
    for c in range(12):
        scale_factor = chroma[c, int(i * chroma.shape[1] / n_steps)]
        l += chroma_latents[c] * chr_scale * scale_factor

    # model.G is the pretrained generator set up earlier
    im = model.G(l.unsqueeze(0)).clamp_(0., 1.)
    frame = (im[0].permute(1, 2, 0).detach().cpu().numpy() * 255).astype(np.uint8)
    frames.append(frame)
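# To sanity-check the smoothing, you can plot the filtered chroma curves and see
# how filter_window_size shapes the signals driving the animation (a quick sketch,
# assuming matplotlib is available -- not part of the original cell):
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 3))
for c in range(12):
    plt.plot(chroma[c], alpha=0.6)  # one smoothed energy curve per pitch class
plt.xlabel('STFT frame')
plt.ylabel('Smoothed chroma energy')
plt.show()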
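# The loop above collects frames but never writes them out. A minimal sketch for
# muxing them with the driving audio, assuming the moviepy 1.x API (imports and
# method names differ in moviepy 2.x; the output filename is arbitrary). The
# subclip matches the first-half truncation applied to X above:
from moviepy.editor import ImageSequenceClip, AudioFileClip

video = ImageSequenceClip(frames, fps=fps)
video = video.set_audio(AudioFileClip(audio_file).subclip(0, duration))
video.write_videofile('audio_reactive.mp4', fps=fps)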