# Code Author: Jonathan Whitaker 😎
import numpy as np
import torch
import librosa
import soundfile as sf
from scipy.signal import savgol_filter
from tqdm import tqdm

# The driving audio file
audio_file = './sounds/bensound-cute.wav' #@param

# How many points in the base latent walk loop
n_points = 6 #@param

# Smooths the animation effect: smaller = jerkier; must be odd
filter_window_size = 301 #@param

# How much should we scale position based on music vs the base path?
chr_scale = 0.5 #@param
base_scale = 0.3 #@param

# Load the file and keep only the first half of the track
X, sample_rate = sf.read(audio_file, dtype='float32')
X = X[:int(len(X) * 0.5)]

# Remove percussive elements (just one channel)
harmonic = librosa.effects.harmonic(X[:, 0])

# Get chroma_stft (power in each of the 12 pitch classes over time)
chroma = librosa.feature.chroma_stft(y=harmonic, sr=sample_rate)

# Smooth the chroma curves along the time axis
chroma = savgol_filter(chroma, filter_window_size, 3)

# Calculate how many frames we want
fps = 25
duration = X.shape[0] / sample_rate
print('Duration:', duration)
n_steps = int(fps * duration)
print('N frames:', n_steps, fps * duration)

# Anchor latents for the base loop, plus one latent direction per pitch class
latents = torch.randn(n_points, 256) * base_scale
chroma_latents = torch.randn(12, 256) * chr_scale

frames = []
for i in tqdm(range(n_steps)):
    # Which pair of anchor latents are we between, and how far along?
    p1 = max(0, int(n_points * i / n_steps))
    p2 = min(n_points, int(n_points * i / n_steps) + 1) % n_points  # so it wraps back to 0
    frac = (i - p1 * (n_steps / n_points)) / (n_steps / n_points)
    l = latents[p1] * (1 - frac) + latents[p2] * frac

    # HERE: add the music influence to the latent
    # (note chr_scale is applied twice: once when creating chroma_latents, once here)
    for c in range(12):
        scale_factor = chroma[c, int(i * chroma.shape[1] / n_steps)]
        l += chroma_latents[c] * chr_scale * scale_factor

    # model.G is the pretrained generator set up earlier
    im = model.G(l.unsqueeze(0)).clamp_(0., 1.)
    frame = (im[0].permute(1, 2, 0).detach().cpu().numpy() * 255).astype(np.uint8)
    frames.append(frame)
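# To sanity-check the smoothing, you can plot the filtered chroma curves and see
# how filter_window_size shapes the signals driving the animation (a quick sketch,
# assuming matplotlib is available -- not part of the original cell):
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 3))
for c in range(12):
    plt.plot(chroma[c], alpha=0.6)  # one smoothed energy curve per pitch class
plt.xlabel('STFT frame')
plt.ylabel('Smoothed chroma energy')
plt.show()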
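# The loop above collects frames but never writes them out. A minimal sketch for
# muxing them with the driving audio, assuming the moviepy 1.x API (imports and
# method names differ in moviepy 2.x; the output filename is arbitrary). The
# subclip matches the first-half truncation applied to X above:
from moviepy.editor import ImageSequenceClip, AudioFileClip

video = ImageSequenceClip(frames, fps=fps)
video = video.set_audio(AudioFileClip(audio_file).subclip(0, duration))
video.write_videofile('audio_reactive.mp4', fps=fps)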