import random

import librosa
import moviepy.editor as mpy
import numpy as np
import torch
from tqdm import tqdm

import dnnlib
import legacy

target_sr = 22050

def visualize(audio_file,
              network,
              truncation,
              tempo_sensitivity,
              jitter,
              frame_length,
              duration,
              ):
    if audio_file:
        print(f'\nReading audio: {audio_file}\n')
        # Resample to target_sr explicitly so the hop-length arithmetic below
        # matches the sample rate used for the mel spectrogram and output fps.
        audio, sr = librosa.load(audio_file, sr=target_sr, duration=duration)
    else:
        raise ValueError("you must enter an audio file name in the --song argument")
    batch_size = 1
    resolution = 512
    outfile = "output.mp4"

    # Scale sensitivity to the hop length: the default of 512 samples leaves it
    # unchanged, while e.g. frame_length=1024 doubles it, since each video
    # frame then spans twice as much audio.
    tempo_sensitivity = tempo_sensitivity * frame_length / 512
    # Choose the device once, up front, instead of assuming CUDA is available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    with dnnlib.util.open_url(network) as f:
        G = legacy.load_network_pkl(f)['G_ema'].to(device)
    G.eval()

    # Warm-up / sanity-check forward pass.
    with torch.no_grad():
        z = torch.randn([1, G.z_dim], device=device)
        c = None
        img = G(z, c)
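
    # Optional sanity check (a sketch, not part of the original flow): save the
    # warm-up image to disk to confirm the checkpoint decodes correctly.
    # PIL is assumed to be available:
    #   from PIL import Image
    #   im = (img.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8)
    #   Image.fromarray(im[0].cpu().numpy(), 'RGB').save('warmup.png')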
    # Mel spectrogram with one band per latent dimension; hop_length determines
    # how much audio each video frame covers.
    spec = librosa.feature.melspectrogram(y=audio, sr=target_sr, n_mels=512,
                                          fmax=8000, hop_length=frame_length)

    # Mean power per frame, plus its first difference (normalized and
    # rectified) as an onset-like signal.
    specm = np.mean(spec, axis=0)
    gradm = np.gradient(specm)
    gradm = gradm / np.max(gradm)
    gradm = gradm.clip(min=0)

    # Normalize mean power to [0, 1].
    specm = (specm - np.min(specm)) / np.ptp(specm)
    # Initial latent vector; the rest are built by accumulating per-frame updates.
    nv1 = torch.randn([G.z_dim], device=device)
    noise_vectors = [nv1]
    nvlast = nv1

    # Each dimension initially steps toward zero: negative values move up,
    # positive values move down.
    update_dir = np.where(nv1.cpu().numpy() < 0, 1.0, -1.0)

    update_last = np.zeros(G.z_dim)
    def new_jitters(jitter):
        # Per-dimension jitter mask: each latent dimension keeps its full step
        # with probability 0.5, otherwise the step is damped by (1 - jitter).
        jitters = np.zeros(G.z_dim)
        for j in range(G.z_dim):
            if random.uniform(0, 1) < 0.5:
                jitters[j] = 1
            else:
                jitters[j] = 1 - jitter
        return jitters
    def new_update_dir(nv2, update_dir):
        # Flip a dimension's direction when it nears the truncation boundary,
        # keeping latents roughly inside [-2*truncation, 2*truncation].
        for ni, n in enumerate(nv2):
            if n >= 2 * truncation - tempo_sensitivity:
                update_dir[ni] = -1
            elif n < -2 * truncation + tempo_sensitivity:
                update_dir[ni] = 1
        return update_dir
    print('\nGenerating input vectors \n')
    for i in tqdm(range(len(gradm))):

        # Resample the jitter mask every 200 frames.
        if i % 200 == 0:
            jitters = new_jitters(jitter)

        nv1 = nvlast

        # Step size scales with both the spectral power and its rate of change.
        update = np.full(G.z_dim, tempo_sensitivity) * (gradm[i] + specm[i]) * update_dir * jitters

        # Smooth with a 3:1 weighting toward the previous update.
        update = (update + update_last * 3) / 4
        update_last = update

        # Convert the update to a float32 tensor so the latent stays float32.
        nv2 = nv1.cpu() + torch.from_numpy(update).float()
        noise_vectors.append(nv2)
        nvlast = nv2

        update_dir = new_update_dir(nv2, update_dir)

    noise_vectors = torch.stack([nv.to(device) for nv in noise_vectors])
    print('\n\nGenerating frames \n')
    frames = []
    for i in tqdm(range(noise_vectors.shape[0] // batch_size)):

        noise_vector = noise_vectors[i * batch_size:(i + 1) * batch_size]

        c = None
        with torch.no_grad():
            img = np.array(G(noise_vector, c, truncation_psi=truncation, noise_mode='const').cpu())
        # NCHW -> NHWC, then map from roughly [-1, 1] to uint8 [0, 255].
        img = np.transpose(img, (0, 2, 3, 1))
        img = np.clip((img * 127.5 + 128), 0, 255).astype(np.uint8)

        for im in img:
            frames.append(im)
    aud = mpy.AudioFileClip(audio_file)
    if duration and duration < aud.duration:
        aud = aud.set_duration(duration)

    # One video frame per spectrogram hop.
    fps = target_sr / frame_length
    clip = mpy.ImageSequenceClip(frames, fps=fps)
    clip = clip.set_audio(aud)
    clip.write_videofile(outfile, audio_codec='aac', ffmpeg_params=[
        "-bf", "2",
        "-g", f"{round(fps / 2)}",  # keyframe interval; ffmpeg expects an integer
        "-crf", "18",
        "-movflags", "faststart",
    ])

    return outfile
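

# Minimal CLI sketch for calling visualize(). The flag names below (other than
# --song, which the error message above refers to) and the default values are
# assumptions, not part of the original script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Audio-reactive StyleGAN visualizer")
    parser.add_argument("--song", required=True, help="path to the input audio file")
    parser.add_argument("--network", required=True, help="path or URL of the StyleGAN .pkl")
    parser.add_argument("--truncation", type=float, default=1.0)
    parser.add_argument("--tempo_sensitivity", type=float, default=0.25)
    parser.add_argument("--jitter", type=float, default=0.5)
    parser.add_argument("--frame_length", type=int, default=512)
    parser.add_argument("--duration", type=float, default=None,
                        help="seconds of audio to use; default is the full file")
    args = parser.parse_args()

    out = visualize(args.song, args.network, args.truncation,
                    args.tempo_sensitivity, args.jitter,
                    args.frame_length, args.duration)
    print(f"Wrote {out}")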