import librosa
import moviepy.editor as mpy
import numpy as np
import torch
from tqdm import tqdm

import dnnlib
import legacy

target_sr = 22050
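
# Audio-reactive StyleGAN visualizer: the mel-spectrogram power of a song
# drives a random walk through the generator's latent space, and the frames
# rendered along the walk are written out as a video synchronized with the audio.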
def visualize(audio_file,
              network,
              truncation,
              tempo_sensitivity,
              jitter,
              frame_length,
              duration,
              ):
    if audio_file:
        print(f'\nReading audio from {audio_file}\n')
        audio, sr = librosa.load(audio_file, sr=target_sr, duration=duration)
    else:
        raise ValueError("you must enter an audio file name in the --song argument")

    # Optional manual preprocessing path, kept here commented out for reference:
    # trim to the requested duration, normalize integer PCM to float32, downmix
    # to mono, and resample to target_sr.
    # if audio.shape[0] < duration * sr:
    #     duration = None
    # else:
    #     audio = audio[:duration * sr]
    # if audio.dtype == np.int16:
    #     audio = audio.astype(np.float32, order='C') / 2**15
    # elif audio.dtype == np.int32:
    #     audio = audio.astype(np.float32, order='C') / 2**31
    # audio = librosa.to_mono(audio.T)
    # audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr, res_type="kaiser_best")
    # TODO:

    batch_size = 1
    resolution = 512  # expected generator output resolution (unused below)
    # Scale sensitivity by the hop length so the walk speed per second
    # stays roughly constant regardless of frame_length
    tempo_sensitivity = tempo_sensitivity * frame_length / 512
    outfile = "output.mp4"

    # Set the device once, falling back to CPU when CUDA is unavailable
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load pre-trained model
    with dnnlib.util.open_url(network) as f:
        G = legacy.load_network_pkl(f)['G_ema'].to(device)  # type: ignore
    G.eval()

    # Warm-up pass to confirm the network loads and runs
    with torch.no_grad():
        z = torch.randn([1, G.z_dim], device=device)  # latent codes
        c = None  # class labels (not used in this example)
        img = G(z, c)  # NCHW, float32, dynamic range [-1, +1], no truncation

    # Create a mel spectrogram; with hop_length=frame_length it yields
    # target_sr / frame_length frames per second, which later becomes the video fps
    spec = librosa.feature.melspectrogram(y=audio, sr=target_sr, n_mels=512, fmax=8000, hop_length=frame_length)
    # Get mean power at each time point
    specm = np.mean(spec, axis=0)
    # Compute power gradient across time points
    gradm = np.gradient(specm)
    # Set max to 1
    gradm = gradm / np.max(gradm)
    # Set negative gradient time points to zero
    gradm = gradm.clip(min=0)
    # Normalize mean power between 0-1
    specm = (specm - np.min(specm)) / np.ptp(specm)

    # Initialize first noise vector
    nv1 = torch.randn([G.z_dim], device=device)
    # Initialize the list of noise vectors
    noise_vectors = [nv1]
    # Initialize the previous vector (used to track the previous frame)
    nvlast = nv1

    # Initialize the direction of noise vector unit updates: every unit starts
    # moving toward zero (negative units move up, positive units move down)
    update_dir = np.where(nv1.cpu().numpy() < 0, 1.0, -1.0)

    # Initialize noise unit update
    update_last = np.zeros(512)

    # Get new jitters: keep ~half of the units at full sensitivity and damp
    # the rest to (1 - jitter)
    def new_jitters(jitter):
        return np.where(np.random.uniform(size=512) < 0.5, 1.0, 1.0 - jitter)

    # Get new update directions: flip a unit's direction just before it
    # crosses the +/- 2*truncation boundary
    def new_update_dir(nv2, update_dir):
        n = nv2.cpu().numpy()
        update_dir[n >= 2 * truncation - tempo_sensitivity] = -1
        update_dir[n < -2 * truncation + tempo_sensitivity] = 1
        return update_dir
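
    # The per-frame update rule, written out: for unit k at frame i,
    #     update[k] = tempo_sensitivity * (gradm[i] + specm[i]) * update_dir[k] * jitters[k]
    # smoothed against the previous step,
    #     update = (update + 3 * update_last) / 4
    # so loud, onset-heavy frames push the latent farther while transitions stay smooth.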
    print('\nGenerating input vectors \n')
    for i in tqdm(range(len(gradm))):
        # Update the jitter vector every 200 frames by setting ~half of the noise vector units to lower sensitivity
        if i % 200 == 0:
            jitters = new_jitters(jitter)
        # Get last noise vector
        nv1 = nvlast
        # Set the noise vector update based on direction, sensitivity, jitter, and the combination of overall power and gradient of power
        update = np.full(512, tempo_sensitivity) * (gradm[i] + specm[i]) * update_dir * jitters
        # Smooth the update with the previous update (to avoid overly sharp frame transitions)
        update = (update + update_last * 3) / 4
        # Set last update
        update_last = update
        # Update the noise vector (cast to float32 so it stacks with the float32 latents below)
        nv2 = nv1.cpu() + torch.from_numpy(update).float()
        # Append to noise vectors
        noise_vectors.append(nv2)
        # Set last noise vector
        nvlast = nv2
        # Update the direction of noise units
        update_dir = new_update_dir(nv2, update_dir)

    noise_vectors = torch.stack([nv.to(device) for nv in noise_vectors])

    print('\n\nGenerating frames \n')
    frames = []
    for i in tqdm(range(noise_vectors.shape[0] // batch_size)):
        noise_vector = noise_vectors[i * batch_size:(i + 1) * batch_size]
        c = None  # class labels (not used in this example)
        with torch.no_grad():
            # NCHW, float32, dynamic range [-1, +1], truncated by truncation_psi
            img = G(noise_vector, c, truncation_psi=truncation, noise_mode='const').cpu().numpy()
        img = np.transpose(img, (0, 2, 3, 1))  # NCHW -> NHWC
        img = np.clip((img * 127.5 + 128), 0, 255).astype(np.uint8)
        # Add to frames
        for im in img:
            frames.append(im)

    # Save video
    aud = mpy.AudioFileClip(audio_file)
    if duration is not None:
        aud = aud.set_duration(duration)
    # One video frame per spectrogram frame: fps = target_sr / frame_length
    fps = target_sr / frame_length
    clip = mpy.ImageSequenceClip(frames, fps=fps)
    clip = clip.set_audio(aud)
    clip.write_videofile(outfile, audio_codec='aac', ffmpeg_params=[
        # "-vf", "scale=-1:2160:flags=lanczos",
        "-bf", "2",
        "-g", f"{int(fps / 2)}",  # ffmpeg expects an integer GOP size
        "-crf", "18",
        "-movflags", "faststart",
    ])
    return outfile
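

# A minimal sketch of how visualize() might be invoked from the command line.
# The --song flag name comes from the error message above; the other flag names
# and all default values are assumptions for illustration, not part of the
# original Space.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--song', type=str, required=True)
    parser.add_argument('--network', type=str, required=True)  # .pkl path or URL
    parser.add_argument('--truncation', type=float, default=1.0)
    parser.add_argument('--tempo_sensitivity', type=float, default=0.25)
    parser.add_argument('--jitter', type=float, default=0.5)
    parser.add_argument('--frame_length', type=int, default=512)
    parser.add_argument('--duration', type=float, default=None)
    args = parser.parse_args()

    visualize(args.song, args.network, args.truncation, args.tempo_sensitivity,
              args.jitter, args.frame_length, args.duration)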