import random

import librosa
import moviepy.editor as mpy
import numpy as np
import torch
from tqdm import tqdm

import dnnlib
import legacy

target_sr = 22050

def visualize(audio_file,
              network,
              truncation,
              tempo_sensitivity,
              jitter,
              frame_length,
              duration,
              ):
    if audio_file:
        print(f'\nReading audio: {audio_file}\n')
        # Resample to target_sr explicitly so the hop-length arithmetic below
        # matches the sample rate used for the mel spectrogram and output fps.
        audio, sr = librosa.load(audio_file, sr=target_sr, duration=duration)
    else:
        raise ValueError("you must enter an audio file name in the --song argument")
    batch_size = 1
    resolution = 512
    outfile = "output.mp4"

    # Scale sensitivity to the hop length: the default of 512 samples leaves it
    # unchanged, while e.g. frame_length=1024 doubles it, since each video
    # frame then spans twice as much audio.
    tempo_sensitivity = tempo_sensitivity * frame_length / 512
    # Choose the device once, up front, instead of assuming CUDA is available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    with dnnlib.util.open_url(network) as f:
        G = legacy.load_network_pkl(f)['G_ema'].to(device)
    G.eval()

    # Warm-up / sanity-check forward pass.
    with torch.no_grad():
        z = torch.randn([1, G.z_dim], device=device)
        c = None
        img = G(z, c)
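
    # Optional sanity check (a sketch, not part of the original flow): save the
    # warm-up image to disk to confirm the checkpoint decodes correctly.
    # PIL is assumed to be available:
    #   from PIL import Image
    #   im = (img.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8)
    #   Image.fromarray(im[0].cpu().numpy(), 'RGB').save('warmup.png')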
    # Mel spectrogram with one band per latent dimension; hop_length determines
    # how much audio each video frame covers.
    spec = librosa.feature.melspectrogram(y=audio, sr=target_sr, n_mels=512,
                                          fmax=8000, hop_length=frame_length)

    # Mean power per frame, plus its first difference (normalized and
    # rectified) as an onset-like signal.
    specm = np.mean(spec, axis=0)
    gradm = np.gradient(specm)
    gradm = gradm / np.max(gradm)
    gradm = gradm.clip(min=0)

    # Normalize mean power to [0, 1].
    specm = (specm - np.min(specm)) / np.ptp(specm)
    # Initial latent vector; the rest are built by accumulating per-frame updates.
    nv1 = torch.randn([G.z_dim], device=device)
    noise_vectors = [nv1]
    nvlast = nv1

    # Each dimension initially steps toward zero: negative values move up,
    # positive values move down.
    update_dir = np.where(nv1.cpu().numpy() < 0, 1.0, -1.0)

    update_last = np.zeros(G.z_dim)
    def new_jitters(jitter):
        # Per-dimension jitter mask: each latent dimension keeps its full step
        # with probability 0.5, otherwise the step is damped by (1 - jitter).
        jitters = np.zeros(G.z_dim)
        for j in range(G.z_dim):
            if random.uniform(0, 1) < 0.5:
                jitters[j] = 1
            else:
                jitters[j] = 1 - jitter
        return jitters
    def new_update_dir(nv2, update_dir):
        # Flip a dimension's direction when it nears the truncation boundary,
        # keeping latents roughly inside [-2*truncation, 2*truncation].
        for ni, n in enumerate(nv2):
            if n >= 2 * truncation - tempo_sensitivity:
                update_dir[ni] = -1
            elif n < -2 * truncation + tempo_sensitivity:
                update_dir[ni] = 1
        return update_dir
    print('\nGenerating input vectors \n')
    for i in tqdm(range(len(gradm))):

        # Resample the jitter mask every 200 frames.
        if i % 200 == 0:
            jitters = new_jitters(jitter)

        nv1 = nvlast

        # Step size scales with both the spectral power and its rate of change.
        update = np.full(G.z_dim, tempo_sensitivity) * (gradm[i] + specm[i]) * update_dir * jitters

        # Smooth with a 3:1 weighting toward the previous update.
        update = (update + update_last * 3) / 4
        update_last = update

        # Convert the update to a float32 tensor so the latent stays float32.
        nv2 = nv1.cpu() + torch.from_numpy(update).float()
        noise_vectors.append(nv2)
        nvlast = nv2

        update_dir = new_update_dir(nv2, update_dir)

    noise_vectors = torch.stack([nv.to(device) for nv in noise_vectors])
    print('\n\nGenerating frames \n')
    frames = []
    for i in tqdm(range(noise_vectors.shape[0] // batch_size)):

        noise_vector = noise_vectors[i * batch_size:(i + 1) * batch_size]

        c = None
        with torch.no_grad():
            img = np.array(G(noise_vector, c, truncation_psi=truncation, noise_mode='const').cpu())
        # NCHW -> NHWC, then map from roughly [-1, 1] to uint8 [0, 255].
        img = np.transpose(img, (0, 2, 3, 1))
        img = np.clip((img * 127.5 + 128), 0, 255).astype(np.uint8)

        for im in img:
            frames.append(im)
    aud = mpy.AudioFileClip(audio_file)
    if duration and duration < aud.duration:
        aud = aud.set_duration(duration)

    # One video frame per spectrogram hop.
    fps = target_sr / frame_length
    clip = mpy.ImageSequenceClip(frames, fps=fps)
    clip = clip.set_audio(aud)
    clip.write_videofile(outfile, audio_codec='aac', ffmpeg_params=[
        "-bf", "2",
        "-g", f"{round(fps / 2)}",  # keyframe interval; ffmpeg expects an integer
        "-crf", "18",
        "-movflags", "faststart",
    ])

    return outfile
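

# Minimal CLI sketch for calling visualize(). The flag names below (other than
# --song, which the error message above refers to) and the default values are
# assumptions, not part of the original script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Audio-reactive StyleGAN visualizer")
    parser.add_argument("--song", required=True, help="path to the input audio file")
    parser.add_argument("--network", required=True, help="path or URL of the StyleGAN .pkl")
    parser.add_argument("--truncation", type=float, default=1.0)
    parser.add_argument("--tempo_sensitivity", type=float, default=0.25)
    parser.add_argument("--jitter", type=float, default=0.5)
    parser.add_argument("--frame_length", type=int, default=512)
    parser.add_argument("--duration", type=float, default=None,
                        help="seconds of audio to use; default is the full file")
    args = parser.parse_args()

    out = visualize(args.song, args.network, args.truncation,
                    args.tempo_sensitivity, args.jitter,
                    args.frame_length, args.duration)
    print(f"Wrote {out}")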