|
import os
|
|
import random
|
|
|
|
import numpy as np
|
|
import soundfile as sf
|
|
import torch
|
|
from cog import BasePredictor, Input, Path
|
|
|
|
from audiosr import build_model, super_resolution
|
|
|
|
# Process-wide configuration, applied once at import time.
# NOTE(review): TOKENIZERS_PARALLELISM is presumably read by the tokenizer
# library used inside audiosr - confirm it is still needed.
os.environ.update({"TOKENIZERS_PARALLELISM": "true"})

# Select the "high" float32 matmul precision mode in PyTorch (see the
# torch.set_float32_matmul_precision documentation for the exact trade-off).
torch.set_float32_matmul_precision("high")
|
|
|
|
class Predictor(BasePredictor):
    """Cog predictor that runs AudioSR super-resolution on an audio file."""

    def setup(self, model_name="basic", device="auto"):
        """Build the AudioSR model once so every prediction can reuse it.

        Args:
            model_name: Model name forwarded unchanged to ``build_model``.
            device: Device selector forwarded unchanged to ``build_model``.
        """
        self.model_name = model_name
        self.device = device
        # Sample rate (Hz) used when writing the output file in predict().
        self.sr = 48000
        self.audiosr = build_model(model_name=self.model_name, device=self.device)

    def predict(self,
        input_file: Path = Input(description="Audio to upsample"),
        ddim_steps: int = Input(description="Number of inference steps", default=50, ge=10, le=500),
        guidance_scale: float = Input(description="Scale for classifier free guidance", default=3.5, ge=1.0, le=20.0),
        seed: int = Input(description="Random seed. Leave blank to randomize the seed", default=None)
    ) -> Path:
        """Run a single prediction on the model.

        Args:
            input_file: Path of the audio file to upsample.
            ddim_steps: Number of inference steps passed to ``super_resolution``.
            guidance_scale: Classifier-free guidance scale passed to
                ``super_resolution``.
            seed: Random seed; when ``None`` a random 32-bit seed is drawn
                and printed so the run can be reproduced.

        Returns:
            Path to the written 16-bit PCM file ``out.wav``.
        """
        if seed is None:
            seed = random.randint(0, 2**32 - 1)
            print(f"Setting seed to: {seed}")

        waveform = super_resolution(
            self.audiosr,
            input_file,
            seed=seed,
            guidance_scale=guidance_scale,
            ddim_steps=ddim_steps,
            latent_t_per_second=12.8,
        )

        # NOTE(review): assumes waveform is a float NumPy array whose first
        # axis is the batch and whose samples are nominally in [-1, 1] - confirm.
        # Clamp before scaling: a sample outside [-1, 1] would exceed the int16
        # range and the float -> int16 cast would corrupt it instead of
        # saturating.
        pcm = np.clip(waveform[0], -1.0, 1.0) * 32767
        out_wav = pcm.astype(np.int16).T

        # Use the sample rate configured in setup() rather than a second
        # hard-coded copy of the same constant.
        sf.write("out.wav", data=out_wav, samplerate=self.sr)
        return Path("out.wav")
|
|
|
|
|
|
if __name__ == "__main__":
    # Local smoke run: build the default model and upsample the bundled
    # example clip with a fixed seed.
    predictor = Predictor()
    predictor.setup()
    out = predictor.predict(
        "example/music.wav",
        ddim_steps=50,
        guidance_scale=3.5,
        seed=42,
    )