namelessai committed: upload main files

Files changed:
- __init__.py  +2 -0
- __main__.py  +123 -0
- lowpass.py   +249 -0
- pipeline.py  +175 -0

__init__.py
ADDED
@@ -0,0 +1,2 @@
from .utils import seed_everything, save_wave, get_time, get_duration, read_list
from .pipeline import *
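Since pipeline.py (below) defines no __all__, the star import re-exports every public name from that module, so consumers can pull the high-level API straight from the package, exactly as __main__.py does:

from audiosr import build_model, super_resolution, save_wave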
__main__.py
ADDED
@@ -0,0 +1,123 @@
#!/usr/bin/python3
import argparse
import logging
import os

import torch

from audiosr import super_resolution, build_model, save_wave, get_time, read_list

os.environ["TOKENIZERS_PARALLELISM"] = "true"
matplotlib_logger = logging.getLogger("matplotlib")
matplotlib_logger.setLevel(logging.WARNING)

parser = argparse.ArgumentParser()

parser.add_argument(
    "-i",
    "--input_audio_file",
    type=str,
    required=False,
    help="Input audio file for audio super resolution",
)

parser.add_argument(
    "-il",
    "--input_file_list",
    type=str,
    required=False,
    default="",
    help="A file listing all audio files to process with audio super resolution",
)

parser.add_argument(
    "-s",
    "--save_path",
    type=str,
    required=False,
    help="The path to save model output",
    default="./output",
)

parser.add_argument(
    "--model_name",
    type=str,
    required=False,
    help="The checkpoint to use",
    default="basic",
    choices=["basic", "speech"],
)

parser.add_argument(
    "-d",
    "--device",
    type=str,
    required=False,
    help="The device for computation. If not specified, the script will automatically choose the device based on your environment.",
    default="auto",
)

parser.add_argument(
    "--ddim_steps",
    type=int,
    required=False,
    default=50,
    help="The number of DDIM sampling steps",
)

parser.add_argument(
    "-gs",
    "--guidance_scale",
    type=float,
    required=False,
    default=3.5,
    help="Guidance scale (large => better quality and relevance to the conditioning; small => better diversity)",
)

parser.add_argument(
    "--seed",
    type=int,
    required=False,
    default=42,
    help="Changing this value (any integer) leads to a different generation result.",
)

parser.add_argument(
    "--suffix",
    type=str,
    required=False,
    help="Suffix for the output file",
    default="_AudioSR_Processed_48K",
)

args = parser.parse_args()
torch.set_float32_matmul_precision("high")
save_path = os.path.join(args.save_path, get_time())

# The list argument defaults to "" and the file argument to None, so test
# truthiness here; the original `is not None` check could never fail.
assert args.input_file_list or args.input_audio_file, "Please provide either a list of audio files or a single audio file"

input_file = args.input_audio_file
random_seed = args.seed
sample_rate = 48000
latent_t_per_second = 12.8
guidance_scale = args.guidance_scale

os.makedirs(save_path, exist_ok=True)
audiosr = build_model(model_name=args.model_name, device=args.device)

if args.input_file_list:
    print("Performing super resolution on the audio files listed in %s" % args.input_file_list)
    files_todo = read_list(args.input_file_list)
else:
    files_todo = [input_file]

for input_file in files_todo:
    name = os.path.splitext(os.path.basename(input_file))[0] + args.suffix

    waveform = super_resolution(
        audiosr,
        input_file,
        seed=random_seed,
        guidance_scale=guidance_scale,
        ddim_steps=args.ddim_steps,
        latent_t_per_second=latent_t_per_second,
    )
    save_wave(waveform, inputpath=input_file, savepath=save_path, name=name, samplerate=sample_rate)
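For reference, the loop above condenses to this minimal programmatic sketch. The functions are exactly the ones imported at the top of the script; `example.wav` and the output name are hypothetical placeholders:

from audiosr import build_model, super_resolution, save_wave

model = build_model(model_name="basic", device="auto")
waveform = super_resolution(model, "example.wav", seed=42, guidance_scale=3.5, ddim_steps=50)
save_wave(waveform, inputpath="example.wav", savepath="./output", name="example_AudioSR_Processed_48K", samplerate=48000)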
lowpass.py
ADDED
@@ -0,0 +1,249 @@
import numpy as np
from scipy.signal import butter, cheby1, cheby2, ellip, bessel
from scipy.signal import sosfiltfilt
from scipy.signal import resample_poly


def align_length(x=None, y=None, Lx=None):
    """Align the length of y to that of x.

    Args:
        x (np.array): reference signal
        y (np.array): the signal whose length needs to be aligned
        Lx (int): target length; inferred from x when not given

    Return:
        yy (np.array): signal with the same length as x
    """
    assert y is not None

    if Lx is None:
        Lx = len(x)
    Ly = len(y)

    if Lx == Ly:
        return y
    elif Lx > Ly:
        # pad y with zeros
        return np.pad(y, (0, Lx - Ly), mode="constant")
    else:
        # cut y
        return y[:Lx]


def bandpass_filter(x, lowcut, highcut, fs, order, ftype):
    """Process the input signal x with a bandpass filter.

    Args:
        x (np.array): input signal
        lowcut (float): low cutoff frequency
        highcut (float): high cutoff frequency
        fs (float): sampling rate of x
        order (int): the order of the filter
        ftype (string): type of filter,
            one of ['butter', 'cheby1', 'cheby2', 'ellip', 'bessel']

    Return:
        y (np.array): filtered signal
    """
    nyq = 0.5 * fs
    lo = lowcut / nyq
    hi = highcut / nyq

    if ftype == "butter":
        sos = butter(order, [lo, hi], btype="band", output="sos")
    elif ftype == "cheby1":
        sos = cheby1(order, 0.1, [lo, hi], btype="band", output="sos")
    elif ftype == "cheby2":
        sos = cheby2(order, 60, [lo, hi], btype="band", output="sos")
    elif ftype == "ellip":
        sos = ellip(order, 0.1, 60, [lo, hi], btype="band", output="sos")
    elif ftype == "bessel":
        sos = bessel(order, [lo, hi], btype="band", output="sos")
    else:
        raise ValueError(f"The bandpass filter {ftype} is not supported!")

    # zero-phase filtering with second-order sections
    y = sosfiltfilt(sos, x)

    if len(y) != len(x):
        y = align_length(x, y)
    return y


def lowpass_filter(x, highcut, fs, order, ftype):
    """Process the input signal x with a lowpass filter.

    Args:
        x (np.array): input signal
        highcut (float): cutoff frequency
        fs (float): sampling rate of x
        order (int): the order of the filter
        ftype (string): type of filter,
            one of ['butter', 'cheby1', 'cheby2', 'ellip', 'bessel']

    Return:
        y (np.array): filtered signal
    """
    nyq = 0.5 * fs
    hi = highcut / nyq

    if ftype == "butter":
        sos = butter(order, hi, btype="low", output="sos")
    elif ftype == "cheby1":
        sos = cheby1(order, 0.1, hi, btype="low", output="sos")
    elif ftype == "cheby2":
        sos = cheby2(order, 60, hi, btype="low", output="sos")
    elif ftype == "ellip":
        sos = ellip(order, 0.1, 60, hi, btype="low", output="sos")
    elif ftype == "bessel":
        sos = bessel(order, hi, btype="low", output="sos")
    else:
        raise ValueError(f"The lowpass filter {ftype} is not supported!")

    y = sosfiltfilt(sos, x)

    if len(y) != len(x):
        y = align_length(x, y)

    y_len = len(y)

    # remove residual energy above the cutoff with a resampling round trip
    y = stft_hard_lowpass(y, hi, fs_ori=fs)

    y = sosfiltfilt(sos, y)

    if len(y) != y_len:
        y = align_length(y=y, Lx=y_len)

    return y


def stft_hard_lowpass(data, lowpass_ratio, fs_ori=44100):
    fs_down = int(lowpass_ratio * fs_ori)
    # downsample to the low sampling rate
    y = resample_poly(data, fs_down, fs_ori)

    # upsample to the original sampling rate
    y = resample_poly(y, fs_ori, fs_down)

    if len(y) != len(data):
        y = align_length(data, y)
    return y


def limit(integer, high, low):
    if integer > high:
        return high
    elif integer < low:
        return low
    else:
        return int(integer)


def lowpass(data, highcut, fs, order=5, _type="butter"):
    """
    :param data: 1-D np.float32 array of shape (samples,); (samples, 1) is not accepted
    :param highcut: cutoff frequency
    :param fs: sample rate of the original data
    :param order: order of the filter
    :param _type: type of filter
    :return: filtered data, (samples,)
    """
    if len(list(data.shape)) != 1:
        raise ValueError(
            "Error (lowpass): Data "
            + str(data.shape)
            + " should be a 1d time array, (samples,), not (samples, 1)"
        )

    order = limit(order, high=10, low=2)
    # note: equality comparison; the original `_type in "butter"` substring
    # test would also match partial names such as "b" or the empty string.
    if _type == "butter":
        return lowpass_filter(x=data, highcut=int(highcut), fs=fs, order=order, ftype="butter")
    elif _type == "cheby1":
        return lowpass_filter(x=data, highcut=int(highcut), fs=fs, order=order, ftype="cheby1")
    elif _type == "ellip":
        return lowpass_filter(x=data, highcut=int(highcut), fs=fs, order=order, ftype="ellip")
    elif _type == "bessel":
        return lowpass_filter(x=data, highcut=int(highcut), fs=fs, order=order, ftype="bessel")
    # elif _type == "stft":
    #     return stft_hard_lowpass(data, lowpass_ratio=highcut / int(fs / 2))
    else:
        raise ValueError("Error: Unexpected filter type " + _type)


def bandpass(data, lowcut, highcut, fs, order=5, _type="butter"):
    """
    :param data: 1-D np.float32 array of shape (samples,); (samples, 1) is not accepted
    :param lowcut: low cutoff frequency
    :param highcut: high cutoff frequency
    :param fs: sample rate of the original data
    :param order: order of the filter
    :param _type: type of filter
    :return: filtered data, (samples,)
    """
    if len(list(data.shape)) != 1:
        raise ValueError(
            "Error (bandpass): Data "
            + str(data.shape)
            + " should be a 1d time array, (samples,), not (samples, 1)"
        )

    order = limit(order, high=10, low=2)
    if _type == "butter":
        return bandpass_filter(x=data, lowcut=int(lowcut), highcut=int(highcut), fs=fs, order=order, ftype="butter")
    elif _type == "cheby1":
        return bandpass_filter(x=data, lowcut=int(lowcut), highcut=int(highcut), fs=fs, order=order, ftype="cheby1")
    # elif _type == "cheby2":
    #     return bandpass_filter(x=data, lowcut=int(lowcut), highcut=int(highcut), fs=fs, order=order, ftype="cheby2")
    elif _type == "ellip":
        return bandpass_filter(x=data, lowcut=int(lowcut), highcut=int(highcut), fs=fs, order=order, ftype="ellip")
    elif _type == "bessel":
        return bandpass_filter(x=data, lowcut=int(lowcut), highcut=int(highcut), fs=fs, order=order, ftype="bessel")
    else:
        raise ValueError("Error: Unexpected filter type " + _type)
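A minimal usage sketch of the lowpass helper above; the input signal is synthetic and the import path is an assumption about where this module sits in the package:

import numpy as np
from audiosr.lowpass import lowpass  # import path is an assumption

# 1 s of white noise at 48 kHz, shaped (samples,) as the docstring requires
sig = np.random.randn(48000).astype(np.float32)

# keep only content below 4 kHz with a 5th-order Butterworth filter
filtered = lowpass(sig, highcut=4000, fs=48000, order=5, _type="butter")
assert filtered.shape == sig.shape  # align_length keeps the output length fixed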
pipeline.py
ADDED
@@ -0,0 +1,175 @@
import os
import re

import yaml
import torch
import torchaudio
import numpy as np

import audiosr.latent_diffusion.modules.phoneme_encoder.text as text
from audiosr.latent_diffusion.models.ddpm import LatentDiffusion
from audiosr.latent_diffusion.util import get_vits_phoneme_ids_no_padding
from audiosr.utils import (
    default_audioldm_config,
    download_checkpoint,
    read_audio_file,
    lowpass_filtering_prepare_inference,
    wav_feature_extraction,
)


def seed_everything(seed):
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark mode lets cuDNN pick non-deterministic kernels, so keep it off
    torch.backends.cudnn.benchmark = False


def text2phoneme(data):
    return text._clean_text(re.sub(r"<.*?>", "", data), ["english_cleaners2"])


def text_to_filename(text):
    return text.replace(" ", "_").replace("'", "_").replace('"', "_")


def extract_kaldi_fbank_feature(waveform, sampling_rate, log_mel_spec):
    norm_mean = -4.2677393
    norm_std = 4.5689974

    if sampling_rate != 16000:
        waveform_16k = torchaudio.functional.resample(
            waveform, orig_freq=sampling_rate, new_freq=16000
        )
    else:
        waveform_16k = waveform

    waveform_16k = waveform_16k - waveform_16k.mean()
    fbank = torchaudio.compliance.kaldi.fbank(
        waveform_16k,
        htk_compat=True,
        sample_frequency=16000,
        use_energy=False,
        window_type="hanning",
        num_mel_bins=128,
        dither=0.0,
        frame_shift=10,
    )

    TARGET_LEN = log_mel_spec.size(0)

    # cut and pad to match the mel spectrogram length
    n_frames = fbank.shape[0]
    p = TARGET_LEN - n_frames
    if p > 0:
        m = torch.nn.ZeroPad2d((0, 0, 0, p))
        fbank = m(fbank)
    elif p < 0:
        fbank = fbank[:TARGET_LEN, :]

    fbank = (fbank - norm_mean) / (norm_std * 2)

    return {"ta_kaldi_fbank": fbank}  # [1024, 128]


def make_batch_for_super_resolution(input_file, waveform=None, fbank=None):
    log_mel_spec, stft, waveform, duration, target_frame = read_audio_file(input_file)

    batch = {
        "waveform": torch.FloatTensor(waveform),
        "stft": torch.FloatTensor(stft),
        "log_mel_spec": torch.FloatTensor(log_mel_spec),
        "sampling_rate": 48000,
    }

    batch.update(lowpass_filtering_prepare_inference(batch))

    assert "waveform_lowpass" in batch.keys()
    lowpass_mel, lowpass_stft = wav_feature_extraction(
        batch["waveform_lowpass"], target_frame
    )
    batch["lowpass_mel"] = lowpass_mel

    # add the batch dimension to every tensor entry
    for k in batch.keys():
        if isinstance(batch[k], torch.Tensor):
            batch[k] = torch.FloatTensor(batch[k]).unsqueeze(0)

    return batch, duration


def round_up_duration(duration):
    return int(round(duration / 2.5) + 1) * 2.5


def build_model(ckpt_path=None, config=None, device=None, model_name="basic"):
    if device is None or device == "auto":
        if torch.cuda.is_available():
            device = torch.device("cuda:0")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        else:
            device = torch.device("cpu")

    print("Loading AudioSR: %s" % model_name)
    print("Loading model on %s" % device)

    # download the released checkpoint unless one is supplied; the original
    # code overwrote ckpt_path unconditionally, silently ignoring the argument
    if ckpt_path is None:
        ckpt_path = download_checkpoint(model_name)

    if config is not None:
        assert isinstance(config, str)
        with open(config, "r") as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
    else:
        config = default_audioldm_config(model_name)

    config["model"]["params"]["device"] = device

    latent_diffusion = LatentDiffusion(**config["model"]["params"])

    checkpoint = torch.load(ckpt_path, map_location=device)
    latent_diffusion.load_state_dict(checkpoint["state_dict"], strict=False)

    latent_diffusion.eval()
    latent_diffusion = latent_diffusion.to(device)

    return latent_diffusion


def super_resolution(
    latent_diffusion,
    input_file,
    seed=42,
    ddim_steps=200,
    guidance_scale=3.5,
    latent_t_per_second=12.8,
    config=None,
):
    seed_everything(int(seed))
    waveform = None

    batch, duration = make_batch_for_super_resolution(input_file, waveform=waveform)

    with torch.no_grad():
        waveform = latent_diffusion.generate_batch(
            batch,
            unconditional_guidance_scale=guidance_scale,
            ddim_steps=ddim_steps,
            duration=duration,
        )

    return waveform
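A couple of worked values for round_up_duration above, following directly from its definition (the 2.5 s step appears to be the model's chunking granularity; that is an inference, not something stated in this commit):

# round_up_duration(3.2) -> int(round(3.2 / 2.5) + 1) * 2.5 -> (1 + 1) * 2.5 -> 5.0
# round_up_duration(7.0) -> int(round(7.0 / 2.5) + 1) * 2.5 -> (3 + 1) * 2.5 -> 10.0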