import argparse
from functools import partial

import gradio as gr
import torch
import torchaudio

from resemble_enhance.enhancer.inference import denoise, enhance

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


def _fn(path, solver, nfe, tau, denoising, unlimited):
    """Denoise and enhance the uploaded audio, returning both results as (sample_rate, waveform) pairs."""
    if path is None:
        gr.Warning("Please upload an audio file.")
        return None, None

    # Reject clips longer than 60 seconds unless the app was started with --unlimited.
    info = torchaudio.info(path)
    if not unlimited and (info.num_frames / info.sample_rate > 60):
        gr.Warning("Only audio files shorter than 60 seconds are supported.")
        return None, None

    solver = solver.lower()
    nfe = int(nfe)
    # Apply stronger denoising during enhancement when the checkbox is ticked.
    lambd = 0.9 if denoising else 0.1

    # Load the audio and mix it down to a mono waveform.
    dwav, sr = torchaudio.load(path)
    dwav = dwav.mean(dim=0)

    # Produce a denoise-only result and a fully enhanced result.
    wav1, new_sr = denoise(dwav, sr, device)
    wav2, new_sr = enhance(dwav, sr, device, nfe=nfe, solver=solver, lambd=lambd, tau=tau)

    wav1 = wav1.cpu().numpy()
    wav2 = wav2.cpu().numpy()

    # Gradio audio outputs expect (sample_rate, numpy_array) tuples.
    return (new_sr, wav1), (new_sr, wav2)


def main():
    parser = argparse.ArgumentParser()
    # --unlimited removes the 60-second cap on input length.
    parser.add_argument("--unlimited", action="store_true")
    args = parser.parse_args()

    inputs: list = [
        gr.Audio(type="filepath", label="Input Audio"),
        gr.Dropdown(
            choices=["Midpoint", "RK4", "Euler"],
            value="Midpoint",
            label="CFM ODE Solver (Midpoint is recommended)",
        ),
        gr.Slider(
            minimum=1,
            maximum=128,
            value=64,
            step=1,
            label="CFM Number of Function Evaluations (higher values in general yield better quality but may be slower)",
        ),
        gr.Slider(
            minimum=0,
            maximum=1,
            value=0.5,
            step=0.01,
            label="CFM Prior Temperature (higher values can improve quality but can reduce stability)",
        ),
        gr.Checkbox(
            value=False,
            label="Denoise Before Enhancement (tick if your audio contains heavy background noise)",
        ),
    ]

    outputs: list = [
        gr.Audio(label="Output Denoised Audio"),
        gr.Audio(label="Output Enhanced Audio"),
    ]

    interface = gr.Interface(
        # Bind the CLI flag so the Gradio callback only receives the UI inputs.
        fn=partial(_fn, unlimited=args.unlimited),
        title="Resemble Enhance",
        description="AI-driven audio enhancement for your audio files, powered by Resemble AI.",
        inputs=inputs,
        outputs=outputs,
    )

    interface.launch()


if __name__ == "__main__":
    main()
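
# A minimal usage sketch (assumption: this script is saved as app.py and the
# resemble-enhance package plus Gradio are installed):
#
#   python app.py              # launch the web UI with the 60-second input limit
#   python app.py --unlimited  # allow inputs longer than 60 seconds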