File size: 3,633 Bytes
7150737
fa90792
 
 
 
6e70b11
 
7150737
7ac1a1c
6e70b11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa90792
6e70b11
 
 
 
 
 
fa90792
 
3d2b4d8
fa90792
7ac1a1c
fa90792
7ac1a1c
fa90792
 
 
 
 
 
 
 
 
 
 
 
7ac1a1c
ff5b91f
fa90792
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e70b11
fa90792
 
6e70b11
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import spaces
import gradio as gr
from audiosr import super_resolution, build_model
import torch
import gc # free up memory
import soundfile as sf # read audio
import math # For dynamic gpu duration calculation


# Estimate a dynamic gpu duration done by a private Benchmarking HuggingFace ZeroGPU (H200) Space on the 16th November 2025 for saving quota
def get_duration(audio_file, model_name, guidance_scale, ddim_steps, seed):
    if not audio_file:
        return 0

    try:
        info = sf.info(audio_file)
        audio_duration = info.duration

        
        # 1. Base overhead for model loading (using the higher 'speech' model value).
        base_overhead = 24  # seconds

        # 2. Multipliers for the core ML task.
        # From benchmark: ~11s for 8s audio @ 50 steps.
        # Formula: (8s * C1) + (50 steps * C2) = 11s.
        # We'll estimate C1=1.0 and C2=0.06.
        time_per_audio_second = 1.0
        time_per_ddim_step = 0.06

        # 3. Calculate the estimated processing time.
        estimated_time = base_overhead + (audio_duration * time_per_audio_second) + (ddim_steps * time_per_ddim_step)

        # 4. Add a safety buffer to prevent unexpected timeouts.
        safety_buffer = 10
        
        calculated_duration = estimated_time + safety_buffer

        # 5. Apply min/max constraints.
        min_duration = 50  # Must be enough for model load + buffer
        max_duration = 180 # Current ZeroGPU maximum duration
        
        final_duration = max(min_duration, min(max_duration, calculated_duration))
        print("FINAL DURATION", final_duration)
        return math.ceil(final_duration)

    except Exception as e:
        # Fallback to a safe default duration if reading the audio fails.
        print(f"Error in get_duration, using fallback (60): {e}")
        return 60


@spaces.GPU(duration=get_duration)
def inference(audio_file, model_name, guidance_scale, ddim_steps, seed):
    """Run AudioSR super-resolution on an uploaded audio file.

    Args:
        audio_file: Filesystem path to the input audio (Gradio "filepath").
        model_name: AudioSR checkpoint to load ("basic" or "speech").
        guidance_scale: Classifier-free guidance strength.
        ddim_steps: Number of DDIM sampling steps.
        seed: RNG seed; 0 picks a random seed for this call.

    Returns:
        Tuple of (sample_rate, waveform) for the Gradio numpy audio
        output; AudioSR renders at 48 kHz.

    Raises:
        gr.Error: If no audio file was provided.
    """
    if not audio_file:
        print("No audio file provided, skipping inference.")
        raise gr.Error(
            "Please upload an audio file."
            )

    audiosr = build_model(model_name=model_name)

    # Clear any leftovers from a previous run before loading weights.
    if torch.cuda.is_available():
        torch.cuda.empty_cache() # empty cuda cache
    gc.collect()

    # set random seed when seed input value is 0
    if seed == 0:
        import random
        seed = random.randint(1, 2**32-1)

    waveform = super_resolution(
        audiosr,
        audio_file,
        seed,
        guidance_scale=guidance_scale,
        ddim_steps=ddim_steps
    )

    # Drop the model reference BEFORE collecting — otherwise gc.collect()
    # cannot reclaim it and empty_cache() still sees its tensors in use,
    # so the post-run cleanup frees nothing.
    del audiosr
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    return (48000, waveform)

# Gradio UI: wires `inference` to an audio upload, model/parameter
# controls, and a numpy audio output widget.
# Fix: "relavancy" -> "relevancy" in the user-facing Guidance Scale info text.
iface = gr.Interface(
    fn=inference,
    inputs=[
        gr.Audio(type="filepath", label="Input Audio"),
        gr.Dropdown(["basic", "speech"], value="basic", label="Model"),
        gr.Slider(1, 10, value=3.5, step=0.1, label="Guidance Scale", info="Guidance scale (Large => better quality and relevancy to text; Small => better diversity)"),
        gr.Slider(1, 100, value=50, step=1, label="DDIM Steps", info="The sampling step for DDIM"),
        gr.Number(value=42, precision=0, label="Seed", info="Changing this value (any integer number) will lead to a different generation result, put 0 for a random one.")
    ],
    outputs=gr.Audio(type="numpy", label="Output Audio"),
    title="AudioSR",
    description="Audio Super Resolution with AudioSR. <br> It estimates a dynamic gpu duration done by a private Benchmarking HuggingFace ZeroGPU (H200) Space on the 16th November 2025 for saving quota."
)

iface.launch(share=False)