import spaces
import gradio as gr
from audiosr import super_resolution, build_model
import torch
import gc  # free up memory
import soundfile as sf  # read audio
import math  # for dynamic GPU duration calculation
import random  # random seed when the seed input is 0


# Estimate a dynamic GPU duration. Coefficients were measured on a private
# benchmarking Hugging Face ZeroGPU (H200) Space on 16 November 2025, to save quota.
def get_duration(audio_file, model_name, guidance_scale, ddim_steps, seed):
    if not audio_file:
        return 0
    try:
        info = sf.info(audio_file)
        audio_duration = info.duration

        # 1. Base overhead for model loading (using the higher 'speech' model value).
        base_overhead = 24  # seconds

        # 2. Multipliers for the core ML task.
        #    From benchmark: ~11 s for 8 s of audio @ 50 steps.
        #    Formula: (8 s * C1) + (50 steps * C2) = 11 s.
        #    We estimate C1 = 1.0 and C2 = 0.06.
        time_per_audio_second = 1.0
        time_per_ddim_step = 0.06

        # 3. Calculate the estimated processing time.
        estimated_time = (
            base_overhead
            + (audio_duration * time_per_audio_second)
            + (ddim_steps * time_per_ddim_step)
        )

        # 4. Add a safety buffer to prevent unexpected timeouts.
        safety_buffer = 10
        calculated_duration = estimated_time + safety_buffer

        # 5. Apply min/max constraints.
        min_duration = 50   # must be enough for model load + buffer
        max_duration = 180  # current ZeroGPU maximum duration
        final_duration = max(min_duration, min(max_duration, calculated_duration))

        print("FINAL DURATION", final_duration)
        return math.ceil(final_duration)
    except Exception as e:
        # Fall back to a safe default duration if reading the audio fails.
        print(f"Error in get_duration, using fallback (60): {e}")
        return 60


@spaces.GPU(duration=get_duration)
def inference(audio_file, model_name, guidance_scale, ddim_steps, seed):
    if not audio_file:
        print("No audio file provided, skipping inference.")
        raise gr.Error("Please upload an audio file.")

    audiosr = build_model(model_name=model_name)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # empty CUDA cache
    gc.collect()

    # Pick a random seed when the seed input value is 0.
    if seed == 0:
        seed = random.randint(1, 2**32 - 1)

    waveform = super_resolution(
        audiosr,
        audio_file,
        seed,
        guidance_scale=guidance_scale,
        ddim_steps=ddim_steps,
    )

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    return (48000, waveform)


iface = gr.Interface(
    fn=inference,
    inputs=[
        gr.Audio(type="filepath", label="Input Audio"),
        gr.Dropdown(["basic", "speech"], value="basic", label="Model"),
        gr.Slider(
            1, 10, value=3.5, step=0.1, label="Guidance Scale",
            info="Guidance scale (large => better quality and relevance to the input; small => better diversity)",
        ),
        gr.Slider(1, 100, value=50, step=1, label="DDIM Steps", info="The sampling steps for DDIM"),
        gr.Number(
            value=42, precision=0, label="Seed",
            info="Changing this value (any integer) leads to a different generation result; use 0 for a random seed.",
        ),
    ],
    outputs=gr.Audio(type="numpy", label="Output Audio"),
    title="AudioSR",
    description=(
        "Audio Super Resolution with AudioSR. "
        "The GPU duration is estimated dynamically from a benchmark run on a private "
        "Hugging Face ZeroGPU (H200) Space on 16 November 2025, to save quota."
    ),
)

iface.launch(share=False)
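
# Worked example of the duration formula above (illustrative values only, an
# assumption rather than a measured benchmark case): for a hypothetical
# 30-second clip at 50 DDIM steps,
#   24 (overhead) + 30 * 1.0 + 50 * 0.06 = 57 s, plus the 10 s buffer = 67 s,
# which sits inside the [50, 180] clamp, so roughly 67 s of ZeroGPU time
# would be requested for that call.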