"""Gradio demo for SV2TTS voice cloning.

Requires the Real-Time-Voice-Cloning repo on the import path and the
pretrained weights under saved_models/ in this repo.
"""

import tempfile

import gradio as gr
import librosa
import numpy as np
import soundfile as sf

# SV2TTS modules (from the Real-Time-Voice-Cloning repo added as a dependency).
from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder

# Load all three models once at startup (weights must be in saved_models/).
encoder.load_model("saved_models/encoder/saved_model.pt")
synthesizer = Synthesizer("saved_models/synthesizer/saved_model.pt")
vocoder.load_model("saved_models/vocoder/saved_model.pt")


def clone_voice(sample_wav, text, consent):
    """Clone the voice in *sample_wav* and synthesize *text* with it.

    Parameters
    ----------
    sample_wav : str | None
        Filepath of the uploaded reference recording (Gradio ``type="filepath"``).
    text : str
        Text for the cloned voice to speak.
    consent : bool
        True when the user confirmed permission to clone this voice.

    Returns
    -------
    tuple[str, str | None]
        (status message, path to the generated WAV, or ``None`` on error).
    """
    # Guard clauses: consent first, then input validation.
    if not consent:
        return "⚠️ You must confirm consent to use this voice.", None
    if sample_wav is None or not text.strip():
        return "Please upload a sample and enter text.", None

    # Load at the file's native rate; preprocess_wav resamples/normalizes
    # to what the speaker encoder expects.
    wav, sr = librosa.load(sample_wav, sr=None)
    wav = encoder.preprocess_wav(wav, sr)
    embed = encoder.embed_utterance(wav)

    # Text + speaker embedding -> mel spectrogram -> waveform.
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])

    # Write to a unique temp file so concurrent requests don't clobber each
    # other (the original reused a single fixed "out.wav" path).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, generated_wav, synthesizer.sample_rate)
    return "✅ Done", out_path


with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ SV2TTS Voice Cloning Demo")
    sample = gr.Audio(label="Upload speaker sample (5–10s)", source="upload", type="filepath")
    txt = gr.Textbox(label="Text to say", value="Hello, this is a test.")
    consent = gr.Checkbox(label="I confirm I have permission to clone this voice", value=False)
    status = gr.Textbox(label="Status")
    out_audio = gr.Audio(label="Generated audio")
    btn = gr.Button("Generate")
    # NOTE(review): gr.Audio(source=...) is Gradio 3.x API; Gradio 4.x renamed
    # it to sources=["upload"] — confirm the pinned gradio version.
    btn.click(fn=clone_voice, inputs=[sample, txt, consent], outputs=[status, out_audio])

if __name__ == "__main__":
    demo.launch()