# SV2TTS / app.py
# Author: mujahid1214 — commit "Create app.py" (61d95ea, verified)
import tempfile

import gradio as gr
import librosa
import numpy as np
import soundfile as sf

# Import SV2TTS modules (from the Real-Time-Voice-Cloning repo you’ll add as dependency)
from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder
# Load models (make sure weights are in /saved_models in your repo)
# NOTE(review): the upstream Real-Time-Voice-Cloning repo ships its pretrained
# weights as encoder.pt / synthesizer.pt / vocoder.pt (under saved_models/default/),
# and its encoder.load_model historically expects a pathlib.Path — confirm these
# string paths and filenames match the checkpoints actually committed to the Space.
encoder.load_model("saved_models/encoder/saved_model.pt")  # speaker-encoder weights
synthesizer = Synthesizer("saved_models/synthesizer/saved_model.pt")  # text+embedding -> mel
vocoder.load_model("saved_models/vocoder/saved_model.pt")  # mel -> waveform
def clone_voice(sample_wav, text, consent):
    """Synthesize *text* in the voice of the speaker in *sample_wav*.

    Parameters
    ----------
    sample_wav : str | None
        Filepath to the reference recording (gr.Audio with type="filepath").
    text : str | None
        Text to synthesize.
    consent : bool
        Must be True; the user confirms permission to clone this voice.

    Returns
    -------
    tuple[str, str | None]
        (status message, path to the generated wav, or None on refusal/bad input).
    """
    if not consent:
        return "⚠️ You must confirm consent to use this voice.", None
    # Guard against a missing sample and against None *or* whitespace-only text
    # (the original `text.strip()` raised AttributeError when text was None).
    if sample_wav is None or not text or not text.strip():
        return "Please upload a sample and enter text.", None
    # Load at the file's native rate; preprocess_wav resamples/trims for the encoder.
    wav, sr = librosa.load(sample_wav, sr=None)
    wav = encoder.preprocess_wav(wav, sr)
    embed = encoder.embed_utterance(wav)
    # Text -> mel spectrogram, conditioned on the speaker embedding.
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    # Mel -> waveform via the neural vocoder.
    generated_wav = vocoder.infer_waveform(specs[0])
    # Peak-normalize (with headroom) so the written wav cannot clip.
    peak = np.max(np.abs(generated_wav))
    if peak > 0:
        generated_wav = generated_wav * (0.95 / peak)
    # Unique temp file per request: a fixed "out.wav" would be clobbered by
    # concurrent users of the shared Gradio app.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, generated_wav.astype(np.float32), synthesizer.sample_rate)
    return "✅ Done", out_path
# --- Gradio UI wiring ------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ SV2TTS Voice Cloning Demo")
    # type="filepath" hands clone_voice a path string rather than raw audio data.
    # NOTE(review): `source=` was renamed to `sources=[...]` in Gradio 4.x —
    # confirm the gradio version pinned in requirements.txt accepts this keyword.
    sample = gr.Audio(label="Upload speaker sample (5–10s)", source="upload", type="filepath")
    txt = gr.Textbox(label="Text to say", value="Hello, this is a test.")
    # Explicit consent gate; clone_voice refuses to run unless this is checked.
    consent = gr.Checkbox(label="I confirm I have permission to clone this voice", value=False)
    status = gr.Textbox(label="Status")
    out_audio = gr.Audio(label="Generated audio")
    btn = gr.Button("Generate")
    # Button wiring: (sample path, text, consent) -> (status message, wav path).
    btn.click(fn=clone_voice, inputs=[sample, txt, consent], outputs=[status, out_audio])
if __name__ == "__main__":
    demo.launch()