{ "architectures": [ "VibeVoiceDiffusionHead" ], "ddpm_batch_mul": 4, "ddpm_beta_schedule": "cosine", "ddpm_num_inference_steps": 20, "ddpm_num_steps": 1000, "diffusion_type": "ddpm", "head_ffn_ratio": 3.0, "head_layers": 4, "hidden_size": 3584, "latent_size": 64, "model_type": "vibevoice_diffusion_head", "prediction_type": "v_prediction", "rms_norm_eps": 1e-05, "speech_vae_dim": 64, "torch_dtype": "bfloat16", "transformers_version": "4.51.3" }