{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# VibeVoice Voice Cloning Test\n", "\n", "**IMPORTANT:** Voice cloning with custom audio ONLY works through the Gradio interface!\n", "\n", "The command-line script only uses built-in voices (Alice, Frank, etc.)." ] }, { "cell_type": "code", "metadata": { "collapsed": false, "scrolled": true }, "source": [ "# Setup\n", "import torch\n", "if torch.cuda.is_available():\n", "    print(f\"GPU: {torch.cuda.get_device_name(0)}\")" ], "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "GPU: NVIDIA L40S\n" ] } ] }, { "cell_type": "code", "metadata": { "collapsed": false, "scrolled": true }, "source": [ "# Install VibeVoice\n", "![ -d /root/VibeVoice ] || git clone --quiet https://github.com/cseti007/VibeVoice.git /root/VibeVoice\n", "%uv pip install --quiet -e /root/VibeVoice\n", "print(\"Installed\")" ], "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Note: you may need to restart the kernel to use updated packages.\n", "Installed\n" ] } ] }, { "cell_type": "code", "metadata": { "collapsed": false, "scrolled": true }, "source": [ "# Download the base model and the Arabic LoRA checkpoint\n", "# ('huggingface-cli download' is deprecated; 'hf download' is its replacement)\n", "!hf download aoi-ot/VibeVoice-Large --local-dir /root/models/VibeVoice-Large --quiet\n", "!hf download ABDALLALSWAITI/vibevoice-arabic-Z --local-dir /root/models/vibevoice-arabic-Z --quiet\n", "print(\"Models ready\")" ], "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[33m\u26a0\ufe0f Warning: 'huggingface-cli download' is deprecated. 
Use 'hf download' instead.\u001b[0m\r\n", "/root/models/vibevoice-arabic-Z\r\n", "Models ready\n" ] } ] }, { "cell_type": "code", "metadata": { "collapsed": false, "scrolled": true }, "source": [ "# Launch Gradio with Arabic LoRA\n", "!python /root/VibeVoice/demo/gradio_demo.py \\\n", " --model_path /root/models/VibeVoice-Large \\\n", " --checkpoint_path /root/models/vibevoice-arabic-Z \\\n", " --share" ], "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "APEX FusedRMSNorm not available, using native implementation\r\n", "\ud83c\udf99\ufe0f Initializing VibeVoice Demo with Streaming Support...\r\n", "Loading processor & model from /root/models/VibeVoice-Large\r\n", "Using device: cuda\r\n", "\rtokenizer_config.json: 0.00B [00:00, ?B/s]\rtokenizer_config.json: 7.23kB [00:00, 25.5MB/s]\r\n", "\rvocab.json: 0.00B [00:00, ?B/s]\rvocab.json: 2.78MB [00:00, 134MB/s]\r\n", "\rmerges.txt: 0.00B [00:00, ?B/s]\rmerges.txt: 1.67MB [00:00, 148MB/s]\r\n", "\rtokenizer.json: 0.00B [00:00, ?B/s]\rtokenizer.json: 7.03MB [00:00, 175MB/s]\r\n", "loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/vocab.json\r\n", "loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/merges.txt\r\n", "loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/tokenizer.json\r\n", "loading file added_tokens.json from cache at None\r\n", "loading file special_tokens_map.json from cache at None\r\n", "loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/tokenizer_config.json\r\n", "loading file chat_template.jinja from cache at None\r\n", "The tokenizer class you load from this checkpoint is not the 
same type as the class this function is called from. It may result in unexpected tokenization. \r\n", "The tokenizer class you load from this checkpoint is 'Qwen2Tokenizer'. \r\n", "The class this function is called from is 'VibeVoiceTextTokenizerFast'.\r\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\r\n", "Using device: cuda, torch_dtype: torch.bfloat16, attn_implementation: flash_attention_2\r\n", "loading configuration file /root/models/VibeVoice-Large/config.json\r\n", "Model config VibeVoiceConfig {\r\n", " \"acostic_vae_dim\": 64,\r\n", " \"acoustic_tokenizer_config\": {\r\n", " \"causal\": true,\r\n", " \"channels\": 1,\r\n", " \"conv_bias\": true,\r\n", " \"conv_norm\": \"none\",\r\n", " \"corpus_normalize\": 0.0,\r\n", " \"decoder_depths\": null,\r\n", " \"decoder_n_filters\": 32,\r\n", " \"decoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"disable_last_norm\": true,\r\n", " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", " \"encoder_n_filters\": 32,\r\n", " \"encoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"fix_std\": 0.5,\r\n", " \"layer_scale_init_value\": 1e-06,\r\n", " \"layernorm\": \"RMSNorm\",\r\n", " \"layernorm_elementwise_affine\": true,\r\n", " \"layernorm_eps\": 1e-05,\r\n", " \"mixer_layer\": \"depthwise_conv\",\r\n", " \"model_type\": \"vibevoice_acoustic_tokenizer\",\r\n", " \"pad_mode\": \"constant\",\r\n", " \"std_dist_type\": \"gaussian\",\r\n", " \"vae_dim\": 64,\r\n", " \"weight_init_value\": 0.01\r\n", " },\r\n", " \"acoustic_vae_dim\": 64,\r\n", " \"architectures\": [\r\n", " \"VibeVoiceForConditionalGeneration\"\r\n", " ],\r\n", " \"decoder_config\": {\r\n", " \"attention_dropout\": 0.0,\r\n", " \"hidden_act\": \"silu\",\r\n", " \"hidden_size\": 3584,\r\n", " \"initializer_range\": 0.02,\r\n", " \"intermediate_size\": 18944,\r\n", " 
\"max_position_embeddings\": 32768,\r\n", " \"max_window_layers\": 28,\r\n", " \"model_type\": \"qwen2\",\r\n", " \"num_attention_heads\": 28,\r\n", " \"num_hidden_layers\": 28,\r\n", " \"num_key_value_heads\": 4,\r\n", " \"rms_norm_eps\": 1e-06,\r\n", " \"rope_scaling\": null,\r\n", " \"rope_theta\": 1000000.0,\r\n", " \"sliding_window\": null,\r\n", " \"torch_dtype\": \"bfloat16\",\r\n", " \"use_cache\": true,\r\n", " \"use_mrope\": false,\r\n", " \"use_sliding_window\": false,\r\n", " \"vocab_size\": 152064\r\n", " },\r\n", " \"diffusion_head_config\": {\r\n", " \"ddpm_batch_mul\": 4,\r\n", " \"ddpm_beta_schedule\": \"cosine\",\r\n", " \"ddpm_num_inference_steps\": 20,\r\n", " \"ddpm_num_steps\": 1000,\r\n", " \"diffusion_type\": \"ddpm\",\r\n", " \"head_ffn_ratio\": 3.0,\r\n", " \"head_layers\": 4,\r\n", " \"hidden_size\": 3584,\r\n", " \"latent_size\": 64,\r\n", " \"model_type\": \"vibevoice_diffusion_head\",\r\n", " \"prediction_type\": \"v_prediction\",\r\n", " \"rms_norm_eps\": 1e-05,\r\n", " \"speech_vae_dim\": 64\r\n", " },\r\n", " \"model_type\": \"vibevoice\",\r\n", " \"semantic_tokenizer_config\": {\r\n", " \"causal\": true,\r\n", " \"channels\": 1,\r\n", " \"conv_bias\": true,\r\n", " \"conv_norm\": \"none\",\r\n", " \"corpus_normalize\": 0.0,\r\n", " \"disable_last_norm\": true,\r\n", " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", " \"encoder_n_filters\": 32,\r\n", " \"encoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"fix_std\": 0,\r\n", " \"layer_scale_init_value\": 1e-06,\r\n", " \"layernorm\": \"RMSNorm\",\r\n", " \"layernorm_elementwise_affine\": true,\r\n", " \"layernorm_eps\": 1e-05,\r\n", " \"mixer_layer\": \"depthwise_conv\",\r\n", " \"model_type\": \"vibevoice_semantic_tokenizer\",\r\n", " \"pad_mode\": \"constant\",\r\n", " \"std_dist_type\": \"none\",\r\n", " \"vae_dim\": 128,\r\n", " \"weight_init_value\": 0.01\r\n", " },\r\n", " \"semantic_vae_dim\": 128,\r\n", " 
\"tie_word_embeddings\": false,\r\n", " \"torch_dtype\": \"bfloat16\",\r\n", " \"transformers_version\": \"4.51.3\"\r\n", "}\r\n", "\r\n", "loading weights file /root/models/VibeVoice-Large/model.safetensors.index.json\r\n", "Instantiating VibeVoiceForConditionalGenerationInference model under default dtype torch.bfloat16.\r\n", "[ERROR] : ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\r\n", "Traceback (most recent call last):\r\n", " File \"/root/VibeVoice/demo/gradio_demo.py\", line 86, in load_model\r\n", " self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 279, in _wrapper\r\n", " return func(*args, **kwargs)\r\n", " ^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 4336, in from_pretrained\r\n", " config = cls._autoset_attn_implementation(\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 2109, in _autoset_attn_implementation\r\n", " cls._check_and_enable_flash_attn_2(\r\n", " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 2252, in _check_and_enable_flash_attn_2\r\n", " raise ImportError(f\"{preface} the package flash_attn seems to be not installed. {install_message}\")\r\n", "ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. 
Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\r\n", "\r\n", "Falling back to attention implementation: sdpa\r\n", "loading configuration file /root/models/VibeVoice-Large/config.json\r\n", "Model config VibeVoiceConfig {\r\n", " \"acostic_vae_dim\": 64,\r\n", " \"acoustic_tokenizer_config\": {\r\n", " \"causal\": true,\r\n", " \"channels\": 1,\r\n", " \"conv_bias\": true,\r\n", " \"conv_norm\": \"none\",\r\n", " \"corpus_normalize\": 0.0,\r\n", " \"decoder_depths\": null,\r\n", " \"decoder_n_filters\": 32,\r\n", " \"decoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"disable_last_norm\": true,\r\n", " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", " \"encoder_n_filters\": 32,\r\n", " \"encoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"fix_std\": 0.5,\r\n", " \"layer_scale_init_value\": 1e-06,\r\n", " \"layernorm\": \"RMSNorm\",\r\n", " \"layernorm_elementwise_affine\": true,\r\n", " \"layernorm_eps\": 1e-05,\r\n", " \"mixer_layer\": \"depthwise_conv\",\r\n", " \"model_type\": \"vibevoice_acoustic_tokenizer\",\r\n", " \"pad_mode\": \"constant\",\r\n", " \"std_dist_type\": \"gaussian\",\r\n", " \"vae_dim\": 64,\r\n", " \"weight_init_value\": 0.01\r\n", " },\r\n", " \"acoustic_vae_dim\": 64,\r\n", " \"architectures\": [\r\n", " \"VibeVoiceForConditionalGeneration\"\r\n", " ],\r\n", " \"decoder_config\": {\r\n", " \"attention_dropout\": 0.0,\r\n", " \"hidden_act\": \"silu\",\r\n", " \"hidden_size\": 3584,\r\n", " \"initializer_range\": 0.02,\r\n", " \"intermediate_size\": 18944,\r\n", " \"max_position_embeddings\": 32768,\r\n", " \"max_window_layers\": 28,\r\n", " \"model_type\": \"qwen2\",\r\n", " \"num_attention_heads\": 28,\r\n", " \"num_hidden_layers\": 28,\r\n", " \"num_key_value_heads\": 4,\r\n", " \"rms_norm_eps\": 1e-06,\r\n", " \"rope_scaling\": null,\r\n", 
" \"rope_theta\": 1000000.0,\r\n", " \"sliding_window\": null,\r\n", " \"torch_dtype\": \"bfloat16\",\r\n", " \"use_cache\": true,\r\n", " \"use_mrope\": false,\r\n", " \"use_sliding_window\": false,\r\n", " \"vocab_size\": 152064\r\n", " },\r\n", " \"diffusion_head_config\": {\r\n", " \"ddpm_batch_mul\": 4,\r\n", " \"ddpm_beta_schedule\": \"cosine\",\r\n", " \"ddpm_num_inference_steps\": 20,\r\n", " \"ddpm_num_steps\": 1000,\r\n", " \"diffusion_type\": \"ddpm\",\r\n", " \"head_ffn_ratio\": 3.0,\r\n", " \"head_layers\": 4,\r\n", " \"hidden_size\": 3584,\r\n", " \"latent_size\": 64,\r\n", " \"model_type\": \"vibevoice_diffusion_head\",\r\n", " \"prediction_type\": \"v_prediction\",\r\n", " \"rms_norm_eps\": 1e-05,\r\n", " \"speech_vae_dim\": 64\r\n", " },\r\n", " \"model_type\": \"vibevoice\",\r\n", " \"semantic_tokenizer_config\": {\r\n", " \"causal\": true,\r\n", " \"channels\": 1,\r\n", " \"conv_bias\": true,\r\n", " \"conv_norm\": \"none\",\r\n", " \"corpus_normalize\": 0.0,\r\n", " \"disable_last_norm\": true,\r\n", " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", " \"encoder_n_filters\": 32,\r\n", " \"encoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"fix_std\": 0,\r\n", " \"layer_scale_init_value\": 1e-06,\r\n", " \"layernorm\": \"RMSNorm\",\r\n", " \"layernorm_elementwise_affine\": true,\r\n", " \"layernorm_eps\": 1e-05,\r\n", " \"mixer_layer\": \"depthwise_conv\",\r\n", " \"model_type\": \"vibevoice_semantic_tokenizer\",\r\n", " \"pad_mode\": \"constant\",\r\n", " \"std_dist_type\": \"none\",\r\n", " \"vae_dim\": 128,\r\n", " \"weight_init_value\": 0.01\r\n", " },\r\n", " \"semantic_vae_dim\": 128,\r\n", " \"tie_word_embeddings\": false,\r\n", " \"torch_dtype\": \"bfloat16\",\r\n", " \"transformers_version\": \"4.51.3\"\r\n", "}\r\n", "\r\n", "loading weights file /root/models/VibeVoice-Large/model.safetensors.index.json\r\n", "Instantiating VibeVoiceForConditionalGenerationInference model under default 
dtype torch.bfloat16.\r\n", "Generate config GenerationConfig {}\r\n", "\r\n", "Instantiating Qwen2Model model under default dtype torch.bfloat16.\r\n", "Instantiating VibeVoiceAcousticTokenizerModel model under default dtype torch.bfloat16.\r\n", "Instantiating VibeVoiceSemanticTokenizerModel model under default dtype torch.bfloat16.\r\n", "Instantiating VibeVoiceDiffusionHead model under default dtype torch.bfloat16.\r\n", "\rLoading checkpoint shards: 0%| | 0/10 [00:00 https://89c767c53e806c2545.gradio.live\r\n" ] } ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Alternative: Test Built-in Voices\n", "\n", "If you want to test the Arabic LoRA with built-in voices (not your custom voice):" ] }, { "cell_type": "code", "metadata": { "collapsed": false, "scrolled": true }, "source": [ "# Create test text\n", "import os\n", "text = \"\"\"Speaker 1: \u0645\u0631\u062d\u0628\u0627\u064b \u0628\u0643\u0645\u060c \u0627\u0633\u0645\u064a \u0633\u0627\u0645\u064a.\n", "\u0623\u0646\u0627 \u0627\u0644\u0622\u0646 \u0623\u062e\u062a\u0628\u0631 \u062a\u0642\u0646\u064a\u0629 \u062c\u062f\u064a\u062f\u0629 \u0644\u062a\u062d\u0648\u064a\u0644 \u0627\u0644\u0646\u0635 \u0625\u0644\u0649 \u0643\u0644\u0627\u0645.\n", "\n", "\u0643\u064a\u0641 \u064a\u0628\u062f\u0648 \u0635\u0648\u062a\u064a\u061f\n", "\u0647\u0644 \u062a\u0633\u0645\u0639 \u0627\u0644\u0646\u0628\u0631\u0629 \u0627\u0644\u0637\u0628\u064a\u0639\u064a\u0629 \u0641\u064a \u062d\u062f\u064a\u062b\u064a\u061f\n", "\n", "\u0627\u0644\u0623\u0631\u062f\u0646 \u0628\u0644\u062f \u0627\u0644\u062c\u0628\u0627\u0644 \u0648\u0627\u0644\u0628\u062d\u0631 \u0648\u0627\u0644\u0635\u062d\u0631\u0627\u0621\u060c\n", "\u0648\u0641\u064a \u0643\u0644 \u0645\u062f\u064a\u0646\u0629\u064d \u0642\u0635\u0629\u060c \u0648\u0641\u064a \u0643\u0644 \u0634\u0627\u0631\u0639\u064d \u062d\u0643\u0627\u064a\u0629.\n", "\n", "\u0627\u0644\u062d\u064a\u0627\u0629 \u0631\u062d\u0644\u0629 
\u0646\u062a\u0639\u0644\u0651\u0645 \u0645\u0646\u0647\u0627 \u0643\u0644 \u064a\u0648\u0645\u060c\n", "\u0641\u0644\u0646\u0628\u062a\u0633\u0645 \u0627\u0644\u0622\u0646\u2026 \u0648\u0644\u0646\u0628\u062f\u0623 \u0645\u0646 \u062c\u062f\u064a\u062f.\\nSpeaker 2: \u0623\u0646\u0627 \u0628\u062e\u064a\u0631 \u0634\u0643\u0631\u0627\"\"\"\n", "with open('/root/test.txt', 'w', encoding='utf-8') as f:\n", " f.write(text)" ], "execution_count": 11, "outputs": [] }, { "cell_type": "code", "metadata": { "collapsed": false, "scrolled": true }, "source": [ "# WITH LoRA (built-in Alice voice)\n", "os.makedirs('/root/outputs/builtin_with_lora', exist_ok=True)\n", "!python /root/VibeVoice/demo/inference_from_file.py \\\n", " --model_path /root/models/VibeVoice-Large \\\n", " --txt_path /root/test.txt \\\n", " --speaker_names Alice Frank \\\n", " --checkpoint_path /root/models/vibevoice-arabic-Z \\\n", " --output_dir /root/outputs/builtin_with_lora" ], "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "APEX FusedRMSNorm not available, using native implementation\r\n", "Using device: cuda\r\n", "Found 9 voice files in /root/VibeVoice/demo/voices\r\n", "Available voices: en-Alice_woman, en-Carter_man, en-Frank_man, en-Mary_woman_bgm, en-Maya_woman, in-Samuel_man, zh-Anchen_man_bgm, zh-Bowen_man, zh-Xinran_woman\r\n", "Reading script from: /root/test.txt\r\n", "Found 2 speaker segments:\r\n", " 1. Speaker 1\r\n", " Text preview: Speaker 1: \u0645\u0631\u062d\u0628\u0627\u064b \u0628\u0643\u0645\u060c \u0627\u0633\u0645\u064a \u0633\u0627\u0645\u064a. \u0623\u0646\u0627 \u0627\u0644\u0622\u0646 \u0623\u062e\u062a\u0628\u0631 \u062a\u0642\u0646\u064a\u0629 \u062c\u062f\u064a\u062f\u0629 \u0644\u062a\u062d\u0648\u064a\u0644 \u0627\u0644\u0646\u0635 \u0625\u0644\u0649 \u0643\u0644\u0627\u0645. \u0643\u064a\u0641 \u064a\u0628\u062f\u0648 \u0635\u0648\u062a\u064a\u061f \u0647\u0644...\r\n", " 2. 
Speaker 2\r\n", " Text preview: Speaker 2: \u0623\u0646\u0627 \u0628\u062e\u064a\u0631 \u0634\u0643\u0631\u0627...\r\n", "\r\n", "Speaker mapping:\r\n", " Speaker 2 -> Frank\r\n", " Speaker 1 -> Alice\r\n", "Speaker 1 ('Alice') -> Voice: en-Alice_woman.wav\r\n", "Speaker 2 ('Frank') -> Voice: en-Frank_man.wav\r\n", "Loading processor & model from /root/models/VibeVoice-Large\r\n", "loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/vocab.json\r\n", "loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/merges.txt\r\n", "loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/tokenizer.json\r\n", "loading file added_tokens.json from cache at None\r\n", "loading file special_tokens_map.json from cache at None\r\n", "loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/tokenizer_config.json\r\n", "loading file chat_template.jinja from cache at None\r\n", "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \r\n", "The tokenizer class you load from this checkpoint is 'Qwen2Tokenizer'. 
\r\n", "The class this function is called from is 'VibeVoiceTextTokenizerFast'.\r\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\r\n", "Using device: cuda, torch_dtype: torch.bfloat16, attn_implementation: flash_attention_2\r\n", "loading configuration file /root/models/VibeVoice-Large/config.json\r\n", "Model config VibeVoiceConfig {\r\n", " \"acostic_vae_dim\": 64,\r\n", " \"acoustic_tokenizer_config\": {\r\n", " \"causal\": true,\r\n", " \"channels\": 1,\r\n", " \"conv_bias\": true,\r\n", " \"conv_norm\": \"none\",\r\n", " \"corpus_normalize\": 0.0,\r\n", " \"decoder_depths\": null,\r\n", " \"decoder_n_filters\": 32,\r\n", " \"decoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"disable_last_norm\": true,\r\n", " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", " \"encoder_n_filters\": 32,\r\n", " \"encoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"fix_std\": 0.5,\r\n", " \"layer_scale_init_value\": 1e-06,\r\n", " \"layernorm\": \"RMSNorm\",\r\n", " \"layernorm_elementwise_affine\": true,\r\n", " \"layernorm_eps\": 1e-05,\r\n", " \"mixer_layer\": \"depthwise_conv\",\r\n", " \"model_type\": \"vibevoice_acoustic_tokenizer\",\r\n", " \"pad_mode\": \"constant\",\r\n", " \"std_dist_type\": \"gaussian\",\r\n", " \"vae_dim\": 64,\r\n", " \"weight_init_value\": 0.01\r\n", " },\r\n", " \"acoustic_vae_dim\": 64,\r\n", " \"architectures\": [\r\n", " \"VibeVoiceForConditionalGeneration\"\r\n", " ],\r\n", " \"decoder_config\": {\r\n", " \"attention_dropout\": 0.0,\r\n", " \"hidden_act\": \"silu\",\r\n", " \"hidden_size\": 3584,\r\n", " \"initializer_range\": 0.02,\r\n", " \"intermediate_size\": 18944,\r\n", " \"max_position_embeddings\": 32768,\r\n", " \"max_window_layers\": 28,\r\n", " \"model_type\": \"qwen2\",\r\n", " \"num_attention_heads\": 28,\r\n", " \"num_hidden_layers\": 28,\r\n", " 
\"num_key_value_heads\": 4,\r\n", " \"rms_norm_eps\": 1e-06,\r\n", " \"rope_scaling\": null,\r\n", " \"rope_theta\": 1000000.0,\r\n", " \"sliding_window\": null,\r\n", " \"torch_dtype\": \"bfloat16\",\r\n", " \"use_cache\": true,\r\n", " \"use_mrope\": false,\r\n", " \"use_sliding_window\": false,\r\n", " \"vocab_size\": 152064\r\n", " },\r\n", " \"diffusion_head_config\": {\r\n", " \"ddpm_batch_mul\": 4,\r\n", " \"ddpm_beta_schedule\": \"cosine\",\r\n", " \"ddpm_num_inference_steps\": 20,\r\n", " \"ddpm_num_steps\": 1000,\r\n", " \"diffusion_type\": \"ddpm\",\r\n", " \"head_ffn_ratio\": 3.0,\r\n", " \"head_layers\": 4,\r\n", " \"hidden_size\": 3584,\r\n", " \"latent_size\": 64,\r\n", " \"model_type\": \"vibevoice_diffusion_head\",\r\n", " \"prediction_type\": \"v_prediction\",\r\n", " \"rms_norm_eps\": 1e-05,\r\n", " \"speech_vae_dim\": 64\r\n", " },\r\n", " \"model_type\": \"vibevoice\",\r\n", " \"semantic_tokenizer_config\": {\r\n", " \"causal\": true,\r\n", " \"channels\": 1,\r\n", " \"conv_bias\": true,\r\n", " \"conv_norm\": \"none\",\r\n", " \"corpus_normalize\": 0.0,\r\n", " \"disable_last_norm\": true,\r\n", " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", " \"encoder_n_filters\": 32,\r\n", " \"encoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"fix_std\": 0,\r\n", " \"layer_scale_init_value\": 1e-06,\r\n", " \"layernorm\": \"RMSNorm\",\r\n", " \"layernorm_elementwise_affine\": true,\r\n", " \"layernorm_eps\": 1e-05,\r\n", " \"mixer_layer\": \"depthwise_conv\",\r\n", " \"model_type\": \"vibevoice_semantic_tokenizer\",\r\n", " \"pad_mode\": \"constant\",\r\n", " \"std_dist_type\": \"none\",\r\n", " \"vae_dim\": 128,\r\n", " \"weight_init_value\": 0.01\r\n", " },\r\n", " \"semantic_vae_dim\": 128,\r\n", " \"tie_word_embeddings\": false,\r\n", " \"torch_dtype\": \"bfloat16\",\r\n", " \"transformers_version\": \"4.51.3\"\r\n", "}\r\n", "\r\n", "loading weights file 
/root/models/VibeVoice-Large/model.safetensors.index.json\r\n", "Instantiating VibeVoiceForConditionalGenerationInference model under default dtype torch.bfloat16.\r\n", "[ERROR] : ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\r\n", "Traceback (most recent call last):\r\n", " File \"/root/VibeVoice/demo/inference_from_file.py\", line 305, in main\r\n", " model = VibeVoiceForConditionalGenerationInference.from_pretrained(\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 279, in _wrapper\r\n", " return func(*args, **kwargs)\r\n", " ^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 4336, in from_pretrained\r\n", " config = cls._autoset_attn_implementation(\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 2109, in _autoset_attn_implementation\r\n", " cls._check_and_enable_flash_attn_2(\r\n", " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 2252, in _check_and_enable_flash_attn_2\r\n", " raise ImportError(f\"{preface} the package flash_attn seems to be not installed. {install_message}\")\r\n", "ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\r\n", "\r\n", "Error loading the model. Trying to use SDPA. 
However, note that only flash_attention_2 has been fully tested, and using SDPA may result in lower audio quality.\r\n", "loading configuration file /root/models/VibeVoice-Large/config.json\r\n", "Model config VibeVoiceConfig {\r\n", " \"acostic_vae_dim\": 64,\r\n", " \"acoustic_tokenizer_config\": {\r\n", " \"causal\": true,\r\n", " \"channels\": 1,\r\n", " \"conv_bias\": true,\r\n", " \"conv_norm\": \"none\",\r\n", " \"corpus_normalize\": 0.0,\r\n", " \"decoder_depths\": null,\r\n", " \"decoder_n_filters\": 32,\r\n", " \"decoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"disable_last_norm\": true,\r\n", " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", " \"encoder_n_filters\": 32,\r\n", " \"encoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"fix_std\": 0.5,\r\n", " \"layer_scale_init_value\": 1e-06,\r\n", " \"layernorm\": \"RMSNorm\",\r\n", " \"layernorm_elementwise_affine\": true,\r\n", " \"layernorm_eps\": 1e-05,\r\n", " \"mixer_layer\": \"depthwise_conv\",\r\n", " \"model_type\": \"vibevoice_acoustic_tokenizer\",\r\n", " \"pad_mode\": \"constant\",\r\n", " \"std_dist_type\": \"gaussian\",\r\n", " \"vae_dim\": 64,\r\n", " \"weight_init_value\": 0.01\r\n", " },\r\n", " \"acoustic_vae_dim\": 64,\r\n", " \"architectures\": [\r\n", " \"VibeVoiceForConditionalGeneration\"\r\n", " ],\r\n", " \"decoder_config\": {\r\n", " \"attention_dropout\": 0.0,\r\n", " \"hidden_act\": \"silu\",\r\n", " \"hidden_size\": 3584,\r\n", " \"initializer_range\": 0.02,\r\n", " \"intermediate_size\": 18944,\r\n", " \"max_position_embeddings\": 32768,\r\n", " \"max_window_layers\": 28,\r\n", " \"model_type\": \"qwen2\",\r\n", " \"num_attention_heads\": 28,\r\n", " \"num_hidden_layers\": 28,\r\n", " \"num_key_value_heads\": 4,\r\n", " \"rms_norm_eps\": 1e-06,\r\n", " \"rope_scaling\": null,\r\n", " \"rope_theta\": 1000000.0,\r\n", " \"sliding_window\": null,\r\n", " \"torch_dtype\": 
\"bfloat16\",\r\n", " \"use_cache\": true,\r\n", " \"use_mrope\": false,\r\n", " \"use_sliding_window\": false,\r\n", " \"vocab_size\": 152064\r\n", " },\r\n", " \"diffusion_head_config\": {\r\n", " \"ddpm_batch_mul\": 4,\r\n", " \"ddpm_beta_schedule\": \"cosine\",\r\n", " \"ddpm_num_inference_steps\": 20,\r\n", " \"ddpm_num_steps\": 1000,\r\n", " \"diffusion_type\": \"ddpm\",\r\n", " \"head_ffn_ratio\": 3.0,\r\n", " \"head_layers\": 4,\r\n", " \"hidden_size\": 3584,\r\n", " \"latent_size\": 64,\r\n", " \"model_type\": \"vibevoice_diffusion_head\",\r\n", " \"prediction_type\": \"v_prediction\",\r\n", " \"rms_norm_eps\": 1e-05,\r\n", " \"speech_vae_dim\": 64\r\n", " },\r\n", " \"model_type\": \"vibevoice\",\r\n", " \"semantic_tokenizer_config\": {\r\n", " \"causal\": true,\r\n", " \"channels\": 1,\r\n", " \"conv_bias\": true,\r\n", " \"conv_norm\": \"none\",\r\n", " \"corpus_normalize\": 0.0,\r\n", " \"disable_last_norm\": true,\r\n", " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", " \"encoder_n_filters\": 32,\r\n", " \"encoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"fix_std\": 0,\r\n", " \"layer_scale_init_value\": 1e-06,\r\n", " \"layernorm\": \"RMSNorm\",\r\n", " \"layernorm_elementwise_affine\": true,\r\n", " \"layernorm_eps\": 1e-05,\r\n", " \"mixer_layer\": \"depthwise_conv\",\r\n", " \"model_type\": \"vibevoice_semantic_tokenizer\",\r\n", " \"pad_mode\": \"constant\",\r\n", " \"std_dist_type\": \"none\",\r\n", " \"vae_dim\": 128,\r\n", " \"weight_init_value\": 0.01\r\n", " },\r\n", " \"semantic_vae_dim\": 128,\r\n", " \"tie_word_embeddings\": false,\r\n", " \"torch_dtype\": \"bfloat16\",\r\n", " \"transformers_version\": \"4.51.3\"\r\n", "}\r\n", "\r\n", "loading weights file /root/models/VibeVoice-Large/model.safetensors.index.json\r\n", "Instantiating VibeVoiceForConditionalGenerationInference model under default dtype torch.bfloat16.\r\n", "Generate config GenerationConfig {}\r\n", "\r\n", 
"Instantiating Qwen2Model model under default dtype torch.bfloat16.\r\n", "Instantiating VibeVoiceAcousticTokenizerModel model under default dtype torch.bfloat16.\r\n", "Instantiating VibeVoiceSemanticTokenizerModel model under default dtype torch.bfloat16.\r\n", "Instantiating VibeVoiceDiffusionHead model under default dtype torch.bfloat16.\r\n", "\rLoading checkpoint shards: 0%| | 0/10 [00:00 Frank\r\n", " Speaker 1 -> Alice\r\n", "Speaker 1 ('Alice') -> Voice: en-Alice_woman.wav\r\n", "Speaker 2 ('Frank') -> Voice: en-Frank_man.wav\r\n", "Loading processor & model from /root/models/VibeVoice-Large\r\n", "loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/vocab.json\r\n", "loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/merges.txt\r\n", "loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/tokenizer.json\r\n", "loading file added_tokens.json from cache at None\r\n", "loading file special_tokens_map.json from cache at None\r\n", "loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/tokenizer_config.json\r\n", "loading file chat_template.jinja from cache at None\r\n", "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \r\n", "The tokenizer class you load from this checkpoint is 'Qwen2Tokenizer'. 
\r\n", "The class this function is called from is 'VibeVoiceTextTokenizerFast'.\r\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\r\n", "Using device: cuda, torch_dtype: torch.bfloat16, attn_implementation: flash_attention_2\r\n", "loading configuration file /root/models/VibeVoice-Large/config.json\r\n", "Model config VibeVoiceConfig {\r\n", " \"acostic_vae_dim\": 64,\r\n", " \"acoustic_tokenizer_config\": {\r\n", " \"causal\": true,\r\n", " \"channels\": 1,\r\n", " \"conv_bias\": true,\r\n", " \"conv_norm\": \"none\",\r\n", " \"corpus_normalize\": 0.0,\r\n", " \"decoder_depths\": null,\r\n", " \"decoder_n_filters\": 32,\r\n", " \"decoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"disable_last_norm\": true,\r\n", " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", " \"encoder_n_filters\": 32,\r\n", " \"encoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"fix_std\": 0.5,\r\n", " \"layer_scale_init_value\": 1e-06,\r\n", " \"layernorm\": \"RMSNorm\",\r\n", " \"layernorm_elementwise_affine\": true,\r\n", " \"layernorm_eps\": 1e-05,\r\n", " \"mixer_layer\": \"depthwise_conv\",\r\n", " \"model_type\": \"vibevoice_acoustic_tokenizer\",\r\n", " \"pad_mode\": \"constant\",\r\n", " \"std_dist_type\": \"gaussian\",\r\n", " \"vae_dim\": 64,\r\n", " \"weight_init_value\": 0.01\r\n", " },\r\n", " \"acoustic_vae_dim\": 64,\r\n", " \"architectures\": [\r\n", " \"VibeVoiceForConditionalGeneration\"\r\n", " ],\r\n", " \"decoder_config\": {\r\n", " \"attention_dropout\": 0.0,\r\n", " \"hidden_act\": \"silu\",\r\n", " \"hidden_size\": 3584,\r\n", " \"initializer_range\": 0.02,\r\n", " \"intermediate_size\": 18944,\r\n", " \"max_position_embeddings\": 32768,\r\n", " \"max_window_layers\": 28,\r\n", " \"model_type\": \"qwen2\",\r\n", " \"num_attention_heads\": 28,\r\n", " \"num_hidden_layers\": 28,\r\n", " 
\"num_key_value_heads\": 4,\r\n", " \"rms_norm_eps\": 1e-06,\r\n", " \"rope_scaling\": null,\r\n", " \"rope_theta\": 1000000.0,\r\n", " \"sliding_window\": null,\r\n", " \"torch_dtype\": \"bfloat16\",\r\n", " \"use_cache\": true,\r\n", " \"use_mrope\": false,\r\n", " \"use_sliding_window\": false,\r\n", " \"vocab_size\": 152064\r\n", " },\r\n", " \"diffusion_head_config\": {\r\n", " \"ddpm_batch_mul\": 4,\r\n", " \"ddpm_beta_schedule\": \"cosine\",\r\n", " \"ddpm_num_inference_steps\": 20,\r\n", " \"ddpm_num_steps\": 1000,\r\n", " \"diffusion_type\": \"ddpm\",\r\n", " \"head_ffn_ratio\": 3.0,\r\n", " \"head_layers\": 4,\r\n", " \"hidden_size\": 3584,\r\n", " \"latent_size\": 64,\r\n", " \"model_type\": \"vibevoice_diffusion_head\",\r\n", " \"prediction_type\": \"v_prediction\",\r\n", " \"rms_norm_eps\": 1e-05,\r\n", " \"speech_vae_dim\": 64\r\n", " },\r\n", " \"model_type\": \"vibevoice\",\r\n", " \"semantic_tokenizer_config\": {\r\n", " \"causal\": true,\r\n", " \"channels\": 1,\r\n", " \"conv_bias\": true,\r\n", " \"conv_norm\": \"none\",\r\n", " \"corpus_normalize\": 0.0,\r\n", " \"disable_last_norm\": true,\r\n", " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", " \"encoder_n_filters\": 32,\r\n", " \"encoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"fix_std\": 0,\r\n", " \"layer_scale_init_value\": 1e-06,\r\n", " \"layernorm\": \"RMSNorm\",\r\n", " \"layernorm_elementwise_affine\": true,\r\n", " \"layernorm_eps\": 1e-05,\r\n", " \"mixer_layer\": \"depthwise_conv\",\r\n", " \"model_type\": \"vibevoice_semantic_tokenizer\",\r\n", " \"pad_mode\": \"constant\",\r\n", " \"std_dist_type\": \"none\",\r\n", " \"vae_dim\": 128,\r\n", " \"weight_init_value\": 0.01\r\n", " },\r\n", " \"semantic_vae_dim\": 128,\r\n", " \"tie_word_embeddings\": false,\r\n", " \"torch_dtype\": \"bfloat16\",\r\n", " \"transformers_version\": \"4.51.3\"\r\n", "}\r\n", "\r\n", "loading weights file 
/root/models/VibeVoice-Large/model.safetensors.index.json\r\n", "Instantiating VibeVoiceForConditionalGenerationInference model under default dtype torch.bfloat16.\r\n", "[ERROR] : ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\r\n", "Traceback (most recent call last):\r\n", " File \"/root/VibeVoice/demo/inference_from_file.py\", line 305, in main\r\n", " model = VibeVoiceForConditionalGenerationInference.from_pretrained(\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 279, in _wrapper\r\n", " return func(*args, **kwargs)\r\n", " ^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 4336, in from_pretrained\r\n", " config = cls._autoset_attn_implementation(\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 2109, in _autoset_attn_implementation\r\n", " cls._check_and_enable_flash_attn_2(\r\n", " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 2252, in _check_and_enable_flash_attn_2\r\n", " raise ImportError(f\"{preface} the package flash_attn seems to be not installed. {install_message}\")\r\n", "ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\r\n", "\r\n", "Error loading the model. Trying to use SDPA. 
However, note that only flash_attention_2 has been fully tested, and using SDPA may result in lower audio quality.\r\n", "loading configuration file /root/models/VibeVoice-Large/config.json\r\n", "Model config VibeVoiceConfig {\r\n", " \"acostic_vae_dim\": 64,\r\n", " \"acoustic_tokenizer_config\": {\r\n", " \"causal\": true,\r\n", " \"channels\": 1,\r\n", " \"conv_bias\": true,\r\n", " \"conv_norm\": \"none\",\r\n", " \"corpus_normalize\": 0.0,\r\n", " \"decoder_depths\": null,\r\n", " \"decoder_n_filters\": 32,\r\n", " \"decoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"disable_last_norm\": true,\r\n", " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", " \"encoder_n_filters\": 32,\r\n", " \"encoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"fix_std\": 0.5,\r\n", " \"layer_scale_init_value\": 1e-06,\r\n", " \"layernorm\": \"RMSNorm\",\r\n", " \"layernorm_elementwise_affine\": true,\r\n", " \"layernorm_eps\": 1e-05,\r\n", " \"mixer_layer\": \"depthwise_conv\",\r\n", " \"model_type\": \"vibevoice_acoustic_tokenizer\",\r\n", " \"pad_mode\": \"constant\",\r\n", " \"std_dist_type\": \"gaussian\",\r\n", " \"vae_dim\": 64,\r\n", " \"weight_init_value\": 0.01\r\n", " },\r\n", " \"acoustic_vae_dim\": 64,\r\n", " \"architectures\": [\r\n", " \"VibeVoiceForConditionalGeneration\"\r\n", " ],\r\n", " \"decoder_config\": {\r\n", " \"attention_dropout\": 0.0,\r\n", " \"hidden_act\": \"silu\",\r\n", " \"hidden_size\": 3584,\r\n", " \"initializer_range\": 0.02,\r\n", " \"intermediate_size\": 18944,\r\n", " \"max_position_embeddings\": 32768,\r\n", " \"max_window_layers\": 28,\r\n", " \"model_type\": \"qwen2\",\r\n", " \"num_attention_heads\": 28,\r\n", " \"num_hidden_layers\": 28,\r\n", " \"num_key_value_heads\": 4,\r\n", " \"rms_norm_eps\": 1e-06,\r\n", " \"rope_scaling\": null,\r\n", " \"rope_theta\": 1000000.0,\r\n", " \"sliding_window\": null,\r\n", " \"torch_dtype\": 
\"bfloat16\",\r\n", " \"use_cache\": true,\r\n", " \"use_mrope\": false,\r\n", " \"use_sliding_window\": false,\r\n", " \"vocab_size\": 152064\r\n", " },\r\n", " \"diffusion_head_config\": {\r\n", " \"ddpm_batch_mul\": 4,\r\n", " \"ddpm_beta_schedule\": \"cosine\",\r\n", " \"ddpm_num_inference_steps\": 20,\r\n", " \"ddpm_num_steps\": 1000,\r\n", " \"diffusion_type\": \"ddpm\",\r\n", " \"head_ffn_ratio\": 3.0,\r\n", " \"head_layers\": 4,\r\n", " \"hidden_size\": 3584,\r\n", " \"latent_size\": 64,\r\n", " \"model_type\": \"vibevoice_diffusion_head\",\r\n", " \"prediction_type\": \"v_prediction\",\r\n", " \"rms_norm_eps\": 1e-05,\r\n", " \"speech_vae_dim\": 64\r\n", " },\r\n", " \"model_type\": \"vibevoice\",\r\n", " \"semantic_tokenizer_config\": {\r\n", " \"causal\": true,\r\n", " \"channels\": 1,\r\n", " \"conv_bias\": true,\r\n", " \"conv_norm\": \"none\",\r\n", " \"corpus_normalize\": 0.0,\r\n", " \"disable_last_norm\": true,\r\n", " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", " \"encoder_n_filters\": 32,\r\n", " \"encoder_ratios\": [\r\n", " 8,\r\n", " 5,\r\n", " 5,\r\n", " 4,\r\n", " 2,\r\n", " 2\r\n", " ],\r\n", " \"fix_std\": 0,\r\n", " \"layer_scale_init_value\": 1e-06,\r\n", " \"layernorm\": \"RMSNorm\",\r\n", " \"layernorm_elementwise_affine\": true,\r\n", " \"layernorm_eps\": 1e-05,\r\n", " \"mixer_layer\": \"depthwise_conv\",\r\n", " \"model_type\": \"vibevoice_semantic_tokenizer\",\r\n", " \"pad_mode\": \"constant\",\r\n", " \"std_dist_type\": \"none\",\r\n", " \"vae_dim\": 128,\r\n", " \"weight_init_value\": 0.01\r\n", " },\r\n", " \"semantic_vae_dim\": 128,\r\n", " \"tie_word_embeddings\": false,\r\n", " \"torch_dtype\": \"bfloat16\",\r\n", " \"transformers_version\": \"4.51.3\"\r\n", "}\r\n", "\r\n", "loading weights file /root/models/VibeVoice-Large/model.safetensors.index.json\r\n", "Instantiating VibeVoiceForConditionalGenerationInference model under default dtype torch.bfloat16.\r\n", "Generate config GenerationConfig {}\r\n", "\r\n", 
"Instantiating Qwen2Model model under default dtype torch.bfloat16.\r\n", "Instantiating VibeVoiceAcousticTokenizerModel model under default dtype torch.bfloat16.\r\n", "Instantiating VibeVoiceSemanticTokenizerModel model under default dtype torch.bfloat16.\r\n", "Instantiating VibeVoiceDiffusionHead model under default dtype torch.bfloat16.\r\n", "\rLoading checkpoint shards: 0%| | 0/10 [00:00WITH LoRA (Alice)\"))\n", "display(Audio(\"/root/outputs/builtin_with_lora/test_generated.wav\"))\n", "display(HTML(\"

WITHOUT LoRA (Alice)

\"))\n", "display(Audio(\"/root/outputs/builtin_without_lora/test_generated.wav\"))" ], "execution_count": 16, "outputs": [ { "output_type": "display_data", "data": { "text/html": "

WITH LoRA (Alice)

", "text/plain": "" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/html": "\n \n ", "text/plain": "" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/html": "

WITHOUT LoRA (Alice)

", "text/plain": "" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/html": "\n \n ", "text/plain": "" }, "metadata": {} } ] } ], "metadata": { "kernelspec": { "display_name": "Python", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3" } }, "nbformat": 4, "nbformat_minor": 5 }