{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "b92d046f", "metadata": {}, "outputs": [], "source": [ "import os\n", "os.environ['VLLM_USE_V1'] = '0'\n", "os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'\n", "os.environ[\"VLLM_LOGGING_LEVEL\"] = \"ERROR\"\n", "os.environ['CUDA_VISIBLE_DEVICES'] = \"0\"\n", "import torch\n", "import warnings\n", "import numpy as np\n", "\n", "warnings.filterwarnings('ignore')\n", "warnings.filterwarnings('ignore', category=DeprecationWarning)\n", "warnings.filterwarnings('ignore', category=FutureWarning)\n", "warnings.filterwarnings('ignore', category=UserWarning)\n", "\n", "from qwen_omni_utils import process_mm_info\n", "from transformers import Qwen3OmniMoeProcessor\n", "\n", "def _load_model_processor():\n", " if USE_TRANSFORMERS:\n", " from transformers import Qwen3OmniMoeForConditionalGeneration\n", " if TRANSFORMERS_USE_FLASH_ATTN2:\n", " model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(MODEL_PATH,\n", " dtype='auto',\n", " attn_implementation='flash_attention_2',\n", " device_map=\"auto\")\n", " else:\n", " model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(MODEL_PATH, device_map=\"auto\", dtype='auto')\n", " else:\n", " from vllm import LLM\n", " model = LLM(\n", " model=MODEL_PATH, trust_remote_code=True, gpu_memory_utilization=0.95,\n", " tensor_parallel_size=torch.cuda.device_count(),\n", " limit_mm_per_prompt={'image': 1, 'video': 3, 'audio': 3},\n", " max_num_seqs=1,\n", " max_model_len=8192,\n", " seed=1234,\n", " )\n", "\n", " processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)\n", " return model, processor\n", "\n", "def run_model(model, processor, messages, return_audio, use_audio_in_video):\n", " if USE_TRANSFORMERS:\n", " text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)\n", " audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)\n", " inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors=\"pt\", padding=True, use_audio_in_video=use_audio_in_video)\n", " inputs = inputs.to(model.device).to(model.dtype)\n", " text_ids, audio = model.generate(**inputs,\n", " thinker_return_dict_in_generate=True,\n", " thinker_max_new_tokens=8192,\n", " thinker_do_sample=True,\n", " thinker_top_p=0.95,\n", " thinker_top_k=20,\n", " thinker_temperature=0.6,\n", " speaker=\"Chelsie\",\n", " use_audio_in_video=use_audio_in_video,\n", " return_audio=return_audio)\n", " response = processor.batch_decode(text_ids.sequences[:, inputs[\"input_ids\"].shape[1] :], skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n", " if audio is not None:\n", " audio = np.array(audio.reshape(-1).detach().cpu().numpy() * 32767).astype(np.int16)\n", " return response, audio\n", " else:\n", " from vllm import SamplingParams\n", " sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=20, max_tokens=4096)\n", " text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n", " audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)\n", " inputs = {'prompt': text, 'multi_modal_data': {}, \"mm_processor_kwargs\": {\"use_audio_in_video\": use_audio_in_video}}\n", " if images is not None: inputs['multi_modal_data']['image'] = images\n", " if videos is not None: inputs['multi_modal_data']['video'] = videos\n", " if audios is not None: inputs['multi_modal_data']['audio'] = audios\n", " outputs = model.generate(inputs, sampling_params=sampling_params)\n", " response = outputs[0].outputs[0].text\n", " return response, None\n" ] }, { "cell_type": "code", "execution_count": null, "id": "d37dcedc", "metadata": {}, "outputs": [], "source": [ "import librosa\n", "import audioread\n", "\n", "from IPython.display import Audio\n", "\n", "MODEL_PATH = \"NandemoGHS/Anime-Speech-Japanese-Refiner-FP8-DYNAMIC\"\n", "\n", "USE_TRANSFORMERS = False\n", "TRANSFORMERS_USE_FLASH_ATTN2 = True\n", "\n", "model, processor = _load_model_processor()\n", "\n", "USE_AUDIO_IN_VIDEO = True" ] }, { "cell_type": "code", "execution_count": null, "id": "5bf60bf5", "metadata": {}, "outputs": [], "source": [ "audio_path = \"https://huggingface.co/NandemoGHS/Anime-Speech-Japanese-Refiner/resolve/main/examples/example1.wav\"\n", "\n", "original_transcription = \"あっ、あぁんっ、好き、大好きですわ…。もっと…はぁ、んんっ、はぁんっ、もっとぉ!\"\n", "\n", "prompt = f\"\"\"これから与えられる音声クリップとその文字起こしについて、声の特徴と読み上げスタイル、感情などをアノテーションしたうえで、日本語の短いキャプションで要約してください。\n", "出力には以下の項目を含めてください。\n", "\n", "profile: 話者プロファイル(例: お姉さん的な女性声/落ち着いた男性声/少女声 等)\n", "mood: 感情・ムード(例: 明るい/落ち着いた/緊張/怒り/恐怖/悲しみ/快楽 等)\n", "speed: 話速(例: とても遅い/やや速い/一定/(1.2×) 等)\n", "prosody: 抑揚・リズム(例: 平坦/メリハリ/語尾上げ下げ/ため息混じり 等)\n", "pitch_timbre: ピッチ/声質(例: 高め/低め/息多め/張りのある/囁き 等)\n", "style: 発話スタイル(例: ナレーション風/会話調/朗読調/プレゼン調/囁き/喘ぎ/嗚咽/叫び 等)\n", "emotion: 感情タグ(次のリストから1つ選択: [\"angry\", \"sad\", \"disdainful\", \"excited\", \"surprised\", \"satisfied\", \"unhappy\", \"anxious\", \"hysterical\", \"delighted\", \"scared\", \"worried\", \"indifferent\", \"upset\", \"impatient\", \"nervous\", \"guilty\", \"scornful\", \"frustrated\", \"depressed\", \"panicked\", \"furious\", \"empathetic\", \"embarrassed\", \"reluctant\", \"disgusted\", \"keen\", \"moved\", \"proud\", \"relaxed\", \"grateful\", \"confident\", \"interested\", \"curious\", \"confused\", \"joyful\", \"disapproving\", \"negative\", \"denying\", \"astonished\", \"serious\", \"sarcastic\", \"conciliative\", \"comforting\", \"sincere\", \"sneering\", \"hesitating\", \"yielding\", \"painful\", \"awkward\", \"amused\", \"loving\", \"dating\", \"longing\", \"aroused\", \"seductive\", \"ecstatic\", \"shy\"])\n", "notes: 特記事項(間の取り方、笑い・ため・ブレス、ノイズ感、キス音、効果音、チュパ音 等)\n", "caption: 上記を1〜2文・全角30〜80文字で自然文に要約\n", "refined_text: 元の文字起こしテキストに、必要に応じて特殊タグを音声中のイベントの描写として文章のどこかに挿入したもの(必要なければ元テキストをそのまま出力)。\n", "\n", "元の文字起こしテキスト: {original_transcription}\n", "元の音声クリップ:\"\"\"\n", "\n", "messages = [\n", " {\n", " \"role\": \"user\",\n", " \"content\": [\n", " {\"type\": \"text\", \"text\": prompt},\n", " {\"type\": \"audio\", \"audio\": audio_path},\n", " ]\n", " }\n", "]\n", "\n", "display(Audio(librosa.load(audioread.ffdec.FFmpegAudioFile(audio_path), sr=16000)[0], rate=16000))\n", "\n", "response, _ = run_model(model=model, messages=messages, processor=processor, return_audio=False, use_audio_in_video=USE_AUDIO_IN_VIDEO)\n", "\n", "print(response)" ] } ], "metadata": { "kernelspec": { "display_name": "venv (3.10.12)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 }