import io
import os

import spaces

os.environ['VLLM_USE_V1'] = '0'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

from argparse import ArgumentParser

import gradio as gr
import gradio.processing_utils as processing_utils
import numpy as np
import soundfile as sf
from gradio_client import utils as client_utils
import torch

# Transformers and Qwen Omni-Utils imports for local inference
from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info


def _load_model_processor(args):
    """
    Loads the Qwen3-Omni model and processor from Hugging Face
    using the transformers library.
    """
    print(f"Loading model from: {args.checkpoint_path}")

    # Model loading configuration
    device_map = "cuda" if torch.cuda.is_available() and not args.cpu_only else "cpu"
    model_kwargs = {
        "dtype": "auto",
        "device_map": device_map,
        "trust_remote_code": True,
    }

    # Use flash attention 2 if available and enabled for better performance
    if args.flash_attn2 and torch.cuda.is_available():
        model_kwargs["attn_implementation"] = "flash_attention_2"

    # Load the model and processor
    model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
        args.checkpoint_path, **model_kwargs
    )
    processor = Qwen3OmniMoeProcessor.from_pretrained(args.checkpoint_path)
    print("Model and processor loaded successfully.")
    return model, processor


def _launch_demo(args, model, processor):
    # Voice settings updated for the transformers model
    VOICE_OPTIONS = {
        "Ethan (Male)": "Ethan",
        "Chelsie (Female)": "Chelsie",
        "Aiden (Male)": "Aiden",
    }
    DEFAULT_VOICE = 'Ethan (Male)'
    default_system_prompt = ''

    def to_mp4(path):
        """Converts webm video files to mp4 for compatibility."""
        import subprocess
        if path and path.endswith(".webm"):
            mp4_path = path.replace(".webm", ".mp4")
            try:
                subprocess.run([
                    "ffmpeg", "-y", "-i", path,
                    "-c:v", "libx264", "-preset", "ultrafast",
                    "-pix_fmt", "yuv420p",
                    "-c:a", "aac", "-b:a", "128k",
                    mp4_path
                ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                return mp4_path
            except (subprocess.CalledProcessError, FileNotFoundError):
                print("ffmpeg conversion failed. Returning original path.")
                return path
        return path

    def format_conversation_for_transformers(history: list, system_prompt: str):
        """
        Formats the Gradio chat history into the conversation format
        required by the Qwen3-Omni processor.
""" conversation = [] if system_prompt: conversation.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]}) # Group consecutive user messages current_user_content = [] for item in history: role = item['role'] content = item['content'] if role == "user": if isinstance(content, str) and content: current_user_content.append({"type": "text", "text": content}) elif isinstance(content, tuple) and content[0]: file_path = content[0] mime_type = client_utils.get_mimetype(file_path) if mime_type.startswith("image"): current_user_content.append({"type": "image", "image": file_path}) elif mime_type.startswith("video"): current_user_content.append({"type": "video", "video": to_mp4(file_path)}) elif mime_type.startswith("audio"): current_user_content.append({"type": "audio", "audio": file_path}) elif role == "assistant": if current_user_content: conversation.append({"role": "user", "content": current_user_content}) current_user_content = [] if isinstance(content, str) and content: conversation.append({"role": "assistant", "content": [{"type": "text", "text": content}]}) if current_user_content: conversation.append({"role": "user", "content": current_user_content}) return conversation @spaces.GPU def predict(conversation, voice_choice, temperature, top_p, top_k, return_audio, enable_thinking): """ Runs local inference using the loaded transformers model. """ speaker = VOICE_OPTIONS[voice_choice] use_audio_in_video = True # Consistently process audio from video files text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) audios, images, videos = process_mm_info(conversation, use_audio_in_video=use_audio_in_video) inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=use_audio_in_video) inputs = inputs.to(model.device).to(model.dtype) gen_kwargs = { "speaker": speaker, "thinker_return_dict_in_generate": True, "use_audio_in_video": use_audio_in_video, "return_audio": return_audio, "temperature": float(temperature), "top_p": float(top_p), "top_k": int(top_k), "max_new_tokens": 8192, } text_ids, audio_tensor = model.generate(**inputs, **gen_kwargs) response_text = processor.batch_decode( text_ids.sequences[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True, clean_up_tokenization_spaces=False )[0] yield {"type": "text", "data": response_text} if audio_tensor is not None and return_audio: audio_np = audio_tensor.reshape(-1).detach().cpu().numpy() with io.BytesIO() as wav_io: sf.write(wav_io, audio_np, samplerate=24000, format="WAV") wav_bytes = wav_io.getvalue() audio_path = processing_utils.save_bytes_to_cache( wav_bytes, "audio.wav", cache_dir=demo.GRADIO_CACHE ) yield {"type": "audio", "data": audio_path} @spaces.GPU def chat_predict(text, audio, image, video, history, system_prompt, voice_choice, temperature, top_p, top_k, return_audio=False, enable_thinking=False): if audio: history.append({"role": "user", "content": (audio,)}) if image: history.append({"role": "user", "content": (image,)}) if video: history.append({"role": "user", "content": (video,)}) if text: history.append({"role": "user", "content": text}) yield gr.Textbox(value=None), gr.Audio(value=None), gr.Image(value=None), gr.Video(value=None), history conversation = format_conversation_for_transformers(history, system_prompt) history.append({"role": "assistant", "content": ""}) final_text = "" final_audio_path = None for chunk in predict(conversation, voice_choice, temperature, top_p, top_k, 
                             return_audio, enable_thinking):
            if chunk["type"] == "text":
                final_text = chunk["data"]
                history[-1]["content"] = final_text
                yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), history
            elif chunk["type"] == "audio":
                final_audio_path = chunk["data"]

        if final_audio_path:
            history.append({"role": "assistant", "content": gr.Audio(final_audio_path, autoplay=True)})
            yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), history

    with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"]),
                   css=".gradio-container {max-width: none !important;}") as demo:
        gr.Markdown("# Qwen3-Omni Demo (Local Transformers on HF Spaces)")
        gr.Markdown("**Instructions**: Interact with the locally running model through text, audio, images, or video.")

        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Parameters")
                system_prompt_textbox = gr.Textbox(label="System Prompt",
                                                   value=default_system_prompt,
                                                   lines=4, max_lines=8)
                voice_choice = gr.Dropdown(label="Voice Choice",
                                           choices=list(VOICE_OPTIONS.keys()),
                                           value=DEFAULT_VOICE, visible=True)
                return_audio = gr.Checkbox(
                    label="Return Audio",
                    value=True,
                    interactive=True,
                )
                enable_thinking = gr.Checkbox(
                    label="Enable Thinking",
                    value=False,
                    interactive=True,
                    info="Note: Requires loading the 'Thinking' model variant."
                )
                temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.6, step=0.1)
                top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
                top_k = gr.Slider(label="Top K", minimum=1, maximum=100, value=20, step=1)

            with gr.Column(scale=3):
                chatbot = gr.Chatbot(label="Chat History", height=650, layout="panel",
                                     bubble_full_width=False, render=False, type="messages")
                chatbot.render()

                with gr.Accordion("📎 Click to upload multimodal files", open=False):
                    with gr.Row():
                        audio_input = gr.Audio(sources=["upload", 'microphone'], type="filepath", label="Audio")
                        image_input = gr.Image(sources=["upload", 'webcam'], type="filepath", label="Image")
                        video_input = gr.Video(sources=["upload", 'webcam'], label="Video")

                with gr.Row():
                    text_input = gr.Textbox(show_label=False,
                                            placeholder="Enter text or upload files and press Submit...",
                                            scale=7)
                    submit_btn = gr.Button("Submit", variant="primary", scale=1)
                    clear_btn = gr.Button("Clear", scale=1)

        def clear_history():
            return [], None, None, None, None

        submit_event = gr.on(
            triggers=[submit_btn.click, text_input.submit],
            fn=chat_predict,
            inputs=[text_input, audio_input, image_input, video_input, chatbot,
                    system_prompt_textbox, voice_choice, temperature, top_p, top_k,
                    return_audio, enable_thinking],
            outputs=[text_input, audio_input, image_input, video_input, chatbot]
        )

        clear_btn.click(fn=clear_history,
                        outputs=[chatbot, text_input, audio_input, image_input, video_input])

    demo.queue().launch(share=args.share,
                        inbrowser=args.inbrowser,
                        server_port=args.server_port,
                        server_name=args.server_name)


DEFAULT_CKPT_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"


def _get_args():
    parser = ArgumentParser()
    parser.add_argument('-c', '--checkpoint-path', type=str, default=DEFAULT_CKPT_PATH,
                        help='Hugging Face model checkpoint name or path, default to %(default)r')
    parser.add_argument('--cpu-only', action='store_true', help='Run demo with CPU only')
    parser.add_argument('--flash-attn2', action='store_true', default=True,
                        help='Enable flash_attention_2 when loading the model.')
    parser.add_argument('--share', action='store_true', default=False,
                        help='Create a publicly shareable link for the interface.')
    parser.add_argument('--inbrowser', action='store_true',
                        default=False,
                        help='Automatically launch the interface in a new tab on the default browser.')
    parser.add_argument('--server-port', type=int, default=7860, help='Demo server port.')
    parser.add_argument('--server-name', type=str, default=None,
                        help='Demo server name.')  # Set to None for Spaces
    args = parser.parse_args([])  # Use empty list for args when running in Spaces
    return args


if __name__ == "__main__":
    args = _get_args()
    model, processor = _load_model_processor(args)
    _launch_demo(args, model, processor)