AnyTalker

Paused

App Files Files Community

C4G-HKUST committed 17 days ago

Commit

ff7f08b

1 Parent(s): 0f8cfbd

feat: time out check

Browse files

Files changed (3) hide show

app.py +37 -37
wan/audio2video_multiID.py +2 -2
wan/utils/infer_utils.py +25 -25

app.py CHANGED Viewed

@@ -436,7 +436,7 @@ def run_graio_demo(args):
     logging.info("Model and face processor loaded successfully.")
     def generate_video(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
-                    sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector, fixed_steps=None, trim_to_6s=False):
         # 参考 LivePortrait: 在 worker 进程中直接使用 cuda 设备
         # 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/src/gradio_pipeline.py
         # @spaces.GPU 装饰器已经初始化了 GPU，这里直接使用即可
@@ -483,13 +483,13 @@ def run_graio_demo(args):
                 fps = getattr(cfg, 'fps', 24)
                 calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
-                # Fast模式：如果trim_to_6s为True，强制限制为6秒对应的帧数
-                if trim_to_6s:
-                    # 计算6秒对应的帧数（4n+1格式）
-                    max_frames_6s = int(math.ceil(6.0 * fps))
-                    max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
-                    current_frame_num = min(calculated_frame_num, max_frames_6s)
-                    logging.warning(f"Fast mode: Audio duration exceeds 6 seconds. Trimming to 6 seconds ({max_frames_6s} frames). Original: {calculated_frame_num} frames")
                 else:
                     current_frame_num = calculated_frame_num
@@ -531,7 +531,7 @@ def run_graio_demo(args):
             audio_paths=audio_paths,
             task_key="gradio_output",
             mode=audio_mode_selector,
-            trim_to_6s=trim_to_6s,
         )
         if isinstance(video, dict):
@@ -618,13 +618,13 @@ def run_graio_demo(args):
     # 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/app.py
     # @spaces.GPU 装饰器会自动处理 GPU 初始化，不需要手动初始化
-    # 快速生成模式：180秒，固定10步去噪
-    @spaces.GPU(duration=180)
     def gpu_wrapped_generate_video_fast(*args, **kwargs):
         # 固定使用10步去噪，通过关键字参数传递
         kwargs['fixed_steps'] = 8
-        # Fast模式音频长度检测：检查是否超过6秒
         # 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
         #          sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
         if len(args) >= 11:
@@ -658,27 +658,27 @@ def run_graio_demo(args):
                 if img2vid_audio_3:
                     audio_paths.append(img2vid_audio_3)
-            # 检测音频长度是否超过6秒
             if audio_paths and len(audio_paths) > 0:
                 fps = getattr(cfg, 'fps', 24)
                 try:
                     calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
-                    # 计算6秒对应的帧数
-                    max_frames_6s = int(math.ceil(6.0 * fps))
-                    max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
-                    if calculated_frame_num > max_frames_6s:
-                        # 超过6秒，设置trim_to_6s标记
-                        kwargs['trim_to_6s'] = True
                         calculated_duration = calculated_frame_num / fps
-                        logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 6 seconds limit. Will trim to 6 seconds.")
                     else:
-                        kwargs['trim_to_6s'] = False
                 except Exception as e:
                     logging.warning(f"Failed to check audio duration: {e}")
-                    kwargs['trim_to_6s'] = False
             else:
-                kwargs['trim_to_6s'] = False
         return gpu_wrapped_generate_video_worker(*args, **kwargs)
@@ -760,8 +760,8 @@ def run_graio_demo(args):
                             ⚠️ Important Video Duration Limits
                         </div>
                         <div style="font-size: 14px; color: #856404; line-height: 1.6;">
-                            Fast Mode: Maximum video duration shoule be less than 6 seconds. Videos longer than 6 seconds will timeout.<br>
-                            Quality Mode: Maximum video duration shoule be less than 8 seconds with default 25 denoising steps (You can adjust the denoising steps to generate longer videos).
                         </div>
                     </div>
@@ -837,7 +837,7 @@ def run_graio_demo(args):
                 with gr.Row():
                     run_i2v_button_fast = gr.Button(
-                        "Generate Video (Fast - 180s, 8 steps)",
                         variant="secondary",
                         scale=1
                     )
@@ -848,10 +848,10 @@ def run_graio_demo(args):
                     )
                 gr.Markdown("""
                 **Generation Modes:**
-                - **Fast Mode (up to 180s GPU budget)**: Fixed 8 denoising steps for quick generation. **⚠️ Maximum video duration: 6 seconds. Videos longer than 6 seconds will timeout.**
                 - **Quality Mode (up to 780s GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). **⚠️ Maximum video duration: 8 seconds with default 25 denoising steps.**
-                *Note: The GPU duration (180s/780s) represents the maximum budget allocated, not the actual generation time. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
                 """)
             with gr.Column(scale=2):
@@ -889,7 +889,7 @@ def run_graio_demo(args):
         # 包装函数：处理警告信息显示
         def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
                                     sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
-            # 在开始生成前先检测音频长度，如果超过6秒立即显示警告
             # 根据人数收集音频路径
             audio_paths = []
             if person_num_selector == "1 Person":
@@ -908,19 +908,19 @@ def run_graio_demo(args):
                 if img2vid_audio_3:
                     audio_paths.append(img2vid_audio_3)
-            # 检测音频长度是否超过6秒
             if audio_paths and len(audio_paths) > 0:
                 fps = getattr(cfg, 'fps', 24)
                 try:
                     calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
-                    # 计算6秒对应的帧数
-                    max_frames_6s = int(math.ceil(6.0 * fps))
-                    max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
-                    if calculated_frame_num > max_frames_6s:
-                        # 超过6秒，立即显示警告
                         calculated_duration = calculated_frame_num / fps
-                        warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 6-second limit for Fast Mode. The audio will be automatically trimmed to 6 seconds to prevent timeout."
                         gr.Warning(warning_msg, duration=5)
                 except Exception as e:
                     logging.warning(f"Failed to check audio duration: {e}")
@@ -936,7 +936,7 @@ def run_graio_demo(args):
             result = gpu_wrapped_generate_video_quality(*args)
             return result
-        # 快速生成按钮：180秒，固定10步
         run_i2v_button_fast.click(
             fn=handle_fast_generation,
             inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],

     logging.info("Model and face processor loaded successfully.")
     def generate_video(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
+                    sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector, fixed_steps=None, trim_to_5s=False):
         # 参考 LivePortrait: 在 worker 进程中直接使用 cuda 设备
         # 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/src/gradio_pipeline.py
         # @spaces.GPU 装饰器已经初始化了 GPU，这里直接使用即可
                 fps = getattr(cfg, 'fps', 24)
                 calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
+                # Fast模式：如果trim_to_5s为True，强制限制为5秒对应的帧数
+                if trim_to_5s:
+                    # 计算5秒对应的帧数（4n+1格式）
+                    max_frames_5s = int(math.ceil(5.0 * fps))
+                    max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
+                    current_frame_num = min(calculated_frame_num, max_frames_5s)
+                    logging.warning(f"Fast mode: Audio duration exceeds 5 seconds. Trimming to 5 seconds ({max_frames_5s} frames). Original: {calculated_frame_num} frames")
                 else:
                     current_frame_num = calculated_frame_num
             audio_paths=audio_paths,
             task_key="gradio_output",
             mode=audio_mode_selector,
+            trim_to_5s=trim_to_5s,
         )
         if isinstance(video, dict):
     # 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/app.py
     # @spaces.GPU 装饰器会自动处理 GPU 初始化，不需要手动初始化
+    # Fast generation mode: 150 s GPU budget, fixed 8 denoising steps
+    @spaces.GPU(duration=150)
     def gpu_wrapped_generate_video_fast(*args, **kwargs):
         # Always use a fixed 8 denoising steps, passed via keyword argument
         kwargs['fixed_steps'] = 8
+        # Fast模式音频长度检测：检查是否超过5秒
         # 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
         #          sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
         if len(args) >= 11:
                 if img2vid_audio_3:
                     audio_paths.append(img2vid_audio_3)
+            # 检测音频长度是否超过5秒
             if audio_paths and len(audio_paths) > 0:
                 fps = getattr(cfg, 'fps', 24)
                 try:
                     calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
+                    # 计算5秒对应的帧数
+                    max_frames_5s = int(math.ceil(5.0 * fps))
+                    max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
+                    if calculated_frame_num > max_frames_5s:
+                        # 超过5秒，设置trim_to_5s标记
+                        kwargs['trim_to_5s'] = True
                         calculated_duration = calculated_frame_num / fps
+                        logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 5 seconds limit. Will trim to 5 seconds.")
                     else:
+                        kwargs['trim_to_5s'] = False
                 except Exception as e:
                     logging.warning(f"Failed to check audio duration: {e}")
+                    kwargs['trim_to_5s'] = False
             else:
+                kwargs['trim_to_5s'] = False
         return gpu_wrapped_generate_video_worker(*args, **kwargs)
                             ⚠️ Important Video Duration Limits
                         </div>
                         <div style="font-size: 14px; color: #856404; line-height: 1.6;">
+                            <strong>Fast Mode:</strong> Maximum video duration is <strong>5 seconds</strong>. Videos longer than 5 seconds will be automatically trimmed to 5 seconds.<br>
+                            <strong>Quality Mode:</strong> Maximum video duration is <strong>8 seconds</strong> with default 25 denoising steps.
                         </div>
                     </div>
                 with gr.Row():
                     run_i2v_button_fast = gr.Button(
+                        "Generate Video (Fast - 150s, 8 steps)",
                         variant="secondary",
                         scale=1
                     )
                     )
                 gr.Markdown("""
                 **Generation Modes:**
+                - **Fast Mode (up to 150s GPU budget)**: Fixed 8 denoising steps for quick generation. **⚠️ Maximum video duration: 5 seconds. Videos longer than 5 seconds will be automatically trimmed to 5 seconds.**
                 - **Quality Mode (up to 780s GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). **⚠️ Maximum video duration: 8 seconds with default 25 denoising steps.**
+                *Note: The GPU duration (150s/780s) represents the maximum budget allocated, not the actual generation time. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
                 """)
             with gr.Column(scale=2):
         # 包装函数：处理警告信息显示
         def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
                                     sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
+            # 在开始生成前先检测音频长度，如果超过5秒立即显示警告
             # 根据人数收集音频路径
             audio_paths = []
             if person_num_selector == "1 Person":
                 if img2vid_audio_3:
                     audio_paths.append(img2vid_audio_3)
+            # 检测音频长度是否超过5秒
             if audio_paths and len(audio_paths) > 0:
                 fps = getattr(cfg, 'fps', 24)
                 try:
                     calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
+                    # 计算5秒对应的帧数
+                    max_frames_5s = int(math.ceil(5.0 * fps))
+                    max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
+                    if calculated_frame_num > max_frames_5s:
+                        # 超过5秒，立即显示警告
                         calculated_duration = calculated_frame_num / fps
+                        warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 5-second limit for Fast Mode. The audio will be automatically trimmed to 5 seconds to prevent timeout."
                         gr.Warning(warning_msg, duration=5)
                 except Exception as e:
                     logging.warning(f"Failed to check audio duration: {e}")
             result = gpu_wrapped_generate_video_quality(*args)
             return result
+        # Fast generation button: 150 s GPU budget, fixed 8 steps
         run_i2v_button_fast.click(
             fn=handle_fast_generation,
             inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],

wan/audio2video_multiID.py CHANGED Viewed

@@ -199,7 +199,7 @@ class WanAF2V:
         audio_paths=None, # New: audio path list, supports multiple audio files
         task_key=None,
         mode="pad",  # Audio processing mode: "pad" or "concat"
-        trim_to_6s=False,  # Fast mode: trim audio to 6 seconds
     ):
         r"""
         Generates video frames from input image and text prompt using diffusion process.
@@ -515,7 +515,7 @@ class WanAF2V:
             half_dtype=self.half_dtype,
             preprocess_audio=preprocess_audio,
             resample_audio=resample_audio,
-            trim_to_6s=trim_to_6s,
         )
         # Prepare audio_ref_features - new list mode

         audio_paths=None, # New: audio path list, supports multiple audio files
         task_key=None,
         mode="pad",  # Audio processing mode: "pad" or "concat"
+        trim_to_5s=False,  # Fast mode: trim audio to 5 seconds
     ):
         r"""
         Generates video frames from input image and text prompt using diffusion process.
             half_dtype=self.half_dtype,
             preprocess_audio=preprocess_audio,
             resample_audio=resample_audio,
+            trim_to_5s=trim_to_5s,
         )
         # Prepare audio_ref_features - new list mode

wan/utils/infer_utils.py CHANGED Viewed

@@ -118,7 +118,7 @@ def process_audio_features(
     half_dtype=None,
     preprocess_audio=None,
     resample_audio=None,
-    trim_to_6s=False,  # Fast mode: trim audio to 6 seconds
 ):
     """
     Process audio files and extract audio features.
@@ -203,22 +203,22 @@ def process_audio_features(
             total_length = sum(audio_lengths)
             print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
-            # Fast mode: trim to 6 seconds if trim_to_6s is True
-            if trim_to_6s:
                 import math
-                # Calculate 6 seconds in frames
-                max_frames_6s = int(math.ceil(6.0 * fps))
-                max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
-                if total_length > max_frames_6s:
-                    print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_6s} frames (6 seconds)")
                     # Truncate each audio proportionally
-                    scale_factor = max_frames_6s / total_length
                     cumulative_length = 0
                     for i, audio_len in enumerate(audio_lengths):
                         if audio_len > 0:
                             new_audio_len = int(audio_len * scale_factor)
                             # Ensure it fits within remaining space
-                            remaining_space = max_frames_6s - cumulative_length
                             new_audio_len = min(new_audio_len, remaining_space)
                             audio_lengths[i] = new_audio_len
                             # Truncate the corresponding raw audio feature
@@ -283,7 +283,7 @@ def process_audio_features(
                     audio_feat_list.append(zero_audio_feat)
                     print(f"Audio {i} is missing, created zero features with shape: {zero_audio_feat.shape}")
         else:
-            # Pad mode: keep existing logic, but apply trim_to_6s if needed
             for i, audio_path in enumerate(audio_paths):
                 if audio_path and os.path.exists(audio_path):
                     print(f"Processing audio {i}: {audio_path}")
@@ -296,15 +296,15 @@ def process_audio_features(
                     with torch.no_grad():
                         print(f"wav2vec_model: {wav2vec_model}")
                         print(f"cache_dir:{cache_dir}")
-                        # Fast mode: if trim_to_6s, limit to 6 seconds
                         target_frames = F
-                        if trim_to_6s:
                             import math
-                            max_frames_6s = int(math.ceil(6.0 * fps))
-                            max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
-                            target_frames = min(F, max_frames_6s)
-                            if F > max_frames_6s:
-                                print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_6s} frames (6 seconds)")
                         # Use dynamically determined frame number
                         audio_emb, audio_length = preprocess_audio(
                             wav_path=target_resampled_audio_path,
@@ -346,15 +346,15 @@ def process_audio_features(
                     target_resampled_audio_path,
                 )
             with torch.no_grad():
-                # Fast mode: if trim_to_6s, limit to 6 seconds
                 target_frames = F
-                if trim_to_6s:
                     import math
-                    max_frames_6s = int(math.ceil(6.0 * fps))
-                    max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
-                    target_frames = min(F, max_frames_6s)
-                    if F > max_frames_6s:
-                        print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_6s} frames (6 seconds)")
                 # Use dynamically determined frame number
                 audio_emb, audio_length = preprocess_audio(
                     wav_path=audio,

     half_dtype=None,
     preprocess_audio=None,
     resample_audio=None,
+    trim_to_5s=False,  # Fast mode: trim audio to 5 seconds
 ):
     """
     Process audio files and extract audio features.
             total_length = sum(audio_lengths)
             print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
+            # Fast mode: trim to 5 seconds if trim_to_5s is True
+            if trim_to_5s:
                 import math
+                # Calculate 5 seconds in frames
+                max_frames_5s = int(math.ceil(5.0 * fps))
+                max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
+                if total_length > max_frames_5s:
+                    print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_5s} frames (5 seconds)")
                     # Truncate each audio proportionally
+                    scale_factor = max_frames_5s / total_length
                     cumulative_length = 0
                     for i, audio_len in enumerate(audio_lengths):
                         if audio_len > 0:
                             new_audio_len = int(audio_len * scale_factor)
                             # Ensure it fits within remaining space
+                            remaining_space = max_frames_5s - cumulative_length
                             new_audio_len = min(new_audio_len, remaining_space)
                             audio_lengths[i] = new_audio_len
                             # Truncate the corresponding raw audio feature
                     audio_feat_list.append(zero_audio_feat)
                     print(f"Audio {i} is missing, created zero features with shape: {zero_audio_feat.shape}")
         else:
+            # Pad mode: keep existing logic, but apply trim_to_5s if needed
             for i, audio_path in enumerate(audio_paths):
                 if audio_path and os.path.exists(audio_path):
                     print(f"Processing audio {i}: {audio_path}")
                     with torch.no_grad():
                         print(f"wav2vec_model: {wav2vec_model}")
                         print(f"cache_dir:{cache_dir}")
+                        # Fast mode: if trim_to_5s, limit to 5 seconds
                         target_frames = F
+                        if trim_to_5s:
                             import math
+                            max_frames_5s = int(math.ceil(5.0 * fps))
+                            max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
+                            target_frames = min(F, max_frames_5s)
+                            if F > max_frames_5s:
+                                print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_5s} frames (5 seconds)")
                         # Use dynamically determined frame number
                         audio_emb, audio_length = preprocess_audio(
                             wav_path=target_resampled_audio_path,
                     target_resampled_audio_path,
                 )
             with torch.no_grad():
+                # Fast mode: if trim_to_5s, limit to 5 seconds
                 target_frames = F
+                if trim_to_5s:
                     import math
+                    max_frames_5s = int(math.ceil(5.0 * fps))
+                    max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
+                    target_frames = min(F, max_frames_5s)
+                    if F > max_frames_5s:
+                        print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_5s} frames (5 seconds)")
                 # Use dynamically determined frame number
                 audio_emb, audio_length = preprocess_audio(
                     wav_path=audio,