C4G-HKUST committed on
Commit
ff7f08b
·
1 Parent(s): 0f8cfbd

feat: time out check

Browse files
Files changed (3) hide show
  1. app.py +37 -37
  2. wan/audio2video_multiID.py +2 -2
  3. wan/utils/infer_utils.py +25 -25
app.py CHANGED
@@ -436,7 +436,7 @@ def run_graio_demo(args):
436
  logging.info("Model and face processor loaded successfully.")
437
 
438
  def generate_video(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
439
- sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector, fixed_steps=None, trim_to_6s=False):
440
  # 参考 LivePortrait: 在 worker 进程中直接使用 cuda 设备
441
  # 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/src/gradio_pipeline.py
442
  # @spaces.GPU 装饰器已经初始化了 GPU,这里直接使用即可
@@ -483,13 +483,13 @@ def run_graio_demo(args):
483
  fps = getattr(cfg, 'fps', 24)
484
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
485
 
486
- # Fast模式:如果trim_to_6s为True,强制限制为6秒对应的帧数
487
- if trim_to_6s:
488
- # 计算6秒对应的帧数(4n+1格式)
489
- max_frames_6s = int(math.ceil(6.0 * fps))
490
- max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
491
- current_frame_num = min(calculated_frame_num, max_frames_6s)
492
- logging.warning(f"Fast mode: Audio duration exceeds 6 seconds. Trimming to 6 seconds ({max_frames_6s} frames). Original: {calculated_frame_num} frames")
493
  else:
494
  current_frame_num = calculated_frame_num
495
 
@@ -531,7 +531,7 @@ def run_graio_demo(args):
531
  audio_paths=audio_paths,
532
  task_key="gradio_output",
533
  mode=audio_mode_selector,
534
- trim_to_6s=trim_to_6s,
535
  )
536
 
537
  if isinstance(video, dict):
@@ -618,13 +618,13 @@ def run_graio_demo(args):
618
  # 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/app.py
619
  # @spaces.GPU 装饰器会自动处理 GPU 初始化,不需要手动初始化
620
 
621
- # 快速生成模式:180秒,固定10步去噪
622
- @spaces.GPU(duration=180)
623
  def gpu_wrapped_generate_video_fast(*args, **kwargs):
624
  # 固定使用8步去噪,通过关键字参数传递
625
  kwargs['fixed_steps'] = 8
626
 
627
- # Fast模式音频长度检测:检查是否超过6
628
  # 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
629
  # sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
630
  if len(args) >= 11:
@@ -658,27 +658,27 @@ def run_graio_demo(args):
658
  if img2vid_audio_3:
659
  audio_paths.append(img2vid_audio_3)
660
 
661
- # 检测音频长度是否超过6
662
  if audio_paths and len(audio_paths) > 0:
663
  fps = getattr(cfg, 'fps', 24)
664
  try:
665
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
666
- # 计算6秒对应的帧数
667
- max_frames_6s = int(math.ceil(6.0 * fps))
668
- max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
669
 
670
- if calculated_frame_num > max_frames_6s:
671
- # 超过6秒,设置trim_to_6s标记
672
- kwargs['trim_to_6s'] = True
673
  calculated_duration = calculated_frame_num / fps
674
- logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 6 seconds limit. Will trim to 6 seconds.")
675
  else:
676
- kwargs['trim_to_6s'] = False
677
  except Exception as e:
678
  logging.warning(f"Failed to check audio duration: {e}")
679
- kwargs['trim_to_6s'] = False
680
  else:
681
- kwargs['trim_to_6s'] = False
682
 
683
  return gpu_wrapped_generate_video_worker(*args, **kwargs)
684
 
@@ -760,8 +760,8 @@ def run_graio_demo(args):
760
  ⚠️ Important Video Duration Limits
761
  </div>
762
  <div style="font-size: 14px; color: #856404; line-height: 1.6;">
763
- Fast Mode: Maximum video duration shoule be less than 6 seconds. Videos longer than 6 seconds will timeout.<br>
764
- Quality Mode: Maximum video duration shoule be less than 8 seconds with default 25 denoising steps (You can adjust the denoising steps to generate longer videos).
765
  </div>
766
  </div>
767
 
@@ -837,7 +837,7 @@ def run_graio_demo(args):
837
 
838
  with gr.Row():
839
  run_i2v_button_fast = gr.Button(
840
- "Generate Video (Fast - 180s, 8 steps)",
841
  variant="secondary",
842
  scale=1
843
  )
@@ -848,10 +848,10 @@ def run_graio_demo(args):
848
  )
849
  gr.Markdown("""
850
  **Generation Modes:**
851
- - **Fast Mode (up to 180s GPU budget)**: Fixed 8 denoising steps for quick generation. **⚠️ Maximum video duration: 6 seconds. Videos longer than 6 seconds will timeout.**
852
  - **Quality Mode (up to 780s GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). **⚠️ Maximum video duration: 8 seconds with default 25 denoising steps.**
853
 
854
- *Note: The GPU duration (180s/780s) represents the maximum budget allocated, not the actual generation time. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
855
  """)
856
 
857
  with gr.Column(scale=2):
@@ -889,7 +889,7 @@ def run_graio_demo(args):
889
  # 包装函数:处理警告信息显示
890
  def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
891
  sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
892
- # 在开始生成前先检测音频长度,如果超过6秒立即显示警告
893
  # 根据人数收集音频路径
894
  audio_paths = []
895
  if person_num_selector == "1 Person":
@@ -908,19 +908,19 @@ def run_graio_demo(args):
908
  if img2vid_audio_3:
909
  audio_paths.append(img2vid_audio_3)
910
 
911
- # 检测音频长度是否超过6
912
  if audio_paths and len(audio_paths) > 0:
913
  fps = getattr(cfg, 'fps', 24)
914
  try:
915
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
916
- # 计算6秒对应的帧数
917
- max_frames_6s = int(math.ceil(6.0 * fps))
918
- max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
919
 
920
- if calculated_frame_num > max_frames_6s:
921
- # 超过6秒,立即显示警告
922
  calculated_duration = calculated_frame_num / fps
923
- warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 6-second limit for Fast Mode. The audio will be automatically trimmed to 6 seconds to prevent timeout."
924
  gr.Warning(warning_msg, duration=5)
925
  except Exception as e:
926
  logging.warning(f"Failed to check audio duration: {e}")
@@ -936,7 +936,7 @@ def run_graio_demo(args):
936
  result = gpu_wrapped_generate_video_quality(*args)
937
  return result
938
 
939
- # 快速生成按钮:180秒,固定10步
940
  run_i2v_button_fast.click(
941
  fn=handle_fast_generation,
942
  inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],
 
436
  logging.info("Model and face processor loaded successfully.")
437
 
438
  def generate_video(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
439
+ sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector, fixed_steps=None, trim_to_5s=False):
440
  # 参考 LivePortrait: 在 worker 进程中直接使用 cuda 设备
441
  # 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/src/gradio_pipeline.py
442
  # @spaces.GPU 装饰器已经初始化了 GPU,这里直接使用即可
 
483
  fps = getattr(cfg, 'fps', 24)
484
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
485
 
486
+ # Fast模式:如果trim_to_5s为True,强制限制为5秒对应的帧数
487
+ if trim_to_5s:
488
+ # 计算5秒对应的帧数(4n+1格式)
489
+ max_frames_5s = int(math.ceil(5.0 * fps))
490
+ max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
491
+ current_frame_num = min(calculated_frame_num, max_frames_5s)
492
+ logging.warning(f"Fast mode: Audio duration exceeds 5 seconds. Trimming to 5 seconds ({max_frames_5s} frames). Original: {calculated_frame_num} frames")
493
  else:
494
  current_frame_num = calculated_frame_num
495
 
 
531
  audio_paths=audio_paths,
532
  task_key="gradio_output",
533
  mode=audio_mode_selector,
534
+ trim_to_5s=trim_to_5s,
535
  )
536
 
537
  if isinstance(video, dict):
 
618
  # 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/app.py
619
  # @spaces.GPU 装饰器会自动处理 GPU 初始化,不需要手动初始化
620
 
621
+ # 快速生成模式:150秒,固定8步去噪
622
+ @spaces.GPU(duration=150)
623
  def gpu_wrapped_generate_video_fast(*args, **kwargs):
624
  # 固定使用8步去噪,通过关键字参数传递
625
  kwargs['fixed_steps'] = 8
626
 
627
+ # Fast模式音频长度检测:检查是否超过5秒
628
  # 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
629
  # sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
630
  if len(args) >= 11:
 
658
  if img2vid_audio_3:
659
  audio_paths.append(img2vid_audio_3)
660
 
661
+ # 检测音频长度是否超过5秒
662
  if audio_paths and len(audio_paths) > 0:
663
  fps = getattr(cfg, 'fps', 24)
664
  try:
665
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
666
+ # 计算5秒对应的帧数
667
+ max_frames_5s = int(math.ceil(5.0 * fps))
668
+ max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
669
 
670
+ if calculated_frame_num > max_frames_5s:
671
+ # 超过5秒,设置trim_to_5s标记
672
+ kwargs['trim_to_5s'] = True
673
  calculated_duration = calculated_frame_num / fps
674
+ logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 5 seconds limit. Will trim to 5 seconds.")
675
  else:
676
+ kwargs['trim_to_5s'] = False
677
  except Exception as e:
678
  logging.warning(f"Failed to check audio duration: {e}")
679
+ kwargs['trim_to_5s'] = False
680
  else:
681
+ kwargs['trim_to_5s'] = False
682
 
683
  return gpu_wrapped_generate_video_worker(*args, **kwargs)
684
 
 
760
  ⚠️ Important Video Duration Limits
761
  </div>
762
  <div style="font-size: 14px; color: #856404; line-height: 1.6;">
763
+ <strong>Fast Mode:</strong> Maximum video duration is <strong>5 seconds</strong>. Videos longer than 5 seconds will be automatically trimmed to 5 seconds.<br>
764
+ <strong>Quality Mode:</strong> Maximum video duration is <strong>8 seconds</strong> with default 25 denoising steps.
765
  </div>
766
  </div>
767
 
 
837
 
838
  with gr.Row():
839
  run_i2v_button_fast = gr.Button(
840
+ "Generate Video (Fast - 150s, 8 steps)",
841
  variant="secondary",
842
  scale=1
843
  )
 
848
  )
849
  gr.Markdown("""
850
  **Generation Modes:**
851
+ - **Fast Mode (up to 150s GPU budget)**: Fixed 8 denoising steps for quick generation. **⚠️ Maximum video duration: 5 seconds. Videos longer than 5 seconds will be automatically trimmed to 5 seconds.**
852
  - **Quality Mode (up to 780s GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). **⚠️ Maximum video duration: 8 seconds with default 25 denoising steps.**
853
 
854
+ *Note: The GPU duration (150s/780s) represents the maximum budget allocated, not the actual generation time. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
855
  """)
856
 
857
  with gr.Column(scale=2):
 
889
  # 包装函数:处理警告信息显示
890
  def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
891
  sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
892
+ # 在开始生成前先检测音频长度,如果超过5秒立即显示警告
893
  # 根据人数收集音频路径
894
  audio_paths = []
895
  if person_num_selector == "1 Person":
 
908
  if img2vid_audio_3:
909
  audio_paths.append(img2vid_audio_3)
910
 
911
+ # 检测音频长度是否超过5秒
912
  if audio_paths and len(audio_paths) > 0:
913
  fps = getattr(cfg, 'fps', 24)
914
  try:
915
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
916
+ # 计算5秒对应的帧数
917
+ max_frames_5s = int(math.ceil(5.0 * fps))
918
+ max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
919
 
920
+ if calculated_frame_num > max_frames_5s:
921
+ # 超过5秒,立即显示警告
922
  calculated_duration = calculated_frame_num / fps
923
+ warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 5-second limit for Fast Mode. The audio will be automatically trimmed to 5 seconds to prevent timeout."
924
  gr.Warning(warning_msg, duration=5)
925
  except Exception as e:
926
  logging.warning(f"Failed to check audio duration: {e}")
 
936
  result = gpu_wrapped_generate_video_quality(*args)
937
  return result
938
 
939
+ # 快速生成按钮:150秒,固定8步
940
  run_i2v_button_fast.click(
941
  fn=handle_fast_generation,
942
  inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],
wan/audio2video_multiID.py CHANGED
@@ -199,7 +199,7 @@ class WanAF2V:
199
  audio_paths=None, # New: audio path list, supports multiple audio files
200
  task_key=None,
201
  mode="pad", # Audio processing mode: "pad" or "concat"
202
- trim_to_6s=False, # Fast mode: trim audio to 6 seconds
203
  ):
204
  r"""
205
  Generates video frames from input image and text prompt using diffusion process.
@@ -515,7 +515,7 @@ class WanAF2V:
515
  half_dtype=self.half_dtype,
516
  preprocess_audio=preprocess_audio,
517
  resample_audio=resample_audio,
518
- trim_to_6s=trim_to_6s,
519
  )
520
 
521
  # Prepare audio_ref_features - new list mode
 
199
  audio_paths=None, # New: audio path list, supports multiple audio files
200
  task_key=None,
201
  mode="pad", # Audio processing mode: "pad" or "concat"
202
+ trim_to_5s=False, # Fast mode: trim audio to 5 seconds
203
  ):
204
  r"""
205
  Generates video frames from input image and text prompt using diffusion process.
 
515
  half_dtype=self.half_dtype,
516
  preprocess_audio=preprocess_audio,
517
  resample_audio=resample_audio,
518
+ trim_to_5s=trim_to_5s,
519
  )
520
 
521
  # Prepare audio_ref_features - new list mode
wan/utils/infer_utils.py CHANGED
@@ -118,7 +118,7 @@ def process_audio_features(
118
  half_dtype=None,
119
  preprocess_audio=None,
120
  resample_audio=None,
121
- trim_to_6s=False, # Fast mode: trim audio to 6 seconds
122
  ):
123
  """
124
  Process audio files and extract audio features.
@@ -203,22 +203,22 @@ def process_audio_features(
203
  total_length = sum(audio_lengths)
204
  print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
205
 
206
- # Fast mode: trim to 6 seconds if trim_to_6s is True
207
- if trim_to_6s:
208
  import math
209
- # Calculate 6 seconds in frames
210
- max_frames_6s = int(math.ceil(6.0 * fps))
211
- max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
212
- if total_length > max_frames_6s:
213
- print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_6s} frames (6 seconds)")
214
  # Truncate each audio proportionally
215
- scale_factor = max_frames_6s / total_length
216
  cumulative_length = 0
217
  for i, audio_len in enumerate(audio_lengths):
218
  if audio_len > 0:
219
  new_audio_len = int(audio_len * scale_factor)
220
  # Ensure it fits within remaining space
221
- remaining_space = max_frames_6s - cumulative_length
222
  new_audio_len = min(new_audio_len, remaining_space)
223
  audio_lengths[i] = new_audio_len
224
  # Truncate the corresponding raw audio feature
@@ -283,7 +283,7 @@ def process_audio_features(
283
  audio_feat_list.append(zero_audio_feat)
284
  print(f"Audio {i} is missing, created zero features with shape: {zero_audio_feat.shape}")
285
  else:
286
- # Pad mode: keep existing logic, but apply trim_to_6s if needed
287
  for i, audio_path in enumerate(audio_paths):
288
  if audio_path and os.path.exists(audio_path):
289
  print(f"Processing audio {i}: {audio_path}")
@@ -296,15 +296,15 @@ def process_audio_features(
296
  with torch.no_grad():
297
  print(f"wav2vec_model: {wav2vec_model}")
298
  print(f"cache_dir:{cache_dir}")
299
- # Fast mode: if trim_to_6s, limit to 6 seconds
300
  target_frames = F
301
- if trim_to_6s:
302
  import math
303
- max_frames_6s = int(math.ceil(6.0 * fps))
304
- max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
305
- target_frames = min(F, max_frames_6s)
306
- if F > max_frames_6s:
307
- print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_6s} frames (6 seconds)")
308
  # Use dynamically determined frame number
309
  audio_emb, audio_length = preprocess_audio(
310
  wav_path=target_resampled_audio_path,
@@ -346,15 +346,15 @@ def process_audio_features(
346
  target_resampled_audio_path,
347
  )
348
  with torch.no_grad():
349
- # Fast mode: if trim_to_6s, limit to 6 seconds
350
  target_frames = F
351
- if trim_to_6s:
352
  import math
353
- max_frames_6s = int(math.ceil(6.0 * fps))
354
- max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
355
- target_frames = min(F, max_frames_6s)
356
- if F > max_frames_6s:
357
- print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_6s} frames (6 seconds)")
358
  # Use dynamically determined frame number
359
  audio_emb, audio_length = preprocess_audio(
360
  wav_path=audio,
 
118
  half_dtype=None,
119
  preprocess_audio=None,
120
  resample_audio=None,
121
+ trim_to_5s=False, # Fast mode: trim audio to 5 seconds
122
  ):
123
  """
124
  Process audio files and extract audio features.
 
203
  total_length = sum(audio_lengths)
204
  print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
205
 
206
+ # Fast mode: trim to 5 seconds if trim_to_5s is True
207
+ if trim_to_5s:
208
  import math
209
+ # Calculate 5 seconds in frames
210
+ max_frames_5s = int(math.ceil(5.0 * fps))
211
+ max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
212
+ if total_length > max_frames_5s:
213
+ print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_5s} frames (5 seconds)")
214
  # Truncate each audio proportionally
215
+ scale_factor = max_frames_5s / total_length
216
  cumulative_length = 0
217
  for i, audio_len in enumerate(audio_lengths):
218
  if audio_len > 0:
219
  new_audio_len = int(audio_len * scale_factor)
220
  # Ensure it fits within remaining space
221
+ remaining_space = max_frames_5s - cumulative_length
222
  new_audio_len = min(new_audio_len, remaining_space)
223
  audio_lengths[i] = new_audio_len
224
  # Truncate the corresponding raw audio feature
 
283
  audio_feat_list.append(zero_audio_feat)
284
  print(f"Audio {i} is missing, created zero features with shape: {zero_audio_feat.shape}")
285
  else:
286
+ # Pad mode: keep existing logic, but apply trim_to_5s if needed
287
  for i, audio_path in enumerate(audio_paths):
288
  if audio_path and os.path.exists(audio_path):
289
  print(f"Processing audio {i}: {audio_path}")
 
296
  with torch.no_grad():
297
  print(f"wav2vec_model: {wav2vec_model}")
298
  print(f"cache_dir:{cache_dir}")
299
+ # Fast mode: if trim_to_5s, limit to 5 seconds
300
  target_frames = F
301
+ if trim_to_5s:
302
  import math
303
+ max_frames_5s = int(math.ceil(5.0 * fps))
304
+ max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
305
+ target_frames = min(F, max_frames_5s)
306
+ if F > max_frames_5s:
307
+ print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_5s} frames (5 seconds)")
308
  # Use dynamically determined frame number
309
  audio_emb, audio_length = preprocess_audio(
310
  wav_path=target_resampled_audio_path,
 
346
  target_resampled_audio_path,
347
  )
348
  with torch.no_grad():
349
+ # Fast mode: if trim_to_5s, limit to 5 seconds
350
  target_frames = F
351
+ if trim_to_5s:
352
  import math
353
+ max_frames_5s = int(math.ceil(5.0 * fps))
354
+ max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
355
+ target_frames = min(F, max_frames_5s)
356
+ if F > max_frames_5s:
357
+ print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_5s} frames (5 seconds)")
358
  # Use dynamically determined frame number
359
  audio_emb, audio_length = preprocess_audio(
360
  wav_path=audio,