feat: time out check
Browse files
- app.py +37 -37
- wan/audio2video_multiID.py +2 -2
- wan/utils/infer_utils.py +25 -25
app.py
CHANGED
|
@@ -436,7 +436,7 @@ def run_graio_demo(args):
|
|
| 436 |
logging.info("Model and face processor loaded successfully.")
|
| 437 |
|
| 438 |
def generate_video(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
|
| 439 |
-
sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector, fixed_steps=None,
|
| 440 |
# 参考 LivePortrait: 在 worker 进程中直接使用 cuda 设备
|
| 441 |
# 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/src/gradio_pipeline.py
|
| 442 |
# @spaces.GPU 装饰器已经初始化了 GPU,这里直接使用即可
|
|
@@ -483,13 +483,13 @@ def run_graio_demo(args):
|
|
| 483 |
fps = getattr(cfg, 'fps', 24)
|
| 484 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 485 |
|
| 486 |
-
# Fast模式:如果
|
| 487 |
-
if
|
| 488 |
-
# 计算
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
current_frame_num = min(calculated_frame_num,
|
| 492 |
-
logging.warning(f"Fast mode: Audio duration exceeds
|
| 493 |
else:
|
| 494 |
current_frame_num = calculated_frame_num
|
| 495 |
|
|
@@ -531,7 +531,7 @@ def run_graio_demo(args):
|
|
| 531 |
audio_paths=audio_paths,
|
| 532 |
task_key="gradio_output",
|
| 533 |
mode=audio_mode_selector,
|
| 534 |
-
|
| 535 |
)
|
| 536 |
|
| 537 |
if isinstance(video, dict):
|
|
@@ -618,13 +618,13 @@ def run_graio_demo(args):
|
|
| 618 |
# 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/app.py
|
| 619 |
# @spaces.GPU 装饰器会自动处理 GPU 初始化,不需要手动初始化
|
| 620 |
|
| 621 |
-
# 快速生成模式:
|
| 622 |
-
@spaces.GPU(duration=
|
| 623 |
def gpu_wrapped_generate_video_fast(*args, **kwargs):
|
| 624 |
# Always use 8 denoising steps in fast mode, passed via keyword argument
|
| 625 |
kwargs['fixed_steps'] = 8
|
| 626 |
|
| 627 |
-
# Fast模式音频长度检测:检查是否超过
|
| 628 |
# 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
|
| 629 |
# sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
|
| 630 |
if len(args) >= 11:
|
|
@@ -658,27 +658,27 @@ def run_graio_demo(args):
|
|
| 658 |
if img2vid_audio_3:
|
| 659 |
audio_paths.append(img2vid_audio_3)
|
| 660 |
|
| 661 |
-
# 检测音频长度是否超过
|
| 662 |
if audio_paths and len(audio_paths) > 0:
|
| 663 |
fps = getattr(cfg, 'fps', 24)
|
| 664 |
try:
|
| 665 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 666 |
-
# 计算
|
| 667 |
-
|
| 668 |
-
|
| 669 |
|
| 670 |
-
if calculated_frame_num >
|
| 671 |
-
# 超过
|
| 672 |
-
kwargs['
|
| 673 |
calculated_duration = calculated_frame_num / fps
|
| 674 |
-
logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds
|
| 675 |
else:
|
| 676 |
-
kwargs['
|
| 677 |
except Exception as e:
|
| 678 |
logging.warning(f"Failed to check audio duration: {e}")
|
| 679 |
-
kwargs['
|
| 680 |
else:
|
| 681 |
-
kwargs['
|
| 682 |
|
| 683 |
return gpu_wrapped_generate_video_worker(*args, **kwargs)
|
| 684 |
|
|
@@ -760,8 +760,8 @@ def run_graio_demo(args):
|
|
| 760 |
⚠️ Important Video Duration Limits
|
| 761 |
</div>
|
| 762 |
<div style="font-size: 14px; color: #856404; line-height: 1.6;">
|
| 763 |
-
Fast Mode
|
| 764 |
-
Quality Mode
|
| 765 |
</div>
|
| 766 |
</div>
|
| 767 |
|
|
@@ -837,7 +837,7 @@ def run_graio_demo(args):
|
|
| 837 |
|
| 838 |
with gr.Row():
|
| 839 |
run_i2v_button_fast = gr.Button(
|
| 840 |
-
"Generate Video (Fast -
|
| 841 |
variant="secondary",
|
| 842 |
scale=1
|
| 843 |
)
|
|
@@ -848,10 +848,10 @@ def run_graio_demo(args):
|
|
| 848 |
)
|
| 849 |
gr.Markdown("""
|
| 850 |
**Generation Modes:**
|
| 851 |
-
- **Fast Mode (up to
|
| 852 |
- **Quality Mode (up to 780s GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). **⚠️ Maximum video duration: 8 seconds with default 25 denoising steps.**
|
| 853 |
|
| 854 |
-
*Note: The GPU duration (
|
| 855 |
""")
|
| 856 |
|
| 857 |
with gr.Column(scale=2):
|
|
@@ -889,7 +889,7 @@ def run_graio_demo(args):
|
|
| 889 |
# 包装函数:处理警告信息显示
|
| 890 |
def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
|
| 891 |
sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
|
| 892 |
-
# 在开始生成前先检测音频长度,如果超过
|
| 893 |
# 根据人数收集音频路径
|
| 894 |
audio_paths = []
|
| 895 |
if person_num_selector == "1 Person":
|
|
@@ -908,19 +908,19 @@ def run_graio_demo(args):
|
|
| 908 |
if img2vid_audio_3:
|
| 909 |
audio_paths.append(img2vid_audio_3)
|
| 910 |
|
| 911 |
-
# 检测音频长度是否超过
|
| 912 |
if audio_paths and len(audio_paths) > 0:
|
| 913 |
fps = getattr(cfg, 'fps', 24)
|
| 914 |
try:
|
| 915 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 916 |
-
# 计算
|
| 917 |
-
|
| 918 |
-
|
| 919 |
|
| 920 |
-
if calculated_frame_num >
|
| 921 |
-
# 超过
|
| 922 |
calculated_duration = calculated_frame_num / fps
|
| 923 |
-
warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the
|
| 924 |
gr.Warning(warning_msg, duration=5)
|
| 925 |
except Exception as e:
|
| 926 |
logging.warning(f"Failed to check audio duration: {e}")
|
|
@@ -936,7 +936,7 @@ def run_graio_demo(args):
|
|
| 936 |
result = gpu_wrapped_generate_video_quality(*args)
|
| 937 |
return result
|
| 938 |
|
| 939 |
-
# 快速生成按钮:
|
| 940 |
run_i2v_button_fast.click(
|
| 941 |
fn=handle_fast_generation,
|
| 942 |
inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],
|
|
|
|
| 436 |
logging.info("Model and face processor loaded successfully.")
|
| 437 |
|
| 438 |
def generate_video(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
|
| 439 |
+
sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector, fixed_steps=None, trim_to_5s=False):
|
| 440 |
# 参考 LivePortrait: 在 worker 进程中直接使用 cuda 设备
|
| 441 |
# 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/src/gradio_pipeline.py
|
| 442 |
# @spaces.GPU 装饰器已经初始化了 GPU,这里直接使用即可
|
|
|
|
| 483 |
fps = getattr(cfg, 'fps', 24)
|
| 484 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 485 |
|
| 486 |
+
# Fast模式:如果trim_to_5s为True,强制限制为5秒对应的帧数
|
| 487 |
+
if trim_to_5s:
|
| 488 |
+
# 计算5秒对应的帧数(4n+1格式)
|
| 489 |
+
max_frames_5s = int(math.ceil(5.0 * fps))
|
| 490 |
+
max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
|
| 491 |
+
current_frame_num = min(calculated_frame_num, max_frames_5s)
|
| 492 |
+
logging.warning(f"Fast mode: Audio duration exceeds 5 seconds. Trimming to 5 seconds ({max_frames_5s} frames). Original: {calculated_frame_num} frames")
|
| 493 |
else:
|
| 494 |
current_frame_num = calculated_frame_num
|
| 495 |
|
|
|
|
| 531 |
audio_paths=audio_paths,
|
| 532 |
task_key="gradio_output",
|
| 533 |
mode=audio_mode_selector,
|
| 534 |
+
trim_to_5s=trim_to_5s,
|
| 535 |
)
|
| 536 |
|
| 537 |
if isinstance(video, dict):
|
|
|
|
| 618 |
# 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/app.py
|
| 619 |
# @spaces.GPU 装饰器会自动处理 GPU 初始化,不需要手动初始化
|
| 620 |
|
| 621 |
+
# Fast generation mode: 150s GPU budget, fixed 8 denoising steps
|
| 622 |
+
@spaces.GPU(duration=150)
|
| 623 |
def gpu_wrapped_generate_video_fast(*args, **kwargs):
|
| 624 |
# Always use 8 denoising steps in fast mode, passed via keyword argument
|
| 625 |
kwargs['fixed_steps'] = 8
|
| 626 |
|
| 627 |
+
# Fast模式音频长度检测:检查是否超过5秒
|
| 628 |
# 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
|
| 629 |
# sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
|
| 630 |
if len(args) >= 11:
|
|
|
|
| 658 |
if img2vid_audio_3:
|
| 659 |
audio_paths.append(img2vid_audio_3)
|
| 660 |
|
| 661 |
+
# 检测音频长度是否超过5秒
|
| 662 |
if audio_paths and len(audio_paths) > 0:
|
| 663 |
fps = getattr(cfg, 'fps', 24)
|
| 664 |
try:
|
| 665 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 666 |
+
# 计算5秒对应的帧数
|
| 667 |
+
max_frames_5s = int(math.ceil(5.0 * fps))
|
| 668 |
+
max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
|
| 669 |
|
| 670 |
+
if calculated_frame_num > max_frames_5s:
|
| 671 |
+
# 超过5秒,设置trim_to_5s标记
|
| 672 |
+
kwargs['trim_to_5s'] = True
|
| 673 |
calculated_duration = calculated_frame_num / fps
|
| 674 |
+
logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 5 seconds limit. Will trim to 5 seconds.")
|
| 675 |
else:
|
| 676 |
+
kwargs['trim_to_5s'] = False
|
| 677 |
except Exception as e:
|
| 678 |
logging.warning(f"Failed to check audio duration: {e}")
|
| 679 |
+
kwargs['trim_to_5s'] = False
|
| 680 |
else:
|
| 681 |
+
kwargs['trim_to_5s'] = False
|
| 682 |
|
| 683 |
return gpu_wrapped_generate_video_worker(*args, **kwargs)
|
| 684 |
|
|
|
|
| 760 |
⚠️ Important Video Duration Limits
|
| 761 |
</div>
|
| 762 |
<div style="font-size: 14px; color: #856404; line-height: 1.6;">
|
| 763 |
+
<strong>Fast Mode:</strong> Maximum video duration is <strong>5 seconds</strong>. Videos longer than 5 seconds will be automatically trimmed to 5 seconds.<br>
|
| 764 |
+
<strong>Quality Mode:</strong> Maximum video duration is <strong>8 seconds</strong> with default 25 denoising steps.
|
| 765 |
</div>
|
| 766 |
</div>
|
| 767 |
|
|
|
|
| 837 |
|
| 838 |
with gr.Row():
|
| 839 |
run_i2v_button_fast = gr.Button(
|
| 840 |
+
"Generate Video (Fast - 150s, 8 steps)",
|
| 841 |
variant="secondary",
|
| 842 |
scale=1
|
| 843 |
)
|
|
|
|
| 848 |
)
|
| 849 |
gr.Markdown("""
|
| 850 |
**Generation Modes:**
|
| 851 |
+
- **Fast Mode (up to 150s GPU budget)**: Fixed 8 denoising steps for quick generation. **⚠️ Maximum video duration: 5 seconds. Videos longer than 5 seconds will be automatically trimmed to 5 seconds.**
|
| 852 |
- **Quality Mode (up to 780s GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). **⚠️ Maximum video duration: 8 seconds with default 25 denoising steps.**
|
| 853 |
|
| 854 |
+
*Note: The GPU duration (150s/780s) represents the maximum budget allocated, not the actual generation time. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
|
| 855 |
""")
|
| 856 |
|
| 857 |
with gr.Column(scale=2):
|
|
|
|
| 889 |
# 包装函数:处理警告信息显示
|
| 890 |
def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
|
| 891 |
sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
|
| 892 |
+
# 在开始生成前先检测音频长度,如果超过5秒立即显示警告
|
| 893 |
# 根据人数收集音频路径
|
| 894 |
audio_paths = []
|
| 895 |
if person_num_selector == "1 Person":
|
|
|
|
| 908 |
if img2vid_audio_3:
|
| 909 |
audio_paths.append(img2vid_audio_3)
|
| 910 |
|
| 911 |
+
# 检测音频长度是否超过5秒
|
| 912 |
if audio_paths and len(audio_paths) > 0:
|
| 913 |
fps = getattr(cfg, 'fps', 24)
|
| 914 |
try:
|
| 915 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 916 |
+
# 计算5秒对应的帧数
|
| 917 |
+
max_frames_5s = int(math.ceil(5.0 * fps))
|
| 918 |
+
max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
|
| 919 |
|
| 920 |
+
if calculated_frame_num > max_frames_5s:
|
| 921 |
+
# 超过5秒,立即显示警告
|
| 922 |
calculated_duration = calculated_frame_num / fps
|
| 923 |
+
warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 5-second limit for Fast Mode. The audio will be automatically trimmed to 5 seconds to prevent timeout."
|
| 924 |
gr.Warning(warning_msg, duration=5)
|
| 925 |
except Exception as e:
|
| 926 |
logging.warning(f"Failed to check audio duration: {e}")
|
|
|
|
| 936 |
result = gpu_wrapped_generate_video_quality(*args)
|
| 937 |
return result
|
| 938 |
|
| 939 |
+
# Fast generation button: 150s GPU budget, fixed 8 denoising steps
|
| 940 |
run_i2v_button_fast.click(
|
| 941 |
fn=handle_fast_generation,
|
| 942 |
inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],
|
wan/audio2video_multiID.py
CHANGED
|
@@ -199,7 +199,7 @@ class WanAF2V:
|
|
| 199 |
audio_paths=None, # New: audio path list, supports multiple audio files
|
| 200 |
task_key=None,
|
| 201 |
mode="pad", # Audio processing mode: "pad" or "concat"
|
| 202 |
-
|
| 203 |
):
|
| 204 |
r"""
|
| 205 |
Generates video frames from input image and text prompt using diffusion process.
|
|
@@ -515,7 +515,7 @@ class WanAF2V:
|
|
| 515 |
half_dtype=self.half_dtype,
|
| 516 |
preprocess_audio=preprocess_audio,
|
| 517 |
resample_audio=resample_audio,
|
| 518 |
-
|
| 519 |
)
|
| 520 |
|
| 521 |
# Prepare audio_ref_features - new list mode
|
|
|
|
| 199 |
audio_paths=None, # New: audio path list, supports multiple audio files
|
| 200 |
task_key=None,
|
| 201 |
mode="pad", # Audio processing mode: "pad" or "concat"
|
| 202 |
+
trim_to_5s=False, # Fast mode: trim audio to 5 seconds
|
| 203 |
):
|
| 204 |
r"""
|
| 205 |
Generates video frames from input image and text prompt using diffusion process.
|
|
|
|
| 515 |
half_dtype=self.half_dtype,
|
| 516 |
preprocess_audio=preprocess_audio,
|
| 517 |
resample_audio=resample_audio,
|
| 518 |
+
trim_to_5s=trim_to_5s,
|
| 519 |
)
|
| 520 |
|
| 521 |
# Prepare audio_ref_features - new list mode
|
wan/utils/infer_utils.py
CHANGED
|
@@ -118,7 +118,7 @@ def process_audio_features(
|
|
| 118 |
half_dtype=None,
|
| 119 |
preprocess_audio=None,
|
| 120 |
resample_audio=None,
|
| 121 |
-
|
| 122 |
):
|
| 123 |
"""
|
| 124 |
Process audio files and extract audio features.
|
|
@@ -203,22 +203,22 @@ def process_audio_features(
|
|
| 203 |
total_length = sum(audio_lengths)
|
| 204 |
print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
|
| 205 |
|
| 206 |
-
# Fast mode: trim to
|
| 207 |
-
if
|
| 208 |
import math
|
| 209 |
-
# Calculate
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
if total_length >
|
| 213 |
-
print(f"Fast mode: Trimming audio from {total_length} frames to {
|
| 214 |
# Truncate each audio proportionally
|
| 215 |
-
scale_factor =
|
| 216 |
cumulative_length = 0
|
| 217 |
for i, audio_len in enumerate(audio_lengths):
|
| 218 |
if audio_len > 0:
|
| 219 |
new_audio_len = int(audio_len * scale_factor)
|
| 220 |
# Ensure it fits within remaining space
|
| 221 |
-
remaining_space =
|
| 222 |
new_audio_len = min(new_audio_len, remaining_space)
|
| 223 |
audio_lengths[i] = new_audio_len
|
| 224 |
# Truncate the corresponding raw audio feature
|
|
@@ -283,7 +283,7 @@ def process_audio_features(
|
|
| 283 |
audio_feat_list.append(zero_audio_feat)
|
| 284 |
print(f"Audio {i} is missing, created zero features with shape: {zero_audio_feat.shape}")
|
| 285 |
else:
|
| 286 |
-
# Pad mode: keep existing logic, but apply
|
| 287 |
for i, audio_path in enumerate(audio_paths):
|
| 288 |
if audio_path and os.path.exists(audio_path):
|
| 289 |
print(f"Processing audio {i}: {audio_path}")
|
|
@@ -296,15 +296,15 @@ def process_audio_features(
|
|
| 296 |
with torch.no_grad():
|
| 297 |
print(f"wav2vec_model: {wav2vec_model}")
|
| 298 |
print(f"cache_dir:{cache_dir}")
|
| 299 |
-
# Fast mode: if
|
| 300 |
target_frames = F
|
| 301 |
-
if
|
| 302 |
import math
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
target_frames = min(F,
|
| 306 |
-
if F >
|
| 307 |
-
print(f"Fast mode: Trimming audio {i} from {F} frames to {
|
| 308 |
# Use dynamically determined frame number
|
| 309 |
audio_emb, audio_length = preprocess_audio(
|
| 310 |
wav_path=target_resampled_audio_path,
|
|
@@ -346,15 +346,15 @@ def process_audio_features(
|
|
| 346 |
target_resampled_audio_path,
|
| 347 |
)
|
| 348 |
with torch.no_grad():
|
| 349 |
-
# Fast mode: if
|
| 350 |
target_frames = F
|
| 351 |
-
if
|
| 352 |
import math
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
target_frames = min(F,
|
| 356 |
-
if F >
|
| 357 |
-
print(f"Fast mode: Trimming single audio from {F} frames to {
|
| 358 |
# Use dynamically determined frame number
|
| 359 |
audio_emb, audio_length = preprocess_audio(
|
| 360 |
wav_path=audio,
|
|
|
|
| 118 |
half_dtype=None,
|
| 119 |
preprocess_audio=None,
|
| 120 |
resample_audio=None,
|
| 121 |
+
trim_to_5s=False, # Fast mode: trim audio to 5 seconds
|
| 122 |
):
|
| 123 |
"""
|
| 124 |
Process audio files and extract audio features.
|
|
|
|
| 203 |
total_length = sum(audio_lengths)
|
| 204 |
print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
|
| 205 |
|
| 206 |
+
# Fast mode: trim to 5 seconds if trim_to_5s is True
|
| 207 |
+
if trim_to_5s:
|
| 208 |
import math
|
| 209 |
+
# Calculate 5 seconds in frames
|
| 210 |
+
max_frames_5s = int(math.ceil(5.0 * fps))
|
| 211 |
+
max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
|
| 212 |
+
if total_length > max_frames_5s:
|
| 213 |
+
print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_5s} frames (5 seconds)")
|
| 214 |
# Truncate each audio proportionally
|
| 215 |
+
scale_factor = max_frames_5s / total_length
|
| 216 |
cumulative_length = 0
|
| 217 |
for i, audio_len in enumerate(audio_lengths):
|
| 218 |
if audio_len > 0:
|
| 219 |
new_audio_len = int(audio_len * scale_factor)
|
| 220 |
# Ensure it fits within remaining space
|
| 221 |
+
remaining_space = max_frames_5s - cumulative_length
|
| 222 |
new_audio_len = min(new_audio_len, remaining_space)
|
| 223 |
audio_lengths[i] = new_audio_len
|
| 224 |
# Truncate the corresponding raw audio feature
|
|
|
|
| 283 |
audio_feat_list.append(zero_audio_feat)
|
| 284 |
print(f"Audio {i} is missing, created zero features with shape: {zero_audio_feat.shape}")
|
| 285 |
else:
|
| 286 |
+
# Pad mode: keep existing logic, but apply trim_to_5s if needed
|
| 287 |
for i, audio_path in enumerate(audio_paths):
|
| 288 |
if audio_path and os.path.exists(audio_path):
|
| 289 |
print(f"Processing audio {i}: {audio_path}")
|
|
|
|
| 296 |
with torch.no_grad():
|
| 297 |
print(f"wav2vec_model: {wav2vec_model}")
|
| 298 |
print(f"cache_dir:{cache_dir}")
|
| 299 |
+
# Fast mode: if trim_to_5s, limit to 5 seconds
|
| 300 |
target_frames = F
|
| 301 |
+
if trim_to_5s:
|
| 302 |
import math
|
| 303 |
+
max_frames_5s = int(math.ceil(5.0 * fps))
|
| 304 |
+
max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
|
| 305 |
+
target_frames = min(F, max_frames_5s)
|
| 306 |
+
if F > max_frames_5s:
|
| 307 |
+
print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_5s} frames (5 seconds)")
|
| 308 |
# Use dynamically determined frame number
|
| 309 |
audio_emb, audio_length = preprocess_audio(
|
| 310 |
wav_path=target_resampled_audio_path,
|
|
|
|
| 346 |
target_resampled_audio_path,
|
| 347 |
)
|
| 348 |
with torch.no_grad():
|
| 349 |
+
# Fast mode: if trim_to_5s, limit to 5 seconds
|
| 350 |
target_frames = F
|
| 351 |
+
if trim_to_5s:
|
| 352 |
import math
|
| 353 |
+
max_frames_5s = int(math.ceil(5.0 * fps))
|
| 354 |
+
max_frames_5s = ((max_frames_5s - 1) // 4) * 4 + 1
|
| 355 |
+
target_frames = min(F, max_frames_5s)
|
| 356 |
+
if F > max_frames_5s:
|
| 357 |
+
print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_5s} frames (5 seconds)")
|
| 358 |
# Use dynamically determined frame number
|
| 359 |
audio_emb, audio_length = preprocess_audio(
|
| 360 |
wav_path=audio,
|