C4G-HKUST committed on
Commit
b505615
·
1 Parent(s): b2737a3

fix: hf demo

Browse files
Files changed (1) hide show
  1. app.py +35 -34
app.py CHANGED
@@ -649,26 +649,6 @@ def run_graio_demo(args):
649
 
650
  return generate_video(*args, **kwargs)
651
 
652
- def toggle_audio_inputs(person_num):
653
- """根据选择的人数显示对应数量的音频输入框"""
654
- if person_num == "1 Person":
655
- return [
656
- gr.Audio(visible=True, interactive=True),
657
- gr.Audio(visible=False, interactive=False),
658
- gr.Audio(visible=False, interactive=False)
659
- ]
660
- elif person_num == "2 Persons":
661
- return [
662
- gr.Audio(visible=True, interactive=True),
663
- gr.Audio(visible=True, interactive=True),
664
- gr.Audio(visible=False, interactive=False)
665
- ]
666
- else: # 3 Persons
667
- return [
668
- gr.Audio(visible=True, interactive=True),
669
- gr.Audio(visible=True, interactive=True),
670
- gr.Audio(visible=True, interactive=True)
671
- ]
672
 
673
 
674
  with gr.Blocks() as demo:
@@ -718,14 +698,15 @@ def run_graio_demo(args):
718
  - **pad**: Select this if every audio input track has already been zero-padded to a common length.
719
  - **concat**: Select this if you want the script to chain each speaker's clips together and then zero-pad the non-speaker segments to reach a uniform length.
720
  """)
721
- img2vid_audio_1 = gr.Audio(label="Audio for Person 1", type="filepath", visible=True)
722
- img2vid_audio_2 = gr.Audio(label="Audio for Person 2", type="filepath", visible=False)
723
- img2vid_audio_3 = gr.Audio(label="Audio for Person 3", type="filepath", visible=False)
724
- person_num_selector.change(
725
- fn=toggle_audio_inputs,
726
- inputs=person_num_selector,
727
- outputs=[img2vid_audio_1, img2vid_audio_2, img2vid_audio_3]
728
- )
 
729
 
730
  with gr.Accordion("Advanced Options", open=False):
731
  with gr.Row():
@@ -761,14 +742,34 @@ def run_graio_demo(args):
761
  result_gallery = gr.Video(
762
  label='Generated Video', interactive=False, height=600, )
763
 
764
- gr.Examples(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
765
  examples = [
766
- ["./input_example/images/1p-0.png", "The man stands in the dusty western street, backlit by the setting sun, and his determined gaze speaks of a rugged spirit.", "1 Person", "pad", "./input_example/audios/1p-0.wav", None, None],
767
- ["./input_example/images/2p-0.png", "The two people are talking to each other.", "2 Persons", "pad", "./input_example/audios/2p-0-left.wav", "./input_example/audios/2p-0-right.wav", None],
768
- ["./input_example/images/2p-1.png", "In a casual, intimate setting, a man and a woman are engaged in a heartfelt conversation inside a car. The man, sporting a denim jacket over a blue shirt, sits attentively with a seatbelt fastened, his gaze fixed on the woman beside him. The woman, wearing a black tank top and a denim jacket draped over her shoulders, smiles warmly, her eyes reflecting genuine interest and connection. The car's interior, with its beige seats and simple design, provides a backdrop that emphasizes their interaction. The scene captures a moment of shared understanding and connection, set against the soft, diffused light of an overcast day. A medium shot from a slightly angled perspective, focusing on their expressions and body language.", "2 Persons", "pad", "./input_example/audios/2p-1-left.wav", "./input_example/audios/2p-1-right.wav", None],
769
- ["./input_example/images/2p-2.png", "In a cozy recording studio, a man and a woman are singing together. The man, with tousled brown hair, stands to the left, wearing a light green button-down shirt. His gaze is directed towards the woman, who is smiling warmly. She, with wavy dark hair, is dressed in a black floral dress and stands to the right, her eyes closed in enjoyment. Between them is a professional microphone, capturing their harmonious voices. The background features wooden panels and various audio equipment, creating an intimate and focused atmosphere. The lighting is soft and warm, highlighting their expressions and the intimate setting. A medium shot captures their interaction closely.", "2 Persons", "pad", "./input_example/audios/2p-2-left.wav", "./input_example/audios/2p-2-right.wav", None],
770
  ],
771
- inputs = [img2vid_image, img2vid_prompt, person_num_selector, audio_mode_selector, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3],
 
 
772
  )
773
 
774
 
 
649
 
650
  return generate_video(*args, **kwargs)
651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652
 
653
 
654
  with gr.Blocks() as demo:
 
698
  - **pad**: Select this if every audio input track has already been zero-padded to a common length.
699
  - **concat**: Select this if you want the script to chain each speaker's clips together and then zero-pad the non-speaker segments to reach a uniform length.
700
  """)
701
+ gr.Markdown("""
702
+ **Audio Binding Order:**
703
+ - Audio inputs are bound to persons based on their positions in the input image, from **left to right**.
704
+ - Person 1 corresponds to the leftmost person, Person 2 to the middle person (if any), and Person 3 to the rightmost person (if any).
705
+ """)
706
+ # 三个音频输入框始终可见,读取时根据 person_num_selector 只读取前 n 个
707
+ img2vid_audio_1 = gr.Audio(label="Audio for Person 1 (Leftmost)", type="filepath", visible=True)
708
+ img2vid_audio_2 = gr.Audio(label="Audio for Person 2 (Middle)", type="filepath", visible=True)
709
+ img2vid_audio_3 = gr.Audio(label="Audio for Person 3 (Rightmost)", type="filepath", visible=True)
710
 
711
  with gr.Accordion("Advanced Options", open=False):
712
  with gr.Row():
 
742
  result_gallery = gr.Video(
743
  label='Generated Video', interactive=False, height=600, )
744
 
745
+ gr.Markdown("""
746
+ ### Example Cases
747
+
748
+ *Note: Generation time (tested on NVIDIA H200 GPU) may vary depending on GPU specifications and system load.*
749
+ """)
750
+
751
+ # Read-only text component that shows the generation time of the selected example
752
+ generation_time_display = gr.Textbox(label="Generation Time (H200 GPU)", visible=True, interactive=False)
753
+
754
+ # Handler for example selection; it passes the values through and does not change audio-input visibility
755
+ def handle_example_select(image, prompt, person_num, audio_mode, audio1, audio2, audio3, gen_time):
756
+ # 三个音频输入框始终可见,只返回值,不改变可见性
757
+ # 读取时根据 person_num_selector 只读取前 n 个音频
758
+ return (
759
+ image, prompt, person_num, audio_mode,
760
+ audio1, audio2, audio3, gen_time
761
+ )
762
+
763
+ examples_component = gr.Examples(
764
  examples = [
765
+ ["./input_example/images/1p-0.png", "The man stands in the dusty western street, backlit by the setting sun, and his determined gaze speaks of a rugged spirit.", "1 Person", "pad", "./input_example/audios/1p-0.wav", None, None, "~4 minutes"],
766
+ ["./input_example/images/2p-0.png", "The two people are talking to each other.", "2 Persons", "pad", "./input_example/audios/2p-0-left.wav", "./input_example/audios/2p-0-right.wav", None, "~10 minutes"],
767
+ ["./input_example/images/2p-1.png", "In a casual, intimate setting, a man and a woman are engaged in a heartfelt conversation inside a car. The man, sporting a denim jacket over a blue shirt, sits attentively with a seatbelt fastened, his gaze fixed on the woman beside him. The woman, wearing a black tank top and a denim jacket draped over her shoulders, smiles warmly, her eyes reflecting genuine interest and connection. The car's interior, with its beige seats and simple design, provides a backdrop that emphasizes their interaction. The scene captures a moment of shared understanding and connection, set against the soft, diffused light of an overcast day. A medium shot from a slightly angled perspective, focusing on their expressions and body language.", "2 Persons", "pad", "./input_example/audios/2p-1-left.wav", "./input_example/audios/2p-1-right.wav", None, "~6 minutes"],
768
+ ["./input_example/images/2p-2.png", "In a cozy recording studio, a man and a woman are singing together. The man, with tousled brown hair, stands to the left, wearing a light green button-down shirt. His gaze is directed towards the woman, who is smiling warmly. She, with wavy dark hair, is dressed in a black floral dress and stands to the right, her eyes closed in enjoyment. Between them is a professional microphone, capturing their harmonious voices. The background features wooden panels and various audio equipment, creating an intimate and focused atmosphere. The lighting is soft and warm, highlighting their expressions and the intimate setting. A medium shot captures their interaction closely.", "2 Persons", "pad", "./input_example/audios/2p-2-left.wav", "./input_example/audios/2p-2-right.wav", None, "~8 minutes"],
769
  ],
770
+ inputs = [img2vid_image, img2vid_prompt, person_num_selector, audio_mode_selector, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, generation_time_display],
771
+ outputs = [img2vid_image, img2vid_prompt, person_num_selector, audio_mode_selector, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, generation_time_display],
772
+ fn=handle_example_select,
773
  )
774
 
775