import gradio as gr
import os
import shutil
import tempfile

from huggingface_hub import InferenceClient

# Initialize the inference client (reads the token from the environment)
client = InferenceClient(
    provider="fal-ai",
    api_key=os.environ.get("HF_TOKEN"),
    bill_to="huggingface",
)
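
# Note: HF_TOKEN must be set (e.g. as a Space secret) or inference calls will
# fail at request time. bill_to="huggingface" assumes the token's owner is a
# member of that org, so usage is billed to the org rather than the user.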


def generate_video_with_auth(image, prompt, profile: gr.OAuthProfile | None, progress=gr.Progress()):
    """
    Generate a video from an image using the Ovi model, with an authentication check.

    Args:
        image: Input image (file path from the Gradio component, or a PIL Image)
        prompt: Text prompt describing the desired motion/animation
        profile: OAuth profile injected by Gradio; None when the user is not signed in
        progress: Gradio progress tracker

    Returns:
        Path to the generated video file
    """
    if profile is None:
        raise gr.Error("Click the Sign in with Hugging Face button to use this app for free")
    if image is None:
        raise gr.Error("Please upload an image first!")
    if not prompt or prompt.strip() == "":
        raise gr.Error("Please enter a prompt describing the desired motion!")

    try:
        progress(0.2, desc="Processing image...")
        # Read the image file
        if isinstance(image, str):
            with open(image, "rb") as image_file:
                input_image = image_file.read()
        else:
            # If image is a PIL Image, save it to a temporary PNG first
            temp_image = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
            temp_image.close()  # close the handle so PIL can write to the path
            image.save(temp_image.name)
            with open(temp_image.name, "rb") as image_file:
                input_image = image_file.read()
            os.unlink(temp_image.name)  # the bytes are in memory; drop the file

        progress(0.4, desc="Generating video with AI...")
        # Generate video using the inference client
        video = client.image_to_video(
            input_image,
            prompt=prompt,
            model="chetwinlow1/Ovi",
        )
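        # The client normally returns raw video bytes here; the branches
        # below are defensive in case a provider returns a file path or some
        # other payload type instead.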

        progress(0.9, desc="Finalizing video...")
        # Save the video to a temporary file
        output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
        output_path.close()
        # Check if video is bytes or a file path
        if isinstance(video, bytes):
            with open(output_path.name, "wb") as f:
                f.write(video)
        elif isinstance(video, str) and os.path.exists(video):
            # If it's a path, copy it
            shutil.copy(video, output_path.name)
        else:
            # Try to write it directly
            with open(output_path.name, "wb") as f:
                f.write(video)

        progress(1.0, desc="Complete!")
        return output_path.name
    except Exception as e:
        raise gr.Error(f"Error generating video: {e}") from e

# Create the Gradio interface
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
    ),
    css="""
    .header-link {
        font-size: 0.9em;
        color: #666;
        text-decoration: none;
        margin-bottom: 1em;
        display: inline-block;
    }
    .header-link:hover {
        color: #333;
        text-decoration: underline;
    }
    .main-header {
        text-align: center;
        margin-bottom: 2em;
    }
    .info-box {
        background-color: #f0f7ff;
        border-left: 4px solid #4285f4;
        padding: 1em;
        margin: 1em 0;
        border-radius: 4px;
    }
    .auth-warning {
        color: #ff6b00;
        font-weight: bold;
        text-align: center;
        margin: 1em 0;
    }
    """,
    title="Image to Video Generator with Ovi",
) as demo:
    gr.HTML(
        """
        <div class="main-header">
            <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" class="header-link">
                Built with anycoder ✨
            </a>
        </div>
        """
    )

    gr.Markdown(
        """
        # 🎬 Image to Video Generator with Ovi

        Transform your static images into dynamic videos with synchronized audio using AI! Upload an image and describe the motion you want to see.

        Powered by **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation** via the Hugging Face Inference API.
        """
    )

    gr.HTML(
        """
        <div class="auth-warning">
            ⚠️ You must sign in with Hugging Face using the button below to use this app.
        </div>
        """
    )

    # Add login button - required for OAuth
    gr.LoginButton()

    gr.HTML(
        """
        <div class="info-box">
            <strong>💡 Tips for best results:</strong>
            <ul>
                <li>Use clear, well-lit images with a single main subject</li>
                <li>Write specific prompts describing the desired motion or action</li>
                <li>Keep prompts concise and focused on movement and audio elements</li>
                <li>Generation produces 5-second videos at 24 FPS with synchronized audio</li>
                <li>Processing may take 30-60 seconds depending on server load</li>
            </ul>
        </div>
        """
    )

    gr.HTML(
        """
        <div class="info-box">
            <strong>✨ Special Tokens for Enhanced Control:</strong>
            <ul>
                <li><strong>Speech:</strong> <code>&lt;S&gt;Your speech content here&lt;E&gt;</code> - Text enclosed in these tags will be converted to speech</li>
                <li><strong>Audio Description:</strong> <code>&lt;AUDCAP&gt;Audio description here&lt;ENDAUDCAP&gt;</code> - Describes the audio or sound effects present in the video</li>
            </ul>
            <br>
            <strong>📝 Example Prompt:</strong><br>
            <code>Dogs bark loudly at a man wearing a red shirt. The man says &lt;S&gt;Please stop barking at me!&lt;E&gt;. &lt;AUDCAP&gt;Dogs barking, angry man yelling in stern voice&lt;ENDAUDCAP&gt;.</code>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="📸 Upload Image",
                type="filepath",
                sources=["upload", "clipboard"],
                height=400,
            )

            prompt_input = gr.Textbox(
                label="✍️ Text Prompt",
                lines=3,
            )

            generate_btn = gr.Button(
                "🎬 Generate Video",
                variant="primary",
                size="lg",
            )

            clear_btn = gr.Button(
                "🗑️ Clear",
                variant="secondary",
            )

            gr.Examples(
                examples=[
                    [
                        "5.png",
                        'A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the "CHOICE FM" logo and various social media handles like "@ilovechoicefm" with "RALEIGH" below it. The man intently addresses the microphone, articulating, <S>is talent. It\'s all about authenticity. You gotta be who you really are, especially if you\'re working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>'
                    ]
                ],
                inputs=[image_input, prompt_input],
                label="Example",
            )

        with gr.Column(scale=1):
            video_output = gr.Video(
                label="🎥 Generated Video",
                height=400,
                autoplay=True,
            )

            gr.Markdown(
                """
                ### About Ovi Model

                **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation**

                Developed by Chetwin Low, Weimin Wang (Character AI) & Calder Katyal (Yale University)

                🌟 **Key Features:**

                - 🎬 **Video+Audio Generation**: Generates synchronized video and audio content simultaneously
                - 🔄 **Flexible Input**: Supports text-only or text+image conditioning
                - ⏱️ **5-second Videos**: Generates 5-second videos at 24 FPS
                - 📐 **Multiple Aspect Ratios**: Supports a 720×720 area at various ratios (9:16, 16:9, 1:1, etc.)

                Ovi is a Veo 3-style model that generates synchronized video and audio from text or text+image inputs.
                """
            )

    # Event handlers with authentication
    generate_btn.click(
        fn=generate_video_with_auth,
        inputs=[image_input, prompt_input],
        outputs=[video_output],
        queue=False,
        api_name=False,
        show_api=False,
    )
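
    # api_name=False / show_api=False keep the generate event off the public
    # API surface. The clear handler below resets all three components; the
    # lambda's 3-tuple maps positionally onto its outputs list.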
    clear_btn.click(
        fn=lambda: (None, "", None),
        inputs=None,
        outputs=[image_input, prompt_input, video_output],
        queue=False,
    )

    gr.Markdown(
        """
        ---
        ### 🚀 How it works

        1. **Sign in** with your Hugging Face account
        2. **Upload** your image - any photo or illustration
        3. **Describe** the motion you want to see in the prompt
        4. **Generate** and watch your image come to life!

        ### ⚠️ Notes

        - Video generation may take 30-60 seconds
        - Generates 5-second videos at 24 FPS with synchronized audio
        - Supports multiple aspect ratios (9:16, 16:9, 1:1, etc.) at a 720×720 area
        - Requires a valid Hugging Face token with Inference API access
        - Best results with clear, high-quality images
        - The model works best with realistic subjects and natural motions

        ### 📚 Resources

        - [Ovi Model Card](https://huggingface.co/chetwinlow1/Ovi)
        - [Hugging Face Inference API](https://huggingface.co/docs/huggingface_hub/guides/inference)
        - [Character AI](https://character.ai)
        """
    )


# Launch the app
if __name__ == "__main__":
    demo.launch(
        show_api=False,
        enable_monitoring=False,
        quiet=True,
    )