Spaces:

Gertie01
/

demo-8efmrqxy

Runtime error

App Files Files Community

Gertie01 committed on 25 days ago

Commit

2ec7ba4

verified ·

1 Parent(s): 83b1cb6

Deploy Gradio app with multiple files

Browse files

Files changed (3) hide show

app.py +61 -0
models.py +154 -0
requirements.txt +11 -0

app.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import gradio as gr
+from PIL import Image
+from models import remix_image
+# Gradio Interface
+with gr.Blocks(theme=gr.themes.Soft(), title="Rodin.AI Image Remixer") as demo:
+    gr.HTML(
+        """
+        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
+            <h1><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/gradio-logo.svg" alt="Gradio Logo" style="height: 1em;"> Rodin.AI Image Remixer</h1>
+            <p>Upload an image and provide a text prompt to remix it using a powerful diffusion model.
+            Adjust the creativity with denoising strength and prompt adherence with guidance scale.
+            </p>
+            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" style="text-decoration: underline;">anycoder</a></p>
+        </div>
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            input_image = gr.Image(type="pil", label="Input Image", value="https://huggingface.co/datasets/gradio/rodin-ai/resolve/main/rodin.jpeg")
+            prompt = gr.Textbox(
+                label="Prompt",
+                placeholder="A high-quality photo of a medieval knight, highly detailed, realistic, cinematic lighting, dramatic",
+                lines=2,
+                value="A high-quality photo of a medieval knight, highly detailed, realistic, cinematic lighting, dramatic",
+            )
+            negative_prompt = gr.Textbox(
+                label="Negative Prompt",
+                placeholder="blurry, low quality, bad anatomy, deformed, ugly",
+                lines=1,
+                value="blurry, low quality, bad anatomy, deformed, ugly",
+            )
+            guidance_scale = gr.Slider(
+                minimum=1.0,
+                maximum=15.0,
+                value=7.0,
+                step=0.5,
+                label="Guidance Scale (CFG)",
+                info="Higher values make the image more aligned with the prompt.",
+            )
+            denoising_strength = gr.Slider(
+                minimum=0.1,
+                maximum=1.0,
+                value=0.7,
+                step=0.05,
+                label="Denoising Strength",
+                info="Higher values allow more changes to the original image. Lower values keep more of the original structure.",
+            )
+            remix_btn = gr.Button("Remix Image", variant="primary")
+        with gr.Column(scale=1):
+            output_image = gr.Image(label="Remixed Image", show_share_button=True)
+    remix_btn.click(
+        fn=remix_image,
+        inputs=[input_image, prompt, negative_prompt, guidance_scale, denoising_strength],
+        outputs=output_image,
+        api_name="remix_image"
+    )
+if __name__ == "__main__":
+    demo.launch()

models.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import os
+# `spaces` stays ahead of torch/diffusers, as in the original import order.
+import spaces
+import numpy as np # Required for some internal diffusers operations / data types
+import torch
+from diffusers import AutoPipelineForImage2Image, DiffusionPipeline
+from PIL import Image
+# Model ID for Stable Diffusion XL Base
+MODEL_ID = "stabilityai/stable-diffusion-xl-base-1.0"
+# Load the pipeline globally
+# Use float16 for reduced memory usage and faster inference on GPU
+pipe = DiffusionPipeline.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.float16,
+    variant="fp16", # Explicitly specify the fp16 variant
+    use_safetensors=True
+)
+pipe.to("cuda")
+# --- ZeroGPU AoT Compilation (MANDATORY for local diffusion models) ---
+# This function compiles key components of the diffusion pipeline ahead-of-time (AoT)
+# to achieve significant performance improvements (1.3x-1.8x speedup) on Hugging Face Spaces.
+# It uses the @spaces.GPU decorator with a long duration to ensure the compilation
+# completes during the Space's startup phase.
+@spaces.GPU(duration=1500) # Maximum duration allowed for startup tasks
+def compile_diffusion_pipeline_components():
+    print("Starting AoT compilation for Diffusion Pipeline components...")
+    # Compile text_encoder (CLIPTextModel)
+    print("Compiling pipe.text_encoder...")
+    with torch.no_grad():
+        # Prepare dummy input for text_encoder
+        text_input_ids = pipe.tokenizer(
+            "a test prompt",
+            padding="max_length",
+            max_length=pipe.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        ).input_ids.to("cuda")
+        # Capture and compile pipe.text_encoder
+        with spaces.aoti_capture(pipe.text_encoder) as call:
+            pipe.text_encoder(text_input_ids)
+        exported_text_encoder = torch.export.export(
+            pipe.text_encoder,
+            args=call.args,
+            kwargs=call.kwargs,
+        )
+        compiled_text_encoder = spaces.aoti_compile(exported_text_encoder)
+        spaces.aoti_apply(compiled_text_encoder, pipe.text_encoder)
+    print("pipe.text_encoder compiled and applied.")
+    # Compile text_encoder_2 (CLIPTextModelWithProjection)
+    print("Compiling pipe.text_encoder_2...")
+    with torch.no_grad():
+        # Prepare dummy input for text_encoder_2
+        text_input_ids_2 = pipe.tokenizer_2(
+            "a test prompt",
+            padding="max_length",
+            max_length=pipe.tokenizer_2.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        ).input_ids.to("cuda")
+        # Capture and compile pipe.text_encoder_2
+        with spaces.aoti_capture(pipe.text_encoder_2) as call:
+            pipe.text_encoder_2(text_input_ids_2)
+        exported_text_encoder_2 = torch.export.export(
+            pipe.text_encoder_2,
+            args=call.args,
+            kwargs=call.kwargs,
+        )
+        compiled_text_encoder_2 = spaces.aoti_compile(exported_text_encoder_2)
+        spaces.aoti_apply(compiled_text_encoder_2, pipe.text_encoder_2)
+    print("pipe.text_encoder_2 compiled and applied.")
+    # Compile UNet (the most computationally intensive part)
+    # The `spaces.aoti_capture` needs to trace the UNet's forward pass within a pipeline call.
+    # We will perform a minimal single-step image-to-image generation to capture the UNet's inputs.
+    print("Compiling pipe.unet...")
+    with torch.no_grad():
+        # Create a tiny dummy image (512x512 is typical minimum for SDXL, will be resized internally)
+        dummy_input_image = Image.new('RGB', (512, 512), color='white')
+        dummy_prompt = "a small test image"
+        # Capture the UNet's forward pass during a pipeline run
+        # This implicitly provides the complex inputs (latents, timestep, encoder_hidden_states, etc.)
+        with spaces.aoti_capture(pipe.unet) as call:
+            _ = pipe(
+                prompt=dummy_prompt,
+                image=dummy_input_image,
+                num_inference_steps=1, # Minimal steps for faster capture
+                guidance_scale=7.5,
+                denoising_strength=0.8,
+                output_type="pil" # Ensure PIL output for compatibility
+            )
+        exported_unet = torch.export.export(
+            pipe.unet,
+            args=call.args,
+            kwargs=call.kwargs,
+        )
+        compiled_unet = spaces.aoti_compile(exported_unet)
+        spaces.aoti_apply(compiled_unet, pipe.unet)
+    print("pipe.unet compiled and applied.")
+    print("AoT compilation complete.")
+# Call the compilation function once during the startup of the Space
+compile_diffusion_pipeline_components()
+@spaces.GPU(duration=60) # Decorate inference function with ZeroGPU
+def remix_image(
+    image: Image.Image,
+    prompt: str,
+    negative_prompt: str,
+    guidance_scale: float,
+    denoising_strength: float,
+) -> Image.Image:
+    """
+    Remixes an input image based on a text prompt using a diffusion model.
+    Args:
+        image (PIL.Image.Image): The input image to remix.
+        prompt (str): The text prompt guiding the remixing.
+        negative_prompt (str): The negative prompt to guide generation away from.
+        guidance_scale (float): Classifier-free guidance scale.
+        denoising_strength (float): The strength of denoising applied to the image.
+            Higher values allow more creative freedom (more changes from original).
+            Lower values keep more of the original image's structure.
+    Returns:
+        PIL.Image.Image: The remixed image.
+    """
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    print(f"Generating image with prompt: {prompt}")
+    print(f"Negative prompt: {negative_prompt}")
+    print(f"Guidance scale: {guidance_scale}, Denoising strength: {denoising_strength}")
+    generated_images = pipe(
+        prompt=prompt,
+        image=image,
+        negative_prompt=negative_prompt,
+        guidance_scale=guidance_scale,
+        denoising_strength=denoising_strength,
+        num_inference_steps=25, # Good balance of quality and speed
+        output_type="pil"
+    ).images
+    return generated_images[0] # Return the first generated image

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+gradio
+torch
+diffusers
+transformers
+accelerate
+safetensors
+Pillow
+xformers
+numpy
+sentencepiece
+spaces