Gertie01 committed
Commit 2ec7ba4 · verified · 1 Parent(s): 83b1cb6

Deploy Gradio app with multiple files

Files changed (3)
  1. app.py +61 -0
  2. models.py +154 -0
  3. requirements.txt +11 -0
app.py ADDED
@@ -0,0 +1,61 @@
+ import gradio as gr
+ from PIL import Image
+ from models import remix_image
+
+ # Gradio interface
+ with gr.Blocks(theme=gr.themes.Soft(), title="Rodin.AI Image Remixer") as demo:
+     gr.HTML(
+         """
+         <div style="text-align: center; max-width: 700px; margin: 0 auto;">
+             <h1><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/gradio-logo.svg" alt="Gradio Logo" style="height: 1em;"> Rodin.AI Image Remixer</h1>
+             <p>Upload an image and provide a text prompt to remix it using a powerful diffusion model.
+             Adjust the creativity with denoising strength and prompt adherence with guidance scale.</p>
+             <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" style="text-decoration: underline;">anycoder</a></p>
+         </div>
+         """
+     )
+     with gr.Row():
+         with gr.Column(scale=1):
+             input_image = gr.Image(type="pil", label="Input Image", value="https://huggingface.co/datasets/gradio/rodin-ai/resolve/main/rodin.jpeg")
+             prompt = gr.Textbox(
+                 label="Prompt",
+                 placeholder="A high-quality photo of a medieval knight, highly detailed, realistic, cinematic lighting, dramatic",
+                 lines=2,
+                 value="A high-quality photo of a medieval knight, highly detailed, realistic, cinematic lighting, dramatic",
+             )
+             negative_prompt = gr.Textbox(
+                 label="Negative Prompt",
+                 placeholder="blurry, low quality, bad anatomy, deformed, ugly",
+                 lines=1,
+                 value="blurry, low quality, bad anatomy, deformed, ugly",
+             )
+             guidance_scale = gr.Slider(
+                 minimum=1.0,
+                 maximum=15.0,
+                 value=7.0,
+                 step=0.5,
+                 label="Guidance Scale (CFG)",
+                 info="Higher values make the image more aligned with the prompt.",
+             )
+             denoising_strength = gr.Slider(
+                 minimum=0.1,
+                 maximum=1.0,
+                 value=0.7,
+                 step=0.05,
+                 label="Denoising Strength",
+                 info="Higher values allow more changes to the original image. Lower values keep more of the original structure.",
+             )
+             remix_btn = gr.Button("Remix Image", variant="primary")
+         with gr.Column(scale=1):
+             output_image = gr.Image(label="Remixed Image", show_share_button=True)
+
+     remix_btn.click(
+         fn=remix_image,
+         inputs=[input_image, prompt, negative_prompt, guidance_scale, denoising_strength],
+         outputs=output_image,
+         api_name="remix_image"
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
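
Note: because the click handler is registered with api_name="remix_image", the endpoint can also be called programmatically once the Space is running. A minimal sketch with gradio_client; the Space ID and input path below are hypothetical placeholders:

    from gradio_client import Client, handle_file

    # Hypothetical Space ID; substitute the actual deployed Space.
    client = Client("Gertie01/rodin-ai-image-remixer")

    # Positional inputs mirror the click handler's `inputs` list.
    result = client.predict(
        handle_file("input.jpg"),                     # input_image (local path or URL)
        "A high-quality photo of a medieval knight",  # prompt
        "blurry, low quality",                        # negative_prompt
        7.0,                                          # guidance_scale
        0.7,                                          # denoising_strength
        api_name="/remix_image",
    )
    print(result)  # filepath of the remixed image
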
models.py ADDED
@@ -0,0 +1,154 @@
+ import spaces
+ import torch
+ from diffusers import AutoPipelineForImage2Image
+ from PIL import Image
+
+ # Model ID for Stable Diffusion XL Base
+ MODEL_ID = "stabilityai/stable-diffusion-xl-base-1.0"
+
+ # Load the image-to-image pipeline globally.
+ # float16 reduces memory usage and speeds up inference on GPU.
+ # Note: the generic DiffusionPipeline resolves this checkpoint to the
+ # text-to-image SDXL pipeline, which does not accept an `image` argument;
+ # AutoPipelineForImage2Image selects the img2img variant needed here.
+ pipe = AutoPipelineForImage2Image.from_pretrained(
+     MODEL_ID,
+     torch_dtype=torch.float16,
+     variant="fp16",  # Explicitly select the fp16 weights
+     use_safetensors=True
+ )
+ pipe.to("cuda")
+
+ # --- ZeroGPU AoT compilation (for local diffusion models) ---
+ # Compiles key components of the pipeline ahead-of-time (AoT) for a significant
+ # speedup (roughly 1.3x-1.8x) on Hugging Face Spaces. The @spaces.GPU decorator
+ # with a long duration lets the compilation finish during the Space's startup phase.
+ @spaces.GPU(duration=1500)  # Long duration so the startup task can complete
+ def compile_diffusion_pipeline_components():
+     print("Starting AoT compilation for diffusion pipeline components...")
+
+     # Compile text_encoder (CLIPTextModel)
+     print("Compiling pipe.text_encoder...")
+     with torch.no_grad():
+         # Prepare dummy input for text_encoder
+         text_input_ids = pipe.tokenizer(
+             "a test prompt",
+             padding="max_length",
+             max_length=pipe.tokenizer.model_max_length,
+             truncation=True,
+             return_tensors="pt",
+         ).input_ids.to("cuda")
+
+         # Capture the encoder call, export it, compile, and swap it in
+         with spaces.aoti_capture(pipe.text_encoder) as call:
+             pipe.text_encoder(text_input_ids)
+
+         exported_text_encoder = torch.export.export(
+             pipe.text_encoder,
+             args=call.args,
+             kwargs=call.kwargs,
+         )
+         compiled_text_encoder = spaces.aoti_compile(exported_text_encoder)
+         spaces.aoti_apply(compiled_text_encoder, pipe.text_encoder)
+         print("pipe.text_encoder compiled and applied.")
+
+     # Compile text_encoder_2 (CLIPTextModelWithProjection)
+     print("Compiling pipe.text_encoder_2...")
+     with torch.no_grad():
+         # Prepare dummy input for text_encoder_2
+         text_input_ids_2 = pipe.tokenizer_2(
+             "a test prompt",
+             padding="max_length",
+             max_length=pipe.tokenizer_2.model_max_length,
+             truncation=True,
+             return_tensors="pt",
+         ).input_ids.to("cuda")
+
+         with spaces.aoti_capture(pipe.text_encoder_2) as call:
+             pipe.text_encoder_2(text_input_ids_2)
+
+         exported_text_encoder_2 = torch.export.export(
+             pipe.text_encoder_2,
+             args=call.args,
+             kwargs=call.kwargs,
+         )
+         compiled_text_encoder_2 = spaces.aoti_compile(exported_text_encoder_2)
+         spaces.aoti_apply(compiled_text_encoder_2, pipe.text_encoder_2)
+         print("pipe.text_encoder_2 compiled and applied.")
+
+     # Compile the UNet (the most computationally intensive component).
+     # spaces.aoti_capture traces the UNet's forward pass inside a real pipeline
+     # call, so a minimal single-step image-to-image run supplies its inputs
+     # (latents, timestep, encoder_hidden_states, etc.).
+     print("Compiling pipe.unet...")
+     with torch.no_grad():
+         # A small dummy image (512x512); the pipeline resizes internally for SDXL.
+         dummy_input_image = Image.new("RGB", (512, 512), color="white")
+         dummy_prompt = "a small test image"
+
+         with spaces.aoti_capture(pipe.unet) as call:
+             _ = pipe(
+                 prompt=dummy_prompt,
+                 image=dummy_input_image,
+                 num_inference_steps=1,  # Minimal steps for faster capture
+                 guidance_scale=7.5,
+                 strength=0.8,  # diffusers img2img kwarg is `strength`, not `denoising_strength`
+                 output_type="pil",  # Ensure PIL output for compatibility
+             )
+
+         exported_unet = torch.export.export(
+             pipe.unet,
+             args=call.args,
+             kwargs=call.kwargs,
+         )
+         compiled_unet = spaces.aoti_compile(exported_unet)
+         spaces.aoti_apply(compiled_unet, pipe.unet)
+         print("pipe.unet compiled and applied.")
+     print("AoT compilation complete.")
+
+ # Run the compilation once during the startup of the Space
+ compile_diffusion_pipeline_components()
+
+ @spaces.GPU(duration=60)  # ZeroGPU decorator for the inference function
+ def remix_image(
+     image: Image.Image,
+     prompt: str,
+     negative_prompt: str,
+     guidance_scale: float,
+     denoising_strength: float,
+ ) -> Image.Image:
+     """
+     Remixes an input image based on a text prompt using a diffusion model.
+
+     Args:
+         image (PIL.Image.Image): The input image to remix.
+         prompt (str): The text prompt guiding the remixing.
+         negative_prompt (str): The negative prompt to guide generation away from.
+         guidance_scale (float): Classifier-free guidance scale.
+         denoising_strength (float): The strength of denoising applied to the image.
+             Higher values allow more creative freedom (more changes from the original);
+             lower values keep more of the original image's structure.
+
+     Returns:
+         PIL.Image.Image: The remixed image.
+     """
+     if image.mode != "RGB":
+         image = image.convert("RGB")
+
+     print(f"Generating image with prompt: {prompt}")
+     print(f"Negative prompt: {negative_prompt}")
+     print(f"Guidance scale: {guidance_scale}, Denoising strength: {denoising_strength}")
+
+     generated_images = pipe(
+         prompt=prompt,
+         image=image,
+         negative_prompt=negative_prompt,
+         guidance_scale=guidance_scale,
+         strength=denoising_strength,  # map the UI value onto the img2img `strength` kwarg
+         num_inference_steps=25,  # Good balance of quality and speed
+         output_type="pil",
+     ).images
+
+     return generated_images[0]  # Return the first generated image
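
For a quick check outside the Space, remix_image can be exercised directly on a machine with a CUDA GPU. A minimal smoke-test sketch; the file names are placeholders, and it assumes the spaces helpers degrade gracefully off Spaces (the @spaces.GPU decorator is documented as a no-op in non-ZeroGPU environments):

    from PIL import Image
    from models import remix_image  # loads the pipeline (and runs AoT compilation) at import time

    # Placeholder input path; any RGB image works.
    img = Image.open("input.jpg")
    out = remix_image(
        img,
        prompt="A high-quality photo of a medieval knight",
        negative_prompt="blurry, low quality",
        guidance_scale=7.0,
        denoising_strength=0.7,
    )
    out.save("remixed.jpg")
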
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio
+ torch
+ diffusers
+ transformers
+ accelerate
+ safetensors
+ Pillow
+ xformers
+ numpy
+ sentencepiece
+ spaces