Upload 91 files
This view is limited to 50 files because it contains too many changes; see the raw diff for the complete change set.
- .gitattributes +26 -0
- configs/distilled_model/gta_drive/config.json +49 -0
- configs/distilled_model/templerun/config.json +42 -0
- configs/distilled_model/universal/config.json +49 -0
- configs/foundation_model/config.json +49 -0
- configs/inference_yaml/inference_gta_drive.yaml +21 -0
- configs/inference_yaml/inference_templerun.yaml +22 -0
- configs/inference_yaml/inference_universal.yaml +21 -0
- demo_images/gta_drive/0000.png +3 -0
- demo_images/gta_drive/0001.png +3 -0
- demo_images/gta_drive/0002.png +3 -0
- demo_images/gta_drive/0003.png +3 -0
- demo_images/gta_drive/0004.png +3 -0
- demo_images/gta_drive/0005.png +3 -0
- demo_images/temple_run/0000.png +3 -0
- demo_images/temple_run/0001.png +3 -0
- demo_images/temple_run/0002.png +3 -0
- demo_images/temple_run/0003.png +3 -0
- demo_images/temple_run/0004.png +3 -0
- demo_images/temple_run/0005.png +3 -0
- demo_images/universal/0000.png +0 -0
- demo_images/universal/0001.png +3 -0
- demo_images/universal/0002.png +3 -0
- demo_images/universal/0003.png +3 -0
- demo_images/universal/0004.png +3 -0
- demo_images/universal/0005.png +3 -0
- demo_images/universal/0006.png +3 -0
- demo_images/universal/0007.png +3 -0
- demo_images/universal/0008.png +3 -0
- demo_images/universal/0009.png +3 -0
- demo_images/universal/0010.webp +0 -0
- demo_images/universal/0011.png +3 -0
- demo_images/universal/0012.png +3 -0
- demo_images/universal/0013.png +3 -0
- demo_images/universal/0014.png +3 -0
- demo_images/universal/0015.png +0 -0
- demo_images/universal/0016.png +3 -0
- demo_utils/constant.py +42 -0
- demo_utils/memory.py +135 -0
- demo_utils/taehv.py +313 -0
- demo_utils/utils.py +616 -0
- demo_utils/vae.py +390 -0
- demo_utils/vae_block3.py +291 -0
- demo_utils/vae_torch2trt.py +308 -0
- inference.py +169 -0
- inference_streaming.py +161 -0
- pipeline/__init__.py +5 -0
- pipeline/causal_inference.py +753 -0
- requirements.txt +41 -0
- setup.py +6 -0
.gitattributes
CHANGED
@@ -72,3 +72,29 @@
 GameWorldScore/GameWorld/third_party/DROID-SLAM/thirdparty/lietorch/examples/rgbdslam/assets/room.png filter=lfs diff=lfs merge=lfs -text
 GameWorldScore/GameWorld/third_party/DROID-SLAM/thirdparty/lietorch/lietorch.png filter=lfs diff=lfs merge=lfs -text
 GameWorldScore/GameWorld/third_party/RAFT/RAFT.png filter=lfs diff=lfs merge=lfs -text
+demo_images/gta_drive/0000.png filter=lfs diff=lfs merge=lfs -text
+demo_images/gta_drive/0001.png filter=lfs diff=lfs merge=lfs -text
+demo_images/gta_drive/0002.png filter=lfs diff=lfs merge=lfs -text
+demo_images/gta_drive/0003.png filter=lfs diff=lfs merge=lfs -text
+demo_images/gta_drive/0004.png filter=lfs diff=lfs merge=lfs -text
+demo_images/gta_drive/0005.png filter=lfs diff=lfs merge=lfs -text
+demo_images/temple_run/0000.png filter=lfs diff=lfs merge=lfs -text
+demo_images/temple_run/0001.png filter=lfs diff=lfs merge=lfs -text
+demo_images/temple_run/0002.png filter=lfs diff=lfs merge=lfs -text
+demo_images/temple_run/0003.png filter=lfs diff=lfs merge=lfs -text
+demo_images/temple_run/0004.png filter=lfs diff=lfs merge=lfs -text
+demo_images/temple_run/0005.png filter=lfs diff=lfs merge=lfs -text
+demo_images/universal/0001.png filter=lfs diff=lfs merge=lfs -text
+demo_images/universal/0002.png filter=lfs diff=lfs merge=lfs -text
+demo_images/universal/0003.png filter=lfs diff=lfs merge=lfs -text
+demo_images/universal/0004.png filter=lfs diff=lfs merge=lfs -text
+demo_images/universal/0005.png filter=lfs diff=lfs merge=lfs -text
+demo_images/universal/0006.png filter=lfs diff=lfs merge=lfs -text
+demo_images/universal/0007.png filter=lfs diff=lfs merge=lfs -text
+demo_images/universal/0008.png filter=lfs diff=lfs merge=lfs -text
+demo_images/universal/0009.png filter=lfs diff=lfs merge=lfs -text
+demo_images/universal/0011.png filter=lfs diff=lfs merge=lfs -text
+demo_images/universal/0012.png filter=lfs diff=lfs merge=lfs -text
+demo_images/universal/0013.png filter=lfs diff=lfs merge=lfs -text
+demo_images/universal/0014.png filter=lfs diff=lfs merge=lfs -text
+demo_images/universal/0016.png filter=lfs diff=lfs merge=lfs -text
configs/distilled_model/gta_drive/config.json
ADDED
@@ -0,0 +1,49 @@
{
  "_class_name": "CausalWanModel",
  "_diffusers_version": "0.35.0.dev0",
  "action_config": {
    "blocks": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
    "enable_keyboard": true,
    "enable_mouse": true,
    "heads_num": 16,
    "hidden_size": 128,
    "img_hidden_size": 1536,
    "keyboard_dim_in": 2,
    "keyboard_hidden_dim": 1024,
    "mouse_dim_in": 2,
    "mouse_hidden_dim": 1024,
    "mouse_qk_dim_list": [8, 28, 28],
    "patch_size": [1, 2, 2],
    "qk_norm": true,
    "qkv_bias": false,
    "rope_dim_list": [8, 28, 28],
    "rope_theta": 256,
    "vae_time_compression_ratio": 4,
    "windows_size": 3
  },
  "dim": 1536,
  "eps": 1e-06,
  "ffn_dim": 8960,
  "freq_dim": 256,
  "in_dim": 36,
  "inject_sample_info": false,
  "local_attn_size": 4,
  "model_type": "i2v",
  "num_heads": 12,
  "num_layers": 30,
  "out_dim": 16,
  "sink_size": 0,
  "text_len": 512
}
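For reference, a minimal sketch of reading this config from Python. It uses plain json only; how the repository actually instantiates CausalWanModel from it (for example via a diffusers-style from_config) is not shown in this diff, so treat the snippet as an illustration.

import json

with open("configs/distilled_model/gta_drive/config.json") as f:
    cfg = json.load(f)

print(cfg["_class_name"])                    # CausalWanModel
print(cfg["num_layers"], cfg["dim"])         # 30 transformer blocks, hidden size 1536
print(cfg["action_config"]["enable_mouse"])  # True: the GTA-Drive model conditions on mouse input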
configs/distilled_model/templerun/config.json
ADDED
@@ -0,0 +1,42 @@
{
  "_class_name": "CausalWanModel",
  "_diffusers_version": "0.35.0.dev0",
  "action_config": {
    "blocks": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
    "enable_keyboard": true,
    "enable_mouse": false,
    "heads_num": 16,
    "hidden_size": 128,
    "img_hidden_size": 1536,
    "keyboard_dim_in": 7,
    "keyboard_hidden_dim": 1024,
    "patch_size": [1, 2, 2],
    "qk_norm": true,
    "qkv_bias": false,
    "rope_dim_list": [8, 28, 28],
    "rope_theta": 256,
    "vae_time_compression_ratio": 4,
    "windows_size": 3
  },
  "dim": 1536,
  "eps": 1e-06,
  "ffn_dim": 8960,
  "freq_dim": 256,
  "in_dim": 36,
  "inject_sample_info": false,
  "local_attn_size": 6,
  "model_type": "i2v",
  "num_heads": 12,
  "num_layers": 30,
  "out_dim": 16,
  "sink_size": 0,
  "text_len": 512
}
configs/distilled_model/universal/config.json
ADDED
@@ -0,0 +1,49 @@
{
  "_class_name": "CausalWanModel",
  "_diffusers_version": "0.35.0.dev0",
  "action_config": {
    "blocks": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
    "enable_keyboard": true,
    "enable_mouse": true,
    "heads_num": 16,
    "hidden_size": 128,
    "img_hidden_size": 1536,
    "keyboard_dim_in": 4,
    "keyboard_hidden_dim": 1024,
    "mouse_dim_in": 2,
    "mouse_hidden_dim": 1024,
    "mouse_qk_dim_list": [8, 28, 28],
    "patch_size": [1, 2, 2],
    "qk_norm": true,
    "qkv_bias": false,
    "rope_dim_list": [8, 28, 28],
    "rope_theta": 256,
    "vae_time_compression_ratio": 4,
    "windows_size": 3
  },
  "dim": 1536,
  "eps": 1e-06,
  "ffn_dim": 8960,
  "freq_dim": 256,
  "in_dim": 36,
  "inject_sample_info": false,
  "local_attn_size": 6,
  "model_type": "i2v",
  "num_heads": 12,
  "num_layers": 30,
  "out_dim": 16,
  "sink_size": 0,
  "text_len": 512
}
configs/foundation_model/config.json
ADDED
@@ -0,0 +1,49 @@
{
  "_class_name": "CausalWanModel",
  "_diffusers_version": "0.35.0.dev0",
  "action_config": {
    "blocks": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
    "enable_keyboard": true,
    "enable_mouse": true,
    "heads_num": 16,
    "hidden_size": 128,
    "img_hidden_size": 1536,
    "keyboard_dim_in": 4,
    "keyboard_hidden_dim": 1024,
    "mouse_dim_in": 2,
    "mouse_hidden_dim": 1024,
    "mouse_qk_dim_list": [8, 28, 28],
    "patch_size": [1, 2, 2],
    "qk_norm": true,
    "qkv_bias": false,
    "rope_dim_list": [8, 28, 28],
    "rope_theta": 256,
    "vae_time_compression_ratio": 4,
    "windows_size": 3
  },
  "dim": 1536,
  "eps": 1e-06,
  "ffn_dim": 8960,
  "freq_dim": 256,
  "in_dim": 36,
  "inject_sample_info": false,
  "local_attn_size": -1,
  "model_type": "i2v",
  "num_heads": 12,
  "num_layers": 30,
  "out_dim": 16,
  "sink_size": 0,
  "text_len": 512
}
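The four configs above differ mainly in how the action conditioning is wired and how wide the attention window is. A small sketch to diff them side by side, assuming the paths are read from the repository root as in this commit:

import json

paths = {
    "gta_drive": "configs/distilled_model/gta_drive/config.json",
    "templerun": "configs/distilled_model/templerun/config.json",
    "universal": "configs/distilled_model/universal/config.json",
    "foundation": "configs/foundation_model/config.json",
}
for name, path in paths.items():
    cfg = json.load(open(path))
    ac = cfg["action_config"]
    print(name, len(ac["blocks"]), ac.get("enable_mouse", False),
          ac["keyboard_dim_in"], cfg["local_attn_size"])

# Distilled models condition 15 blocks and use a local attention window (4 or 6);
# the foundation model conditions all 30 blocks and uses global attention (local_attn_size = -1);
# Temple Run disables mouse input and uses a 7-dim keyboard encoding.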
configs/inference_yaml/inference_gta_drive.yaml
ADDED
@@ -0,0 +1,21 @@
denoising_step_list:
  - 1000
  - 666
  - 333
warp_denoising_step: true
ts_schedule: false
mixed_precision: true
seed: 42
image_or_video_shape:
  - 1
  - 16
  - 15
  - 44
  - 80
num_frame_per_block: 3
context_noise: 0
mode: gta_drive
causal: true
model_kwargs:
  timestep_shift: 5.0
model_config: configs/distilled_model/gta_drive
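A hedged sketch of reading this YAML from Python with PyYAML (assumed available); how inference.py actually consumes these fields is not part of this diff, and the shape interpretation in the comment is an assumption based on the model configs (16 latent channels, 44x80 latent resolution):

import yaml

with open("configs/inference_yaml/inference_gta_drive.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["denoising_step_list"])   # [1000, 666, 333]: three distilled denoising steps
print(cfg["image_or_video_shape"])  # [1, 16, 15, 44, 80], presumably batch x channels x frames x height x width
print(cfg["model_config"])          # configs/distilled_model/gta_drive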
configs/inference_yaml/inference_templerun.yaml
ADDED
@@ -0,0 +1,22 @@
denoising_step_list:
  - 1000
  - 750
  - 500
  - 250
warp_denoising_step: true
ts_schedule: false
mixed_precision: true
seed: 42
image_or_video_shape:
  - 1
  - 16
  - 15
  - 44
  - 80
num_frame_per_block: 3
context_noise: 0
mode: templerun
causal: true
model_kwargs:
  timestep_shift: 5.0
model_config: configs/distilled_model/templerun
configs/inference_yaml/inference_universal.yaml
ADDED
@@ -0,0 +1,21 @@
denoising_step_list:
  - 1000
  - 666
  - 333
warp_denoising_step: true
ts_schedule: false
mixed_precision: true
seed: 42
image_or_video_shape:
  - 1
  - 16
  - 15
  - 44
  - 80
num_frame_per_block: 3
context_noise: 0
mode: universal
causal: true
model_kwargs:
  timestep_shift: 5.0
model_config: configs/distilled_model/universal
demo_images/gta_drive/0000.png ADDED (Git LFS)
demo_images/gta_drive/0001.png ADDED (Git LFS)
demo_images/gta_drive/0002.png ADDED (Git LFS)
demo_images/gta_drive/0003.png ADDED (Git LFS)
demo_images/gta_drive/0004.png ADDED (Git LFS)
demo_images/gta_drive/0005.png ADDED (Git LFS)
demo_images/temple_run/0000.png ADDED (Git LFS)
demo_images/temple_run/0001.png ADDED (Git LFS)
demo_images/temple_run/0002.png ADDED (Git LFS)
demo_images/temple_run/0003.png ADDED (Git LFS)
demo_images/temple_run/0004.png ADDED (Git LFS)
demo_images/temple_run/0005.png ADDED (Git LFS)
demo_images/universal/0000.png ADDED
demo_images/universal/0001.png ADDED (Git LFS)
demo_images/universal/0002.png ADDED (Git LFS)
demo_images/universal/0003.png ADDED (Git LFS)
demo_images/universal/0004.png ADDED (Git LFS)
demo_images/universal/0005.png ADDED (Git LFS)
demo_images/universal/0006.png ADDED (Git LFS)
demo_images/universal/0007.png ADDED (Git LFS)
demo_images/universal/0008.png ADDED (Git LFS)
demo_images/universal/0009.png ADDED (Git LFS)
demo_images/universal/0010.webp ADDED
demo_images/universal/0011.png ADDED (Git LFS)
demo_images/universal/0012.png ADDED (Git LFS)
demo_images/universal/0013.png ADDED (Git LFS)
demo_images/universal/0014.png ADDED (Git LFS)
demo_images/universal/0015.png ADDED
demo_images/universal/0016.png ADDED (Git LFS)
demo_utils/constant.py
ADDED
@@ -0,0 +1,42 @@
import torch

base_size = 80
base_size2 = 44
ZERO_VAE_CACHE = [
    torch.zeros(1, 16, 2, base_size2, base_size),
    torch.zeros(1, 384, 2, base_size2, base_size),
    torch.zeros(1, 384, 2, base_size2, base_size),
    torch.zeros(1, 384, 2, base_size2, base_size),
    torch.zeros(1, 384, 2, base_size2, base_size),
    torch.zeros(1, 384, 2, base_size2, base_size),
    torch.zeros(1, 384, 2, base_size2, base_size),
    torch.zeros(1, 384, 2, base_size2, base_size),
    torch.zeros(1, 384, 2, base_size2, base_size),
    torch.zeros(1, 384, 2, base_size2, base_size),
    torch.zeros(1, 384, 2, base_size2, base_size),
    torch.zeros(1, 384, 2, base_size2, base_size),
    torch.zeros(1, 192, 2, base_size2*2, base_size*2),
    torch.zeros(1, 384, 2, base_size2*2, base_size*2),
    torch.zeros(1, 384, 2, base_size2*2, base_size*2),
    torch.zeros(1, 384, 2, base_size2*2, base_size*2),
    torch.zeros(1, 384, 2, base_size2*2, base_size*2),
    torch.zeros(1, 384, 2, base_size2*2, base_size*2),
    torch.zeros(1, 384, 2, base_size2*2, base_size*2),
    torch.zeros(1, 192, 2, base_size2*4, base_size*4),
    torch.zeros(1, 192, 2, base_size2*4, base_size*4),
    torch.zeros(1, 192, 2, base_size2*4, base_size*4),
    torch.zeros(1, 192, 2, base_size2*4, base_size*4),
    torch.zeros(1, 192, 2, base_size2*4, base_size*4),
    torch.zeros(1, 192, 2, base_size2*4, base_size*4),
    torch.zeros(1, 96, 2, base_size2*8, base_size*8),
    torch.zeros(1, 96, 2, base_size2*8, base_size*8),
    torch.zeros(1, 96, 2, base_size2*8, base_size*8),
    torch.zeros(1, 96, 2, base_size2*8, base_size*8),
    torch.zeros(1, 96, 2, base_size2*8, base_size*8),
    torch.zeros(1, 96, 2, base_size2*8, base_size*8),
    torch.zeros(1, 96, 2, base_size2*8, base_size*8)
]

feat_names = [f"vae_cache_{i}" for i in range(len(ZERO_VAE_CACHE))]
ALL_INPUTS_NAMES = ["z", "use_cache"] + feat_names
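ZERO_VAE_CACHE and ALL_INPUTS_NAMES look like the named inputs of an exported VAE decoder (for example the demo_utils/vae_torch2trt.py path in this commit), but the consumer is not shown here. A hypothetical sketch of pairing the names with tensors; the "z" and "use_cache" shapes below are assumptions:

import torch
from demo_utils.constant import ZERO_VAE_CACHE, ALL_INPUTS_NAMES

z = torch.zeros(1, 16, 2, 44, 80)  # assumed latent input shape, matching the first cache entry
use_cache = torch.zeros(1)         # assumed placeholder flag
named_inputs = dict(zip(ALL_INPUTS_NAMES, [z, use_cache] + ZERO_VAE_CACHE))
print(len(named_inputs), list(named_inputs)[:3])  # 34 inputs: z, use_cache, vae_cache_0, ...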
demo_utils/memory.py
ADDED
@@ -0,0 +1,135 @@
# Copied from https://github.com/lllyasviel/FramePack/tree/main/demo_utils
# Apache-2.0 License
# By lllyasviel

import torch


cpu = torch.device('cpu')
gpu = torch.device(f'cuda:{torch.cuda.current_device()}')
gpu_complete_modules = []


class DynamicSwapInstaller:
    @staticmethod
    def _install_module(module: torch.nn.Module, **kwargs):
        original_class = module.__class__
        module.__dict__['forge_backup_original_class'] = original_class

        def hacked_get_attr(self, name: str):
            if '_parameters' in self.__dict__:
                _parameters = self.__dict__['_parameters']
                if name in _parameters:
                    p = _parameters[name]
                    if p is None:
                        return None
                    if p.__class__ == torch.nn.Parameter:
                        return torch.nn.Parameter(p.to(**kwargs), requires_grad=p.requires_grad)
                    else:
                        return p.to(**kwargs)
            if '_buffers' in self.__dict__:
                _buffers = self.__dict__['_buffers']
                if name in _buffers:
                    return _buffers[name].to(**kwargs)
            return super(original_class, self).__getattr__(name)

        module.__class__ = type('DynamicSwap_' + original_class.__name__, (original_class,), {
            '__getattr__': hacked_get_attr,
        })

        return

    @staticmethod
    def _uninstall_module(module: torch.nn.Module):
        if 'forge_backup_original_class' in module.__dict__:
            module.__class__ = module.__dict__.pop('forge_backup_original_class')
        return

    @staticmethod
    def install_model(model: torch.nn.Module, **kwargs):
        for m in model.modules():
            DynamicSwapInstaller._install_module(m, **kwargs)
        return

    @staticmethod
    def uninstall_model(model: torch.nn.Module):
        for m in model.modules():
            DynamicSwapInstaller._uninstall_module(m)
        return


def fake_diffusers_current_device(model: torch.nn.Module, target_device: torch.device):
    if hasattr(model, 'scale_shift_table'):
        model.scale_shift_table.data = model.scale_shift_table.data.to(target_device)
        return

    for k, p in model.named_modules():
        if hasattr(p, 'weight'):
            p.to(target_device)
            return


def get_cuda_free_memory_gb(device=None):
    if device is None:
        device = gpu

    memory_stats = torch.cuda.memory_stats(device)
    bytes_active = memory_stats['active_bytes.all.current']
    bytes_reserved = memory_stats['reserved_bytes.all.current']
    bytes_free_cuda, _ = torch.cuda.mem_get_info(device)
    bytes_inactive_reserved = bytes_reserved - bytes_active
    bytes_total_available = bytes_free_cuda + bytes_inactive_reserved
    return bytes_total_available / (1024 ** 3)


def move_model_to_device_with_memory_preservation(model, target_device, preserved_memory_gb=0):
    print(f'Moving {model.__class__.__name__} to {target_device} with preserved memory: {preserved_memory_gb} GB')

    for m in model.modules():
        if get_cuda_free_memory_gb(target_device) <= preserved_memory_gb:
            torch.cuda.empty_cache()
            return

        if hasattr(m, 'weight'):
            m.to(device=target_device)

    model.to(device=target_device)
    torch.cuda.empty_cache()
    return


def offload_model_from_device_for_memory_preservation(model, target_device, preserved_memory_gb=0):
    print(f'Offloading {model.__class__.__name__} from {target_device} to preserve memory: {preserved_memory_gb} GB')

    for m in model.modules():
        if get_cuda_free_memory_gb(target_device) >= preserved_memory_gb:
            torch.cuda.empty_cache()
            return

        if hasattr(m, 'weight'):
            m.to(device=cpu)

    model.to(device=cpu)
    torch.cuda.empty_cache()
    return


def unload_complete_models(*args):
    for m in gpu_complete_modules + list(args):
        m.to(device=cpu)
        print(f'Unloaded {m.__class__.__name__} as complete.')

    gpu_complete_modules.clear()
    torch.cuda.empty_cache()
    return


def load_model_as_complete(model, target_device, unload=True):
    if unload:
        unload_complete_models()

    model.to(device=target_device)
    print(f'Loaded {model.__class__.__name__} to {target_device} as complete.')

    gpu_complete_modules.append(model)
    return
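A minimal usage sketch for the helpers above, assuming a CUDA device is present (the gpu handle is resolved at import time) and using a stand-in module in place of a real diffusion model:

import torch
from demo_utils.memory import (
    DynamicSwapInstaller, get_cuda_free_memory_gb, gpu,
    move_model_to_device_with_memory_preservation, unload_complete_models,
)

model = torch.nn.Linear(8, 8)  # stand-in for a large torch.nn.Module

print(f"{get_cuda_free_memory_gb(gpu):.1f} GB free")

# Option 1: keep weights on the CPU and move them to the GPU on attribute access.
DynamicSwapInstaller.install_model(model, device=gpu)
# ... run inference ...
DynamicSwapInstaller.uninstall_model(model)

# Option 2: move as many submodules as fit while keeping 6 GB of headroom free.
move_model_to_device_with_memory_preservation(model, gpu, preserved_memory_gb=6)
unload_complete_models(model)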
demo_utils/taehv.py
ADDED
@@ -0,0 +1,313 @@
#!/usr/bin/env python3
"""
Tiny AutoEncoder for Hunyuan Video
(DNN for encoding / decoding videos to Hunyuan Video's latent space)
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm
from collections import namedtuple

DecoderResult = namedtuple("DecoderResult", ("frame", "memory"))
TWorkItem = namedtuple("TWorkItem", ("input_tensor", "block_index"))


def conv(n_in, n_out, **kwargs):
    return nn.Conv2d(n_in, n_out, 3, padding=1, **kwargs)


class Clamp(nn.Module):
    def forward(self, x):
        return torch.tanh(x / 3) * 3


class MemBlock(nn.Module):
    def __init__(self, n_in, n_out):
        super().__init__()
        self.conv = nn.Sequential(conv(n_in * 2, n_out), nn.ReLU(inplace=True),
                                  conv(n_out, n_out), nn.ReLU(inplace=True), conv(n_out, n_out))
        self.skip = nn.Conv2d(n_in, n_out, 1, bias=False) if n_in != n_out else nn.Identity()
        self.act = nn.ReLU(inplace=True)

    def forward(self, x, past):
        return self.act(self.conv(torch.cat([x, past], 1)) + self.skip(x))


class TPool(nn.Module):
    def __init__(self, n_f, stride):
        super().__init__()
        self.stride = stride
        self.conv = nn.Conv2d(n_f * stride, n_f, 1, bias=False)

    def forward(self, x):
        _NT, C, H, W = x.shape
        return self.conv(x.reshape(-1, self.stride * C, H, W))


class TGrow(nn.Module):
    def __init__(self, n_f, stride):
        super().__init__()
        self.stride = stride
        self.conv = nn.Conv2d(n_f, n_f * stride, 1, bias=False)

    def forward(self, x):
        _NT, C, H, W = x.shape
        x = self.conv(x)
        return x.reshape(-1, C, H, W)


def apply_model_with_memblocks(model, x, parallel, show_progress_bar):
    """
    Apply a sequential model with memblocks to the given input.
    Args:
    - model: nn.Sequential of blocks to apply
    - x: input data, of dimensions NTCHW
    - parallel: if True, parallelize over timesteps (fast but uses O(T) memory)
      if False, each timestep will be processed sequentially (slow but uses O(1) memory)
    - show_progress_bar: if True, enables tqdm progressbar display

    Returns NTCHW tensor of output data.
    """
    assert x.ndim == 5, f"TAEHV operates on NTCHW tensors, but got {x.ndim}-dim tensor"
    N, T, C, H, W = x.shape
    if parallel:
        x = x.reshape(N * T, C, H, W)
        # parallel over input timesteps, iterate over blocks
        for b in tqdm(model, disable=not show_progress_bar):
            if isinstance(b, MemBlock):
                NT, C, H, W = x.shape
                T = NT // N
                _x = x.reshape(N, T, C, H, W)
                mem = F.pad(_x, (0, 0, 0, 0, 0, 0, 1, 0), value=0)[:, :T].reshape(x.shape)
                x = b(x, mem)
            else:
                x = b(x)
        NT, C, H, W = x.shape
        T = NT // N
        x = x.view(N, T, C, H, W)
    else:
        # TODO(oboerbohan): at least on macos this still gradually uses more memory during decode...
        # need to fix :(
        out = []
        # iterate over input timesteps and also iterate over blocks.
        # because of the cursed TPool/TGrow blocks, this is not a nested loop,
        # it's actually a ***graph traversal*** problem! so let's make a queue
        work_queue = [TWorkItem(xt, 0) for t, xt in enumerate(x.reshape(N, T * C, H, W).chunk(T, dim=1))]
        # in addition to manually managing our queue, we also need to manually manage our progressbar.
        # we'll update it for every source node that we consume.
        progress_bar = tqdm(range(T), disable=not show_progress_bar)
        # we'll also need a separate addressable memory per node as well
        mem = [None] * len(model)
        while work_queue:
            xt, i = work_queue.pop(0)
            if i == 0:
                # new source node consumed
                progress_bar.update(1)
            if i == len(model):
                # reached end of the graph, append result to output list
                out.append(xt)
            else:
                # fetch the block to process
                b = model[i]
                if isinstance(b, MemBlock):
                    # mem blocks are simple since we're visiting the graph in causal order
                    if mem[i] is None:
                        xt_new = b(xt, xt * 0)
                        mem[i] = xt
                    else:
                        xt_new = b(xt, mem[i])
                        mem[i].copy_(xt)  # inplace might reduce mysterious pytorch memory allocations? doesn't help though
                    # add successor to work queue
                    work_queue.insert(0, TWorkItem(xt_new, i + 1))
                elif isinstance(b, TPool):
                    # pool blocks are miserable
                    if mem[i] is None:
                        mem[i] = []  # pool memory is itself a queue of inputs to pool
                    mem[i].append(xt)
                    if len(mem[i]) > b.stride:
                        # pool mem is in invalid state, we should have pooled before this
                        raise ValueError("???")
                    elif len(mem[i]) < b.stride:
                        # pool mem is not yet full, go back to processing the work queue
                        pass
                    else:
                        # pool mem is ready, run the pool block
                        N, C, H, W = xt.shape
                        xt = b(torch.cat(mem[i], 1).view(N * b.stride, C, H, W))
                        # reset the pool mem
                        mem[i] = []
                        # add successor to work queue
                        work_queue.insert(0, TWorkItem(xt, i + 1))
                elif isinstance(b, TGrow):
                    xt = b(xt)
                    NT, C, H, W = xt.shape
                    # each tgrow has multiple successor nodes
                    for xt_next in reversed(xt.view(N, b.stride * C, H, W).chunk(b.stride, 1)):
                        # add successor to work queue
                        work_queue.insert(0, TWorkItem(xt_next, i + 1))
                else:
                    # normal block with no funny business
                    xt = b(xt)
                    # add successor to work queue
                    work_queue.insert(0, TWorkItem(xt, i + 1))
        progress_bar.close()
        x = torch.stack(out, 1)
    return x


class TAEHV(nn.Module):
    latent_channels = 16
    image_channels = 3

    def __init__(self, checkpoint_path="taehv.pth", decoder_time_upscale=(True, True), decoder_space_upscale=(True, True, True)):
        """Initialize pretrained TAEHV from the given checkpoint.

        Args:
            checkpoint_path: path to weight file to load. taehv.pth for Hunyuan, taew2_1.pth for Wan 2.1.
            decoder_time_upscale: whether temporal upsampling is enabled for each block. upsampling can be disabled for a cheaper preview.
            decoder_space_upscale: whether spatial upsampling is enabled for each block. upsampling can be disabled for a cheaper preview.
        """
        super().__init__()
        self.encoder = nn.Sequential(
            conv(TAEHV.image_channels, 64), nn.ReLU(inplace=True),
            TPool(64, 2), conv(64, 64, stride=2, bias=False), MemBlock(64, 64), MemBlock(64, 64), MemBlock(64, 64),
            TPool(64, 2), conv(64, 64, stride=2, bias=False), MemBlock(64, 64), MemBlock(64, 64), MemBlock(64, 64),
            TPool(64, 1), conv(64, 64, stride=2, bias=False), MemBlock(64, 64), MemBlock(64, 64), MemBlock(64, 64),
            conv(64, TAEHV.latent_channels),
        )
        n_f = [256, 128, 64, 64]
        self.frames_to_trim = 2**sum(decoder_time_upscale) - 1
        self.decoder = nn.Sequential(
            Clamp(), conv(TAEHV.latent_channels, n_f[0]), nn.ReLU(inplace=True),
            MemBlock(n_f[0], n_f[0]), MemBlock(n_f[0], n_f[0]), MemBlock(n_f[0], n_f[0]),
            nn.Upsample(scale_factor=2 if decoder_space_upscale[0] else 1), TGrow(n_f[0], 1), conv(n_f[0], n_f[1], bias=False),
            MemBlock(n_f[1], n_f[1]), MemBlock(n_f[1], n_f[1]), MemBlock(n_f[1], n_f[1]),
            nn.Upsample(scale_factor=2 if decoder_space_upscale[1] else 1), TGrow(n_f[1], 2 if decoder_time_upscale[0] else 1), conv(n_f[1], n_f[2], bias=False),
            MemBlock(n_f[2], n_f[2]), MemBlock(n_f[2], n_f[2]), MemBlock(n_f[2], n_f[2]),
            nn.Upsample(scale_factor=2 if decoder_space_upscale[2] else 1), TGrow(n_f[2], 2 if decoder_time_upscale[1] else 1), conv(n_f[2], n_f[3], bias=False),
            nn.ReLU(inplace=True), conv(n_f[3], TAEHV.image_channels),
        )
        if checkpoint_path is not None:
            self.load_state_dict(self.patch_tgrow_layers(torch.load(
                checkpoint_path, map_location="cpu", weights_only=True)))

    def patch_tgrow_layers(self, sd):
        """Patch TGrow layers to use a smaller kernel if needed.

        Args:
            sd: state dict to patch
        """
        new_sd = self.state_dict()
        for i, layer in enumerate(self.decoder):
            if isinstance(layer, TGrow):
                key = f"decoder.{i}.conv.weight"
                if sd[key].shape[0] > new_sd[key].shape[0]:
                    # take the last-timestep output channels
                    sd[key] = sd[key][-new_sd[key].shape[0]:]
        return sd

    def encode_video(self, x, parallel=True, show_progress_bar=True):
        """Encode a sequence of frames.

        Args:
            x: input NTCHW RGB (C=3) tensor with values in [0, 1].
            parallel: if True, all frames will be processed at once.
              (this is faster but may require more memory).
              if False, frames will be processed sequentially.
        Returns NTCHW latent tensor with ~Gaussian values.
        """
        return apply_model_with_memblocks(self.encoder, x, parallel, show_progress_bar)

    def decode_video(self, x, parallel=True, show_progress_bar=False):
        """Decode a sequence of frames.

        Args:
            x: input NTCHW latent (C=12) tensor with ~Gaussian values.
            parallel: if True, all frames will be processed at once.
              (this is faster but may require more memory).
              if False, frames will be processed sequentially.
        Returns NTCHW RGB tensor with ~[0, 1] values.
        """
        x = apply_model_with_memblocks(self.decoder, x, parallel, show_progress_bar)
        # return x[:, self.frames_to_trim:]
        return x

    def forward(self, x):
        return self.c(x)


@torch.no_grad()
def main():
    """Run TAEHV roundtrip reconstruction on the given video paths."""
    import os
    import sys
    import cv2  # no highly esteemed deed is commemorated here

    class VideoTensorReader:
        def __init__(self, video_file_path):
            self.cap = cv2.VideoCapture(video_file_path)
            assert self.cap.isOpened(), f"Could not load {video_file_path}"
            self.fps = self.cap.get(cv2.CAP_PROP_FPS)

        def __iter__(self):
            return self

        def __next__(self):
            ret, frame = self.cap.read()
            if not ret:
                self.cap.release()
                raise StopIteration  # End of video or error
            return torch.from_numpy(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)).permute(2, 0, 1)  # BGR HWC -> RGB CHW

    class VideoTensorWriter:
        def __init__(self, video_file_path, width_height, fps=30):
            self.writer = cv2.VideoWriter(video_file_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, width_height)
            assert self.writer.isOpened(), f"Could not create writer for {video_file_path}"

        def write(self, frame_tensor):
            assert frame_tensor.ndim == 3 and frame_tensor.shape[0] == 3, f"{frame_tensor.shape}??"
            self.writer.write(cv2.cvtColor(frame_tensor.permute(1, 2, 0).numpy(),
                                           cv2.COLOR_RGB2BGR))  # RGB CHW -> BGR HWC

        def __del__(self):
            if hasattr(self, 'writer'):
                self.writer.release()

    dev = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
    dtype = torch.float16
    checkpoint_path = os.getenv("TAEHV_CHECKPOINT_PATH", "taehv.pth")
    checkpoint_name = os.path.splitext(os.path.basename(checkpoint_path))[0]
    print(
        f"Using device \033[31m{dev}\033[0m, dtype \033[32m{dtype}\033[0m, checkpoint \033[34m{checkpoint_name}\033[0m ({checkpoint_path})")
    taehv = TAEHV(checkpoint_path=checkpoint_path).to(dev, dtype)
    for video_path in sys.argv[1:]:
        print(f"Processing {video_path}...")
        video_in = VideoTensorReader(video_path)
        video = torch.stack(list(video_in), 0)[None]
        vid_dev = video.to(dev, dtype).div_(255.0)
        # convert to device tensor
        if video.numel() < 100_000_000:
            print(f"  {video_path} seems small enough, will process all frames in parallel")
            vid_enc = taehv.encode_video(vid_dev)
            print(f"  Encoded {video_path} -> {vid_enc.shape}. Decoding...")
            vid_dec = taehv.decode_video(vid_enc)
            print(f"  Decoded {video_path} -> {vid_dec.shape}")
        else:
            print(f"  {video_path} seems large, will process each frame sequentially")
            vid_enc = taehv.encode_video(vid_dev, parallel=False)
            print(f"  Encoded {video_path} -> {vid_enc.shape}. Decoding...")
            vid_dec = taehv.decode_video(vid_enc, parallel=False)
            print(f"  Decoded {video_path} -> {vid_dec.shape}")
        video_out_path = video_path + f".reconstructed_by_{checkpoint_name}.mp4"
        video_out = VideoTensorWriter(
            video_out_path, (vid_dec.shape[-1], vid_dec.shape[-2]), fps=int(round(video_in.fps)))
        for frame in vid_dec.clamp_(0, 1).mul_(255).round_().byte().cpu()[0]:
            video_out.write(frame)
        print(f"  Saved to {video_out_path}")


if __name__ == "__main__":
    main()
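A minimal roundtrip sketch for TAEHV, mirroring what main() does per video but with a random tensor; it assumes a local taehv.pth checkpoint and uses 8 frames so the temporal pooling divides evenly:

import torch
from demo_utils.taehv import TAEHV

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
taehv = TAEHV(checkpoint_path="taehv.pth").to(device, dtype)

video = torch.rand(1, 8, 3, 256, 256, device=device, dtype=dtype)  # NTCHW RGB in [0, 1]
with torch.no_grad():
    latents = taehv.encode_video(video)   # roughly N x T/4 x 16 x H/8 x W/8, here (1, 2, 16, 32, 32)
    frames = taehv.decode_video(latents)  # back to an NTCHW RGB tensor in ~[0, 1]
print(latents.shape, frames.shape)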
demo_utils/utils.py
ADDED
@@ -0,0 +1,616 @@
# Copied from https://github.com/lllyasviel/FramePack/tree/main/demo_utils
# Apache-2.0 License
# By lllyasviel

import os
import cv2
import json
import random
import glob
import torch
import einops
import numpy as np
import datetime
import torchvision

from PIL import Image


def min_resize(x, m):
    if x.shape[0] < x.shape[1]:
        s0 = m
        s1 = int(float(m) / float(x.shape[0]) * float(x.shape[1]))
    else:
        s0 = int(float(m) / float(x.shape[1]) * float(x.shape[0]))
        s1 = m
    new_max = max(s1, s0)
    raw_max = max(x.shape[0], x.shape[1])
    if new_max < raw_max:
        interpolation = cv2.INTER_AREA
    else:
        interpolation = cv2.INTER_LANCZOS4
    y = cv2.resize(x, (s1, s0), interpolation=interpolation)
    return y


def d_resize(x, y):
    H, W, C = y.shape
    new_min = min(H, W)
    raw_min = min(x.shape[0], x.shape[1])
    if new_min < raw_min:
        interpolation = cv2.INTER_AREA
    else:
        interpolation = cv2.INTER_LANCZOS4
    y = cv2.resize(x, (W, H), interpolation=interpolation)
    return y


def resize_and_center_crop(image, target_width, target_height):
    if target_height == image.shape[0] and target_width == image.shape[1]:
        return image

    pil_image = Image.fromarray(image)
    original_width, original_height = pil_image.size
    scale_factor = max(target_width / original_width, target_height / original_height)
    resized_width = int(round(original_width * scale_factor))
    resized_height = int(round(original_height * scale_factor))
    resized_image = pil_image.resize((resized_width, resized_height), Image.LANCZOS)
    left = (resized_width - target_width) / 2
    top = (resized_height - target_height) / 2
    right = (resized_width + target_width) / 2
    bottom = (resized_height + target_height) / 2
    cropped_image = resized_image.crop((left, top, right, bottom))
    return np.array(cropped_image)


def resize_and_center_crop_pytorch(image, target_width, target_height):
    B, C, H, W = image.shape

    if H == target_height and W == target_width:
        return image

    scale_factor = max(target_width / W, target_height / H)
    resized_width = int(round(W * scale_factor))
    resized_height = int(round(H * scale_factor))

    resized = torch.nn.functional.interpolate(image, size=(resized_height, resized_width), mode='bilinear', align_corners=False)

    top = (resized_height - target_height) // 2
    left = (resized_width - target_width) // 2
    cropped = resized[:, :, top:top + target_height, left:left + target_width]

    return cropped


def resize_without_crop(image, target_width, target_height):
    if target_height == image.shape[0] and target_width == image.shape[1]:
        return image

    pil_image = Image.fromarray(image)
    resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
    return np.array(resized_image)


def just_crop(image, w, h):
    if h == image.shape[0] and w == image.shape[1]:
        return image

    original_height, original_width = image.shape[:2]
    k = min(original_height / h, original_width / w)
    new_width = int(round(w * k))
    new_height = int(round(h * k))
    x_start = (original_width - new_width) // 2
    y_start = (original_height - new_height) // 2
    cropped_image = image[y_start:y_start + new_height, x_start:x_start + new_width]
    return cropped_image


def write_to_json(data, file_path):
    temp_file_path = file_path + ".tmp"
    with open(temp_file_path, 'wt', encoding='utf-8') as temp_file:
        json.dump(data, temp_file, indent=4)
    os.replace(temp_file_path, file_path)
    return


def read_from_json(file_path):
    with open(file_path, 'rt', encoding='utf-8') as file:
        data = json.load(file)
    return data


def get_active_parameters(m):
    return {k: v for k, v in m.named_parameters() if v.requires_grad}


def cast_training_params(m, dtype=torch.float32):
    result = {}
    for n, param in m.named_parameters():
        if param.requires_grad:
            param.data = param.to(dtype)
            result[n] = param
    return result


def separate_lora_AB(parameters, B_patterns=None):
    parameters_normal = {}
    parameters_B = {}

    if B_patterns is None:
        B_patterns = ['.lora_B.', '__zero__']

    for k, v in parameters.items():
        if any(B_pattern in k for B_pattern in B_patterns):
            parameters_B[k] = v
        else:
            parameters_normal[k] = v

    return parameters_normal, parameters_B


def set_attr_recursive(obj, attr, value):
    attrs = attr.split(".")
    for name in attrs[:-1]:
        obj = getattr(obj, name)
    setattr(obj, attrs[-1], value)
    return


def print_tensor_list_size(tensors):
    total_size = 0
    total_elements = 0

    if isinstance(tensors, dict):
        tensors = tensors.values()

    for tensor in tensors:
        total_size += tensor.nelement() * tensor.element_size()
        total_elements += tensor.nelement()

    total_size_MB = total_size / (1024 ** 2)
    total_elements_B = total_elements / 1e9

    print(f"Total number of tensors: {len(tensors)}")
    print(f"Total size of tensors: {total_size_MB:.2f} MB")
    print(f"Total number of parameters: {total_elements_B:.3f} billion")
    return


@torch.no_grad()
def batch_mixture(a, b=None, probability_a=0.5, mask_a=None):
    batch_size = a.size(0)

    if b is None:
        b = torch.zeros_like(a)

    if mask_a is None:
        mask_a = torch.rand(batch_size) < probability_a

    mask_a = mask_a.to(a.device)
    mask_a = mask_a.reshape((batch_size,) + (1,) * (a.dim() - 1))
    result = torch.where(mask_a, a, b)
    return result


@torch.no_grad()
def zero_module(module):
    for p in module.parameters():
        p.detach().zero_()
    return module


@torch.no_grad()
def supress_lower_channels(m, k, alpha=0.01):
    data = m.weight.data.clone()

    assert int(data.shape[1]) >= k

    data[:, :k] = data[:, :k] * alpha
    m.weight.data = data.contiguous().clone()
    return m


def freeze_module(m):
    if not hasattr(m, '_forward_inside_frozen_module'):
        m._forward_inside_frozen_module = m.forward
    m.requires_grad_(False)
    m.forward = torch.no_grad()(m.forward)
    return m


def get_latest_safetensors(folder_path):
    safetensors_files = glob.glob(os.path.join(folder_path, '*.safetensors'))

    if not safetensors_files:
        raise ValueError('No file to resume!')

    latest_file = max(safetensors_files, key=os.path.getmtime)
    latest_file = os.path.abspath(os.path.realpath(latest_file))
    return latest_file


def generate_random_prompt_from_tags(tags_str, min_length=3, max_length=32):
    tags = tags_str.split(', ')
    tags = random.sample(tags, k=min(random.randint(min_length, max_length), len(tags)))
    prompt = ', '.join(tags)
    return prompt


def interpolate_numbers(a, b, n, round_to_int=False, gamma=1.0):
    numbers = a + (b - a) * (np.linspace(0, 1, n) ** gamma)
    if round_to_int:
        numbers = np.round(numbers).astype(int)
    return numbers.tolist()


def uniform_random_by_intervals(inclusive, exclusive, n, round_to_int=False):
    edges = np.linspace(0, 1, n + 1)
    points = np.random.uniform(edges[:-1], edges[1:])
    numbers = inclusive + (exclusive - inclusive) * points
    if round_to_int:
        numbers = np.round(numbers).astype(int)
    return numbers.tolist()


def soft_append_bcthw(history, current, overlap=0):
    if overlap <= 0:
        return torch.cat([history, current], dim=2)

    assert history.shape[2] >= overlap, f"History length ({history.shape[2]}) must be >= overlap ({overlap})"
    assert current.shape[2] >= overlap, f"Current length ({current.shape[2]}) must be >= overlap ({overlap})"

    weights = torch.linspace(1, 0, overlap, dtype=history.dtype, device=history.device).view(1, 1, -1, 1, 1)
    blended = weights * history[:, :, -overlap:] + (1 - weights) * current[:, :, :overlap]
    output = torch.cat([history[:, :, :-overlap], blended, current[:, :, overlap:]], dim=2)

    return output.to(history)


def save_bcthw_as_mp4(x, output_filename, fps=10, crf=0):
    b, c, t, h, w = x.shape

    per_row = b
    for p in [6, 5, 4, 3, 2]:
        if b % p == 0:
            per_row = p
            break

    os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
    x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
    x = x.detach().cpu().to(torch.uint8)
    x = einops.rearrange(x, '(m n) c t h w -> t (m h) (n w) c', n=per_row)
    torchvision.io.write_video(output_filename, x, fps=fps, video_codec='libx264', options={'crf': str(int(crf))})
    return x


def save_bcthw_as_png(x, output_filename):
    os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
    x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
    x = x.detach().cpu().to(torch.uint8)
    x = einops.rearrange(x, 'b c t h w -> c (b h) (t w)')
    torchvision.io.write_png(x, output_filename)
    return output_filename


def save_bchw_as_png(x, output_filename):
    os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
    x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
    x = x.detach().cpu().to(torch.uint8)
    x = einops.rearrange(x, 'b c h w -> c h (b w)')
    torchvision.io.write_png(x, output_filename)
    return output_filename


def add_tensors_with_padding(tensor1, tensor2):
    if tensor1.shape == tensor2.shape:
        return tensor1 + tensor2

    shape1 = tensor1.shape
    shape2 = tensor2.shape

    new_shape = tuple(max(s1, s2) for s1, s2 in zip(shape1, shape2))

    padded_tensor1 = torch.zeros(new_shape)
    padded_tensor2 = torch.zeros(new_shape)

    padded_tensor1[tuple(slice(0, s) for s in shape1)] = tensor1
    padded_tensor2[tuple(slice(0, s) for s in shape2)] = tensor2

    result = padded_tensor1 + padded_tensor2
    return result


def print_free_mem():
    torch.cuda.empty_cache()
    free_mem, total_mem = torch.cuda.mem_get_info(0)
    free_mem_mb = free_mem / (1024 ** 2)
    total_mem_mb = total_mem / (1024 ** 2)
    print(f"Free memory: {free_mem_mb:.2f} MB")
    print(f"Total memory: {total_mem_mb:.2f} MB")
    return


def print_gpu_parameters(device, state_dict, log_count=1):
    summary = {"device": device, "keys_count": len(state_dict)}

    logged_params = {}
    for i, (key, tensor) in enumerate(state_dict.items()):
        if i >= log_count:
            break
        logged_params[key] = tensor.flatten()[:3].tolist()

    summary["params"] = logged_params

    print(str(summary))
    return


def visualize_txt_as_img(width, height, text, font_path='font/DejaVuSans.ttf', size=18):
    from PIL import Image, ImageDraw, ImageFont

    txt = Image.new("RGB", (width, height), color="white")
    draw = ImageDraw.Draw(txt)
    font = ImageFont.truetype(font_path, size=size)

    if text == '':
        return np.array(txt)

    # Split text into lines that fit within the image width
    lines = []
    words = text.split()
    current_line = words[0]

    for word in words[1:]:
        line_with_word = f"{current_line} {word}"
        if draw.textbbox((0, 0), line_with_word, font=font)[2] <= width:
            current_line = line_with_word
        else:
            lines.append(current_line)
            current_line = word

    lines.append(current_line)

    # Draw the text line by line
    y = 0
    line_height = draw.textbbox((0, 0), "A", font=font)[3]

    for line in lines:
        if y + line_height > height:
            break  # stop drawing if the next line will be outside the image
        draw.text((0, y), line, fill="black", font=font)
        y += line_height

    return np.array(txt)


def blue_mark(x):
    x = x.copy()
    c = x[:, :, 2]
    b = cv2.blur(c, (9, 9))
    x[:, :, 2] = ((c - b) * 16.0 + b).clip(-1, 1)
    return x


def green_mark(x):
    x = x.copy()
    x[:, :, 2] = -1
    x[:, :, 0] = -1
    return x


def frame_mark(x):
    x = x.copy()
    x[:64] = -1
    x[-64:] = -1
    x[:, :8] = 1
    x[:, -8:] = 1
    return x


@torch.inference_mode()
def pytorch2numpy(imgs):
    results = []
    for x in imgs:
        y = x.movedim(0, -1)
        y = y * 127.5 + 127.5
        y = y.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
|
| 417 |
+
results.append(y)
|
| 418 |
+
return results
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
@torch.inference_mode()
|
| 422 |
+
def numpy2pytorch(imgs):
|
| 423 |
+
h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.5 - 1.0
|
| 424 |
+
h = h.movedim(-1, 1)
|
| 425 |
+
return h
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
@torch.no_grad()
|
| 429 |
+
def duplicate_prefix_to_suffix(x, count, zero_out=False):
|
| 430 |
+
if zero_out:
|
| 431 |
+
return torch.cat([x, torch.zeros_like(x[:count])], dim=0)
|
| 432 |
+
else:
|
| 433 |
+
return torch.cat([x, x[:count]], dim=0)
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
def weighted_mse(a, b, weight):
|
| 437 |
+
return torch.mean(weight.float() * (a.float() - b.float()) ** 2)
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
def clamped_linear_interpolation(x, x_min, y_min, x_max, y_max, sigma=1.0):
|
| 441 |
+
x = (x - x_min) / (x_max - x_min)
|
| 442 |
+
x = max(0.0, min(x, 1.0))
|
| 443 |
+
x = x ** sigma
|
| 444 |
+
return y_min + x * (y_max - y_min)
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
def expand_to_dims(x, target_dims):
|
| 448 |
+
return x.view(*x.shape, *([1] * max(0, target_dims - x.dim())))
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
def repeat_to_batch_size(tensor: torch.Tensor, batch_size: int):
|
| 452 |
+
if tensor is None:
|
| 453 |
+
return None
|
| 454 |
+
|
| 455 |
+
first_dim = tensor.shape[0]
|
| 456 |
+
|
| 457 |
+
if first_dim == batch_size:
|
| 458 |
+
return tensor
|
| 459 |
+
|
| 460 |
+
if batch_size % first_dim != 0:
|
| 461 |
+
raise ValueError(f"Cannot evenly repeat first dim {first_dim} to match batch_size {batch_size}.")
|
| 462 |
+
|
| 463 |
+
repeat_times = batch_size // first_dim
|
| 464 |
+
|
| 465 |
+
return tensor.repeat(repeat_times, *[1] * (tensor.dim() - 1))
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
def dim5(x):
|
| 469 |
+
return expand_to_dims(x, 5)
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
def dim4(x):
|
| 473 |
+
return expand_to_dims(x, 4)
|
| 474 |
+
|
| 475 |
+
|
| 476 |
+
def dim3(x):
|
| 477 |
+
return expand_to_dims(x, 3)
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
def crop_or_pad_yield_mask(x, length):
|
| 481 |
+
B, F, C = x.shape
|
| 482 |
+
device = x.device
|
| 483 |
+
dtype = x.dtype
|
| 484 |
+
|
| 485 |
+
if F < length:
|
| 486 |
+
y = torch.zeros((B, length, C), dtype=dtype, device=device)
|
| 487 |
+
mask = torch.zeros((B, length), dtype=torch.bool, device=device)
|
| 488 |
+
y[:, :F, :] = x
|
| 489 |
+
mask[:, :F] = True
|
| 490 |
+
return y, mask
|
| 491 |
+
|
| 492 |
+
return x[:, :length, :], torch.ones((B, length), dtype=torch.bool, device=device)
|
| 493 |
+
|
| 494 |
+
|
| 495 |
+
def extend_dim(x, dim, minimal_length, zero_pad=False):
|
| 496 |
+
original_length = int(x.shape[dim])
|
| 497 |
+
|
| 498 |
+
if original_length >= minimal_length:
|
| 499 |
+
return x
|
| 500 |
+
|
| 501 |
+
if zero_pad:
|
| 502 |
+
padding_shape = list(x.shape)
|
| 503 |
+
padding_shape[dim] = minimal_length - original_length
|
| 504 |
+
padding = torch.zeros(padding_shape, dtype=x.dtype, device=x.device)
|
| 505 |
+
else:
|
| 506 |
+
idx = (slice(None),) * dim + (slice(-1, None),) + (slice(None),) * (len(x.shape) - dim - 1)
|
| 507 |
+
last_element = x[idx]
|
| 508 |
+
padding = last_element.repeat_interleave(minimal_length - original_length, dim=dim)
|
| 509 |
+
|
| 510 |
+
return torch.cat([x, padding], dim=dim)
|
| 511 |
+
|
| 512 |
+
|
| 513 |
+
def lazy_positional_encoding(t, repeats=None):
|
| 514 |
+
if not isinstance(t, list):
|
| 515 |
+
t = [t]
|
| 516 |
+
|
| 517 |
+
from diffusers.models.embeddings import get_timestep_embedding
|
| 518 |
+
|
| 519 |
+
te = torch.tensor(t)
|
| 520 |
+
te = get_timestep_embedding(timesteps=te, embedding_dim=256, flip_sin_to_cos=True, downscale_freq_shift=0.0, scale=1.0)
|
| 521 |
+
|
| 522 |
+
if repeats is None:
|
| 523 |
+
return te
|
| 524 |
+
|
| 525 |
+
te = te[:, None, :].expand(-1, repeats, -1)
|
| 526 |
+
|
| 527 |
+
return te
|
| 528 |
+
|
| 529 |
+
|
| 530 |
+
def state_dict_offset_merge(A, B, C=None):
|
| 531 |
+
result = {}
|
| 532 |
+
keys = A.keys()
|
| 533 |
+
|
| 534 |
+
for key in keys:
|
| 535 |
+
A_value = A[key]
|
| 536 |
+
B_value = B[key].to(A_value)
|
| 537 |
+
|
| 538 |
+
if C is None:
|
| 539 |
+
result[key] = A_value + B_value
|
| 540 |
+
else:
|
| 541 |
+
C_value = C[key].to(A_value)
|
| 542 |
+
result[key] = A_value + B_value - C_value
|
| 543 |
+
|
| 544 |
+
return result
|
| 545 |
+
|
| 546 |
+
|
| 547 |
+
def state_dict_weighted_merge(state_dicts, weights):
|
| 548 |
+
if len(state_dicts) != len(weights):
|
| 549 |
+
raise ValueError("Number of state dictionaries must match number of weights")
|
| 550 |
+
|
| 551 |
+
if not state_dicts:
|
| 552 |
+
return {}
|
| 553 |
+
|
| 554 |
+
total_weight = sum(weights)
|
| 555 |
+
|
| 556 |
+
if total_weight == 0:
|
| 557 |
+
raise ValueError("Sum of weights cannot be zero")
|
| 558 |
+
|
| 559 |
+
normalized_weights = [w / total_weight for w in weights]
|
| 560 |
+
|
| 561 |
+
keys = state_dicts[0].keys()
|
| 562 |
+
result = {}
|
| 563 |
+
|
| 564 |
+
for key in keys:
|
| 565 |
+
result[key] = state_dicts[0][key] * normalized_weights[0]
|
| 566 |
+
|
| 567 |
+
for i in range(1, len(state_dicts)):
|
| 568 |
+
state_dict_value = state_dicts[i][key].to(result[key])
|
| 569 |
+
result[key] += state_dict_value * normalized_weights[i]
|
| 570 |
+
|
| 571 |
+
return result
|
| 572 |
+
|
| 573 |
+
|
| 574 |
+
def group_files_by_folder(all_files):
|
| 575 |
+
grouped_files = {}
|
| 576 |
+
|
| 577 |
+
for file in all_files:
|
| 578 |
+
folder_name = os.path.basename(os.path.dirname(file))
|
| 579 |
+
if folder_name not in grouped_files:
|
| 580 |
+
grouped_files[folder_name] = []
|
| 581 |
+
grouped_files[folder_name].append(file)
|
| 582 |
+
|
| 583 |
+
list_of_lists = list(grouped_files.values())
|
| 584 |
+
return list_of_lists
|
| 585 |
+
|
| 586 |
+
|
| 587 |
+
def generate_timestamp():
|
| 588 |
+
now = datetime.datetime.now()
|
| 589 |
+
timestamp = now.strftime('%y%m%d_%H%M%S')
|
| 590 |
+
milliseconds = f"{int(now.microsecond / 1000):03d}"
|
| 591 |
+
random_number = random.randint(0, 9999)
|
| 592 |
+
return f"{timestamp}_{milliseconds}_{random_number}"
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
def write_PIL_image_with_png_info(image, metadata, path):
|
| 596 |
+
from PIL.PngImagePlugin import PngInfo
|
| 597 |
+
|
| 598 |
+
png_info = PngInfo()
|
| 599 |
+
for key, value in metadata.items():
|
| 600 |
+
png_info.add_text(key, value)
|
| 601 |
+
|
| 602 |
+
image.save(path, "PNG", pnginfo=png_info)
|
| 603 |
+
return image
|
| 604 |
+
|
| 605 |
+
|
| 606 |
+
def torch_safe_save(content, path):
|
| 607 |
+
torch.save(content, path + '_tmp')
|
| 608 |
+
os.replace(path + '_tmp', path)
|
| 609 |
+
return path
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
def move_optimizer_to_device(optimizer, device):
|
| 613 |
+
for state in optimizer.state.values():
|
| 614 |
+
for k, v in state.items():
|
| 615 |
+
if isinstance(v, torch.Tensor):
|
| 616 |
+
state[k] = v.to(device)
|
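A minimal usage sketch of the chunk-blending helper soft_append_bcthw defined above (assuming the module is importable as demo_utils.utils; the tensor shapes are illustrative only, not taken from the repository):

import torch

from demo_utils.utils import soft_append_bcthw  # assumed import path

history = torch.randn(1, 3, 16, 64, 64)   # B, C, T, H, W
current = torch.randn(1, 3, 16, 64, 64)

# The last `overlap` frames of `history` are linearly cross-faded into the
# first `overlap` frames of `current`: 16 + 16 - 4 = 28 frames remain.
merged = soft_append_bcthw(history, current, overlap=4)
print(merged.shape)  # torch.Size([1, 3, 28, 64, 64])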
demo_utils/vae.py
ADDED
|
@@ -0,0 +1,390 @@
| 1 |
+
from typing import List
|
| 2 |
+
from einops import rearrange
|
| 3 |
+
import tensorrt as trt
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
|
| 7 |
+
from demo_utils.constant import ALL_INPUTS_NAMES, ZERO_VAE_CACHE
|
| 8 |
+
from wan.modules.vae import AttentionBlock, CausalConv3d, RMS_norm, Upsample
|
| 9 |
+
|
| 10 |
+
CACHE_T = 2
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ResidualBlock(nn.Module):
|
| 14 |
+
|
| 15 |
+
def __init__(self, in_dim, out_dim, dropout=0.0):
|
| 16 |
+
super().__init__()
|
| 17 |
+
self.in_dim = in_dim
|
| 18 |
+
self.out_dim = out_dim
|
| 19 |
+
|
| 20 |
+
# layers
|
| 21 |
+
self.residual = nn.Sequential(
|
| 22 |
+
RMS_norm(in_dim, images=False), nn.SiLU(),
|
| 23 |
+
CausalConv3d(in_dim, out_dim, 3, padding=1),
|
| 24 |
+
RMS_norm(out_dim, images=False), nn.SiLU(), nn.Dropout(dropout),
|
| 25 |
+
CausalConv3d(out_dim, out_dim, 3, padding=1))
|
| 26 |
+
self.shortcut = CausalConv3d(in_dim, out_dim, 1) \
|
| 27 |
+
if in_dim != out_dim else nn.Identity()
|
| 28 |
+
|
| 29 |
+
def forward(self, x, feat_cache_1, feat_cache_2):
|
| 30 |
+
h = self.shortcut(x)
|
| 31 |
+
feat_cache = feat_cache_1
|
| 32 |
+
out_feat_cache = []
|
| 33 |
+
for layer in self.residual:
|
| 34 |
+
if isinstance(layer, CausalConv3d):
|
| 35 |
+
cache_x = x[:, :, -CACHE_T:, :, :].clone()
|
| 36 |
+
if cache_x.shape[2] < 2 and feat_cache is not None:
|
| 37 |
+
# cache last frame of last two chunk
|
| 38 |
+
cache_x = torch.cat([
|
| 39 |
+
feat_cache[:, :, -1, :, :].unsqueeze(2).to(
|
| 40 |
+
cache_x.device), cache_x
|
| 41 |
+
],
|
| 42 |
+
dim=2)
|
| 43 |
+
x = layer(x, feat_cache)
|
| 44 |
+
out_feat_cache.append(cache_x)
|
| 45 |
+
feat_cache = feat_cache_2
|
| 46 |
+
else:
|
| 47 |
+
x = layer(x)
|
| 48 |
+
return x + h, *out_feat_cache
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class Resample(nn.Module):
|
| 52 |
+
|
| 53 |
+
def __init__(self, dim, mode):
|
| 54 |
+
assert mode in ('none', 'upsample2d', 'upsample3d')
|
| 55 |
+
super().__init__()
|
| 56 |
+
self.dim = dim
|
| 57 |
+
self.mode = mode
|
| 58 |
+
|
| 59 |
+
# layers
|
| 60 |
+
if mode == 'upsample2d':
|
| 61 |
+
self.resample = nn.Sequential(
|
| 62 |
+
Upsample(scale_factor=(2., 2.), mode='nearest'),
|
| 63 |
+
nn.Conv2d(dim, dim // 2, 3, padding=1))
|
| 64 |
+
elif mode == 'upsample3d':
|
| 65 |
+
self.resample = nn.Sequential(
|
| 66 |
+
Upsample(scale_factor=(2., 2.), mode='nearest'),
|
| 67 |
+
nn.Conv2d(dim, dim // 2, 3, padding=1))
|
| 68 |
+
self.time_conv = CausalConv3d(
|
| 69 |
+
dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
|
| 70 |
+
else:
|
| 71 |
+
self.resample = nn.Identity()
|
| 72 |
+
|
| 73 |
+
def forward(self, x, is_first_frame, feat_cache):
|
| 74 |
+
if self.mode == 'upsample3d':
|
| 75 |
+
b, c, t, h, w = x.size()
|
| 76 |
+
# x, out_feat_cache = torch.cond(
|
| 77 |
+
# is_first_frame,
|
| 78 |
+
# lambda: (torch.cat([torch.zeros_like(x), x], dim=2), feat_cache.clone()),
|
| 79 |
+
# lambda: self.temporal_conv(x, feat_cache),
|
| 80 |
+
# )
|
| 81 |
+
# x, out_feat_cache = torch.cond(
|
| 82 |
+
# is_first_frame,
|
| 83 |
+
# lambda: (torch.cat([torch.zeros_like(x), x], dim=2), feat_cache.clone()),
|
| 84 |
+
# lambda: self.temporal_conv(x, feat_cache),
|
| 85 |
+
# )
|
| 86 |
+
x, out_feat_cache = self.temporal_conv(x, is_first_frame, feat_cache)
|
| 87 |
+
out_feat_cache = torch.cond(
|
| 88 |
+
is_first_frame,
|
| 89 |
+
lambda: feat_cache.clone().contiguous(),
|
| 90 |
+
lambda: out_feat_cache.clone().contiguous(),
|
| 91 |
+
)
|
| 92 |
+
# if is_first_frame:
|
| 93 |
+
# x = torch.cat([torch.zeros_like(x), x], dim=2)
|
| 94 |
+
# out_feat_cache = feat_cache.clone()
|
| 95 |
+
# else:
|
| 96 |
+
# x, out_feat_cache = self.temporal_conv(x, feat_cache)
|
| 97 |
+
else:
|
| 98 |
+
out_feat_cache = None
|
| 99 |
+
t = x.shape[2]
|
| 100 |
+
x = rearrange(x, 'b c t h w -> (b t) c h w')
|
| 101 |
+
x = self.resample(x)
|
| 102 |
+
x = rearrange(x, '(b t) c h w -> b c t h w', t=t)
|
| 103 |
+
return x, out_feat_cache
|
| 104 |
+
|
| 105 |
+
def temporal_conv(self, x, is_first_frame, feat_cache):
|
| 106 |
+
b, c, t, h, w = x.size()
|
| 107 |
+
cache_x = x[:, :, -CACHE_T:, :, :].clone()
|
| 108 |
+
if cache_x.shape[2] < 2 and feat_cache is not None:
|
| 109 |
+
cache_x = torch.cat([
|
| 110 |
+
torch.zeros_like(cache_x),
|
| 111 |
+
cache_x
|
| 112 |
+
], dim=2)
|
| 113 |
+
x = torch.cond(
|
| 114 |
+
is_first_frame,
|
| 115 |
+
lambda: torch.cat([torch.zeros_like(x), x], dim=1).contiguous(),
|
| 116 |
+
lambda: self.time_conv(x, feat_cache).contiguous(),
|
| 117 |
+
)
|
| 118 |
+
# x = self.time_conv(x, feat_cache)
|
| 119 |
+
out_feat_cache = cache_x
|
| 120 |
+
|
| 121 |
+
x = x.reshape(b, 2, c, t, h, w)
|
| 122 |
+
x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
|
| 123 |
+
3)
|
| 124 |
+
x = x.reshape(b, c, t * 2, h, w)
|
| 125 |
+
return x.contiguous(), out_feat_cache.contiguous()
|
| 126 |
+
|
| 127 |
+
def init_weight(self, conv):
|
| 128 |
+
conv_weight = conv.weight
|
| 129 |
+
nn.init.zeros_(conv_weight)
|
| 130 |
+
c1, c2, t, h, w = conv_weight.size()
|
| 131 |
+
one_matrix = torch.eye(c1, c2)
|
| 132 |
+
init_matrix = one_matrix
|
| 133 |
+
nn.init.zeros_(conv_weight)
|
| 134 |
+
# conv_weight.data[:,:,-1,1,1] = init_matrix * 0.5
|
| 135 |
+
conv_weight.data[:, :, 1, 0, 0] = init_matrix # * 0.5
|
| 136 |
+
conv.weight.data.copy_(conv_weight)
|
| 137 |
+
nn.init.zeros_(conv.bias.data)
|
| 138 |
+
|
| 139 |
+
def init_weight2(self, conv):
|
| 140 |
+
conv_weight = conv.weight.data
|
| 141 |
+
nn.init.zeros_(conv_weight)
|
| 142 |
+
c1, c2, t, h, w = conv_weight.size()
|
| 143 |
+
init_matrix = torch.eye(c1 // 2, c2)
|
| 144 |
+
# init_matrix = repeat(init_matrix, 'o ... -> (o 2) ...').permute(1,0,2).contiguous().reshape(c1,c2)
|
| 145 |
+
conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
|
| 146 |
+
conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
|
| 147 |
+
conv.weight.data.copy_(conv_weight)
|
| 148 |
+
nn.init.zeros_(conv.bias.data)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
class VAEDecoderWrapperSingle(nn.Module):
|
| 152 |
+
def __init__(self):
|
| 153 |
+
super().__init__()
|
| 154 |
+
self.decoder = VAEDecoder3d()
|
| 155 |
+
mean = [
|
| 156 |
+
-0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
|
| 157 |
+
0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
|
| 158 |
+
]
|
| 159 |
+
std = [
|
| 160 |
+
2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
|
| 161 |
+
3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
|
| 162 |
+
]
|
| 163 |
+
self.mean = torch.tensor(mean, dtype=torch.float32)
|
| 164 |
+
self.std = torch.tensor(std, dtype=torch.float32)
|
| 165 |
+
self.z_dim = 16
|
| 166 |
+
self.conv2 = CausalConv3d(self.z_dim, self.z_dim, 1)
|
| 167 |
+
|
| 168 |
+
def forward(
|
| 169 |
+
self,
|
| 170 |
+
z: torch.Tensor,
|
| 171 |
+
is_first_frame: torch.Tensor,
|
| 172 |
+
*feat_cache: List[torch.Tensor]
|
| 173 |
+
):
|
| 174 |
+
# from [batch_size, num_frames, num_channels, height, width]
|
| 175 |
+
# to [batch_size, num_channels, num_frames, height, width]
|
| 176 |
+
z = z.permute(0, 2, 1, 3, 4)
|
| 177 |
+
assert z.shape[2] == 1
|
| 178 |
+
feat_cache = list(feat_cache)
|
| 179 |
+
is_first_frame = is_first_frame.bool()
|
| 180 |
+
|
| 181 |
+
device, dtype = z.device, z.dtype
|
| 182 |
+
scale = [self.mean.to(device=device, dtype=dtype),
|
| 183 |
+
1.0 / self.std.to(device=device, dtype=dtype)]
|
| 184 |
+
|
| 185 |
+
if isinstance(scale[0], torch.Tensor):
|
| 186 |
+
z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
|
| 187 |
+
1, self.z_dim, 1, 1, 1)
|
| 188 |
+
else:
|
| 189 |
+
z = z / scale[1] + scale[0]
|
| 190 |
+
x = self.conv2(z)
|
| 191 |
+
out, feat_cache = self.decoder(x, is_first_frame, feat_cache=feat_cache)
|
| 192 |
+
out = out.clamp_(-1, 1)
|
| 193 |
+
# from [batch_size, num_channels, num_frames, height, width]
|
| 194 |
+
# to [batch_size, num_frames, num_channels, height, width]
|
| 195 |
+
out = out.permute(0, 2, 1, 3, 4)
|
| 196 |
+
return out, feat_cache
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
class VAEDecoder3d(nn.Module):
|
| 200 |
+
def __init__(self,
|
| 201 |
+
dim=96,
|
| 202 |
+
z_dim=16,
|
| 203 |
+
dim_mult=[1, 2, 4, 4],
|
| 204 |
+
num_res_blocks=2,
|
| 205 |
+
attn_scales=[],
|
| 206 |
+
temperal_upsample=[True, True, False],
|
| 207 |
+
dropout=0.0):
|
| 208 |
+
super().__init__()
|
| 209 |
+
self.dim = dim
|
| 210 |
+
self.z_dim = z_dim
|
| 211 |
+
self.dim_mult = dim_mult
|
| 212 |
+
self.num_res_blocks = num_res_blocks
|
| 213 |
+
self.attn_scales = attn_scales
|
| 214 |
+
self.temperal_upsample = temperal_upsample
|
| 215 |
+
self.cache_t = 2
|
| 216 |
+
self.decoder_conv_num = 32
|
| 217 |
+
|
| 218 |
+
# dimensions
|
| 219 |
+
dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
|
| 220 |
+
scale = 1.0 / 2**(len(dim_mult) - 2)
|
| 221 |
+
|
| 222 |
+
# init block
|
| 223 |
+
self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
|
| 224 |
+
|
| 225 |
+
# middle blocks
|
| 226 |
+
self.middle = nn.Sequential(
|
| 227 |
+
ResidualBlock(dims[0], dims[0], dropout), AttentionBlock(dims[0]),
|
| 228 |
+
ResidualBlock(dims[0], dims[0], dropout))
|
| 229 |
+
|
| 230 |
+
# upsample blocks
|
| 231 |
+
upsamples = []
|
| 232 |
+
for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
|
| 233 |
+
# residual (+attention) blocks
|
| 234 |
+
if i == 1 or i == 2 or i == 3:
|
| 235 |
+
in_dim = in_dim // 2
|
| 236 |
+
for _ in range(num_res_blocks + 1):
|
| 237 |
+
upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
|
| 238 |
+
if scale in attn_scales:
|
| 239 |
+
upsamples.append(AttentionBlock(out_dim))
|
| 240 |
+
in_dim = out_dim
|
| 241 |
+
|
| 242 |
+
# upsample block
|
| 243 |
+
if i != len(dim_mult) - 1:
|
| 244 |
+
mode = 'upsample3d' if temperal_upsample[i] else 'upsample2d'
|
| 245 |
+
upsamples.append(Resample(out_dim, mode=mode))
|
| 246 |
+
scale *= 2.0
|
| 247 |
+
self.upsamples = nn.Sequential(*upsamples)
|
| 248 |
+
|
| 249 |
+
# output blocks
|
| 250 |
+
self.head = nn.Sequential(
|
| 251 |
+
RMS_norm(out_dim, images=False), nn.SiLU(),
|
| 252 |
+
CausalConv3d(out_dim, 3, 3, padding=1))
|
| 253 |
+
|
| 254 |
+
def forward(
|
| 255 |
+
self,
|
| 256 |
+
x: torch.Tensor,
|
| 257 |
+
is_first_frame: torch.Tensor,
|
| 258 |
+
feat_cache: List[torch.Tensor]
|
| 259 |
+
):
|
| 260 |
+
idx = 0
|
| 261 |
+
out_feat_cache = []
|
| 262 |
+
|
| 263 |
+
# conv1
|
| 264 |
+
cache_x = x[:, :, -self.cache_t:, :, :].clone()
|
| 265 |
+
if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
|
| 266 |
+
# cache last frame of last two chunk
|
| 267 |
+
cache_x = torch.cat([
|
| 268 |
+
feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
|
| 269 |
+
cache_x.device), cache_x
|
| 270 |
+
],
|
| 271 |
+
dim=2)
|
| 272 |
+
x = self.conv1(x, feat_cache[idx])
|
| 273 |
+
out_feat_cache.append(cache_x)
|
| 274 |
+
idx += 1
|
| 275 |
+
|
| 276 |
+
# middle
|
| 277 |
+
for layer in self.middle:
|
| 278 |
+
if isinstance(layer, ResidualBlock) and feat_cache is not None:
|
| 279 |
+
x, out_feat_cache_1, out_feat_cache_2 = layer(x, feat_cache[idx], feat_cache[idx + 1])
|
| 280 |
+
idx += 2
|
| 281 |
+
out_feat_cache.append(out_feat_cache_1)
|
| 282 |
+
out_feat_cache.append(out_feat_cache_2)
|
| 283 |
+
else:
|
| 284 |
+
x = layer(x)
|
| 285 |
+
|
| 286 |
+
# upsamples
|
| 287 |
+
for layer in self.upsamples:
|
| 288 |
+
if isinstance(layer, Resample):
|
| 289 |
+
x, cache_x = layer(x, is_first_frame, feat_cache[idx])
|
| 290 |
+
if cache_x is not None:
|
| 291 |
+
out_feat_cache.append(cache_x)
|
| 292 |
+
idx += 1
|
| 293 |
+
else:
|
| 294 |
+
x, out_feat_cache_1, out_feat_cache_2 = layer(x, feat_cache[idx], feat_cache[idx + 1])
|
| 295 |
+
idx += 2
|
| 296 |
+
out_feat_cache.append(out_feat_cache_1)
|
| 297 |
+
out_feat_cache.append(out_feat_cache_2)
|
| 298 |
+
|
| 299 |
+
# head
|
| 300 |
+
for layer in self.head:
|
| 301 |
+
if isinstance(layer, CausalConv3d) and feat_cache is not None:
|
| 302 |
+
cache_x = x[:, :, -self.cache_t:, :, :].clone()
|
| 303 |
+
if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
|
| 304 |
+
# cache last frame of last two chunk
|
| 305 |
+
cache_x = torch.cat([
|
| 306 |
+
feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
|
| 307 |
+
cache_x.device), cache_x
|
| 308 |
+
],
|
| 309 |
+
dim=2)
|
| 310 |
+
x = layer(x, feat_cache[idx])
|
| 311 |
+
out_feat_cache.append(cache_x)
|
| 312 |
+
idx += 1
|
| 313 |
+
else:
|
| 314 |
+
x = layer(x)
|
| 315 |
+
return x, out_feat_cache
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
class VAETRTWrapper():
|
| 319 |
+
def __init__(self):
|
| 320 |
+
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
|
| 321 |
+
with open("checkpoints/vae_decoder_int8.trt", "rb") as f, trt.Runtime(TRT_LOGGER) as rt:
|
| 322 |
+
self.engine: trt.ICudaEngine = rt.deserialize_cuda_engine(f.read())
|
| 323 |
+
|
| 324 |
+
self.context: trt.IExecutionContext = self.engine.create_execution_context()
|
| 325 |
+
self.stream = torch.cuda.current_stream().cuda_stream
|
| 326 |
+
|
| 327 |
+
# ──────────────────────────────
|
| 328 |
+
# 2️⃣ Feed the engine with tensors
|
| 329 |
+
# (name-based API in TRT ≥10)
|
| 330 |
+
# ──────────────────────────────
|
| 331 |
+
self.dtype_map = {
|
| 332 |
+
trt.float32: torch.float32,
|
| 333 |
+
trt.float16: torch.float16,
|
| 334 |
+
trt.int8: torch.int8,
|
| 335 |
+
trt.int32: torch.int32,
|
| 336 |
+
}
|
| 337 |
+
test_input = torch.zeros(1, 16, 1, 60, 104).cuda().half()
|
| 338 |
+
is_first_frame = torch.tensor(1.0).cuda().half()
|
| 339 |
+
test_cache_inputs = [c.cuda().half() for c in ZERO_VAE_CACHE]
|
| 340 |
+
test_inputs = [test_input, is_first_frame] + test_cache_inputs
|
| 341 |
+
|
| 342 |
+
# keep references so buffers stay alive
|
| 343 |
+
self.device_buffers, self.outputs = {}, []
|
| 344 |
+
|
| 345 |
+
# ---- inputs ----
|
| 346 |
+
for i, name in enumerate(ALL_INPUTS_NAMES):
|
| 347 |
+
tensor, scale = test_inputs[i], 1 / 127
|
| 348 |
+
tensor = self.quantize_if_needed(tensor, self.engine.get_tensor_dtype(name), scale)
|
| 349 |
+
|
| 350 |
+
# dynamic shapes
|
| 351 |
+
if -1 in self.engine.get_tensor_shape(name):
|
| 352 |
+
# new API
|
| 353 |
+
self.context.set_input_shape(name, tuple(tensor.shape))
|
| 354 |
+
|
| 355 |
+
# replaces bindings[]
|
| 356 |
+
self.context.set_tensor_address(name, int(tensor.data_ptr()))
|
| 357 |
+
self.device_buffers[name] = tensor # keep pointer alive
|
| 358 |
+
|
| 359 |
+
# ---- (after all input shapes are known) infer output shapes ----
|
| 360 |
+
# propagates shapes
|
| 361 |
+
self.context.infer_shapes()
|
| 362 |
+
|
| 363 |
+
for i in range(self.engine.num_io_tensors):
|
| 364 |
+
name = self.engine.get_tensor_name(i)
|
| 365 |
+
# replaces binding_is_input
|
| 366 |
+
if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
|
| 367 |
+
shape = tuple(self.context.get_tensor_shape(name))
|
| 368 |
+
dtype = self.dtype_map[self.engine.get_tensor_dtype(name)]
|
| 369 |
+
out = torch.empty(shape, dtype=dtype, device="cuda").contiguous()
|
| 370 |
+
|
| 371 |
+
self.context.set_tensor_address(name, int(out.data_ptr()))
|
| 372 |
+
self.outputs.append(out)
|
| 373 |
+
self.device_buffers[name] = out
|
| 374 |
+
|
| 375 |
+
# helper to quant-convert on the fly
|
| 376 |
+
def quantize_if_needed(self, t, expected_dtype, scale):
|
| 377 |
+
if expected_dtype == trt.int8 and t.dtype != torch.int8:
|
| 378 |
+
t = torch.clamp((t / scale).round(), -128, 127).to(torch.int8).contiguous()
|
| 379 |
+
return t # keep pointer alive
|
| 380 |
+
|
| 381 |
+
def forward(self, *test_inputs):
|
| 382 |
+
for i, name in enumerate(ALL_INPUTS_NAMES):
|
| 383 |
+
tensor, scale = test_inputs[i], 1 / 127
|
| 384 |
+
tensor = self.quantize_if_needed(tensor, self.engine.get_tensor_dtype(name), scale)
|
| 385 |
+
self.context.set_tensor_address(name, int(tensor.data_ptr()))
|
| 386 |
+
self.device_buffers[name] = tensor
|
| 387 |
+
|
| 388 |
+
self.context.execute_async_v3(stream_handle=self.stream)
|
| 389 |
+
torch.cuda.current_stream().synchronize()
|
| 390 |
+
return self.outputs
|
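A hedged sketch of how VAEDecoderWrapperSingle can be driven frame by frame, threading feat_cache through successive calls (the latent shape and the use of ZERO_VAE_CACHE as the initial cache are assumptions inferred from the code above, not a documented API):

import torch

from demo_utils.constant import ZERO_VAE_CACHE
from demo_utils.vae import VAEDecoderWrapperSingle

decoder = VAEDecoderWrapperSingle().half().cuda().eval()  # weight loading omitted in this sketch
feat_cache = [c.cuda().half() for c in ZERO_VAE_CACHE]

latents = torch.randn(1, 4, 16, 60, 104).cuda().half()  # B, T, C_latent, H, W
frames = []
for t in range(latents.shape[1]):
    is_first = torch.tensor(1.0 if t == 0 else 0.0).cuda().half()
    # each call decodes exactly one latent frame and returns the updated cache
    rgb, feat_cache = decoder(latents[:, t:t + 1], is_first, *feat_cache)
    frames.append(rgb)
video = torch.cat(frames, dim=1)  # B, T, 3, H_out, W_out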
demo_utils/vae_block3.py
ADDED
|
@@ -0,0 +1,291 @@
| 1 |
+
from typing import List
|
| 2 |
+
from einops import rearrange
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
|
| 6 |
+
from wan.modules.vae import AttentionBlock, CausalConv3d, RMS_norm, ResidualBlock, Upsample
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Resample(nn.Module):
|
| 10 |
+
|
| 11 |
+
def __init__(self, dim, mode):
|
| 12 |
+
assert mode in ('none', 'upsample2d', 'upsample3d', 'downsample2d',
|
| 13 |
+
'downsample3d')
|
| 14 |
+
super().__init__()
|
| 15 |
+
self.dim = dim
|
| 16 |
+
self.mode = mode
|
| 17 |
+
self.cache_t = 2
|
| 18 |
+
|
| 19 |
+
# layers
|
| 20 |
+
if mode == 'upsample2d':
|
| 21 |
+
self.resample = nn.Sequential(
|
| 22 |
+
Upsample(scale_factor=(2., 2.), mode='nearest'),
|
| 23 |
+
nn.Conv2d(dim, dim // 2, 3, padding=1))
|
| 24 |
+
elif mode == 'upsample3d':
|
| 25 |
+
self.resample = nn.Sequential(
|
| 26 |
+
Upsample(scale_factor=(2., 2.), mode='nearest'),
|
| 27 |
+
nn.Conv2d(dim, dim // 2, 3, padding=1))
|
| 28 |
+
self.time_conv = CausalConv3d(
|
| 29 |
+
dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
|
| 30 |
+
|
| 31 |
+
elif mode == 'downsample2d':
|
| 32 |
+
self.resample = nn.Sequential(
|
| 33 |
+
nn.ZeroPad2d((0, 1, 0, 1)),
|
| 34 |
+
nn.Conv2d(dim, dim, 3, stride=(2, 2)))
|
| 35 |
+
elif mode == 'downsample3d':
|
| 36 |
+
self.resample = nn.Sequential(
|
| 37 |
+
nn.ZeroPad2d((0, 1, 0, 1)),
|
| 38 |
+
nn.Conv2d(dim, dim, 3, stride=(2, 2)))
|
| 39 |
+
self.time_conv = CausalConv3d(
|
| 40 |
+
dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
|
| 41 |
+
|
| 42 |
+
else:
|
| 43 |
+
self.resample = nn.Identity()
|
| 44 |
+
|
| 45 |
+
def forward(self, x, feat_cache=None, feat_idx=[0]):
|
| 46 |
+
b, c, t, h, w = x.size()
|
| 47 |
+
if self.mode == 'upsample3d':
|
| 48 |
+
if feat_cache is not None:
|
| 49 |
+
idx = feat_idx[0]
|
| 50 |
+
if feat_cache[idx] is None:
|
| 51 |
+
feat_cache[idx] = 'Rep'
|
| 52 |
+
feat_idx[0] += 1
|
| 53 |
+
else:
|
| 54 |
+
|
| 55 |
+
cache_x = x[:, :, -self.cache_t:, :, :].clone()
|
| 56 |
+
if cache_x.shape[2] < 2 and feat_cache[
|
| 57 |
+
idx] is not None and feat_cache[idx] != 'Rep':
|
| 58 |
+
# cache last frame of last two chunk
|
| 59 |
+
cache_x = torch.cat([
|
| 60 |
+
feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
|
| 61 |
+
cache_x.device), cache_x
|
| 62 |
+
],
|
| 63 |
+
dim=2)
|
| 64 |
+
if cache_x.shape[2] < 2 and feat_cache[
|
| 65 |
+
idx] is not None and feat_cache[idx] == 'Rep':
|
| 66 |
+
cache_x = torch.cat([
|
| 67 |
+
torch.zeros_like(cache_x).to(cache_x.device),
|
| 68 |
+
cache_x
|
| 69 |
+
],
|
| 70 |
+
dim=2)
|
| 71 |
+
if feat_cache[idx] == 'Rep':
|
| 72 |
+
x = self.time_conv(x)
|
| 73 |
+
else:
|
| 74 |
+
x = self.time_conv(x, feat_cache[idx])
|
| 75 |
+
feat_cache[idx] = cache_x
|
| 76 |
+
feat_idx[0] += 1
|
| 77 |
+
|
| 78 |
+
x = x.reshape(b, 2, c, t, h, w)
|
| 79 |
+
x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
|
| 80 |
+
3)
|
| 81 |
+
x = x.reshape(b, c, t * 2, h, w)
|
| 82 |
+
t = x.shape[2]
|
| 83 |
+
x = rearrange(x, 'b c t h w -> (b t) c h w')
|
| 84 |
+
x = self.resample(x)
|
| 85 |
+
x = rearrange(x, '(b t) c h w -> b c t h w', t=t)
|
| 86 |
+
|
| 87 |
+
if self.mode == 'downsample3d':
|
| 88 |
+
if feat_cache is not None:
|
| 89 |
+
idx = feat_idx[0]
|
| 90 |
+
if feat_cache[idx] is None:
|
| 91 |
+
feat_cache[idx] = x.clone()
|
| 92 |
+
feat_idx[0] += 1
|
| 93 |
+
else:
|
| 94 |
+
|
| 95 |
+
cache_x = x[:, :, -1:, :, :].clone()
|
| 96 |
+
# if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx]!='Rep':
|
| 97 |
+
# # cache last frame of last two chunk
|
| 98 |
+
# cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
|
| 99 |
+
|
| 100 |
+
x = self.time_conv(
|
| 101 |
+
torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
|
| 102 |
+
feat_cache[idx] = cache_x
|
| 103 |
+
feat_idx[0] += 1
|
| 104 |
+
return x
|
| 105 |
+
|
| 106 |
+
def init_weight(self, conv):
|
| 107 |
+
conv_weight = conv.weight
|
| 108 |
+
nn.init.zeros_(conv_weight)
|
| 109 |
+
c1, c2, t, h, w = conv_weight.size()
|
| 110 |
+
one_matrix = torch.eye(c1, c2)
|
| 111 |
+
init_matrix = one_matrix
|
| 112 |
+
nn.init.zeros_(conv_weight)
|
| 113 |
+
# conv_weight.data[:,:,-1,1,1] = init_matrix * 0.5
|
| 114 |
+
conv_weight.data[:, :, 1, 0, 0] = init_matrix # * 0.5
|
| 115 |
+
conv.weight.data.copy_(conv_weight)
|
| 116 |
+
nn.init.zeros_(conv.bias.data)
|
| 117 |
+
|
| 118 |
+
def init_weight2(self, conv):
|
| 119 |
+
conv_weight = conv.weight.data
|
| 120 |
+
nn.init.zeros_(conv_weight)
|
| 121 |
+
c1, c2, t, h, w = conv_weight.size()
|
| 122 |
+
init_matrix = torch.eye(c1 // 2, c2)
|
| 123 |
+
# init_matrix = repeat(init_matrix, 'o ... -> (o 2) ...').permute(1,0,2).contiguous().reshape(c1,c2)
|
| 124 |
+
conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
|
| 125 |
+
conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
|
| 126 |
+
conv.weight.data.copy_(conv_weight)
|
| 127 |
+
nn.init.zeros_(conv.bias.data)
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class VAEDecoderWrapper(nn.Module):
|
| 131 |
+
def __init__(self):
|
| 132 |
+
super().__init__()
|
| 133 |
+
self.decoder = VAEDecoder3d()
|
| 134 |
+
mean = [
|
| 135 |
+
-0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
|
| 136 |
+
0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
|
| 137 |
+
]
|
| 138 |
+
std = [
|
| 139 |
+
2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
|
| 140 |
+
3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
|
| 141 |
+
]
|
| 142 |
+
self.mean = torch.tensor(mean, dtype=torch.float32)
|
| 143 |
+
self.std = torch.tensor(std, dtype=torch.float32)
|
| 144 |
+
self.z_dim = 16
|
| 145 |
+
self.conv2 = CausalConv3d(self.z_dim, self.z_dim, 1)
|
| 146 |
+
|
| 147 |
+
def forward(
|
| 148 |
+
self,
|
| 149 |
+
z: torch.Tensor,
|
| 150 |
+
*feat_cache: List[torch.Tensor]
|
| 151 |
+
):
|
| 152 |
+
# from [batch_size, num_frames, num_channels, height, width]
|
| 153 |
+
# to [batch_size, num_channels, num_frames, height, width]
|
| 154 |
+
z = z.permute(0, 2, 1, 3, 4)
|
| 155 |
+
feat_cache = list(feat_cache)
|
| 156 |
+
# print("Length of feat_cache: ", len(feat_cache))
|
| 157 |
+
|
| 158 |
+
device, dtype = z.device, z.dtype
|
| 159 |
+
scale = [self.mean.to(device=device, dtype=dtype),
|
| 160 |
+
1.0 / self.std.to(device=device, dtype=dtype)]
|
| 161 |
+
|
| 162 |
+
if isinstance(scale[0], torch.Tensor):
|
| 163 |
+
z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
|
| 164 |
+
1, self.z_dim, 1, 1, 1)
|
| 165 |
+
else:
|
| 166 |
+
z = z / scale[1] + scale[0]
|
| 167 |
+
iter_ = z.shape[2]
|
| 168 |
+
x = self.conv2(z)
|
| 169 |
+
for i in range(iter_):
|
| 170 |
+
if i == 0:
|
| 171 |
+
out, feat_cache = self.decoder(
|
| 172 |
+
x[:, :, i:i + 1, :, :],
|
| 173 |
+
feat_cache=feat_cache)
|
| 174 |
+
else:
|
| 175 |
+
out_, feat_cache = self.decoder(
|
| 176 |
+
x[:, :, i:i + 1, :, :],
|
| 177 |
+
feat_cache=feat_cache)
|
| 178 |
+
out = torch.cat([out, out_], 2)
|
| 179 |
+
|
| 180 |
+
out = out.float().clamp_(-1, 1)
|
| 181 |
+
# from [batch_size, num_channels, num_frames, height, width]
|
| 182 |
+
# to [batch_size, num_frames, num_channels, height, width]
|
| 183 |
+
out = out.permute(0, 2, 1, 3, 4)
|
| 184 |
+
return out, feat_cache
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
class VAEDecoder3d(nn.Module):
|
| 188 |
+
def __init__(self,
|
| 189 |
+
dim=96,
|
| 190 |
+
z_dim=16,
|
| 191 |
+
dim_mult=[1, 2, 4, 4],
|
| 192 |
+
num_res_blocks=2,
|
| 193 |
+
attn_scales=[],
|
| 194 |
+
temperal_upsample=[True, True, False],
|
| 195 |
+
dropout=0.0):
|
| 196 |
+
super().__init__()
|
| 197 |
+
self.dim = dim
|
| 198 |
+
self.z_dim = z_dim
|
| 199 |
+
self.dim_mult = dim_mult
|
| 200 |
+
self.num_res_blocks = num_res_blocks
|
| 201 |
+
self.attn_scales = attn_scales
|
| 202 |
+
self.temperal_upsample = temperal_upsample
|
| 203 |
+
self.cache_t = 2
|
| 204 |
+
self.decoder_conv_num = 32
|
| 205 |
+
|
| 206 |
+
# dimensions
|
| 207 |
+
dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
|
| 208 |
+
scale = 1.0 / 2**(len(dim_mult) - 2)
|
| 209 |
+
|
| 210 |
+
# init block
|
| 211 |
+
self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
|
| 212 |
+
|
| 213 |
+
# middle blocks
|
| 214 |
+
self.middle = nn.Sequential(
|
| 215 |
+
ResidualBlock(dims[0], dims[0], dropout), AttentionBlock(dims[0]),
|
| 216 |
+
ResidualBlock(dims[0], dims[0], dropout))
|
| 217 |
+
|
| 218 |
+
# upsample blocks
|
| 219 |
+
upsamples = []
|
| 220 |
+
for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
|
| 221 |
+
# residual (+attention) blocks
|
| 222 |
+
if i == 1 or i == 2 or i == 3:
|
| 223 |
+
in_dim = in_dim // 2
|
| 224 |
+
for _ in range(num_res_blocks + 1):
|
| 225 |
+
upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
|
| 226 |
+
if scale in attn_scales:
|
| 227 |
+
upsamples.append(AttentionBlock(out_dim))
|
| 228 |
+
in_dim = out_dim
|
| 229 |
+
|
| 230 |
+
# upsample block
|
| 231 |
+
if i != len(dim_mult) - 1:
|
| 232 |
+
mode = 'upsample3d' if temperal_upsample[i] else 'upsample2d'
|
| 233 |
+
upsamples.append(Resample(out_dim, mode=mode))
|
| 234 |
+
scale *= 2.0
|
| 235 |
+
self.upsamples = nn.Sequential(*upsamples)
|
| 236 |
+
|
| 237 |
+
# output blocks
|
| 238 |
+
self.head = nn.Sequential(
|
| 239 |
+
RMS_norm(out_dim, images=False), nn.SiLU(),
|
| 240 |
+
CausalConv3d(out_dim, 3, 3, padding=1))
|
| 241 |
+
|
| 242 |
+
def forward(
|
| 243 |
+
self,
|
| 244 |
+
x: torch.Tensor,
|
| 245 |
+
feat_cache: List[torch.Tensor]
|
| 246 |
+
):
|
| 247 |
+
feat_idx = [0]
|
| 248 |
+
|
| 249 |
+
# conv1
|
| 250 |
+
idx = feat_idx[0]
|
| 251 |
+
cache_x = x[:, :, -self.cache_t:, :, :].clone()
|
| 252 |
+
if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
|
| 253 |
+
# cache last frame of last two chunk
|
| 254 |
+
cache_x = torch.cat([
|
| 255 |
+
feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
|
| 256 |
+
cache_x.device), cache_x
|
| 257 |
+
],
|
| 258 |
+
dim=2)
|
| 259 |
+
x = self.conv1(x, feat_cache[idx])
|
| 260 |
+
feat_cache[idx] = cache_x
|
| 261 |
+
feat_idx[0] += 1
|
| 262 |
+
|
| 263 |
+
# middle
|
| 264 |
+
for layer in self.middle:
|
| 265 |
+
if isinstance(layer, ResidualBlock) and feat_cache is not None:
|
| 266 |
+
x = layer(x, feat_cache, feat_idx)
|
| 267 |
+
else:
|
| 268 |
+
x = layer(x)
|
| 269 |
+
|
| 270 |
+
# upsamples
|
| 271 |
+
for layer in self.upsamples:
|
| 272 |
+
x = layer(x, feat_cache, feat_idx)
|
| 273 |
+
|
| 274 |
+
# head
|
| 275 |
+
for layer in self.head:
|
| 276 |
+
if isinstance(layer, CausalConv3d) and feat_cache is not None:
|
| 277 |
+
idx = feat_idx[0]
|
| 278 |
+
cache_x = x[:, :, -self.cache_t:, :, :].clone()
|
| 279 |
+
if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
|
| 280 |
+
# cache last frame of last two chunk
|
| 281 |
+
cache_x = torch.cat([
|
| 282 |
+
feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
|
| 283 |
+
cache_x.device), cache_x
|
| 284 |
+
],
|
| 285 |
+
dim=2)
|
| 286 |
+
x = layer(x, feat_cache[idx])
|
| 287 |
+
feat_cache[idx] = cache_x
|
| 288 |
+
feat_idx[0] += 1
|
| 289 |
+
else:
|
| 290 |
+
x = layer(x)
|
| 291 |
+
return x, feat_cache
|
demo_utils/vae_torch2trt.py
ADDED
|
@@ -0,0 +1,308 @@
| 1 |
+
# ---- INT8 (optional) ----
|
| 2 |
+
from demo_utils.vae import (
|
| 3 |
+
VAEDecoderWrapperSingle, # main nn.Module
|
| 4 |
+
ZERO_VAE_CACHE # helper constants shipped with your code base
|
| 5 |
+
)
|
| 6 |
+
import pycuda.driver as cuda # ← add
|
| 7 |
+
import pycuda.autoinit # noqa
|
| 8 |
+
|
| 9 |
+
import sys
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
import torch
|
| 13 |
+
import tensorrt as trt
|
| 14 |
+
|
| 15 |
+
from utils.dataset import ShardingLMDBDataset
|
| 16 |
+
|
| 17 |
+
data_path = "/mnt/localssd/wanx_14B_shift-3.0_cfg-5.0_lmdb_oneshard"
|
| 18 |
+
dataset = ShardingLMDBDataset(data_path, max_pair=int(1e8))
|
| 19 |
+
dataloader = torch.utils.data.DataLoader(
|
| 20 |
+
dataset,
|
| 21 |
+
batch_size=1,
|
| 22 |
+
num_workers=0
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
# ─────────────────────────────────────────────────────────
|
| 26 |
+
# 1️⃣ Bring the PyTorch model into scope
|
| 27 |
+
# (all code you pasted lives in `vae_decoder.py`)
|
| 28 |
+
# ─────────────────────────────────────────────────────────
|
| 29 |
+
|
| 30 |
+
# --- dummy tensors (exact shapes you posted) ---
|
| 31 |
+
dummy_input = torch.randn(1, 1, 16, 60, 104).half().cuda()
|
| 32 |
+
is_first_frame = torch.tensor([1.0], device="cuda", dtype=torch.float16)
|
| 33 |
+
dummy_cache_input = [
|
| 34 |
+
torch.randn(*s.shape).half().cuda() if isinstance(s, torch.Tensor) else s
|
| 35 |
+
for s in ZERO_VAE_CACHE # keep exactly the same ordering
|
| 36 |
+
]
|
| 37 |
+
inputs = [dummy_input, is_first_frame, *dummy_cache_input]
|
| 38 |
+
|
| 39 |
+
# ─────────────────────────────────────────────────────────
|
| 40 |
+
# 2️⃣ Export → ONNX
|
| 41 |
+
# ─────────────────────────────────────────────────────────
|
| 42 |
+
model = VAEDecoderWrapperSingle().half().cuda().eval()
|
| 43 |
+
|
| 44 |
+
vae_state_dict = torch.load('wan_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth', map_location="cpu")
|
| 45 |
+
decoder_state_dict = {}
|
| 46 |
+
for key, value in vae_state_dict.items():
|
| 47 |
+
if 'decoder.' in key or 'conv2' in key:
|
| 48 |
+
decoder_state_dict[key] = value
|
| 49 |
+
model.load_state_dict(decoder_state_dict)
|
| 50 |
+
model = model.half().cuda().eval() # only batch dim dynamic
|
| 51 |
+
|
| 52 |
+
onnx_path = Path("vae_decoder.onnx")
|
| 53 |
+
feat_names = [f"vae_cache_{i}" for i in range(len(dummy_cache_input))]
|
| 54 |
+
all_inputs_names = ["z", "use_cache"] + feat_names
|
| 55 |
+
|
| 56 |
+
with torch.inference_mode():
|
| 57 |
+
torch.onnx.export(
|
| 58 |
+
model,
|
| 59 |
+
tuple(inputs), # must be a tuple
|
| 60 |
+
onnx_path.as_posix(),
|
| 61 |
+
input_names=all_inputs_names,
|
| 62 |
+
output_names=["rgb_out", "cache_out"],
|
| 63 |
+
opset_version=17,
|
| 64 |
+
do_constant_folding=True,
|
| 65 |
+
dynamo=True
|
| 66 |
+
)
|
| 67 |
+
print(f"✅ ONNX graph saved to {onnx_path.resolve()}")
|
| 68 |
+
|
| 69 |
+
# (Optional) quick sanity-check with ONNX-Runtime
|
| 70 |
+
try:
|
| 71 |
+
import onnxruntime as ort
|
| 72 |
+
sess = ort.InferenceSession(onnx_path.as_posix(),
|
| 73 |
+
providers=["CUDAExecutionProvider"])
|
| 74 |
+
ort_inputs = {n: t.cpu().numpy() for n, t in zip(all_inputs_names, inputs)}
|
| 75 |
+
_ = sess.run(None, ort_inputs)
|
| 76 |
+
print("✅ ONNX graph is executable")
|
| 77 |
+
except Exception as e:
|
| 78 |
+
print("⚠️ ONNX check failed:", e)
|
| 79 |
+
|
| 80 |
+
# ─────────────────────────────────────────────────────────
|
| 81 |
+
# 3️⃣ Build the TensorRT engine
|
| 82 |
+
# ─────────────────────────────────────────────────────────
|
| 83 |
+
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
|
| 84 |
+
builder = trt.Builder(TRT_LOGGER)
|
| 85 |
+
network = builder.create_network(
|
| 86 |
+
1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
|
| 87 |
+
parser = trt.OnnxParser(network, TRT_LOGGER)
|
| 88 |
+
|
| 89 |
+
with open(onnx_path, "rb") as f:
|
| 90 |
+
if not parser.parse(f.read()):
|
| 91 |
+
for i in range(parser.num_errors):
|
| 92 |
+
print(parser.get_error(i))
|
| 93 |
+
sys.exit("❌ ONNX → TRT parsing failed")
|
| 94 |
+
|
| 95 |
+
config = builder.create_builder_config()
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def set_workspace(config, bytes_):
|
| 99 |
+
"""Version-agnostic workspace limit."""
|
| 100 |
+
if hasattr(config, "max_workspace_size"): # TRT 8 / 9
|
| 101 |
+
config.max_workspace_size = bytes_
|
| 102 |
+
else: # TRT 10+
|
| 103 |
+
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, bytes_)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# …
|
| 107 |
+
config = builder.create_builder_config()
|
| 108 |
+
set_workspace(config, 4 << 30) # 4 GB
|
| 109 |
+
# 4 GB
|
| 110 |
+
|
| 111 |
+
if builder.platform_has_fast_fp16:
|
| 112 |
+
config.set_flag(trt.BuilderFlag.FP16)
|
| 113 |
+
|
| 114 |
+
# ---- INT8 (optional) ----
|
| 115 |
+
# provide a calibrator if you need an INT8 engine; comment this
|
| 116 |
+
# block if you only care about FP16.
|
| 117 |
+
# ─────────────────────────────────────────────────────────
|
| 118 |
+
# helper: version-agnostic workspace limit
|
| 119 |
+
# ─────────────────────────────────────────────────────────
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def set_workspace(config: trt.IBuilderConfig, bytes_: int = 4 << 30):
|
| 123 |
+
"""
|
| 124 |
+
TRT < 10.x → config.max_workspace_size
|
| 125 |
+
TRT ≥ 10.x → config.set_memory_pool_limit(...)
|
| 126 |
+
"""
|
| 127 |
+
if hasattr(config, "max_workspace_size"): # TRT 8 / 9
|
| 128 |
+
config.max_workspace_size = bytes_
|
| 129 |
+
else: # TRT 10+
|
| 130 |
+
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE,
|
| 131 |
+
bytes_)
|
| 132 |
+
|
| 133 |
+
# ─────────────────────────────────────────────────────────
|
| 134 |
+
# (optional) INT-8 calibrator
|
| 135 |
+
# ─────────────────────────────────────────────────────────
|
| 136 |
+
# ‼ Only keep this block if you really need INT-8; it is skipped gracefully if PyCUDA is not present ‼
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
class VAECalibrator(trt.IInt8EntropyCalibrator2):
|
| 140 |
+
def __init__(self, loader, cache="calibration.cache", max_batches=10):
|
| 141 |
+
super().__init__()
|
| 142 |
+
self.loader = iter(loader)
|
| 143 |
+
self.batch_size = loader.batch_size or 1
|
| 144 |
+
self.max_batches = max_batches
|
| 145 |
+
self.count = 0
|
| 146 |
+
self.cache_file = cache
|
| 147 |
+
self.stream = cuda.Stream()
|
| 148 |
+
self.dev_ptrs = {}
|
| 149 |
+
|
| 150 |
+
# --- TRT 10 needs BOTH spellings ---
|
| 151 |
+
def get_batch_size(self):
|
| 152 |
+
return self.batch_size
|
| 153 |
+
|
| 154 |
+
def getBatchSize(self):
|
| 155 |
+
return self.batch_size
|
| 156 |
+
|
| 157 |
+
def get_batch(self, names):
|
| 158 |
+
if self.count >= self.max_batches:
|
| 159 |
+
return None
|
| 160 |
+
|
| 161 |
+
# Randomly sample a number from 0 to 10
|
| 162 |
+
import random
|
| 163 |
+
vae_idx = random.randint(0, 10)
|
| 164 |
+
data = next(self.loader)
|
| 165 |
+
|
| 166 |
+
latent = data['ode_latent'][0][:, :1]
|
| 167 |
+
is_first_frame = torch.tensor([1.0], device="cuda", dtype=torch.float16)
|
| 168 |
+
feat_cache = ZERO_VAE_CACHE
|
| 169 |
+
for i in range(vae_idx):
|
| 170 |
+
inputs = [latent, is_first_frame, *feat_cache]
|
| 171 |
+
with torch.inference_mode():
|
| 172 |
+
outputs = model(*inputs)
|
| 173 |
+
latent = data['ode_latent'][0][:, i + 1:i + 2]
|
| 174 |
+
is_first_frame = torch.tensor([0.0], device="cuda", dtype=torch.float16)
|
| 175 |
+
feat_cache = outputs[1:]
|
| 176 |
+
|
| 177 |
+
# -------- ensure context is current --------
|
| 178 |
+
z_np = latent.cpu().numpy().astype('float32')
|
| 179 |
+
|
| 180 |
+
ptrs = [] # list[int] – one entry per name
|
| 181 |
+
for name in names: # <-- match TRT's binding order
|
| 182 |
+
if name == "z":
|
| 183 |
+
arr = z_np
|
| 184 |
+
elif name == "use_cache":
|
| 185 |
+
arr = is_first_frame.cpu().numpy().astype('float32')
|
| 186 |
+
else:
|
| 187 |
+
idx = int(name.split('_')[-1]) # "vae_cache_17" -> 17
|
| 188 |
+
arr = feat_cache[idx].cpu().numpy().astype('float32')
|
| 189 |
+
|
| 190 |
+
if name not in self.dev_ptrs:
|
| 191 |
+
self.dev_ptrs[name] = cuda.mem_alloc(arr.nbytes)
|
| 192 |
+
|
| 193 |
+
cuda.memcpy_htod_async(self.dev_ptrs[name], arr, self.stream)
|
| 194 |
+
ptrs.append(int(self.dev_ptrs[name])) # ***int() is required***
|
| 195 |
+
|
| 196 |
+
self.stream.synchronize()
|
| 197 |
+
self.count += 1
|
| 198 |
+
print(f"Calibration batch {self.count}/{self.max_batches}")
|
| 199 |
+
return ptrs
|
| 200 |
+
|
| 201 |
+
# --- calibration-cache helpers (both spellings) ---
|
| 202 |
+
def read_calibration_cache(self):
|
| 203 |
+
try:
|
| 204 |
+
with open(self.cache_file, "rb") as f:
|
| 205 |
+
return f.read()
|
| 206 |
+
except FileNotFoundError:
|
| 207 |
+
return None
|
| 208 |
+
|
| 209 |
+
def readCalibrationCache(self):
|
| 210 |
+
return self.read_calibration_cache()
|
| 211 |
+
|
| 212 |
+
def write_calibration_cache(self, cache):
|
| 213 |
+
with open(self.cache_file, "wb") as f:
|
| 214 |
+
f.write(cache)
|
| 215 |
+
|
| 216 |
+
def writeCalibrationCache(self, cache):
|
| 217 |
+
self.write_calibration_cache(cache)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
# ─────────────────────────────────────────────────────────
|
| 221 |
+
# Builder-config + optimisation profile
|
| 222 |
+
# ─────────────────────────────────────────────────────────
|
| 223 |
+
config = builder.create_builder_config()
|
| 224 |
+
set_workspace(config, 4 << 30) # 4 GB
|
| 225 |
+
|
| 226 |
+
# ► enable FP16 if possible
|
| 227 |
+
if builder.platform_has_fast_fp16:
|
| 228 |
+
config.set_flag(trt.BuilderFlag.FP16)
|
| 229 |
+
|
| 230 |
+
# ► enable INT-8 (delete this block if you don’t need it)
|
| 231 |
+
if cuda is not None:
|
| 232 |
+
config.set_flag(trt.BuilderFlag.INT8)
|
| 233 |
+
# supply any representative batch you like – here we reuse the latent z
|
| 234 |
+
calib = VAECalibrator(dataloader)
|
| 235 |
+
# TRT-10 renamed the setter:
|
| 236 |
+
if hasattr(config, "set_int8_calibrator"): # TRT 10+
|
| 237 |
+
config.set_int8_calibrator(calib)
|
| 238 |
+
else: # TRT ≤ 9
|
| 239 |
+
config.int8_calibrator = calib
|
| 240 |
+
|
| 241 |
+
# ---- optimisation profile ----
|
| 242 |
+
profile = builder.create_optimization_profile()
|
| 243 |
+
profile.set_shape(all_inputs_names[0], # latent z
|
| 244 |
+
min=(1, 1, 16, 60, 104),
|
| 245 |
+
opt=(1, 1, 16, 60, 104),
|
| 246 |
+
max=(1, 1, 16, 60, 104))
|
| 247 |
+
profile.set_shape("use_cache", # scalar flag
|
| 248 |
+
min=(1,), opt=(1,), max=(1,))
|
| 249 |
+
for name, tensor in zip(all_inputs_names[2:], dummy_cache_input):
|
| 250 |
+
profile.set_shape(name, tensor.shape, tensor.shape, tensor.shape)
|
| 251 |
+
|
| 252 |
+
config.add_optimization_profile(profile)
|
| 253 |
+
|
| 254 |
+
# ─────────────────────────────────────────────────────────
|
| 255 |
+
# Build the engine (API changed in TRT-10)
|
| 256 |
+
# ─────────────────────────────────────────────────────────
|
| 257 |
+
print("⚙️ Building engine … (can take a minute)")
|
| 258 |
+
|
| 259 |
+
if hasattr(builder, "build_serialized_network"): # TRT 10+
|
| 260 |
+
serialized_engine = builder.build_serialized_network(network, config)
|
| 261 |
+
assert serialized_engine is not None, "build_serialized_network() failed"
|
| 262 |
+
plan_path = Path("checkpoints/vae_decoder_int8.trt")
|
| 263 |
+
plan_path.write_bytes(serialized_engine)
|
| 264 |
+
engine_bytes = serialized_engine # keep for smoke-test
|
| 265 |
+
else: # TRT ≤ 9
|
| 266 |
+
engine = builder.build_engine(network, config)
|
| 267 |
+
assert engine is not None, "build_engine() returned None"
|
| 268 |
+
plan_path = Path("checkpoints/vae_decoder_int8.trt")
|
| 269 |
+
plan_path.write_bytes(engine.serialize())
|
| 270 |
+
engine_bytes = engine.serialize()
|
| 271 |
+
|
| 272 |
+
print(f"✅ TensorRT engine written to {plan_path.resolve()}")
|
| 273 |
+
|
| 274 |
+
# ─────────────────────────────────────────────────────────
|
| 275 |
+
# 4️⃣ Quick smoke-test with the brand-new engine
|
| 276 |
+
# ─────────────────────────────────────────────────────────
|
| 277 |
+
with trt.Runtime(TRT_LOGGER) as rt:
|
| 278 |
+
engine = rt.deserialize_cuda_engine(engine_bytes)
|
| 279 |
+
context = engine.create_execution_context()
|
| 280 |
+
stream = torch.cuda.current_stream().cuda_stream
|
| 281 |
+
|
| 282 |
+
# pre-allocate device buffers once
|
| 283 |
+
device_buffers, outputs = {}, []
|
| 284 |
+
dtype_map = {trt.float32: torch.float32,
|
| 285 |
+
trt.float16: torch.float16,
|
| 286 |
+
trt.int8: torch.int8,
|
| 287 |
+
trt.int32: torch.int32}
|
| 288 |
+
|
| 289 |
+
for name, tensor in zip(all_inputs_names, inputs):
|
| 290 |
+
if -1 in engine.get_tensor_shape(name): # dynamic input
|
| 291 |
+
context.set_input_shape(name, tensor.shape)
|
| 292 |
+
context.set_tensor_address(name, int(tensor.data_ptr()))
|
| 293 |
+
device_buffers[name] = tensor
|
| 294 |
+
|
| 295 |
+
context.infer_shapes() # propagate ⇢ outputs
|
| 296 |
+
for i in range(engine.num_io_tensors):
|
| 297 |
+
name = engine.get_tensor_name(i)
|
| 298 |
+
if engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
|
| 299 |
+
shape = tuple(context.get_tensor_shape(name))
|
| 300 |
+
dtype = dtype_map[engine.get_tensor_dtype(name)]
|
| 301 |
+
out = torch.empty(shape, dtype=dtype, device="cuda")
|
| 302 |
+
context.set_tensor_address(name, int(out.data_ptr()))
|
| 303 |
+
outputs.append(out)
|
| 304 |
+
print(f"output {name} shape: {shape}")
|
| 305 |
+
|
| 306 |
+
context.execute_async_v3(stream_handle=stream)
|
| 307 |
+
torch.cuda.current_stream().synchronize()
|
| 308 |
+
print("✅ TRT execution OK – first output shape:", outputs[0].shape)
|
inference.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import argparse
|
| 3 |
+
import torch
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
from omegaconf import OmegaConf
|
| 7 |
+
from torchvision.transforms import v2
|
| 8 |
+
from diffusers.utils import load_image
|
| 9 |
+
from einops import rearrange
|
| 10 |
+
from pipeline import CausalInferencePipeline
|
| 11 |
+
from wan.vae.wanx_vae import get_wanx_vae_wrapper
|
| 12 |
+
from demo_utils.vae_block3 import VAEDecoderWrapper
|
| 13 |
+
from utils.visualize import process_video
|
| 14 |
+
from utils.misc import set_seed
|
| 15 |
+
from utils.conditions import *
|
| 16 |
+
from utils.wan_wrapper import WanDiffusionWrapper
|
| 17 |
+
from safetensors.torch import load_file
|
| 18 |
+
|
| 19 |
+
def parse_args():
|
| 20 |
+
parser = argparse.ArgumentParser()
|
| 21 |
+
parser.add_argument("--config_path", type=str, default="configs/inference_yaml/inference_universal.yaml", help="Path to the config file")
|
| 22 |
+
parser.add_argument("--checkpoint_path", type=str, default="", help="Path to the checkpoint")
|
| 23 |
+
parser.add_argument("--img_path", type=str, default="demo_images/universal/0000.png", help="Path to the image")
|
| 24 |
+
parser.add_argument("--output_folder", type=str, default="outputs/", help="Output folder")
|
| 25 |
+
parser.add_argument("--num_output_frames", type=int, default=150,
|
| 26 |
+
help="Number of output latent frames")
|
| 27 |
+
parser.add_argument("--seed", type=int, default=0, help="Random seed")
|
| 28 |
+
parser.add_argument("--pretrained_model_path", type=str, default="Matrix-Game-2.0", help="Path to the VAE model folder")
|
| 29 |
+
args = parser.parse_args()
|
| 30 |
+
return args
|
| 31 |
+
|
| 32 |
+
class InteractiveGameInference:
|
| 33 |
+
def __init__(self, args):
|
| 34 |
+
self.args = args
|
| 35 |
+
self.device = torch.device("cuda")
|
| 36 |
+
self.weight_dtype = torch.bfloat16
|
| 37 |
+
|
| 38 |
+
self._init_config()
|
| 39 |
+
self._init_models()
|
| 40 |
+
|
| 41 |
+
self.frame_process = v2.Compose([
|
| 42 |
+
v2.Resize(size=(352, 640), antialias=True),
|
| 43 |
+
v2.ToTensor(),
|
| 44 |
+
v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
|
| 45 |
+
])
|
| 46 |
+
|
| 47 |
+
def _init_config(self):
|
| 48 |
+
self.config = OmegaConf.load(self.args.config_path)
|
| 49 |
+
|
| 50 |
+
def _init_models(self):
|
| 51 |
+
# Initialize pipeline
|
| 52 |
+
generator = WanDiffusionWrapper(
|
| 53 |
+
**getattr(self.config, "model_kwargs", {}), is_causal=True)
|
| 54 |
+
current_vae_decoder = VAEDecoderWrapper()
|
| 55 |
+
vae_state_dict = torch.load(os.path.join(self.args.pretrained_model_path, "Wan2.1_VAE.pth"), map_location="cpu")
|
| 56 |
+
decoder_state_dict = {}
|
| 57 |
+
for key, value in vae_state_dict.items():
|
| 58 |
+
if 'decoder.' in key or 'conv2' in key:
|
| 59 |
+
decoder_state_dict[key] = value
|
| 60 |
+
current_vae_decoder.load_state_dict(decoder_state_dict)
|
| 61 |
+
current_vae_decoder.to(self.device, torch.float16)
|
| 62 |
+
current_vae_decoder.requires_grad_(False)
|
| 63 |
+
current_vae_decoder.eval()
|
| 64 |
+
current_vae_decoder.compile(mode="max-autotune-no-cudagraphs")
|
| 65 |
+
pipeline = CausalInferencePipeline(self.config, generator=generator, vae_decoder=current_vae_decoder)
|
| 66 |
+
if self.args.checkpoint_path:
|
| 67 |
+
print("Loading Pretrained Model...")
|
| 68 |
+
state_dict = load_file(self.args.checkpoint_path)
|
| 69 |
+
pipeline.generator.load_state_dict(state_dict)
|
| 70 |
+
|
| 71 |
+
self.pipeline = pipeline.to(device=self.device, dtype=self.weight_dtype)
|
| 72 |
+
self.pipeline.vae_decoder.to(torch.float16)
|
| 73 |
+
|
| 74 |
+
vae = get_wanx_vae_wrapper(self.args.pretrained_model_path, torch.float16)
|
| 75 |
+
vae.requires_grad_(False)
|
| 76 |
+
vae.eval()
|
| 77 |
+
self.vae = vae.to(self.device, self.weight_dtype)
|
| 78 |
+
|
| 79 |
+
def _resizecrop(self, image, th, tw):
|
| 80 |
+
w, h = image.size
|
| 81 |
+
if h / w > th / tw:
|
| 82 |
+
new_w = int(w)
|
| 83 |
+
new_h = int(new_w * th / tw)
|
| 84 |
+
else:
|
| 85 |
+
new_h = int(h)
|
| 86 |
+
new_w = int(new_h * tw / th)
|
| 87 |
+
left = (w - new_w) / 2
|
| 88 |
+
top = (h - new_h) / 2
|
| 89 |
+
right = (w + new_w) / 2
|
| 90 |
+
bottom = (h + new_h) / 2
|
| 91 |
+
image = image.crop((left, top, right, bottom))
|
| 92 |
+
return image
|
| 93 |
+
|
| 94 |
+
def generate_videos(self):
|
| 95 |
+
mode = self.config.pop('mode')
|
| 96 |
+
assert mode in ['universal', 'gta_drive', 'templerun']
|
| 97 |
+
|
| 98 |
+
image = load_image(self.args.img_path)
|
| 99 |
+
image = self._resizecrop(image, 352, 640)
|
| 100 |
+
image = self.frame_process(image)[None, :, None, :, :].to(dtype=self.weight_dtype, device=self.device)
|
| 101 |
+
# Encode the input image as the first latent
|
| 102 |
+
padding_video = torch.zeros_like(image).repeat(1, 1, 4 * (self.args.num_output_frames - 1), 1, 1)
|
| 103 |
+
img_cond = torch.concat([image, padding_video], dim=2)
|
| 104 |
+
tiler_kwargs={"tiled": True, "tile_size": [44, 80], "tile_stride": [23, 38]}
|
| 105 |
+
img_cond = self.vae.encode(img_cond, device=self.device, **tiler_kwargs).to(self.device)
|
| 106 |
+
mask_cond = torch.ones_like(img_cond)
|
| 107 |
+
mask_cond[:, :, 1:] = 0
|
| 108 |
+
cond_concat = torch.cat([mask_cond[:, :4], img_cond], dim=1)
|
| 109 |
+
visual_context = self.vae.clip.encode_video(image)
|
| 110 |
+
sampled_noise = torch.randn(
|
| 111 |
+
[1, 16,self.args.num_output_frames, 44, 80], device=self.device, dtype=self.weight_dtype
|
| 112 |
+
)
|
| 113 |
+
num_frames = (self.args.num_output_frames - 1) * 4 + 1
|
| 114 |
+
|
| 115 |
+
conditional_dict = {
|
| 116 |
+
"cond_concat": cond_concat.to(device=self.device, dtype=self.weight_dtype),
|
| 117 |
+
"visual_context": visual_context.to(device=self.device, dtype=self.weight_dtype)
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
if mode == 'universal':
|
| 121 |
+
cond_data = Bench_actions_universal(num_frames)
|
| 122 |
+
mouse_condition = cond_data['mouse_condition'].unsqueeze(0).to(device=self.device, dtype=self.weight_dtype)
|
| 123 |
+
conditional_dict['mouse_cond'] = mouse_condition
|
| 124 |
+
elif mode == 'gta_drive':
|
| 125 |
+
cond_data = Bench_actions_gta_drive(num_frames)
|
| 126 |
+
mouse_condition = cond_data['mouse_condition'].unsqueeze(0).to(device=self.device, dtype=self.weight_dtype)
|
| 127 |
+
conditional_dict['mouse_cond'] = mouse_condition
|
| 128 |
+
else:
|
| 129 |
+
cond_data = Bench_actions_templerun(num_frames)
|
| 130 |
+
keyboard_condition = cond_data['keyboard_condition'].unsqueeze(0).to(device=self.device, dtype=self.weight_dtype)
|
| 131 |
+
conditional_dict['keyboard_cond'] = keyboard_condition
|
| 132 |
+
|
| 133 |
+
with torch.no_grad():
|
| 134 |
+
videos = self.pipeline.inference(
|
| 135 |
+
noise=sampled_noise,
|
| 136 |
+
conditional_dict=conditional_dict,
|
| 137 |
+
return_latents=False,
|
| 138 |
+
mode=mode,
|
| 139 |
+
profile=False
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
videos_tensor = torch.cat(videos, dim=1)
|
| 143 |
+
videos = rearrange(videos_tensor, "B T C H W -> B T H W C")
|
| 144 |
+
videos = ((videos.float() + 1) * 127.5).clip(0, 255).cpu().numpy().astype(np.uint8)[0]
|
| 145 |
+
video = np.ascontiguousarray(videos)
|
| 146 |
+
mouse_icon = 'assets/images/mouse.png'
|
| 147 |
+
if mode != 'templerun':
|
| 148 |
+
config = (
|
| 149 |
+
keyboard_condition[0].float().cpu().numpy(),
|
| 150 |
+
mouse_condition[0].float().cpu().numpy()
|
| 151 |
+
)
|
| 152 |
+
else:
|
| 153 |
+
config = (
|
| 154 |
+
keyboard_condition[0].float().cpu().numpy()
|
| 155 |
+
)
|
| 156 |
+
process_video(video.astype(np.uint8), self.args.output_folder+f'/demo.mp4', config, mouse_icon, mouse_scale=0.1, process_icon=False, mode=mode)
|
| 157 |
+
process_video(video.astype(np.uint8), self.args.output_folder+f'/demo_icon.mp4', config, mouse_icon, mouse_scale=0.1, process_icon=True, mode=mode)
|
| 158 |
+
print("Done")
|
| 159 |
+
|
| 160 |
+
def main():
|
| 161 |
+
"""Main entry point for video generation."""
|
| 162 |
+
args = parse_args()
|
| 163 |
+
set_seed(args.seed)
|
| 164 |
+
os.makedirs(args.output_folder, exist_ok=True)
|
| 165 |
+
pipeline = InteractiveGameInference(args)
|
| 166 |
+
pipeline.generate_videos()
|
| 167 |
+
|
| 168 |
+
if __name__ == "__main__":
|
| 169 |
+
main()
|
inference_streaming.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import argparse
|
| 3 |
+
import torch
|
| 4 |
+
import numpy as np
|
| 5 |
+
import copy
|
| 6 |
+
|
| 7 |
+
from omegaconf import OmegaConf
|
| 8 |
+
from torchvision.transforms import v2
|
| 9 |
+
from diffusers.utils import load_image
|
| 10 |
+
|
| 11 |
+
from pipeline import CausalInferenceStreamingPipeline
|
| 12 |
+
from wan.vae.wanx_vae import get_wanx_vae_wrapper
|
| 13 |
+
from demo_utils.vae_block3 import VAEDecoderWrapper
|
| 14 |
+
from utils.visualize import process_video
|
| 15 |
+
from utils.misc import set_seed
|
| 16 |
+
from utils.conditions import *
|
| 17 |
+
from utils.wan_wrapper import WanDiffusionWrapper
|
| 18 |
+
from safetensors.torch import load_file
|
| 19 |
+
|
| 20 |
+
def parse_args():
|
| 21 |
+
parser = argparse.ArgumentParser()
|
| 22 |
+
parser.add_argument("--config_path", type=str, default="configs/inference_yaml/inference_universal.yaml", help="Path to the config file")
|
| 23 |
+
parser.add_argument("--checkpoint_path", type=str, default="", help="Path to the checkpoint")
|
| 24 |
+
parser.add_argument("--output_folder", type=str, default="outputs/", help="Output folder")
|
| 25 |
+
parser.add_argument("--max_num_output_frames", type=int, default=360,
|
| 26 |
+
help="Max number of output latent frames")
|
| 27 |
+
parser.add_argument("--seed", type=int, default=0, help="Random seed")
|
| 28 |
+
parser.add_argument("--pretrained_model_path", type=str, default="Matrix-Game-2.0", help="Path to the VAE model folder")
|
| 29 |
+
args = parser.parse_args()
|
| 30 |
+
return args
|
| 31 |
+
|
| 32 |
+
class InteractiveGameInference:
|
| 33 |
+
def __init__(self, args):
|
| 34 |
+
self.args = args
|
| 35 |
+
self.device = torch.device("cuda")
|
| 36 |
+
self.weight_dtype = torch.bfloat16
|
| 37 |
+
|
| 38 |
+
self._init_config()
|
| 39 |
+
self._init_models()
|
| 40 |
+
|
| 41 |
+
self.frame_process = v2.Compose([
|
| 42 |
+
v2.Resize(size=(352, 640), antialias=True),
|
| 43 |
+
v2.ToTensor(),
|
| 44 |
+
v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
|
| 45 |
+
])
|
| 46 |
+
|
| 47 |
+
def _init_config(self):
|
| 48 |
+
self.config = OmegaConf.load(self.args.config_path)
|
| 49 |
+
|
| 50 |
+
def _init_models(self):
|
| 51 |
+
# Initialize pipeline
|
| 52 |
+
generator = WanDiffusionWrapper(
|
| 53 |
+
**getattr(self.config, "model_kwargs", {}), is_causal=True)
|
| 54 |
+
current_vae_decoder = VAEDecoderWrapper()
|
| 55 |
+
vae_state_dict = torch.load(os.path.join(self.args.pretrained_model_path, "Wan2.1_VAE.pth"), map_location="cpu")
|
| 56 |
+
decoder_state_dict = {}
|
| 57 |
+
for key, value in vae_state_dict.items():
|
| 58 |
+
if 'decoder.' in key or 'conv2' in key:
|
| 59 |
+
decoder_state_dict[key] = value
|
| 60 |
+
current_vae_decoder.load_state_dict(decoder_state_dict)
|
| 61 |
+
current_vae_decoder.to(self.device, torch.float16)
|
| 62 |
+
current_vae_decoder.requires_grad_(False)
|
| 63 |
+
current_vae_decoder.eval()
|
| 64 |
+
current_vae_decoder.compile(mode="max-autotune-no-cudagraphs")
|
| 65 |
+
pipeline = CausalInferenceStreamingPipeline(self.config, generator=generator, vae_decoder=current_vae_decoder)
|
| 66 |
+
if self.args.checkpoint_path:
|
| 67 |
+
print("Loading Pretrained Model...")
|
| 68 |
+
state_dict = load_file(self.args.checkpoint_path)
|
| 69 |
+
pipeline.generator.load_state_dict(state_dict)
|
| 70 |
+
|
| 71 |
+
self.pipeline = pipeline.to(device=self.device, dtype=self.weight_dtype)
|
| 72 |
+
self.pipeline.vae_decoder.to(torch.float16)
|
| 73 |
+
|
| 74 |
+
vae = get_wanx_vae_wrapper(self.args.pretrained_model_path, torch.float16)
|
| 75 |
+
vae.requires_grad_(False)
|
| 76 |
+
vae.eval()
|
| 77 |
+
self.vae = vae.to(self.device, self.weight_dtype)
|
| 78 |
+
|
| 79 |
+
def _resizecrop(self, image, th, tw):
|
| 80 |
+
w, h = image.size
|
| 81 |
+
if h / w > th / tw:
|
| 82 |
+
new_w = int(w)
|
| 83 |
+
new_h = int(new_w * th / tw)
|
| 84 |
+
else:
|
| 85 |
+
new_h = int(h)
|
| 86 |
+
new_w = int(new_h * tw / th)
|
| 87 |
+
left = (w - new_w) / 2
|
| 88 |
+
top = (h - new_h) / 2
|
| 89 |
+
right = (w + new_w) / 2
|
| 90 |
+
bottom = (h + new_h) / 2
|
| 91 |
+
image = image.crop((left, top, right, bottom))
|
| 92 |
+
return image
|
| 93 |
+
|
| 94 |
+
def generate_videos(self, mode='universal'):
|
| 95 |
+
assert mode in ['universal', 'gta_drive', 'templerun']
|
| 96 |
+
|
| 97 |
+
while True:
|
| 98 |
+
try:
|
| 99 |
+
img_path = input("Please input the image path: ")
|
| 100 |
+
image = load_image(img_path.strip())
|
| 101 |
+
break
|
| 102 |
+
except:
|
| 103 |
+
print(f"Fail to load image from {img_path}!")
|
| 104 |
+
|
| 105 |
+
image = self._resizecrop(image, 352, 640)
|
| 106 |
+
image = self.frame_process(image)[None, :, None, :, :].to(dtype=self.weight_dtype, device=self.device)
|
| 107 |
+
# Encode the input image as the first latent
|
| 108 |
+
padding_video = torch.zeros_like(image).repeat(1, 1, 4 * (self.args.max_num_output_frames - 1), 1, 1)
|
| 109 |
+
img_cond = torch.concat([image, padding_video], dim=2)
|
| 110 |
+
tiler_kwargs={"tiled": True, "tile_size": [44, 80], "tile_stride": [23, 38]}
|
| 111 |
+
img_cond = self.vae.encode(img_cond, device=self.device, **tiler_kwargs).to(self.device)
|
| 112 |
+
mask_cond = torch.ones_like(img_cond)
|
| 113 |
+
mask_cond[:, :, 1:] = 0
|
| 114 |
+
cond_concat = torch.cat([mask_cond[:, :4], img_cond], dim=1)
|
| 115 |
+
visual_context = self.vae.clip.encode_video(image)
|
| 116 |
+
sampled_noise = torch.randn(
|
| 117 |
+
[1, 16,self.args.max_num_output_frames, 44, 80], device=self.device, dtype=self.weight_dtype
|
| 118 |
+
)
|
| 119 |
+
num_frames = (self.args.max_num_output_frames - 1) * 4 + 1
|
| 120 |
+
|
| 121 |
+
conditional_dict = {
|
| 122 |
+
"cond_concat": cond_concat.to(device=self.device, dtype=self.weight_dtype),
|
| 123 |
+
"visual_context": visual_context.to(device=self.device, dtype=self.weight_dtype)
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
if mode == 'universal':
|
| 127 |
+
cond_data = Bench_actions_universal(num_frames)
|
| 128 |
+
mouse_condition = cond_data['mouse_condition'].unsqueeze(0).to(device=self.device, dtype=self.weight_dtype)
|
| 129 |
+
conditional_dict['mouse_cond'] = mouse_condition
|
| 130 |
+
elif mode == 'gta_drive':
|
| 131 |
+
cond_data = Bench_actions_gta_drive(num_frames)
|
| 132 |
+
mouse_condition = cond_data['mouse_condition'].unsqueeze(0).to(device=self.device, dtype=self.weight_dtype)
|
| 133 |
+
conditional_dict['mouse_cond'] = mouse_condition
|
| 134 |
+
else:
|
| 135 |
+
cond_data = Bench_actions_templerun(num_frames)
|
| 136 |
+
keyboard_condition = cond_data['keyboard_condition'].unsqueeze(0).to(device=self.device, dtype=self.weight_dtype)
|
| 137 |
+
conditional_dict['keyboard_cond'] = keyboard_condition
|
| 138 |
+
|
| 139 |
+
with torch.no_grad():
|
| 140 |
+
videos = self.pipeline.inference(
|
| 141 |
+
noise=sampled_noise,
|
| 142 |
+
conditional_dict=conditional_dict,
|
| 143 |
+
return_latents=False,
|
| 144 |
+
output_folder=self.args.output_folder,
|
| 145 |
+
name=os.path.basename(img_path),
|
| 146 |
+
mode=mode
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
def main():
|
| 150 |
+
"""Main entry point for video generation."""
|
| 151 |
+
args = parse_args()
|
| 152 |
+
set_seed(args.seed)
|
| 153 |
+
os.makedirs(args.output_folder, exist_ok=True)
|
| 154 |
+
pipeline = InteractiveGameInference(args)
|
| 155 |
+
mode = pipeline.config.pop('mode')
|
| 156 |
+
stop = ''
|
| 157 |
+
while stop != 'n':
|
| 158 |
+
pipeline.generate_videos(mode)
|
| 159 |
+
stop = input("Press `n` to stop generation: ").strip().lower()
|
| 160 |
+
if __name__ == "__main__":
|
| 161 |
+
main()
|
pipeline/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .causal_inference import CausalInferencePipeline, CausalInferenceStreamingPipeline
|
| 2 |
+
__all__ = [
|
| 3 |
+
"CausalInferencePipeline",
|
| 4 |
+
"CausalInferenceStreamingPipeline"
|
| 5 |
+
]
|
pipeline/causal_inference.py
ADDED
|
@@ -0,0 +1,753 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Optional
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
import time
|
| 5 |
+
import copy
|
| 6 |
+
|
| 7 |
+
from einops import rearrange
|
| 8 |
+
from utils.wan_wrapper import WanDiffusionWrapper, WanVAEWrapper
|
| 9 |
+
from utils.visualize import process_video
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
from demo_utils.constant import ZERO_VAE_CACHE
|
| 12 |
+
from tqdm import tqdm
|
| 13 |
+
|
| 14 |
+
def get_current_action(mode="universal"):
|
| 15 |
+
|
| 16 |
+
CAM_VALUE = 0.1
|
| 17 |
+
if mode == 'universal':
|
| 18 |
+
print()
|
| 19 |
+
print('-'*30)
|
| 20 |
+
print("PRESS [I, K, J, L, U] FOR CAMERA TRANSFORM\n (I: up, K: down, J: left, L: right, U: no move)")
|
| 21 |
+
print("PRESS [W, S, A, D, Q] FOR MOVEMENT\n (W: forward, S: back, A: left, D: right, Q: no move)")
|
| 22 |
+
print('-'*30)
|
| 23 |
+
CAMERA_VALUE_MAP = {
|
| 24 |
+
"i": [CAM_VALUE, 0],
|
| 25 |
+
"k": [-CAM_VALUE, 0],
|
| 26 |
+
"j": [0, -CAM_VALUE],
|
| 27 |
+
"l": [0, CAM_VALUE],
|
| 28 |
+
"u": [0, 0]
|
| 29 |
+
}
|
| 30 |
+
KEYBOARD_IDX = {
|
| 31 |
+
"w": [1, 0, 0, 0], "s": [0, 1, 0, 0], "a": [0, 0, 1, 0], "d": [0, 0, 0, 1],
|
| 32 |
+
"q": [0, 0, 0, 0]
|
| 33 |
+
}
|
| 34 |
+
flag = 0
|
| 35 |
+
while flag != 1:
|
| 36 |
+
try:
|
| 37 |
+
idx_mouse = input('Please input the mouse action (e.g. `U`):\n').strip().lower()
|
| 38 |
+
idx_keyboard = input('Please input the keyboard action (e.g. `W`):\n').strip().lower()
|
| 39 |
+
if idx_mouse in CAMERA_VALUE_MAP.keys() and idx_keyboard in KEYBOARD_IDX.keys():
|
| 40 |
+
flag = 1
|
| 41 |
+
except:
|
| 42 |
+
pass
|
| 43 |
+
mouse_cond = torch.tensor(CAMERA_VALUE_MAP[idx_mouse]).cuda()
|
| 44 |
+
keyboard_cond = torch.tensor(KEYBOARD_IDX[idx_keyboard]).cuda()
|
| 45 |
+
elif mode == 'gta_drive':
|
| 46 |
+
print()
|
| 47 |
+
print('-'*30)
|
| 48 |
+
print("PRESS [W, S, A, D, Q] FOR MOVEMENT\n (W: forward, S: back, A: left, D: right, Q: no move)")
|
| 49 |
+
print('-'*30)
|
| 50 |
+
CAMERA_VALUE_MAP = {
|
| 51 |
+
"a": [0, -CAM_VALUE],
|
| 52 |
+
"d": [0, CAM_VALUE],
|
| 53 |
+
"q": [0, 0]
|
| 54 |
+
}
|
| 55 |
+
KEYBOARD_IDX = {
|
| 56 |
+
"w": [1, 0], "s": [0, 1],
|
| 57 |
+
"q": [0, 0]
|
| 58 |
+
}
|
| 59 |
+
flag = 0
|
| 60 |
+
while flag != 1:
|
| 61 |
+
try:
|
| 62 |
+
indexes = input('Please input the actions (split with ` `):\n(e.g. `W` for forward, `W A` for forward and left)\n').strip().lower().split(' ')
|
| 63 |
+
idx_mouse = []
|
| 64 |
+
idx_keyboard = []
|
| 65 |
+
for i in indexes:
|
| 66 |
+
if i in CAMERA_VALUE_MAP.keys():
|
| 67 |
+
idx_mouse += [i]
|
| 68 |
+
elif i in KEYBOARD_IDX.keys():
|
| 69 |
+
idx_keyboard += [i]
|
| 70 |
+
if len(idx_mouse) == 0:
|
| 71 |
+
idx_mouse += ['q']
|
| 72 |
+
if len(idx_keyboard) == 0:
|
| 73 |
+
idx_keyboard += ['q']
|
| 74 |
+
assert idx_mouse in [['a'], ['d'], ['q']] and idx_keyboard in [['q'], ['w'], ['s']]
|
| 75 |
+
flag = 1
|
| 76 |
+
except:
|
| 77 |
+
pass
|
| 78 |
+
mouse_cond = torch.tensor(CAMERA_VALUE_MAP[idx_mouse[0]]).cuda()
|
| 79 |
+
keyboard_cond = torch.tensor(KEYBOARD_IDX[idx_keyboard[0]]).cuda()
|
| 80 |
+
elif mode == 'templerun':
|
| 81 |
+
print()
|
| 82 |
+
print('-'*30)
|
| 83 |
+
print("PRESS [W, S, A, D, Z, C, Q] FOR ACTIONS\n (W: jump, S: slide, A: left side, D: right side, Z: turn left, C: turn right, Q: no move)")
|
| 84 |
+
print('-'*30)
|
| 85 |
+
KEYBOARD_IDX = {
|
| 86 |
+
"w": [0, 1, 0, 0, 0, 0, 0], "s": [0, 0, 1, 0, 0, 0, 0],
|
| 87 |
+
"a": [0, 0, 0, 0, 0, 1, 0], "d": [0, 0, 0, 0, 0, 0, 1],
|
| 88 |
+
"z": [0, 0, 0, 1, 0, 0, 0], "c": [0, 0, 0, 0, 1, 0, 0],
|
| 89 |
+
"q": [1, 0, 0, 0, 0, 0, 0]
|
| 90 |
+
}
|
| 91 |
+
flag = 0
|
| 92 |
+
while flag != 1:
|
| 93 |
+
try:
|
| 94 |
+
idx_keyboard = input('Please input the action: \n(e.g. `W` for forward, `Z` for turning left)\n').strip().lower()
|
| 95 |
+
if idx_keyboard in KEYBOARD_IDX.keys():
|
| 96 |
+
flag = 1
|
| 97 |
+
except:
|
| 98 |
+
pass
|
| 99 |
+
keyboard_cond = torch.tensor(KEYBOARD_IDX[idx_keyboard]).cuda()
|
| 100 |
+
|
| 101 |
+
if mode != 'templerun':
|
| 102 |
+
return {
|
| 103 |
+
"mouse": mouse_cond,
|
| 104 |
+
"keyboard": keyboard_cond
|
| 105 |
+
}
|
| 106 |
+
return {
|
| 107 |
+
"keyboard": keyboard_cond
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
def cond_current(conditional_dict, current_start_frame, num_frame_per_block, replace=None, mode='universal'):
|
| 111 |
+
|
| 112 |
+
new_cond = {}
|
| 113 |
+
|
| 114 |
+
new_cond["cond_concat"] = conditional_dict["cond_concat"][:, :, current_start_frame: current_start_frame + num_frame_per_block]
|
| 115 |
+
new_cond["visual_context"] = conditional_dict["visual_context"]
|
| 116 |
+
if replace != None:
|
| 117 |
+
if current_start_frame == 0:
|
| 118 |
+
last_frame_num = 1 + 4 * (num_frame_per_block - 1)
|
| 119 |
+
else:
|
| 120 |
+
last_frame_num = 4 * num_frame_per_block
|
| 121 |
+
final_frame = 1 + 4 * (current_start_frame + num_frame_per_block-1)
|
| 122 |
+
if mode != 'templerun':
|
| 123 |
+
conditional_dict["mouse_cond"][:, -last_frame_num + final_frame: final_frame] = replace['mouse'][None, None, :].repeat(1, last_frame_num, 1)
|
| 124 |
+
conditional_dict["keyboard_cond"][:, -last_frame_num + final_frame: final_frame] = replace['keyboard'][None, None, :].repeat(1, last_frame_num, 1)
|
| 125 |
+
if mode != 'templerun':
|
| 126 |
+
new_cond["mouse_cond"] = conditional_dict["mouse_cond"][:, : 1 + 4 * (current_start_frame + num_frame_per_block - 1)]
|
| 127 |
+
new_cond["keyboard_cond"] = conditional_dict["keyboard_cond"][:, : 1 + 4 * (current_start_frame + num_frame_per_block - 1)]
|
| 128 |
+
|
| 129 |
+
if replace != None:
|
| 130 |
+
return new_cond, conditional_dict
|
| 131 |
+
else:
|
| 132 |
+
return new_cond
|
| 133 |
+
|
| 134 |
+
class CausalInferencePipeline(torch.nn.Module):
|
| 135 |
+
def __init__(
|
| 136 |
+
self,
|
| 137 |
+
args,
|
| 138 |
+
device="cuda",
|
| 139 |
+
generator=None,
|
| 140 |
+
vae_decoder=None,
|
| 141 |
+
):
|
| 142 |
+
super().__init__()
|
| 143 |
+
# Step 1: Initialize all models
|
| 144 |
+
self.generator = WanDiffusionWrapper(
|
| 145 |
+
**getattr(args, "model_kwargs", {}), is_causal=True) if generator is None else generator
|
| 146 |
+
|
| 147 |
+
self.vae_decoder = vae_decoder
|
| 148 |
+
# Step 2: Initialize all causal hyperparmeters
|
| 149 |
+
self.scheduler = self.generator.get_scheduler()
|
| 150 |
+
self.denoising_step_list = torch.tensor(
|
| 151 |
+
args.denoising_step_list, dtype=torch.long)
|
| 152 |
+
if args.warp_denoising_step:
|
| 153 |
+
timesteps = torch.cat((self.scheduler.timesteps.cpu(), torch.tensor([0], dtype=torch.float32)))
|
| 154 |
+
self.denoising_step_list = timesteps[1000 - self.denoising_step_list]
|
| 155 |
+
|
| 156 |
+
self.num_transformer_blocks = 30
|
| 157 |
+
self.frame_seq_length = 880
|
| 158 |
+
|
| 159 |
+
self.kv_cache1 = None
|
| 160 |
+
self.kv_cache_mouse = None
|
| 161 |
+
self.kv_cache_keyboard = None
|
| 162 |
+
self.args = args
|
| 163 |
+
self.num_frame_per_block = getattr(args, "num_frame_per_block", 1)
|
| 164 |
+
self.local_attn_size = self.generator.model.local_attn_size
|
| 165 |
+
assert self.local_attn_size != -1
|
| 166 |
+
print(f"KV inference with {self.num_frame_per_block} frames per block")
|
| 167 |
+
|
| 168 |
+
if self.num_frame_per_block > 1:
|
| 169 |
+
self.generator.model.num_frame_per_block = self.num_frame_per_block
|
| 170 |
+
|
| 171 |
+
def inference(
|
| 172 |
+
self,
|
| 173 |
+
noise: torch.Tensor,
|
| 174 |
+
conditional_dict,
|
| 175 |
+
initial_latent = None,
|
| 176 |
+
return_latents = False,
|
| 177 |
+
mode = 'universal',
|
| 178 |
+
profile = False,
|
| 179 |
+
) -> torch.Tensor:
|
| 180 |
+
"""
|
| 181 |
+
Perform inference on the given noise and text prompts.
|
| 182 |
+
Inputs:
|
| 183 |
+
noise (torch.Tensor): The input noise tensor of shape
|
| 184 |
+
(batch_size, num_output_frames, num_channels, height, width).
|
| 185 |
+
text_prompts (List[str]): The list of text prompts.
|
| 186 |
+
initial_latent (torch.Tensor): The initial latent tensor of shape
|
| 187 |
+
(batch_size, num_input_frames, num_channels, height, width).
|
| 188 |
+
If num_input_frames is 1, perform image to video.
|
| 189 |
+
If num_input_frames is greater than 1, perform video extension.
|
| 190 |
+
return_latents (bool): Whether to return the latents.
|
| 191 |
+
Outputs:
|
| 192 |
+
video (torch.Tensor): The generated video tensor of shape
|
| 193 |
+
(batch_size, num_output_frames, num_channels, height, width).
|
| 194 |
+
It is normalized to be in the range [0, 1].
|
| 195 |
+
"""
|
| 196 |
+
|
| 197 |
+
assert noise.shape[1] == 16
|
| 198 |
+
batch_size, num_channels, num_frames, height, width = noise.shape
|
| 199 |
+
|
| 200 |
+
assert num_frames % self.num_frame_per_block == 0
|
| 201 |
+
num_blocks = num_frames // self.num_frame_per_block
|
| 202 |
+
|
| 203 |
+
num_input_frames = initial_latent.shape[2] if initial_latent is not None else 0
|
| 204 |
+
num_output_frames = num_frames + num_input_frames # add the initial latent frames
|
| 205 |
+
|
| 206 |
+
output = torch.zeros(
|
| 207 |
+
[batch_size, num_channels, num_output_frames, height, width],
|
| 208 |
+
device=noise.device,
|
| 209 |
+
dtype=noise.dtype
|
| 210 |
+
)
|
| 211 |
+
videos = []
|
| 212 |
+
vae_cache = copy.deepcopy(ZERO_VAE_CACHE)
|
| 213 |
+
for j in range(len(vae_cache)):
|
| 214 |
+
vae_cache[j] = None
|
| 215 |
+
|
| 216 |
+
self.kv_cache1 = self.kv_cache_keyboard = self.kv_cache_mouse = self.crossattn_cache=None
|
| 217 |
+
# Step 1: Initialize KV cache to all zeros
|
| 218 |
+
if self.kv_cache1 is None:
|
| 219 |
+
self._initialize_kv_cache(
|
| 220 |
+
batch_size=batch_size,
|
| 221 |
+
dtype=noise.dtype,
|
| 222 |
+
device=noise.device
|
| 223 |
+
)
|
| 224 |
+
self._initialize_kv_cache_mouse_and_keyboard(
|
| 225 |
+
batch_size=batch_size,
|
| 226 |
+
dtype=noise.dtype,
|
| 227 |
+
device=noise.device
|
| 228 |
+
)
|
| 229 |
+
|
| 230 |
+
self._initialize_crossattn_cache(
|
| 231 |
+
batch_size=batch_size,
|
| 232 |
+
dtype=noise.dtype,
|
| 233 |
+
device=noise.device
|
| 234 |
+
)
|
| 235 |
+
else:
|
| 236 |
+
# reset cross attn cache
|
| 237 |
+
for block_index in range(self.num_transformer_blocks):
|
| 238 |
+
self.crossattn_cache[block_index]["is_init"] = False
|
| 239 |
+
# reset kv cache
|
| 240 |
+
for block_index in range(len(self.kv_cache1)):
|
| 241 |
+
self.kv_cache1[block_index]["global_end_index"] = torch.tensor(
|
| 242 |
+
[0], dtype=torch.long, device=noise.device)
|
| 243 |
+
self.kv_cache1[block_index]["local_end_index"] = torch.tensor(
|
| 244 |
+
[0], dtype=torch.long, device=noise.device)
|
| 245 |
+
self.kv_cache_mouse[block_index]["global_end_index"] = torch.tensor(
|
| 246 |
+
[0], dtype=torch.long, device=noise.device)
|
| 247 |
+
self.kv_cache_mouse[block_index]["local_end_index"] = torch.tensor(
|
| 248 |
+
[0], dtype=torch.long, device=noise.device)
|
| 249 |
+
self.kv_cache_keyboard[block_index]["global_end_index"] = torch.tensor(
|
| 250 |
+
[0], dtype=torch.long, device=noise.device)
|
| 251 |
+
self.kv_cache_keyboard[block_index]["local_end_index"] = torch.tensor(
|
| 252 |
+
[0], dtype=torch.long, device=noise.device)
|
| 253 |
+
# Step 2: Cache context feature
|
| 254 |
+
current_start_frame = 0
|
| 255 |
+
if initial_latent is not None:
|
| 256 |
+
timestep = torch.ones([batch_size, 1], device=noise.device, dtype=torch.int64) * 0
|
| 257 |
+
# Assume num_input_frames is self.num_frame_per_block * num_input_blocks
|
| 258 |
+
assert num_input_frames % self.num_frame_per_block == 0
|
| 259 |
+
num_input_blocks = num_input_frames // self.num_frame_per_block
|
| 260 |
+
|
| 261 |
+
for _ in range(num_input_blocks):
|
| 262 |
+
current_ref_latents = \
|
| 263 |
+
initial_latent[:, :, current_start_frame:current_start_frame + self.num_frame_per_block]
|
| 264 |
+
output[:, :, current_start_frame:current_start_frame + self.num_frame_per_block] = current_ref_latents
|
| 265 |
+
|
| 266 |
+
self.generator(
|
| 267 |
+
noisy_image_or_video=current_ref_latents,
|
| 268 |
+
conditional_dict=cond_current(conditional_dict, current_start_frame, self.num_frame_per_block, mode=mode),
|
| 269 |
+
timestep=timestep * 0,
|
| 270 |
+
kv_cache=self.kv_cache1,
|
| 271 |
+
kv_cache_mouse=self.kv_cache_mouse,
|
| 272 |
+
kv_cache_keyboard=self.kv_cache_keyboard,
|
| 273 |
+
crossattn_cache=self.crossattn_cache,
|
| 274 |
+
current_start=current_start_frame * self.frame_seq_length,
|
| 275 |
+
)
|
| 276 |
+
current_start_frame += self.num_frame_per_block
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
# Step 3: Temporal denoising loop
|
| 280 |
+
all_num_frames = [self.num_frame_per_block] * num_blocks
|
| 281 |
+
if profile:
|
| 282 |
+
diffusion_start = torch.cuda.Event(enable_timing=True)
|
| 283 |
+
diffusion_end = torch.cuda.Event(enable_timing=True)
|
| 284 |
+
for current_num_frames in tqdm(all_num_frames):
|
| 285 |
+
|
| 286 |
+
noisy_input = noise[
|
| 287 |
+
:, :, current_start_frame - num_input_frames:current_start_frame + current_num_frames - num_input_frames]
|
| 288 |
+
|
| 289 |
+
# Step 3.1: Spatial denoising loop
|
| 290 |
+
if profile:
|
| 291 |
+
torch.cuda.synchronize()
|
| 292 |
+
diffusion_start.record()
|
| 293 |
+
for index, current_timestep in enumerate(self.denoising_step_list):
|
| 294 |
+
# set current timestep
|
| 295 |
+
timestep = torch.ones(
|
| 296 |
+
[batch_size, current_num_frames],
|
| 297 |
+
device=noise.device,
|
| 298 |
+
dtype=torch.int64) * current_timestep
|
| 299 |
+
|
| 300 |
+
if index < len(self.denoising_step_list) - 1:
|
| 301 |
+
_, denoised_pred = self.generator(
|
| 302 |
+
noisy_image_or_video=noisy_input,
|
| 303 |
+
conditional_dict=cond_current(conditional_dict, current_start_frame, self.num_frame_per_block, mode=mode),
|
| 304 |
+
timestep=timestep,
|
| 305 |
+
kv_cache=self.kv_cache1,
|
| 306 |
+
kv_cache_mouse=self.kv_cache_mouse,
|
| 307 |
+
kv_cache_keyboard=self.kv_cache_keyboard,
|
| 308 |
+
crossattn_cache=self.crossattn_cache,
|
| 309 |
+
current_start=current_start_frame * self.frame_seq_length
|
| 310 |
+
)
|
| 311 |
+
next_timestep = self.denoising_step_list[index + 1]
|
| 312 |
+
noisy_input = self.scheduler.add_noise(
|
| 313 |
+
rearrange(denoised_pred, 'b c f h w -> (b f) c h w'),# .flatten(0, 1),
|
| 314 |
+
torch.randn_like(rearrange(denoised_pred, 'b c f h w -> (b f) c h w')),
|
| 315 |
+
next_timestep * torch.ones(
|
| 316 |
+
[batch_size * current_num_frames], device=noise.device, dtype=torch.long)
|
| 317 |
+
)
|
| 318 |
+
noisy_input = rearrange(noisy_input, '(b f) c h w -> b c f h w', b=denoised_pred.shape[0])
|
| 319 |
+
else:
|
| 320 |
+
# for getting real output
|
| 321 |
+
_, denoised_pred = self.generator(
|
| 322 |
+
noisy_image_or_video=noisy_input,
|
| 323 |
+
conditional_dict=cond_current(conditional_dict, current_start_frame, self.num_frame_per_block, mode=mode),
|
| 324 |
+
timestep=timestep,
|
| 325 |
+
kv_cache=self.kv_cache1,
|
| 326 |
+
kv_cache_mouse=self.kv_cache_mouse,
|
| 327 |
+
kv_cache_keyboard=self.kv_cache_keyboard,
|
| 328 |
+
crossattn_cache=self.crossattn_cache,
|
| 329 |
+
current_start=current_start_frame * self.frame_seq_length
|
| 330 |
+
)
|
| 331 |
+
|
| 332 |
+
# Step 3.2: record the model's output
|
| 333 |
+
output[:, :, current_start_frame:current_start_frame + current_num_frames] = denoised_pred
|
| 334 |
+
|
| 335 |
+
# Step 3.3: rerun with timestep zero to update KV cache using clean context
|
| 336 |
+
context_timestep = torch.ones_like(timestep) * self.args.context_noise
|
| 337 |
+
|
| 338 |
+
self.generator(
|
| 339 |
+
noisy_image_or_video=denoised_pred,
|
| 340 |
+
conditional_dict=cond_current(conditional_dict, current_start_frame, self.num_frame_per_block, mode=mode),
|
| 341 |
+
timestep=context_timestep,
|
| 342 |
+
kv_cache=self.kv_cache1,
|
| 343 |
+
kv_cache_mouse=self.kv_cache_mouse,
|
| 344 |
+
kv_cache_keyboard=self.kv_cache_keyboard,
|
| 345 |
+
crossattn_cache=self.crossattn_cache,
|
| 346 |
+
current_start=current_start_frame * self.frame_seq_length,
|
| 347 |
+
)
|
| 348 |
+
|
| 349 |
+
# Step 3.4: update the start and end frame indices
|
| 350 |
+
current_start_frame += current_num_frames
|
| 351 |
+
|
| 352 |
+
denoised_pred = denoised_pred.transpose(1,2)
|
| 353 |
+
video, vae_cache = self.vae_decoder(denoised_pred.half(), *vae_cache)
|
| 354 |
+
videos += [video]
|
| 355 |
+
|
| 356 |
+
if profile:
|
| 357 |
+
torch.cuda.synchronize()
|
| 358 |
+
diffusion_end.record()
|
| 359 |
+
diffusion_time = diffusion_start.elapsed_time(diffusion_end)
|
| 360 |
+
print(f"diffusion_time: {diffusion_time}", flush=True)
|
| 361 |
+
fps = video.shape[1]*1000/ diffusion_time
|
| 362 |
+
print(f" - FPS: {fps:.2f}")
|
| 363 |
+
|
| 364 |
+
if return_latents:
|
| 365 |
+
return output
|
| 366 |
+
else:
|
| 367 |
+
return videos
|
| 368 |
+
|
| 369 |
+
def _initialize_kv_cache(self, batch_size, dtype, device):
|
| 370 |
+
"""
|
| 371 |
+
Initialize a Per-GPU KV cache for the Wan model.
|
| 372 |
+
"""
|
| 373 |
+
kv_cache1 = []
|
| 374 |
+
if self.local_attn_size != -1:
|
| 375 |
+
# Use the local attention size to compute the KV cache size
|
| 376 |
+
kv_cache_size = self.local_attn_size * self.frame_seq_length
|
| 377 |
+
else:
|
| 378 |
+
# Use the default KV cache size
|
| 379 |
+
kv_cache_size = 15 * 1 * self.frame_seq_length # 32760
|
| 380 |
+
|
| 381 |
+
for _ in range(self.num_transformer_blocks):
|
| 382 |
+
kv_cache1.append({
|
| 383 |
+
"k": torch.zeros([batch_size, kv_cache_size, 12, 128], dtype=dtype, device=device),
|
| 384 |
+
"v": torch.zeros([batch_size, kv_cache_size, 12, 128], dtype=dtype, device=device),
|
| 385 |
+
"global_end_index": torch.tensor([0], dtype=torch.long, device=device),
|
| 386 |
+
"local_end_index": torch.tensor([0], dtype=torch.long, device=device)
|
| 387 |
+
})
|
| 388 |
+
|
| 389 |
+
self.kv_cache1 = kv_cache1 # always store the clean cache
|
| 390 |
+
|
| 391 |
+
def _initialize_kv_cache_mouse_and_keyboard(self, batch_size, dtype, device):
|
| 392 |
+
"""
|
| 393 |
+
Initialize a Per-GPU KV cache for the Wan model.
|
| 394 |
+
"""
|
| 395 |
+
kv_cache_mouse = []
|
| 396 |
+
kv_cache_keyboard = []
|
| 397 |
+
if self.local_attn_size != -1:
|
| 398 |
+
kv_cache_size = self.local_attn_size
|
| 399 |
+
else:
|
| 400 |
+
kv_cache_size = 15 * 1
|
| 401 |
+
for _ in range(self.num_transformer_blocks):
|
| 402 |
+
kv_cache_keyboard.append({
|
| 403 |
+
"k": torch.zeros([batch_size, kv_cache_size, 16, 64], dtype=dtype, device=device),
|
| 404 |
+
"v": torch.zeros([batch_size, kv_cache_size, 16, 64], dtype=dtype, device=device),
|
| 405 |
+
"global_end_index": torch.tensor([0], dtype=torch.long, device=device),
|
| 406 |
+
"local_end_index": torch.tensor([0], dtype=torch.long, device=device)
|
| 407 |
+
})
|
| 408 |
+
kv_cache_mouse.append({
|
| 409 |
+
"k": torch.zeros([batch_size * self.frame_seq_length, kv_cache_size, 16, 64], dtype=dtype, device=device),
|
| 410 |
+
"v": torch.zeros([batch_size * self.frame_seq_length, kv_cache_size, 16, 64], dtype=dtype, device=device),
|
| 411 |
+
"global_end_index": torch.tensor([0], dtype=torch.long, device=device),
|
| 412 |
+
"local_end_index": torch.tensor([0], dtype=torch.long, device=device)
|
| 413 |
+
})
|
| 414 |
+
self.kv_cache_keyboard = kv_cache_keyboard # always store the clean cache
|
| 415 |
+
self.kv_cache_mouse = kv_cache_mouse # always store the clean cache
|
| 416 |
+
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
def _initialize_crossattn_cache(self, batch_size, dtype, device):
|
| 420 |
+
"""
|
| 421 |
+
Initialize a Per-GPU cross-attention cache for the Wan model.
|
| 422 |
+
"""
|
| 423 |
+
crossattn_cache = []
|
| 424 |
+
|
| 425 |
+
for _ in range(self.num_transformer_blocks):
|
| 426 |
+
crossattn_cache.append({
|
| 427 |
+
"k": torch.zeros([batch_size, 257, 12, 128], dtype=dtype, device=device),
|
| 428 |
+
"v": torch.zeros([batch_size, 257, 12, 128], dtype=dtype, device=device),
|
| 429 |
+
"is_init": False
|
| 430 |
+
})
|
| 431 |
+
self.crossattn_cache = crossattn_cache
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
class CausalInferenceStreamingPipeline(torch.nn.Module):
|
| 435 |
+
def __init__(
|
| 436 |
+
self,
|
| 437 |
+
args,
|
| 438 |
+
device="cuda",
|
| 439 |
+
vae_decoder=None,
|
| 440 |
+
generator=None,
|
| 441 |
+
):
|
| 442 |
+
super().__init__()
|
| 443 |
+
# Step 1: Initialize all models
|
| 444 |
+
self.generator = WanDiffusionWrapper(
|
| 445 |
+
**getattr(args, "model_kwargs", {}), is_causal=True) if generator is None else generator
|
| 446 |
+
self.vae_decoder = vae_decoder
|
| 447 |
+
|
| 448 |
+
# Step 2: Initialize all causal hyperparmeters
|
| 449 |
+
self.scheduler = self.generator.get_scheduler()
|
| 450 |
+
self.denoising_step_list = torch.tensor(
|
| 451 |
+
args.denoising_step_list, dtype=torch.long)
|
| 452 |
+
if args.warp_denoising_step:
|
| 453 |
+
timesteps = torch.cat((self.scheduler.timesteps.cpu(), torch.tensor([0], dtype=torch.float32)))
|
| 454 |
+
self.denoising_step_list = timesteps[1000 - self.denoising_step_list]
|
| 455 |
+
|
| 456 |
+
self.num_transformer_blocks = 30
|
| 457 |
+
self.frame_seq_length = 880 # 1590 # HW/4
|
| 458 |
+
|
| 459 |
+
self.kv_cache1 = None
|
| 460 |
+
self.kv_cache_mouse = None
|
| 461 |
+
self.kv_cache_keyboard = None
|
| 462 |
+
self.args = args
|
| 463 |
+
self.num_frame_per_block = getattr(args, "num_frame_per_block", 1)
|
| 464 |
+
self.local_attn_size = self.generator.model.local_attn_size
|
| 465 |
+
assert self.local_attn_size != -1
|
| 466 |
+
print(f"KV inference with {self.num_frame_per_block} frames per block")
|
| 467 |
+
|
| 468 |
+
if self.num_frame_per_block > 1:
|
| 469 |
+
self.generator.model.num_frame_per_block = self.num_frame_per_block
|
| 470 |
+
|
| 471 |
+
def inference(
|
| 472 |
+
self,
|
| 473 |
+
noise: torch.Tensor,
|
| 474 |
+
conditional_dict,
|
| 475 |
+
initial_latent: Optional[torch.Tensor] = None,
|
| 476 |
+
return_latents: bool = False,
|
| 477 |
+
output_folder = None,
|
| 478 |
+
name = None,
|
| 479 |
+
mode = 'universal'
|
| 480 |
+
) -> torch.Tensor:
|
| 481 |
+
"""
|
| 482 |
+
Perform inference on the given noise and text prompts.
|
| 483 |
+
Inputs:
|
| 484 |
+
noise (torch.Tensor): The input noise tensor of shape
|
| 485 |
+
(batch_size, num_output_frames, num_channels, height, width).
|
| 486 |
+
text_prompts (List[str]): The list of text prompts.
|
| 487 |
+
initial_latent (torch.Tensor): The initial latent tensor of shape
|
| 488 |
+
(batch_size, num_input_frames, num_channels, height, width).
|
| 489 |
+
If num_input_frames is 1, perform image to video.
|
| 490 |
+
If num_input_frames is greater than 1, perform video extension.
|
| 491 |
+
return_latents (bool): Whether to return the latents.
|
| 492 |
+
Outputs:
|
| 493 |
+
video (torch.Tensor): The generated video tensor of shape
|
| 494 |
+
(batch_size, num_output_frames, num_channels, height, width).
|
| 495 |
+
It is normalized to be in the range [0, 1].
|
| 496 |
+
"""
|
| 497 |
+
|
| 498 |
+
assert noise.shape[1] == 16
|
| 499 |
+
batch_size, num_channels, num_frames, height, width = noise.shape
|
| 500 |
+
|
| 501 |
+
assert num_frames % self.num_frame_per_block == 0
|
| 502 |
+
num_blocks = num_frames // self.num_frame_per_block
|
| 503 |
+
|
| 504 |
+
num_input_frames = initial_latent.shape[2] if initial_latent is not None else 0
|
| 505 |
+
num_output_frames = num_frames + num_input_frames # add the initial latent frames
|
| 506 |
+
|
| 507 |
+
output = torch.zeros(
|
| 508 |
+
[batch_size, num_channels, num_output_frames, height, width],
|
| 509 |
+
device=noise.device,
|
| 510 |
+
dtype=noise.dtype
|
| 511 |
+
)
|
| 512 |
+
videos = []
|
| 513 |
+
vae_cache = copy.deepcopy(ZERO_VAE_CACHE)
|
| 514 |
+
for j in range(len(vae_cache)):
|
| 515 |
+
vae_cache[j] = None
|
| 516 |
+
# Set up profiling if requested
|
| 517 |
+
self.kv_cache1=self.kv_cache_keyboard=self.kv_cache_mouse=self.crossattn_cache=None
|
| 518 |
+
# Step 1: Initialize KV cache to all zeros
|
| 519 |
+
if self.kv_cache1 is None:
|
| 520 |
+
self._initialize_kv_cache(
|
| 521 |
+
batch_size=batch_size,
|
| 522 |
+
dtype=noise.dtype,
|
| 523 |
+
device=noise.device
|
| 524 |
+
)
|
| 525 |
+
self._initialize_kv_cache_mouse_and_keyboard(
|
| 526 |
+
batch_size=batch_size,
|
| 527 |
+
dtype=noise.dtype,
|
| 528 |
+
device=noise.device
|
| 529 |
+
)
|
| 530 |
+
|
| 531 |
+
self._initialize_crossattn_cache(
|
| 532 |
+
batch_size=batch_size,
|
| 533 |
+
dtype=noise.dtype,
|
| 534 |
+
device=noise.device
|
| 535 |
+
)
|
| 536 |
+
else:
|
| 537 |
+
# reset cross attn cache
|
| 538 |
+
for block_index in range(self.num_transformer_blocks):
|
| 539 |
+
self.crossattn_cache[block_index]["is_init"] = False
|
| 540 |
+
# reset kv cache
|
| 541 |
+
for block_index in range(len(self.kv_cache1)):
|
| 542 |
+
self.kv_cache1[block_index]["global_end_index"] = torch.tensor(
|
| 543 |
+
[0], dtype=torch.long, device=noise.device)
|
| 544 |
+
self.kv_cache1[block_index]["local_end_index"] = torch.tensor(
|
| 545 |
+
[0], dtype=torch.long, device=noise.device)
|
| 546 |
+
self.kv_cache_mouse[block_index]["global_end_index"] = torch.tensor(
|
| 547 |
+
[0], dtype=torch.long, device=noise.device)
|
| 548 |
+
self.kv_cache_mouse[block_index]["local_end_index"] = torch.tensor(
|
| 549 |
+
[0], dtype=torch.long, device=noise.device)
|
| 550 |
+
self.kv_cache_keyboard[block_index]["global_end_index"] = torch.tensor(
|
| 551 |
+
[0], dtype=torch.long, device=noise.device)
|
| 552 |
+
self.kv_cache_keyboard[block_index]["local_end_index"] = torch.tensor(
|
| 553 |
+
[0], dtype=torch.long, device=noise.device)
|
| 554 |
+
# Step 2: Cache context feature
|
| 555 |
+
current_start_frame = 0
|
| 556 |
+
if initial_latent is not None:
|
| 557 |
+
timestep = torch.ones([batch_size, 1], device=noise.device, dtype=torch.int64) * 0
|
| 558 |
+
|
| 559 |
+
# Assume num_input_frames is self.num_frame_per_block * num_input_blocks
|
| 560 |
+
assert num_input_frames % self.num_frame_per_block == 0
|
| 561 |
+
num_input_blocks = num_input_frames // self.num_frame_per_block
|
| 562 |
+
|
| 563 |
+
for _ in range(num_input_blocks):
|
| 564 |
+
current_ref_latents = \
|
| 565 |
+
initial_latent[:, :, current_start_frame:current_start_frame + self.num_frame_per_block]
|
| 566 |
+
output[:, :, current_start_frame:current_start_frame + self.num_frame_per_block] = current_ref_latents
|
| 567 |
+
self.generator(
|
| 568 |
+
noisy_image_or_video=current_ref_latents,
|
| 569 |
+
conditional_dict=cond_current(conditional_dict, current_start_frame, self.num_frame_per_block, replace=True),
|
| 570 |
+
timestep=timestep * 0,
|
| 571 |
+
kv_cache=self.kv_cache1,
|
| 572 |
+
kv_cache_mouse=self.kv_cache_mouse,
|
| 573 |
+
kv_cache_keyboard=self.kv_cache_keyboard,
|
| 574 |
+
crossattn_cache=self.crossattn_cache,
|
| 575 |
+
current_start=current_start_frame * self.frame_seq_length,
|
| 576 |
+
)
|
| 577 |
+
current_start_frame += self.num_frame_per_block
|
| 578 |
+
|
| 579 |
+
# Step 3: Temporal denoising loop
|
| 580 |
+
all_num_frames = [self.num_frame_per_block] * num_blocks
|
| 581 |
+
|
| 582 |
+
for current_num_frames in all_num_frames:
|
| 583 |
+
noisy_input = noise[
|
| 584 |
+
:, :, current_start_frame - num_input_frames:current_start_frame + current_num_frames - num_input_frames]
|
| 585 |
+
|
| 586 |
+
current_actions = get_current_action(mode=mode)
|
| 587 |
+
new_act, conditional_dict = cond_current(conditional_dict, current_start_frame, self.num_frame_per_block, replace=current_actions, mode=mode)
|
| 588 |
+
# Step 3.1: Spatial denoising loop
|
| 589 |
+
|
| 590 |
+
for index, current_timestep in enumerate(self.denoising_step_list):
|
| 591 |
+
# set current timestep
|
| 592 |
+
timestep = torch.ones(
|
| 593 |
+
[batch_size, current_num_frames],
|
| 594 |
+
device=noise.device,
|
| 595 |
+
dtype=torch.int64) * current_timestep
|
| 596 |
+
|
| 597 |
+
if index < len(self.denoising_step_list) - 1:
|
| 598 |
+
_, denoised_pred = self.generator(
|
| 599 |
+
noisy_image_or_video=noisy_input,
|
| 600 |
+
conditional_dict=new_act,
|
| 601 |
+
timestep=timestep,
|
| 602 |
+
kv_cache=self.kv_cache1,
|
| 603 |
+
kv_cache_mouse=self.kv_cache_mouse,
|
| 604 |
+
kv_cache_keyboard=self.kv_cache_keyboard,
|
| 605 |
+
crossattn_cache=self.crossattn_cache,
|
| 606 |
+
current_start=current_start_frame * self.frame_seq_length
|
| 607 |
+
)
|
| 608 |
+
next_timestep = self.denoising_step_list[index + 1]
|
| 609 |
+
noisy_input = self.scheduler.add_noise(
|
| 610 |
+
rearrange(denoised_pred, 'b c f h w -> (b f) c h w'),# .flatten(0, 1),
|
| 611 |
+
torch.randn_like(rearrange(denoised_pred, 'b c f h w -> (b f) c h w')),
|
| 612 |
+
next_timestep * torch.ones(
|
| 613 |
+
[batch_size * current_num_frames], device=noise.device, dtype=torch.long)
|
| 614 |
+
)
|
| 615 |
+
noisy_input = rearrange(noisy_input, '(b f) c h w -> b c f h w', b=denoised_pred.shape[0])
|
| 616 |
+
else:
|
| 617 |
+
# for getting real output
|
| 618 |
+
_, denoised_pred = self.generator(
|
| 619 |
+
noisy_image_or_video=noisy_input,
|
| 620 |
+
conditional_dict=new_act,
|
| 621 |
+
timestep=timestep,
|
| 622 |
+
kv_cache=self.kv_cache1,
|
| 623 |
+
kv_cache_mouse=self.kv_cache_mouse,
|
| 624 |
+
kv_cache_keyboard=self.kv_cache_keyboard,
|
| 625 |
+
crossattn_cache=self.crossattn_cache,
|
| 626 |
+
current_start=current_start_frame * self.frame_seq_length
|
| 627 |
+
)
|
| 628 |
+
|
| 629 |
+
# Step 3.2: record the model's output
|
| 630 |
+
output[:, :, current_start_frame:current_start_frame + current_num_frames] = denoised_pred
|
| 631 |
+
|
| 632 |
+
# Step 3.3: rerun with timestep zero to update KV cache using clean context
|
| 633 |
+
context_timestep = torch.ones_like(timestep) * self.args.context_noise
|
| 634 |
+
|
| 635 |
+
self.generator(
|
| 636 |
+
noisy_image_or_video=denoised_pred,
|
| 637 |
+
conditional_dict=new_act,
|
| 638 |
+
timestep=context_timestep,
|
| 639 |
+
                kv_cache=self.kv_cache1,
                kv_cache_mouse=self.kv_cache_mouse,
                kv_cache_keyboard=self.kv_cache_keyboard,
                crossattn_cache=self.crossattn_cache,
                current_start=current_start_frame * self.frame_seq_length,
            )

            # Step 3.4: decode the current block and update the start frame index
            denoised_pred = denoised_pred.transpose(1, 2)
            video, vae_cache = self.vae_decoder(denoised_pred.half(), *vae_cache)
            videos += [video]
            video = rearrange(video, "B T C H W -> B T H W C")
            video = ((video.float() + 1) * 127.5).clip(0, 255).cpu().numpy().astype(np.uint8)[0]
            video = np.ascontiguousarray(video)
            mouse_icon = 'assets/images/mouse.png'
            if mode != 'templerun':
                config = (
                    conditional_dict["keyboard_cond"][0, : 1 + 4 * (current_start_frame + self.num_frame_per_block - 1)].float().cpu().numpy(),
                    conditional_dict["mouse_cond"][0, : 1 + 4 * (current_start_frame + self.num_frame_per_block - 1)].float().cpu().numpy(),
                )
            else:
                config = (
                    conditional_dict["keyboard_cond"][0, : 1 + 4 * (current_start_frame + self.num_frame_per_block - 1)].float().cpu().numpy()
                )
            process_video(video.astype(np.uint8), output_folder + f'/{name}_current.mp4', config, mouse_icon, mouse_scale=0.1, process_icon=False, mode=mode)
            current_start_frame += current_num_frames

            if input("Continue? (Press `n` to break)").strip() == "n":
                break

        videos_tensor = torch.cat(videos, dim=1)
        videos = rearrange(videos_tensor, "B T C H W -> B T H W C")
        videos = ((videos.float() + 1) * 127.5).clip(0, 255).cpu().numpy().astype(np.uint8)[0]
        video = np.ascontiguousarray(videos)
        mouse_icon = 'assets/images/mouse.png'
        if mode != 'templerun':
            config = (
                conditional_dict["keyboard_cond"][0, : 1 + 4 * (current_start_frame + self.num_frame_per_block - 1)].float().cpu().numpy(),
                conditional_dict["mouse_cond"][0, : 1 + 4 * (current_start_frame + self.num_frame_per_block - 1)].float().cpu().numpy(),
            )
        else:
            config = (
                conditional_dict["keyboard_cond"][0, : 1 + 4 * (current_start_frame + self.num_frame_per_block - 1)].float().cpu().numpy()
            )
        process_video(video.astype(np.uint8), output_folder + f'/{name}_icon.mp4', config, mouse_icon, mouse_scale=0.1, mode=mode)
        process_video(video.astype(np.uint8), output_folder + f'/{name}.mp4', config, mouse_icon, mouse_scale=0.1, process_icon=False, mode=mode)

        if return_latents:
            return output
        else:
            return video

    def _initialize_kv_cache(self, batch_size, dtype, device):
        """
        Initialize a per-GPU KV cache for the Wan model.
        """
        kv_cache1 = []
        if self.local_attn_size != -1:
            # Use the local attention size to compute the KV cache size
            kv_cache_size = self.local_attn_size * self.frame_seq_length
        else:
            # Use the default KV cache size
            kv_cache_size = 15 * 1 * self.frame_seq_length  # 32760

        for _ in range(self.num_transformer_blocks):
            kv_cache1.append({
                "k": torch.zeros([batch_size, kv_cache_size, 12, 128], dtype=dtype, device=device),
                "v": torch.zeros([batch_size, kv_cache_size, 12, 128], dtype=dtype, device=device),
                "global_end_index": torch.tensor([0], dtype=torch.long, device=device),
                "local_end_index": torch.tensor([0], dtype=torch.long, device=device)
            })

        self.kv_cache1 = kv_cache1  # always store the clean cache

    def _initialize_kv_cache_mouse_and_keyboard(self, batch_size, dtype, device):
        """
        Initialize per-GPU KV caches for the keyboard and mouse action branches of the Wan model.
        """
        kv_cache_mouse = []
        kv_cache_keyboard = []
        if self.local_attn_size != -1:
            kv_cache_size = self.local_attn_size
        else:
            kv_cache_size = 15 * 1
        for _ in range(self.num_transformer_blocks):
            kv_cache_keyboard.append({
                "k": torch.zeros([batch_size, kv_cache_size, 16, 64], dtype=dtype, device=device),
                "v": torch.zeros([batch_size, kv_cache_size, 16, 64], dtype=dtype, device=device),
                "global_end_index": torch.tensor([0], dtype=torch.long, device=device),
                "local_end_index": torch.tensor([0], dtype=torch.long, device=device)
            })
            kv_cache_mouse.append({
                "k": torch.zeros([batch_size * self.frame_seq_length, kv_cache_size, 16, 64], dtype=dtype, device=device),
                "v": torch.zeros([batch_size * self.frame_seq_length, kv_cache_size, 16, 64], dtype=dtype, device=device),
                "global_end_index": torch.tensor([0], dtype=torch.long, device=device),
                "local_end_index": torch.tensor([0], dtype=torch.long, device=device)
            })
        self.kv_cache_keyboard = kv_cache_keyboard  # always store the clean cache
        self.kv_cache_mouse = kv_cache_mouse  # always store the clean cache

    def _initialize_crossattn_cache(self, batch_size, dtype, device):
        """
        Initialize a per-GPU cross-attention cache for the Wan model.
        """
        crossattn_cache = []

        for _ in range(self.num_transformer_blocks):
            crossattn_cache.append({
                "k": torch.zeros([batch_size, 257, 12, 128], dtype=dtype, device=device),
                "v": torch.zeros([batch_size, 257, 12, 128], dtype=dtype, device=device),
                "is_init": False
            })
        self.crossattn_cache = crossattn_cache
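For a sense of the memory these caches occupy: the default self-attention cache keeps 15 latent frames of keys and values per transformer block with shape [batch, 15 * frame_seq_length, 12, 128], and the `# 32760` comment implies frame_seq_length = 2184. Below is a minimal back-of-the-envelope sketch, not code from this repo; it assumes bfloat16 storage and uses a placeholder transformer block count rather than a value read from the configs.

# Rough size of the self-attention KV cache built by _initialize_kv_cache.
# Sketch only: bf16 storage assumed, block count is a placeholder.
frame_seq_length = 2184                 # implied by 15 * frame_seq_length == 32760
kv_cache_size = 15 * frame_seq_length   # tokens kept per transformer block
batch, heads, head_dim = 1, 12, 128
bytes_per_elem = 2                      # bfloat16 / float16

per_block = 2 * batch * kv_cache_size * heads * head_dim * bytes_per_elem  # k and v
num_transformer_blocks = 30             # placeholder, not taken from this repo
print(f"{per_block / 2**20:.0f} MiB per block, "
      f"{per_block * num_transformer_blocks / 2**30:.1f} GiB total")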
requirements.txt
ADDED
@@ -0,0 +1,41 @@
torch>=2.4.0
torchvision>=0.19.0
opencv-python>=4.9.0.80
diffusers
transformers>=4.49.0
tokenizers>=0.20.3
accelerate>=1.1.1
tqdm
imageio
easydict
ftfy
dashscope
imageio-ffmpeg
numpy
wandb
omegaconf
einops
av
safetensors
opencv-python
git+https://github.com/openai/CLIP.git
open_clip_torch
starlette
pycocotools
lmdb
matplotlib
sentencepiece
pydantic
scikit-image
huggingface_hub[cli]
dominate
nvidia-pyindex
nvidia-tensorrt
pycuda
onnx
onnxruntime
onnxscript
onnxconverter_common
flask
flask-socketio
torchao
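Note that `opencv-python` appears twice in the list, once with a version floor and once without. When setting up the environment, a small, purely illustrative check of a few of the version floors against what is currently installed can be handy; the package names and floors below are copied from the file above, everything else is a sketch.

# Illustrative sanity check of a few lower bounds from requirements.txt.
from importlib.metadata import PackageNotFoundError, version

floors = {"torch": "2.4.0", "torchvision": "0.19.0", "transformers": "4.49.0"}
for pkg, minimum in floors.items():
    try:
        print(f"{pkg}: installed {version(pkg)}, requires >= {minimum}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")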
setup.py
ADDED
@@ -0,0 +1,6 @@
from setuptools import setup, find_packages
setup(
    name="matrix-game-2.0",
    version="0.0.1",
    packages=find_packages(),
)
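The setup script declares only package metadata; runtime dependencies come from requirements.txt, so the usual workflow would be installing the requirements first and then the package itself in editable mode with `pip install -e .`. As a small, illustrative check (not part of the repo), the snippet below shows what `find_packages()` will actually bundle when run from the repo root; only directories that carry an `__init__.py` are discovered by default.

# Illustrative: list the packages setuptools will discover and ship.
from setuptools import find_packages

print(find_packages())  # output depends on which directories contain an __init__.py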