NON_WORKING_matrix_game_2

Paused

Julian Bilcke Claude committed on Aug 14

Commit

e1c4171

1 Parent(s): 3605c07

Fix tensor dimension mismatch in Matrix-Game V2 streaming pipeline

- Convert WebSocket keyboard format [6 elements] to pipeline format [4 elements] at engine level
- Add mode-specific keyboard format conversion (universal, gta_drive, templerun)
- Fix conditional_dict tensor building with correct dimensions
- Simplify WebSocket pipeline to handle single dominant action selection
- Resolve RuntimeError: tensor size mismatch in streaming inference loop

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show

api_engine.py +31 -11
websocket_pipeline.py +23 -55

api_engine.py CHANGED Viewed

@@ -343,9 +343,33 @@ class MatrixGameEngine:
             logger.info(f"Using {max_num_output_frames} output frames -> {condition_num_frames} condition frames")
-            # Create condition tensors with the correct length
-            keyboard_tensor = torch.tensor(keyboard_condition * condition_num_frames, dtype=self.weight_dtype).unsqueeze(0).to(self.device)
-            mouse_tensor = torch.tensor(mouse_condition * condition_num_frames, dtype=self.weight_dtype).unsqueeze(0).to(self.device)
             logger.debug(f"Keyboard tensor shape: {keyboard_tensor.shape}")
             logger.debug(f"Mouse tensor shape: {mouse_tensor.shape}")
@@ -396,12 +420,8 @@ class MatrixGameEngine:
                 logger.debug("Starting pipeline.inference()...")
                 # Use inference method for single batch generation with WebSocket conditions
-                # The streaming pipeline expects single-frame action states, not expanded tensors
-                # Convert from WebSocket format [forward, back, left, right, jump, attack] and [x, y]
-                kb_state = keyboard_condition[0] if keyboard_condition else [0, 0, 0, 0, 0, 0]
-                mouse_state = mouse_condition[0] if mouse_condition else [0, 0]
-                logger.debug(f"Passing to pipeline - KB: {kb_state}, Mouse: {mouse_state}")
                 outputs = self.pipeline.inference(
                     noise=sampled_noise,
@@ -410,8 +430,8 @@ class MatrixGameEngine:
                     output_folder=None,  # Don't save to disk
                     name=None,
                     mode=mode,
-                    keyboard_condition=kb_state,
-                    mouse_condition=mouse_state
                 )
                 inference_time = time.time() - inference_start

             logger.info(f"Using {max_num_output_frames} output frames -> {condition_num_frames} condition frames")
+            # Convert WebSocket format to pipeline format BEFORE building tensors
+            # WebSocket: [forward, back, left, right, jump, attack] -> Pipeline: [forward, back, left, right]
+            ws_keyboard = keyboard_condition[0] if keyboard_condition else [0, 0, 0, 0, 0, 0]
+            ws_mouse = mouse_condition[0] if mouse_condition else [0, 0]
+            # Convert to pipeline format based on mode
+            if mode == 'universal':
+                # Use first 4 elements for universal mode: [forward, back, left, right]
+                pipeline_keyboard = ws_keyboard[:4]
+            elif mode == 'gta_drive':
+                # Use forward and back for GTA Drive: [forward, back]
+                pipeline_keyboard = ws_keyboard[:2]
+            elif mode in ['templerun', 'temple_run']:
+                # Use left and right for Temple Run: [left, right]
+                pipeline_keyboard = [ws_keyboard[2], ws_keyboard[3]]
+            else:
+                # Default to universal
+                pipeline_keyboard = ws_keyboard[:4]
+            pipeline_mouse = ws_mouse  # Mouse format is consistent
+            logger.debug(f"Converted WebSocket KB {ws_keyboard} -> Pipeline KB {pipeline_keyboard}")
+            logger.debug(f"Mode: {mode}, Mouse: {pipeline_mouse}")
+            # Create condition tensors with the correct format and length
+            keyboard_tensor = torch.tensor([pipeline_keyboard] * condition_num_frames, dtype=self.weight_dtype).unsqueeze(0).to(self.device)
+            mouse_tensor = torch.tensor([pipeline_mouse] * condition_num_frames, dtype=self.weight_dtype).unsqueeze(0).to(self.device)
             logger.debug(f"Keyboard tensor shape: {keyboard_tensor.shape}")
             logger.debug(f"Mouse tensor shape: {mouse_tensor.shape}")
                 logger.debug("Starting pipeline.inference()...")
                 # Use inference method for single batch generation with WebSocket conditions
+                # Pass the already-converted pipeline format conditions
+                logger.debug(f"Passing to pipeline - KB: {pipeline_keyboard}, Mouse: {pipeline_mouse}")
                 outputs = self.pipeline.inference(
                     noise=sampled_noise,
                     output_folder=None,  # Don't save to disk
                     name=None,
                     mode=mode,
+                    keyboard_condition=pipeline_keyboard,
+                    mouse_condition=pipeline_mouse
                 )
                 inference_time = time.time() - inference_start

websocket_pipeline.py CHANGED Viewed

@@ -30,72 +30,40 @@ class WebSocketStreamingPipeline(CausalInferenceStreamingPipeline):
         Get current action from stored WebSocket data instead of stdin
         Returns the same format as get_current_action()
-        The original pipeline expects SINGLE ACTION vectors, not multi-action states:
-        - Universal: keyboard [1,0,0,0] = forward only, [0,0,1,0] = left only
-        - WebSocket gives: [forward, back, left, right, jump, attack] with multiple 1s
-        We need to convert multi-action to dominant single action.
         """
-        # Get WebSocket format: [forward, back, left, right, jump, attack]
         if self.current_keyboard is None:
-            ws_keyboard = [0, 0, 0, 0, 0, 0]
         else:
-            ws_keyboard = self.current_keyboard
         if self.current_mouse is None:
-            ws_mouse = [0, 0]
-        else:
-            ws_mouse = self.current_mouse
-        # Convert WebSocket multi-action to single dominant action
-        if mode == 'universal':
-            # Pipeline expects: [forward, back, left, right] as single action
-            # Priority order: forward > back > left > right > no action
-            if ws_keyboard[0]:  # forward
-                keyboard = [1, 0, 0, 0]
-            elif ws_keyboard[1]:  # back
-                keyboard = [0, 1, 0, 0]
-            elif ws_keyboard[2]:  # left
-                keyboard = [0, 0, 1, 0]
-            elif ws_keyboard[3]:  # right
-                keyboard = [0, 0, 0, 1]
-            else:  # no action
-                keyboard = [0, 0, 0, 0]
-        elif mode == 'gta_drive':
-            # Pipeline expects: [forward, back] as single action
-            if ws_keyboard[0]:  # forward
-                keyboard = [1, 0]
-            elif ws_keyboard[1]:  # back
-                keyboard = [0, 1]
-            else:  # no action
-                keyboard = [0, 0]
-        elif mode == 'templerun':
-            # Pipeline expects: [left, right] as single action
-            if ws_keyboard[2]:  # left
-                keyboard = [1, 0]
-            elif ws_keyboard[3]:  # right
-                keyboard = [0, 1]
-            else:  # no action
-                keyboard = [0, 0]
-        else:
-            # Default to universal
-            keyboard = [0, 0, 0, 0]
-        # Mouse handling - use raw WebSocket values for most modes
-        if mode == 'templerun':
-            # Temple Run doesn't use mouse
             mouse = [0, 0]
         else:
-            # Use WebSocket mouse values directly (they should be in [-1, 1] range)
-            mouse = ws_mouse
         # Convert to tensors
         mouse_tensor = torch.tensor(mouse, dtype=torch.float32).cuda()
-        keyboard_tensor = torch.tensor(keyboard, dtype=torch.float32).cuda()
-        logger.debug(f"WebSocket action for mode {mode}: ws_kb={ws_keyboard}, ws_mouse={ws_mouse} -> kb={keyboard}, mouse={mouse}")
         return {
             'mouse': mouse_tensor,

         Get current action from stored WebSocket data instead of stdin
         Returns the same format as get_current_action()
+        Note: The format conversion is now handled at the engine level,
+        so we just need to convert multi-action to single dominant action.
         """
+        # Get current states (already in pipeline format from engine conversion)
         if self.current_keyboard is None:
+            keyboard = [0, 0, 0, 0] if mode == 'universal' else [0, 0]
         else:
+            keyboard = self.current_keyboard
         if self.current_mouse is None:
             mouse = [0, 0]
         else:
+            mouse = self.current_mouse
+        # Convert multi-action to single dominant action for the streaming pipeline
+        # This handles cases where multiple keys might be pressed simultaneously
+        dominant_keyboard = []
+        for i, val in enumerate(keyboard):
+            if val > 0:
+                # Create one-hot vector with this action
+                dominant = [0] * len(keyboard)
+                dominant[i] = 1
+                dominant_keyboard = dominant
+                break
+        if not dominant_keyboard:
+            # No action pressed
+            dominant_keyboard = [0] * len(keyboard)
         # Convert to tensors
         mouse_tensor = torch.tensor(mouse, dtype=torch.float32).cuda()
+        keyboard_tensor = torch.tensor(dominant_keyboard, dtype=torch.float32).cuda()
+        logger.debug(f"WebSocket action for mode {mode}: kb={keyboard} -> dominant_kb={dominant_keyboard}, mouse={mouse}")
         return {
             'mouse': mouse_tensor,