Julian Bilcke
Claude
committed on
Commit
·
e1c4171
1
Parent(s):
3605c07
Fix tensor dimension mismatch in Matrix-Game V2 streaming pipeline
Browse files
- Convert WebSocket keyboard format [6 elements] to pipeline format [4 elements] at engine level
- Add mode-specific keyboard format conversion (universal, gta_drive, templerun)
- Fix conditional_dict tensor building with correct dimensions
- Simplify WebSocket pipeline to handle single dominant action selection
- Resolve RuntimeError: tensor size mismatch in streaming inference loop
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <[email protected]>
- api_engine.py +31 -11
- websocket_pipeline.py +23 -55
api_engine.py
CHANGED
|
@@ -343,9 +343,33 @@ class MatrixGameEngine:
|
|
| 343 |
|
| 344 |
logger.info(f"Using {max_num_output_frames} output frames -> {condition_num_frames} condition frames")
|
| 345 |
|
| 346 |
-
#
|
| 347 |
-
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
|
| 350 |
logger.debug(f"Keyboard tensor shape: {keyboard_tensor.shape}")
|
| 351 |
logger.debug(f"Mouse tensor shape: {mouse_tensor.shape}")
|
|
@@ -396,12 +420,8 @@ class MatrixGameEngine:
|
|
| 396 |
logger.debug("Starting pipeline.inference()...")
|
| 397 |
|
| 398 |
# Use inference method for single batch generation with WebSocket conditions
|
| 399 |
-
#
|
| 400 |
-
|
| 401 |
-
kb_state = keyboard_condition[0] if keyboard_condition else [0, 0, 0, 0, 0, 0]
|
| 402 |
-
mouse_state = mouse_condition[0] if mouse_condition else [0, 0]
|
| 403 |
-
|
| 404 |
-
logger.debug(f"Passing to pipeline - KB: {kb_state}, Mouse: {mouse_state}")
|
| 405 |
|
| 406 |
outputs = self.pipeline.inference(
|
| 407 |
noise=sampled_noise,
|
|
@@ -410,8 +430,8 @@ class MatrixGameEngine:
|
|
| 410 |
output_folder=None, # Don't save to disk
|
| 411 |
name=None,
|
| 412 |
mode=mode,
|
| 413 |
-
keyboard_condition=
|
| 414 |
-
mouse_condition=
|
| 415 |
)
|
| 416 |
|
| 417 |
inference_time = time.time() - inference_start
|
|
|
|
| 343 |
|
| 344 |
logger.info(f"Using {max_num_output_frames} output frames -> {condition_num_frames} condition frames")
|
| 345 |
|
| 346 |
+
# Convert WebSocket format to pipeline format BEFORE building tensors
|
| 347 |
+
# WebSocket: [forward, back, left, right, jump, attack] -> Pipeline: [forward, back, left, right]
|
| 348 |
+
ws_keyboard = keyboard_condition[0] if keyboard_condition else [0, 0, 0, 0, 0, 0]
|
| 349 |
+
ws_mouse = mouse_condition[0] if mouse_condition else [0, 0]
|
| 350 |
+
|
| 351 |
+
# Convert to pipeline format based on mode
|
| 352 |
+
if mode == 'universal':
|
| 353 |
+
# Use first 4 elements for universal mode: [forward, back, left, right]
|
| 354 |
+
pipeline_keyboard = ws_keyboard[:4]
|
| 355 |
+
elif mode == 'gta_drive':
|
| 356 |
+
# Use forward and back for GTA Drive: [forward, back]
|
| 357 |
+
pipeline_keyboard = ws_keyboard[:2]
|
| 358 |
+
elif mode in ['templerun', 'temple_run']:
|
| 359 |
+
# Use left and right for Temple Run: [left, right]
|
| 360 |
+
pipeline_keyboard = [ws_keyboard[2], ws_keyboard[3]]
|
| 361 |
+
else:
|
| 362 |
+
# Default to universal
|
| 363 |
+
pipeline_keyboard = ws_keyboard[:4]
|
| 364 |
+
|
| 365 |
+
pipeline_mouse = ws_mouse # Mouse format is consistent
|
| 366 |
+
|
| 367 |
+
logger.debug(f"Converted WebSocket KB {ws_keyboard} -> Pipeline KB {pipeline_keyboard}")
|
| 368 |
+
logger.debug(f"Mode: {mode}, Mouse: {pipeline_mouse}")
|
| 369 |
+
|
| 370 |
+
# Create condition tensors with the correct format and length
|
| 371 |
+
keyboard_tensor = torch.tensor([pipeline_keyboard] * condition_num_frames, dtype=self.weight_dtype).unsqueeze(0).to(self.device)
|
| 372 |
+
mouse_tensor = torch.tensor([pipeline_mouse] * condition_num_frames, dtype=self.weight_dtype).unsqueeze(0).to(self.device)
|
| 373 |
|
| 374 |
logger.debug(f"Keyboard tensor shape: {keyboard_tensor.shape}")
|
| 375 |
logger.debug(f"Mouse tensor shape: {mouse_tensor.shape}")
|
|
|
|
| 420 |
logger.debug("Starting pipeline.inference()...")
|
| 421 |
|
| 422 |
# Use inference method for single batch generation with WebSocket conditions
|
| 423 |
+
# Pass the already-converted pipeline format conditions
|
| 424 |
+
logger.debug(f"Passing to pipeline - KB: {pipeline_keyboard}, Mouse: {pipeline_mouse}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 425 |
|
| 426 |
outputs = self.pipeline.inference(
|
| 427 |
noise=sampled_noise,
|
|
|
|
| 430 |
output_folder=None, # Don't save to disk
|
| 431 |
name=None,
|
| 432 |
mode=mode,
|
| 433 |
+
keyboard_condition=pipeline_keyboard,
|
| 434 |
+
mouse_condition=pipeline_mouse
|
| 435 |
)
|
| 436 |
|
| 437 |
inference_time = time.time() - inference_start
|
websocket_pipeline.py
CHANGED
|
@@ -30,72 +30,40 @@ class WebSocketStreamingPipeline(CausalInferenceStreamingPipeline):
|
|
| 30 |
Get current action from stored WebSocket data instead of stdin
|
| 31 |
Returns the same format as get_current_action()
|
| 32 |
|
| 33 |
-
The
|
| 34 |
-
|
| 35 |
-
- WebSocket gives: [forward, back, left, right, jump, attack] with multiple 1s
|
| 36 |
-
|
| 37 |
-
We need to convert multi-action to dominant single action.
|
| 38 |
"""
|
| 39 |
-
# Get
|
| 40 |
if self.current_keyboard is None:
|
| 41 |
-
|
| 42 |
else:
|
| 43 |
-
|
| 44 |
|
| 45 |
if self.current_mouse is None:
|
| 46 |
-
ws_mouse = [0, 0]
|
| 47 |
-
else:
|
| 48 |
-
ws_mouse = self.current_mouse
|
| 49 |
-
|
| 50 |
-
# Convert WebSocket multi-action to single dominant action
|
| 51 |
-
if mode == 'universal':
|
| 52 |
-
# Pipeline expects: [forward, back, left, right] as single action
|
| 53 |
-
# Priority order: forward > back > left > right > no action
|
| 54 |
-
if ws_keyboard[0]: # forward
|
| 55 |
-
keyboard = [1, 0, 0, 0]
|
| 56 |
-
elif ws_keyboard[1]: # back
|
| 57 |
-
keyboard = [0, 1, 0, 0]
|
| 58 |
-
elif ws_keyboard[2]: # left
|
| 59 |
-
keyboard = [0, 0, 1, 0]
|
| 60 |
-
elif ws_keyboard[3]: # right
|
| 61 |
-
keyboard = [0, 0, 0, 1]
|
| 62 |
-
else: # no action
|
| 63 |
-
keyboard = [0, 0, 0, 0]
|
| 64 |
-
|
| 65 |
-
elif mode == 'gta_drive':
|
| 66 |
-
# Pipeline expects: [forward, back] as single action
|
| 67 |
-
if ws_keyboard[0]: # forward
|
| 68 |
-
keyboard = [1, 0]
|
| 69 |
-
elif ws_keyboard[1]: # back
|
| 70 |
-
keyboard = [0, 1]
|
| 71 |
-
else: # no action
|
| 72 |
-
keyboard = [0, 0]
|
| 73 |
-
|
| 74 |
-
elif mode == 'templerun':
|
| 75 |
-
# Pipeline expects: [left, right] as single action
|
| 76 |
-
if ws_keyboard[2]: # left
|
| 77 |
-
keyboard = [1, 0]
|
| 78 |
-
elif ws_keyboard[3]: # right
|
| 79 |
-
keyboard = [0, 1]
|
| 80 |
-
else: # no action
|
| 81 |
-
keyboard = [0, 0]
|
| 82 |
-
else:
|
| 83 |
-
# Default to universal
|
| 84 |
-
keyboard = [0, 0, 0, 0]
|
| 85 |
-
|
| 86 |
-
# Mouse handling - use raw WebSocket values for most modes
|
| 87 |
-
if mode == 'templerun':
|
| 88 |
-
# Temple Run doesn't use mouse
|
| 89 |
mouse = [0, 0]
|
| 90 |
else:
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
# Convert to tensors
|
| 95 |
mouse_tensor = torch.tensor(mouse, dtype=torch.float32).cuda()
|
| 96 |
-
keyboard_tensor = torch.tensor(
|
| 97 |
|
| 98 |
-
logger.debug(f"WebSocket action for mode {mode}:
|
| 99 |
|
| 100 |
return {
|
| 101 |
'mouse': mouse_tensor,
|
|
|
|
| 30 |
Get current action from stored WebSocket data instead of stdin
|
| 31 |
Returns the same format as get_current_action()
|
| 32 |
|
| 33 |
+
Note: The format conversion is now handled at the engine level,
|
| 34 |
+
so we just need to convert multi-action to single dominant action.
|
|
|
|
|
|
|
|
|
|
| 35 |
"""
|
| 36 |
+
# Get current states (already in pipeline format from engine conversion)
|
| 37 |
if self.current_keyboard is None:
|
| 38 |
+
keyboard = [0, 0, 0, 0] if mode == 'universal' else [0, 0]
|
| 39 |
else:
|
| 40 |
+
keyboard = self.current_keyboard
|
| 41 |
|
| 42 |
if self.current_mouse is None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
mouse = [0, 0]
|
| 44 |
else:
|
| 45 |
+
mouse = self.current_mouse
|
| 46 |
+
|
| 47 |
+
# Convert multi-action to single dominant action for the streaming pipeline
|
| 48 |
+
# This handles cases where multiple keys might be pressed simultaneously
|
| 49 |
+
dominant_keyboard = []
|
| 50 |
+
for i, val in enumerate(keyboard):
|
| 51 |
+
if val > 0:
|
| 52 |
+
# Create one-hot vector with this action
|
| 53 |
+
dominant = [0] * len(keyboard)
|
| 54 |
+
dominant[i] = 1
|
| 55 |
+
dominant_keyboard = dominant
|
| 56 |
+
break
|
| 57 |
+
|
| 58 |
+
if not dominant_keyboard:
|
| 59 |
+
# No action pressed
|
| 60 |
+
dominant_keyboard = [0] * len(keyboard)
|
| 61 |
|
| 62 |
# Convert to tensors
|
| 63 |
mouse_tensor = torch.tensor(mouse, dtype=torch.float32).cuda()
|
| 64 |
+
keyboard_tensor = torch.tensor(dominant_keyboard, dtype=torch.float32).cuda()
|
| 65 |
|
| 66 |
+
logger.debug(f"WebSocket action for mode {mode}: kb={keyboard} -> dominant_kb={dominant_keyboard}, mouse={mouse}")
|
| 67 |
|
| 68 |
return {
|
| 69 |
'mouse': mouse_tensor,
|