Upload 4 files

Browse files

Files changed (4) hide show

config.json +16 -0
modeling_bitmar.py +1136 -0
pytorch_model.bin +3 -0
training_metadata.json +223 -0

config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "architectures": [
+    "BitMarModel"
+  ],
+  "model_type": "bitmar",
+  "vocab_size": 50257,
+  "text_encoder_dim": 128,
+  "text_encoder_layers": 4,
+  "text_encoder_heads": 4,
+  "vision_encoder_dim": 768,
+  "vision_latent_size": 128,
+  "fusion_hidden_size": 128,
+  "max_seq_len": 256,
+  "dropout": 0.15,
+  "torch_dtype": "float32"
+}

modeling_bitmar.py ADDED Viewed

	@@ -0,0 +1,1136 @@

+"""
+BitMar Model for Hugging Face Transformers
+BitNet-quantized Vision-Language Episodic Memory Transformer
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import logging
+import math
+import os
+import pickle
+import gzip
+from typing import Dict, List, Optional, Tuple, Union
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.modeling_outputs import CausalLMOutput, BaseModelOutput
+import time
+logger = logging.getLogger(__name__)
+class BitMarConfig(PretrainedConfig):
+    """Configuration class for BitMar model"""
+    model_type = "bitmar"
+    def __init__(
+        self,
+        vocab_size: int = 50257,
+        text_encoder_dim: int = 128,
+        text_encoder_layers: int = 4,
+        text_encoder_heads: int = 4,
+        text_decoder_dim: int = 128,
+        text_decoder_layers: int = 4,
+        text_decoder_heads: int = 4,
+        vision_encoder_dim: int = 768,
+        vision_latent_size: int = 128,
+        vision_hidden_size: int = 64,
+        vision_compression_method: str = "learned_compression",
+        vision_spatial_pooling: bool = True,
+        vision_pool_size: int = 2,
+        fusion_hidden_size: int = 128,
+        fusion_num_heads: int = 4,
+        fusion_num_layers: int = 2,
+        memory_alpha: float = 0.2,
+        direct_writing: bool = True,
+        memory_compression: bool = True,
+        max_seq_len: int = 256,
+        dropout: float = 0.15,
+        initializer_range: float = 0.02,
+        layer_norm_epsilon: float = 1e-5,
+        use_cache: bool = True,
+        tie_word_embeddings: bool = True,
+        pad_token_id: int = 50256,
+        bos_token_id: int = 50256,
+        eos_token_id: int = 50256,
+        **kwargs
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs
+        )
+        self.vocab_size = vocab_size
+        self.text_encoder_dim = text_encoder_dim
+        self.text_encoder_layers = text_encoder_layers
+        self.text_encoder_heads = text_encoder_heads
+        self.text_decoder_dim = text_decoder_dim
+        self.text_decoder_layers = text_decoder_layers
+        self.text_decoder_heads = text_decoder_heads
+        self.vision_encoder_dim = vision_encoder_dim
+        self.vision_latent_size = vision_latent_size
+        self.vision_hidden_size = vision_hidden_size
+        self.vision_compression_method = vision_compression_method
+        self.vision_spatial_pooling = vision_spatial_pooling
+        self.vision_pool_size = vision_pool_size
+        self.fusion_hidden_size = fusion_hidden_size
+        self.fusion_num_heads = fusion_num_heads
+        self.fusion_num_layers = fusion_num_layers
+        self.memory_alpha = memory_alpha
+        self.direct_writing = direct_writing
+        self.memory_compression = memory_compression
+        self.max_seq_len = max_seq_len
+        self.dropout = dropout
+        self.initializer_range = initializer_range
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.use_cache = use_cache
+        self.tie_word_embeddings = tie_word_embeddings
+class BitNetLinear(nn.Module):
+    """1.58-bit Linear layer following BitNet b1.58 architecture - FIXED VERSION"""
+    def __init__(self, in_features: int, out_features: int, bias: bool = True):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        # Weight parameters (full precision for training)
+        self.weight = nn.Parameter(torch.randn(out_features, in_features))
+        self.bias = nn.Parameter(torch.zeros(out_features)) if bias else None
+        # FIXED
+        self.register_buffer('weight_scale', torch.tensor(1.0))
+        self.register_buffer('input_scale', torch.tensor(1.0))
+    def quantize_weights_1_58_bit(self, weight: torch.Tensor) -> torch.Tensor:
+        """BitNet b1.58 weight quantization: {-1, 0, +1}"""
+        # Handle empty tensors
+        if weight.numel() == 0:
+            return weight
+        # Compute scaling factor with numerical stability
+        scale = weight.abs().mean()
+        # Handle case where all weights are zero
+        if scale < 1e-8:
+            scale = torch.tensor(1e-5, device=weight.device, dtype=weight.dtype)
+        self.weight_scale.data = scale.clamp(min=1e-5, max=1e3)
+        # Normalize weights with gradient clipping
+        weight_norm = torch.clamp(weight / self.weight_scale, min=-10.0, max=10.0)
+        # 1.58-bit quantization with threshold
+        threshold = 2.0 / 3.0  # Optimal threshold for ternary quantization
+        # Create ternary weights
+        quantized = torch.zeros_like(weight_norm)
+        quantized[weight_norm > threshold] = 1.0
+        quantized[weight_norm < -threshold] = -1.0
+        # Values between -threshold and threshold remain 0
+        return quantized
+    def quantize_activations_8bit(self, x: torch.Tensor) -> torch.Tensor:
+        """8-bit activation quantization with numerical stability"""
+        # Handle empty tensors
+        if x.numel() == 0:
+            return x
+        # Clamp extreme values to prevent overflow
+        x_clamped = torch.clamp(x, min=-1e6, max=1e6)
+        # Handle scalar tensors
+        if x_clamped.numel() == 1:
+            return x_clamped
+        # Compute quantization parameters
+        x_min, x_max = x_clamped.min(), x_clamped.max()
+        # Prevent division by zero
+        range_val = x_max - x_min
+        if range_val < 1e-8:
+            return x_clamped
+        scale = range_val / 255.0
+        self.input_scale.data = scale.clamp(min=1e-8, max=1e3)
+        # Quantize to 8-bit
+        zero_point = (-x_min / scale).round().clamp(0, 255)
+        quantized = ((x_clamped / scale) + zero_point).round().clamp(0, 255)
+        # Dequantize
+        dequantized = scale * (quantized - zero_point)
+        return dequantized
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.training:
+            # Full precision training with straight-through estimator
+            # Forward pass with quantized weights but gradients flow through original weights
+            weight_q = self.quantize_weights_1_58_bit(self.weight)
+            weight_forward = weight_q * self.weight_scale
+            # Use original weight for gradient computation
+            weight_forward = weight_forward + (self.weight - self.weight.detach())
+            return F.linear(x, weight_forward, self.bias)
+        else:
+            # Inference with full quantization
+            weight_q = self.quantize_weights_1_58_bit(self.weight) * self.weight_scale
+            x_q = self.quantize_activations_8bit(x)
+            return F.linear(x_q, weight_q, self.bias)
+class BitNetMLP(nn.Module):
+    """BitNet MLP block with 1.58-bit quantization"""
+    def __init__(self, dim: int, hidden_dim: int, dropout: float = 0.1):
+        super().__init__()
+        self.fc1 = BitNetLinear(dim, hidden_dim)
+        self.fc2 = BitNetLinear(hidden_dim, dim)
+        self.activation = nn.GELU()
+        self.dropout = nn.Dropout(dropout)
+        self.norm = nn.LayerNorm(dim)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        residual = x
+        x = self.fc1(x)
+        x = self.activation(x)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        x = self.dropout(x)
+        return self.norm(x + residual)
+class BitNetAttention(nn.Module):
+    """Multi-head attention with BitNet quantization"""
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        dropout: float = 0.1,
+        bias: bool = True
+    ):
+        super().__init__()
+        assert dim % num_heads == 0
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+        # BitNet quantized projections
+        self.q_proj = BitNetLinear(dim, dim, bias=bias)
+        self.k_proj = BitNetLinear(dim, dim, bias=bias)
+        self.v_proj = BitNetLinear(dim, dim, bias=bias)
+        self.out_proj = BitNetLinear(dim, dim, bias=bias)
+        self.dropout = nn.Dropout(dropout)
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        mask: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        batch_size, seq_len = query.shape[:2]
+        # Validate input dimensions
+        if query.size(-1) != self.dim:
+            raise ValueError(f"Query dimension {query.size(-1)} doesn't match expected {self.dim}")
+        if key.size(-1) != self.dim:
+            raise ValueError(f"Key dimension {key.size(-1)} doesn't match expected {self.dim}")
+        if value.size(-1) != self.dim:
+            raise ValueError(f"Value dimension {value.size(-1)} doesn't match expected {self.dim}")
+        # Linear projections
+        q = self.q_proj(query)
+        k = self.k_proj(key)
+        v = self.v_proj(value)
+        # Get key/value sequence length (handle different shapes)
+        key_seq_len = key.size(1)
+        # Reshape for multi-head attention with proper dimension checking
+        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        k = k.view(batch_size, key_seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        v = v.view(batch_size, key_seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        # Attention computation
+        attention_scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
+        if mask is not None:
+            # Handle mask shape: expand to match attention scores shape
+            if mask.dim() == 2:  # [batch_size, seq_len]
+                mask = mask.unsqueeze(1).unsqueeze(1)  # [batch_size, 1, 1, seq_len]
+            elif mask.dim() == 3:  # [batch_size, seq_len, seq_len]
+                mask = mask.unsqueeze(1)  # [batch_size, 1, seq_len, seq_len]
+            # Expand mask to match attention scores shape [batch_size, num_heads, seq_len, key_seq_len]
+            if mask.size(-1) != key_seq_len:
+                # Adjust mask if needed
+                if mask.size(-1) == seq_len:
+                    # Pad or trim mask to match key_seq_len
+                    if key_seq_len > seq_len:
+                        pad_size = key_seq_len - seq_len
+                        mask = torch.cat([mask, torch.zeros(*mask.shape[:-1], pad_size, device=mask.device, dtype=mask.dtype)], dim=-1)
+                    else:
+                        mask = mask[..., :key_seq_len]
+            mask = mask.expand(batch_size, self.num_heads, seq_len, key_seq_len)
+            attention_scores.masked_fill_(mask == 0, float('-inf'))
+        attention_weights = F.softmax(attention_scores, dim=-1)
+        attention_weights = self.dropout(attention_weights)
+        # Apply attention to values
+        attended = torch.matmul(attention_weights, v)
+        # Reshape and project output
+        attended = attended.transpose(1, 2).contiguous().view(
+            batch_size, seq_len, self.dim
+        )
+        output = self.out_proj(attended)
+        return output, attention_weights.mean(dim=1)  # Average across heads
+class BitNetTransformerBlock(nn.Module):
+    """BitNet Transformer block with quantized components"""
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        dropout: float = 0.1
+    ):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(dim)
+        self.attn = BitNetAttention(dim, num_heads, dropout)
+        self.norm2 = nn.LayerNorm(dim)
+        self.mlp = BitNetMLP(dim, int(dim * mlp_ratio), dropout)
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self-attention with residual connection
+        normed_x = self.norm1(x)
+        attn_out, attn_weights = self.attn(normed_x, normed_x, normed_x, mask)
+        x = x + attn_out
+        # MLP with residual connection
+        x = x + self.mlp(self.norm2(x))
+        return x, attn_weights
+class BitNetTextEncoder(nn.Module):
+    """BitNet-based text encoder"""
+    def __init__(
+        self,
+        vocab_size: int,
+        dim: int,
+        num_layers: int,
+        num_heads: int,
+        max_seq_len: int = 512,
+        dropout: float = 0.1
+    ):
+        super().__init__()
+        self.dim = dim
+        self.max_seq_len = max_seq_len
+        # Token embeddings (kept full precision)
+        self.token_embedding = nn.Embedding(vocab_size, dim)
+        self.position_embedding = nn.Embedding(max_seq_len, dim)
+        # BitNet transformer layers
+        self.layers = nn.ModuleList([
+            BitNetTransformerBlock(dim, num_heads, dropout=dropout)
+            for _ in range(num_layers)
+        ])
+        self.dropout = nn.Dropout(dropout)
+        self.norm = nn.LayerNorm(dim)
+        # Initialize embeddings
+        nn.init.normal_(self.token_embedding.weight, std=0.02)
+        nn.init.normal_(self.position_embedding.weight, std=0.02)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+        batch_size, seq_len = input_ids.shape
+        # Embeddings
+        positions = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
+        x = self.token_embedding(input_ids) + \
+            self.position_embedding(positions)
+        x = self.dropout(x)
+        # Transform through BitNet layers
+        attention_patterns = []
+        for layer in self.layers:
+            # Convert attention mask to the right format for the layer
+            layer_mask = None
+            if attention_mask is not None:
+                # Create a mask where 1 means attend, 0 means don't attend
+                layer_mask = attention_mask.unsqueeze(
+                    1).unsqueeze(2)  # [batch_size, 1, 1, seq_len]
+            x, attn_weights = layer(x, layer_mask)
+            attention_patterns.append(attn_weights)
+        x = self.norm(x)
+        return x, attention_patterns
+class BitNetTextDecoder(nn.Module):
+    """BitNet-based text decoder with causal masking"""
+    def __init__(
+        self,
+        vocab_size: int,
+        dim: int,
+        num_layers: int,
+        num_heads: int,
+        max_seq_len: int = 512,
+        dropout: float = 0.1
+    ):
+        super().__init__()
+        self.dim = dim
+        self.max_seq_len = max_seq_len
+        # Token embeddings
+        self.token_embedding = nn.Embedding(vocab_size, dim)
+        self.position_embedding = nn.Embedding(max_seq_len, dim)
+        # BitNet transformer layers
+        self.layers = nn.ModuleList([
+            BitNetTransformerBlock(dim, num_heads, dropout=dropout)
+            for _ in range(num_layers)
+        ])
+        self.dropout = nn.Dropout(dropout)
+        self.norm = nn.LayerNorm(dim)
+        # Output projection to vocabulary
+        self.lm_head = BitNetLinear(dim, vocab_size, bias=False)
+        # Initialize embeddings
+        nn.init.normal_(self.token_embedding.weight, std=0.02)
+        nn.init.normal_(self.position_embedding.weight, std=0.02)
+        # Register causal mask
+        self.register_buffer(
+            'causal_mask',
+            torch.tril(torch.ones(max_seq_len, max_seq_len)
+                       ).unsqueeze(0).unsqueeze(0)
+        )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None
+    ) -> Dict[str, torch.Tensor]:
+        if input_ids is not None:
+            batch_size, seq_len = input_ids.shape
+            positions = torch.arange(
+                seq_len, device=input_ids.device).unsqueeze(0)
+            x = self.token_embedding(input_ids) + \
+                self.position_embedding(positions)
+        elif inputs_embeds is not None:
+            batch_size, seq_len = inputs_embeds.shape[:2]
+            positions = torch.arange(
+                seq_len, device=inputs_embeds.device).unsqueeze(0)
+            x = inputs_embeds + self.position_embedding(positions)
+        else:
+            raise ValueError(
+                "Either input_ids or inputs_embeds must be provided")
+        x = self.dropout(x)
+        # Create causal mask
+        causal_mask = self.causal_mask[:, :, :seq_len, :seq_len]
+        if attention_mask is not None:
+            # Combine causal mask with padding mask
+            mask = attention_mask.unsqueeze(1).unsqueeze(2) * causal_mask
+        else:
+            mask = causal_mask
+        # Transform through BitNet layers
+        attention_patterns = []
+        for layer in self.layers:
+            x, attn_weights = layer(x, mask)
+            attention_patterns.append(attn_weights)
+        x = self.norm(x)
+        logits = self.lm_head(x)
+        loss = None
+        if labels is not None:
+            # Shift labels for causal LM
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss = F.cross_entropy(
+                shift_logits.view(-1, shift_logits.size(-1)),
+                shift_labels.view(-1),
+                ignore_index=-100
+            )
+        return {
+            'logits': logits,
+            'loss': loss,
+            'attention_patterns': attention_patterns
+        }
+class CrossModalFusion(nn.Module):
+    """Cross-modal fusion module for text and vision features"""
+    def __init__(
+        self,
+        text_dim: int,
+        vision_dim: int,
+        hidden_dim: int,
+        num_heads: int = 8,
+        num_layers: int = 2
+    ):
+        super().__init__()
+        self.text_dim = text_dim
+        self.vision_dim = vision_dim
+        self.hidden_dim = hidden_dim
+        # Projection layers
+        self.text_proj = BitNetLinear(text_dim, hidden_dim)
+        self.vision_proj = BitNetLinear(vision_dim, hidden_dim)
+        # Cross-attention layers
+        self.cross_attention_layers = nn.ModuleList([
+            BitNetAttention(
+                dim=hidden_dim,
+                num_heads=num_heads
+            ) for _ in range(num_layers)
+        ])
+        # Layer normalization
+        self.layer_norms = nn.ModuleList([
+            nn.LayerNorm(hidden_dim) for _ in range(num_layers)
+        ])
+        # Output projection
+        self.output_proj = BitNetLinear(hidden_dim, hidden_dim)
+    def forward(
+        self,
+        text_features: torch.Tensor,
+        vision_features: torch.Tensor
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            text_features: [batch_size, seq_len, text_dim]
+            vision_features: [batch_size, vision_dim]
+        Returns:
+            fused_features: [batch_size, seq_len, hidden_dim]
+            attention_weights: Dict of attention patterns
+        """
+        batch_size, seq_len = text_features.shape[:2]
+        # Validate input dimensions
+        if text_features.size(-1) != self.text_dim:
+            raise ValueError(f"Text features dimension {text_features.size(-1)} doesn't match expected {self.text_dim}")
+        if vision_features.size(-1) != self.vision_dim:
+            raise ValueError(f"Vision features dimension {vision_features.size(-1)} doesn't match expected {self.vision_dim}")
+        # Project to common dimension
+        # [batch_size, seq_len, hidden_dim]
+        text_proj = self.text_proj(text_features)
+        vision_proj = self.vision_proj(vision_features).unsqueeze(1)  # [batch_size, 1, hidden_dim]
+        # Cross-attention fusion
+        fused = text_proj
+        attention_weights = {}
+        for i, (attn_layer, norm_layer) in enumerate(zip(self.cross_attention_layers, self.layer_norms)):
+            # Text-to-vision cross-attention
+            attn_output, attn_weights = attn_layer(
+                query=fused,
+                key=vision_proj,
+                value=vision_proj
+            )
+            # Residual connection and normalization
+            fused = norm_layer(fused + attn_output)
+            attention_weights[f'layer_{i}'] = attn_weights
+        # Output projection
+        output = self.output_proj(fused)
+        return output, attention_weights
+class VisionEncoder(nn.Module):
+    """Quantized Vision Encoder for DiNOv2 features"""
+    def __init__(
+        self,
+        input_dim: int = 768,
+        hidden_dim: int = 512,
+        output_dim: int = 768,
+        num_layers: int = 2
+    ):
+        super().__init__()
+        # Quantized layers
+        self.layers = nn.ModuleList([
+            BitNetLinear(input_dim if i == 0 else hidden_dim, hidden_dim)
+            for i in range(num_layers)
+        ])
+        # Output projection
+        self.output_proj = BitNetLinear(hidden_dim, output_dim)
+        # Activation and normalization
+        self.activation = nn.GELU()
+        self.layer_norms = nn.ModuleList([
+            nn.LayerNorm(hidden_dim) for _ in range(num_layers)
+        ])
+        self.dropout = nn.Dropout(0.1)
+    def forward(self, vision_features: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            vision_features: [batch_size, input_dim] - DiNOv2 features
+        Returns:
+            encoded_features: [batch_size, output_dim]
+        """
+        # Handle potential extra dimensions
+        if vision_features.dim() > 2:
+            # Flatten any extra dimensions except batch
+            original_shape = vision_features.shape
+            vision_features = vision_features.view(original_shape[0], -1)
+            # Ensure we have the expected input dimension
+            if vision_features.size(-1) != self.layers[0].in_features:
+                # Take only the first input_dim features if we have more
+                if vision_features.size(-1) > self.layers[0].in_features:
+                    vision_features = vision_features[:, :self.layers[0].in_features]
+                else:
+                    raise ValueError(f"Vision features dimension {vision_features.size(-1)} is smaller than expected {self.layers[0].in_features}")
+        x = vision_features
+        for layer, norm in zip(self.layers, self.layer_norms):
+            x = layer(x)
+            x = norm(x)
+            x = self.activation(x)
+            x = self.dropout(x)
+        # Output projection
+        output = self.output_proj(x)
+        return output
+class BitMarModel(PreTrainedModel):
+    """
+    BitMar: BitNet-quantized Vision-Language Episodic Memory Transformer
+    Compatible with Hugging Face Transformers
+    """
+    config_class = BitMarConfig
+    base_model_prefix = "bitmar"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["BitNetTransformerBlock"]
+    def __init__(self, config: BitMarConfig):
+        super().__init__(config)
+        self.config = config
+        # Loss balancing parameters
+        self.cross_modal_loss_weight = getattr(config, 'cross_modal_loss_weight', 0.1)
+        self.text_loss_weight = getattr(config, 'text_loss_weight', 1.0)
+        self.vision_loss_weight = getattr(config, 'vision_loss_weight', 0.1)
+        self.memory_loss_weight = getattr(config, 'memory_loss_weight', 0.05)
+        # Dynamic loss scaling
+        self.adaptive_loss_scaling = getattr(config, 'adaptive_loss_scaling', True)
+        self.loss_scale_temperature = getattr(config, 'loss_scale_temperature', 0.07)
+        # Encoder freezing parameters
+        self.freeze_text_encoder_steps = getattr(config, 'freeze_text_encoder_steps', 0)
+        self.freeze_vision_encoder_steps = getattr(config, 'freeze_vision_encoder_steps', 0)
+        self.current_step = 0
+        # BitNet text encoder/decoder
+        self.text_encoder = BitNetTextEncoder(
+            vocab_size=config.vocab_size,
+            dim=config.text_encoder_dim,
+            num_layers=config.text_encoder_layers,
+            num_heads=config.text_encoder_heads,
+            max_seq_len=config.max_seq_len,
+            dropout=config.dropout
+        )
+        self.text_decoder = BitNetTextDecoder(
+            vocab_size=config.vocab_size,
+            dim=config.text_decoder_dim,
+            num_layers=config.text_decoder_layers,
+            num_heads=config.text_decoder_heads,
+            max_seq_len=config.max_seq_len,
+            dropout=config.dropout
+        )
+        # Vision processing with BitNet quantization
+        self.vision_encoder = VisionEncoder(
+            input_dim=config.vision_encoder_dim,
+            hidden_dim=config.vision_hidden_size,
+            output_dim=config.vision_latent_size
+        )
+        # Cross-modal fusion with BitNet
+        self.fusion = CrossModalFusion(
+            text_dim=config.text_encoder_dim,
+            vision_dim=config.vision_latent_size,
+            hidden_dim=config.fusion_hidden_size,
+            num_heads=config.fusion_num_heads,
+            num_layers=config.fusion_num_layers
+        )
+        # Projection to decoder dimension
+        self.decoder_input_proj = BitNetLinear(
+            config.fusion_hidden_size,
+            config.text_decoder_dim
+        )
+        # Initialize tokenizer (for compatibility)
+        try:
+            from transformers import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained('gpt2')
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+        except:
+            self.tokenizer = None
+        self.post_init()
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, BitNetLinear)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if hasattr(module, 'bias') and module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            if hasattr(module, 'bias') and module.bias is not None:
+                module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+    def encode_text(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+        """Encode text using BitNet encoder"""
+        text_features, attention_patterns = self.text_encoder(
+            input_ids=input_ids, attention_mask=attention_mask)
+        return text_features, attention_patterns
+    def encode_vision(self, vision_features: torch.Tensor) -> torch.Tensor:
+        """Encode vision features using quantized vision encoder"""
+        vision_latent = self.vision_encoder(vision_features)
+        return vision_latent
+    def compute_cross_modal_contrastive_loss(
+        self,
+        text_features: torch.Tensor,
+        vision_features: torch.Tensor,
+        temperature: float = 0.07
+    ) -> torch.Tensor:
+        """Compute cross-modal contrastive loss similar to CLIP"""
+        batch_size = text_features.shape[0]
+        # Handle dimension mismatch between text and vision features
+        text_dim = text_features.shape[-1]
+        vision_dim = vision_features.shape[-1]
+        if text_dim != vision_dim:
+            # Project to smaller dimension to maintain compatibility
+            target_dim = min(text_dim, vision_dim)
+            if text_dim > vision_dim:
+                # Project text features to vision dimension
+                text_features = text_features[:, :target_dim]
+            else:
+                # Project vision features to text dimension
+                vision_features = vision_features[:, :target_dim]
+        # Normalize features
+        text_features = F.normalize(text_features, dim=-1)
+        vision_features = F.normalize(vision_features, dim=-1)
+        # Compute similarity matrix
+        logits = torch.matmul(text_features, vision_features.T) / temperature
+        # Create labels (diagonal should be positive pairs)
+        labels = torch.arange(batch_size, device=logits.device)
+        # Compute cross-entropy loss for both directions
+        text_to_vision_loss = F.cross_entropy(logits, labels)
+        vision_to_text_loss = F.cross_entropy(logits.T, labels)
+        return (text_to_vision_loss + vision_to_text_loss) / 2
+    def compute_vision_reconstruction_loss(
+        self,
+        original_vision: torch.Tensor,
+        reconstructed_vision: torch.Tensor
+    ) -> torch.Tensor:
+        """Compute vision reconstruction loss to prevent vision encoder collapse"""
+        return F.mse_loss(reconstructed_vision, original_vision)
+    def compute_balanced_loss(
+        self,
+        decoder_loss: torch.Tensor,
+        cross_modal_loss: torch.Tensor,
+        vision_loss: Optional[torch.Tensor] = None,
+        step: int = 0,
+        adaptive_controller=None
+    ) -> Dict[str, torch.Tensor]:
+        """Compute balanced multi-objective loss with adaptive scaling"""
+        losses = {'decoder_loss': decoder_loss, 'cross_modal_loss': cross_modal_loss}
+        if vision_loss is not None:
+            losses['vision_loss'] = vision_loss
+        if self.adaptive_loss_scaling:
+            # Adaptive scaling based on loss magnitudes
+            with torch.no_grad():
+                # Compute relative loss scales
+                decoder_scale = decoder_loss.detach()
+                cross_modal_scale = cross_modal_loss.detach()
+                # Prevent division by zero
+                if decoder_scale > 1e-8:
+                    adaptive_cross_modal_weight = (decoder_scale / cross_modal_scale.clamp(min=1e-8)) * self.cross_modal_loss_weight
+                else:
+                    adaptive_cross_modal_weight = self.cross_modal_loss_weight
+                # Clamp adaptive weights
+                adaptive_cross_modal_weight = torch.clamp(adaptive_cross_modal_weight, 0.01, 1.0)
+        else:
+            adaptive_cross_modal_weight = self.cross_modal_loss_weight
+        # Apply loss scheduling (increase cross-modal importance over time)
+        cross_modal_schedule = min(1.0, step / 50000)  # Ramp up over 50k steps
+        scheduled_cross_modal_weight = adaptive_cross_modal_weight * cross_modal_schedule
+        # Compute weighted total loss
+        total_loss = (
+            self.text_loss_weight * decoder_loss +
+            scheduled_cross_modal_weight * cross_modal_loss
+        )
+        if vision_loss is not None:
+            total_loss += self.vision_loss_weight * vision_loss
+        losses.update({
+            'total_loss': total_loss,
+            'cross_modal_weight': scheduled_cross_modal_weight,
+            'adaptive_weight': adaptive_cross_modal_weight if self.adaptive_loss_scaling else torch.tensor(0.0)
+        })
+        return losses
+    def apply_encoder_freezing(self, step: int):
+        """Apply temporary encoder freezing based on training step"""
+        self.current_step = step
+        # Freeze text encoder if within freezing window
+        freeze_text = step < self.freeze_text_encoder_steps
+        for param in self.text_encoder.parameters():
+            param.requires_grad = not freeze_text
+        # Freeze vision encoder if within freezing window
+        freeze_vision = step < self.freeze_vision_encoder_steps
+        for param in self.vision_encoder.parameters():
+            param.requires_grad = not freeze_vision
+        return {
+            'text_encoder_frozen': freeze_text,
+            'vision_encoder_frozen': freeze_vision
+        }
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        vision_features: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        mode: str = "train",
+        step: int = 0,
+        has_vision: Optional[torch.Tensor] = None,
+        **kwargs
+    ) -> Union[Tuple, CausalLMOutput]:
+        """
+        Forward pass through BitMar model with mixed vision/text batch support
+        Args:
+            has_vision: Boolean tensor [batch_size] indicating which samples have real vision features
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # CRITICAL FIX: Ensure input_ids are integers
+        if input_ids.dtype != torch.long:
+            input_ids = input_ids.long()
+        # CRITICAL FIX: Ensure labels are integers if provided
+        if labels is not None and labels.dtype != torch.long:
+            labels = labels.long()
+        if input_ids is None:
+            raise ValueError("input_ids must be provided")
+        batch_size, seq_len = input_ids.shape
+        # Handle missing attention mask
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids, dtype=torch.float)
+        # Ensure attention_mask is float
+        if attention_mask.dtype != torch.float:
+            attention_mask = attention_mask.float()
+        # Handle missing vision features
+        if vision_features is None:
+            vision_features = torch.zeros(batch_size, self.config.vision_encoder_dim,
+                                        device=input_ids.device, dtype=torch.float32)
+        # Validate input tensor dimensions
+        expected_vision_dim = self.config.vision_encoder_dim
+        if vision_features.dim() != 2 or vision_features.size(-1) != expected_vision_dim:
+            if vision_features.dim() > 2:
+                vision_features = vision_features.view(batch_size, -1)
+            if vision_features.size(-1) != expected_vision_dim:
+                # Pad or trim to expected dimension
+                if vision_features.size(-1) > expected_vision_dim:
+                    vision_features = vision_features[:, :expected_vision_dim]
+                else:
+                    padding = expected_vision_dim - vision_features.size(-1)
+                    vision_features = F.pad(vision_features, (0, padding))
+        # Default has_vision to all True if not provided (backward compatibility)
+        if has_vision is None:
+            has_vision = torch.ones(batch_size, dtype=torch.bool, device=input_ids.device)
+        # Apply encoder freezing
+        freezing_status = {}
+        if mode == "train":
+            freezing_status = self.apply_encoder_freezing(step)
+        # Encode text (always available)
+        text_features, text_attention = self.encode_text(input_ids, attention_mask)
+        # Encode vision (with masking for text-only samples)
+        vision_latent = self.encode_vision(vision_features)
+        # Mask vision features for text-only samples
+        vision_mask = has_vision.float().unsqueeze(-1)
+        vision_latent_masked = vision_latent * vision_mask
+        # Cross-modal fusion
+        fused_features, cross_attention = self.fusion(text_features, vision_latent_masked)
+        # Prepare decoder input
+        fused_no_memory = fused_features
+        decoder_input = self.decoder_input_proj(fused_no_memory)
+        # Generate text using BitNet decoder
+        decoder_outputs = self.text_decoder(
+            inputs_embeds=decoder_input,
+            attention_mask=attention_mask,
+            labels=labels
+        )
+        # Compute losses if in training mode
+        final_loss = None
+        loss_dict = {}
+        if mode == "train" and labels is not None:
+            # Primary decoder loss
+            decoder_loss = decoder_outputs['loss']
+            # Cross-modal contrastive loss (only for samples with vision)
+            cross_modal_loss = torch.tensor(0.0, device=input_ids.device)
+            if has_vision.any():
+                vision_indices = has_vision.nonzero(as_tuple=True)[0]
+                if len(vision_indices) > 0:
+                    text_pooled = text_features[vision_indices].mean(dim=1)
+                    vision_for_loss = vision_latent[vision_indices]
+                    cross_modal_loss = self.compute_cross_modal_contrastive_loss(
+                        text_pooled, vision_for_loss, temperature=self.loss_scale_temperature
+                    )
+            # Optional additional losses
+            vision_loss = None
+            # Compute balanced loss
+            loss_dict = self.compute_balanced_loss(
+                decoder_loss, cross_modal_loss, vision_loss, step
+            )
+            final_loss = loss_dict['total_loss']
+        elif decoder_outputs.get('loss') is not None:
+            final_loss = decoder_outputs['loss']
+        # Prepare outputs
+        if return_dict:
+            output = CausalLMOutput(
+                loss=final_loss,
+                logits=decoder_outputs['logits'],
+                hidden_states=fused_features if output_hidden_states else None,
+                attentions=text_attention if output_attentions else None,
+            )
+            # Add additional outputs for analysis
+            if mode == "train":
+                for key, value in loss_dict.items():
+                    setattr(output, key, value)
+                for key, value in freezing_status.items():
+                    setattr(output, key, value)
+            return output
+        else:
+            outputs = (decoder_outputs['logits'],)
+            if final_loss is not None:
+                outputs = (final_loss,) + outputs
+            if output_hidden_states:
+                outputs = outputs + (fused_features,)
+            if output_attentions:
+                outputs = outputs + (text_attention,)
+            return outputs
+    def generate(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        vision_features: Optional[torch.Tensor] = None,
+        max_length: int = 100,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+        do_sample: bool = True,
+        **kwargs
+    ) -> torch.LongTensor:
+        """Generate text given input text and vision features"""
+        self.eval()
+        batch_size = input_ids.size(0)
+        device = input_ids.device
+        # Handle missing vision features
+        if vision_features is None:
+            vision_features = torch.zeros(batch_size, self.config.vision_encoder_dim,
+                                        device=device, dtype=torch.float32)
+        # Handle attention mask
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        generated_ids = input_ids.clone()
+        current_attention_mask = attention_mask.clone()
+        with torch.no_grad():
+            for _ in range(max_length - input_ids.size(1)):
+                # Get model outputs
+                outputs = self.forward(
+                    input_ids=generated_ids,
+                    attention_mask=current_attention_mask,
+                    vision_features=vision_features,
+                    mode="inference",
+                    return_dict=True
+                )
+                # Get next token logits
+                next_token_logits = outputs.logits[:, -1, :] / temperature
+                if do_sample:
+                    # Apply top-p sampling
+                    if top_p < 1.0:
+                        sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
+                        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+                        # Remove tokens with cumulative probability above the threshold
+                        sorted_indices_to_remove = cumulative_probs > top_p
+                        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+                        sorted_indices_to_remove[..., 0] = 0
+                        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+                        next_token_logits[indices_to_remove] = float('-inf')
+                    # Sample from the filtered distribution
+                    probs = F.softmax(next_token_logits, dim=-1)
+                    next_token = torch.multinomial(probs, num_samples=1)
+                else:
+                    # Greedy decoding
+                    next_token = next_token_logits.argmax(dim=-1, keepdim=True)
+                # Append to generated sequence
+                generated_ids = torch.cat([generated_ids, next_token], dim=-1)
+                # Update attention mask
+                current_attention_mask = torch.cat([
+                    current_attention_mask,
+                    torch.ones(batch_size, 1, device=device)
+                ], dim=-1)
+                # Stop if EOS token is generated
+                if (next_token == self.config.eos_token_id).all():
+                    break
+        return generated_ids
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        vision_features=None,
+        **kwargs
+    ):
+        """Prepare inputs for generation"""
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "vision_features": vision_features,
+            "use_cache": kwargs.get("use_cache", True),
+        }
+# Register the model with transformers
+from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
+AutoConfig.register("bitmar", BitMarConfig)
+AutoModel.register(BitMarConfig, BitMarModel)
+AutoModelForCausalLM.register(BitMarConfig, BitMarModel)

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ed1baf7f77db0a118159e6a850c9b1089cfc20f5cb8a329fb50d1195bcf70e0
+size 85226595

training_metadata.json ADDED Viewed

	@@ -0,0 +1,223 @@

+{
+  "epoch": 9,
+  "global_step": 165810,
+  "tokens_processed": 996822486,
+  "target_tokens": 100000000,
+  "best_similarity": 0.33421099185943604,
+  "training_config": {
+    "model": {
+      "vocab_size": 50257,
+      "text_encoder_dim": 128,
+      "text_encoder_layers": 4,
+      "text_encoder_heads": 4,
+      "text_decoder_dim": 128,
+      "text_decoder_layers": 4,
+      "text_decoder_heads": 4,
+      "vision_encoder_dim": 768,
+      "vision_latent_size": 128,
+      "vision_hidden_size": 64,
+      "vision_compression_method": "learned_compression",
+      "vision_spatial_pooling": true,
+      "vision_pool_size": 2,
+      "fusion_hidden_size": 128,
+      "fusion_num_heads": 4,
+      "fusion_num_layers": 2,
+      "max_seq_len": 256,
+      "dropout": 0.15
+    },
+    "token_constraints": {
+      "total_tokens": 100000000,
+      "caption_tokens": 50000000,
+      "text_tokens": 50000000,
+      "enforce_exact_count": true,
+      "uniform_sampling": true,
+      "alignment_priority": "perfect_alignment",
+      "preserve_image_caption_pairs": true,
+      "strict_alignment_validation": true
+    },
+    "vision_feature_reduction": {
+      "enabled": true,
+      "method": "learned_compression",
+      "target_dim": 64,
+      "spatial_pooling": true,
+      "pool_method": "attention",
+      "hidden_dim": 128,
+      "learnable": true,
+      "preserve_spatial_info": true
+    },
+    "data": {
+      "dataset_dir": "../babylm_dataset",
+      "text_encoder_name": "gpt2",
+      "max_seq_length": 256,
+      "count_tokens": true,
+      "target_caption_tokens": 50000000,
+      "target_text_tokens": 50000000,
+      "token_counting_method": "gpt2",
+      "batch_size": 384,
+      "num_workers": 10,
+      "pin_memory": true,
+      "persistent_workers": true,
+      "mix_ratio": 0.5,
+      "shuffle_datasets": true,
+      "ensure_alignment": true,
+      "validate_alignment": true,
+      "alignment_verification": "strict",
+      "never_break_pairs": true,
+      "alignment_check_frequency": 1000,
+      "use_validation": false,
+      "train_only": true
+    },
+    "attention_analysis": {
+      "track_top_k": 5,
+      "log_every_n_steps": 200,
+      "viz_every_n_epochs": 3,
+      "save_head_patterns": true,
+      "analyze_memory_attention": false,
+      "analyze_cross_modal": true,
+      "track_token_alignment": true
+    },
+    "adaptive_training": {
+      "enabled": true,
+      "similarity_window_size": 200,
+      "drop_threshold": 0.12,
+      "min_steps_between_interventions": 800,
+      "freeze_duration_steps": 1500,
+      "loss_rebalance_factor": 2.0,
+      "similarity_smoothing_alpha": 0.15
+    },
+    "training": {
+      "max_epochs": 10,
+      "accumulate_grad_batches": 2,
+      "gradient_clip_val": 0.3,
+      "val_check_interval": 1000,
+      "scheduler": {
+        "T_0": 1000,
+        "T_mult": 2,
+        "eta_min_ratio": 0.1
+      },
+      "min_lr": 5e-05,
+      "warmup_steps": 1000,
+      "learning_rate": 0.0002,
+      "weight_decay": 0.02,
+      "optimizer": "adamw8bit",
+      "cross_modal_loss_weight": 1.5,
+      "text_generation_loss_weight": 1.0,
+      "alignment_consistency_weight": 0.5,
+      "track_token_usage": true,
+      "log_token_progress": true,
+      "stop_at_token_limit": false,
+      "validate_alignment_every_n_steps": 500,
+      "log_alignment_metrics": true,
+      "alignment_loss_scaling": "adaptive"
+    },
+    "wandb": {
+      "project": "bitmar-no-memory",
+      "entity": "babylm-ntust",
+      "api_key": null,
+      "log_every_n_steps": 100,
+      "log_attention": true,
+      "log_memory": false,
+      "log_gradients": true,
+      "log_token_usage": true,
+      "log_cross_modal_similarity": true,
+      "log_alignment_quality": true,
+      "log_caption_image_matching": true,
+      "save_code": true,
+      "create_plots": true,
+      "plot_attention_heatmaps": false,
+      "plot_memory_usage": false,
+      "plot_token_distribution": true,
+      "plot_alignment_metrics": true
+    },
+    "evaluation": {
+      "metrics": [
+        "bleu",
+        "rouge",
+        "cross_modal_similarity"
+      ],
+      "generate_samples": true,
+      "num_samples": 20,
+      "max_generation_length": 32,
+      "temperature": 0.8,
+      "top_p": 0.9,
+      "evaluate_alignment": true,
+      "alignment_metrics": [
+        "cosine_similarity",
+        "retrieval_accuracy",
+        "caption_image_matching",
+        "cross_modal_retrieval"
+      ],
+      "alignment_threshold": 0.8,
+      "validate_pairs_during_eval": true
+    },
+    "output": {
+      "checkpoint_dir": "checkpoints_100M_dataset",
+      "log_dir": "logs_100M_dataset",
+      "attention_dir": "attention_100M_dataset",
+      "results_dir": "results_100M_dataset",
+      "token_logs_dir": "token_logs_100M_dataset"
+    },
+    "performance_targets": {
+      "max_model_size_mb": 50,
+      "target_cross_modal_similarity": 0.75,
+      "target_text_generation_quality": 0.6
+    },
+    "flops_tracking": {
+      "enabled": true,
+      "log_frequency": 100,
+      "save_statistics": true,
+      "estimate_theoretical": true,
+      "track_peak_performance": true,
+      "log_to_wandb": true,
+      "detailed_breakdown": true,
+      "memory_bandwidth_tracking": false,
+      "efficiency_analysis": true,
+      "track_components": [
+        "attention",
+        "feedforward",
+        "layer_norm",
+        "embeddings",
+        "vision_encoder",
+        "cross_modal_fusion"
+      ]
+    },
+    "token_tracking": {
+      "log_frequency": 1000,
+      "save_token_distribution": true,
+      "monitor_caption_text_ratio": true,
+      "enforce_token_limits": false,
+      "early_stopping_on_limit": false,
+      "track_alignment_quality": true,
+      "log_misaligned_samples": true,
+      "alignment_quality_threshold": 0.7,
+      "save_alignment_statistics": true,
+      "correlate_flops_with_tokens": true,
+      "log_computational_efficiency": true,
+      "track_throughput_vs_quality": true
+    },
+    "huggingface_hub": {
+      "enabled": true,
+      "repo_id": "estebancarlin/bitmar-no-memory",
+      "private": true,
+      "upload_after_epoch": true,
+      "upload_final_model": true,
+      "commit_message_template": "BitMar 100M tokens (no memory) - Epoch {epoch} - {tokens_processed:,} tokens processed",
+      "create_model_card": true,
+      "model_card_template": "---\nlanguage: en\nlicense: mit\ntags:\n- bitmar\n- multimodal\n- babylm\n- cross-modal\n- no-memory\ndatasets:\n- babylm_multimodal\nmetrics:\n- bleu\n- cross_modal_similarity\n---\n\n# BitMar 100M Token Model (No Episodic Memory)\n\nThis model was trained on exactly 100 million tokens as part of the BabyLM challenge without episodic memory.\n\n## Training Details\n- Total tokens: 100,000,000\n- Epochs completed: {epoch}\n- Tokens processed: {tokens_processed:,}\n- Cross-modal similarity: {best_similarity:.4f}\n- Episodic memory: Disabled\n\n## Model Architecture\n- Text encoder: {text_encoder_layers} layers, {text_encoder_dim} hidden size\n- Vision encoder: DiNOv2 features compressed to {vision_latent_size}\n- Episodic memory: Disabled for comparison study\n\n## Usage\n```python\nfrom transformers import AutoModel, AutoTokenizer\n\nmodel = AutoModel.from_pretrained(\"{repo_id}\")\ntokenizer = AutoTokenizer.from_pretrained(\"{repo_id}\")\n```\n"
+    },
+    "attention_sinks": {
+      "enabled": true,
+      "attention_sink_size": 4,
+      "attention_sink_window_size": 1020,
+      "inject_to_text_encoder": true,
+      "inject_to_text_decoder": true,
+      "position_shift_enabled": true,
+      "cache_compression": true,
+      "adaptive_window_size": false,
+      "memory_efficient_attention": true,
+      "preserve_episodic_memory": false,
+      "preserve_quantization": true,
+      "preserve_cross_modal_fusion": true
+    }
+  }
+}