estebancarlin committed on
Commit
a6d185f
·
verified ·
1 Parent(s): 31197ab

Upload 4 files

Browse files
Files changed (4) hide show
  1. config.json +16 -0
  2. modeling_bitmar.py +1136 -0
  3. pytorch_model.bin +3 -0
  4. training_metadata.json +223 -0
config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BitMarModel"
4
+ ],
5
+ "model_type": "bitmar",
6
+ "vocab_size": 50257,
7
+ "text_encoder_dim": 128,
8
+ "text_encoder_layers": 4,
9
+ "text_encoder_heads": 4,
10
+ "vision_encoder_dim": 768,
11
+ "vision_latent_size": 128,
12
+ "fusion_hidden_size": 128,
13
+ "max_seq_len": 256,
14
+ "dropout": 0.15,
15
+ "torch_dtype": "float32"
16
+ }
modeling_bitmar.py ADDED
@@ -0,0 +1,1136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BitMar Model for Hugging Face Transformers
3
+ BitNet-quantized Vision-Language Episodic Memory Transformer
4
+ """
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import logging
9
+ import math
10
+ import os
11
+ import pickle
12
+ import gzip
13
+ from typing import Dict, List, Optional, Tuple, Union
14
+ from transformers import PreTrainedModel, PretrainedConfig
15
+ from transformers.modeling_outputs import CausalLMOutput, BaseModelOutput
16
+ import time
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class BitMarConfig(PretrainedConfig):
    """Hyper-parameter container for the BitMar model.

    The three special-token ids and any extra ``kwargs`` are forwarded to
    ``PretrainedConfig.__init__``; every other constructor argument is stored
    unchanged as an attribute of the same name.

    Several fields (``vision_compression_method``, ``vision_spatial_pooling``,
    ``vision_pool_size``, ``memory_alpha``, ``direct_writing``,
    ``memory_compression``, ``layer_norm_epsilon``) are only stored here; no
    code visible in this part of the file reads them.
    """

    model_type = "bitmar"

    def __init__(
        self,
        vocab_size: int = 50257,
        text_encoder_dim: int = 128,
        text_encoder_layers: int = 4,
        text_encoder_heads: int = 4,
        text_decoder_dim: int = 128,
        text_decoder_layers: int = 4,
        text_decoder_heads: int = 4,
        vision_encoder_dim: int = 768,
        vision_latent_size: int = 128,
        vision_hidden_size: int = 64,
        vision_compression_method: str = "learned_compression",
        vision_spatial_pooling: bool = True,
        vision_pool_size: int = 2,
        fusion_hidden_size: int = 128,
        fusion_num_heads: int = 4,
        fusion_num_layers: int = 2,
        memory_alpha: float = 0.2,
        direct_writing: bool = True,
        memory_compression: bool = True,
        max_seq_len: int = 256,
        dropout: float = 0.15,
        initializer_range: float = 0.02,
        layer_norm_epsilon: float = 1e-5,
        use_cache: bool = True,
        tie_word_embeddings: bool = True,
        pad_token_id: int = 50256,
        bos_token_id: int = 50256,
        eos_token_id: int = 50256,
        **kwargs
    ):
        # The base class owns the special-token ids and any unknown keys.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs
        )

        # --- vocabulary / text encoder -------------------------------------
        self.vocab_size = vocab_size
        self.text_encoder_dim = text_encoder_dim
        self.text_encoder_layers = text_encoder_layers
        self.text_encoder_heads = text_encoder_heads

        # --- text decoder ---------------------------------------------------
        self.text_decoder_dim = text_decoder_dim
        self.text_decoder_layers = text_decoder_layers
        self.text_decoder_heads = text_decoder_heads

        # --- vision branch --------------------------------------------------
        self.vision_encoder_dim = vision_encoder_dim
        self.vision_latent_size = vision_latent_size
        self.vision_hidden_size = vision_hidden_size
        self.vision_compression_method = vision_compression_method
        self.vision_spatial_pooling = vision_spatial_pooling
        self.vision_pool_size = vision_pool_size

        # --- cross-modal fusion ---------------------------------------------
        self.fusion_hidden_size = fusion_hidden_size
        self.fusion_num_heads = fusion_num_heads
        self.fusion_num_layers = fusion_num_layers

        # --- memory-related switches ----------------------------------------
        self.memory_alpha = memory_alpha
        self.direct_writing = direct_writing
        self.memory_compression = memory_compression

        # --- general settings -----------------------------------------------
        self.max_seq_len = max_seq_len
        self.dropout = dropout
        self.initializer_range = initializer_range
        self.layer_norm_epsilon = layer_norm_epsilon
        # Assigned after the base-class constructor has run, so these values
        # replace whatever the base class set for the same attribute names.
        self.use_cache = use_cache
        self.tie_word_embeddings = tie_word_embeddings
90
+
91
+
92
class BitNetLinear(nn.Module):
    """Linear layer with ternary ({-1, 0, +1}) weights in the BitNet b1.58 style.

    A full-precision ``weight`` parameter is kept for optimisation and is
    re-quantised on every forward pass. In training mode the quantised weights
    are used in the forward computation while a straight-through estimator
    routes gradients to the full-precision weights. In eval mode the input is
    additionally passed through 8-bit fake quantisation.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        """
        Args:
            in_features: size of the last dimension of the input.
            out_features: size of the last dimension of the output.
            bias: when False no bias parameter is created (``self.bias`` is None).
        """
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Full-precision master weights; the ternary form is derived on the fly.
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = nn.Parameter(torch.zeros(out_features)) if bias else None

        # Most recent quantisation scales. Registered as buffers, so they are
        # part of the module's state dict.
        self.register_buffer('weight_scale', torch.tensor(1.0))
        self.register_buffer('input_scale', torch.tensor(1.0))

    def quantize_weights_1_58_bit(self, weight: torch.Tensor) -> torch.Tensor:
        """Map ``weight`` to a tensor holding only -1, 0 and +1.

        The scale is the mean absolute weight (clamped to [1e-5, 1e3]) and is
        stored in ``self.weight_scale`` as a side effect. The returned tensor is
        NOT multiplied by the scale; callers do that themselves.

        Returns:
            A tensor shaped like ``weight``; an empty input is returned as is.
        """
        if weight.numel() == 0:
            return weight

        scale = weight.abs().mean()

        # All-zero weights: fall back to a tiny constant scale.
        if scale < 1e-8:
            scale = torch.tensor(1e-5, device=weight.device, dtype=weight.dtype)

        self.weight_scale.data = scale.clamp(min=1e-5, max=1e3)

        # Scale-normalised weights, limited to [-10, 10].
        weight_norm = torch.clamp(weight / self.weight_scale, min=-10.0, max=10.0)

        # Magnitudes at or below the threshold become 0.
        threshold = 2.0 / 3.0

        quantized = torch.zeros_like(weight_norm)
        quantized[weight_norm > threshold] = 1.0
        quantized[weight_norm < -threshold] = -1.0

        return quantized

    def quantize_activations_8bit(self, x: torch.Tensor) -> torch.Tensor:
        """Fake-quantise ``x`` to 256 levels (quantise, then dequantise).

        The per-call scale is stored in ``self.input_scale`` as a side effect.
        Empty tensors, single-element tensors and tensors whose values span
        less than 1e-8 are returned without quantisation (after clamping to
        [-1e6, 1e6] for the latter two).

        Fix: the quantisation range is widened so that it always contains 0.
        The zero-point is clamped to [0, 255], which is only valid when 0 lies
        inside [min, max]. Without the widening, an all-positive or
        all-negative input saturated every value to the same level. Inputs
        whose range already contains 0 are quantised exactly as before.
        """
        if x.numel() == 0:
            return x

        # Limit extreme values before computing the range.
        x_clamped = torch.clamp(x, min=-1e6, max=1e6)

        if x_clamped.numel() == 1:
            return x_clamped

        raw_min, raw_max = x_clamped.min(), x_clamped.max()

        # (Nearly) constant input: nothing to quantise, and avoids a zero scale.
        if raw_max - raw_min < 1e-8:
            return x_clamped

        # Extend the range to include 0 so the zero-point fits in [0, 255].
        x_min = raw_min.clamp(max=0)
        x_max = raw_max.clamp(min=0)

        scale = (x_max - x_min) / 255.0
        self.input_scale.data = scale.clamp(min=1e-8, max=1e3)

        # Affine quantisation to integer levels 0..255.
        zero_point = (-x_min / scale).round().clamp(0, 255)
        quantized = ((x_clamped / scale) + zero_point).round().clamp(0, 255)

        # Back to floating point.
        return scale * (quantized - zero_point)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the linear map with ternary weights rescaled by ``weight_scale``.

        Training: the added term ``weight - weight.detach()`` is numerically
        zero but carries the gradient to the full-precision weights
        (straight-through estimator). Eval: the input is also fake-quantised
        to 8 bits.
        """
        if self.training:
            weight_q = self.quantize_weights_1_58_bit(self.weight)
            weight_forward = weight_q * self.weight_scale

            # Straight-through estimator.
            weight_forward = weight_forward + (self.weight - self.weight.detach())

            return F.linear(x, weight_forward, self.bias)

        weight_q = self.quantize_weights_1_58_bit(self.weight) * self.weight_scale
        x_q = self.quantize_activations_8bit(x)
        return F.linear(x_q, weight_q, self.bias)
187
+
188
+
189
class BitNetMLP(nn.Module):
    """Two-layer feed-forward block built from ``BitNetLinear`` layers.

    The block applies its own residual connection and a LayerNorm on the way
    out: ``norm(fc2(dropout(gelu(fc1(x)))) + x)`` (with dropout also applied
    after ``fc2``).
    """

    def __init__(self, dim: int, hidden_dim: int, dropout: float = 0.1):
        """
        Args:
            dim: input and output feature size.
            hidden_dim: feature size between the two linear layers.
            dropout: dropout probability used after the activation and after ``fc2``.
        """
        super().__init__()
        self.fc1 = BitNetLinear(dim, hidden_dim)
        self.fc2 = BitNetLinear(hidden_dim, dim)
        self.activation = nn.GELU()
        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the normalised sum of the MLP output and its input."""
        hidden = self.dropout(self.activation(self.fc1(x)))
        projected = self.dropout(self.fc2(hidden))
        return self.norm(projected + x)
208
+
209
+
210
class BitNetAttention(nn.Module):
    """Multi-head scaled dot-product attention using ``BitNetLinear`` projections."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        dropout: float = 0.1,
        bias: bool = True
    ):
        """
        Args:
            dim: feature size of query/key/value; must be divisible by ``num_heads``.
            num_heads: number of attention heads.
            dropout: dropout probability applied to the attention weights.
            bias: whether the four projections carry a bias term.
        """
        super().__init__()
        assert dim % num_heads == 0

        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5

        self.q_proj = BitNetLinear(dim, dim, bias=bias)
        self.k_proj = BitNetLinear(dim, dim, bias=bias)
        self.v_proj = BitNetLinear(dim, dim, bias=bias)
        self.out_proj = BitNetLinear(dim, dim, bias=bias)

        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Attend from ``query`` positions to ``key``/``value`` positions.

        Args:
            query: ``[batch, seq_len, dim]``.
            key: ``[batch, key_seq_len, dim]``.
            value: viewed with the key's sequence length, so it must match ``key``.
            mask: optional; entries equal to 0 are blocked. 2-D masks are
                treated as ``[batch, keys]`` and 3-D masks as
                ``[batch, queries, keys]``; a 4-D mask must be expandable to
                ``[batch, heads, queries, keys]``.

        Returns:
            ``(output, weights)`` where ``output`` is ``[batch, seq_len, dim]``
            and ``weights`` is the attention averaged over heads,
            ``[batch, seq_len, key_seq_len]``.

        Raises:
            ValueError: if the last dimension of query, key or value is not ``dim``.

        Fix: a query row whose keys are all blocked used to produce NaN
        (softmax over a row of ``-inf``), which then spread through the
        network. Such rows now receive all-zero attention weights. The score
        tensor is also no longer modified in place.
        """
        batch_size, seq_len = query.shape[:2]

        if query.size(-1) != self.dim:
            raise ValueError(f"Query dimension {query.size(-1)} doesn't match expected {self.dim}")
        if key.size(-1) != self.dim:
            raise ValueError(f"Key dimension {key.size(-1)} doesn't match expected {self.dim}")
        if value.size(-1) != self.dim:
            raise ValueError(f"Value dimension {value.size(-1)} doesn't match expected {self.dim}")

        q = self.q_proj(query)
        k = self.k_proj(key)
        v = self.v_proj(value)

        key_seq_len = key.size(1)

        # [batch, seq, dim] -> [batch, heads, seq, head_dim]
        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, key_seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, key_seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        attention_scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale

        blocked = None
        if mask is not None:
            if mask.dim() == 2:
                mask = mask.unsqueeze(1).unsqueeze(1)  # [batch, 1, 1, keys]
            elif mask.dim() == 3:
                mask = mask.unsqueeze(1)  # [batch, 1, queries, keys]

            # A mask sized for the query length is padded with zeros (blocked)
            # or trimmed so that its last dimension equals the key length.
            if mask.size(-1) != key_seq_len and mask.size(-1) == seq_len:
                if key_seq_len > seq_len:
                    padding = torch.zeros(
                        *mask.shape[:-1], key_seq_len - seq_len,
                        device=mask.device, dtype=mask.dtype
                    )
                    mask = torch.cat([mask, padding], dim=-1)
                else:
                    mask = mask[..., :key_seq_len]

            mask = mask.expand(batch_size, self.num_heads, seq_len, key_seq_len)
            blocked = mask == 0
            attention_scores = attention_scores.masked_fill(blocked, float('-inf'))

        attention_weights = F.softmax(attention_scores, dim=-1)

        if blocked is not None:
            # softmax of an all -inf row is NaN; define it as "attend to nothing".
            fully_blocked = blocked.all(dim=-1, keepdim=True)
            attention_weights = attention_weights.masked_fill(fully_blocked, 0.0)

        attention_weights = self.dropout(attention_weights)

        attended = torch.matmul(attention_weights, v)

        # [batch, heads, seq, head_dim] -> [batch, seq, dim]
        attended = attended.transpose(1, 2).contiguous().view(
            batch_size, seq_len, self.dim
        )
        output = self.out_proj(attended)

        return output, attention_weights.mean(dim=1)
303
+
304
+
305
class BitNetTransformerBlock(nn.Module):
    """Pre-norm self-attention followed by a ``BitNetMLP``, each added residually."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        dropout: float = 0.1
    ):
        """
        Args:
            dim: feature size of the block.
            num_heads: attention heads passed to ``BitNetAttention``.
            mlp_ratio: the MLP hidden size is ``int(dim * mlp_ratio)``.
            dropout: dropout probability handed to attention and MLP.
        """
        super().__init__()

        self.norm1 = nn.LayerNorm(dim)
        self.attn = BitNetAttention(dim, num_heads, dropout)

        self.norm2 = nn.LayerNorm(dim)
        self.mlp = BitNetMLP(dim, int(dim * mlp_ratio), dropout)

    def forward(
        self,
        x: torch.Tensor,
        mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run one block.

        Returns:
            ``(hidden, attn_weights)``: the updated hidden states and the
            attention weights returned by the attention module.
        """
        # Self-attention over the normalised input, added back to the input.
        attn_input = self.norm1(x)
        attn_out, attn_weights = self.attn(attn_input, attn_input, attn_input, mask)
        hidden = x + attn_out

        # Feed-forward over the normalised hidden state, added back.
        hidden = hidden + self.mlp(self.norm2(hidden))

        return hidden, attn_weights
337
+
338
+
339
class BitNetTextEncoder(nn.Module):
    """Stack of ``BitNetTransformerBlock`` layers over token + position embeddings."""

    def __init__(
        self,
        vocab_size: int,
        dim: int,
        num_layers: int,
        num_heads: int,
        max_seq_len: int = 512,
        dropout: float = 0.1
    ):
        """
        Args:
            vocab_size: number of rows in the token embedding table.
            dim: embedding / hidden size.
            num_layers: number of transformer blocks.
            num_heads: attention heads per block.
            max_seq_len: number of rows in the position embedding table.
            dropout: dropout probability for embeddings and blocks.
        """
        super().__init__()
        self.dim = dim
        self.max_seq_len = max_seq_len

        # Embedding tables are ordinary nn.Embedding modules (not quantised).
        self.token_embedding = nn.Embedding(vocab_size, dim)
        self.position_embedding = nn.Embedding(max_seq_len, dim)

        self.layers = nn.ModuleList([
            BitNetTransformerBlock(dim, num_heads, dropout=dropout)
            for _ in range(num_layers)
        ])

        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(dim)

        for table in (self.token_embedding, self.position_embedding):
            nn.init.normal_(table.weight, std=0.02)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Encode a batch of token ids.

        Args:
            input_ids: 2-D tensor of token ids, ``[batch, seq_len]``.
            attention_mask: optional mask; it is reshaped with two singleton
                dimensions inserted after the batch dimension and handed to
                every block, where entries equal to 0 are blocked.

        Returns:
            ``(hidden, attention_patterns)``: the final layer-normalised hidden
            states and one attention-weight tensor per block.
        """
        _, seq_len = input_ids.shape

        position_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
        hidden = self.dropout(
            self.token_embedding(input_ids) + self.position_embedding(position_ids)
        )

        # The same mask serves every layer, so build it once:
        # [batch, seq_len] -> [batch, 1, 1, seq_len].
        layer_mask = None
        if attention_mask is not None:
            layer_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        attention_patterns = []
        for block in self.layers:
            hidden, block_weights = block(hidden, layer_mask)
            attention_patterns.append(block_weights)

        return self.norm(hidden), attention_patterns
400
+
401
+
402
class BitNetTextDecoder(nn.Module):
    """Causally masked transformer stack with a ``BitNetLinear`` vocabulary head."""

    def __init__(
        self,
        vocab_size: int,
        dim: int,
        num_layers: int,
        num_heads: int,
        max_seq_len: int = 512,
        dropout: float = 0.1
    ):
        """
        Args:
            vocab_size: size of the token embedding table and of the output logits.
            dim: embedding / hidden size.
            num_layers: number of transformer blocks.
            num_heads: attention heads per block.
            max_seq_len: rows in the position table and side length of the causal mask.
            dropout: dropout probability for embeddings and blocks.
        """
        super().__init__()
        self.dim = dim
        self.max_seq_len = max_seq_len

        self.token_embedding = nn.Embedding(vocab_size, dim)
        self.position_embedding = nn.Embedding(max_seq_len, dim)

        self.layers = nn.ModuleList([
            BitNetTransformerBlock(dim, num_heads, dropout=dropout)
            for _ in range(num_layers)
        ])

        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(dim)

        # Hidden state -> vocabulary logits (no bias).
        self.lm_head = BitNetLinear(dim, vocab_size, bias=False)

        for table in (self.token_embedding, self.position_embedding):
            nn.init.normal_(table.weight, std=0.02)

        # Lower-triangular ones, shaped [1, 1, max_seq_len, max_seq_len].
        lower_triangle = torch.tril(torch.ones(max_seq_len, max_seq_len))
        self.register_buffer('causal_mask', lower_triangle.unsqueeze(0).unsqueeze(0))

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None
    ) -> Dict[str, torch.Tensor]:
        """Decode from token ids or from precomputed embeddings.

        ``input_ids`` takes precedence when both inputs are given. Position
        embeddings are added in either case.

        Args:
            input_ids: 2-D tensor of token ids.
            inputs_embeds: embeddings used when ``input_ids`` is None; the
                first two dimensions are read as batch and sequence length.
            attention_mask: optional mask multiplied into the causal mask.
            labels: optional targets; when given, a next-token cross-entropy
                loss is computed with label value -100 ignored.

        Returns:
            Dict with ``'logits'``, ``'loss'`` (None without labels) and
            ``'attention_patterns'`` (one tensor per block).

        Raises:
            ValueError: if neither ``input_ids`` nor ``inputs_embeds`` is given.
        """
        if input_ids is not None:
            _, seq_len = input_ids.shape
            position_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
            hidden = self.token_embedding(input_ids) + self.position_embedding(position_ids)
        elif inputs_embeds is not None:
            _, seq_len = inputs_embeds.shape[:2]
            position_ids = torch.arange(seq_len, device=inputs_embeds.device).unsqueeze(0)
            hidden = inputs_embeds + self.position_embedding(position_ids)
        else:
            raise ValueError(
                "Either input_ids or inputs_embeds must be provided")

        hidden = self.dropout(hidden)

        # Causal mask cut to the current length, optionally combined with the
        # caller's mask by element-wise multiplication.
        mask = self.causal_mask[:, :, :seq_len, :seq_len]
        if attention_mask is not None:
            mask = attention_mask.unsqueeze(1).unsqueeze(2) * mask

        attention_patterns = []
        for block in self.layers:
            hidden, block_weights = block(hidden, mask)
            attention_patterns.append(block_weights)

        logits = self.lm_head(self.norm(hidden))

        loss = None
        if labels is not None:
            # Position t predicts token t + 1: drop the last logit and the first label.
            predictions = logits[..., :-1, :].contiguous()
            targets = labels[..., 1:].contiguous()
            loss = F.cross_entropy(
                predictions.view(-1, predictions.size(-1)),
                targets.view(-1),
                ignore_index=-100
            )

        return {
            'logits': logits,
            'loss': loss,
            'attention_patterns': attention_patterns
        }
503
+
504
+
505
class CrossModalFusion(nn.Module):
    """Fuses text token features with one vision vector via cross-attention."""

    def __init__(
        self,
        text_dim: int,
        vision_dim: int,
        hidden_dim: int,
        num_heads: int = 8,
        num_layers: int = 2
    ):
        """
        Args:
            text_dim: expected last dimension of the text features.
            vision_dim: expected last dimension of the vision features.
            hidden_dim: common size both modalities are projected to.
            num_heads: attention heads per cross-attention layer.
            num_layers: number of cross-attention layers (each with its own LayerNorm).
        """
        super().__init__()
        self.text_dim = text_dim
        self.vision_dim = vision_dim
        self.hidden_dim = hidden_dim

        # Per-modality projections into the shared space.
        self.text_proj = BitNetLinear(text_dim, hidden_dim)
        self.vision_proj = BitNetLinear(vision_dim, hidden_dim)

        self.cross_attention_layers = nn.ModuleList([
            BitNetAttention(
                dim=hidden_dim,
                num_heads=num_heads
            ) for _ in range(num_layers)
        ])

        self.layer_norms = nn.ModuleList([
            nn.LayerNorm(hidden_dim) for _ in range(num_layers)
        ])

        self.output_proj = BitNetLinear(hidden_dim, hidden_dim)

    def forward(
        self,
        text_features: torch.Tensor,
        vision_features: torch.Tensor
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        """
        Args:
            text_features: [batch_size, seq_len, text_dim]
            vision_features: [batch_size, vision_dim]

        Returns:
            fused_features: [batch_size, seq_len, hidden_dim]
            attention_weights: dict keyed ``'layer_<i>'`` holding the attention
                weights returned by each cross-attention layer

        Raises:
            ValueError: if the last dimension of either input differs from the
                size given at construction.
        """
        _batch, _seq = text_features.shape[:2]

        if text_features.size(-1) != self.text_dim:
            raise ValueError(f"Text features dimension {text_features.size(-1)} doesn't match expected {self.text_dim}")
        if vision_features.size(-1) != self.vision_dim:
            raise ValueError(f"Vision features dimension {vision_features.size(-1)} doesn't match expected {self.vision_dim}")

        fused = self.text_proj(text_features)
        # The vision vector becomes a length-1 key/value sequence.
        vision_token = self.vision_proj(vision_features).unsqueeze(1)

        attention_weights = {}
        layer_pairs = zip(self.cross_attention_layers, self.layer_norms)
        for index, (attend, normalise) in enumerate(layer_pairs):
            # Text positions query the vision token.
            attended, weights = attend(
                query=fused,
                key=vision_token,
                value=vision_token
            )
            fused = normalise(fused + attended)
            attention_weights[f'layer_{index}'] = weights

        return self.output_proj(fused), attention_weights
588
+
589
+
590
class VisionEncoder(nn.Module):
    """Small ``BitNetLinear`` MLP that compresses precomputed vision features.

    The original docstring describes the inputs as DiNOv2 features; nothing in
    the code depends on where they come from.
    """

    def __init__(
        self,
        input_dim: int = 768,
        hidden_dim: int = 512,
        output_dim: int = 768,
        num_layers: int = 2
    ):
        """
        Args:
            input_dim: feature size expected by the first layer.
            hidden_dim: feature size of every hidden layer.
            output_dim: feature size of the returned encoding.
            num_layers: number of hidden layers before the output projection.
        """
        super().__init__()

        # First layer maps input_dim -> hidden_dim, the rest hidden -> hidden.
        self.layers = nn.ModuleList([
            BitNetLinear(input_dim if i == 0 else hidden_dim, hidden_dim)
            for i in range(num_layers)
        ])

        self.output_proj = BitNetLinear(hidden_dim, output_dim)

        self.activation = nn.GELU()
        self.layer_norms = nn.ModuleList([
            nn.LayerNorm(hidden_dim) for _ in range(num_layers)
        ])
        self.dropout = nn.Dropout(0.1)

    def forward(self, vision_features: torch.Tensor) -> torch.Tensor:
        """
        Args:
            vision_features: [batch_size, input_dim]; inputs with more than two
                dimensions are flattened to [batch_size, -1] first, and any
                features beyond ``input_dim`` are dropped.

        Returns:
            encoded_features: [batch_size, output_dim]

        Raises:
            ValueError: if fewer than ``input_dim`` features are available.
        """
        if vision_features.dim() > 2:
            # Collapse everything after the batch dimension.
            vision_features = vision_features.view(vision_features.shape[0], -1)

        if vision_features.size(-1) > self.layers[0].in_features:
            # Too wide: keep only the leading features.
            vision_features = vision_features[:, :self.layers[0].in_features]
        elif vision_features.size(-1) < self.layers[0].in_features:
            raise ValueError(f"Vision features dimension {vision_features.size(-1)} is smaller than expected {self.layers[0].in_features}")

        hidden = vision_features
        for linear, normalise in zip(self.layers, self.layer_norms):
            hidden = self.dropout(self.activation(normalise(linear(hidden))))

        return self.output_proj(hidden)
652
+
653
+
654
+ class BitMarModel(PreTrainedModel):
655
+ """
656
+ BitMar: BitNet-quantized Vision-Language Episodic Memory Transformer
657
+ Compatible with Hugging Face Transformers
658
+ """
659
+
660
+ config_class = BitMarConfig
661
+ base_model_prefix = "bitmar"
662
+ supports_gradient_checkpointing = True
663
+ _no_split_modules = ["BitNetTransformerBlock"]
664
+
665
def __init__(self, config: BitMarConfig):
    """Build all sub-modules from ``config``.

    Optional training knobs (loss weights, adaptive scaling, encoder freezing
    windows) are read from ``config`` with ``getattr`` and fall back to the
    defaults below when the config does not define them.

    Fix: the tokenizer load is best effort, but it used a bare ``except:``
    which also swallowed ``KeyboardInterrupt``/``SystemExit`` and hid the
    reason for the failure. It now catches ``Exception`` only and logs the
    cause; ``self.tokenizer`` is still set to None on failure.
    """
    super().__init__(config)
    self.config = config

    # Weights of the individual loss terms.
    self.cross_modal_loss_weight = getattr(config, 'cross_modal_loss_weight', 0.1)
    self.text_loss_weight = getattr(config, 'text_loss_weight', 1.0)
    self.vision_loss_weight = getattr(config, 'vision_loss_weight', 0.1)
    self.memory_loss_weight = getattr(config, 'memory_loss_weight', 0.05)

    # Adaptive loss scaling.
    self.adaptive_loss_scaling = getattr(config, 'adaptive_loss_scaling', True)
    self.loss_scale_temperature = getattr(config, 'loss_scale_temperature', 0.07)

    # Number of initial training steps during which each encoder stays frozen.
    self.freeze_text_encoder_steps = getattr(config, 'freeze_text_encoder_steps', 0)
    self.freeze_vision_encoder_steps = getattr(config, 'freeze_vision_encoder_steps', 0)
    self.current_step = 0

    self.text_encoder = BitNetTextEncoder(
        vocab_size=config.vocab_size,
        dim=config.text_encoder_dim,
        num_layers=config.text_encoder_layers,
        num_heads=config.text_encoder_heads,
        max_seq_len=config.max_seq_len,
        dropout=config.dropout
    )

    self.text_decoder = BitNetTextDecoder(
        vocab_size=config.vocab_size,
        dim=config.text_decoder_dim,
        num_layers=config.text_decoder_layers,
        num_heads=config.text_decoder_heads,
        max_seq_len=config.max_seq_len,
        dropout=config.dropout
    )

    self.vision_encoder = VisionEncoder(
        input_dim=config.vision_encoder_dim,
        hidden_dim=config.vision_hidden_size,
        output_dim=config.vision_latent_size
    )

    self.fusion = CrossModalFusion(
        text_dim=config.text_encoder_dim,
        vision_dim=config.vision_latent_size,
        hidden_dim=config.fusion_hidden_size,
        num_heads=config.fusion_num_heads,
        num_layers=config.fusion_num_layers
    )

    # Maps fused features to the decoder's hidden size.
    self.decoder_input_proj = BitNetLinear(
        config.fusion_hidden_size,
        config.text_decoder_dim
    )

    # Best-effort GPT-2 tokenizer; the model is still constructed when the
    # tokenizer cannot be imported or loaded.
    try:
        from transformers import AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained('gpt2')
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
    except Exception as exc:
        logger.warning("Could not load the 'gpt2' tokenizer; continuing without one: %s", exc)
        self.tokenizer = None

    self.post_init()
735
+
736
def _init_weights(self, module):
    """Initialise one sub-module according to its type.

    * ``nn.Linear`` / ``BitNetLinear``: weights drawn from a normal
      distribution with std ``config.initializer_range``; bias zeroed.
    * ``nn.Embedding``: same normal init; the padding row, if any, zeroed.
    * ``nn.LayerNorm``: bias zeroed, weight set to one.

    Other module types are left untouched.
    """
    if isinstance(module, (nn.Linear, BitNetLinear)):
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        bias = getattr(module, 'bias', None)
        if bias is not None:
            bias.data.zero_()
        return

    if isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
        return

    if isinstance(module, nn.LayerNorm):
        bias = getattr(module, 'bias', None)
        if bias is not None:
            bias.data.zero_()
        module.weight.data.fill_(1.0)
750
+
751
def encode_text(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
    """Run ``self.text_encoder`` and return its ``(features, attention_patterns)`` pair."""
    encoded, patterns = self.text_encoder(
        input_ids=input_ids,
        attention_mask=attention_mask,
    )
    return encoded, patterns
756
+
757
def encode_vision(self, vision_features: torch.Tensor) -> torch.Tensor:
    """Return the output of ``self.vision_encoder`` for ``vision_features``."""
    return self.vision_encoder(vision_features)
761
+
762
def compute_cross_modal_contrastive_loss(
    self,
    text_features: torch.Tensor,
    vision_features: torch.Tensor,
    temperature: float = 0.07
) -> torch.Tensor:
    """Symmetric contrastive loss between paired text and vision features.

    Row ``i`` of ``text_features`` is treated as the positive match of row
    ``i`` of ``vision_features``. When the feature sizes differ, the wider
    input is truncated (by slicing, not by a learned projection) to the
    narrower size. Both inputs are L2-normalised before the similarity
    matrix is computed and divided by ``temperature``.

    Returns:
        The mean of the text->vision and vision->text cross-entropy losses.
    """
    num_pairs = text_features.shape[0]

    text_width = text_features.shape[-1]
    vision_width = vision_features.shape[-1]

    # Only the wider side is cut down; equal widths are left alone.
    if text_width > vision_width:
        text_features = text_features[:, :vision_width]
    elif vision_width > text_width:
        vision_features = vision_features[:, :text_width]

    text_unit = F.normalize(text_features, dim=-1)
    vision_unit = F.normalize(vision_features, dim=-1)

    # similarity[i, j] compares text i with vision j.
    similarity = torch.matmul(text_unit, vision_unit.T) / temperature

    # The matching pair sits on the diagonal.
    targets = torch.arange(num_pairs, device=similarity.device)

    loss_text_to_vision = F.cross_entropy(similarity, targets)
    loss_vision_to_text = F.cross_entropy(similarity.T, targets)

    return (loss_text_to_vision + loss_vision_to_text) / 2
801
+
802
def compute_vision_reconstruction_loss(
    self,
    original_vision: torch.Tensor,
    reconstructed_vision: torch.Tensor
) -> torch.Tensor:
    """Mean squared error between the reconstructed and the original vision features."""
    reconstruction_error = F.mse_loss(reconstructed_vision, original_vision)
    return reconstruction_error
809
+
810
def compute_balanced_loss(
    self,
    decoder_loss: torch.Tensor,
    cross_modal_loss: torch.Tensor,
    vision_loss: Optional[torch.Tensor] = None,
    step: int = 0,
    adaptive_controller=None
) -> Dict[str, torch.Tensor]:
    """Combine the individual losses into one weighted total.

    Args:
        decoder_loss: text loss, weighted by ``self.text_loss_weight``.
        cross_modal_loss: contrastive loss; its weight is optionally adapted
            and then ramped linearly from 0 to full over the first 50000 steps.
        vision_loss: optional extra term weighted by ``self.vision_loss_weight``.
        step: current training step, used for the ramp.
        adaptive_controller: accepted for interface compatibility; not used here.

    Returns:
        Dict with the input losses plus ``'total_loss'``,
        ``'cross_modal_weight'`` (the weight after the ramp) and
        ``'adaptive_weight'`` (a zero tensor when adaptive scaling is off).

    Fix: with adaptive scaling on and ``decoder_loss <= 1e-8`` the fallback
    weight was a plain Python float, and the following ``torch.clamp`` call
    raised ``TypeError``. The fallback is now a tensor, so the clamp works
    and both branches return the same type.
    """
    losses = {'decoder_loss': decoder_loss, 'cross_modal_loss': cross_modal_loss}

    if vision_loss is not None:
        losses['vision_loss'] = vision_loss

    if self.adaptive_loss_scaling:
        # The weight is a statistic of the losses, not something to
        # back-propagate through.
        with torch.no_grad():
            decoder_scale = decoder_loss.detach()
            cross_modal_scale = cross_modal_loss.detach()

            if decoder_scale > 1e-8:
                # Scale the contrastive term relative to the decoder loss.
                adaptive_cross_modal_weight = (
                    decoder_scale / cross_modal_scale.clamp(min=1e-8)
                ) * self.cross_modal_loss_weight
            else:
                # Decoder loss is (near) zero: use the configured weight,
                # kept as a tensor so torch.clamp below accepts it.
                adaptive_cross_modal_weight = torch.full_like(
                    decoder_scale, self.cross_modal_loss_weight
                )

            adaptive_cross_modal_weight = torch.clamp(adaptive_cross_modal_weight, 0.01, 1.0)
    else:
        adaptive_cross_modal_weight = self.cross_modal_loss_weight

    # Linear ramp: 0 at step 0, 1 from step 50000 onwards.
    cross_modal_schedule = min(1.0, step / 50000)
    scheduled_cross_modal_weight = adaptive_cross_modal_weight * cross_modal_schedule

    total_loss = (
        self.text_loss_weight * decoder_loss +
        scheduled_cross_modal_weight * cross_modal_loss
    )

    if vision_loss is not None:
        total_loss = total_loss + self.vision_loss_weight * vision_loss

    losses.update({
        'total_loss': total_loss,
        'cross_modal_weight': scheduled_cross_modal_weight,
        'adaptive_weight': adaptive_cross_modal_weight if self.adaptive_loss_scaling else torch.tensor(0.0)
    })

    return losses
862
+
863
def apply_encoder_freezing(self, step: int):
    """Toggle ``requires_grad`` on both encoders according to the training step.

    An encoder is frozen while ``step`` is strictly below its configured
    threshold (``self.freeze_text_encoder_steps`` /
    ``self.freeze_vision_encoder_steps``) and trainable otherwise. The step
    is also recorded on ``self.current_step``.

    Returns:
        Dict with the boolean flags ``text_encoder_frozen`` and
        ``vision_encoder_frozen``.
    """
    self.current_step = step

    # (status key, encoder attribute, threshold attribute) - text first, then vision
    schedule = (
        ('text_encoder_frozen', 'text_encoder', 'freeze_text_encoder_steps'),
        ('vision_encoder_frozen', 'vision_encoder', 'freeze_vision_encoder_steps'),
    )

    status = {}
    for status_key, encoder_attr, threshold_attr in schedule:
        is_frozen = step < getattr(self, threshold_attr)
        for weight in getattr(self, encoder_attr).parameters():
            weight.requires_grad = not is_frozen
        status[status_key] = is_frozen

    return status
881
+
882
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    vision_features: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    mode: str = "train",
    step: int = 0,
    has_vision: Optional[torch.Tensor] = None,
    **kwargs
) -> Union[Tuple, CausalLMOutput]:
    """
    Forward pass through BitMar model with mixed vision/text batch support

    Args:
        input_ids: Token ids, 2-D (batch, seq). Required; cast to long if needed.
        attention_mask: Optional mask; defaults to all ones and is cast to float.
        vision_features: Optional vision features. Missing features are replaced
            by zeros of width ``config.vision_encoder_dim``; inputs with more than
            two dims are flattened per sample, and the last dim is trimmed or
            zero-padded to the expected width.
        labels: Optional target ids forwarded to the text decoder.
        use_cache: Accepted for interface compatibility; not used here.
        output_attentions: If truthy, the text-encoder attention is returned.
        output_hidden_states: If truthy, the fused features are returned.
        return_dict: Return a ``CausalLMOutput`` instead of a tuple; defaults to
            ``config.use_return_dict``.
        mode: ``"train"`` enables encoder freezing and the balanced multi-objective
            loss; any other value skips both.
        step: Training step used for freezing and loss scheduling.
        has_vision: Boolean tensor [batch_size] indicating which samples have real vision features
        **kwargs: Ignored.

    Returns:
        ``CausalLMOutput`` (with loss components and freezing flags attached as
        attributes in train mode) or a tuple ``([loss,] logits[, fused][, attn])``.

    Raises:
        ValueError: If ``input_ids`` is None.
    """
    # Validate first: every later statement dereferences input_ids, so checking
    # after the dtype access would surface as AttributeError instead.
    if input_ids is None:
        raise ValueError("input_ids must be provided")

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # Ensure input_ids are integers
    if input_ids.dtype != torch.long:
        input_ids = input_ids.long()

    # Ensure labels are integers if provided
    if labels is not None and labels.dtype != torch.long:
        labels = labels.long()

    batch_size, seq_len = input_ids.shape

    # Handle missing attention mask
    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids, dtype=torch.float)

    # Ensure attention_mask is float
    if attention_mask.dtype != torch.float:
        attention_mask = attention_mask.float()

    # Handle missing vision features
    if vision_features is None:
        vision_features = torch.zeros(batch_size, self.config.vision_encoder_dim,
                                      device=input_ids.device, dtype=torch.float32)

    # Validate input tensor dimensions
    expected_vision_dim = self.config.vision_encoder_dim
    if vision_features.dim() != 2 or vision_features.size(-1) != expected_vision_dim:
        if vision_features.dim() > 2:
            # reshape (not view) so non-contiguous inputs are handled too
            vision_features = vision_features.reshape(batch_size, -1)
        if vision_features.size(-1) != expected_vision_dim:
            # Pad or trim to expected dimension
            if vision_features.size(-1) > expected_vision_dim:
                vision_features = vision_features[:, :expected_vision_dim]
            else:
                padding = expected_vision_dim - vision_features.size(-1)
                vision_features = F.pad(vision_features, (0, padding))

    # Default has_vision to all True if not provided (backward compatibility)
    if has_vision is None:
        has_vision = torch.ones(batch_size, dtype=torch.bool, device=input_ids.device)

    # Apply encoder freezing
    freezing_status = {}
    if mode == "train":
        freezing_status = self.apply_encoder_freezing(step)

    # Encode text (always available)
    text_features, text_attention = self.encode_text(input_ids, attention_mask)

    # Encode vision (with masking for text-only samples)
    vision_latent = self.encode_vision(vision_features)

    # Zero out the vision latent of text-only samples before fusion
    vision_mask = has_vision.float().unsqueeze(-1)
    vision_latent_masked = vision_latent * vision_mask

    # Cross-modal fusion (the cross-attention weights are not used further)
    fused_features, _cross_attention = self.fusion(text_features, vision_latent_masked)

    # Prepare decoder input
    decoder_input = self.decoder_input_proj(fused_features)

    # Generate text using BitNet decoder
    decoder_outputs = self.text_decoder(
        inputs_embeds=decoder_input,
        attention_mask=attention_mask,
        labels=labels
    )

    # Compute losses if in training mode
    final_loss = None
    loss_dict = {}

    if mode == "train" and labels is not None:
        # Primary decoder loss
        decoder_loss = decoder_outputs['loss']

        # Cross-modal contrastive loss (only for samples with vision)
        cross_modal_loss = torch.tensor(0.0, device=input_ids.device)
        if has_vision.any():
            vision_indices = has_vision.nonzero(as_tuple=True)[0]
            # Mean-pool text features over dim 1; the contrastive loss uses the
            # unmasked vision latent of the selected samples.
            text_pooled = text_features[vision_indices].mean(dim=1)
            vision_for_loss = vision_latent[vision_indices]
            cross_modal_loss = self.compute_cross_modal_contrastive_loss(
                text_pooled, vision_for_loss, temperature=self.loss_scale_temperature
            )

        # Optional additional losses
        vision_loss = None

        # Compute balanced loss
        loss_dict = self.compute_balanced_loss(
            decoder_loss, cross_modal_loss, vision_loss, step
        )

        final_loss = loss_dict['total_loss']
    elif decoder_outputs.get('loss') is not None:
        final_loss = decoder_outputs['loss']

    # Prepare outputs
    if return_dict:
        output = CausalLMOutput(
            loss=final_loss,
            logits=decoder_outputs['logits'],
            hidden_states=fused_features if output_hidden_states else None,
            attentions=text_attention if output_attentions else None,
        )

        # Add additional outputs for analysis
        if mode == "train":
            for key, value in loss_dict.items():
                setattr(output, key, value)
            for key, value in freezing_status.items():
                setattr(output, key, value)

        return output
    else:
        outputs = (decoder_outputs['logits'],)
        if final_loss is not None:
            outputs = (final_loss,) + outputs
        if output_hidden_states:
            outputs = outputs + (fused_features,)
        if output_attentions:
            outputs = outputs + (text_attention,)
        return outputs
1034
+
1035
def generate(
    self,
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    vision_features: Optional[torch.Tensor] = None,
    max_length: int = 100,
    temperature: float = 0.7,
    top_p: float = 0.9,
    do_sample: bool = True,
    **kwargs
) -> torch.LongTensor:
    """Generate text given input text and vision features

    Autoregressive loop that re-runs ``self.forward`` on the full sequence for
    every new token (no KV cache). Puts the model in eval mode as a side effect.

    Args:
        input_ids: Prompt token ids, (batch, seq).
        attention_mask: Optional prompt mask; defaults to all ones.
        vision_features: Optional vision features; zeros of width
            ``config.vision_encoder_dim`` are used when omitted.
        max_length: Maximum total length (prompt + generated tokens).
        temperature: Softmax temperature, used only when ``do_sample`` is True.
        top_p: Nucleus-sampling threshold; values >= 1.0 disable filtering.
        do_sample: Sample from the distribution if True, otherwise greedy argmax.
        **kwargs: Ignored.

    Returns:
        Tensor of prompt + generated ids. Rows that emitted EOS before the
        others are padded with the EOS id.

    Raises:
        ValueError: If ``do_sample`` is True and ``temperature`` is not positive.
    """
    if do_sample and temperature <= 0:
        # Dividing logits by a non-positive temperature yields inf/nan
        # probabilities; fail with a clear message instead.
        raise ValueError("temperature must be positive when do_sample=True")

    self.eval()

    batch_size = input_ids.size(0)
    device = input_ids.device

    # Handle missing vision features
    if vision_features is None:
        vision_features = torch.zeros(batch_size, self.config.vision_encoder_dim,
                                      device=device, dtype=torch.float32)

    # Handle attention mask
    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids)

    generated_ids = input_ids.clone()
    current_attention_mask = attention_mask.clone()

    # The config may not define an EOS id; in that case run to max_length.
    eos_token_id = getattr(self.config, "eos_token_id", None)
    # Per-row flag: True once that row has produced EOS.
    finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

    with torch.no_grad():
        for _ in range(max_length - input_ids.size(1)):
            # Get model outputs
            outputs = self.forward(
                input_ids=generated_ids,
                attention_mask=current_attention_mask,
                vision_features=vision_features,
                mode="inference",
                return_dict=True
            )

            # Logits for the last position only
            next_token_logits = outputs.logits[:, -1, :]

            if do_sample:
                next_token_logits = next_token_logits / temperature

                # Apply top-p sampling
                if top_p < 1.0:
                    sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                    # Remove tokens with cumulative probability above the threshold;
                    # shift right so the first token crossing it is kept, and always
                    # keep the most likely token.
                    sorted_indices_to_remove = cumulative_probs > top_p
                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                    sorted_indices_to_remove[..., 0] = 0

                    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
                    next_token_logits = next_token_logits.masked_fill(indices_to_remove, float('-inf'))

                # Sample from the filtered distribution
                probs = F.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                # Greedy decoding (temperature does not change the argmax)
                next_token = next_token_logits.argmax(dim=-1, keepdim=True)

            if eos_token_id is not None:
                # Rows that already finished keep emitting EOS as padding.
                next_token = torch.where(
                    finished.unsqueeze(-1),
                    torch.full_like(next_token, eos_token_id),
                    next_token,
                )

            # Append to generated sequence
            generated_ids = torch.cat([generated_ids, next_token], dim=-1)

            # Extend the attention mask, keeping its dtype
            current_attention_mask = torch.cat([
                current_attention_mask,
                torch.ones(batch_size, 1, device=device, dtype=current_attention_mask.dtype)
            ], dim=-1)

            # Stop once every row has produced EOS (not necessarily at the same step)
            if eos_token_id is not None:
                finished = finished | (next_token.squeeze(-1) == eos_token_id)
                if finished.all():
                    break

    return generated_ids
1113
+
1114
def prepare_inputs_for_generation(
    self,
    input_ids,
    past_key_values=None,
    attention_mask=None,
    vision_features=None,
    **kwargs
):
    """Bundle the keyword arguments for one generation step.

    ``past_key_values`` is accepted but not forwarded. ``use_cache`` is taken
    from ``kwargs`` and defaults to True.

    Returns:
        Dict with keys ``input_ids``, ``attention_mask``, ``vision_features``
        and ``use_cache``.
    """
    model_inputs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        vision_features=vision_features,
    )
    model_inputs["use_cache"] = kwargs.get("use_cache", True)
    return model_inputs
1129
+
1130
+
1131
# Expose the custom config/model through the transformers Auto* factories
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

# The config type is registered first, then the same model class is
# attached to both the generic and the causal-LM factory.
AutoConfig.register("bitmar", BitMarConfig)
for _auto_factory in (AutoModel, AutoModelForCausalLM):
    _auto_factory.register(BitMarConfig, BitMarModel)
del _auto_factory
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ed1baf7f77db0a118159e6a850c9b1089cfc20f5cb8a329fb50d1195bcf70e0
3
+ size 85226595
training_metadata.json ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9,
3
+ "global_step": 165810,
4
+ "tokens_processed": 996822486,
5
+ "target_tokens": 100000000,
6
+ "best_similarity": 0.33421099185943604,
7
+ "training_config": {
8
+ "model": {
9
+ "vocab_size": 50257,
10
+ "text_encoder_dim": 128,
11
+ "text_encoder_layers": 4,
12
+ "text_encoder_heads": 4,
13
+ "text_decoder_dim": 128,
14
+ "text_decoder_layers": 4,
15
+ "text_decoder_heads": 4,
16
+ "vision_encoder_dim": 768,
17
+ "vision_latent_size": 128,
18
+ "vision_hidden_size": 64,
19
+ "vision_compression_method": "learned_compression",
20
+ "vision_spatial_pooling": true,
21
+ "vision_pool_size": 2,
22
+ "fusion_hidden_size": 128,
23
+ "fusion_num_heads": 4,
24
+ "fusion_num_layers": 2,
25
+ "max_seq_len": 256,
26
+ "dropout": 0.15
27
+ },
28
+ "token_constraints": {
29
+ "total_tokens": 100000000,
30
+ "caption_tokens": 50000000,
31
+ "text_tokens": 50000000,
32
+ "enforce_exact_count": true,
33
+ "uniform_sampling": true,
34
+ "alignment_priority": "perfect_alignment",
35
+ "preserve_image_caption_pairs": true,
36
+ "strict_alignment_validation": true
37
+ },
38
+ "vision_feature_reduction": {
39
+ "enabled": true,
40
+ "method": "learned_compression",
41
+ "target_dim": 64,
42
+ "spatial_pooling": true,
43
+ "pool_method": "attention",
44
+ "hidden_dim": 128,
45
+ "learnable": true,
46
+ "preserve_spatial_info": true
47
+ },
48
+ "data": {
49
+ "dataset_dir": "../babylm_dataset",
50
+ "text_encoder_name": "gpt2",
51
+ "max_seq_length": 256,
52
+ "count_tokens": true,
53
+ "target_caption_tokens": 50000000,
54
+ "target_text_tokens": 50000000,
55
+ "token_counting_method": "gpt2",
56
+ "batch_size": 384,
57
+ "num_workers": 10,
58
+ "pin_memory": true,
59
+ "persistent_workers": true,
60
+ "mix_ratio": 0.5,
61
+ "shuffle_datasets": true,
62
+ "ensure_alignment": true,
63
+ "validate_alignment": true,
64
+ "alignment_verification": "strict",
65
+ "never_break_pairs": true,
66
+ "alignment_check_frequency": 1000,
67
+ "use_validation": false,
68
+ "train_only": true
69
+ },
70
+ "attention_analysis": {
71
+ "track_top_k": 5,
72
+ "log_every_n_steps": 200,
73
+ "viz_every_n_epochs": 3,
74
+ "save_head_patterns": true,
75
+ "analyze_memory_attention": false,
76
+ "analyze_cross_modal": true,
77
+ "track_token_alignment": true
78
+ },
79
+ "adaptive_training": {
80
+ "enabled": true,
81
+ "similarity_window_size": 200,
82
+ "drop_threshold": 0.12,
83
+ "min_steps_between_interventions": 800,
84
+ "freeze_duration_steps": 1500,
85
+ "loss_rebalance_factor": 2.0,
86
+ "similarity_smoothing_alpha": 0.15
87
+ },
88
+ "training": {
89
+ "max_epochs": 10,
90
+ "accumulate_grad_batches": 2,
91
+ "gradient_clip_val": 0.3,
92
+ "val_check_interval": 1000,
93
+ "scheduler": {
94
+ "T_0": 1000,
95
+ "T_mult": 2,
96
+ "eta_min_ratio": 0.1
97
+ },
98
+ "min_lr": 5e-05,
99
+ "warmup_steps": 1000,
100
+ "learning_rate": 0.0002,
101
+ "weight_decay": 0.02,
102
+ "optimizer": "adamw8bit",
103
+ "cross_modal_loss_weight": 1.5,
104
+ "text_generation_loss_weight": 1.0,
105
+ "alignment_consistency_weight": 0.5,
106
+ "track_token_usage": true,
107
+ "log_token_progress": true,
108
+ "stop_at_token_limit": false,
109
+ "validate_alignment_every_n_steps": 500,
110
+ "log_alignment_metrics": true,
111
+ "alignment_loss_scaling": "adaptive"
112
+ },
113
+ "wandb": {
114
+ "project": "bitmar-no-memory",
115
+ "entity": "babylm-ntust",
116
+ "api_key": null,
117
+ "log_every_n_steps": 100,
118
+ "log_attention": true,
119
+ "log_memory": false,
120
+ "log_gradients": true,
121
+ "log_token_usage": true,
122
+ "log_cross_modal_similarity": true,
123
+ "log_alignment_quality": true,
124
+ "log_caption_image_matching": true,
125
+ "save_code": true,
126
+ "create_plots": true,
127
+ "plot_attention_heatmaps": false,
128
+ "plot_memory_usage": false,
129
+ "plot_token_distribution": true,
130
+ "plot_alignment_metrics": true
131
+ },
132
+ "evaluation": {
133
+ "metrics": [
134
+ "bleu",
135
+ "rouge",
136
+ "cross_modal_similarity"
137
+ ],
138
+ "generate_samples": true,
139
+ "num_samples": 20,
140
+ "max_generation_length": 32,
141
+ "temperature": 0.8,
142
+ "top_p": 0.9,
143
+ "evaluate_alignment": true,
144
+ "alignment_metrics": [
145
+ "cosine_similarity",
146
+ "retrieval_accuracy",
147
+ "caption_image_matching",
148
+ "cross_modal_retrieval"
149
+ ],
150
+ "alignment_threshold": 0.8,
151
+ "validate_pairs_during_eval": true
152
+ },
153
+ "output": {
154
+ "checkpoint_dir": "checkpoints_100M_dataset",
155
+ "log_dir": "logs_100M_dataset",
156
+ "attention_dir": "attention_100M_dataset",
157
+ "results_dir": "results_100M_dataset",
158
+ "token_logs_dir": "token_logs_100M_dataset"
159
+ },
160
+ "performance_targets": {
161
+ "max_model_size_mb": 50,
162
+ "target_cross_modal_similarity": 0.75,
163
+ "target_text_generation_quality": 0.6
164
+ },
165
+ "flops_tracking": {
166
+ "enabled": true,
167
+ "log_frequency": 100,
168
+ "save_statistics": true,
169
+ "estimate_theoretical": true,
170
+ "track_peak_performance": true,
171
+ "log_to_wandb": true,
172
+ "detailed_breakdown": true,
173
+ "memory_bandwidth_tracking": false,
174
+ "efficiency_analysis": true,
175
+ "track_components": [
176
+ "attention",
177
+ "feedforward",
178
+ "layer_norm",
179
+ "embeddings",
180
+ "vision_encoder",
181
+ "cross_modal_fusion"
182
+ ]
183
+ },
184
+ "token_tracking": {
185
+ "log_frequency": 1000,
186
+ "save_token_distribution": true,
187
+ "monitor_caption_text_ratio": true,
188
+ "enforce_token_limits": false,
189
+ "early_stopping_on_limit": false,
190
+ "track_alignment_quality": true,
191
+ "log_misaligned_samples": true,
192
+ "alignment_quality_threshold": 0.7,
193
+ "save_alignment_statistics": true,
194
+ "correlate_flops_with_tokens": true,
195
+ "log_computational_efficiency": true,
196
+ "track_throughput_vs_quality": true
197
+ },
198
+ "huggingface_hub": {
199
+ "enabled": true,
200
+ "repo_id": "estebancarlin/bitmar-no-memory",
201
+ "private": true,
202
+ "upload_after_epoch": true,
203
+ "upload_final_model": true,
204
+ "commit_message_template": "BitMar 100M tokens (no memory) - Epoch {epoch} - {tokens_processed:,} tokens processed",
205
+ "create_model_card": true,
206
+ "model_card_template": "---\nlanguage: en\nlicense: mit\ntags:\n- bitmar\n- multimodal\n- babylm\n- cross-modal\n- no-memory\ndatasets:\n- babylm_multimodal\nmetrics:\n- bleu\n- cross_modal_similarity\n---\n\n# BitMar 100M Token Model (No Episodic Memory)\n\nThis model was trained on exactly 100 million tokens as part of the BabyLM challenge without episodic memory.\n\n## Training Details\n- Total tokens: 100,000,000\n- Epochs completed: {epoch}\n- Tokens processed: {tokens_processed:,}\n- Cross-modal similarity: {best_similarity:.4f}\n- Episodic memory: Disabled\n\n## Model Architecture\n- Text encoder: {text_encoder_layers} layers, {text_encoder_dim} hidden size\n- Vision encoder: DiNOv2 features compressed to {vision_latent_size}\n- Episodic memory: Disabled for comparison study\n\n## Usage\n```python\nfrom transformers import AutoModel, AutoTokenizer\n\nmodel = AutoModel.from_pretrained(\"{repo_id}\")\ntokenizer = AutoTokenizer.from_pretrained(\"{repo_id}\")\n```\n"
207
+ },
208
+ "attention_sinks": {
209
+ "enabled": true,
210
+ "attention_sink_size": 4,
211
+ "attention_sink_window_size": 1020,
212
+ "inject_to_text_encoder": true,
213
+ "inject_to_text_decoder": true,
214
+ "position_shift_enabled": true,
215
+ "cache_compression": true,
216
+ "adaptive_window_size": false,
217
+ "memory_efficient_attention": true,
218
+ "preserve_episodic_memory": false,
219
+ "preserve_quantization": true,
220
+ "preserve_cross_modal_fusion": true
221
+ }
222
+ }
223
+ }