haznitrama committed
Commit 23b3b53 · verified · Parent: 0efd819

Add main weights for eng

README.md ADDED
@@ -0,0 +1,61 @@
1
+ ---
2
+ library_name: transformers
3
+ pipeline_tag: text-generation
4
+ tags: [gpt-bert, babylm, remote-code]
5
+ license: other
6
+ ---
7
+ # haznitrama/babybabellm-multi_gpu-gpt_bert-eng-main-causal
8
+
9
+ A GPT-BERT-style BabyBabyLLM model for language **eng** (English).
10
+
11
+ This repository may include both *main* and *EMA* variants.
12
+
13
+ **Default variant exposed to generic loaders:** `main`
14
+
15
+ ## Variants Available
16
+ - main
17
+
18
+ ## Files
19
+ - model.safetensors (alias of default variant)
20
+
21
+ ## Configuration
22
+ ```json
23
+ {
24
+ "attention_probs_dropout_prob": 0.1,
25
+ "hidden_dropout_prob": 0.1,
26
+ "hidden_size": 768,
27
+ "intermediate_size": 2560,
28
+ "max_position_embeddings": 512,
29
+ "position_bucket_size": 32,
30
+ "num_attention_heads": 12,
31
+ "num_hidden_layers": 12,
32
+ "vocab_size": 16384,
33
+ "layer_norm_eps": 1e-05
34
+ }
35
+ ```
36
+ Tokenizer file: `tokenizer_eng.json`
37
+
38
+ ## Quick Usage
39
+ ```python
40
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
41
+ model_id = 'haznitrama/babybabellm-multi_gpu-gpt_bert-eng-main-causal'
42
+ tok = AutoTokenizer.from_pretrained(model_id)
43
+ model = AutoModelForMaskedLM.from_pretrained(model_id, trust_remote_code=True)
44
+ out = model(**tok('Hello world', return_tensors='pt'))
45
+ ```
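Since the default head is a masked LM, a fill-mask round trip can be a more informative smoke test than a bare forward pass. A minimal sketch, assuming the tokenizer's `<mask>` token (listed in `special_tokens_map.json`) and that the remote-code wrapper returns standard `MaskedLMOutput` logits:

```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

model_id = 'haznitrama/babybabellm-multi_gpu-gpt_bert-eng-main-causal'
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id, trust_remote_code=True)

# Replace one position with the mask token and predict it.
text = f"The capital of France is {tok.mask_token}."
inputs = tok(text, return_tensors='pt')
with torch.no_grad():
    logits = model(input_ids=inputs['input_ids'],
                   attention_mask=inputs['attention_mask']).logits  # (1, seq_len, vocab_size)
mask_pos = (inputs['input_ids'] == tok.mask_token_id).nonzero(as_tuple=True)[1]
print(tok.decode(logits[0, mask_pos].argmax(-1)))  # top-1 prediction for the masked slot
```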
46
+
47
+ ### Causal LM Wrapper
48
+ This repo includes a lightweight `GPTBertForCausalLM` wrapper.
49
+ Generation example:
50
+ ```python
51
+ from transformers import AutoTokenizer, AutoModelForCausalLM
52
+ mid = 'haznitrama/babybabellm-multi_gpu-gpt_bert-eng-main-causal'
53
+ tok = AutoTokenizer.from_pretrained(mid)
54
+ model = AutoModelForCausalLM.from_pretrained(mid, trust_remote_code=True)
55
+ print(tok.decode(model.generate(**tok('Hello', return_tensors='pt'), max_new_tokens=20)[0], skip_special_tokens=True))
56
+ ```
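The card's metadata declares `pipeline_tag: text-generation`, so the same wrapper should also be reachable through the high-level `pipeline` API; a sketch under that assumption (not verified against this repo):

```python
from transformers import pipeline

generator = pipeline(
    'text-generation',
    model='haznitrama/babybabellm-multi_gpu-gpt_bert-eng-main-causal',
    trust_remote_code=True,
)
print(generator('Hello', max_new_tokens=20)[0]['generated_text'])
```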
57
+
58
+ ## Notes
59
+ - Converted on 2025-09-27T15:21:53.977598+00:00
60
+ - Weights are the exact trained parameters; no new layers were initialized.
61
+ - Requires `trust_remote_code=True` due to custom architecture.
config.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "architectures": [
3
+ "GPTBertForMaskedLM",
4
+ "GPTBertForCausalLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_gpt_bert.GPTBertConfig",
9
+ "AutoModel": "modeling_gpt_bert.GPTBertForCausalLM",
10
+ "AutoModelForCausalLM": "modeling_gpt_bert.GPTBertForCausalLM",
11
+ "AutoModelForMaskedLM": "modeling_gpt_bert.GPTBertForMaskedLM"
12
+ },
13
+ "bos_token_id": 1,
14
+ "eos_token_id": 2,
15
+ "hidden_dropout_prob": 0.1,
16
+ "hidden_size": 768,
17
+ "intermediate_size": 2560,
18
+ "layer_norm_eps": 1e-05,
19
+ "mask_token_id": 4,
20
+ "max_position_embeddings": 512,
21
+ "model_type": "gpt_bert",
22
+ "num_attention_heads": 12,
23
+ "num_hidden_layers": 12,
24
+ "pad_token_id": 3,
25
+ "position_bucket_size": 32,
26
+ "vocab_size": 16384
27
+ }
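The `auto_map` block above is what routes the generic `Auto*` loaders to the custom classes shipped in this repo. A quick sanity check of that wiring (a sketch; requires network access and `trust_remote_code=True`):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained(
    'haznitrama/babybabellm-multi_gpu-gpt_bert-eng-main-causal',
    trust_remote_code=True,
)
print(type(cfg).__name__)                                      # GPTBertConfig
print(cfg.hidden_size, cfg.num_hidden_layers, cfg.vocab_size)  # 768 12 16384
```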
configuration_gpt_bert.py ADDED
@@ -0,0 +1,22 @@
1
+ from transformers import PretrainedConfig
2
+
3
+ class GPTBertConfig(PretrainedConfig):
4
+ model_type = 'gpt_bert'
5
+ def __init__(self, **kwargs):
6
+ self.attention_probs_dropout_prob = kwargs.pop('attention_probs_dropout_prob', 0.1)
7
+ self.hidden_dropout_prob = kwargs.pop('hidden_dropout_prob', 0.1)
8
+ self.hidden_size = kwargs.pop('hidden_size', 768)
9
+ self.intermediate_size = kwargs.pop('intermediate_size', 2560)
10
+ self.max_position_embeddings = kwargs.pop('max_position_embeddings', 512)
11
+ self.position_bucket_size = kwargs.pop('position_bucket_size', 32)
12
+ self.num_attention_heads = kwargs.pop('num_attention_heads', 12)
13
+ self.num_hidden_layers = kwargs.pop('num_hidden_layers', 12)
14
+ self.vocab_size = kwargs.pop('vocab_size', 16384)
15
+ self.layer_norm_eps = kwargs.pop('layer_norm_eps', 1e-5)
16
+ self.auto_map = {
17
+ 'AutoConfig': 'configuration_gpt_bert.GPTBertConfig',
18
+ 'AutoModel': 'modeling_gpt_bert.GPTBertForCausalLM',
19
+ 'AutoModelForCausalLM': 'modeling_gpt_bert.GPTBertForCausalLM',
20
+ 'AutoModelForMaskedLM': 'modeling_gpt_bert.GPTBertForMaskedLM',
21
+ }
22
+ super().__init__(**kwargs)
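Because every field in `__init__` falls back to a default, the config can also be built offline. A minimal sketch, assuming `configuration_gpt_bert.py` has been downloaded next to the script (the class has no dependencies beyond `transformers`):

```python
from configuration_gpt_bert import GPTBertConfig

cfg = GPTBertConfig()                        # all defaults
small = GPTBertConfig(num_hidden_layers=6)   # any field can be overridden via kwargs
print(cfg.hidden_size, cfg.num_hidden_layers, small.num_hidden_layers)  # 768 12 6
```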
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e7ce858c260e48aac28f32d49a28061458f1974dea791677ae05d1be60aa184
3
+ size 553331552
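`model.safetensors` itself lives in Git LFS; this pointer only records its SHA-256 and size. A sketch for fetching the weights and checking them against that oid (assumes the `huggingface_hub` package is installed):

```python
import hashlib
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id='haznitrama/babybabellm-multi_gpu-gpt_bert-eng-main-causal',
    filename='model.safetensors',
)
digest = hashlib.sha256()
with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):  # hash in 1 MiB chunks
        digest.update(chunk)
print(digest.hexdigest() == '2e7ce858c260e48aac28f32d49a28061458f1974dea791677ae05d1be60aa184')
```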
modeling_gpt_bert.py ADDED
@@ -0,0 +1,448 @@
1
+ # Original training architecture (verbatim)
2
+ import math
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from torch import _softmax_backward_data as _softmax_backward_data
8
+
9
+
10
+ class Bert(nn.Module):
11
+ def __init__(self, config, activation_checkpointing=False):
12
+ super().__init__()
13
+ self.embedding = Embedding(config)
14
+ self.transformer = Encoder(config, activation_checkpointing)
15
+ self.classifier = MaskClassifier(config, self.embedding.word_embedding.weight)
16
+
17
+ def get_contextualized(self, input_ids, attention_mask):
18
+ static_embeddings, relative_embedding = self.embedding(input_ids)
19
+ contextualized_embeddings = self.transformer(static_embeddings, attention_mask.unsqueeze(1), relative_embedding)
20
+ return contextualized_embeddings
21
+
22
+ def forward(self, input_ids, attention_mask, masked_lm_labels, num_masked=None, ratio=None):
23
+ contextualized_embeddings = self.get_contextualized(input_ids, attention_mask)
24
+
25
+ if num_masked is None:
26
+ subword_prediction = self.classifier(contextualized_embeddings, masked_lm_labels, num_masked)
27
+
28
+ gold_labels = masked_lm_labels.flatten()
29
+ gold_labels = gold_labels[gold_labels != -100]
30
+
31
+ loss = F.cross_entropy(subword_prediction, gold_labels, reduction="none").mean()
32
+ z_loss = torch.logsumexp(subword_prediction, dim=-1).pow(2).mean()
33
+
34
+ with torch.no_grad():
35
+ accuracy = (subword_prediction.argmax(-1) == gold_labels).float().mean()
36
+
37
+ num_tokens = gold_labels.size(0)
38
+
39
+ return loss, accuracy, z_loss, num_tokens
40
+ else:
41
+ masked_subword_prediction, causal_subword_prediction = self.classifier(contextualized_embeddings, masked_lm_labels, num_masked)
42
+
43
+ if masked_subword_prediction is not None:
44
+ masked_gold_labels = masked_lm_labels[:, :num_masked].flatten()
45
+ masked_gold_labels = masked_gold_labels[masked_gold_labels != -100]
46
+
47
+ masked_loss = F.cross_entropy(masked_subword_prediction, masked_gold_labels)
48
+ masked_z_loss = torch.logsumexp(masked_subword_prediction, dim=-1).pow(2).mean()
49
+
50
+ with torch.no_grad():
51
+ masked_accuracy = (masked_subword_prediction.argmax(-1) == masked_gold_labels).float().mean()
52
+
53
+ num_masked_tokens = masked_gold_labels.size(0)
54
+ else:
55
+ masked_loss = 0.0
56
+ masked_z_loss = 0.0
57
+ masked_accuracy = 0.0
58
+ num_masked_tokens = 0
59
+
60
+ if causal_subword_prediction is not None:
61
+ causal_gold_labels = masked_lm_labels[:, num_masked:].flatten()
62
+ causal_gold_labels = causal_gold_labels[causal_gold_labels != -100]
63
+
64
+ causal_loss = F.cross_entropy(causal_subword_prediction, causal_gold_labels)
65
+ causal_z_loss = torch.logsumexp(causal_subword_prediction, dim=-1).pow(2).mean()
66
+
67
+ with torch.no_grad():
68
+ causal_accuracy = (causal_subword_prediction.argmax(-1) == causal_gold_labels).float().mean()
69
+
70
+ num_causal_tokens = causal_gold_labels.size(0)
71
+ else:
72
+ causal_loss = 0.0
73
+ causal_z_loss = 0.0
74
+ causal_accuracy = 0.0
75
+ num_causal_tokens = 0
76
+
77
+ loss = ratio * masked_loss + (1 - ratio) * causal_loss
78
+ z_loss = ratio * masked_z_loss + (1 - ratio) * causal_z_loss
79
+
80
+ with torch.no_grad():
81
+ accuracy = ratio * masked_accuracy + (1 - ratio) * causal_accuracy
82
+
83
+ num_tokens = num_masked_tokens + num_causal_tokens
84
+
85
+ return loss, masked_loss, causal_loss, accuracy, masked_accuracy, causal_accuracy, z_loss, num_tokens
86
+
87
+
88
+ # From https://github.com/epfml/DenseFormer
89
+ class InPlaceSetSlice(torch.autograd.Function):
90
+ @staticmethod
91
+ def forward(ctx, full_tensor, last_slice, x_idx, x_val):
92
+ full_tensor[x_idx] = x_val
93
+ ctx.x_idx = x_idx
94
+ ret = torch.Tensor().to(full_tensor.device)
95
+ ret.set_(full_tensor[:x_idx + 1])
96
+ return ret
97
+
98
+ @staticmethod
99
+ def backward(ctx, grad_out):
100
+ if ctx.x_idx == 0:
101
+ return None, None, None, grad_out[ctx.x_idx]
102
+ else:
103
+ return None, grad_out[:ctx.x_idx], None, grad_out[ctx.x_idx]
104
+
105
+
106
+ def apply_inplace_set(x_acc, x_idx, x_val):
107
+ full_tensor, last_slice = x_acc
108
+ new_slice = InPlaceSetSlice.apply(full_tensor, last_slice, x_idx, x_val)
109
+ return full_tensor, new_slice
110
+
111
+
112
+ class DWAModules(torch.nn.Module):
113
+ def __init__(self, hidden_size, n_blocks):
114
+ super().__init__()
115
+ self.n_blocks = n_blocks
116
+ self.alphas = nn.ParameterList([nn.Parameter(torch.zeros(i + 2)) for i in range(n_blocks)])
117
+ self.accumulator = None
118
+ self._init_weights()
119
+
120
+ def _init_weights(self):
121
+ for module in self.alphas:
122
+ module.data.zero_()
123
+ module.data[-1] = 1.0
124
+
125
+ def init_accumulator(self, x):
126
+ self.accumulator = (torch.zeros((self.n_blocks + 1, *x.shape), device=x.device, dtype=x.dtype), None)
127
+ self.accumulator = apply_inplace_set(self.accumulator, 0, x)
128
+
129
+ def forward(self, x, block_idx):
130
+ assert self.accumulator is not None, "`init_accumulator(x)` needs to be called first"
131
+ self.accumulator = apply_inplace_set(
132
+ self.accumulator,
133
+ block_idx + 1,
134
+ x
135
+ )
136
+ x = torch.tensordot(self.alphas[block_idx], self.accumulator[1], dims=1)
137
+ return x
138
+
139
+
140
+ class Encoder(nn.Module):
141
+ def __init__(self, config, activation_checkpointing=False):
142
+ super().__init__()
143
+ self.attention_layers = nn.ModuleList([Attention(config) for _ in range(config.num_hidden_layers)])
144
+ self.mlp_layers = nn.ModuleList([FeedForward(config) for _ in range(config.num_hidden_layers)])
145
+ self.dwa_modules = DWAModules(config.hidden_size, config.num_hidden_layers * 2)
146
+
147
+ for i, layer in enumerate(self.mlp_layers):
148
+ layer.mlp[1].weight.data *= math.sqrt(1.0 / (2.0 * (1 + i)))
149
+ layer.mlp[-2].weight.data *= math.sqrt(1.0 / (2.0 * (1 + i)))
150
+
151
+ self.activation_checkpointing = activation_checkpointing
152
+
153
+ def forward(self, x, attention_mask, relative_embedding):
154
+ self.dwa_modules.init_accumulator(x)
155
+ for i, (attention_layer, mlp_layer) in enumerate(zip(self.attention_layers, self.mlp_layers)):
156
+ x = x + attention_layer(x, attention_mask, relative_embedding)
157
+ x = self.dwa_modules(x, block_idx=i * 2)
158
+
159
+ x = x + mlp_layer(x)
160
+ x = self.dwa_modules(x, block_idx=i * 2 + 1)
161
+
162
+ return x
163
+
164
+
165
+ class MaskClassifier(nn.Module):
166
+ def __init__(self, config, subword_embedding):
167
+ super().__init__()
168
+ self.nonlinearity = nn.Sequential(
169
+ nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False),
170
+ nn.Linear(config.hidden_size, config.hidden_size),
171
+ nn.GELU(),
172
+ nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False),
173
+ nn.Dropout(config.hidden_dropout_prob),
174
+ nn.Linear(subword_embedding.size(1), subword_embedding.size(0))
175
+ )
176
+ self.initialize(config.hidden_size, subword_embedding)
177
+
178
+ def initialize(self, hidden_size, embedding):
179
+ std = math.sqrt(2.0 / (5.0 * hidden_size))
180
+ nn.init.trunc_normal_(self.nonlinearity[1].weight, mean=0.0, std=std, a=-2*std, b=2*std)
181
+ self.nonlinearity[-1].weight = embedding
182
+ self.nonlinearity[1].bias.data.zero_()
183
+ self.nonlinearity[-1].bias.data.zero_()
184
+
185
+ def forward(self, x, masked_lm_labels, num_masked=None):
186
+ if num_masked is None:
187
+ x = torch.index_select(x.flatten(0, 1), 0, torch.nonzero(masked_lm_labels.flatten() != -100).squeeze())
188
+ x = self.nonlinearity(x)
189
+ return x
190
+ else:
191
+ masked_x, causal_x = torch.tensor_split(x, (num_masked,), dim=1)
192
+ mntp_masked_lm_labels, causal_masked_lm_labels = torch.tensor_split(masked_lm_labels, (num_masked,), dim=1)
193
+
194
+ if masked_x.size(1) != 0:
195
+ masked_x = torch.index_select(masked_x.flatten(0, 1), 0, torch.nonzero(mntp_masked_lm_labels.flatten() != -100).squeeze())
196
+ masked_x = self.nonlinearity(masked_x)
197
+ else:
198
+ masked_x = None
199
+
200
+ if causal_x.size(1) != 0:
201
+ causal_x = torch.index_select(causal_x.flatten(0, 1), 0, torch.nonzero(causal_masked_lm_labels.flatten() != -100).squeeze())
202
+ causal_x = self.nonlinearity(causal_x)
203
+ else:
204
+ causal_x = None
205
+
206
+ return masked_x, causal_x
207
+
208
+
209
+ class GeGLU(nn.Module):
210
+ def forward(self, x):
211
+ x, gate = x.chunk(2, dim=-1)
212
+ x = x * F.gelu(gate, approximate='tanh')
213
+ return x
214
+
215
+
216
+ class FeedForward(nn.Module):
217
+ def __init__(self, config):
218
+ super().__init__()
219
+ self.mlp = nn.Sequential(
220
+ nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False),
221
+ nn.Linear(config.hidden_size, 2*config.intermediate_size, bias=False),
222
+ GeGLU(),
223
+ nn.LayerNorm(config.intermediate_size, eps=config.layer_norm_eps, elementwise_affine=False),
224
+ nn.Linear(config.intermediate_size, config.hidden_size, bias=False),
225
+ nn.Dropout(config.hidden_dropout_prob)
226
+ )
227
+ self.initialize(config.hidden_size)
228
+
229
+ def initialize(self, hidden_size):
230
+ std = math.sqrt(2.0 / (5.0 * hidden_size))
231
+ nn.init.trunc_normal_(self.mlp[1].weight, mean=0.0, std=std, a=-2*std, b=2*std)
232
+ nn.init.trunc_normal_(self.mlp[-2].weight, mean=0.0, std=std, a=-2*std, b=2*std)
233
+
234
+ def forward(self, x):
235
+ return self.mlp(x)
236
+
237
+
238
+ class MaskedSoftmax(torch.autograd.Function):
239
+ @staticmethod
240
+ def forward(self, x, mask, dim):
241
+ self.dim = dim
242
+ x.masked_fill_(mask, float('-inf'))
243
+ x = torch.softmax(x, self.dim)
244
+ x.masked_fill_(mask, 0.0)
245
+ self.save_for_backward(x)
246
+ return x
247
+
248
+ @staticmethod
249
+ def backward(self, grad_output):
250
+ output, = self.saved_tensors
251
+ inputGrad = _softmax_backward_data(grad_output, output, self.dim, output.dtype)
252
+ return inputGrad, None, None
253
+
254
+
255
+ class Attention(nn.Module):
256
+ def __init__(self, config):
257
+ super().__init__()
258
+
259
+ self.config = config
260
+
261
+ if config.hidden_size % config.num_attention_heads != 0:
262
+ raise ValueError(f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads {config.num_attention_heads}")
263
+
264
+ self.hidden_size = config.hidden_size
265
+ self.num_heads = config.num_attention_heads
266
+ self.head_size = config.hidden_size // config.num_attention_heads
267
+
268
+ self.in_proj_qk = nn.Linear(config.hidden_size, 2*config.hidden_size, bias=True)
269
+ self.in_proj_vg = nn.Linear(config.hidden_size, 2*config.hidden_size, bias=True)
270
+ self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=True)
271
+
272
+ self.pre_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False)
273
+ self.post_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False)
274
+
275
+ position_indices = torch.arange(config.max_position_embeddings, dtype=torch.long).unsqueeze(1) \
276
+ - torch.arange(config.max_position_embeddings, dtype=torch.long).unsqueeze(0)
277
+ position_indices = self.make_log_bucket_position(position_indices, config.position_bucket_size, config.max_position_embeddings)
278
+ position_indices = config.position_bucket_size - 1 + position_indices
279
+ self.register_buffer("position_indices", position_indices, persistent=True)
280
+
281
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
282
+ self.scale = 1.0 / math.sqrt(3 * self.head_size)
283
+ self.initialize()
284
+
285
+ def make_log_bucket_position(self, relative_pos, bucket_size, max_position):
286
+ sign = torch.sign(relative_pos)
287
+ mid = bucket_size // 2
288
+ abs_pos = torch.where((relative_pos < mid) & (relative_pos > -mid), mid - 1, torch.abs(relative_pos).clamp(max=max_position - 1))
289
+ log_pos = torch.ceil(torch.log(abs_pos / mid) / math.log((max_position-1) / mid) * (mid - 1)).int() + mid
290
+ bucket_pos = torch.where(abs_pos <= mid, relative_pos, log_pos * sign).long()
291
+ return bucket_pos
292
+
293
+ def initialize(self):
294
+ std = math.sqrt(2.0 / (5.0 * self.hidden_size))
295
+ nn.init.trunc_normal_(self.in_proj_qk.weight, mean=0.0, std=std, a=-2*std, b=2*std)
296
+ nn.init.trunc_normal_(self.in_proj_vg.weight, mean=0.0, std=std, a=-2*std, b=2*std)
297
+ nn.init.trunc_normal_(self.out_proj.weight, mean=0.0, std=std, a=-2*std, b=2*std)
298
+ self.in_proj_qk.bias.data.zero_()
299
+ self.in_proj_vg.bias.data.zero_()
300
+ self.out_proj.bias.data.zero_()
301
+
302
+ def forward(self, hidden_states, attention_mask, relative_embedding):
303
+ key_len, batch_size, _ = hidden_states.size()
304
+ query_len = key_len
305
+
306
+ if self.position_indices.size(0) < query_len:
307
+ position_indices = torch.arange(query_len, dtype=torch.long).unsqueeze(1) \
308
+ - torch.arange(query_len, dtype=torch.long).unsqueeze(0)
309
+ position_indices = self.make_log_bucket_position(position_indices, self.config.position_bucket_size, 512)
310
+ position_indices = self.config.position_bucket_size - 1 + position_indices
311
+ self.register_buffer("position_indices", position_indices.to(hidden_states.device), persistent=True)
312
+
313
+ hidden_states = self.pre_layer_norm(hidden_states)
314
+ query, key = self.in_proj_qk(hidden_states).chunk(2, dim=2) # shape: [T, B, D]
315
+ value, gate = self.in_proj_vg(hidden_states).chunk(2, dim=2) # shape: [T, B, D]
316
+ gate = F.gelu(gate)
317
+
318
+ pos = self.in_proj_qk(self.dropout(relative_embedding)) # shape: [2T-1, 2D]
319
+ pos = F.embedding(self.position_indices[:query_len, :key_len], pos) # shape: [T, T, 2D]
320
+ query_pos, key_pos = pos.chunk(2, dim=-1)
321
+ query_pos = query_pos.view(query_len, key_len, self.num_heads, self.head_size)
322
+ key_pos = key_pos.view(query_len, key_len, self.num_heads, self.head_size)
323
+
324
+ query = query.reshape(query_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)
325
+ key = key.reshape(key_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)
326
+ value = value.reshape(key_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)
327
+
328
+ attention_scores = torch.bmm(query, key.transpose(1, 2) * self.scale)
329
+
330
+ query = query.view(batch_size, self.num_heads, query_len, self.head_size)
331
+ key = key.view(batch_size, self.num_heads, query_len, self.head_size)
332
+ attention_scores = attention_scores.view(batch_size, self.num_heads, query_len, key_len)
333
+ attention_scores.add_(torch.einsum("bhqd,qkhd->bhqk", query, key_pos * self.scale))
334
+ attention_scores.add_(torch.einsum("bhkd,qkhd->bhqk", key * self.scale, query_pos))
335
+
336
+ attention_probs = MaskedSoftmax.apply(attention_scores, attention_mask, -1)
337
+
338
+ attention_probs = self.dropout(attention_probs)
339
+ context = torch.bmm(attention_probs.flatten(0, 1), value) # shape: [B*H, Q, D]
340
+ context = context.transpose(0, 1).reshape(context.size(1), -1, self.hidden_size) # shape: [Q, B, H*D]
341
+ context = context * gate
342
+ context = self.post_layer_norm(context)
343
+ context = self.out_proj(context)
344
+ context = self.dropout(context)
345
+
346
+ return context
347
+
348
+
349
+ class Embedding(nn.Module):
350
+ def __init__(self, config):
351
+ super().__init__()
352
+ self.hidden_size = config.hidden_size
353
+
354
+ self.word_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
355
+ self.word_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
356
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
357
+
358
+ self.relative_embedding = nn.Parameter(torch.empty(2 * config.position_bucket_size - 1, config.hidden_size))
359
+ self.relative_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
360
+
361
+ self.initialize()
362
+
363
+ def initialize(self):
364
+ std = math.sqrt(2.0 / (5.0 * self.hidden_size))
365
+ nn.init.trunc_normal_(self.relative_embedding, mean=0.0, std=std, a=-2*std, b=2*std)
366
+ nn.init.trunc_normal_(self.word_embedding.weight, mean=0.0, std=std, a=-2*std, b=2*std)
367
+
368
+ def forward(self, input_ids):
369
+ word_embedding = self.dropout(self.word_layer_norm(self.word_embedding(input_ids)))
370
+ relative_embeddings = self.relative_layer_norm(self.relative_embedding)
371
+ return word_embedding, relative_embeddings
372
+
373
+
374
+ # HF wrappers that preserve state dict keys and behavior
375
+
376
+
377
+ from transformers import PreTrainedModel
378
+ from transformers.modeling_outputs import MaskedLMOutput, CausalLMOutputWithCrossAttentions
379
+ from .configuration_gpt_bert import GPTBertConfig
380
+ import torch
381
+ import torch.nn as nn
382
+
383
+ class GPTBertForMaskedLM(PreTrainedModel):
384
+ config_class = GPTBertConfig
385
+ base_model_prefix = 'gpt_bert'
386
+ def __init__(self, config: GPTBertConfig):
387
+ super().__init__(config)
388
+ self.model = Bert(config)
389
+
390
+ def tie_weights(self):
391
+ try:
392
+ self.model.classifier.nonlinearity[-1].weight = self.model.embedding.word_embedding.weight
393
+ except Exception:
394
+ pass
395
+ return super().tie_weights()
396
+
397
+ def forward(self, input_ids, attention_mask=None, labels=None):
398
+ if attention_mask is None:
399
+ attention_mask = torch.ones_like(input_ids)
400
+ mask_bool = (attention_mask == 0).unsqueeze(1).unsqueeze(1)
401
+ static_embeddings, relative_embedding = self.model.embedding(input_ids)
402
+ if static_embeddings.dim() == 3 and static_embeddings.shape[0] == input_ids.shape[0]:
403
+ static_embeddings = static_embeddings.transpose(0, 1)
404
+ contextualized = self.model.transformer(static_embeddings, mask_bool, relative_embedding)
405
+ hs = contextualized.transpose(0, 1)
406
+ B,S,H = hs.shape
407
+ flat = hs.reshape(B*S, H)
408
+ logits_flat = self.model.classifier.nonlinearity(flat)
409
+ vocab = logits_flat.size(-1)
410
+ logits = logits_flat.view(B, S, vocab)
411
+ loss = None
412
+ if labels is not None:
413
+ loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
414
+ loss = loss_fct(logits.view(-1, vocab), labels.view(-1))
415
+ return MaskedLMOutput(loss=loss, logits=logits)
416
+
417
+
418
+ class GPTBertForCausalLM(PreTrainedModel):
419
+ config_class = GPTBertConfig
420
+ base_model_prefix = 'gpt_bert'
421
+ def __init__(self, config: GPTBertConfig):
422
+ super().__init__(config)
423
+ self.model = Bert(config)
424
+
425
+ def prepare_inputs_for_generation(self, input_ids, **kwargs):
426
+ return {'input_ids': input_ids, 'attention_mask': kwargs.get('attention_mask', None)}
427
+
428
+ def forward(self, input_ids, attention_mask=None, labels=None):
429
+ if attention_mask is None:
430
+ attention_mask = torch.ones_like(input_ids)
431
+ mask_bool = (attention_mask == 0).unsqueeze(1).unsqueeze(1)
432
+ static_embeddings, relative_embedding = self.model.embedding(input_ids)
433
+ if static_embeddings.dim() == 3 and static_embeddings.shape[0] == input_ids.shape[0]:
434
+ static_embeddings = static_embeddings.transpose(0, 1)
435
+ contextualized = self.model.transformer(static_embeddings, mask_bool, relative_embedding)
436
+ hs = contextualized.transpose(0, 1)
437
+ B,S,H = hs.shape
438
+ flat = hs.reshape(B*S, H)
439
+ logits_flat = self.model.classifier.nonlinearity(flat)
440
+ vocab = logits_flat.size(-1)
441
+ logits = logits_flat.view(B, S, vocab)
442
+ loss = None
443
+ if labels is not None:
444
+ shift_logits = logits[..., :-1, :].contiguous()
445
+ shift_labels = labels[..., 1:].contiguous()
446
+ loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
447
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
448
+ return CausalLMOutputWithCrossAttentions(loss=loss, logits=logits)
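The two HF wrappers share the same `Bert` backbone and classifier and differ only in the loss: `GPTBertForMaskedLM` scores every position against its own label, while `GPTBertForCausalLM` applies the usual one-token shift. A small shape/loss check for the causal wrapper (a sketch using random token ids; loads the model remotely):

```python
import torch
from transformers import AutoModelForCausalLM

mid = 'haznitrama/babybabellm-multi_gpu-gpt_bert-eng-main-causal'
model = AutoModelForCausalLM.from_pretrained(mid, trust_remote_code=True)

ids = torch.randint(0, model.config.vocab_size, (2, 16))  # dummy batch of token ids
out = model(input_ids=ids, labels=ids)
print(out.logits.shape)  # torch.Size([2, 16, 16384])
print(out.loss)          # next-token cross-entropy, computed with the one-position shift
```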
original_project_config.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "attention_probs_dropout_prob": 0.1,
3
+ "hidden_dropout_prob": 0.1,
4
+ "hidden_size": 768,
5
+ "intermediate_size": 2560,
6
+ "max_position_embeddings": 512,
7
+ "position_bucket_size": 32,
8
+ "num_attention_heads": 12,
9
+ "num_hidden_layers": 12,
10
+ "vocab_size": 16384,
11
+ "layer_norm_eps": 1e-05
12
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "mask_token": "<mask>",
5
+ "pad_token": "<pad>",
6
+ "unk_token": "<unk>"
7
+ }
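These strings are bound to fixed ids in `config.json` and `tokenizer_config.json` (`<unk>` 0, `<s>` 1, `</s>` 2, `<pad>` 3, `<mask>` 4). A quick check that the loaded tokenizer agrees (a sketch):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('haznitrama/babybabellm-multi_gpu-gpt_bert-eng-main-causal')
print(tok.unk_token_id, tok.bos_token_id, tok.eos_token_id, tok.pad_token_id, tok.mask_token_id)
# expected: 0 1 2 3 4
```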
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,141 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<pad>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<mask>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "<special_0>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "<special_1>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "7": {
60
+ "content": "<special_2>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "8": {
68
+ "content": "<special_3>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "9": {
76
+ "content": "<special_4>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "10": {
84
+ "content": "<special_5>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "11": {
92
+ "content": "<special_6>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "12": {
100
+ "content": "<special_7>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "13": {
108
+ "content": "<special_8>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "14": {
116
+ "content": "<special_9>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "15": {
124
+ "content": "<special_10>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ }
131
+ },
132
+ "bos_token": "<s>",
133
+ "clean_up_tokenization_spaces": false,
134
+ "eos_token": "</s>",
135
+ "extra_special_tokens": {},
136
+ "mask_token": "<mask>",
137
+ "model_max_length": 1000000000000000019884624838656,
138
+ "pad_token": "<pad>",
139
+ "tokenizer_class": "PreTrainedTokenizerFast",
140
+ "unk_token": "<unk>"
141
+ }
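Finally, a short encode/decode round trip with the tokenizer configured above (ids 0-15 are the reserved tokens from `added_tokens_decoder`; everything else comes from the 16384-entry vocabulary in `tokenizer.json`). The exact token split depends on the trained vocabulary, so the printed pieces are illustrative only:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('haznitrama/babybabellm-multi_gpu-gpt_bert-eng-main-causal')
enc = tok('Hello world')
print(enc['input_ids'])                         # raw ids (plus any special tokens the template adds)
print(tok.convert_ids_to_tokens(enc['input_ids']))
print(tok.decode(enc['input_ids'], skip_special_tokens=True))  # should round-trip to 'Hello world'
```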