Add files using upload-large-folder tool

Browse files

Files changed (10) hide show

.gitattributes +1 -0
config.json +162 -0
errors_hist.png +0 -0
model.safetensors +3 -0
modeling_modernbert_reward.py +203 -0
scatter.png +3 -0
special_tokens_map.json +51 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +171 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+scatter.png filter=lfs diff=lfs merge=lfs -text

config.json ADDED Viewed

	@@ -0,0 +1,162 @@

+{
+  "architectures": [
+    "ModernBertForOrdinalAndRegression"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoModelForSequenceClassification": "modeling_modernbert_reward.ModernBertForOrdinalAndRegression"
+  },
+  "blend": 0.66,
+  "bos_token_id": 1,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 6,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "dtype": "float32",
+  "embedding_dropout": 0.0,
+  "eos_token_id": 2,
+  "gamma": 0.025,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 256,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2",
+    "3": "LABEL_3",
+    "4": "LABEL_4",
+    "5": "LABEL_5",
+    "6": "LABEL_6",
+    "7": "LABEL_7",
+    "8": "LABEL_8",
+    "9": "LABEL_9",
+    "10": "LABEL_10",
+    "11": "LABEL_11",
+    "12": "LABEL_12",
+    "13": "LABEL_13",
+    "14": "LABEL_14",
+    "15": "LABEL_15",
+    "16": "LABEL_16",
+    "17": "LABEL_17",
+    "18": "LABEL_18",
+    "19": "LABEL_19",
+    "20": "LABEL_20",
+    "21": "LABEL_21",
+    "22": "LABEL_22",
+    "23": "LABEL_23",
+    "24": "LABEL_24",
+    "25": "LABEL_25",
+    "26": "LABEL_26",
+    "27": "LABEL_27",
+    "28": "LABEL_28",
+    "29": "LABEL_29",
+    "30": "LABEL_30",
+    "31": "LABEL_31",
+    "32": "LABEL_32",
+    "33": "LABEL_33",
+    "34": "LABEL_34",
+    "35": "LABEL_35",
+    "36": "LABEL_36",
+    "37": "LABEL_37",
+    "38": "LABEL_38",
+    "39": "LABEL_39",
+    "40": "LABEL_40",
+    "41": "LABEL_41",
+    "42": "LABEL_42",
+    "43": "LABEL_43",
+    "44": "LABEL_44",
+    "45": "LABEL_45",
+    "46": "LABEL_46",
+    "47": "LABEL_47",
+    "48": "LABEL_48",
+    "49": "LABEL_49",
+    "50": "LABEL_50"
+  },
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1024,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_10": 10,
+    "LABEL_11": 11,
+    "LABEL_12": 12,
+    "LABEL_13": 13,
+    "LABEL_14": 14,
+    "LABEL_15": 15,
+    "LABEL_16": 16,
+    "LABEL_17": 17,
+    "LABEL_18": 18,
+    "LABEL_19": 19,
+    "LABEL_2": 2,
+    "LABEL_20": 20,
+    "LABEL_21": 21,
+    "LABEL_22": 22,
+    "LABEL_23": 23,
+    "LABEL_24": 24,
+    "LABEL_25": 25,
+    "LABEL_26": 26,
+    "LABEL_27": 27,
+    "LABEL_28": 28,
+    "LABEL_29": 29,
+    "LABEL_3": 3,
+    "LABEL_30": 30,
+    "LABEL_31": 31,
+    "LABEL_32": 32,
+    "LABEL_33": 33,
+    "LABEL_34": 34,
+    "LABEL_35": 35,
+    "LABEL_36": 36,
+    "LABEL_37": 37,
+    "LABEL_38": 38,
+    "LABEL_39": 39,
+    "LABEL_4": 4,
+    "LABEL_40": 40,
+    "LABEL_41": 41,
+    "LABEL_42": 42,
+    "LABEL_43": 43,
+    "LABEL_44": 44,
+    "LABEL_45": 45,
+    "LABEL_46": 46,
+    "LABEL_47": 47,
+    "LABEL_48": 48,
+    "LABEL_49": 49,
+    "LABEL_5": 5,
+    "LABEL_50": 50,
+    "LABEL_6": 6,
+    "LABEL_7": 7,
+    "LABEL_8": 8,
+    "LABEL_9": 9
+  },
+  "lambda_reg": 0.075,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 4,
+  "num_hidden_layers": 10,
+  "pad_token_id": 3,
+  "position_embedding_type": "rope",
+  "problem_type": "regression",
+  "reg_eps": 0.0001,
+  "reg_temperature": 1.0,
+  "repad_logits_with_grad": false,
+  "score_max": 10.0,
+  "score_min": 0.0,
+  "sep_token_id": 4,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "transformers_version": "4.56.2",
+  "vocab_size": 102400
+}

errors_hist.png ADDED Viewed

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd6e302b976075219ecaa4b5d38f3feb6c79c002f42ced39ee79bbcb7e583575
+size 147094428

modeling_modernbert_reward.py ADDED Viewed

	@@ -0,0 +1,203 @@

+# modeling_modernbert_reward.py
+from typing import Optional, Union, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers.modeling_outputs import SequenceClassifierOutput
+from transformers import ModernBertPreTrainedModel
+from transformers.models.modernbert.modeling_modernbert import (
+    ModernBertModel, ModernBertPredictionHead
+)
+import math
+class ModernBertForOrdinalAndRegression(ModernBertPreTrainedModel):
+    """
+    ModernBERT 本体の上に CORAL(順序) + 回帰ヘッドを載せる多目的報酬器。
+    - config.num_labels = K (例: 51 → 0.2刻み)
+    - 学習: L = L_ordinal + lambda_reg * L_regression （両方に sample_weight を掛ける）
+    - 推論: ord/reg のアンサンブル（blend）
+    """
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.model = ModernBertModel(config)
+        self.head  = ModernBertPredictionHead(config)
+        self.drop  = nn.Dropout(config.classifier_dropout)
+        self.num_bins   = int(getattr(config, "num_labels", 51))
+        self.lambda_reg = float(getattr(config, "lambda_reg", 0.3))
+        self.reg_temperature = float(getattr(config, "reg_temperature", 1.0))
+        self.reg_eps = float(getattr(config, "reg_eps", 1e-4))
+        self.gamma      = float(getattr(config, "gamma", 0.05))
+        self.blend      = float(getattr(config, "blend", 0.5))
+        self.score_min  = float(getattr(config, "score_min", 0.0))
+        self.score_max  = float(getattr(config, "score_max", 10.0))
+        # CORAL: 共通重み + 単調しきい値
+        self.coral_fc       = nn.Linear(config.hidden_size, 1, bias=False)
+        self.coral_bias_raw = nn.Parameter(torch.zeros(self.num_bins - 1))
+        # 回帰ヘッド
+        self.reg_head = nn.Linear(config.hidden_size, 1)
+        self.config.problem_type = "regression"
+        self.post_init()
+    def _init_weights(self, module: nn.Module):
+        super()._init_weights(module)
+        cutoff_factor = self.config.initializer_cutoff_factor
+        if cutoff_factor is None:
+            cutoff_factor = 3
+        def init_weight(module: nn.Module, std: float):
+            nn.init.trunc_normal_(
+                module.weight,
+                mean=0.0,
+                std=std,
+                a=-cutoff_factor * std,
+                b=cutoff_factor * std,
+            )
+            if isinstance(module, nn.Linear):
+                if module.bias is not None:
+                    nn.init.zeros_(module.bias)
+        stds = {
+            "in": self.config.initializer_range,
+            "out": self.config.initializer_range / math.sqrt(2.0 * self.config.num_hidden_layers),
+            "embedding": self.config.initializer_range,
+            "final_out": self.config.hidden_size**-0.5,
+        }
+        if isinstance(module, ModernBertForOrdinalAndRegression):
+            init_weight(module.coral_fc, stds["final_out"])
+            init_weight(module.reg_head, stds["final_out"])
+            module.coral_bias_raw.zero_()
+    def _thresholds(self) -> torch.Tensor:
+        # softplus で正の差分 → 累積で単調に
+        return torch.cumsum(F.softplus(self.coral_bias_raw), dim=0)
+    def _pool(self, last_hidden, attention_mask) -> torch.Tensor:
+        pooling = getattr(self.config, "classifier_pooling", "cls")
+        if pooling == "mean":
+            mask = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
+            return (last_hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp_min(1e-6)
+        return last_hidden[:, 0]  # "cls"
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        sliding_window_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,        # 未使用
+        labels_cont: Optional[torch.Tensor] = None,   # [B] 0..10
+        labels_bin: Optional[torch.Tensor] = None,    # [B] 0..K-1
+        sample_weight: Optional[torch.Tensor] = None, # [B]
+        indices: Optional[torch.Tensor] = None,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        max_seqlen: Optional[int] = None,
+        batch_size: Optional[int] = None,
+        seq_len: Optional[int] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Tuple, SequenceClassifierOutput]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            sliding_window_mask=sliding_window_mask,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            indices=indices,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+            batch_size=batch_size,
+            seq_len=seq_len,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+        last_hidden = outputs.last_hidden_state
+        pooled = self.head(self._pool(last_hidden, attention_mask))
+        pooled = self.drop(pooled)
+        # ----- Ordinal (CORAL) -----
+        z  = self.coral_fc(pooled).squeeze(-1)   # [B]
+        th = self._thresholds()                  # [K-1]
+        logits_ord = z.unsqueeze(-1) - th.unsqueeze(0)  # [B,K-1]
+        p_gt = torch.sigmoid(logits_ord)
+        ones  = torch.ones(p_gt.size(0), 1, device=p_gt.device, dtype=p_gt.dtype)
+        zeros = torch.zeros(p_gt.size(0), 1, device=p_gt.device, dtype=p_gt.dtype)
+        p_left  = torch.cat([ones,  p_gt], dim=1)
+        p_right = torch.cat([p_gt, zeros], dim=1)
+        p_cls   = (p_left - p_right).clamp_min(0.0)        # [B,K]
+        bins = torch.arange(self.num_bins, device=p_gt.device, dtype=p_gt.dtype).unsqueeze(0)
+        expected_bin = (p_cls * bins).sum(dim=-1)          # [B]
+        score_ord = self.score_min + (self.score_max - self.score_min) * (expected_bin / (self.num_bins - 1))
+        # ----- Regression -----
+        reg_raw   = self.reg_head(pooled).squeeze(-1)      # [B]
+        p = torch.sigmoid(reg_raw / self.reg_temperature)
+        p = p.clamp(self.reg_eps, 1.0 - self.reg_eps)
+        score_reg = self.score_min + (self.score_max - self.score_min) * p  # [B]
+        # ----- Blend（最終スコア）-----
+        score = (1.0 - self.blend) * score_reg + self.blend * score_ord  # [B]
+        logits = score.unsqueeze(-1)                                     # [B,1] 0..10
+        # ----- Loss -----
+        loss = None
+        if (labels_cont is not None) or (labels_bin is not None):
+            if sample_weight is None:
+                sample_weight = torch.ones_like(score)
+            sw = sample_weight.to(score.device).float()
+            sw = sw / (sw.mean() + 1e-12)
+            loss_total = 0.0
+            if labels_bin is not None:
+                # CORAL loss
+                y = labels_bin.to(logits_ord.device).long()
+                Km1 = self.num_bins - 1
+                thr = torch.arange(Km1, device=y.device).unsqueeze(0)
+                target_ord = (y.unsqueeze(1) > thr).float()            # [B,K-1]
+                bce = F.binary_cross_entropy_with_logits(logits_ord, target_ord, reduction="none").mean(dim=-1)
+                loss_ord = (bce * sw).sum() / sw.sum()
+                loss_total = loss_total + loss_ord
+            if labels_cont is not None and self.lambda_reg > 0.0:
+                # Huber loss
+                y_cont = labels_cont.to(score.device).float().clamp(self.score_min, self.score_max)
+                pt = (y_cont - self.score_min) / (self.score_max - self.score_min)
+                pt = pt.clamp(self.reg_eps, 1.0 - self.reg_eps)
+                t = torch.log(pt) - torch.log1p(-pt)
+                t = self.reg_temperature * t
+                huber = F.smooth_l1_loss(reg_raw, t, reduction="none")
+                loss_reg = (huber * sw).sum() / sw.sum()
+                loss_total = loss_total + self.lambda_reg * loss_reg
+                if self.gamma > 0:
+                    loss_total += self.gamma * (F.smooth_l1_loss(score, y_cont, reduction="none") * sw).sum() / sw.sum()
+            loss = loss_total
+        if not return_dict:
+            out = (logits,)
+            return ((loss,) + out) if loss is not None else out
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )

scatter.png ADDED Viewed

Git LFS Details

SHA256: cf5df0d73a505aea096f126f7a7c349a3f3aee8ac1f86baf5bfc0aa381444855
Pointer size: 131 Bytes
Size of remote file: 478 kB

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<cls>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "<sep>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:008293028e1a9d9a1038d9b63d989a2319797dfeaa03f171093a57b33a3a8277
+size 1831879

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,171 @@

+{
+  "add_bos_token": true,
+  "add_dummy_prefix_space": false,
+  "add_eos_token": true,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<mask>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<cls>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "8": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "9": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "10": {
+      "content": "<|available_tools|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "11": {
+      "content": "<|tool_calls|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "12": {
+      "content": "<|tool_results|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "13": {
+      "content": "<|code|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "14": {
+      "content": "<|file|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "102397": {
+      "content": "<|prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "102398": {
+      "content": "<|suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "102399": {
+      "content": "<|middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "<cls>",
+  "do_lower_case": false,
+  "eos_token": "</s>",
+  "extra_ids": 0,
+  "extra_special_tokens": {},
+  "keep_accents": true,
+  "legacy": false,
+  "mask_token": "<mask>",
+  "model_max_length": 8192,
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "sep_token": "<sep>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}