eacortes committed · verified
Commit bad1add · 1 Parent(s): 3016959

Upload 14 files
README.md CHANGED
@@ -47,14 +47,14 @@ library_name: sentence-transformers
  metrics:
  - spearman
  co2_eq_emissions:
- emissions: 4039.5232961852894
- energy_consumed: 19.679154905865374
+ emissions: 6350.153020081601
+ energy_consumed: 30.935740629629628
  source: codecarbon
  training_type: fine-tuning
  on_cloud: false
  cpu_model: AMD Ryzen 7 3700X 8-Core Processor
  ram_total_size: 62.69887161254883
- hours_used: 74.966
+ hours_used: 116.388
  hardware_used: 2 x NVIDIA GeForce RTX 3090
  model-index:
  - name: 'ChemMRL: SMILES Matryoshka Representation Learning Embedding Transformer'
@@ -67,7 +67,7 @@ model-index:
  type: pubchem_10m_genmol_similarity_validation
  metrics:
  - type: spearman
- value: 0.9881056976837288
+ value: 0.989142152637452
  name: Spearman
  - task:
  type: semantic-similarity
@@ -77,9 +77,8 @@ model-index:
  type: pubchem_10m_genmol_similarity_test
  metrics:
  - type: spearman
- value: 0.988127555600757
+ value: 0.9891625268496924
  name: Spearman
- new_version: Derify/ChemMRL
  ---

  # ChemMRL: SMILES Matryoshka Representation Learning Embedding Transformer
@@ -146,9 +145,9 @@ print(embeddings.shape)
  # Get the similarity scores for the embeddings
  similarities = model.backbone.similarity(embeddings, embeddings)
  print(similarities)
- # tensor([[1.0000, 0.4184, 0.0166],
- # [0.4158, 1.0000, 0.0136],
- # [0.0167, 0.0137, 1.0000]])
+ # tensor([[1.0000, 0.4179, 0.0165],
+ # [0.4179, 1.0000, 0.0140],
+ # [0.0165, 0.0140, 1.0000]])
  ```

  ### Direct Usage (Sentence Transformers)
@@ -186,9 +185,9 @@ print(embeddings.shape)
  # Get the similarity scores for the embeddings
  similarities = model.similarity(embeddings, embeddings)
  print(similarities)
- # tensor([[1.0000, 0.5887, 0.0327],
- # [0.5887, 1.0000, 0.0269],
- # [0.0327, 0.0269, 1.0000]])
+ # tensor([[1.0000, 0.5894, 0.0326],
+ # [0.5894, 1.0000, 0.0275],
+ # [0.0326, 0.0275, 1.0000]])
  ```

  </details>
@@ -209,8 +208,8 @@ print(similarities)

  | Split | Metric | Value |
  | :------------- | :----------- | :---------- |
- | **validation** | **spearman** | **0.98811** |
- | **test** | **spearman** | **0.98813** |
+ | **validation** | **spearman** | **0.98914** |
+ | **test** | **spearman** | **0.98916** |

  ## Training Details

@@ -236,11 +235,11 @@ print(similarities)
  ```json
  {
  "loss": "TanimotoSentLoss",
- "n_layers_per_step": 11,
- "last_layer_weight": 1.0,
- "prior_layers_weight": 1.5,
- "kl_div_weight": 0.5,
- "kl_temperature": 0.3,
+ "n_layers_per_step": -1,
+ "last_layer_weight": 2.0,
+ "prior_layers_weight": 1.0,
+ "kl_div_weight": 0.0,
+ "kl_temperature": 0.0,
  "matryoshka_dims": [
  1024,
  512,
@@ -261,7 +260,7 @@ print(similarities)
  1,
  1
  ],
- "n_dims_per_step": 4
+ "n_dims_per_step": -1
  }
  ```

@@ -287,11 +286,11 @@ print(similarities)
  ```json
  {
  "loss": "TanimotoSentLoss",
- "n_layers_per_step": 11,
- "last_layer_weight": 1.0,
- "prior_layers_weight": 1.5,
- "kl_div_weight": 0.5,
- "kl_temperature": 0.3,
+ "n_layers_per_step": -1,
+ "last_layer_weight": 2.0,
+ "prior_layers_weight": 1.0,
+ "kl_div_weight": 0.0,
+ "kl_temperature": 0.0,
  "matryoshka_dims": [
  1024,
  512,
@@ -312,7 +311,7 @@ print(similarities)
  1,
  1
  ],
- "n_dims_per_step": 4
+ "n_dims_per_step": -1
  }
  ```

@@ -334,7 +333,7 @@ print(similarities)
  - `tf32`: True
  - `optim`: stable_adamw
  - `optim_args`: decouple_lr=True,max_lr=8.0e-6
- - `dataloader_pin_memory`: False
+ - `gradient_checkpointing`: True
  - `eval_on_start`: True

  #### All Hyperparameters
@@ -416,7 +415,7 @@ print(similarities)
  - `ddp_find_unused_parameters`: None
  - `ddp_bucket_cap_mb`: None
  - `ddp_broadcast_buffers`: False
- - `dataloader_pin_memory`: False
+ - `dataloader_pin_memory`: True
  - `dataloader_persistent_workers`: False
  - `skip_memory_metrics`: True
  - `use_legacy_prediction_loop`: False
@@ -427,7 +426,7 @@ print(similarities)
  - `hub_private_repo`: None
  - `hub_always_push`: False
  - `hub_revision`: None
- - `gradient_checkpointing`: False
+ - `gradient_checkpointing`: True
  - `gradient_checkpointing_kwargs`: None
  - `include_inputs_for_metrics`: False
  - `include_for_metrics`: []
@@ -467,41 +466,41 @@ print(similarities)

  | Epoch | Step | Training Loss | pubchem 10m genmol similarity loss | pubchem_10m_genmol_similarity_spearman |
  | :----: | :----: | :-----------: | :--------------------------------: | :------------------------------------: |
- | 0 | 0 | - | 85.7997 | 0.7261 |
- | 0.0000 | 1 | 69.0605 | - | - |
- | 0.2477 | 25000 | 47.1696 | - | - |
- | 0.2500 | 25235 | - | 56.9634 | 0.8997 |
- | 0.4978 | 50250 | 45.6212 | - | - |
- | 0.5000 | 50470 | - | 55.4366 | 0.9599 |
- | 0.7479 | 75500 | 45.1404 | - | - |
- | 0.7500 | 75705 | - | 54.5667 | 0.9755 |
- | 0.9981 | 100750 | 44.5023 | - | - |
- | 1.0000 | 100940 | - | 54.1244 | 0.9810 |
- | 1.2482 | 126000 | 43.7545 | - | - |
- | 1.2500 | 126175 | - | 53.6974 | 0.9838 |
- | 1.4984 | 151250 | 43.7865 | - | - |
- | 1.5000 | 151410 | - | 53.4775 | 0.9855 |
- | 1.7485 | 176500 | 43.3512 | - | - |
- | 1.7499 | 176645 | - | 53.3775 | 0.9866 |
- | 1.9987 | 201750 | 43.5808 | - | - |
- | 1.9999 | 201880 | - | 53.3119 | 0.9874 |
- | 2.2488 | 227000 | 43.281 | - | - |
- | 2.2499 | 227115 | - | 53.1854 | 0.9879 |
- | 2.4989 | 252250 | 43.3097 | - | - |
- | 2.4999 | 252350 | - | 53.1972 | 0.9880 |
- | 2.7491 | 277500 | 43.2376 | - | - |
- | 2.7499 | 277585 | - | 53.1833 | 0.9881 |
- | 2.9992 | 302750 | 43.2006 | - | - |
- | 2.9999 | 302820 | - | 53.1241 | 0.9881 |
- | 3.0000 | 302829 | - | - | 0.98811 |
+ | 0 | 0 | - | 297.6136 | 0.7261 |
+ | 0.0000 | 1 | 244.6862 | - | - |
+ | 0.2477 | 25000 | 161.5037 | - | - |
+ | 0.2500 | 25235 | - | 195.4624 | 0.9067 |
+ | 0.4978 | 50250 | 155.7822 | - | - |
+ | 0.5000 | 50470 | - | 189.4068 | 0.9655 |
+ | 0.7479 | 75500 | 152.7915 | - | - |
+ | 0.7500 | 75705 | - | 186.3661 | 0.9780 |
+ | 0.9981 | 100750 | 151.0411 | - | - |
+ | 1.0000 | 100940 | - | 184.6362 | 0.9829 |
+ | 1.2482 | 126000 | 149.8544 | - | - |
+ | 1.2500 | 126175 | - | 183.5648 | 0.9855 |
+ | 1.4984 | 151250 | 149.2916 | - | - |
+ | 1.5000 | 151410 | - | 182.8947 | 0.9868 |
+ | 1.7485 | 176500 | 148.7942 | - | - |
+ | 1.7499 | 176645 | - | 182.3662 | 0.9879 |
+ | 1.9987 | 201750 | 148.3459 | - | - |
+ | 1.9999 | 201880 | - | 181.9855 | 0.9885 |
+ | 2.2488 | 227000 | 148.0316 | - | - |
+ | 2.2499 | 227115 | - | 181.7683 | 0.9889 |
+ | 2.4989 | 252250 | 147.8658 | - | - |
+ | 2.4999 | 252350 | - | 181.6711 | 0.9890 |
+ | 2.7491 | 277500 | 147.9642 | - | - |
+ | 2.7499 | 277585 | - | 181.6077 | 0.9891 |
+ | 2.9992 | 302750 | 147.8874 | - | - |
+ | 2.9999 | 302820 | - | 181.6066 | 0.9891 |
+ | 3.0000 | 302829 | - | - | 0.98914 |

  </details>

  ### Environmental Impact
  Carbon emissions were measured using [CodeCarbon](https://github.com/mlco2/codecarbon).
- - **Energy Consumed**: 19.679 kWh
- - **Carbon Emitted**: 4.040 kg of CO2
- - **Hours Used**: 74.966 hours
+ - **Energy Consumed**: 30.936 kWh
+ - **Carbon Emitted**: 6.350 kg of CO2
+ - **Hours Used**: 116.388 hours

  ### Training Hardware
  - **On Cloud**: No
@@ -511,11 +510,11 @@ Carbon emissions were measured using [CodeCarbon](https://github.com/mlco2/codec

  ### Framework Versions
  - Python: 3.13.7
- - Sentence Transformers: 5.1.1
+ - Sentence Transformers: 5.1.2
  - Transformers: 4.57.1
  - PyTorch: 2.8.0+cu128
  - Accelerate: 1.10.1
- - Datasets: 3.6.0
+ - Datasets: 4.3.0
  - Tokenizers: 0.22.1

  ## Citation
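The card's usage snippets above compare full 1024-dimensional embeddings, while the training config also lists smaller `matryoshka_dims`. A minimal sketch of using a truncated embedding through the same sentence-transformers path the card demonstrates; the repository id and SMILES strings below are illustrative placeholders, not taken from this commit.

```python
from sentence_transformers import SentenceTransformer

# Placeholder repo id and SMILES; substitute the actual repository this card belongs to.
smiles = ["CCO", "CC(=O)Oc1ccccc1C(=O)O", "c1ccccc1"]

model = SentenceTransformer("Derify/ChemMRL")  # full 1024-dim embeddings
full = model.encode(smiles)

# Truncate at load time to one of the trained matryoshka_dims (e.g. 256)
small_model = SentenceTransformer("Derify/ChemMRL", truncate_dim=256)
small = small_model.encode(smiles)

print(full.shape, small.shape)               # (3, 1024) (3, 256)
print(model.similarity(full, full))          # as in the card's Direct Usage section
print(small_model.similarity(small, small))  # rankings should stay close after truncation
```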
config_chem_mrl.json CHANGED
@@ -1,11 +1,11 @@
  {
- "__version__": "0.7.4",
+ "__version__": "0.8.0",
  "embedding_pooling": "mean",
  "eval_metric": "spearman",
  "eval_similarity_fct": "tanimoto",
- "kl_div_weight": 0.5,
- "kl_temperature": 0.3,
- "last_layer_weight": 1.0,
+ "kl_div_weight": 0.0,
+ "kl_temperature": 0.0,
+ "last_layer_weight": 2.0,
  "loss_func": "tanimotosentloss",
  "model_name": "Derify/ModChemBERT-IR-BASE",
  "mrl_dimension_weights": [
@@ -28,10 +28,9 @@
  16,
  8
  ],
- "n_dims_per_step": 4,
- "n_layers_per_step": 11,
- "prior_layers_weight": 1.5,
+ "n_dims_per_step": -1,
+ "n_layers_per_step": -1,
+ "prior_layers_weight": 1.0,
  "tanimoto_similarity_loss_func": null,
- "use_2d_matryoshka": true,
- "use_query_tokenizer": false
+ "use_2d_matryoshka": true
  }
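The retrained config flips `n_dims_per_step` and `n_layers_per_step` from 4 and 11 to -1. In Matryoshka-style losses such as sentence-transformers' `MatryoshkaLoss`, -1 conventionally means every dimension (or layer) contributes on every step instead of a random subset. A small sketch of that sampling rule under this assumption; it is an illustration, not the chem-mrl implementation.

```python
import random

# Interpretation of n_dims_per_step in the spirit of MatryoshkaLoss:
# -1 -> train on every Matryoshka dimension each step; k > 0 -> sample k dimensions per step.
def dims_for_step(matryoshka_dims: list[int], n_dims_per_step: int) -> list[int]:
    if n_dims_per_step <= 0 or n_dims_per_step >= len(matryoshka_dims):
        return list(matryoshka_dims)                         # new config: -1
    return random.sample(matryoshka_dims, n_dims_per_step)   # old config: 4

dims = [1024, 512, 256, 128, 64, 32, 16, 8]
print(dims_for_step(dims, -1))  # all eight dims, every step
print(dims_for_step(dims, 4))   # a random subset of four
```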
config_sentence_transformers.json CHANGED
@@ -1,7 +1,7 @@
  {
  "model_type": "SentenceTransformer",
  "__version__": {
- "sentence_transformers": "5.1.1",
+ "sentence_transformers": "5.1.2",
  "transformers": "4.57.1",
  "pytorch": "2.8.0+cu128"
  },
configuration_modchembert.py CHANGED
@@ -37,14 +37,15 @@ class ModChemBertConfig(ModernBertConfig):
  - "max_cls": Element-wise max pooling over last k hidden states, then take CLS token
  - "cls_mha": Multi-head attention with CLS token as query and full sequence as keys/values
  - "max_seq_mha": Max pooling over last k states + multi-head attention with CLS as query
+ - "mean_seq_mha": Mean pooling over last k states + multi-head attention with CLS as query
  - "max_seq_mean": Max pooling over last k hidden states, then mean pooling over sequence
  Defaults to "sum_mean".
  classifier_pooling_num_attention_heads (int, optional): Number of attention heads for multi-head attention
- pooling strategies (cls_mha, max_seq_mha). Defaults to 4.
+ pooling strategies (cls_mha, max_seq_mha, mean_seq_mha). Defaults to 4.
  classifier_pooling_attention_dropout (float, optional): Dropout probability for multi-head attention
- pooling strategies (cls_mha, max_seq_mha). Defaults to 0.0.
- classifier_pooling_last_k (int, optional): Number of last hidden layers to use for max pooling
- strategies (max_cls, max_seq_mha, max_seq_mean). Defaults to 8.
+ pooling strategies (cls_mha, max_seq_mha, mean_seq_mha). Defaults to 0.0.
+ classifier_pooling_last_k (int, optional): Number of last hidden layers to use for max/mean pooling
+ strategies (max_cls, max_seq_mha, mean_seq_mha, max_seq_mean). Defaults to 8.
  *args: Variable length argument list passed to ModernBertConfig.
  **kwargs: Arbitrary keyword arguments passed to ModernBertConfig.

@@ -68,6 +69,7 @@ class ModChemBertConfig(ModernBertConfig):
  "max_cls",
  "cls_mha",
  "max_seq_mha",
+ "mean_seq_mha",
  "max_seq_mean",
  ] = "max_seq_mha",
  classifier_pooling_num_attention_heads: int = 4,
@@ -75,6 +77,25 @@ class ModChemBertConfig(ModernBertConfig):
  classifier_pooling_last_k: int = 8,
  **kwargs,
  ):
+ valid_classifier_pooling_options = [
+ "cls",
+ "mean",
+ "sum_mean",
+ "sum_sum",
+ "mean_mean",
+ "mean_sum",
+ "max_cls",
+ "cls_mha",
+ "max_seq_mha",
+ "mean_seq_mha",
+ "max_seq_mean",
+ ]
+ if classifier_pooling not in valid_classifier_pooling_options:
+ raise ValueError(
+ f"Invalid value for `classifier_pooling`, should be one of {valid_classifier_pooling_options}, "
+ f"but is {classifier_pooling}."
+ )
+
  # Pass classifier_pooling="cls" to circumvent ValueError in ModernBertConfig init
  super().__init__(*args, classifier_pooling="cls", **kwargs)
  # Override with custom value
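The new constructor guard makes an invalid `classifier_pooling` fail fast instead of surfacing later at runtime. A minimal sketch of its behavior, assuming `configuration_modchembert.py` is importable from a local checkout of this repository (the import path is an assumption, not a published package).

```python
# Assumes configuration_modchembert.py from this commit sits next to this script.
from configuration_modchembert import ModChemBertConfig

cfg = ModChemBertConfig(classifier_pooling="mean_seq_mha")  # newly accepted option
print(cfg.classifier_pooling)

try:
    ModChemBertConfig(classifier_pooling="not_a_real_option")
except ValueError as err:
    print(err)  # the added guard lists the valid choices
```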
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4bdfc920fcc3c65314ef0cf0f5129884443d23748f09e23467015d54d5338ce4
+ oid sha256:ed6105dfe64c12207b1e1155a0d85c30cdadebcb42a6f0ea216dc36c3c28cf0c
  size 397110232
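The pointer above only records the new weight file's SHA-256 and byte size; the ~397 MB payload itself lives in Git LFS. A small check, assuming the file has been downloaded next to the script, that a local copy matches this commit.

```python
import hashlib

# Expected digest taken from the LFS pointer in this commit.
EXPECTED = "ed6105dfe64c12207b1e1155a0d85c30cdadebcb42a6f0ea216dc36c3c28cf0c"

sha = hashlib.sha256()
with open("model.safetensors", "rb") as f:  # assumed local download path
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

print(sha.hexdigest() == EXPECTED)  # True if the local file matches this commit
```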
modeling_modchembert.py CHANGED
@@ -19,9 +19,9 @@
  # Modifications include:
  # - Additional classifier_pooling options for ModChemBertForSequenceClassification
  # - sum_mean, sum_sum, mean_sum, mean_mean: from ChemLM (utilizes all hidden states)
- # - max_cls, cls_mha, max_seq_mha: from MaxPoolBERT (utilizes last k hidden states)
+ # - max_cls, cls_mha, max_seq_mha, mean_seq_mha: from MaxPoolBERT (utilizes last k hidden states)
  # - max_seq_mean: a merge between sum_mean and max_cls (utilizes last k hidden states)
- # - Addition of ModChemBertPoolingAttention for cls_mha and max_seq_mha pooling options
+ # - Addition of ModChemBertPoolingAttention for cls_mha, max_seq_mha, and mean_seq_mha pooling options

  import copy
  import math
@@ -122,11 +122,7 @@ class ModChemBertPoolingAttention(nn.Module):
  self.rotary_emb = ModernBertRotaryEmbedding(config=config_copy)

  self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias)
- self.out_drop = (
- nn.Dropout(config.attention_dropout)
- if config.attention_dropout > 0.0
- else nn.Identity()
- )
+ self.out_drop = nn.Dropout(config.attention_dropout) if config.attention_dropout > 0.0 else nn.Identity()
  self.pruned_heads = set()

  def forward(
@@ -179,14 +175,9 @@ class ModChemBertModel(ModernBertPreTrainedModel):
  self.config = config
  self.embeddings = ModernBertEmbeddings(config)
  self.layers = nn.ModuleList(
- [
- ModernBertEncoderLayer(config, layer_id)
- for layer_id in range(config.num_hidden_layers)
- ]
- )
- self.final_norm = nn.LayerNorm(
- config.hidden_size, eps=config.norm_eps, bias=config.norm_bias
+ [ModernBertEncoderLayer(config, layer_id) for layer_id in range(config.num_hidden_layers)]
  )
+ self.final_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
  self.gradient_checkpointing = False
  self.post_init()

@@ -228,13 +219,9 @@ class ModChemBertModel(ModernBertPreTrainedModel):
  seq_len (`int`, *optional*):
  Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
  """ # noqa: E501
- output_attentions = (
- output_attentions if output_attentions is not None else self.config.output_attentions
- )
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  output_hidden_states = (
- output_hidden_states
- if output_hidden_states is not None
- else self.config.output_hidden_states
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  )
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -316,25 +303,19 @@ class ModChemBertModel(ModernBertPreTrainedModel):
  )
  if all_hidden_states is not None:
  all_hidden_states = tuple(
- _pad_modernbert_output(
- inputs=hs, indices=indices, batch=batch_size, seqlen=seq_len
- ) # type: ignore
+ _pad_modernbert_output(inputs=hs, indices=indices, batch=batch_size, seqlen=seq_len) # type: ignore
  for hs in all_hidden_states
  )

  if not return_dict:
- return tuple(
- v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None
- )
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
  return BaseModelOutput(
  last_hidden_state=hidden_states, # type: ignore
  hidden_states=all_hidden_states, # type: ignore
  attentions=all_self_attentions,
  )

- def _update_attention_mask(
- self, attention_mask: torch.Tensor, output_attentions: bool
- ) -> torch.Tensor:
+ def _update_attention_mask(self, attention_mask: torch.Tensor, output_attentions: bool) -> torch.Tensor:
  if output_attentions:
  if self.config._attn_implementation == "sdpa":
  logger.warning_once( # type: ignore
@@ -357,16 +338,9 @@ class ModChemBertModel(ModernBertPreTrainedModel):
  distance = torch.abs(rows - rows.T)

  # Create sliding window mask (1 for positions within window, 0 outside)
- window_mask = (
- (distance <= self.config.local_attention // 2)
- .unsqueeze(0)
- .unsqueeze(0)
- .to(attention_mask.device)
- )
+ window_mask = (distance <= self.config.local_attention // 2).unsqueeze(0).unsqueeze(0).to(attention_mask.device)
  # Combine with existing mask
- sliding_window_mask = global_attention_mask.masked_fill(
- window_mask.logical_not(), torch.finfo(self.dtype).min
- )
+ sliding_window_mask = global_attention_mask.masked_fill(window_mask.logical_not(), torch.finfo(self.dtype).min)

  return global_attention_mask, sliding_window_mask # type: ignore

@@ -445,28 +419,22 @@ class ModChemBertForMaskedLM(InitWeightsMixin, ModernBertPreTrainedModel):
  device = input_ids.device if input_ids is not None else inputs_embeds.device # type: ignore

  if attention_mask is None:
- attention_mask = torch.ones(
- (batch_size, seq_len), device=device, dtype=torch.bool
- ) # type: ignore
+ attention_mask = torch.ones((batch_size, seq_len), device=device, dtype=torch.bool) # type: ignore

  if inputs_embeds is None:
  with torch.no_grad():
- input_ids, indices, cu_seqlens, max_seqlen, position_ids, labels = (
- _unpad_modernbert_input(
- inputs=input_ids, # type: ignore
- attention_mask=attention_mask, # type: ignore
- position_ids=position_ids,
- labels=labels,
- )
- )
- else:
- inputs_embeds, indices, cu_seqlens, max_seqlen, position_ids, labels = (
- _unpad_modernbert_input(
- inputs=inputs_embeds,
+ input_ids, indices, cu_seqlens, max_seqlen, position_ids, labels = _unpad_modernbert_input(
+ inputs=input_ids, # type: ignore
  attention_mask=attention_mask, # type: ignore
  position_ids=position_ids,
  labels=labels,
  )
+ else:
+ inputs_embeds, indices, cu_seqlens, max_seqlen, position_ids, labels = _unpad_modernbert_input(
+ inputs=inputs_embeds,
+ attention_mask=attention_mask, # type: ignore
+ position_ids=position_ids,
+ labels=labels,
  )

  outputs = self.model(
@@ -507,14 +475,8 @@ class ModChemBertForMaskedLM(InitWeightsMixin, ModernBertPreTrainedModel):
  loss = self.loss_function(logits, labels, vocab_size=self.config.vocab_size, **kwargs)

  if self.config._attn_implementation == "flash_attention_2":
- with (
- nullcontext()
- if self.config.repad_logits_with_grad or labels is None
- else torch.no_grad()
- ):
- logits = _pad_modernbert_output(
- inputs=logits, indices=indices, batch=batch_size, seqlen=seq_len
- ) # type: ignore
+ with nullcontext() if self.config.repad_logits_with_grad or labels is None else torch.no_grad():
+ logits = _pad_modernbert_output(inputs=logits, indices=indices, batch=batch_size, seqlen=seq_len) # type: ignore

  if not return_dict:
  output = (logits,)
@@ -537,7 +499,7 @@ class ModChemBertForSequenceClassification(InitWeightsMixin, ModernBertPreTraine
  self.config = config

  self.model = ModernBertModel(config)
- if self.config.classifier_pooling in {"cls_mha", "max_seq_mha"}:
+ if self.config.classifier_pooling in {"cls_mha", "max_seq_mha", "mean_seq_mha"}:
  self.pooling_attn = ModChemBertPoolingAttention(config=self.config)
  else:
  self.pooling_attn = None
@@ -638,9 +600,7 @@ class ModChemBertForSequenceClassification(InitWeightsMixin, ModernBertPreTraine
  if self.config.problem_type is None:
  if self.num_labels == 1:
  self.config.problem_type = "regression"
- elif self.num_labels > 1 and (
- labels.dtype == torch.long or labels.dtype == torch.int
- ):
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
  self.config.problem_type = "single_label_classification"
  else:
  self.config.problem_type = "multi_label_classification"
@@ -689,6 +649,7 @@ def _pool_modchembert_output(
  - max_cls: Element-wise max pooling over the last k hidden states, then take CLS token
  - cls_mha: Multi-head attention with CLS token as query and full sequence as keys/values
  - max_seq_mha: Max pooling over last k states + multi-head attention with CLS as query
+ - mean_seq_mha: Mean pooling over last k states + multi-head attention with CLS as query
  - max_seq_mean: Max pooling over last k hidden states, then mean pooling over sequence
  - sum_mean: Sum all hidden states across layers, then mean pool over sequence
  - sum_sum: Sum all hidden states across layers, then sum pool over sequence
@@ -705,22 +666,20 @@ def _pool_modchembert_output(
  torch.Tensor: Pooled representation of shape (batch_size, hidden_size)

  Note:
- Some pooling strategies (cls_mha, max_seq_mha) require the module to have a pooling_attn
+ Some pooling strategies (cls_mha, max_seq_mha, mean_seq_mha) require the module to have a pooling_attn
  attribute containing a ModChemBertPoolingAttention instance.
  """
  config = typing.cast(ModChemBertConfig, module.config)
  if config.classifier_pooling == "cls":
  last_hidden_state = last_hidden_state[:, 0]
  elif config.classifier_pooling == "mean":
- last_hidden_state = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(
- dim=1
- ) / attention_mask.sum(dim=1, keepdim=True)
+ last_hidden_state = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(
+ dim=1, keepdim=True
+ )
  elif config.classifier_pooling == "max_cls":
  k_hidden_states = hidden_states[-config.classifier_pooling_last_k :]
  theta = torch.stack(k_hidden_states, dim=1) # (batch, k, seq_len, hidden)
- pooled_seq = torch.max(
- theta, dim=1
- ).values # Element-wise max over k -> (batch, seq_len, hidden)
+ pooled_seq = torch.max(theta, dim=1).values # Element-wise max over k -> (batch, seq_len, hidden)
  last_hidden_state = pooled_seq[:, 0, :] # (batch, hidden)
  elif config.classifier_pooling == "cls_mha":
  # Similar to max_seq_mha but without the max pooling step
@@ -731,12 +690,13 @@ def _pool_modchembert_output(
  q=q, kv=last_hidden_state, attention_mask=attention_mask
  ) # (batch, seq_len, hidden)
  last_hidden_state = torch.mean(attn_out, dim=1)
- elif config.classifier_pooling == "max_seq_mha":
+ elif config.classifier_pooling in {"max_seq_mha", "mean_seq_mha"}:
  k_hidden_states = hidden_states[-config.classifier_pooling_last_k :]
  theta = torch.stack(k_hidden_states, dim=1) # (batch, k, seq_len, hidden)
- pooled_seq = torch.max(
- theta, dim=1
- ).values # Element-wise max over k -> (batch, seq_len, hidden)
+ if config.classifier_pooling == "max_seq_mha":
+ pooled_seq = torch.max(theta, dim=1).values # Element-wise max over k -> (batch, seq_len, hidden)
+ else:
+ pooled_seq = torch.mean(theta, dim=1) # Element-wise mean over k -> (batch, seq_len, hidden)
  # Query is pooled CLS token (position 0); Keys/Values are pooled sequence
  q = pooled_seq[:, 0, :].unsqueeze(1) # (batch, 1, hidden)
  q = q.expand(-1, pooled_seq.shape[1], -1) # (batch, seq_len, hidden)
@@ -747,9 +707,7 @@
  elif config.classifier_pooling == "max_seq_mean":
  k_hidden_states = hidden_states[-config.classifier_pooling_last_k :]
  theta = torch.stack(k_hidden_states, dim=1) # (batch, k, seq_len, hidden)
- pooled_seq = torch.max(
- theta, dim=1
- ).values # Element-wise max over k -> (batch, seq_len, hidden)
+ pooled_seq = torch.max(theta, dim=1).values # Element-wise max over k -> (batch, seq_len, hidden)
  last_hidden_state = torch.mean(pooled_seq, dim=1) # Mean over sequence length
  elif config.classifier_pooling == "sum_mean":
  # ChemLM uses the mean of all hidden states
@@ -775,6 +733,7 @@


  __all__ = [
+ "ModChemBertModel",
  "ModChemBertForMaskedLM",
  "ModChemBertForSequenceClassification",
  ]
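The new `mean_seq_mha` branch mean-pools the last k hidden states and then runs the same CLS-as-query attention that `max_seq_mha` uses. A toy-tensor sketch of the shapes involved, with `torch.nn.MultiheadAttention` standing in for `ModChemBertPoolingAttention` and made-up dimensions for illustration.

```python
import torch

# Toy dimensions; the real model's hidden size and layer count differ.
batch, seq_len, hidden, num_layers, k = 2, 16, 32, 12, 8
hidden_states = [torch.randn(batch, seq_len, hidden) for _ in range(num_layers + 1)]

theta = torch.stack(hidden_states[-k:], dim=1)  # (batch, k, seq_len, hidden)
pooled_seq = torch.mean(theta, dim=1)           # mean_seq_mha: mean over k -> (batch, seq_len, hidden)

q = pooled_seq[:, 0, :].unsqueeze(1)            # CLS position as the query (batch, 1, hidden)
q = q.expand(-1, pooled_seq.shape[1], -1)       # broadcast to (batch, seq_len, hidden), as in the diff

attn = torch.nn.MultiheadAttention(hidden, num_heads=4, batch_first=True)
attn_out, _ = attn(q, pooled_seq, pooled_seq)   # stand-in for ModChemBertPoolingAttention
pooled = attn_out.mean(dim=1)                   # (batch, hidden) pooled representation
print(pooled.shape)
```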
similarity_evaluation_pubchem_10m_genmol_similarity_float32_results.csv CHANGED
@@ -1,14 +1,14 @@
  epoch,steps,spearman
  0,0,0.7261446896400275
- 0.2499925700642937,25235,0.899727524994741
- 0.4999851401285874,50470,0.9599428082697957
- 0.7499777101928812,75705,0.9755030703217896
- 0.9999702802571748,100940,0.9809624466313892
- 1.2499628503214686,126175,0.9838128954121899
- 1.4999554203857621,151410,0.9854756886661312
- 1.749947990450056,176645,0.9865980464822579
- 1.9999405605143497,201880,0.9873943693937194
- 2.2499331305786434,227115,0.9878659546563734
- 2.499925700642937,252350,0.9879865870047979
- 2.749918270707231,277585,0.9881075350289332
- 2.9999108407715243,302820,0.9881056976837288
+ 0.2499925700642937,25235,0.906718265018918
+ 0.4999851401285874,50470,0.9655444741087182
+ 0.7499777101928812,75705,0.9779964615343857
+ 0.9999702802571748,100940,0.9828579834801283
+ 1.2499628503214686,126175,0.9855222540861318
+ 1.4999554203857621,151410,0.986820997047069
+ 1.749947990450056,176645,0.9879349539641308
+ 1.9999405605143497,201880,0.9885304751015874
+ 2.2499331305786434,227115,0.9889206748932795
+ 2.499925700642937,252350,0.989034117619882
+ 2.749918270707231,277585,0.9891366381020936
+ 2.9999108407715243,302820,0.9891427187036199
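The refreshed evaluation trace ends at a Spearman of roughly 0.9891 at step 302820. A quick way to inspect it, assuming the CSV has been downloaded locally under its committed filename.

```python
import pandas as pd

# Assumes the CSV sits in the working directory under its committed filename.
df = pd.read_csv("similarity_evaluation_pubchem_10m_genmol_similarity_float32_results.csv")
print(df.tail(3))              # Spearman climbs to ~0.9891 by the end of epoch 3
print(df["spearman"].max())
```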