Training in progress, step 100
- README.md +6 -6
- config.json +86 -53
- generation_config.json +14 -0
- model-00001-of-00002.safetensors +1 -1
- model-00002-of-00002.safetensors +1 -1
- training_args.bin +2 -2
README.md
CHANGED
@@ -1,17 +1,17 @@
 ---
-base_model: google/gemma-3-
+base_model: google/gemma-3-4b-it
 library_name: transformers
 model_name: nmt_21
 tags:
 - generated_from_trainer
-- grpo
 - trl
+- grpo
 licence: license
 ---
 
 # Model Card for nmt_21
 
-This model is a fine-tuned version of [google/gemma-3-
+This model is a fine-tuned version of [google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 
 ## Quick start
@@ -27,15 +27,15 @@ print(output["generated_text"])
 
 ## Training procedure
 
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/oleg-dats/nmt/runs/
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/oleg-dats/nmt/runs/yn1sslos)
 
 
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
 ### Framework versions
 
-- TRL: 0.23.
-- Transformers: 4.
+- TRL: 0.23.1
+- Transformers: 4.57.0
 - Pytorch: 2.5.1+cu124
 - Datasets: 4.1.1
 - Tokenizers: 0.22.1
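The hunk header above shows the card's Quick start ends with `print(output["generated_text"])`, the usual TRL model-card pattern built on the transformers `pipeline`. A minimal sketch of that pattern follows; the repo id `oleg-dats/nmt_21` and the prompt are illustrative assumptions, not copied from the card.

```python
# Minimal sketch of a TRL-style Quick start; repo id and prompt are assumptions.
from transformers import pipeline

generator = pipeline("text-generation", model="oleg-dats/nmt_21", device_map="auto")

question = "Translate to French: The weather is nice today."
output = generator(
    [{"role": "user", "content": question}],  # chat-format input, templated by the tokenizer
    max_new_tokens=128,
    return_full_text=False,
)[0]
print(output["generated_text"])
```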
config.json
CHANGED
@@ -1,62 +1,95 @@
 {
-  "_sliding_window_pattern": 6,
   "architectures": [
-    "
+    "Gemma3ForConditionalGeneration"
   ],
-  "
-  "attention_dropout": 0.0,
-  "attn_logit_softcapping": null,
+  "boi_token_index": 255999,
   "bos_token_id": 2,
-  "cache_implementation": "hybrid",
   "dtype": "bfloat16",
+  "eoi_token_index": 256000,
   "eos_token_id": 1,
-  "
-  "head_dim": 256,
-  "hidden_activation": "gelu_pytorch_tanh",
-  "hidden_size": 1152,
+  "image_token_index": 262144,
   "initializer_range": 0.02,
-  "
-  "
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "sliding_attention"
-  ],
-  "max_position_embeddings": 32768,
-  "model_type": "gemma3_text",
-  "num_attention_heads": 4,
-  "num_hidden_layers": 26,
-  "num_key_value_heads": 1,
+  "mm_tokens_per_image": 256,
+  "model_type": "gemma3",
   "pad_token_id": 0,
-  "
-
-
-
-
-
-
-
-
+  "text_config": {
+    "_sliding_window_pattern": 6,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attn_logit_softcapping": null,
+    "final_logit_softcapping": null,
+    "head_dim": 256,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 10240,
+    "layer_types": [
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention"
+    ],
+    "max_position_embeddings": 131072,
+    "model_type": "gemma3_text",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 34,
+    "num_key_value_heads": 4,
+    "query_pre_attn_scalar": 256,
+    "rms_norm_eps": 1e-06,
+    "rope_local_base_freq": 10000.0,
+    "rope_scaling": {
+      "factor": 8.0,
+      "rope_type": "linear"
+    },
+    "rope_theta": 1000000.0,
+    "sliding_window": 1024,
+    "use_bidirectional_attention": false,
+    "use_cache": true,
+    "vocab_size": 262208
+  },
+  "transformers_version": "4.57.0",
+  "vision_config": {
+    "attention_dropout": 0.0,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 896,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "patch_size": 14,
+    "vision_use_head": false
+  }
 }
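This change replaces the flat text-only `gemma3_text` config (hidden_size 1152, 26 layers) with the multimodal `gemma3` layout of gemma-3-4b-it, which nests the language model under `text_config` (hidden_size 2560, 34 layers) and adds a SigLIP `vision_config`. A quick sketch of how the nested fields surface in transformers; the repo id `oleg-dats/nmt_21` is an assumption for illustration:

```python
# Sketch: inspecting the nested Gemma3 config layout introduced by this commit.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("oleg-dats/nmt_21")  # assumed repo id

print(config.model_type)                      # "gemma3" (multimodal wrapper)
print(config.text_config.hidden_size)         # 2560
print(config.text_config.num_hidden_layers)   # 34
print(config.vision_config.model_type)        # "siglip_vision_model"
print(config.vision_config.image_size)        # 896
```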
generation_config.json
ADDED
@@ -0,0 +1,14 @@
+{
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "do_sample": true,
+  "eos_token_id": [
+    1,
+    1,
+    106
+  ],
+  "pad_token_id": 0,
+  "top_k": 64,
+  "top_p": 0.95,
+  "transformers_version": "4.57.0"
+}
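The new generation_config.json enables sampling by default (top_k 64, top_p 0.95) and registers multiple stop ids. A sketch of the same defaults expressed as an explicit `GenerationConfig`, e.g. to override them at call time; the comment on token 106 assumes it is Gemma 3's `<end_of_turn>`:

```python
# Sketch: the defaults from generation_config.json as an explicit GenerationConfig.
from transformers import GenerationConfig

gen_config = GenerationConfig(
    bos_token_id=2,
    eos_token_id=[1, 1, 106],       # 106 assumed to be Gemma 3's <end_of_turn>
    pad_token_id=0,
    do_sample=True,                 # sample instead of greedy decoding
    top_k=64,
    top_p=0.95,
    cache_implementation="hybrid",  # hybrid sliding/full attention KV cache
)
# Passing generation_config=gen_config to model.generate(...) applies these
# sampling settings and stops on either <eos> (1) or token 106.
```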
model-00001-of-00002.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:7bc859417ffb5d286952146634b33bf56d1a95155b2a0d01c5a4828dc673330f
 size 4961251752
model-00002-of-00002.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:90c7dabe82c7e4abac3c9af029ecff020a1d863885c95bb1d959939a8a0e5d80
 size 3639026128
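The two safetensors shards total roughly 8.6 GB, consistent with a ~4B-parameter checkpoint stored in bfloat16. A minimal sketch of loading the sharded weights with the architecture named in config.json; the repo id is again an illustrative assumption:

```python
# Sketch: loading the two-shard bfloat16 checkpoint; repo id is an assumption.
import torch
from transformers import AutoProcessor, Gemma3ForConditionalGeneration

model = Gemma3ForConditionalGeneration.from_pretrained(
    "oleg-dats/nmt_21",
    torch_dtype=torch.bfloat16,  # matches "dtype": "bfloat16" in config.json
    device_map="auto",
)
processor = AutoProcessor.from_pretrained("oleg-dats/nmt_21")
```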
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size 
+oid sha256:54eac288f57ef58be29ddb0e8adea9bc712650afeaf8d7d4f42255c44a942cbb
+size 6840
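The remaining changes only update Git LFS pointers (sha256 oid plus byte size) for the binary artifacts. A small sketch of checking a downloaded file against its pointer, using the training_args.bin oid from this commit; the local filename is assumed:

```python
# Sketch: verify a downloaded artifact against the sha256 oid in its LFS pointer.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "54eac288f57ef58be29ddb0e8adea9bc712650afeaf8d7d4f42255c44a942cbb"
assert sha256_of("training_args.bin") == expected  # assumed local path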