diff --git a/checkpoint-12208/config.json b/checkpoint-12208/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aaa74176892d42e1c7f5979b7ddf8ab15985d3 --- /dev/null +++ b/checkpoint-12208/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "/mnt/parscratch/users/acp23ay/private/models/Llama-3.1-8B-Instruct-ta-madlad-mean/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 138256 +} diff --git a/checkpoint-12208/generation_config.json b/checkpoint-12208/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-12208/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-12208/model-00001-of-00007.safetensors b/checkpoint-12208/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0c1724db00e664e2e52d75210ad9fac42c13401f --- /dev/null +++ b/checkpoint-12208/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42c35fcef8e1e04d32dea15a29c5e93a7f75e90fc909cfdd491e8842a0578dd4 +size 4983197184 diff --git a/checkpoint-12208/model-00002-of-00007.safetensors b/checkpoint-12208/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d32d83cb96a109af411b9cf577e7fbfe07ea76fc --- /dev/null +++ b/checkpoint-12208/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168629c67732309d49f60a5ec48a6d160212bc987365e82e183dfbf74ba0c1f3 +size 4899116432 diff --git a/checkpoint-12208/model-00003-of-00007.safetensors b/checkpoint-12208/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-12208/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-12208/model-00004-of-00007.safetensors b/checkpoint-12208/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-12208/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-12208/model-00005-of-00007.safetensors b/checkpoint-12208/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-12208/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-12208/model-00006-of-00007.safetensors b/checkpoint-12208/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..59929f77abc48da8d7739461781ef3d200968efe --- /dev/null +++ b/checkpoint-12208/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db159783b9bec981ff0f9d655f42d8ecd197ca2a6cc0fb5224070bc07d606d4d +size 4999813120 diff --git a/checkpoint-12208/model-00007-of-00007.safetensors b/checkpoint-12208/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ae909653d4cd0db7cbb64cdb1918f8cf1f023dc3 --- /dev/null +++ b/checkpoint-12208/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b77c27004239a1bce35451437813db39c418bd8f08b406ad22af0b2cd9453374 +size 2734998184 diff --git a/checkpoint-12208/model.safetensors.index.json b/checkpoint-12208/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..318803c6a3dd771c7f7c3b8038a896af7c8322ae --- /dev/null +++ b/checkpoint-12208/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32448724992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-12208/optimizer.pt b/checkpoint-12208/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..50e7d8cfec76b0b87b2ed14e4ac55ac354375f7d --- /dev/null +++ b/checkpoint-12208/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ba069faf91b22bc7301922d2796c42b030a97971b349e79197c9958e4d8d66e +size 16040396334 diff --git a/checkpoint-12208/rng_state.pth b/checkpoint-12208/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-12208/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-12208/scheduler.pt b/checkpoint-12208/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed3b066b7a770b71a74f026fa108a814ac17f832 --- /dev/null +++ b/checkpoint-12208/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abe644ed33a3c4139223f0857a985127f3e6fbaa8c89fa14b57671b49ca52c21 +size 1064 diff --git a/checkpoint-12208/trainer_state.json b/checkpoint-12208/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..da27e894a3373fc6542c5592bd1d5880ee76c329 --- /dev/null +++ b/checkpoint-12208/trainer_state.json @@ -0,0 +1,2784 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4057431534166445, + "eval_steps": 500, + "global_step": 12208, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001030311087476735, + "grad_norm": 60.25547409057617, + "learning_rate": 1.0157273918741808e-06, + "loss": 8.8455, + "step": 31 + }, + { + "epoch": 0.00206062217495347, + "grad_norm": 15.669363975524902, + "learning_rate": 2.0314547837483616e-06, + "loss": 7.1553, + "step": 62 + }, + { + "epoch": 0.003090933262430205, + "grad_norm": 15.366345405578613, + "learning_rate": 3.0471821756225426e-06, + "loss": 5.8784, + "step": 93 + }, + { + "epoch": 0.00412124434990694, + "grad_norm": 36.30561828613281, + "learning_rate": 4.062909567496723e-06, + "loss": 4.7708, + "step": 124 + }, + { + "epoch": 0.005151555437383675, + "grad_norm": 27.202678680419922, + "learning_rate": 5.078636959370905e-06, + "loss": 4.1629, + "step": 155 + }, + { + "epoch": 0.00618186652486041, + "grad_norm": 24.30484962463379, + "learning_rate": 6.094364351245085e-06, + "loss": 3.867, + "step": 186 + }, + { + "epoch": 0.007212177612337145, + "grad_norm": 19.916366577148438, + "learning_rate": 7.110091743119267e-06, + "loss": 3.6131, + "step": 217 + }, + { + "epoch": 0.00824248869981388, + "grad_norm": 17.577274322509766, + "learning_rate": 8.125819134993446e-06, + "loss": 3.4772, + "step": 248 + }, + { + "epoch": 0.009272799787290615, + "grad_norm": 12.133190155029297, + "learning_rate": 9.141546526867629e-06, + "loss": 3.3218, + "step": 279 + }, + { + "epoch": 0.01030311087476735, + "grad_norm": 19.79263687133789, + "learning_rate": 1.015727391874181e-05, + "loss": 3.2055, + "step": 310 + }, + { + "epoch": 0.011333421962244085, + "grad_norm": 16.38133430480957, + "learning_rate": 1.117300131061599e-05, + "loss": 3.1062, + "step": 341 + }, + { + "epoch": 0.01236373304972082, + "grad_norm": 12.638299942016602, + "learning_rate": 1.218872870249017e-05, + "loss": 3.0106, + "step": 372 + }, + { + "epoch": 0.013394044137197554, + "grad_norm": 9.46596908569336, + "learning_rate": 1.3204456094364351e-05, + "loss": 2.924, + "step": 403 + }, + { + "epoch": 0.01442435522467429, + "grad_norm": 10.945392608642578, + "learning_rate": 1.4220183486238533e-05, + "loss": 2.844, + "step": 434 + }, + { + "epoch": 0.015454666312151024, + "grad_norm": 8.474015235900879, + "learning_rate": 1.5235910878112714e-05, + "loss": 2.7892, + "step": 465 + }, + { + "epoch": 0.01648497739962776, + "grad_norm": 9.370804786682129, + "learning_rate": 1.6251638269986893e-05, + "loss": 2.7509, + "step": 496 + }, + { + "epoch": 0.017515288487104493, + "grad_norm": 11.63398551940918, + "learning_rate": 1.7267365661861077e-05, + "loss": 2.6999, + "step": 527 + }, + { + "epoch": 0.01854559957458123, + "grad_norm": 9.17713451385498, + "learning_rate": 1.8283093053735257e-05, + "loss": 2.6459, + "step": 558 + }, + { + "epoch": 0.019575910662057962, + "grad_norm": 7.119054794311523, + "learning_rate": 1.9298820445609438e-05, + "loss": 2.603, + "step": 589 + }, + { + "epoch": 0.0206062217495347, + "grad_norm": 6.653646945953369, + "learning_rate": 2.031454783748362e-05, + "loss": 2.5588, + "step": 620 + }, + { + "epoch": 0.021636532837011432, + "grad_norm": 8.332653045654297, + "learning_rate": 2.13302752293578e-05, + "loss": 2.5357, + "step": 651 + }, + { + "epoch": 0.02266684392448817, + "grad_norm": 6.4949116706848145, + "learning_rate": 2.234600262123198e-05, + "loss": 2.4967, + "step": 682 + }, + { + "epoch": 0.023697155011964902, + "grad_norm": 9.41009521484375, + "learning_rate": 2.336173001310616e-05, + "loss": 2.4563, + "step": 713 + }, + { + "epoch": 0.02472746609944164, + "grad_norm": 7.840345859527588, + "learning_rate": 2.437745740498034e-05, + "loss": 2.4383, + "step": 744 + }, + { + "epoch": 0.025757777186918372, + "grad_norm": 6.116458415985107, + "learning_rate": 2.5393184796854525e-05, + "loss": 2.3817, + "step": 775 + }, + { + "epoch": 0.02678808827439511, + "grad_norm": 5.938300609588623, + "learning_rate": 2.6408912188728702e-05, + "loss": 2.3508, + "step": 806 + }, + { + "epoch": 0.027818399361871842, + "grad_norm": 5.4408345222473145, + "learning_rate": 2.7424639580602886e-05, + "loss": 2.3325, + "step": 837 + }, + { + "epoch": 0.02884871044934858, + "grad_norm": 5.375136375427246, + "learning_rate": 2.8440366972477066e-05, + "loss": 2.3101, + "step": 868 + }, + { + "epoch": 0.029879021536825312, + "grad_norm": 5.149726867675781, + "learning_rate": 2.9456094364351244e-05, + "loss": 2.282, + "step": 899 + }, + { + "epoch": 0.03090933262430205, + "grad_norm": 4.591221332550049, + "learning_rate": 3.0471821756225428e-05, + "loss": 2.2427, + "step": 930 + }, + { + "epoch": 0.031939643711778785, + "grad_norm": 4.977034091949463, + "learning_rate": 3.148754914809961e-05, + "loss": 2.2218, + "step": 961 + }, + { + "epoch": 0.03296995479925552, + "grad_norm": 5.038781642913818, + "learning_rate": 3.2503276539973785e-05, + "loss": 2.2044, + "step": 992 + }, + { + "epoch": 0.03400026588673225, + "grad_norm": 4.872281551361084, + "learning_rate": 3.351900393184797e-05, + "loss": 2.1657, + "step": 1023 + }, + { + "epoch": 0.035030576974208985, + "grad_norm": 4.370841979980469, + "learning_rate": 3.453473132372215e-05, + "loss": 2.1365, + "step": 1054 + }, + { + "epoch": 0.036060888061685725, + "grad_norm": 4.087072849273682, + "learning_rate": 3.555045871559633e-05, + "loss": 2.1253, + "step": 1085 + }, + { + "epoch": 0.03709119914916246, + "grad_norm": 4.113957405090332, + "learning_rate": 3.6566186107470514e-05, + "loss": 2.0973, + "step": 1116 + }, + { + "epoch": 0.03812151023663919, + "grad_norm": 4.0119733810424805, + "learning_rate": 3.7581913499344695e-05, + "loss": 2.1024, + "step": 1147 + }, + { + "epoch": 0.039151821324115925, + "grad_norm": 4.247573375701904, + "learning_rate": 3.8597640891218876e-05, + "loss": 2.0722, + "step": 1178 + }, + { + "epoch": 0.04018213241159266, + "grad_norm": 3.5575129985809326, + "learning_rate": 3.9613368283093056e-05, + "loss": 2.056, + "step": 1209 + }, + { + "epoch": 0.0412124434990694, + "grad_norm": 3.8885862827301025, + "learning_rate": 4.062909567496724e-05, + "loss": 2.0389, + "step": 1240 + }, + { + "epoch": 0.04224275458654613, + "grad_norm": 3.680628538131714, + "learning_rate": 4.164482306684142e-05, + "loss": 2.0385, + "step": 1271 + }, + { + "epoch": 0.043273065674022865, + "grad_norm": 3.780876874923706, + "learning_rate": 4.26605504587156e-05, + "loss": 2.0097, + "step": 1302 + }, + { + "epoch": 0.0443033767614996, + "grad_norm": 4.235328674316406, + "learning_rate": 4.367627785058978e-05, + "loss": 2.0024, + "step": 1333 + }, + { + "epoch": 0.04533368784897634, + "grad_norm": 3.326941967010498, + "learning_rate": 4.469200524246396e-05, + "loss": 1.9953, + "step": 1364 + }, + { + "epoch": 0.04636399893645307, + "grad_norm": 3.28456449508667, + "learning_rate": 4.570773263433814e-05, + "loss": 1.9579, + "step": 1395 + }, + { + "epoch": 0.047394310023929805, + "grad_norm": 16.107433319091797, + "learning_rate": 4.672346002621232e-05, + "loss": 1.9701, + "step": 1426 + }, + { + "epoch": 0.04842462111140654, + "grad_norm": 3.5708224773406982, + "learning_rate": 4.77391874180865e-05, + "loss": 1.9621, + "step": 1457 + }, + { + "epoch": 0.04945493219888328, + "grad_norm": 2.9053499698638916, + "learning_rate": 4.875491480996068e-05, + "loss": 1.9458, + "step": 1488 + }, + { + "epoch": 0.05048524328636001, + "grad_norm": 3.0863258838653564, + "learning_rate": 4.977064220183487e-05, + "loss": 1.9483, + "step": 1519 + }, + { + "epoch": 0.051515554373836744, + "grad_norm": 2.9012269973754883, + "learning_rate": 4.9999915451558777e-05, + "loss": 1.928, + "step": 1550 + }, + { + "epoch": 0.05254586546131348, + "grad_norm": 3.0949041843414307, + "learning_rate": 4.999955597496219e-05, + "loss": 1.9229, + "step": 1581 + }, + { + "epoch": 0.05357617654879022, + "grad_norm": 2.8687901496887207, + "learning_rate": 4.9998914381774255e-05, + "loss": 1.915, + "step": 1612 + }, + { + "epoch": 0.05460648763626695, + "grad_norm": 3.2136878967285156, + "learning_rate": 4.999799067923527e-05, + "loss": 1.9197, + "step": 1643 + }, + { + "epoch": 0.055636798723743684, + "grad_norm": 2.590843677520752, + "learning_rate": 4.999678487776908e-05, + "loss": 1.8756, + "step": 1674 + }, + { + "epoch": 0.05666710981122042, + "grad_norm": 2.64634108543396, + "learning_rate": 4.9995296990983006e-05, + "loss": 1.9033, + "step": 1705 + }, + { + "epoch": 0.05769742089869716, + "grad_norm": 3.0151331424713135, + "learning_rate": 4.999352703566763e-05, + "loss": 1.8883, + "step": 1736 + }, + { + "epoch": 0.05872773198617389, + "grad_norm": 2.526806354522705, + "learning_rate": 4.999147503179668e-05, + "loss": 1.8666, + "step": 1767 + }, + { + "epoch": 0.059758043073650624, + "grad_norm": 2.510300397872925, + "learning_rate": 4.998914100252672e-05, + "loss": 1.854, + "step": 1798 + }, + { + "epoch": 0.06078835416112736, + "grad_norm": 2.4867682456970215, + "learning_rate": 4.998652497419696e-05, + "loss": 1.8548, + "step": 1829 + }, + { + "epoch": 0.0618186652486041, + "grad_norm": 2.3920586109161377, + "learning_rate": 4.9983626976328927e-05, + "loss": 1.8495, + "step": 1860 + }, + { + "epoch": 0.06284897633608083, + "grad_norm": 2.714177370071411, + "learning_rate": 4.998044704162613e-05, + "loss": 1.8433, + "step": 1891 + }, + { + "epoch": 0.06387928742355757, + "grad_norm": 2.3094465732574463, + "learning_rate": 4.9976985205973705e-05, + "loss": 1.8382, + "step": 1922 + }, + { + "epoch": 0.0649095985110343, + "grad_norm": 2.47184419631958, + "learning_rate": 4.997324150843799e-05, + "loss": 1.8464, + "step": 1953 + }, + { + "epoch": 0.06593990959851104, + "grad_norm": 2.391841411590576, + "learning_rate": 4.99692159912661e-05, + "loss": 1.8179, + "step": 1984 + }, + { + "epoch": 0.06697022068598776, + "grad_norm": 2.2471864223480225, + "learning_rate": 4.996490869988546e-05, + "loss": 1.8149, + "step": 2015 + }, + { + "epoch": 0.0680005317734645, + "grad_norm": 2.5497376918792725, + "learning_rate": 4.996031968290326e-05, + "loss": 1.8099, + "step": 2046 + }, + { + "epoch": 0.06903084286094124, + "grad_norm": 2.330463409423828, + "learning_rate": 4.995544899210594e-05, + "loss": 1.8267, + "step": 2077 + }, + { + "epoch": 0.07006115394841797, + "grad_norm": 2.3259341716766357, + "learning_rate": 4.9950296682458583e-05, + "loss": 1.7801, + "step": 2108 + }, + { + "epoch": 0.07109146503589471, + "grad_norm": 2.1711952686309814, + "learning_rate": 4.994486281210429e-05, + "loss": 1.7961, + "step": 2139 + }, + { + "epoch": 0.07212177612337145, + "grad_norm": 2.1808884143829346, + "learning_rate": 4.9939147442363566e-05, + "loss": 1.8109, + "step": 2170 + }, + { + "epoch": 0.07315208721084818, + "grad_norm": 2.089256525039673, + "learning_rate": 4.9933150637733574e-05, + "loss": 1.8026, + "step": 2201 + }, + { + "epoch": 0.07418239829832492, + "grad_norm": 2.0864951610565186, + "learning_rate": 4.992687246588743e-05, + "loss": 1.7753, + "step": 2232 + }, + { + "epoch": 0.07521270938580164, + "grad_norm": 2.36157488822937, + "learning_rate": 4.992031299767347e-05, + "loss": 1.7746, + "step": 2263 + }, + { + "epoch": 0.07624302047327838, + "grad_norm": 2.5334439277648926, + "learning_rate": 4.9913472307114386e-05, + "loss": 1.7927, + "step": 2294 + }, + { + "epoch": 0.07727333156075512, + "grad_norm": 2.2565715312957764, + "learning_rate": 4.9906350471406446e-05, + "loss": 1.7668, + "step": 2325 + }, + { + "epoch": 0.07830364264823185, + "grad_norm": 2.1043128967285156, + "learning_rate": 4.989894757091861e-05, + "loss": 1.7771, + "step": 2356 + }, + { + "epoch": 0.07933395373570859, + "grad_norm": 1.9659819602966309, + "learning_rate": 4.989126368919158e-05, + "loss": 1.7666, + "step": 2387 + }, + { + "epoch": 0.08036426482318532, + "grad_norm": 2.0778403282165527, + "learning_rate": 4.988329891293693e-05, + "loss": 1.7405, + "step": 2418 + }, + { + "epoch": 0.08139457591066206, + "grad_norm": 2.1767923831939697, + "learning_rate": 4.987505333203608e-05, + "loss": 1.7495, + "step": 2449 + }, + { + "epoch": 0.0824248869981388, + "grad_norm": 2.260143280029297, + "learning_rate": 4.9866527039539276e-05, + "loss": 1.7504, + "step": 2480 + }, + { + "epoch": 0.08345519808561552, + "grad_norm": 2.18271803855896, + "learning_rate": 4.9857720131664594e-05, + "loss": 1.7456, + "step": 2511 + }, + { + "epoch": 0.08448550917309226, + "grad_norm": 2.209594964981079, + "learning_rate": 4.9848632707796773e-05, + "loss": 1.7528, + "step": 2542 + }, + { + "epoch": 0.085515820260569, + "grad_norm": 2.0666229724884033, + "learning_rate": 4.9839264870486155e-05, + "loss": 1.7517, + "step": 2573 + }, + { + "epoch": 0.08654613134804573, + "grad_norm": 2.1070454120635986, + "learning_rate": 4.9829616725447526e-05, + "loss": 1.7474, + "step": 2604 + }, + { + "epoch": 0.08757644243552247, + "grad_norm": 1.9430303573608398, + "learning_rate": 4.981968838155888e-05, + "loss": 1.7348, + "step": 2635 + }, + { + "epoch": 0.0886067535229992, + "grad_norm": 1.9638925790786743, + "learning_rate": 4.980947995086024e-05, + "loss": 1.7202, + "step": 2666 + }, + { + "epoch": 0.08963706461047594, + "grad_norm": 1.8845652341842651, + "learning_rate": 4.979899154855234e-05, + "loss": 1.7375, + "step": 2697 + }, + { + "epoch": 0.09066737569795268, + "grad_norm": 5.712058067321777, + "learning_rate": 4.9788223292995386e-05, + "loss": 1.7379, + "step": 2728 + }, + { + "epoch": 0.0916976867854294, + "grad_norm": 1.9520670175552368, + "learning_rate": 4.977717530570768e-05, + "loss": 1.7302, + "step": 2759 + }, + { + "epoch": 0.09272799787290614, + "grad_norm": 1.8802224397659302, + "learning_rate": 4.976584771136425e-05, + "loss": 1.74, + "step": 2790 + }, + { + "epoch": 0.09375830896038288, + "grad_norm": 2.1098153591156006, + "learning_rate": 4.975424063779547e-05, + "loss": 1.7024, + "step": 2821 + }, + { + "epoch": 0.09478862004785961, + "grad_norm": 2.1568291187286377, + "learning_rate": 4.974235421598557e-05, + "loss": 1.7131, + "step": 2852 + }, + { + "epoch": 0.09581893113533635, + "grad_norm": 1.8769980669021606, + "learning_rate": 4.973018858007122e-05, + "loss": 1.7008, + "step": 2883 + }, + { + "epoch": 0.09684924222281308, + "grad_norm": 1.8325533866882324, + "learning_rate": 4.9717743867339963e-05, + "loss": 1.7058, + "step": 2914 + }, + { + "epoch": 0.09787955331028982, + "grad_norm": 2.086416721343994, + "learning_rate": 4.9705020218228695e-05, + "loss": 1.711, + "step": 2945 + }, + { + "epoch": 0.09890986439776656, + "grad_norm": 1.8294793367385864, + "learning_rate": 4.969201777632205e-05, + "loss": 1.6998, + "step": 2976 + }, + { + "epoch": 0.09994017548524328, + "grad_norm": 2.0608153343200684, + "learning_rate": 4.9678736688350846e-05, + "loss": 1.6948, + "step": 3007 + }, + { + "epoch": 0.10097048657272002, + "grad_norm": 3.2166008949279785, + "learning_rate": 4.966517710419033e-05, + "loss": 1.6788, + "step": 3038 + }, + { + "epoch": 0.10200079766019676, + "grad_norm": 1.9431313276290894, + "learning_rate": 4.965133917685858e-05, + "loss": 1.7115, + "step": 3069 + }, + { + "epoch": 0.10303110874767349, + "grad_norm": 1.967512845993042, + "learning_rate": 4.9637223062514714e-05, + "loss": 1.7033, + "step": 3100 + }, + { + "epoch": 0.10406141983515023, + "grad_norm": 1.9253389835357666, + "learning_rate": 4.962282892045718e-05, + "loss": 1.6856, + "step": 3131 + }, + { + "epoch": 0.10509173092262696, + "grad_norm": 1.986840009689331, + "learning_rate": 4.9608156913121904e-05, + "loss": 1.723, + "step": 3162 + }, + { + "epoch": 0.1061220420101037, + "grad_norm": 1.83523690700531, + "learning_rate": 4.959320720608049e-05, + "loss": 1.6912, + "step": 3193 + }, + { + "epoch": 0.10715235309758044, + "grad_norm": 2.1271955966949463, + "learning_rate": 4.9577979968038354e-05, + "loss": 1.7032, + "step": 3224 + }, + { + "epoch": 0.10818266418505716, + "grad_norm": 1.8383768796920776, + "learning_rate": 4.956247537083282e-05, + "loss": 1.6726, + "step": 3255 + }, + { + "epoch": 0.1092129752725339, + "grad_norm": 1.8806651830673218, + "learning_rate": 4.9546693589431145e-05, + "loss": 1.6817, + "step": 3286 + }, + { + "epoch": 0.11024328636001064, + "grad_norm": 1.7535260915756226, + "learning_rate": 4.9530634801928595e-05, + "loss": 1.6875, + "step": 3317 + }, + { + "epoch": 0.11127359744748737, + "grad_norm": 1.765906810760498, + "learning_rate": 4.9514299189546395e-05, + "loss": 1.6859, + "step": 3348 + }, + { + "epoch": 0.11230390853496411, + "grad_norm": 1.869828462600708, + "learning_rate": 4.949768693662973e-05, + "loss": 1.6915, + "step": 3379 + }, + { + "epoch": 0.11333421962244083, + "grad_norm": 1.8347504138946533, + "learning_rate": 4.948079823064559e-05, + "loss": 1.6859, + "step": 3410 + }, + { + "epoch": 0.11436453070991758, + "grad_norm": 1.7692474126815796, + "learning_rate": 4.946363326218074e-05, + "loss": 1.6565, + "step": 3441 + }, + { + "epoch": 0.11539484179739432, + "grad_norm": 1.8231885433197021, + "learning_rate": 4.9446192224939525e-05, + "loss": 1.686, + "step": 3472 + }, + { + "epoch": 0.11642515288487104, + "grad_norm": 1.7155958414077759, + "learning_rate": 4.942847531574167e-05, + "loss": 1.6538, + "step": 3503 + }, + { + "epoch": 0.11745546397234778, + "grad_norm": 1.787183403968811, + "learning_rate": 4.941048273452008e-05, + "loss": 1.6776, + "step": 3534 + }, + { + "epoch": 0.11848577505982451, + "grad_norm": 1.741213083267212, + "learning_rate": 4.9392214684318605e-05, + "loss": 1.6784, + "step": 3565 + }, + { + "epoch": 0.11951608614730125, + "grad_norm": 1.7836824655532837, + "learning_rate": 4.93736713712897e-05, + "loss": 1.6557, + "step": 3596 + }, + { + "epoch": 0.12054639723477799, + "grad_norm": 1.7103859186172485, + "learning_rate": 4.9354853004692124e-05, + "loss": 1.6606, + "step": 3627 + }, + { + "epoch": 0.12157670832225471, + "grad_norm": 1.7865506410598755, + "learning_rate": 4.93357597968886e-05, + "loss": 1.6409, + "step": 3658 + }, + { + "epoch": 0.12260701940973145, + "grad_norm": 1.7770143747329712, + "learning_rate": 4.931639196334338e-05, + "loss": 1.6574, + "step": 3689 + }, + { + "epoch": 0.1236373304972082, + "grad_norm": 1.857575535774231, + "learning_rate": 4.9296749722619826e-05, + "loss": 1.6724, + "step": 3720 + }, + { + "epoch": 0.12466764158468492, + "grad_norm": 1.8742581605911255, + "learning_rate": 4.9276833296377966e-05, + "loss": 1.6506, + "step": 3751 + }, + { + "epoch": 0.12569795267216166, + "grad_norm": 1.827668309211731, + "learning_rate": 4.925664290937196e-05, + "loss": 1.6523, + "step": 3782 + }, + { + "epoch": 0.1267282637596384, + "grad_norm": 1.7517486810684204, + "learning_rate": 4.9236178789447576e-05, + "loss": 1.6459, + "step": 3813 + }, + { + "epoch": 0.12775857484711514, + "grad_norm": 1.8109570741653442, + "learning_rate": 4.921544116753962e-05, + "loss": 1.6614, + "step": 3844 + }, + { + "epoch": 0.12878888593459187, + "grad_norm": 1.692597508430481, + "learning_rate": 4.919443027766935e-05, + "loss": 1.6431, + "step": 3875 + }, + { + "epoch": 0.1298191970220686, + "grad_norm": 1.8650025129318237, + "learning_rate": 4.91731463569418e-05, + "loss": 1.6466, + "step": 3906 + }, + { + "epoch": 0.13084950810954532, + "grad_norm": 1.6794081926345825, + "learning_rate": 4.915158964554312e-05, + "loss": 1.6504, + "step": 3937 + }, + { + "epoch": 0.13187981919702207, + "grad_norm": 1.7685374021530151, + "learning_rate": 4.912976038673786e-05, + "loss": 1.6446, + "step": 3968 + }, + { + "epoch": 0.1329101302844988, + "grad_norm": 1.7601110935211182, + "learning_rate": 4.9107658826866254e-05, + "loss": 1.631, + "step": 3999 + }, + { + "epoch": 0.13394044137197553, + "grad_norm": 2.0616064071655273, + "learning_rate": 4.908528521534139e-05, + "loss": 1.6476, + "step": 4030 + }, + { + "epoch": 0.13497075245945228, + "grad_norm": 1.8973504304885864, + "learning_rate": 4.906263980464644e-05, + "loss": 1.6582, + "step": 4061 + }, + { + "epoch": 0.136001063546929, + "grad_norm": 1.7768895626068115, + "learning_rate": 4.903972285033178e-05, + "loss": 1.6159, + "step": 4092 + }, + { + "epoch": 0.13703137463440573, + "grad_norm": 1.8264424800872803, + "learning_rate": 4.901653461101213e-05, + "loss": 1.6289, + "step": 4123 + }, + { + "epoch": 0.1380616857218825, + "grad_norm": 1.7140119075775146, + "learning_rate": 4.8993075348363626e-05, + "loss": 1.6357, + "step": 4154 + }, + { + "epoch": 0.13909199680935921, + "grad_norm": 1.6964486837387085, + "learning_rate": 4.896934532712084e-05, + "loss": 1.6233, + "step": 4185 + }, + { + "epoch": 0.14012230789683594, + "grad_norm": 1.8008025884628296, + "learning_rate": 4.8945344815073846e-05, + "loss": 1.637, + "step": 4216 + }, + { + "epoch": 0.1411526189843127, + "grad_norm": 1.562730073928833, + "learning_rate": 4.892107408306516e-05, + "loss": 1.6379, + "step": 4247 + }, + { + "epoch": 0.14218293007178942, + "grad_norm": 1.8273371458053589, + "learning_rate": 4.889653340498669e-05, + "loss": 1.6246, + "step": 4278 + }, + { + "epoch": 0.14321324115926615, + "grad_norm": 56.33716583251953, + "learning_rate": 4.8871723057776664e-05, + "loss": 1.6457, + "step": 4309 + }, + { + "epoch": 0.1442435522467429, + "grad_norm": 1.746523380279541, + "learning_rate": 4.8846643321416476e-05, + "loss": 1.6343, + "step": 4340 + }, + { + "epoch": 0.14527386333421963, + "grad_norm": 1.7737531661987305, + "learning_rate": 4.882129447892753e-05, + "loss": 1.6447, + "step": 4371 + }, + { + "epoch": 0.14630417442169635, + "grad_norm": 1.660485863685608, + "learning_rate": 4.8795676816368076e-05, + "loss": 1.6192, + "step": 4402 + }, + { + "epoch": 0.14733448550917308, + "grad_norm": 1.6823406219482422, + "learning_rate": 4.876979062282995e-05, + "loss": 1.6253, + "step": 4433 + }, + { + "epoch": 0.14836479659664983, + "grad_norm": 7.78139066696167, + "learning_rate": 4.8743636190435325e-05, + "loss": 1.6234, + "step": 4464 + }, + { + "epoch": 0.14939510768412656, + "grad_norm": 1.7426058053970337, + "learning_rate": 4.871721381433344e-05, + "loss": 1.6337, + "step": 4495 + }, + { + "epoch": 0.1504254187716033, + "grad_norm": 1.6294783353805542, + "learning_rate": 4.869052379269719e-05, + "loss": 1.6217, + "step": 4526 + }, + { + "epoch": 0.15145572985908004, + "grad_norm": 1.6523306369781494, + "learning_rate": 4.866356642671985e-05, + "loss": 1.605, + "step": 4557 + }, + { + "epoch": 0.15248604094655677, + "grad_norm": 1.8571300506591797, + "learning_rate": 4.8636342020611634e-05, + "loss": 1.6218, + "step": 4588 + }, + { + "epoch": 0.1535163520340335, + "grad_norm": 1.7754936218261719, + "learning_rate": 4.860885088159626e-05, + "loss": 1.6171, + "step": 4619 + }, + { + "epoch": 0.15454666312151025, + "grad_norm": 1.91987943649292, + "learning_rate": 4.858109331990751e-05, + "loss": 1.6167, + "step": 4650 + }, + { + "epoch": 0.15557697420898697, + "grad_norm": 1.5994452238082886, + "learning_rate": 4.855306964878567e-05, + "loss": 1.5951, + "step": 4681 + }, + { + "epoch": 0.1566072852964637, + "grad_norm": 1.6490916013717651, + "learning_rate": 4.8524780184474084e-05, + "loss": 1.616, + "step": 4712 + }, + { + "epoch": 0.15763759638394045, + "grad_norm": 1.5921640396118164, + "learning_rate": 4.8496225246215496e-05, + "loss": 1.6346, + "step": 4743 + }, + { + "epoch": 0.15866790747141718, + "grad_norm": 1.6729261875152588, + "learning_rate": 4.8467405156248505e-05, + "loss": 1.6165, + "step": 4774 + }, + { + "epoch": 0.1596982185588939, + "grad_norm": 1.628113031387329, + "learning_rate": 4.843832023980392e-05, + "loss": 1.6119, + "step": 4805 + }, + { + "epoch": 0.16072852964637063, + "grad_norm": 1.651647925376892, + "learning_rate": 4.840897082510106e-05, + "loss": 1.5997, + "step": 4836 + }, + { + "epoch": 0.1617588407338474, + "grad_norm": 1.5297720432281494, + "learning_rate": 4.8379357243344084e-05, + "loss": 1.6242, + "step": 4867 + }, + { + "epoch": 0.1627891518213241, + "grad_norm": 1.5779869556427002, + "learning_rate": 4.8349479828718236e-05, + "loss": 1.6149, + "step": 4898 + }, + { + "epoch": 0.16381946290880084, + "grad_norm": 1.5843939781188965, + "learning_rate": 4.8319338918386075e-05, + "loss": 1.5926, + "step": 4929 + }, + { + "epoch": 0.1648497739962776, + "grad_norm": 2.3762106895446777, + "learning_rate": 4.828893485248369e-05, + "loss": 1.6108, + "step": 4960 + }, + { + "epoch": 0.16588008508375432, + "grad_norm": 1.5871953964233398, + "learning_rate": 4.825826797411682e-05, + "loss": 1.6103, + "step": 4991 + }, + { + "epoch": 0.16691039617123105, + "grad_norm": 1.5934125185012817, + "learning_rate": 4.822733862935702e-05, + "loss": 1.6091, + "step": 5022 + }, + { + "epoch": 0.1679407072587078, + "grad_norm": 1.6997628211975098, + "learning_rate": 4.819614716723775e-05, + "loss": 1.6098, + "step": 5053 + }, + { + "epoch": 0.16897101834618453, + "grad_norm": 1.682849645614624, + "learning_rate": 4.8164693939750425e-05, + "loss": 1.599, + "step": 5084 + }, + { + "epoch": 0.17000132943366125, + "grad_norm": 1.709743857383728, + "learning_rate": 4.813297930184042e-05, + "loss": 1.6194, + "step": 5115 + }, + { + "epoch": 0.171031640521138, + "grad_norm": 1.725879430770874, + "learning_rate": 4.810100361140314e-05, + "loss": 1.6115, + "step": 5146 + }, + { + "epoch": 0.17206195160861473, + "grad_norm": 1.6710290908813477, + "learning_rate": 4.8068767229279885e-05, + "loss": 1.6032, + "step": 5177 + }, + { + "epoch": 0.17309226269609146, + "grad_norm": 1.6156634092330933, + "learning_rate": 4.8036270519253854e-05, + "loss": 1.5973, + "step": 5208 + }, + { + "epoch": 0.1741225737835682, + "grad_norm": 1.5654059648513794, + "learning_rate": 4.8003513848046e-05, + "loss": 1.5817, + "step": 5239 + }, + { + "epoch": 0.17515288487104494, + "grad_norm": 1.5789822340011597, + "learning_rate": 4.79704975853109e-05, + "loss": 1.6138, + "step": 5270 + }, + { + "epoch": 0.17618319595852167, + "grad_norm": 1.6022037267684937, + "learning_rate": 4.793722210363262e-05, + "loss": 1.5998, + "step": 5301 + }, + { + "epoch": 0.1772135070459984, + "grad_norm": 1.5142741203308105, + "learning_rate": 4.7903687778520414e-05, + "loss": 1.6061, + "step": 5332 + }, + { + "epoch": 0.17824381813347515, + "grad_norm": 1.6454212665557861, + "learning_rate": 4.7869894988404593e-05, + "loss": 1.6063, + "step": 5363 + }, + { + "epoch": 0.17927412922095187, + "grad_norm": 1.5250823497772217, + "learning_rate": 4.783584411463221e-05, + "loss": 1.6038, + "step": 5394 + }, + { + "epoch": 0.1803044403084286, + "grad_norm": 1.5829335451126099, + "learning_rate": 4.780153554146274e-05, + "loss": 1.5949, + "step": 5425 + }, + { + "epoch": 0.18133475139590535, + "grad_norm": 1.5342432260513306, + "learning_rate": 4.7766969656063766e-05, + "loss": 1.5913, + "step": 5456 + }, + { + "epoch": 0.18236506248338208, + "grad_norm": 1.6397250890731812, + "learning_rate": 4.773214684850662e-05, + "loss": 1.6102, + "step": 5487 + }, + { + "epoch": 0.1833953735708588, + "grad_norm": 1.5228471755981445, + "learning_rate": 4.769706751176193e-05, + "loss": 1.5885, + "step": 5518 + }, + { + "epoch": 0.18442568465833556, + "grad_norm": 1.6186103820800781, + "learning_rate": 4.7661732041695264e-05, + "loss": 1.6086, + "step": 5549 + }, + { + "epoch": 0.18545599574581229, + "grad_norm": 1.6024582386016846, + "learning_rate": 4.762614083706258e-05, + "loss": 1.6004, + "step": 5580 + }, + { + "epoch": 0.186486306833289, + "grad_norm": 1.5443711280822754, + "learning_rate": 4.759029429950581e-05, + "loss": 1.6048, + "step": 5611 + }, + { + "epoch": 0.18751661792076577, + "grad_norm": 1.4831629991531372, + "learning_rate": 4.7554192833548235e-05, + "loss": 1.5841, + "step": 5642 + }, + { + "epoch": 0.1885469290082425, + "grad_norm": 1.6426068544387817, + "learning_rate": 4.751783684659e-05, + "loss": 1.587, + "step": 5673 + }, + { + "epoch": 0.18957724009571922, + "grad_norm": 1.4609078168869019, + "learning_rate": 4.748122674890348e-05, + "loss": 1.5945, + "step": 5704 + }, + { + "epoch": 0.19060755118319597, + "grad_norm": 1.5365614891052246, + "learning_rate": 4.7444362953628654e-05, + "loss": 1.5737, + "step": 5735 + }, + { + "epoch": 0.1916378622706727, + "grad_norm": 1.5755670070648193, + "learning_rate": 4.7407245876768424e-05, + "loss": 1.5862, + "step": 5766 + }, + { + "epoch": 0.19266817335814942, + "grad_norm": 1.6469846963882446, + "learning_rate": 4.736987593718397e-05, + "loss": 1.5663, + "step": 5797 + }, + { + "epoch": 0.19369848444562615, + "grad_norm": 1.5927278995513916, + "learning_rate": 4.733225355658999e-05, + "loss": 1.5776, + "step": 5828 + }, + { + "epoch": 0.1947287955331029, + "grad_norm": 1.5593287944793701, + "learning_rate": 4.7294379159549926e-05, + "loss": 1.579, + "step": 5859 + }, + { + "epoch": 0.19575910662057963, + "grad_norm": 1.534055233001709, + "learning_rate": 4.725625317347119e-05, + "loss": 1.6017, + "step": 5890 + }, + { + "epoch": 0.19678941770805636, + "grad_norm": 1.5846387147903442, + "learning_rate": 4.7217876028600374e-05, + "loss": 1.5739, + "step": 5921 + }, + { + "epoch": 0.1978197287955331, + "grad_norm": 1.5377682447433472, + "learning_rate": 4.717924815801832e-05, + "loss": 1.57, + "step": 5952 + }, + { + "epoch": 0.19885003988300984, + "grad_norm": 1.467956781387329, + "learning_rate": 4.714036999763532e-05, + "loss": 1.5736, + "step": 5983 + }, + { + "epoch": 0.19988035097048656, + "grad_norm": 1.601070523262024, + "learning_rate": 4.7101241986186116e-05, + "loss": 1.5861, + "step": 6014 + }, + { + "epoch": 0.20091066205796332, + "grad_norm": 1.5051921606063843, + "learning_rate": 4.7061864565225e-05, + "loss": 1.5735, + "step": 6045 + }, + { + "epoch": 0.20194097314544004, + "grad_norm": 1.462843418121338, + "learning_rate": 4.702223817912081e-05, + "loss": 1.582, + "step": 6076 + }, + { + "epoch": 0.20297128423291677, + "grad_norm": 1.5698682069778442, + "learning_rate": 4.698236327505195e-05, + "loss": 1.5647, + "step": 6107 + }, + { + "epoch": 0.20400159532039353, + "grad_norm": 1.5633916854858398, + "learning_rate": 4.694224030300127e-05, + "loss": 1.5741, + "step": 6138 + }, + { + "epoch": 0.20503190640787025, + "grad_norm": 1.6174733638763428, + "learning_rate": 4.690186971575107e-05, + "loss": 1.5634, + "step": 6169 + }, + { + "epoch": 0.20606221749534698, + "grad_norm": 1.4957518577575684, + "learning_rate": 4.6861251968877916e-05, + "loss": 1.575, + "step": 6200 + }, + { + "epoch": 0.2070925285828237, + "grad_norm": 1.670933485031128, + "learning_rate": 4.68203875207476e-05, + "loss": 1.5792, + "step": 6231 + }, + { + "epoch": 0.20812283967030046, + "grad_norm": 1.5676430463790894, + "learning_rate": 4.677927683250983e-05, + "loss": 1.5689, + "step": 6262 + }, + { + "epoch": 0.20915315075777718, + "grad_norm": 1.5753976106643677, + "learning_rate": 4.6737920368093156e-05, + "loss": 1.5594, + "step": 6293 + }, + { + "epoch": 0.2101834618452539, + "grad_norm": 1.4973617792129517, + "learning_rate": 4.669631859419965e-05, + "loss": 1.5593, + "step": 6324 + }, + { + "epoch": 0.21121377293273066, + "grad_norm": 1.4691433906555176, + "learning_rate": 4.6654471980299676e-05, + "loss": 1.5711, + "step": 6355 + }, + { + "epoch": 0.2122440840202074, + "grad_norm": 1.407630443572998, + "learning_rate": 4.661238099862658e-05, + "loss": 1.5787, + "step": 6386 + }, + { + "epoch": 0.21327439510768412, + "grad_norm": 1.5011677742004395, + "learning_rate": 4.657004612417138e-05, + "loss": 1.5751, + "step": 6417 + }, + { + "epoch": 0.21430470619516087, + "grad_norm": 1.509750485420227, + "learning_rate": 4.6527467834677374e-05, + "loss": 1.5583, + "step": 6448 + }, + { + "epoch": 0.2153350172826376, + "grad_norm": 1.3919882774353027, + "learning_rate": 4.648464661063478e-05, + "loss": 1.5712, + "step": 6479 + }, + { + "epoch": 0.21636532837011432, + "grad_norm": 1.4854936599731445, + "learning_rate": 4.6441582935275264e-05, + "loss": 1.5637, + "step": 6510 + }, + { + "epoch": 0.21739563945759108, + "grad_norm": 1.4413583278656006, + "learning_rate": 4.6398277294566586e-05, + "loss": 1.56, + "step": 6541 + }, + { + "epoch": 0.2184259505450678, + "grad_norm": 1.5063883066177368, + "learning_rate": 4.6354730177207e-05, + "loss": 1.5525, + "step": 6572 + }, + { + "epoch": 0.21945626163254453, + "grad_norm": 1.4899688959121704, + "learning_rate": 4.6310942074619787e-05, + "loss": 1.5817, + "step": 6603 + }, + { + "epoch": 0.22048657272002128, + "grad_norm": 1.3927967548370361, + "learning_rate": 4.626691348094777e-05, + "loss": 1.5407, + "step": 6634 + }, + { + "epoch": 0.221516883807498, + "grad_norm": 1.5378398895263672, + "learning_rate": 4.622264489304762e-05, + "loss": 1.5561, + "step": 6665 + }, + { + "epoch": 0.22254719489497474, + "grad_norm": 1.554624319076538, + "learning_rate": 4.617813681048434e-05, + "loss": 1.5859, + "step": 6696 + }, + { + "epoch": 0.22357750598245146, + "grad_norm": 1.5356658697128296, + "learning_rate": 4.61333897355256e-05, + "loss": 1.5531, + "step": 6727 + }, + { + "epoch": 0.22460781706992822, + "grad_norm": 1.5534918308258057, + "learning_rate": 4.608840417313604e-05, + "loss": 1.5774, + "step": 6758 + }, + { + "epoch": 0.22563812815740494, + "grad_norm": 1.5660988092422485, + "learning_rate": 4.6043180630971646e-05, + "loss": 1.5763, + "step": 6789 + }, + { + "epoch": 0.22666843924488167, + "grad_norm": 1.4993386268615723, + "learning_rate": 4.599771961937391e-05, + "loss": 1.5615, + "step": 6820 + }, + { + "epoch": 0.22769875033235842, + "grad_norm": 1.4630553722381592, + "learning_rate": 4.5952021651364204e-05, + "loss": 1.543, + "step": 6851 + }, + { + "epoch": 0.22872906141983515, + "grad_norm": 1.470173954963684, + "learning_rate": 4.590608724263786e-05, + "loss": 1.5674, + "step": 6882 + }, + { + "epoch": 0.22975937250731188, + "grad_norm": 1.5867971181869507, + "learning_rate": 4.585991691155845e-05, + "loss": 1.5702, + "step": 6913 + }, + { + "epoch": 0.23078968359478863, + "grad_norm": 1.44207763671875, + "learning_rate": 4.581351117915188e-05, + "loss": 1.5436, + "step": 6944 + }, + { + "epoch": 0.23181999468226536, + "grad_norm": 1.4691039323806763, + "learning_rate": 4.5766870569100534e-05, + "loss": 1.5465, + "step": 6975 + }, + { + "epoch": 0.23285030576974208, + "grad_norm": 1.4807918071746826, + "learning_rate": 4.571999560773736e-05, + "loss": 1.5564, + "step": 7006 + }, + { + "epoch": 0.23388061685721884, + "grad_norm": 1.481487512588501, + "learning_rate": 4.5672886824039915e-05, + "loss": 1.5466, + "step": 7037 + }, + { + "epoch": 0.23491092794469556, + "grad_norm": 1.4518013000488281, + "learning_rate": 4.5625544749624435e-05, + "loss": 1.5618, + "step": 7068 + }, + { + "epoch": 0.2359412390321723, + "grad_norm": 1.4186676740646362, + "learning_rate": 4.5577969918739794e-05, + "loss": 1.5528, + "step": 7099 + }, + { + "epoch": 0.23697155011964902, + "grad_norm": 1.5287110805511475, + "learning_rate": 4.5530162868261486e-05, + "loss": 1.5457, + "step": 7130 + }, + { + "epoch": 0.23800186120712577, + "grad_norm": 1.5516417026519775, + "learning_rate": 4.548212413768558e-05, + "loss": 1.5467, + "step": 7161 + }, + { + "epoch": 0.2390321722946025, + "grad_norm": 1.4710053205490112, + "learning_rate": 4.543385426912261e-05, + "loss": 1.5431, + "step": 7192 + }, + { + "epoch": 0.24006248338207922, + "grad_norm": 1.5005567073822021, + "learning_rate": 4.53853538072915e-05, + "loss": 1.5592, + "step": 7223 + }, + { + "epoch": 0.24109279446955598, + "grad_norm": 1.5864965915679932, + "learning_rate": 4.533662329951336e-05, + "loss": 1.5694, + "step": 7254 + }, + { + "epoch": 0.2421231055570327, + "grad_norm": 1.4661896228790283, + "learning_rate": 4.528766329570536e-05, + "loss": 1.545, + "step": 7285 + }, + { + "epoch": 0.24315341664450943, + "grad_norm": 1.5157560110092163, + "learning_rate": 4.523847434837447e-05, + "loss": 1.5458, + "step": 7316 + }, + { + "epoch": 0.24418372773198618, + "grad_norm": 1.4033585786819458, + "learning_rate": 4.518905701261128e-05, + "loss": 1.5464, + "step": 7347 + }, + { + "epoch": 0.2452140388194629, + "grad_norm": 1.5357593297958374, + "learning_rate": 4.5139411846083715e-05, + "loss": 1.5497, + "step": 7378 + }, + { + "epoch": 0.24624434990693964, + "grad_norm": 1.419507384300232, + "learning_rate": 4.508953940903073e-05, + "loss": 1.5414, + "step": 7409 + }, + { + "epoch": 0.2472746609944164, + "grad_norm": 1.5201773643493652, + "learning_rate": 4.5039440264255994e-05, + "loss": 1.5503, + "step": 7440 + }, + { + "epoch": 0.24830497208189312, + "grad_norm": 1.8000444173812866, + "learning_rate": 4.498911497712155e-05, + "loss": 1.5448, + "step": 7471 + }, + { + "epoch": 0.24933528316936984, + "grad_norm": 1.4876810312271118, + "learning_rate": 4.493856411554142e-05, + "loss": 1.5524, + "step": 7502 + }, + { + "epoch": 0.25036559425684657, + "grad_norm": 1.5130078792572021, + "learning_rate": 4.4887788249975206e-05, + "loss": 1.5454, + "step": 7533 + }, + { + "epoch": 0.2513959053443233, + "grad_norm": 1.4829351902008057, + "learning_rate": 4.4836787953421656e-05, + "loss": 1.5407, + "step": 7564 + }, + { + "epoch": 0.2524262164318001, + "grad_norm": 1.521550178527832, + "learning_rate": 4.478556380141218e-05, + "loss": 1.5727, + "step": 7595 + }, + { + "epoch": 0.2534565275192768, + "grad_norm": 1.4377928972244263, + "learning_rate": 4.4734116372004375e-05, + "loss": 1.5432, + "step": 7626 + }, + { + "epoch": 0.25448683860675353, + "grad_norm": 1.4101744890213013, + "learning_rate": 4.4682446245775477e-05, + "loss": 1.547, + "step": 7657 + }, + { + "epoch": 0.2555171496942303, + "grad_norm": 1.522524356842041, + "learning_rate": 4.463055400581586e-05, + "loss": 1.5418, + "step": 7688 + }, + { + "epoch": 0.256547460781707, + "grad_norm": 1.4160797595977783, + "learning_rate": 4.4578440237722374e-05, + "loss": 1.5457, + "step": 7719 + }, + { + "epoch": 0.25757777186918374, + "grad_norm": 1.4106636047363281, + "learning_rate": 4.452610552959183e-05, + "loss": 1.5405, + "step": 7750 + }, + { + "epoch": 0.2586080829566605, + "grad_norm": 1.422723650932312, + "learning_rate": 4.447355047201428e-05, + "loss": 1.5423, + "step": 7781 + }, + { + "epoch": 0.2596383940441372, + "grad_norm": 1.4362592697143555, + "learning_rate": 4.4420775658066414e-05, + "loss": 1.5372, + "step": 7812 + }, + { + "epoch": 0.26066870513161394, + "grad_norm": 1.4319696426391602, + "learning_rate": 4.436778168330484e-05, + "loss": 1.5451, + "step": 7843 + }, + { + "epoch": 0.26169901621909064, + "grad_norm": 1.4069257974624634, + "learning_rate": 4.4314569145759353e-05, + "loss": 1.5221, + "step": 7874 + }, + { + "epoch": 0.2627293273065674, + "grad_norm": 1.4424949884414673, + "learning_rate": 4.42611386459262e-05, + "loss": 1.5419, + "step": 7905 + }, + { + "epoch": 0.26375963839404415, + "grad_norm": 1.4579105377197266, + "learning_rate": 4.420749078676133e-05, + "loss": 1.5116, + "step": 7936 + }, + { + "epoch": 0.26478994948152085, + "grad_norm": 1.4563167095184326, + "learning_rate": 4.4153626173673516e-05, + "loss": 1.5296, + "step": 7967 + }, + { + "epoch": 0.2658202605689976, + "grad_norm": 1.4440968036651611, + "learning_rate": 4.409954541451762e-05, + "loss": 1.5548, + "step": 7998 + }, + { + "epoch": 0.26685057165647436, + "grad_norm": 1.5711034536361694, + "learning_rate": 4.404524911958764e-05, + "loss": 1.535, + "step": 8029 + }, + { + "epoch": 0.26788088274395105, + "grad_norm": 1.5221564769744873, + "learning_rate": 4.399073790160989e-05, + "loss": 1.5495, + "step": 8060 + }, + { + "epoch": 0.2689111938314278, + "grad_norm": 1.392699956893921, + "learning_rate": 4.393601237573607e-05, + "loss": 1.546, + "step": 8091 + }, + { + "epoch": 0.26994150491890456, + "grad_norm": 1.5343137979507446, + "learning_rate": 4.388107315953628e-05, + "loss": 1.549, + "step": 8122 + }, + { + "epoch": 0.27097181600638126, + "grad_norm": 1.4483468532562256, + "learning_rate": 4.382592087299212e-05, + "loss": 1.5424, + "step": 8153 + }, + { + "epoch": 0.272002127093858, + "grad_norm": 1.4963489770889282, + "learning_rate": 4.377055613848964e-05, + "loss": 1.508, + "step": 8184 + }, + { + "epoch": 0.27303243818133477, + "grad_norm": 1.4839162826538086, + "learning_rate": 4.3714979580812355e-05, + "loss": 1.5203, + "step": 8215 + }, + { + "epoch": 0.27406274926881147, + "grad_norm": 1.4272018671035767, + "learning_rate": 4.365919182713416e-05, + "loss": 1.5264, + "step": 8246 + }, + { + "epoch": 0.2750930603562882, + "grad_norm": 1.3808270692825317, + "learning_rate": 4.360319350701226e-05, + "loss": 1.5255, + "step": 8277 + }, + { + "epoch": 0.276123371443765, + "grad_norm": 1.4179162979125977, + "learning_rate": 4.3546985252380115e-05, + "loss": 1.535, + "step": 8308 + }, + { + "epoch": 0.2771536825312417, + "grad_norm": 1.3617374897003174, + "learning_rate": 4.349056769754021e-05, + "loss": 1.5295, + "step": 8339 + }, + { + "epoch": 0.27818399361871843, + "grad_norm": 1.4745615720748901, + "learning_rate": 4.3433941479156994e-05, + "loss": 1.5438, + "step": 8370 + }, + { + "epoch": 0.2792143047061952, + "grad_norm": 1.3661375045776367, + "learning_rate": 4.3377107236249647e-05, + "loss": 1.5134, + "step": 8401 + }, + { + "epoch": 0.2802446157936719, + "grad_norm": 1.3907949924468994, + "learning_rate": 4.332006561018488e-05, + "loss": 1.5237, + "step": 8432 + }, + { + "epoch": 0.28127492688114863, + "grad_norm": 1.3575704097747803, + "learning_rate": 4.3262817244669683e-05, + "loss": 1.5226, + "step": 8463 + }, + { + "epoch": 0.2823052379686254, + "grad_norm": 1.3836462497711182, + "learning_rate": 4.3205362785744083e-05, + "loss": 1.5433, + "step": 8494 + }, + { + "epoch": 0.2833355490561021, + "grad_norm": 1.6108276844024658, + "learning_rate": 4.314770288177384e-05, + "loss": 1.5324, + "step": 8525 + }, + { + "epoch": 0.28436586014357884, + "grad_norm": 1.4650689363479614, + "learning_rate": 4.308983818344313e-05, + "loss": 1.535, + "step": 8556 + }, + { + "epoch": 0.2853961712310556, + "grad_norm": 1.5836583375930786, + "learning_rate": 4.3031769343747206e-05, + "loss": 1.5313, + "step": 8587 + }, + { + "epoch": 0.2864264823185323, + "grad_norm": 1.5348492860794067, + "learning_rate": 4.297349701798505e-05, + "loss": 1.5106, + "step": 8618 + }, + { + "epoch": 0.28745679340600905, + "grad_norm": 1.4060319662094116, + "learning_rate": 4.2915021863751916e-05, + "loss": 1.5283, + "step": 8649 + }, + { + "epoch": 0.2884871044934858, + "grad_norm": 1.531657099723816, + "learning_rate": 4.285634454093198e-05, + "loss": 1.5087, + "step": 8680 + }, + { + "epoch": 0.2895174155809625, + "grad_norm": 1.4756299257278442, + "learning_rate": 4.279746571169086e-05, + "loss": 1.5042, + "step": 8711 + }, + { + "epoch": 0.29054772666843925, + "grad_norm": 1.3221153020858765, + "learning_rate": 4.2738386040468136e-05, + "loss": 1.5244, + "step": 8742 + }, + { + "epoch": 0.29157803775591595, + "grad_norm": 1.4067268371582031, + "learning_rate": 4.2679106193969866e-05, + "loss": 1.5012, + "step": 8773 + }, + { + "epoch": 0.2926083488433927, + "grad_norm": 1.5192064046859741, + "learning_rate": 4.261962684116106e-05, + "loss": 1.521, + "step": 8804 + }, + { + "epoch": 0.29363865993086946, + "grad_norm": 1.3847788572311401, + "learning_rate": 4.2559948653258145e-05, + "loss": 1.5128, + "step": 8835 + }, + { + "epoch": 0.29466897101834616, + "grad_norm": 1.4612780809402466, + "learning_rate": 4.250007230372134e-05, + "loss": 1.5371, + "step": 8866 + }, + { + "epoch": 0.2956992821058229, + "grad_norm": 1.468971610069275, + "learning_rate": 4.2439998468247126e-05, + "loss": 1.5199, + "step": 8897 + }, + { + "epoch": 0.29672959319329967, + "grad_norm": 1.386236310005188, + "learning_rate": 4.2379727824760566e-05, + "loss": 1.5273, + "step": 8928 + }, + { + "epoch": 0.29775990428077637, + "grad_norm": 1.3843929767608643, + "learning_rate": 4.231926105340768e-05, + "loss": 1.5011, + "step": 8959 + }, + { + "epoch": 0.2987902153682531, + "grad_norm": 1.4554557800292969, + "learning_rate": 4.225859883654776e-05, + "loss": 1.5311, + "step": 8990 + }, + { + "epoch": 0.2998205264557299, + "grad_norm": 1.3674421310424805, + "learning_rate": 4.219774185874569e-05, + "loss": 1.5302, + "step": 9021 + }, + { + "epoch": 0.3008508375432066, + "grad_norm": 1.3804330825805664, + "learning_rate": 4.213669080676418e-05, + "loss": 1.538, + "step": 9052 + }, + { + "epoch": 0.3018811486306833, + "grad_norm": 1.4643255472183228, + "learning_rate": 4.2075446369556056e-05, + "loss": 1.5172, + "step": 9083 + }, + { + "epoch": 0.3029114597181601, + "grad_norm": 1.3375928401947021, + "learning_rate": 4.201400923825648e-05, + "loss": 1.5123, + "step": 9114 + }, + { + "epoch": 0.3039417708056368, + "grad_norm": 1.4321980476379395, + "learning_rate": 4.195238010617511e-05, + "loss": 1.5196, + "step": 9145 + }, + { + "epoch": 0.30497208189311353, + "grad_norm": 1.4312376976013184, + "learning_rate": 4.1890559668788344e-05, + "loss": 1.5138, + "step": 9176 + }, + { + "epoch": 0.3060023929805903, + "grad_norm": 1.3089646100997925, + "learning_rate": 4.1828548623731405e-05, + "loss": 1.5027, + "step": 9207 + }, + { + "epoch": 0.307032704068067, + "grad_norm": 1.4863250255584717, + "learning_rate": 4.1766347670790506e-05, + "loss": 1.5091, + "step": 9238 + }, + { + "epoch": 0.30806301515554374, + "grad_norm": 1.373666763305664, + "learning_rate": 4.170395751189495e-05, + "loss": 1.5256, + "step": 9269 + }, + { + "epoch": 0.3090933262430205, + "grad_norm": 1.4160584211349487, + "learning_rate": 4.164137885110921e-05, + "loss": 1.4938, + "step": 9300 + }, + { + "epoch": 0.3101236373304972, + "grad_norm": 2.112110137939453, + "learning_rate": 4.157861239462495e-05, + "loss": 1.5106, + "step": 9331 + }, + { + "epoch": 0.31115394841797395, + "grad_norm": 1.337058663368225, + "learning_rate": 4.1515658850753114e-05, + "loss": 1.4999, + "step": 9362 + }, + { + "epoch": 0.3121842595054507, + "grad_norm": 1.3625296354293823, + "learning_rate": 4.145251892991588e-05, + "loss": 1.5136, + "step": 9393 + }, + { + "epoch": 0.3132145705929274, + "grad_norm": 1.399491548538208, + "learning_rate": 4.138919334463868e-05, + "loss": 1.499, + "step": 9424 + }, + { + "epoch": 0.31424488168040415, + "grad_norm": 1.4202344417572021, + "learning_rate": 4.1325682809542124e-05, + "loss": 1.5049, + "step": 9455 + }, + { + "epoch": 0.3152751927678809, + "grad_norm": 1.392248272895813, + "learning_rate": 4.126198804133398e-05, + "loss": 1.5287, + "step": 9486 + }, + { + "epoch": 0.3163055038553576, + "grad_norm": 1.3807618618011475, + "learning_rate": 4.1198109758801055e-05, + "loss": 1.5309, + "step": 9517 + }, + { + "epoch": 0.31733581494283436, + "grad_norm": 1.3117905855178833, + "learning_rate": 4.113404868280107e-05, + "loss": 1.4933, + "step": 9548 + }, + { + "epoch": 0.3183661260303111, + "grad_norm": 1.452086091041565, + "learning_rate": 4.106980553625457e-05, + "loss": 1.5221, + "step": 9579 + }, + { + "epoch": 0.3193964371177878, + "grad_norm": 1.477364182472229, + "learning_rate": 4.100538104413674e-05, + "loss": 1.4904, + "step": 9610 + }, + { + "epoch": 0.32042674820526457, + "grad_norm": 1.3584345579147339, + "learning_rate": 4.09407759334692e-05, + "loss": 1.4953, + "step": 9641 + }, + { + "epoch": 0.32145705929274127, + "grad_norm": 1.3619811534881592, + "learning_rate": 4.087599093331186e-05, + "loss": 1.4956, + "step": 9672 + }, + { + "epoch": 0.322487370380218, + "grad_norm": 1.4507052898406982, + "learning_rate": 4.081102677475462e-05, + "loss": 1.5197, + "step": 9703 + }, + { + "epoch": 0.3235176814676948, + "grad_norm": 1.4229698181152344, + "learning_rate": 4.0745884190909194e-05, + "loss": 1.498, + "step": 9734 + }, + { + "epoch": 0.32454799255517147, + "grad_norm": 1.3074679374694824, + "learning_rate": 4.0680563916900796e-05, + "loss": 1.5146, + "step": 9765 + }, + { + "epoch": 0.3255783036426482, + "grad_norm": 1.397815465927124, + "learning_rate": 4.0615066689859815e-05, + "loss": 1.5291, + "step": 9796 + }, + { + "epoch": 0.326608614730125, + "grad_norm": 1.3196336030960083, + "learning_rate": 4.0549393248913584e-05, + "loss": 1.5077, + "step": 9827 + }, + { + "epoch": 0.3276389258176017, + "grad_norm": 1.3129957914352417, + "learning_rate": 4.048354433517794e-05, + "loss": 1.4965, + "step": 9858 + }, + { + "epoch": 0.32866923690507843, + "grad_norm": 1.4380089044570923, + "learning_rate": 4.0417520691748916e-05, + "loss": 1.5115, + "step": 9889 + }, + { + "epoch": 0.3296995479925552, + "grad_norm": 1.3162370920181274, + "learning_rate": 4.035132306369438e-05, + "loss": 1.5029, + "step": 9920 + }, + { + "epoch": 0.3307298590800319, + "grad_norm": 1.3739668130874634, + "learning_rate": 4.028495219804555e-05, + "loss": 1.5083, + "step": 9951 + }, + { + "epoch": 0.33176017016750864, + "grad_norm": 1.3673723936080933, + "learning_rate": 4.021840884378864e-05, + "loss": 1.5223, + "step": 9982 + }, + { + "epoch": 0.3327904812549854, + "grad_norm": 1.3970317840576172, + "learning_rate": 4.015169375185633e-05, + "loss": 1.5003, + "step": 10013 + }, + { + "epoch": 0.3338207923424621, + "grad_norm": 1.2982394695281982, + "learning_rate": 4.0084807675119396e-05, + "loss": 1.5066, + "step": 10044 + }, + { + "epoch": 0.33485110342993885, + "grad_norm": 1.4548689126968384, + "learning_rate": 4.0017751368378106e-05, + "loss": 1.4993, + "step": 10075 + }, + { + "epoch": 0.3358814145174156, + "grad_norm": 1.3693586587905884, + "learning_rate": 3.995052558835377e-05, + "loss": 1.4987, + "step": 10106 + }, + { + "epoch": 0.3369117256048923, + "grad_norm": 1.4046767950057983, + "learning_rate": 3.988313109368017e-05, + "loss": 1.5098, + "step": 10137 + }, + { + "epoch": 0.33794203669236905, + "grad_norm": 1.3772069215774536, + "learning_rate": 3.981556864489504e-05, + "loss": 1.5165, + "step": 10168 + }, + { + "epoch": 0.3389723477798458, + "grad_norm": 1.471211314201355, + "learning_rate": 3.974783900443142e-05, + "loss": 1.5037, + "step": 10199 + }, + { + "epoch": 0.3400026588673225, + "grad_norm": 1.3990979194641113, + "learning_rate": 3.9679942936609095e-05, + "loss": 1.5096, + "step": 10230 + }, + { + "epoch": 0.34103296995479926, + "grad_norm": 1.3779234886169434, + "learning_rate": 3.961188120762596e-05, + "loss": 1.4914, + "step": 10261 + }, + { + "epoch": 0.342063281042276, + "grad_norm": 1.2866768836975098, + "learning_rate": 3.954365458554938e-05, + "loss": 1.5026, + "step": 10292 + }, + { + "epoch": 0.3430935921297527, + "grad_norm": 1.353468894958496, + "learning_rate": 3.947526384030751e-05, + "loss": 1.5063, + "step": 10323 + }, + { + "epoch": 0.34412390321722947, + "grad_norm": 1.3264256715774536, + "learning_rate": 3.9406709743680624e-05, + "loss": 1.4911, + "step": 10354 + }, + { + "epoch": 0.3451542143047062, + "grad_norm": 1.3496876955032349, + "learning_rate": 3.9337993069292366e-05, + "loss": 1.4921, + "step": 10385 + }, + { + "epoch": 0.3461845253921829, + "grad_norm": 1.3812434673309326, + "learning_rate": 3.926911459260109e-05, + "loss": 1.4826, + "step": 10416 + }, + { + "epoch": 0.34721483647965967, + "grad_norm": 1.4926965236663818, + "learning_rate": 3.920007509089102e-05, + "loss": 1.4994, + "step": 10447 + }, + { + "epoch": 0.3482451475671364, + "grad_norm": 1.3446170091629028, + "learning_rate": 3.913087534326357e-05, + "loss": 1.5114, + "step": 10478 + }, + { + "epoch": 0.3492754586546131, + "grad_norm": 1.3100495338439941, + "learning_rate": 3.9061516130628475e-05, + "loss": 1.5066, + "step": 10509 + }, + { + "epoch": 0.3503057697420899, + "grad_norm": 1.395874261856079, + "learning_rate": 3.8991998235695025e-05, + "loss": 1.4999, + "step": 10540 + }, + { + "epoch": 0.3513360808295666, + "grad_norm": 1.3682137727737427, + "learning_rate": 3.8922322442963224e-05, + "loss": 1.4778, + "step": 10571 + }, + { + "epoch": 0.35236639191704333, + "grad_norm": 1.4196573495864868, + "learning_rate": 3.885248953871491e-05, + "loss": 1.4909, + "step": 10602 + }, + { + "epoch": 0.3533967030045201, + "grad_norm": 1.4299864768981934, + "learning_rate": 3.8782500311004915e-05, + "loss": 1.5025, + "step": 10633 + }, + { + "epoch": 0.3544270140919968, + "grad_norm": 1.39677095413208, + "learning_rate": 3.871235554965218e-05, + "loss": 1.4932, + "step": 10664 + }, + { + "epoch": 0.35545732517947354, + "grad_norm": 1.3219736814498901, + "learning_rate": 3.864205604623078e-05, + "loss": 1.4795, + "step": 10695 + }, + { + "epoch": 0.3564876362669503, + "grad_norm": 1.3649324178695679, + "learning_rate": 3.857160259406107e-05, + "loss": 1.4838, + "step": 10726 + }, + { + "epoch": 0.357517947354427, + "grad_norm": 1.4109989404678345, + "learning_rate": 3.8500995988200674e-05, + "loss": 1.5058, + "step": 10757 + }, + { + "epoch": 0.35854825844190374, + "grad_norm": 1.3625038862228394, + "learning_rate": 3.843023702543556e-05, + "loss": 1.4912, + "step": 10788 + }, + { + "epoch": 0.3595785695293805, + "grad_norm": 1.4725775718688965, + "learning_rate": 3.8359326504270984e-05, + "loss": 1.5012, + "step": 10819 + }, + { + "epoch": 0.3606088806168572, + "grad_norm": 1.4126085042953491, + "learning_rate": 3.828826522492255e-05, + "loss": 1.4977, + "step": 10850 + }, + { + "epoch": 0.36163919170433395, + "grad_norm": 1.3949086666107178, + "learning_rate": 3.821705398930713e-05, + "loss": 1.4903, + "step": 10881 + }, + { + "epoch": 0.3626695027918107, + "grad_norm": 1.286792516708374, + "learning_rate": 3.814569360103385e-05, + "loss": 1.5067, + "step": 10912 + }, + { + "epoch": 0.3636998138792874, + "grad_norm": 1.274703025817871, + "learning_rate": 3.807418486539499e-05, + "loss": 1.4583, + "step": 10943 + }, + { + "epoch": 0.36473012496676416, + "grad_norm": 1.401455283164978, + "learning_rate": 3.80025285893569e-05, + "loss": 1.4834, + "step": 10974 + }, + { + "epoch": 0.3657604360542409, + "grad_norm": 1.308361530303955, + "learning_rate": 3.793072558155093e-05, + "loss": 1.4832, + "step": 11005 + }, + { + "epoch": 0.3667907471417176, + "grad_norm": 1.654733419418335, + "learning_rate": 3.785877665226426e-05, + "loss": 1.4867, + "step": 11036 + }, + { + "epoch": 0.36782105822919436, + "grad_norm": 1.3530856370925903, + "learning_rate": 3.778668261343079e-05, + "loss": 1.4873, + "step": 11067 + }, + { + "epoch": 0.3688513693166711, + "grad_norm": 1.3567407131195068, + "learning_rate": 3.771444427862192e-05, + "loss": 1.4935, + "step": 11098 + }, + { + "epoch": 0.3698816804041478, + "grad_norm": 1.3184572458267212, + "learning_rate": 3.7642062463037465e-05, + "loss": 1.4891, + "step": 11129 + }, + { + "epoch": 0.37091199149162457, + "grad_norm": 1.366489291191101, + "learning_rate": 3.7569537983496373e-05, + "loss": 1.5159, + "step": 11160 + }, + { + "epoch": 0.3719423025791013, + "grad_norm": 1.423258662223816, + "learning_rate": 3.749687165842753e-05, + "loss": 1.4938, + "step": 11191 + }, + { + "epoch": 0.372972613666578, + "grad_norm": 1.3226194381713867, + "learning_rate": 3.7424064307860536e-05, + "loss": 1.499, + "step": 11222 + }, + { + "epoch": 0.3740029247540548, + "grad_norm": 1.350500464439392, + "learning_rate": 3.735111675341645e-05, + "loss": 1.4952, + "step": 11253 + }, + { + "epoch": 0.37503323584153153, + "grad_norm": 1.3667839765548706, + "learning_rate": 3.7278029818298524e-05, + "loss": 1.4763, + "step": 11284 + }, + { + "epoch": 0.37606354692900823, + "grad_norm": 1.4876132011413574, + "learning_rate": 3.720480432728287e-05, + "loss": 1.4913, + "step": 11315 + }, + { + "epoch": 0.377093858016485, + "grad_norm": 1.3927743434906006, + "learning_rate": 3.71314411067092e-05, + "loss": 1.4948, + "step": 11346 + }, + { + "epoch": 0.37812416910396174, + "grad_norm": 1.3752413988113403, + "learning_rate": 3.70579409844715e-05, + "loss": 1.4763, + "step": 11377 + }, + { + "epoch": 0.37915448019143844, + "grad_norm": 1.3530951738357544, + "learning_rate": 3.698430479000865e-05, + "loss": 1.5077, + "step": 11408 + }, + { + "epoch": 0.3801847912789152, + "grad_norm": 1.4309345483779907, + "learning_rate": 3.691053335429509e-05, + "loss": 1.4945, + "step": 11439 + }, + { + "epoch": 0.38121510236639194, + "grad_norm": 1.2874380350112915, + "learning_rate": 3.683662750983147e-05, + "loss": 1.4698, + "step": 11470 + }, + { + "epoch": 0.38224541345386864, + "grad_norm": 1.3356250524520874, + "learning_rate": 3.676258809063518e-05, + "loss": 1.4924, + "step": 11501 + }, + { + "epoch": 0.3832757245413454, + "grad_norm": 1.304559588432312, + "learning_rate": 3.6688415932231004e-05, + "loss": 1.4682, + "step": 11532 + }, + { + "epoch": 0.3843060356288221, + "grad_norm": 1.4153447151184082, + "learning_rate": 3.661411187164166e-05, + "loss": 1.4989, + "step": 11563 + }, + { + "epoch": 0.38533634671629885, + "grad_norm": 1.356992244720459, + "learning_rate": 3.65396767473784e-05, + "loss": 1.4854, + "step": 11594 + }, + { + "epoch": 0.3863666578037756, + "grad_norm": 1.322449803352356, + "learning_rate": 3.6465111399431465e-05, + "loss": 1.4877, + "step": 11625 + }, + { + "epoch": 0.3873969688912523, + "grad_norm": 1.3981350660324097, + "learning_rate": 3.6390416669260674e-05, + "loss": 1.499, + "step": 11656 + }, + { + "epoch": 0.38842727997872906, + "grad_norm": 1.324871301651001, + "learning_rate": 3.63155933997859e-05, + "loss": 1.4814, + "step": 11687 + }, + { + "epoch": 0.3894575910662058, + "grad_norm": 1.3940790891647339, + "learning_rate": 3.624064243537758e-05, + "loss": 1.4754, + "step": 11718 + }, + { + "epoch": 0.3904879021536825, + "grad_norm": 1.2880780696868896, + "learning_rate": 3.616556462184716e-05, + "loss": 1.4832, + "step": 11749 + }, + { + "epoch": 0.39151821324115926, + "grad_norm": 1.315329670906067, + "learning_rate": 3.609036080643755e-05, + "loss": 1.4853, + "step": 11780 + }, + { + "epoch": 0.392548524328636, + "grad_norm": 1.4093523025512695, + "learning_rate": 3.60150318378136e-05, + "loss": 1.4978, + "step": 11811 + }, + { + "epoch": 0.3935788354161127, + "grad_norm": 1.271151065826416, + "learning_rate": 3.5939578566052465e-05, + "loss": 1.4933, + "step": 11842 + }, + { + "epoch": 0.39460914650358947, + "grad_norm": 1.2910923957824707, + "learning_rate": 3.586400184263408e-05, + "loss": 1.4853, + "step": 11873 + }, + { + "epoch": 0.3956394575910662, + "grad_norm": 1.2480064630508423, + "learning_rate": 3.578830252043148e-05, + "loss": 1.4642, + "step": 11904 + }, + { + "epoch": 0.3966697686785429, + "grad_norm": 1.263197422027588, + "learning_rate": 3.571248145370125e-05, + "loss": 1.4812, + "step": 11935 + }, + { + "epoch": 0.3977000797660197, + "grad_norm": 1.3231288194656372, + "learning_rate": 3.5636539498073794e-05, + "loss": 1.4744, + "step": 11966 + }, + { + "epoch": 0.39873039085349643, + "grad_norm": 1.3933110237121582, + "learning_rate": 3.556047751054378e-05, + "loss": 1.4849, + "step": 11997 + }, + { + "epoch": 0.39976070194097313, + "grad_norm": 1.3615801334381104, + "learning_rate": 3.548429634946039e-05, + "loss": 1.4866, + "step": 12028 + }, + { + "epoch": 0.4007910130284499, + "grad_norm": 1.298638939857483, + "learning_rate": 3.540799687451768e-05, + "loss": 1.4664, + "step": 12059 + }, + { + "epoch": 0.40182132411592664, + "grad_norm": 1.29216468334198, + "learning_rate": 3.533157994674485e-05, + "loss": 1.4697, + "step": 12090 + }, + { + "epoch": 0.40285163520340334, + "grad_norm": 1.3759845495224, + "learning_rate": 3.5255046428496546e-05, + "loss": 1.4854, + "step": 12121 + }, + { + "epoch": 0.4038819462908801, + "grad_norm": 1.4045615196228027, + "learning_rate": 3.517839718344311e-05, + "loss": 1.4622, + "step": 12152 + }, + { + "epoch": 0.40491225737835684, + "grad_norm": 1.2979034185409546, + "learning_rate": 3.510163307656086e-05, + "loss": 1.4797, + "step": 12183 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.055780216089084e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12208/training_args.bin b/checkpoint-12208/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..974208468b82a3c5684aaa384776477cf21c18ca --- /dev/null +++ b/checkpoint-12208/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a23be0ff07d6d3142f7c0980f91dddba845519c24fcb411cbb4b9ddb1513ff +size 5304 diff --git a/checkpoint-15260/config.json b/checkpoint-15260/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aaa74176892d42e1c7f5979b7ddf8ab15985d3 --- /dev/null +++ b/checkpoint-15260/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "/mnt/parscratch/users/acp23ay/private/models/Llama-3.1-8B-Instruct-ta-madlad-mean/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 138256 +} diff --git a/checkpoint-15260/generation_config.json b/checkpoint-15260/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-15260/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-15260/model-00001-of-00007.safetensors b/checkpoint-15260/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b479742db37b44e5dc5d713936e23a6ebb6d8180 --- /dev/null +++ b/checkpoint-15260/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea0f35225868c2f9f8770492cc681c525266286582cc49af99ef751c57491ac0 +size 4983197184 diff --git a/checkpoint-15260/model-00002-of-00007.safetensors b/checkpoint-15260/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d32d83cb96a109af411b9cf577e7fbfe07ea76fc --- /dev/null +++ b/checkpoint-15260/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168629c67732309d49f60a5ec48a6d160212bc987365e82e183dfbf74ba0c1f3 +size 4899116432 diff --git a/checkpoint-15260/model-00003-of-00007.safetensors b/checkpoint-15260/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-15260/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-15260/model-00004-of-00007.safetensors b/checkpoint-15260/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-15260/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-15260/model-00005-of-00007.safetensors b/checkpoint-15260/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-15260/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-15260/model-00006-of-00007.safetensors b/checkpoint-15260/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..107d0ba9a11ed5dbff11dff3b6dda1829f419445 --- /dev/null +++ b/checkpoint-15260/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df9d6dc5fc81adef766e05947ef61bc406443da64661faffeeb59936c5d5d820 +size 4999813120 diff --git a/checkpoint-15260/model-00007-of-00007.safetensors b/checkpoint-15260/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8de1d4937b732a532455ee703a7110e04145306b --- /dev/null +++ b/checkpoint-15260/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e0b06bfc5416ea4594646dde5eb4b46407b14a7966270db1b3615f59746734c +size 2734998184 diff --git a/checkpoint-15260/model.safetensors.index.json b/checkpoint-15260/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..318803c6a3dd771c7f7c3b8038a896af7c8322ae --- /dev/null +++ b/checkpoint-15260/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32448724992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-15260/optimizer.pt b/checkpoint-15260/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f22d88350fdffc3fd66623c49a61162fc334b50 --- /dev/null +++ b/checkpoint-15260/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebb046bf713a5a9daa84654fa90f5a2c50f9ed6b428513e61d24fad0c1c30fd4 +size 16040396334 diff --git a/checkpoint-15260/rng_state.pth b/checkpoint-15260/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-15260/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-15260/scheduler.pt b/checkpoint-15260/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..38f65fadce527ecb01fb7d7fd7ef901b9b082813 --- /dev/null +++ b/checkpoint-15260/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ce335347efaebf038b8915bc0a8d2d587a4a1aa08ad0e015b4bc7cc4fba634e +size 1064 diff --git a/checkpoint-15260/trainer_state.json b/checkpoint-15260/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2cff833b7cb1b51aca64b2d027eb88d4ea45022b --- /dev/null +++ b/checkpoint-15260/trainer_state.json @@ -0,0 +1,3477 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5071789417708057, + "eval_steps": 500, + "global_step": 15260, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001030311087476735, + "grad_norm": 60.25547409057617, + "learning_rate": 1.0157273918741808e-06, + "loss": 8.8455, + "step": 31 + }, + { + "epoch": 0.00206062217495347, + "grad_norm": 15.669363975524902, + "learning_rate": 2.0314547837483616e-06, + "loss": 7.1553, + "step": 62 + }, + { + "epoch": 0.003090933262430205, + "grad_norm": 15.366345405578613, + "learning_rate": 3.0471821756225426e-06, + "loss": 5.8784, + "step": 93 + }, + { + "epoch": 0.00412124434990694, + "grad_norm": 36.30561828613281, + "learning_rate": 4.062909567496723e-06, + "loss": 4.7708, + "step": 124 + }, + { + "epoch": 0.005151555437383675, + "grad_norm": 27.202678680419922, + "learning_rate": 5.078636959370905e-06, + "loss": 4.1629, + "step": 155 + }, + { + "epoch": 0.00618186652486041, + "grad_norm": 24.30484962463379, + "learning_rate": 6.094364351245085e-06, + "loss": 3.867, + "step": 186 + }, + { + "epoch": 0.007212177612337145, + "grad_norm": 19.916366577148438, + "learning_rate": 7.110091743119267e-06, + "loss": 3.6131, + "step": 217 + }, + { + "epoch": 0.00824248869981388, + "grad_norm": 17.577274322509766, + "learning_rate": 8.125819134993446e-06, + "loss": 3.4772, + "step": 248 + }, + { + "epoch": 0.009272799787290615, + "grad_norm": 12.133190155029297, + "learning_rate": 9.141546526867629e-06, + "loss": 3.3218, + "step": 279 + }, + { + "epoch": 0.01030311087476735, + "grad_norm": 19.79263687133789, + "learning_rate": 1.015727391874181e-05, + "loss": 3.2055, + "step": 310 + }, + { + "epoch": 0.011333421962244085, + "grad_norm": 16.38133430480957, + "learning_rate": 1.117300131061599e-05, + "loss": 3.1062, + "step": 341 + }, + { + "epoch": 0.01236373304972082, + "grad_norm": 12.638299942016602, + "learning_rate": 1.218872870249017e-05, + "loss": 3.0106, + "step": 372 + }, + { + "epoch": 0.013394044137197554, + "grad_norm": 9.46596908569336, + "learning_rate": 1.3204456094364351e-05, + "loss": 2.924, + "step": 403 + }, + { + "epoch": 0.01442435522467429, + "grad_norm": 10.945392608642578, + "learning_rate": 1.4220183486238533e-05, + "loss": 2.844, + "step": 434 + }, + { + "epoch": 0.015454666312151024, + "grad_norm": 8.474015235900879, + "learning_rate": 1.5235910878112714e-05, + "loss": 2.7892, + "step": 465 + }, + { + "epoch": 0.01648497739962776, + "grad_norm": 9.370804786682129, + "learning_rate": 1.6251638269986893e-05, + "loss": 2.7509, + "step": 496 + }, + { + "epoch": 0.017515288487104493, + "grad_norm": 11.63398551940918, + "learning_rate": 1.7267365661861077e-05, + "loss": 2.6999, + "step": 527 + }, + { + "epoch": 0.01854559957458123, + "grad_norm": 9.17713451385498, + "learning_rate": 1.8283093053735257e-05, + "loss": 2.6459, + "step": 558 + }, + { + "epoch": 0.019575910662057962, + "grad_norm": 7.119054794311523, + "learning_rate": 1.9298820445609438e-05, + "loss": 2.603, + "step": 589 + }, + { + "epoch": 0.0206062217495347, + "grad_norm": 6.653646945953369, + "learning_rate": 2.031454783748362e-05, + "loss": 2.5588, + "step": 620 + }, + { + "epoch": 0.021636532837011432, + "grad_norm": 8.332653045654297, + "learning_rate": 2.13302752293578e-05, + "loss": 2.5357, + "step": 651 + }, + { + "epoch": 0.02266684392448817, + "grad_norm": 6.4949116706848145, + "learning_rate": 2.234600262123198e-05, + "loss": 2.4967, + "step": 682 + }, + { + "epoch": 0.023697155011964902, + "grad_norm": 9.41009521484375, + "learning_rate": 2.336173001310616e-05, + "loss": 2.4563, + "step": 713 + }, + { + "epoch": 0.02472746609944164, + "grad_norm": 7.840345859527588, + "learning_rate": 2.437745740498034e-05, + "loss": 2.4383, + "step": 744 + }, + { + "epoch": 0.025757777186918372, + "grad_norm": 6.116458415985107, + "learning_rate": 2.5393184796854525e-05, + "loss": 2.3817, + "step": 775 + }, + { + "epoch": 0.02678808827439511, + "grad_norm": 5.938300609588623, + "learning_rate": 2.6408912188728702e-05, + "loss": 2.3508, + "step": 806 + }, + { + "epoch": 0.027818399361871842, + "grad_norm": 5.4408345222473145, + "learning_rate": 2.7424639580602886e-05, + "loss": 2.3325, + "step": 837 + }, + { + "epoch": 0.02884871044934858, + "grad_norm": 5.375136375427246, + "learning_rate": 2.8440366972477066e-05, + "loss": 2.3101, + "step": 868 + }, + { + "epoch": 0.029879021536825312, + "grad_norm": 5.149726867675781, + "learning_rate": 2.9456094364351244e-05, + "loss": 2.282, + "step": 899 + }, + { + "epoch": 0.03090933262430205, + "grad_norm": 4.591221332550049, + "learning_rate": 3.0471821756225428e-05, + "loss": 2.2427, + "step": 930 + }, + { + "epoch": 0.031939643711778785, + "grad_norm": 4.977034091949463, + "learning_rate": 3.148754914809961e-05, + "loss": 2.2218, + "step": 961 + }, + { + "epoch": 0.03296995479925552, + "grad_norm": 5.038781642913818, + "learning_rate": 3.2503276539973785e-05, + "loss": 2.2044, + "step": 992 + }, + { + "epoch": 0.03400026588673225, + "grad_norm": 4.872281551361084, + "learning_rate": 3.351900393184797e-05, + "loss": 2.1657, + "step": 1023 + }, + { + "epoch": 0.035030576974208985, + "grad_norm": 4.370841979980469, + "learning_rate": 3.453473132372215e-05, + "loss": 2.1365, + "step": 1054 + }, + { + "epoch": 0.036060888061685725, + "grad_norm": 4.087072849273682, + "learning_rate": 3.555045871559633e-05, + "loss": 2.1253, + "step": 1085 + }, + { + "epoch": 0.03709119914916246, + "grad_norm": 4.113957405090332, + "learning_rate": 3.6566186107470514e-05, + "loss": 2.0973, + "step": 1116 + }, + { + "epoch": 0.03812151023663919, + "grad_norm": 4.0119733810424805, + "learning_rate": 3.7581913499344695e-05, + "loss": 2.1024, + "step": 1147 + }, + { + "epoch": 0.039151821324115925, + "grad_norm": 4.247573375701904, + "learning_rate": 3.8597640891218876e-05, + "loss": 2.0722, + "step": 1178 + }, + { + "epoch": 0.04018213241159266, + "grad_norm": 3.5575129985809326, + "learning_rate": 3.9613368283093056e-05, + "loss": 2.056, + "step": 1209 + }, + { + "epoch": 0.0412124434990694, + "grad_norm": 3.8885862827301025, + "learning_rate": 4.062909567496724e-05, + "loss": 2.0389, + "step": 1240 + }, + { + "epoch": 0.04224275458654613, + "grad_norm": 3.680628538131714, + "learning_rate": 4.164482306684142e-05, + "loss": 2.0385, + "step": 1271 + }, + { + "epoch": 0.043273065674022865, + "grad_norm": 3.780876874923706, + "learning_rate": 4.26605504587156e-05, + "loss": 2.0097, + "step": 1302 + }, + { + "epoch": 0.0443033767614996, + "grad_norm": 4.235328674316406, + "learning_rate": 4.367627785058978e-05, + "loss": 2.0024, + "step": 1333 + }, + { + "epoch": 0.04533368784897634, + "grad_norm": 3.326941967010498, + "learning_rate": 4.469200524246396e-05, + "loss": 1.9953, + "step": 1364 + }, + { + "epoch": 0.04636399893645307, + "grad_norm": 3.28456449508667, + "learning_rate": 4.570773263433814e-05, + "loss": 1.9579, + "step": 1395 + }, + { + "epoch": 0.047394310023929805, + "grad_norm": 16.107433319091797, + "learning_rate": 4.672346002621232e-05, + "loss": 1.9701, + "step": 1426 + }, + { + "epoch": 0.04842462111140654, + "grad_norm": 3.5708224773406982, + "learning_rate": 4.77391874180865e-05, + "loss": 1.9621, + "step": 1457 + }, + { + "epoch": 0.04945493219888328, + "grad_norm": 2.9053499698638916, + "learning_rate": 4.875491480996068e-05, + "loss": 1.9458, + "step": 1488 + }, + { + "epoch": 0.05048524328636001, + "grad_norm": 3.0863258838653564, + "learning_rate": 4.977064220183487e-05, + "loss": 1.9483, + "step": 1519 + }, + { + "epoch": 0.051515554373836744, + "grad_norm": 2.9012269973754883, + "learning_rate": 4.9999915451558777e-05, + "loss": 1.928, + "step": 1550 + }, + { + "epoch": 0.05254586546131348, + "grad_norm": 3.0949041843414307, + "learning_rate": 4.999955597496219e-05, + "loss": 1.9229, + "step": 1581 + }, + { + "epoch": 0.05357617654879022, + "grad_norm": 2.8687901496887207, + "learning_rate": 4.9998914381774255e-05, + "loss": 1.915, + "step": 1612 + }, + { + "epoch": 0.05460648763626695, + "grad_norm": 3.2136878967285156, + "learning_rate": 4.999799067923527e-05, + "loss": 1.9197, + "step": 1643 + }, + { + "epoch": 0.055636798723743684, + "grad_norm": 2.590843677520752, + "learning_rate": 4.999678487776908e-05, + "loss": 1.8756, + "step": 1674 + }, + { + "epoch": 0.05666710981122042, + "grad_norm": 2.64634108543396, + "learning_rate": 4.9995296990983006e-05, + "loss": 1.9033, + "step": 1705 + }, + { + "epoch": 0.05769742089869716, + "grad_norm": 3.0151331424713135, + "learning_rate": 4.999352703566763e-05, + "loss": 1.8883, + "step": 1736 + }, + { + "epoch": 0.05872773198617389, + "grad_norm": 2.526806354522705, + "learning_rate": 4.999147503179668e-05, + "loss": 1.8666, + "step": 1767 + }, + { + "epoch": 0.059758043073650624, + "grad_norm": 2.510300397872925, + "learning_rate": 4.998914100252672e-05, + "loss": 1.854, + "step": 1798 + }, + { + "epoch": 0.06078835416112736, + "grad_norm": 2.4867682456970215, + "learning_rate": 4.998652497419696e-05, + "loss": 1.8548, + "step": 1829 + }, + { + "epoch": 0.0618186652486041, + "grad_norm": 2.3920586109161377, + "learning_rate": 4.9983626976328927e-05, + "loss": 1.8495, + "step": 1860 + }, + { + "epoch": 0.06284897633608083, + "grad_norm": 2.714177370071411, + "learning_rate": 4.998044704162613e-05, + "loss": 1.8433, + "step": 1891 + }, + { + "epoch": 0.06387928742355757, + "grad_norm": 2.3094465732574463, + "learning_rate": 4.9976985205973705e-05, + "loss": 1.8382, + "step": 1922 + }, + { + "epoch": 0.0649095985110343, + "grad_norm": 2.47184419631958, + "learning_rate": 4.997324150843799e-05, + "loss": 1.8464, + "step": 1953 + }, + { + "epoch": 0.06593990959851104, + "grad_norm": 2.391841411590576, + "learning_rate": 4.99692159912661e-05, + "loss": 1.8179, + "step": 1984 + }, + { + "epoch": 0.06697022068598776, + "grad_norm": 2.2471864223480225, + "learning_rate": 4.996490869988546e-05, + "loss": 1.8149, + "step": 2015 + }, + { + "epoch": 0.0680005317734645, + "grad_norm": 2.5497376918792725, + "learning_rate": 4.996031968290326e-05, + "loss": 1.8099, + "step": 2046 + }, + { + "epoch": 0.06903084286094124, + "grad_norm": 2.330463409423828, + "learning_rate": 4.995544899210594e-05, + "loss": 1.8267, + "step": 2077 + }, + { + "epoch": 0.07006115394841797, + "grad_norm": 2.3259341716766357, + "learning_rate": 4.9950296682458583e-05, + "loss": 1.7801, + "step": 2108 + }, + { + "epoch": 0.07109146503589471, + "grad_norm": 2.1711952686309814, + "learning_rate": 4.994486281210429e-05, + "loss": 1.7961, + "step": 2139 + }, + { + "epoch": 0.07212177612337145, + "grad_norm": 2.1808884143829346, + "learning_rate": 4.9939147442363566e-05, + "loss": 1.8109, + "step": 2170 + }, + { + "epoch": 0.07315208721084818, + "grad_norm": 2.089256525039673, + "learning_rate": 4.9933150637733574e-05, + "loss": 1.8026, + "step": 2201 + }, + { + "epoch": 0.07418239829832492, + "grad_norm": 2.0864951610565186, + "learning_rate": 4.992687246588743e-05, + "loss": 1.7753, + "step": 2232 + }, + { + "epoch": 0.07521270938580164, + "grad_norm": 2.36157488822937, + "learning_rate": 4.992031299767347e-05, + "loss": 1.7746, + "step": 2263 + }, + { + "epoch": 0.07624302047327838, + "grad_norm": 2.5334439277648926, + "learning_rate": 4.9913472307114386e-05, + "loss": 1.7927, + "step": 2294 + }, + { + "epoch": 0.07727333156075512, + "grad_norm": 2.2565715312957764, + "learning_rate": 4.9906350471406446e-05, + "loss": 1.7668, + "step": 2325 + }, + { + "epoch": 0.07830364264823185, + "grad_norm": 2.1043128967285156, + "learning_rate": 4.989894757091861e-05, + "loss": 1.7771, + "step": 2356 + }, + { + "epoch": 0.07933395373570859, + "grad_norm": 1.9659819602966309, + "learning_rate": 4.989126368919158e-05, + "loss": 1.7666, + "step": 2387 + }, + { + "epoch": 0.08036426482318532, + "grad_norm": 2.0778403282165527, + "learning_rate": 4.988329891293693e-05, + "loss": 1.7405, + "step": 2418 + }, + { + "epoch": 0.08139457591066206, + "grad_norm": 2.1767923831939697, + "learning_rate": 4.987505333203608e-05, + "loss": 1.7495, + "step": 2449 + }, + { + "epoch": 0.0824248869981388, + "grad_norm": 2.260143280029297, + "learning_rate": 4.9866527039539276e-05, + "loss": 1.7504, + "step": 2480 + }, + { + "epoch": 0.08345519808561552, + "grad_norm": 2.18271803855896, + "learning_rate": 4.9857720131664594e-05, + "loss": 1.7456, + "step": 2511 + }, + { + "epoch": 0.08448550917309226, + "grad_norm": 2.209594964981079, + "learning_rate": 4.9848632707796773e-05, + "loss": 1.7528, + "step": 2542 + }, + { + "epoch": 0.085515820260569, + "grad_norm": 2.0666229724884033, + "learning_rate": 4.9839264870486155e-05, + "loss": 1.7517, + "step": 2573 + }, + { + "epoch": 0.08654613134804573, + "grad_norm": 2.1070454120635986, + "learning_rate": 4.9829616725447526e-05, + "loss": 1.7474, + "step": 2604 + }, + { + "epoch": 0.08757644243552247, + "grad_norm": 1.9430303573608398, + "learning_rate": 4.981968838155888e-05, + "loss": 1.7348, + "step": 2635 + }, + { + "epoch": 0.0886067535229992, + "grad_norm": 1.9638925790786743, + "learning_rate": 4.980947995086024e-05, + "loss": 1.7202, + "step": 2666 + }, + { + "epoch": 0.08963706461047594, + "grad_norm": 1.8845652341842651, + "learning_rate": 4.979899154855234e-05, + "loss": 1.7375, + "step": 2697 + }, + { + "epoch": 0.09066737569795268, + "grad_norm": 5.712058067321777, + "learning_rate": 4.9788223292995386e-05, + "loss": 1.7379, + "step": 2728 + }, + { + "epoch": 0.0916976867854294, + "grad_norm": 1.9520670175552368, + "learning_rate": 4.977717530570768e-05, + "loss": 1.7302, + "step": 2759 + }, + { + "epoch": 0.09272799787290614, + "grad_norm": 1.8802224397659302, + "learning_rate": 4.976584771136425e-05, + "loss": 1.74, + "step": 2790 + }, + { + "epoch": 0.09375830896038288, + "grad_norm": 2.1098153591156006, + "learning_rate": 4.975424063779547e-05, + "loss": 1.7024, + "step": 2821 + }, + { + "epoch": 0.09478862004785961, + "grad_norm": 2.1568291187286377, + "learning_rate": 4.974235421598557e-05, + "loss": 1.7131, + "step": 2852 + }, + { + "epoch": 0.09581893113533635, + "grad_norm": 1.8769980669021606, + "learning_rate": 4.973018858007122e-05, + "loss": 1.7008, + "step": 2883 + }, + { + "epoch": 0.09684924222281308, + "grad_norm": 1.8325533866882324, + "learning_rate": 4.9717743867339963e-05, + "loss": 1.7058, + "step": 2914 + }, + { + "epoch": 0.09787955331028982, + "grad_norm": 2.086416721343994, + "learning_rate": 4.9705020218228695e-05, + "loss": 1.711, + "step": 2945 + }, + { + "epoch": 0.09890986439776656, + "grad_norm": 1.8294793367385864, + "learning_rate": 4.969201777632205e-05, + "loss": 1.6998, + "step": 2976 + }, + { + "epoch": 0.09994017548524328, + "grad_norm": 2.0608153343200684, + "learning_rate": 4.9678736688350846e-05, + "loss": 1.6948, + "step": 3007 + }, + { + "epoch": 0.10097048657272002, + "grad_norm": 3.2166008949279785, + "learning_rate": 4.966517710419033e-05, + "loss": 1.6788, + "step": 3038 + }, + { + "epoch": 0.10200079766019676, + "grad_norm": 1.9431313276290894, + "learning_rate": 4.965133917685858e-05, + "loss": 1.7115, + "step": 3069 + }, + { + "epoch": 0.10303110874767349, + "grad_norm": 1.967512845993042, + "learning_rate": 4.9637223062514714e-05, + "loss": 1.7033, + "step": 3100 + }, + { + "epoch": 0.10406141983515023, + "grad_norm": 1.9253389835357666, + "learning_rate": 4.962282892045718e-05, + "loss": 1.6856, + "step": 3131 + }, + { + "epoch": 0.10509173092262696, + "grad_norm": 1.986840009689331, + "learning_rate": 4.9608156913121904e-05, + "loss": 1.723, + "step": 3162 + }, + { + "epoch": 0.1061220420101037, + "grad_norm": 1.83523690700531, + "learning_rate": 4.959320720608049e-05, + "loss": 1.6912, + "step": 3193 + }, + { + "epoch": 0.10715235309758044, + "grad_norm": 2.1271955966949463, + "learning_rate": 4.9577979968038354e-05, + "loss": 1.7032, + "step": 3224 + }, + { + "epoch": 0.10818266418505716, + "grad_norm": 1.8383768796920776, + "learning_rate": 4.956247537083282e-05, + "loss": 1.6726, + "step": 3255 + }, + { + "epoch": 0.1092129752725339, + "grad_norm": 1.8806651830673218, + "learning_rate": 4.9546693589431145e-05, + "loss": 1.6817, + "step": 3286 + }, + { + "epoch": 0.11024328636001064, + "grad_norm": 1.7535260915756226, + "learning_rate": 4.9530634801928595e-05, + "loss": 1.6875, + "step": 3317 + }, + { + "epoch": 0.11127359744748737, + "grad_norm": 1.765906810760498, + "learning_rate": 4.9514299189546395e-05, + "loss": 1.6859, + "step": 3348 + }, + { + "epoch": 0.11230390853496411, + "grad_norm": 1.869828462600708, + "learning_rate": 4.949768693662973e-05, + "loss": 1.6915, + "step": 3379 + }, + { + "epoch": 0.11333421962244083, + "grad_norm": 1.8347504138946533, + "learning_rate": 4.948079823064559e-05, + "loss": 1.6859, + "step": 3410 + }, + { + "epoch": 0.11436453070991758, + "grad_norm": 1.7692474126815796, + "learning_rate": 4.946363326218074e-05, + "loss": 1.6565, + "step": 3441 + }, + { + "epoch": 0.11539484179739432, + "grad_norm": 1.8231885433197021, + "learning_rate": 4.9446192224939525e-05, + "loss": 1.686, + "step": 3472 + }, + { + "epoch": 0.11642515288487104, + "grad_norm": 1.7155958414077759, + "learning_rate": 4.942847531574167e-05, + "loss": 1.6538, + "step": 3503 + }, + { + "epoch": 0.11745546397234778, + "grad_norm": 1.787183403968811, + "learning_rate": 4.941048273452008e-05, + "loss": 1.6776, + "step": 3534 + }, + { + "epoch": 0.11848577505982451, + "grad_norm": 1.741213083267212, + "learning_rate": 4.9392214684318605e-05, + "loss": 1.6784, + "step": 3565 + }, + { + "epoch": 0.11951608614730125, + "grad_norm": 1.7836824655532837, + "learning_rate": 4.93736713712897e-05, + "loss": 1.6557, + "step": 3596 + }, + { + "epoch": 0.12054639723477799, + "grad_norm": 1.7103859186172485, + "learning_rate": 4.9354853004692124e-05, + "loss": 1.6606, + "step": 3627 + }, + { + "epoch": 0.12157670832225471, + "grad_norm": 1.7865506410598755, + "learning_rate": 4.93357597968886e-05, + "loss": 1.6409, + "step": 3658 + }, + { + "epoch": 0.12260701940973145, + "grad_norm": 1.7770143747329712, + "learning_rate": 4.931639196334338e-05, + "loss": 1.6574, + "step": 3689 + }, + { + "epoch": 0.1236373304972082, + "grad_norm": 1.857575535774231, + "learning_rate": 4.9296749722619826e-05, + "loss": 1.6724, + "step": 3720 + }, + { + "epoch": 0.12466764158468492, + "grad_norm": 1.8742581605911255, + "learning_rate": 4.9276833296377966e-05, + "loss": 1.6506, + "step": 3751 + }, + { + "epoch": 0.12569795267216166, + "grad_norm": 1.827668309211731, + "learning_rate": 4.925664290937196e-05, + "loss": 1.6523, + "step": 3782 + }, + { + "epoch": 0.1267282637596384, + "grad_norm": 1.7517486810684204, + "learning_rate": 4.9236178789447576e-05, + "loss": 1.6459, + "step": 3813 + }, + { + "epoch": 0.12775857484711514, + "grad_norm": 1.8109570741653442, + "learning_rate": 4.921544116753962e-05, + "loss": 1.6614, + "step": 3844 + }, + { + "epoch": 0.12878888593459187, + "grad_norm": 1.692597508430481, + "learning_rate": 4.919443027766935e-05, + "loss": 1.6431, + "step": 3875 + }, + { + "epoch": 0.1298191970220686, + "grad_norm": 1.8650025129318237, + "learning_rate": 4.91731463569418e-05, + "loss": 1.6466, + "step": 3906 + }, + { + "epoch": 0.13084950810954532, + "grad_norm": 1.6794081926345825, + "learning_rate": 4.915158964554312e-05, + "loss": 1.6504, + "step": 3937 + }, + { + "epoch": 0.13187981919702207, + "grad_norm": 1.7685374021530151, + "learning_rate": 4.912976038673786e-05, + "loss": 1.6446, + "step": 3968 + }, + { + "epoch": 0.1329101302844988, + "grad_norm": 1.7601110935211182, + "learning_rate": 4.9107658826866254e-05, + "loss": 1.631, + "step": 3999 + }, + { + "epoch": 0.13394044137197553, + "grad_norm": 2.0616064071655273, + "learning_rate": 4.908528521534139e-05, + "loss": 1.6476, + "step": 4030 + }, + { + "epoch": 0.13497075245945228, + "grad_norm": 1.8973504304885864, + "learning_rate": 4.906263980464644e-05, + "loss": 1.6582, + "step": 4061 + }, + { + "epoch": 0.136001063546929, + "grad_norm": 1.7768895626068115, + "learning_rate": 4.903972285033178e-05, + "loss": 1.6159, + "step": 4092 + }, + { + "epoch": 0.13703137463440573, + "grad_norm": 1.8264424800872803, + "learning_rate": 4.901653461101213e-05, + "loss": 1.6289, + "step": 4123 + }, + { + "epoch": 0.1380616857218825, + "grad_norm": 1.7140119075775146, + "learning_rate": 4.8993075348363626e-05, + "loss": 1.6357, + "step": 4154 + }, + { + "epoch": 0.13909199680935921, + "grad_norm": 1.6964486837387085, + "learning_rate": 4.896934532712084e-05, + "loss": 1.6233, + "step": 4185 + }, + { + "epoch": 0.14012230789683594, + "grad_norm": 1.8008025884628296, + "learning_rate": 4.8945344815073846e-05, + "loss": 1.637, + "step": 4216 + }, + { + "epoch": 0.1411526189843127, + "grad_norm": 1.562730073928833, + "learning_rate": 4.892107408306516e-05, + "loss": 1.6379, + "step": 4247 + }, + { + "epoch": 0.14218293007178942, + "grad_norm": 1.8273371458053589, + "learning_rate": 4.889653340498669e-05, + "loss": 1.6246, + "step": 4278 + }, + { + "epoch": 0.14321324115926615, + "grad_norm": 56.33716583251953, + "learning_rate": 4.8871723057776664e-05, + "loss": 1.6457, + "step": 4309 + }, + { + "epoch": 0.1442435522467429, + "grad_norm": 1.746523380279541, + "learning_rate": 4.8846643321416476e-05, + "loss": 1.6343, + "step": 4340 + }, + { + "epoch": 0.14527386333421963, + "grad_norm": 1.7737531661987305, + "learning_rate": 4.882129447892753e-05, + "loss": 1.6447, + "step": 4371 + }, + { + "epoch": 0.14630417442169635, + "grad_norm": 1.660485863685608, + "learning_rate": 4.8795676816368076e-05, + "loss": 1.6192, + "step": 4402 + }, + { + "epoch": 0.14733448550917308, + "grad_norm": 1.6823406219482422, + "learning_rate": 4.876979062282995e-05, + "loss": 1.6253, + "step": 4433 + }, + { + "epoch": 0.14836479659664983, + "grad_norm": 7.78139066696167, + "learning_rate": 4.8743636190435325e-05, + "loss": 1.6234, + "step": 4464 + }, + { + "epoch": 0.14939510768412656, + "grad_norm": 1.7426058053970337, + "learning_rate": 4.871721381433344e-05, + "loss": 1.6337, + "step": 4495 + }, + { + "epoch": 0.1504254187716033, + "grad_norm": 1.6294783353805542, + "learning_rate": 4.869052379269719e-05, + "loss": 1.6217, + "step": 4526 + }, + { + "epoch": 0.15145572985908004, + "grad_norm": 1.6523306369781494, + "learning_rate": 4.866356642671985e-05, + "loss": 1.605, + "step": 4557 + }, + { + "epoch": 0.15248604094655677, + "grad_norm": 1.8571300506591797, + "learning_rate": 4.8636342020611634e-05, + "loss": 1.6218, + "step": 4588 + }, + { + "epoch": 0.1535163520340335, + "grad_norm": 1.7754936218261719, + "learning_rate": 4.860885088159626e-05, + "loss": 1.6171, + "step": 4619 + }, + { + "epoch": 0.15454666312151025, + "grad_norm": 1.91987943649292, + "learning_rate": 4.858109331990751e-05, + "loss": 1.6167, + "step": 4650 + }, + { + "epoch": 0.15557697420898697, + "grad_norm": 1.5994452238082886, + "learning_rate": 4.855306964878567e-05, + "loss": 1.5951, + "step": 4681 + }, + { + "epoch": 0.1566072852964637, + "grad_norm": 1.6490916013717651, + "learning_rate": 4.8524780184474084e-05, + "loss": 1.616, + "step": 4712 + }, + { + "epoch": 0.15763759638394045, + "grad_norm": 1.5921640396118164, + "learning_rate": 4.8496225246215496e-05, + "loss": 1.6346, + "step": 4743 + }, + { + "epoch": 0.15866790747141718, + "grad_norm": 1.6729261875152588, + "learning_rate": 4.8467405156248505e-05, + "loss": 1.6165, + "step": 4774 + }, + { + "epoch": 0.1596982185588939, + "grad_norm": 1.628113031387329, + "learning_rate": 4.843832023980392e-05, + "loss": 1.6119, + "step": 4805 + }, + { + "epoch": 0.16072852964637063, + "grad_norm": 1.651647925376892, + "learning_rate": 4.840897082510106e-05, + "loss": 1.5997, + "step": 4836 + }, + { + "epoch": 0.1617588407338474, + "grad_norm": 1.5297720432281494, + "learning_rate": 4.8379357243344084e-05, + "loss": 1.6242, + "step": 4867 + }, + { + "epoch": 0.1627891518213241, + "grad_norm": 1.5779869556427002, + "learning_rate": 4.8349479828718236e-05, + "loss": 1.6149, + "step": 4898 + }, + { + "epoch": 0.16381946290880084, + "grad_norm": 1.5843939781188965, + "learning_rate": 4.8319338918386075e-05, + "loss": 1.5926, + "step": 4929 + }, + { + "epoch": 0.1648497739962776, + "grad_norm": 2.3762106895446777, + "learning_rate": 4.828893485248369e-05, + "loss": 1.6108, + "step": 4960 + }, + { + "epoch": 0.16588008508375432, + "grad_norm": 1.5871953964233398, + "learning_rate": 4.825826797411682e-05, + "loss": 1.6103, + "step": 4991 + }, + { + "epoch": 0.16691039617123105, + "grad_norm": 1.5934125185012817, + "learning_rate": 4.822733862935702e-05, + "loss": 1.6091, + "step": 5022 + }, + { + "epoch": 0.1679407072587078, + "grad_norm": 1.6997628211975098, + "learning_rate": 4.819614716723775e-05, + "loss": 1.6098, + "step": 5053 + }, + { + "epoch": 0.16897101834618453, + "grad_norm": 1.682849645614624, + "learning_rate": 4.8164693939750425e-05, + "loss": 1.599, + "step": 5084 + }, + { + "epoch": 0.17000132943366125, + "grad_norm": 1.709743857383728, + "learning_rate": 4.813297930184042e-05, + "loss": 1.6194, + "step": 5115 + }, + { + "epoch": 0.171031640521138, + "grad_norm": 1.725879430770874, + "learning_rate": 4.810100361140314e-05, + "loss": 1.6115, + "step": 5146 + }, + { + "epoch": 0.17206195160861473, + "grad_norm": 1.6710290908813477, + "learning_rate": 4.8068767229279885e-05, + "loss": 1.6032, + "step": 5177 + }, + { + "epoch": 0.17309226269609146, + "grad_norm": 1.6156634092330933, + "learning_rate": 4.8036270519253854e-05, + "loss": 1.5973, + "step": 5208 + }, + { + "epoch": 0.1741225737835682, + "grad_norm": 1.5654059648513794, + "learning_rate": 4.8003513848046e-05, + "loss": 1.5817, + "step": 5239 + }, + { + "epoch": 0.17515288487104494, + "grad_norm": 1.5789822340011597, + "learning_rate": 4.79704975853109e-05, + "loss": 1.6138, + "step": 5270 + }, + { + "epoch": 0.17618319595852167, + "grad_norm": 1.6022037267684937, + "learning_rate": 4.793722210363262e-05, + "loss": 1.5998, + "step": 5301 + }, + { + "epoch": 0.1772135070459984, + "grad_norm": 1.5142741203308105, + "learning_rate": 4.7903687778520414e-05, + "loss": 1.6061, + "step": 5332 + }, + { + "epoch": 0.17824381813347515, + "grad_norm": 1.6454212665557861, + "learning_rate": 4.7869894988404593e-05, + "loss": 1.6063, + "step": 5363 + }, + { + "epoch": 0.17927412922095187, + "grad_norm": 1.5250823497772217, + "learning_rate": 4.783584411463221e-05, + "loss": 1.6038, + "step": 5394 + }, + { + "epoch": 0.1803044403084286, + "grad_norm": 1.5829335451126099, + "learning_rate": 4.780153554146274e-05, + "loss": 1.5949, + "step": 5425 + }, + { + "epoch": 0.18133475139590535, + "grad_norm": 1.5342432260513306, + "learning_rate": 4.7766969656063766e-05, + "loss": 1.5913, + "step": 5456 + }, + { + "epoch": 0.18236506248338208, + "grad_norm": 1.6397250890731812, + "learning_rate": 4.773214684850662e-05, + "loss": 1.6102, + "step": 5487 + }, + { + "epoch": 0.1833953735708588, + "grad_norm": 1.5228471755981445, + "learning_rate": 4.769706751176193e-05, + "loss": 1.5885, + "step": 5518 + }, + { + "epoch": 0.18442568465833556, + "grad_norm": 1.6186103820800781, + "learning_rate": 4.7661732041695264e-05, + "loss": 1.6086, + "step": 5549 + }, + { + "epoch": 0.18545599574581229, + "grad_norm": 1.6024582386016846, + "learning_rate": 4.762614083706258e-05, + "loss": 1.6004, + "step": 5580 + }, + { + "epoch": 0.186486306833289, + "grad_norm": 1.5443711280822754, + "learning_rate": 4.759029429950581e-05, + "loss": 1.6048, + "step": 5611 + }, + { + "epoch": 0.18751661792076577, + "grad_norm": 1.4831629991531372, + "learning_rate": 4.7554192833548235e-05, + "loss": 1.5841, + "step": 5642 + }, + { + "epoch": 0.1885469290082425, + "grad_norm": 1.6426068544387817, + "learning_rate": 4.751783684659e-05, + "loss": 1.587, + "step": 5673 + }, + { + "epoch": 0.18957724009571922, + "grad_norm": 1.4609078168869019, + "learning_rate": 4.748122674890348e-05, + "loss": 1.5945, + "step": 5704 + }, + { + "epoch": 0.19060755118319597, + "grad_norm": 1.5365614891052246, + "learning_rate": 4.7444362953628654e-05, + "loss": 1.5737, + "step": 5735 + }, + { + "epoch": 0.1916378622706727, + "grad_norm": 1.5755670070648193, + "learning_rate": 4.7407245876768424e-05, + "loss": 1.5862, + "step": 5766 + }, + { + "epoch": 0.19266817335814942, + "grad_norm": 1.6469846963882446, + "learning_rate": 4.736987593718397e-05, + "loss": 1.5663, + "step": 5797 + }, + { + "epoch": 0.19369848444562615, + "grad_norm": 1.5927278995513916, + "learning_rate": 4.733225355658999e-05, + "loss": 1.5776, + "step": 5828 + }, + { + "epoch": 0.1947287955331029, + "grad_norm": 1.5593287944793701, + "learning_rate": 4.7294379159549926e-05, + "loss": 1.579, + "step": 5859 + }, + { + "epoch": 0.19575910662057963, + "grad_norm": 1.534055233001709, + "learning_rate": 4.725625317347119e-05, + "loss": 1.6017, + "step": 5890 + }, + { + "epoch": 0.19678941770805636, + "grad_norm": 1.5846387147903442, + "learning_rate": 4.7217876028600374e-05, + "loss": 1.5739, + "step": 5921 + }, + { + "epoch": 0.1978197287955331, + "grad_norm": 1.5377682447433472, + "learning_rate": 4.717924815801832e-05, + "loss": 1.57, + "step": 5952 + }, + { + "epoch": 0.19885003988300984, + "grad_norm": 1.467956781387329, + "learning_rate": 4.714036999763532e-05, + "loss": 1.5736, + "step": 5983 + }, + { + "epoch": 0.19988035097048656, + "grad_norm": 1.601070523262024, + "learning_rate": 4.7101241986186116e-05, + "loss": 1.5861, + "step": 6014 + }, + { + "epoch": 0.20091066205796332, + "grad_norm": 1.5051921606063843, + "learning_rate": 4.7061864565225e-05, + "loss": 1.5735, + "step": 6045 + }, + { + "epoch": 0.20194097314544004, + "grad_norm": 1.462843418121338, + "learning_rate": 4.702223817912081e-05, + "loss": 1.582, + "step": 6076 + }, + { + "epoch": 0.20297128423291677, + "grad_norm": 1.5698682069778442, + "learning_rate": 4.698236327505195e-05, + "loss": 1.5647, + "step": 6107 + }, + { + "epoch": 0.20400159532039353, + "grad_norm": 1.5633916854858398, + "learning_rate": 4.694224030300127e-05, + "loss": 1.5741, + "step": 6138 + }, + { + "epoch": 0.20503190640787025, + "grad_norm": 1.6174733638763428, + "learning_rate": 4.690186971575107e-05, + "loss": 1.5634, + "step": 6169 + }, + { + "epoch": 0.20606221749534698, + "grad_norm": 1.4957518577575684, + "learning_rate": 4.6861251968877916e-05, + "loss": 1.575, + "step": 6200 + }, + { + "epoch": 0.2070925285828237, + "grad_norm": 1.670933485031128, + "learning_rate": 4.68203875207476e-05, + "loss": 1.5792, + "step": 6231 + }, + { + "epoch": 0.20812283967030046, + "grad_norm": 1.5676430463790894, + "learning_rate": 4.677927683250983e-05, + "loss": 1.5689, + "step": 6262 + }, + { + "epoch": 0.20915315075777718, + "grad_norm": 1.5753976106643677, + "learning_rate": 4.6737920368093156e-05, + "loss": 1.5594, + "step": 6293 + }, + { + "epoch": 0.2101834618452539, + "grad_norm": 1.4973617792129517, + "learning_rate": 4.669631859419965e-05, + "loss": 1.5593, + "step": 6324 + }, + { + "epoch": 0.21121377293273066, + "grad_norm": 1.4691433906555176, + "learning_rate": 4.6654471980299676e-05, + "loss": 1.5711, + "step": 6355 + }, + { + "epoch": 0.2122440840202074, + "grad_norm": 1.407630443572998, + "learning_rate": 4.661238099862658e-05, + "loss": 1.5787, + "step": 6386 + }, + { + "epoch": 0.21327439510768412, + "grad_norm": 1.5011677742004395, + "learning_rate": 4.657004612417138e-05, + "loss": 1.5751, + "step": 6417 + }, + { + "epoch": 0.21430470619516087, + "grad_norm": 1.509750485420227, + "learning_rate": 4.6527467834677374e-05, + "loss": 1.5583, + "step": 6448 + }, + { + "epoch": 0.2153350172826376, + "grad_norm": 1.3919882774353027, + "learning_rate": 4.648464661063478e-05, + "loss": 1.5712, + "step": 6479 + }, + { + "epoch": 0.21636532837011432, + "grad_norm": 1.4854936599731445, + "learning_rate": 4.6441582935275264e-05, + "loss": 1.5637, + "step": 6510 + }, + { + "epoch": 0.21739563945759108, + "grad_norm": 1.4413583278656006, + "learning_rate": 4.6398277294566586e-05, + "loss": 1.56, + "step": 6541 + }, + { + "epoch": 0.2184259505450678, + "grad_norm": 1.5063883066177368, + "learning_rate": 4.6354730177207e-05, + "loss": 1.5525, + "step": 6572 + }, + { + "epoch": 0.21945626163254453, + "grad_norm": 1.4899688959121704, + "learning_rate": 4.6310942074619787e-05, + "loss": 1.5817, + "step": 6603 + }, + { + "epoch": 0.22048657272002128, + "grad_norm": 1.3927967548370361, + "learning_rate": 4.626691348094777e-05, + "loss": 1.5407, + "step": 6634 + }, + { + "epoch": 0.221516883807498, + "grad_norm": 1.5378398895263672, + "learning_rate": 4.622264489304762e-05, + "loss": 1.5561, + "step": 6665 + }, + { + "epoch": 0.22254719489497474, + "grad_norm": 1.554624319076538, + "learning_rate": 4.617813681048434e-05, + "loss": 1.5859, + "step": 6696 + }, + { + "epoch": 0.22357750598245146, + "grad_norm": 1.5356658697128296, + "learning_rate": 4.61333897355256e-05, + "loss": 1.5531, + "step": 6727 + }, + { + "epoch": 0.22460781706992822, + "grad_norm": 1.5534918308258057, + "learning_rate": 4.608840417313604e-05, + "loss": 1.5774, + "step": 6758 + }, + { + "epoch": 0.22563812815740494, + "grad_norm": 1.5660988092422485, + "learning_rate": 4.6043180630971646e-05, + "loss": 1.5763, + "step": 6789 + }, + { + "epoch": 0.22666843924488167, + "grad_norm": 1.4993386268615723, + "learning_rate": 4.599771961937391e-05, + "loss": 1.5615, + "step": 6820 + }, + { + "epoch": 0.22769875033235842, + "grad_norm": 1.4630553722381592, + "learning_rate": 4.5952021651364204e-05, + "loss": 1.543, + "step": 6851 + }, + { + "epoch": 0.22872906141983515, + "grad_norm": 1.470173954963684, + "learning_rate": 4.590608724263786e-05, + "loss": 1.5674, + "step": 6882 + }, + { + "epoch": 0.22975937250731188, + "grad_norm": 1.5867971181869507, + "learning_rate": 4.585991691155845e-05, + "loss": 1.5702, + "step": 6913 + }, + { + "epoch": 0.23078968359478863, + "grad_norm": 1.44207763671875, + "learning_rate": 4.581351117915188e-05, + "loss": 1.5436, + "step": 6944 + }, + { + "epoch": 0.23181999468226536, + "grad_norm": 1.4691039323806763, + "learning_rate": 4.5766870569100534e-05, + "loss": 1.5465, + "step": 6975 + }, + { + "epoch": 0.23285030576974208, + "grad_norm": 1.4807918071746826, + "learning_rate": 4.571999560773736e-05, + "loss": 1.5564, + "step": 7006 + }, + { + "epoch": 0.23388061685721884, + "grad_norm": 1.481487512588501, + "learning_rate": 4.5672886824039915e-05, + "loss": 1.5466, + "step": 7037 + }, + { + "epoch": 0.23491092794469556, + "grad_norm": 1.4518013000488281, + "learning_rate": 4.5625544749624435e-05, + "loss": 1.5618, + "step": 7068 + }, + { + "epoch": 0.2359412390321723, + "grad_norm": 1.4186676740646362, + "learning_rate": 4.5577969918739794e-05, + "loss": 1.5528, + "step": 7099 + }, + { + "epoch": 0.23697155011964902, + "grad_norm": 1.5287110805511475, + "learning_rate": 4.5530162868261486e-05, + "loss": 1.5457, + "step": 7130 + }, + { + "epoch": 0.23800186120712577, + "grad_norm": 1.5516417026519775, + "learning_rate": 4.548212413768558e-05, + "loss": 1.5467, + "step": 7161 + }, + { + "epoch": 0.2390321722946025, + "grad_norm": 1.4710053205490112, + "learning_rate": 4.543385426912261e-05, + "loss": 1.5431, + "step": 7192 + }, + { + "epoch": 0.24006248338207922, + "grad_norm": 1.5005567073822021, + "learning_rate": 4.53853538072915e-05, + "loss": 1.5592, + "step": 7223 + }, + { + "epoch": 0.24109279446955598, + "grad_norm": 1.5864965915679932, + "learning_rate": 4.533662329951336e-05, + "loss": 1.5694, + "step": 7254 + }, + { + "epoch": 0.2421231055570327, + "grad_norm": 1.4661896228790283, + "learning_rate": 4.528766329570536e-05, + "loss": 1.545, + "step": 7285 + }, + { + "epoch": 0.24315341664450943, + "grad_norm": 1.5157560110092163, + "learning_rate": 4.523847434837447e-05, + "loss": 1.5458, + "step": 7316 + }, + { + "epoch": 0.24418372773198618, + "grad_norm": 1.4033585786819458, + "learning_rate": 4.518905701261128e-05, + "loss": 1.5464, + "step": 7347 + }, + { + "epoch": 0.2452140388194629, + "grad_norm": 1.5357593297958374, + "learning_rate": 4.5139411846083715e-05, + "loss": 1.5497, + "step": 7378 + }, + { + "epoch": 0.24624434990693964, + "grad_norm": 1.419507384300232, + "learning_rate": 4.508953940903073e-05, + "loss": 1.5414, + "step": 7409 + }, + { + "epoch": 0.2472746609944164, + "grad_norm": 1.5201773643493652, + "learning_rate": 4.5039440264255994e-05, + "loss": 1.5503, + "step": 7440 + }, + { + "epoch": 0.24830497208189312, + "grad_norm": 1.8000444173812866, + "learning_rate": 4.498911497712155e-05, + "loss": 1.5448, + "step": 7471 + }, + { + "epoch": 0.24933528316936984, + "grad_norm": 1.4876810312271118, + "learning_rate": 4.493856411554142e-05, + "loss": 1.5524, + "step": 7502 + }, + { + "epoch": 0.25036559425684657, + "grad_norm": 1.5130078792572021, + "learning_rate": 4.4887788249975206e-05, + "loss": 1.5454, + "step": 7533 + }, + { + "epoch": 0.2513959053443233, + "grad_norm": 1.4829351902008057, + "learning_rate": 4.4836787953421656e-05, + "loss": 1.5407, + "step": 7564 + }, + { + "epoch": 0.2524262164318001, + "grad_norm": 1.521550178527832, + "learning_rate": 4.478556380141218e-05, + "loss": 1.5727, + "step": 7595 + }, + { + "epoch": 0.2534565275192768, + "grad_norm": 1.4377928972244263, + "learning_rate": 4.4734116372004375e-05, + "loss": 1.5432, + "step": 7626 + }, + { + "epoch": 0.25448683860675353, + "grad_norm": 1.4101744890213013, + "learning_rate": 4.4682446245775477e-05, + "loss": 1.547, + "step": 7657 + }, + { + "epoch": 0.2555171496942303, + "grad_norm": 1.522524356842041, + "learning_rate": 4.463055400581586e-05, + "loss": 1.5418, + "step": 7688 + }, + { + "epoch": 0.256547460781707, + "grad_norm": 1.4160797595977783, + "learning_rate": 4.4578440237722374e-05, + "loss": 1.5457, + "step": 7719 + }, + { + "epoch": 0.25757777186918374, + "grad_norm": 1.4106636047363281, + "learning_rate": 4.452610552959183e-05, + "loss": 1.5405, + "step": 7750 + }, + { + "epoch": 0.2586080829566605, + "grad_norm": 1.422723650932312, + "learning_rate": 4.447355047201428e-05, + "loss": 1.5423, + "step": 7781 + }, + { + "epoch": 0.2596383940441372, + "grad_norm": 1.4362592697143555, + "learning_rate": 4.4420775658066414e-05, + "loss": 1.5372, + "step": 7812 + }, + { + "epoch": 0.26066870513161394, + "grad_norm": 1.4319696426391602, + "learning_rate": 4.436778168330484e-05, + "loss": 1.5451, + "step": 7843 + }, + { + "epoch": 0.26169901621909064, + "grad_norm": 1.4069257974624634, + "learning_rate": 4.4314569145759353e-05, + "loss": 1.5221, + "step": 7874 + }, + { + "epoch": 0.2627293273065674, + "grad_norm": 1.4424949884414673, + "learning_rate": 4.42611386459262e-05, + "loss": 1.5419, + "step": 7905 + }, + { + "epoch": 0.26375963839404415, + "grad_norm": 1.4579105377197266, + "learning_rate": 4.420749078676133e-05, + "loss": 1.5116, + "step": 7936 + }, + { + "epoch": 0.26478994948152085, + "grad_norm": 1.4563167095184326, + "learning_rate": 4.4153626173673516e-05, + "loss": 1.5296, + "step": 7967 + }, + { + "epoch": 0.2658202605689976, + "grad_norm": 1.4440968036651611, + "learning_rate": 4.409954541451762e-05, + "loss": 1.5548, + "step": 7998 + }, + { + "epoch": 0.26685057165647436, + "grad_norm": 1.5711034536361694, + "learning_rate": 4.404524911958764e-05, + "loss": 1.535, + "step": 8029 + }, + { + "epoch": 0.26788088274395105, + "grad_norm": 1.5221564769744873, + "learning_rate": 4.399073790160989e-05, + "loss": 1.5495, + "step": 8060 + }, + { + "epoch": 0.2689111938314278, + "grad_norm": 1.392699956893921, + "learning_rate": 4.393601237573607e-05, + "loss": 1.546, + "step": 8091 + }, + { + "epoch": 0.26994150491890456, + "grad_norm": 1.5343137979507446, + "learning_rate": 4.388107315953628e-05, + "loss": 1.549, + "step": 8122 + }, + { + "epoch": 0.27097181600638126, + "grad_norm": 1.4483468532562256, + "learning_rate": 4.382592087299212e-05, + "loss": 1.5424, + "step": 8153 + }, + { + "epoch": 0.272002127093858, + "grad_norm": 1.4963489770889282, + "learning_rate": 4.377055613848964e-05, + "loss": 1.508, + "step": 8184 + }, + { + "epoch": 0.27303243818133477, + "grad_norm": 1.4839162826538086, + "learning_rate": 4.3714979580812355e-05, + "loss": 1.5203, + "step": 8215 + }, + { + "epoch": 0.27406274926881147, + "grad_norm": 1.4272018671035767, + "learning_rate": 4.365919182713416e-05, + "loss": 1.5264, + "step": 8246 + }, + { + "epoch": 0.2750930603562882, + "grad_norm": 1.3808270692825317, + "learning_rate": 4.360319350701226e-05, + "loss": 1.5255, + "step": 8277 + }, + { + "epoch": 0.276123371443765, + "grad_norm": 1.4179162979125977, + "learning_rate": 4.3546985252380115e-05, + "loss": 1.535, + "step": 8308 + }, + { + "epoch": 0.2771536825312417, + "grad_norm": 1.3617374897003174, + "learning_rate": 4.349056769754021e-05, + "loss": 1.5295, + "step": 8339 + }, + { + "epoch": 0.27818399361871843, + "grad_norm": 1.4745615720748901, + "learning_rate": 4.3433941479156994e-05, + "loss": 1.5438, + "step": 8370 + }, + { + "epoch": 0.2792143047061952, + "grad_norm": 1.3661375045776367, + "learning_rate": 4.3377107236249647e-05, + "loss": 1.5134, + "step": 8401 + }, + { + "epoch": 0.2802446157936719, + "grad_norm": 1.3907949924468994, + "learning_rate": 4.332006561018488e-05, + "loss": 1.5237, + "step": 8432 + }, + { + "epoch": 0.28127492688114863, + "grad_norm": 1.3575704097747803, + "learning_rate": 4.3262817244669683e-05, + "loss": 1.5226, + "step": 8463 + }, + { + "epoch": 0.2823052379686254, + "grad_norm": 1.3836462497711182, + "learning_rate": 4.3205362785744083e-05, + "loss": 1.5433, + "step": 8494 + }, + { + "epoch": 0.2833355490561021, + "grad_norm": 1.6108276844024658, + "learning_rate": 4.314770288177384e-05, + "loss": 1.5324, + "step": 8525 + }, + { + "epoch": 0.28436586014357884, + "grad_norm": 1.4650689363479614, + "learning_rate": 4.308983818344313e-05, + "loss": 1.535, + "step": 8556 + }, + { + "epoch": 0.2853961712310556, + "grad_norm": 1.5836583375930786, + "learning_rate": 4.3031769343747206e-05, + "loss": 1.5313, + "step": 8587 + }, + { + "epoch": 0.2864264823185323, + "grad_norm": 1.5348492860794067, + "learning_rate": 4.297349701798505e-05, + "loss": 1.5106, + "step": 8618 + }, + { + "epoch": 0.28745679340600905, + "grad_norm": 1.4060319662094116, + "learning_rate": 4.2915021863751916e-05, + "loss": 1.5283, + "step": 8649 + }, + { + "epoch": 0.2884871044934858, + "grad_norm": 1.531657099723816, + "learning_rate": 4.285634454093198e-05, + "loss": 1.5087, + "step": 8680 + }, + { + "epoch": 0.2895174155809625, + "grad_norm": 1.4756299257278442, + "learning_rate": 4.279746571169086e-05, + "loss": 1.5042, + "step": 8711 + }, + { + "epoch": 0.29054772666843925, + "grad_norm": 1.3221153020858765, + "learning_rate": 4.2738386040468136e-05, + "loss": 1.5244, + "step": 8742 + }, + { + "epoch": 0.29157803775591595, + "grad_norm": 1.4067268371582031, + "learning_rate": 4.2679106193969866e-05, + "loss": 1.5012, + "step": 8773 + }, + { + "epoch": 0.2926083488433927, + "grad_norm": 1.5192064046859741, + "learning_rate": 4.261962684116106e-05, + "loss": 1.521, + "step": 8804 + }, + { + "epoch": 0.29363865993086946, + "grad_norm": 1.3847788572311401, + "learning_rate": 4.2559948653258145e-05, + "loss": 1.5128, + "step": 8835 + }, + { + "epoch": 0.29466897101834616, + "grad_norm": 1.4612780809402466, + "learning_rate": 4.250007230372134e-05, + "loss": 1.5371, + "step": 8866 + }, + { + "epoch": 0.2956992821058229, + "grad_norm": 1.468971610069275, + "learning_rate": 4.2439998468247126e-05, + "loss": 1.5199, + "step": 8897 + }, + { + "epoch": 0.29672959319329967, + "grad_norm": 1.386236310005188, + "learning_rate": 4.2379727824760566e-05, + "loss": 1.5273, + "step": 8928 + }, + { + "epoch": 0.29775990428077637, + "grad_norm": 1.3843929767608643, + "learning_rate": 4.231926105340768e-05, + "loss": 1.5011, + "step": 8959 + }, + { + "epoch": 0.2987902153682531, + "grad_norm": 1.4554557800292969, + "learning_rate": 4.225859883654776e-05, + "loss": 1.5311, + "step": 8990 + }, + { + "epoch": 0.2998205264557299, + "grad_norm": 1.3674421310424805, + "learning_rate": 4.219774185874569e-05, + "loss": 1.5302, + "step": 9021 + }, + { + "epoch": 0.3008508375432066, + "grad_norm": 1.3804330825805664, + "learning_rate": 4.213669080676418e-05, + "loss": 1.538, + "step": 9052 + }, + { + "epoch": 0.3018811486306833, + "grad_norm": 1.4643255472183228, + "learning_rate": 4.2075446369556056e-05, + "loss": 1.5172, + "step": 9083 + }, + { + "epoch": 0.3029114597181601, + "grad_norm": 1.3375928401947021, + "learning_rate": 4.201400923825648e-05, + "loss": 1.5123, + "step": 9114 + }, + { + "epoch": 0.3039417708056368, + "grad_norm": 1.4321980476379395, + "learning_rate": 4.195238010617511e-05, + "loss": 1.5196, + "step": 9145 + }, + { + "epoch": 0.30497208189311353, + "grad_norm": 1.4312376976013184, + "learning_rate": 4.1890559668788344e-05, + "loss": 1.5138, + "step": 9176 + }, + { + "epoch": 0.3060023929805903, + "grad_norm": 1.3089646100997925, + "learning_rate": 4.1828548623731405e-05, + "loss": 1.5027, + "step": 9207 + }, + { + "epoch": 0.307032704068067, + "grad_norm": 1.4863250255584717, + "learning_rate": 4.1766347670790506e-05, + "loss": 1.5091, + "step": 9238 + }, + { + "epoch": 0.30806301515554374, + "grad_norm": 1.373666763305664, + "learning_rate": 4.170395751189495e-05, + "loss": 1.5256, + "step": 9269 + }, + { + "epoch": 0.3090933262430205, + "grad_norm": 1.4160584211349487, + "learning_rate": 4.164137885110921e-05, + "loss": 1.4938, + "step": 9300 + }, + { + "epoch": 0.3101236373304972, + "grad_norm": 2.112110137939453, + "learning_rate": 4.157861239462495e-05, + "loss": 1.5106, + "step": 9331 + }, + { + "epoch": 0.31115394841797395, + "grad_norm": 1.337058663368225, + "learning_rate": 4.1515658850753114e-05, + "loss": 1.4999, + "step": 9362 + }, + { + "epoch": 0.3121842595054507, + "grad_norm": 1.3625296354293823, + "learning_rate": 4.145251892991588e-05, + "loss": 1.5136, + "step": 9393 + }, + { + "epoch": 0.3132145705929274, + "grad_norm": 1.399491548538208, + "learning_rate": 4.138919334463868e-05, + "loss": 1.499, + "step": 9424 + }, + { + "epoch": 0.31424488168040415, + "grad_norm": 1.4202344417572021, + "learning_rate": 4.1325682809542124e-05, + "loss": 1.5049, + "step": 9455 + }, + { + "epoch": 0.3152751927678809, + "grad_norm": 1.392248272895813, + "learning_rate": 4.126198804133398e-05, + "loss": 1.5287, + "step": 9486 + }, + { + "epoch": 0.3163055038553576, + "grad_norm": 1.3807618618011475, + "learning_rate": 4.1198109758801055e-05, + "loss": 1.5309, + "step": 9517 + }, + { + "epoch": 0.31733581494283436, + "grad_norm": 1.3117905855178833, + "learning_rate": 4.113404868280107e-05, + "loss": 1.4933, + "step": 9548 + }, + { + "epoch": 0.3183661260303111, + "grad_norm": 1.452086091041565, + "learning_rate": 4.106980553625457e-05, + "loss": 1.5221, + "step": 9579 + }, + { + "epoch": 0.3193964371177878, + "grad_norm": 1.477364182472229, + "learning_rate": 4.100538104413674e-05, + "loss": 1.4904, + "step": 9610 + }, + { + "epoch": 0.32042674820526457, + "grad_norm": 1.3584345579147339, + "learning_rate": 4.09407759334692e-05, + "loss": 1.4953, + "step": 9641 + }, + { + "epoch": 0.32145705929274127, + "grad_norm": 1.3619811534881592, + "learning_rate": 4.087599093331186e-05, + "loss": 1.4956, + "step": 9672 + }, + { + "epoch": 0.322487370380218, + "grad_norm": 1.4507052898406982, + "learning_rate": 4.081102677475462e-05, + "loss": 1.5197, + "step": 9703 + }, + { + "epoch": 0.3235176814676948, + "grad_norm": 1.4229698181152344, + "learning_rate": 4.0745884190909194e-05, + "loss": 1.498, + "step": 9734 + }, + { + "epoch": 0.32454799255517147, + "grad_norm": 1.3074679374694824, + "learning_rate": 4.0680563916900796e-05, + "loss": 1.5146, + "step": 9765 + }, + { + "epoch": 0.3255783036426482, + "grad_norm": 1.397815465927124, + "learning_rate": 4.0615066689859815e-05, + "loss": 1.5291, + "step": 9796 + }, + { + "epoch": 0.326608614730125, + "grad_norm": 1.3196336030960083, + "learning_rate": 4.0549393248913584e-05, + "loss": 1.5077, + "step": 9827 + }, + { + "epoch": 0.3276389258176017, + "grad_norm": 1.3129957914352417, + "learning_rate": 4.048354433517794e-05, + "loss": 1.4965, + "step": 9858 + }, + { + "epoch": 0.32866923690507843, + "grad_norm": 1.4380089044570923, + "learning_rate": 4.0417520691748916e-05, + "loss": 1.5115, + "step": 9889 + }, + { + "epoch": 0.3296995479925552, + "grad_norm": 1.3162370920181274, + "learning_rate": 4.035132306369438e-05, + "loss": 1.5029, + "step": 9920 + }, + { + "epoch": 0.3307298590800319, + "grad_norm": 1.3739668130874634, + "learning_rate": 4.028495219804555e-05, + "loss": 1.5083, + "step": 9951 + }, + { + "epoch": 0.33176017016750864, + "grad_norm": 1.3673723936080933, + "learning_rate": 4.021840884378864e-05, + "loss": 1.5223, + "step": 9982 + }, + { + "epoch": 0.3327904812549854, + "grad_norm": 1.3970317840576172, + "learning_rate": 4.015169375185633e-05, + "loss": 1.5003, + "step": 10013 + }, + { + "epoch": 0.3338207923424621, + "grad_norm": 1.2982394695281982, + "learning_rate": 4.0084807675119396e-05, + "loss": 1.5066, + "step": 10044 + }, + { + "epoch": 0.33485110342993885, + "grad_norm": 1.4548689126968384, + "learning_rate": 4.0017751368378106e-05, + "loss": 1.4993, + "step": 10075 + }, + { + "epoch": 0.3358814145174156, + "grad_norm": 1.3693586587905884, + "learning_rate": 3.995052558835377e-05, + "loss": 1.4987, + "step": 10106 + }, + { + "epoch": 0.3369117256048923, + "grad_norm": 1.4046767950057983, + "learning_rate": 3.988313109368017e-05, + "loss": 1.5098, + "step": 10137 + }, + { + "epoch": 0.33794203669236905, + "grad_norm": 1.3772069215774536, + "learning_rate": 3.981556864489504e-05, + "loss": 1.5165, + "step": 10168 + }, + { + "epoch": 0.3389723477798458, + "grad_norm": 1.471211314201355, + "learning_rate": 3.974783900443142e-05, + "loss": 1.5037, + "step": 10199 + }, + { + "epoch": 0.3400026588673225, + "grad_norm": 1.3990979194641113, + "learning_rate": 3.9679942936609095e-05, + "loss": 1.5096, + "step": 10230 + }, + { + "epoch": 0.34103296995479926, + "grad_norm": 1.3779234886169434, + "learning_rate": 3.961188120762596e-05, + "loss": 1.4914, + "step": 10261 + }, + { + "epoch": 0.342063281042276, + "grad_norm": 1.2866768836975098, + "learning_rate": 3.954365458554938e-05, + "loss": 1.5026, + "step": 10292 + }, + { + "epoch": 0.3430935921297527, + "grad_norm": 1.353468894958496, + "learning_rate": 3.947526384030751e-05, + "loss": 1.5063, + "step": 10323 + }, + { + "epoch": 0.34412390321722947, + "grad_norm": 1.3264256715774536, + "learning_rate": 3.9406709743680624e-05, + "loss": 1.4911, + "step": 10354 + }, + { + "epoch": 0.3451542143047062, + "grad_norm": 1.3496876955032349, + "learning_rate": 3.9337993069292366e-05, + "loss": 1.4921, + "step": 10385 + }, + { + "epoch": 0.3461845253921829, + "grad_norm": 1.3812434673309326, + "learning_rate": 3.926911459260109e-05, + "loss": 1.4826, + "step": 10416 + }, + { + "epoch": 0.34721483647965967, + "grad_norm": 1.4926965236663818, + "learning_rate": 3.920007509089102e-05, + "loss": 1.4994, + "step": 10447 + }, + { + "epoch": 0.3482451475671364, + "grad_norm": 1.3446170091629028, + "learning_rate": 3.913087534326357e-05, + "loss": 1.5114, + "step": 10478 + }, + { + "epoch": 0.3492754586546131, + "grad_norm": 1.3100495338439941, + "learning_rate": 3.9061516130628475e-05, + "loss": 1.5066, + "step": 10509 + }, + { + "epoch": 0.3503057697420899, + "grad_norm": 1.395874261856079, + "learning_rate": 3.8991998235695025e-05, + "loss": 1.4999, + "step": 10540 + }, + { + "epoch": 0.3513360808295666, + "grad_norm": 1.3682137727737427, + "learning_rate": 3.8922322442963224e-05, + "loss": 1.4778, + "step": 10571 + }, + { + "epoch": 0.35236639191704333, + "grad_norm": 1.4196573495864868, + "learning_rate": 3.885248953871491e-05, + "loss": 1.4909, + "step": 10602 + }, + { + "epoch": 0.3533967030045201, + "grad_norm": 1.4299864768981934, + "learning_rate": 3.8782500311004915e-05, + "loss": 1.5025, + "step": 10633 + }, + { + "epoch": 0.3544270140919968, + "grad_norm": 1.39677095413208, + "learning_rate": 3.871235554965218e-05, + "loss": 1.4932, + "step": 10664 + }, + { + "epoch": 0.35545732517947354, + "grad_norm": 1.3219736814498901, + "learning_rate": 3.864205604623078e-05, + "loss": 1.4795, + "step": 10695 + }, + { + "epoch": 0.3564876362669503, + "grad_norm": 1.3649324178695679, + "learning_rate": 3.857160259406107e-05, + "loss": 1.4838, + "step": 10726 + }, + { + "epoch": 0.357517947354427, + "grad_norm": 1.4109989404678345, + "learning_rate": 3.8500995988200674e-05, + "loss": 1.5058, + "step": 10757 + }, + { + "epoch": 0.35854825844190374, + "grad_norm": 1.3625038862228394, + "learning_rate": 3.843023702543556e-05, + "loss": 1.4912, + "step": 10788 + }, + { + "epoch": 0.3595785695293805, + "grad_norm": 1.4725775718688965, + "learning_rate": 3.8359326504270984e-05, + "loss": 1.5012, + "step": 10819 + }, + { + "epoch": 0.3606088806168572, + "grad_norm": 1.4126085042953491, + "learning_rate": 3.828826522492255e-05, + "loss": 1.4977, + "step": 10850 + }, + { + "epoch": 0.36163919170433395, + "grad_norm": 1.3949086666107178, + "learning_rate": 3.821705398930713e-05, + "loss": 1.4903, + "step": 10881 + }, + { + "epoch": 0.3626695027918107, + "grad_norm": 1.286792516708374, + "learning_rate": 3.814569360103385e-05, + "loss": 1.5067, + "step": 10912 + }, + { + "epoch": 0.3636998138792874, + "grad_norm": 1.274703025817871, + "learning_rate": 3.807418486539499e-05, + "loss": 1.4583, + "step": 10943 + }, + { + "epoch": 0.36473012496676416, + "grad_norm": 1.401455283164978, + "learning_rate": 3.80025285893569e-05, + "loss": 1.4834, + "step": 10974 + }, + { + "epoch": 0.3657604360542409, + "grad_norm": 1.308361530303955, + "learning_rate": 3.793072558155093e-05, + "loss": 1.4832, + "step": 11005 + }, + { + "epoch": 0.3667907471417176, + "grad_norm": 1.654733419418335, + "learning_rate": 3.785877665226426e-05, + "loss": 1.4867, + "step": 11036 + }, + { + "epoch": 0.36782105822919436, + "grad_norm": 1.3530856370925903, + "learning_rate": 3.778668261343079e-05, + "loss": 1.4873, + "step": 11067 + }, + { + "epoch": 0.3688513693166711, + "grad_norm": 1.3567407131195068, + "learning_rate": 3.771444427862192e-05, + "loss": 1.4935, + "step": 11098 + }, + { + "epoch": 0.3698816804041478, + "grad_norm": 1.3184572458267212, + "learning_rate": 3.7642062463037465e-05, + "loss": 1.4891, + "step": 11129 + }, + { + "epoch": 0.37091199149162457, + "grad_norm": 1.366489291191101, + "learning_rate": 3.7569537983496373e-05, + "loss": 1.5159, + "step": 11160 + }, + { + "epoch": 0.3719423025791013, + "grad_norm": 1.423258662223816, + "learning_rate": 3.749687165842753e-05, + "loss": 1.4938, + "step": 11191 + }, + { + "epoch": 0.372972613666578, + "grad_norm": 1.3226194381713867, + "learning_rate": 3.7424064307860536e-05, + "loss": 1.499, + "step": 11222 + }, + { + "epoch": 0.3740029247540548, + "grad_norm": 1.350500464439392, + "learning_rate": 3.735111675341645e-05, + "loss": 1.4952, + "step": 11253 + }, + { + "epoch": 0.37503323584153153, + "grad_norm": 1.3667839765548706, + "learning_rate": 3.7278029818298524e-05, + "loss": 1.4763, + "step": 11284 + }, + { + "epoch": 0.37606354692900823, + "grad_norm": 1.4876132011413574, + "learning_rate": 3.720480432728287e-05, + "loss": 1.4913, + "step": 11315 + }, + { + "epoch": 0.377093858016485, + "grad_norm": 1.3927743434906006, + "learning_rate": 3.71314411067092e-05, + "loss": 1.4948, + "step": 11346 + }, + { + "epoch": 0.37812416910396174, + "grad_norm": 1.3752413988113403, + "learning_rate": 3.70579409844715e-05, + "loss": 1.4763, + "step": 11377 + }, + { + "epoch": 0.37915448019143844, + "grad_norm": 1.3530951738357544, + "learning_rate": 3.698430479000865e-05, + "loss": 1.5077, + "step": 11408 + }, + { + "epoch": 0.3801847912789152, + "grad_norm": 1.4309345483779907, + "learning_rate": 3.691053335429509e-05, + "loss": 1.4945, + "step": 11439 + }, + { + "epoch": 0.38121510236639194, + "grad_norm": 1.2874380350112915, + "learning_rate": 3.683662750983147e-05, + "loss": 1.4698, + "step": 11470 + }, + { + "epoch": 0.38224541345386864, + "grad_norm": 1.3356250524520874, + "learning_rate": 3.676258809063518e-05, + "loss": 1.4924, + "step": 11501 + }, + { + "epoch": 0.3832757245413454, + "grad_norm": 1.304559588432312, + "learning_rate": 3.6688415932231004e-05, + "loss": 1.4682, + "step": 11532 + }, + { + "epoch": 0.3843060356288221, + "grad_norm": 1.4153447151184082, + "learning_rate": 3.661411187164166e-05, + "loss": 1.4989, + "step": 11563 + }, + { + "epoch": 0.38533634671629885, + "grad_norm": 1.356992244720459, + "learning_rate": 3.65396767473784e-05, + "loss": 1.4854, + "step": 11594 + }, + { + "epoch": 0.3863666578037756, + "grad_norm": 1.322449803352356, + "learning_rate": 3.6465111399431465e-05, + "loss": 1.4877, + "step": 11625 + }, + { + "epoch": 0.3873969688912523, + "grad_norm": 1.3981350660324097, + "learning_rate": 3.6390416669260674e-05, + "loss": 1.499, + "step": 11656 + }, + { + "epoch": 0.38842727997872906, + "grad_norm": 1.324871301651001, + "learning_rate": 3.63155933997859e-05, + "loss": 1.4814, + "step": 11687 + }, + { + "epoch": 0.3894575910662058, + "grad_norm": 1.3940790891647339, + "learning_rate": 3.624064243537758e-05, + "loss": 1.4754, + "step": 11718 + }, + { + "epoch": 0.3904879021536825, + "grad_norm": 1.2880780696868896, + "learning_rate": 3.616556462184716e-05, + "loss": 1.4832, + "step": 11749 + }, + { + "epoch": 0.39151821324115926, + "grad_norm": 1.315329670906067, + "learning_rate": 3.609036080643755e-05, + "loss": 1.4853, + "step": 11780 + }, + { + "epoch": 0.392548524328636, + "grad_norm": 1.4093523025512695, + "learning_rate": 3.60150318378136e-05, + "loss": 1.4978, + "step": 11811 + }, + { + "epoch": 0.3935788354161127, + "grad_norm": 1.271151065826416, + "learning_rate": 3.5939578566052465e-05, + "loss": 1.4933, + "step": 11842 + }, + { + "epoch": 0.39460914650358947, + "grad_norm": 1.2910923957824707, + "learning_rate": 3.586400184263408e-05, + "loss": 1.4853, + "step": 11873 + }, + { + "epoch": 0.3956394575910662, + "grad_norm": 1.2480064630508423, + "learning_rate": 3.578830252043148e-05, + "loss": 1.4642, + "step": 11904 + }, + { + "epoch": 0.3966697686785429, + "grad_norm": 1.263197422027588, + "learning_rate": 3.571248145370125e-05, + "loss": 1.4812, + "step": 11935 + }, + { + "epoch": 0.3977000797660197, + "grad_norm": 1.3231288194656372, + "learning_rate": 3.5636539498073794e-05, + "loss": 1.4744, + "step": 11966 + }, + { + "epoch": 0.39873039085349643, + "grad_norm": 1.3933110237121582, + "learning_rate": 3.556047751054378e-05, + "loss": 1.4849, + "step": 11997 + }, + { + "epoch": 0.39976070194097313, + "grad_norm": 1.3615801334381104, + "learning_rate": 3.548429634946039e-05, + "loss": 1.4866, + "step": 12028 + }, + { + "epoch": 0.4007910130284499, + "grad_norm": 1.298638939857483, + "learning_rate": 3.540799687451768e-05, + "loss": 1.4664, + "step": 12059 + }, + { + "epoch": 0.40182132411592664, + "grad_norm": 1.29216468334198, + "learning_rate": 3.533157994674485e-05, + "loss": 1.4697, + "step": 12090 + }, + { + "epoch": 0.40285163520340334, + "grad_norm": 1.3759845495224, + "learning_rate": 3.5255046428496546e-05, + "loss": 1.4854, + "step": 12121 + }, + { + "epoch": 0.4038819462908801, + "grad_norm": 1.4045615196228027, + "learning_rate": 3.517839718344311e-05, + "loss": 1.4622, + "step": 12152 + }, + { + "epoch": 0.40491225737835684, + "grad_norm": 1.2979034185409546, + "learning_rate": 3.510163307656086e-05, + "loss": 1.4797, + "step": 12183 + }, + { + "epoch": 0.40594256846583354, + "grad_norm": 1.303139567375183, + "learning_rate": 3.5024754974122324e-05, + "loss": 1.4588, + "step": 12214 + }, + { + "epoch": 0.4069728795533103, + "grad_norm": 1.287781834602356, + "learning_rate": 3.494776374368643e-05, + "loss": 1.4834, + "step": 12245 + }, + { + "epoch": 0.40800319064078705, + "grad_norm": 1.3806688785552979, + "learning_rate": 3.4870660254088724e-05, + "loss": 1.4807, + "step": 12276 + }, + { + "epoch": 0.40903350172826375, + "grad_norm": 1.4059745073318481, + "learning_rate": 3.479344537543164e-05, + "loss": 1.4906, + "step": 12307 + }, + { + "epoch": 0.4100638128157405, + "grad_norm": 1.3052942752838135, + "learning_rate": 3.4716119979074565e-05, + "loss": 1.4801, + "step": 12338 + }, + { + "epoch": 0.41109412390321726, + "grad_norm": 1.3306844234466553, + "learning_rate": 3.463868493762412e-05, + "loss": 1.4911, + "step": 12369 + }, + { + "epoch": 0.41212443499069396, + "grad_norm": 1.3276656866073608, + "learning_rate": 3.456114112492418e-05, + "loss": 1.4678, + "step": 12400 + }, + { + "epoch": 0.4131547460781707, + "grad_norm": 1.3164253234863281, + "learning_rate": 3.4483489416046164e-05, + "loss": 1.4816, + "step": 12431 + }, + { + "epoch": 0.4141850571656474, + "grad_norm": 1.3827886581420898, + "learning_rate": 3.440573068727905e-05, + "loss": 1.481, + "step": 12462 + }, + { + "epoch": 0.41521536825312416, + "grad_norm": 1.2899463176727295, + "learning_rate": 3.4327865816119495e-05, + "loss": 1.4575, + "step": 12493 + }, + { + "epoch": 0.4162456793406009, + "grad_norm": 1.3136677742004395, + "learning_rate": 3.4249895681262025e-05, + "loss": 1.4695, + "step": 12524 + }, + { + "epoch": 0.4172759904280776, + "grad_norm": 1.2920372486114502, + "learning_rate": 3.417182116258899e-05, + "loss": 1.4765, + "step": 12555 + }, + { + "epoch": 0.41830630151555437, + "grad_norm": 1.3285510540008545, + "learning_rate": 3.409364314116074e-05, + "loss": 1.4559, + "step": 12586 + }, + { + "epoch": 0.4193366126030311, + "grad_norm": 1.2834984064102173, + "learning_rate": 3.401536249920559e-05, + "loss": 1.4706, + "step": 12617 + }, + { + "epoch": 0.4203669236905078, + "grad_norm": 1.315942645072937, + "learning_rate": 3.393698012010998e-05, + "loss": 1.4692, + "step": 12648 + }, + { + "epoch": 0.4213972347779846, + "grad_norm": 1.3668091297149658, + "learning_rate": 3.385849688840839e-05, + "loss": 1.4801, + "step": 12679 + }, + { + "epoch": 0.42242754586546133, + "grad_norm": 1.312280297279358, + "learning_rate": 3.3779913689773414e-05, + "loss": 1.4673, + "step": 12710 + }, + { + "epoch": 0.423457856952938, + "grad_norm": 1.3579858541488647, + "learning_rate": 3.370123141100578e-05, + "loss": 1.4578, + "step": 12741 + }, + { + "epoch": 0.4244881680404148, + "grad_norm": 1.4001456499099731, + "learning_rate": 3.3622450940024305e-05, + "loss": 1.4787, + "step": 12772 + }, + { + "epoch": 0.42551847912789154, + "grad_norm": 1.352629542350769, + "learning_rate": 3.35435731658559e-05, + "loss": 1.457, + "step": 12803 + }, + { + "epoch": 0.42654879021536823, + "grad_norm": 1.4044222831726074, + "learning_rate": 3.346459897862552e-05, + "loss": 1.4979, + "step": 12834 + }, + { + "epoch": 0.427579101302845, + "grad_norm": 1.2666436433792114, + "learning_rate": 3.338552926954613e-05, + "loss": 1.4712, + "step": 12865 + }, + { + "epoch": 0.42860941239032174, + "grad_norm": 1.2487694025039673, + "learning_rate": 3.330636493090868e-05, + "loss": 1.4784, + "step": 12896 + }, + { + "epoch": 0.42963972347779844, + "grad_norm": 1.2346290349960327, + "learning_rate": 3.322710685607193e-05, + "loss": 1.4754, + "step": 12927 + }, + { + "epoch": 0.4306700345652752, + "grad_norm": 1.2908893823623657, + "learning_rate": 3.314775593945251e-05, + "loss": 1.4677, + "step": 12958 + }, + { + "epoch": 0.43170034565275195, + "grad_norm": 1.3283506631851196, + "learning_rate": 3.3068313076514714e-05, + "loss": 1.4661, + "step": 12989 + }, + { + "epoch": 0.43273065674022865, + "grad_norm": 1.2982537746429443, + "learning_rate": 3.298877916376047e-05, + "loss": 1.4838, + "step": 13020 + }, + { + "epoch": 0.4337609678277054, + "grad_norm": 1.3566454648971558, + "learning_rate": 3.290915509871915e-05, + "loss": 1.4683, + "step": 13051 + }, + { + "epoch": 0.43479127891518216, + "grad_norm": 1.3470877408981323, + "learning_rate": 3.282944177993753e-05, + "loss": 1.4724, + "step": 13082 + }, + { + "epoch": 0.43582159000265885, + "grad_norm": 1.451150894165039, + "learning_rate": 3.274964010696957e-05, + "loss": 1.4731, + "step": 13113 + }, + { + "epoch": 0.4368519010901356, + "grad_norm": 1.3415958881378174, + "learning_rate": 3.266975098036629e-05, + "loss": 1.4809, + "step": 13144 + }, + { + "epoch": 0.43788221217761236, + "grad_norm": 1.2775352001190186, + "learning_rate": 3.258977530166562e-05, + "loss": 1.4523, + "step": 13175 + }, + { + "epoch": 0.43891252326508906, + "grad_norm": 1.365050196647644, + "learning_rate": 3.250971397338227e-05, + "loss": 1.4611, + "step": 13206 + }, + { + "epoch": 0.4399428343525658, + "grad_norm": 1.3481686115264893, + "learning_rate": 3.2429567898997404e-05, + "loss": 1.4708, + "step": 13237 + }, + { + "epoch": 0.44097314544004257, + "grad_norm": 1.3418121337890625, + "learning_rate": 3.234933798294859e-05, + "loss": 1.485, + "step": 13268 + }, + { + "epoch": 0.44200345652751927, + "grad_norm": 1.3098441362380981, + "learning_rate": 3.2269025130619535e-05, + "loss": 1.472, + "step": 13299 + }, + { + "epoch": 0.443033767614996, + "grad_norm": 1.2792437076568604, + "learning_rate": 3.218863024832985e-05, + "loss": 1.4592, + "step": 13330 + }, + { + "epoch": 0.4440640787024727, + "grad_norm": 1.3804035186767578, + "learning_rate": 3.2108154243324864e-05, + "loss": 1.4546, + "step": 13361 + }, + { + "epoch": 0.4450943897899495, + "grad_norm": 1.287787675857544, + "learning_rate": 3.2027598023765345e-05, + "loss": 1.4477, + "step": 13392 + }, + { + "epoch": 0.44612470087742623, + "grad_norm": 1.5964646339416504, + "learning_rate": 3.194696249871729e-05, + "loss": 1.4468, + "step": 13423 + }, + { + "epoch": 0.4471550119649029, + "grad_norm": 1.3253474235534668, + "learning_rate": 3.186624857814164e-05, + "loss": 1.4588, + "step": 13454 + }, + { + "epoch": 0.4481853230523797, + "grad_norm": 1.288176417350769, + "learning_rate": 3.178545717288401e-05, + "loss": 1.4644, + "step": 13485 + }, + { + "epoch": 0.44921563413985643, + "grad_norm": 1.3357142210006714, + "learning_rate": 3.170458919466444e-05, + "loss": 1.4871, + "step": 13516 + }, + { + "epoch": 0.45024594522733313, + "grad_norm": 1.2954436540603638, + "learning_rate": 3.1623645556067063e-05, + "loss": 1.4571, + "step": 13547 + }, + { + "epoch": 0.4512762563148099, + "grad_norm": 1.344789981842041, + "learning_rate": 3.154262717052985e-05, + "loss": 1.459, + "step": 13578 + }, + { + "epoch": 0.45230656740228664, + "grad_norm": 1.2648475170135498, + "learning_rate": 3.146153495233426e-05, + "loss": 1.4496, + "step": 13609 + }, + { + "epoch": 0.45333687848976334, + "grad_norm": 1.312733769416809, + "learning_rate": 3.1380369816594944e-05, + "loss": 1.4309, + "step": 13640 + }, + { + "epoch": 0.4543671895772401, + "grad_norm": 1.3719325065612793, + "learning_rate": 3.129913267924946e-05, + "loss": 1.4723, + "step": 13671 + }, + { + "epoch": 0.45539750066471685, + "grad_norm": 1.2850617170333862, + "learning_rate": 3.121782445704782e-05, + "loss": 1.4599, + "step": 13702 + }, + { + "epoch": 0.45642781175219355, + "grad_norm": 1.3335177898406982, + "learning_rate": 3.11364460675423e-05, + "loss": 1.4821, + "step": 13733 + }, + { + "epoch": 0.4574581228396703, + "grad_norm": 1.1675069332122803, + "learning_rate": 3.1054998429076934e-05, + "loss": 1.453, + "step": 13764 + }, + { + "epoch": 0.45848843392714705, + "grad_norm": 1.283544898033142, + "learning_rate": 3.097348246077728e-05, + "loss": 1.4545, + "step": 13795 + }, + { + "epoch": 0.45951874501462375, + "grad_norm": 1.4358693361282349, + "learning_rate": 3.0891899082539924e-05, + "loss": 1.4673, + "step": 13826 + }, + { + "epoch": 0.4605490561021005, + "grad_norm": 1.2551497220993042, + "learning_rate": 3.0810249215022233e-05, + "loss": 1.4532, + "step": 13857 + }, + { + "epoch": 0.46157936718957726, + "grad_norm": 1.2574602365493774, + "learning_rate": 3.0728533779631865e-05, + "loss": 1.4762, + "step": 13888 + }, + { + "epoch": 0.46260967827705396, + "grad_norm": 1.2202764749526978, + "learning_rate": 3.064675369851637e-05, + "loss": 1.4461, + "step": 13919 + }, + { + "epoch": 0.4636399893645307, + "grad_norm": 1.2787501811981201, + "learning_rate": 3.056490989455289e-05, + "loss": 1.4607, + "step": 13950 + }, + { + "epoch": 0.46467030045200747, + "grad_norm": 1.2511006593704224, + "learning_rate": 3.0483003291337596e-05, + "loss": 1.4548, + "step": 13981 + }, + { + "epoch": 0.46570061153948417, + "grad_norm": 1.2749834060668945, + "learning_rate": 3.040103481317539e-05, + "loss": 1.4394, + "step": 14012 + }, + { + "epoch": 0.4667309226269609, + "grad_norm": 1.223057746887207, + "learning_rate": 3.03190053850694e-05, + "loss": 1.4684, + "step": 14043 + }, + { + "epoch": 0.4677612337144377, + "grad_norm": 1.39846932888031, + "learning_rate": 3.0236915932710573e-05, + "loss": 1.4657, + "step": 14074 + }, + { + "epoch": 0.4687915448019144, + "grad_norm": 1.5305665731430054, + "learning_rate": 3.0154767382467232e-05, + "loss": 1.4795, + "step": 14105 + }, + { + "epoch": 0.4698218558893911, + "grad_norm": 1.2569035291671753, + "learning_rate": 3.0072560661374582e-05, + "loss": 1.4756, + "step": 14136 + }, + { + "epoch": 0.4708521669768679, + "grad_norm": 1.3472824096679688, + "learning_rate": 2.999029669712431e-05, + "loss": 1.4682, + "step": 14167 + }, + { + "epoch": 0.4718824780643446, + "grad_norm": 1.271714210510254, + "learning_rate": 2.990797641805408e-05, + "loss": 1.4509, + "step": 14198 + }, + { + "epoch": 0.47291278915182133, + "grad_norm": 1.3342047929763794, + "learning_rate": 2.982560075313704e-05, + "loss": 1.4528, + "step": 14229 + }, + { + "epoch": 0.47394310023929803, + "grad_norm": 1.5821506977081299, + "learning_rate": 2.9743170631971368e-05, + "loss": 1.4609, + "step": 14260 + }, + { + "epoch": 0.4749734113267748, + "grad_norm": 1.2598062753677368, + "learning_rate": 2.9660686984769792e-05, + "loss": 1.471, + "step": 14291 + }, + { + "epoch": 0.47600372241425154, + "grad_norm": 1.2648885250091553, + "learning_rate": 2.9578150742349047e-05, + "loss": 1.4708, + "step": 14322 + }, + { + "epoch": 0.47703403350172824, + "grad_norm": 1.559665560722351, + "learning_rate": 2.949556283611942e-05, + "loss": 1.4516, + "step": 14353 + }, + { + "epoch": 0.478064344589205, + "grad_norm": 1.2621581554412842, + "learning_rate": 2.9412924198074206e-05, + "loss": 1.446, + "step": 14384 + }, + { + "epoch": 0.47909465567668175, + "grad_norm": 1.2775017023086548, + "learning_rate": 2.9330235760779208e-05, + "loss": 1.4496, + "step": 14415 + }, + { + "epoch": 0.48012496676415845, + "grad_norm": 1.2010388374328613, + "learning_rate": 2.9247498457362188e-05, + "loss": 1.4606, + "step": 14446 + }, + { + "epoch": 0.4811552778516352, + "grad_norm": 1.3053895235061646, + "learning_rate": 2.9164713221502373e-05, + "loss": 1.4536, + "step": 14477 + }, + { + "epoch": 0.48218558893911195, + "grad_norm": 1.311596155166626, + "learning_rate": 2.9081880987419912e-05, + "loss": 1.4409, + "step": 14508 + }, + { + "epoch": 0.48321590002658865, + "grad_norm": 1.3888933658599854, + "learning_rate": 2.8999002689865296e-05, + "loss": 1.4314, + "step": 14539 + }, + { + "epoch": 0.4842462111140654, + "grad_norm": 1.288619875907898, + "learning_rate": 2.8916079264108852e-05, + "loss": 1.4539, + "step": 14570 + }, + { + "epoch": 0.48527652220154216, + "grad_norm": 1.2974294424057007, + "learning_rate": 2.883311164593017e-05, + "loss": 1.4627, + "step": 14601 + }, + { + "epoch": 0.48630683328901886, + "grad_norm": 1.2057379484176636, + "learning_rate": 2.875010077160754e-05, + "loss": 1.4578, + "step": 14632 + }, + { + "epoch": 0.4873371443764956, + "grad_norm": 1.363971471786499, + "learning_rate": 2.866704757790741e-05, + "loss": 1.4671, + "step": 14663 + }, + { + "epoch": 0.48836745546397237, + "grad_norm": 1.2696925401687622, + "learning_rate": 2.858395300207376e-05, + "loss": 1.4333, + "step": 14694 + }, + { + "epoch": 0.48939776655144906, + "grad_norm": 1.2653478384017944, + "learning_rate": 2.8500817981817607e-05, + "loss": 1.4662, + "step": 14725 + }, + { + "epoch": 0.4904280776389258, + "grad_norm": 1.3011239767074585, + "learning_rate": 2.8417643455306336e-05, + "loss": 1.4589, + "step": 14756 + }, + { + "epoch": 0.4914583887264026, + "grad_norm": 1.3312432765960693, + "learning_rate": 2.8334430361153185e-05, + "loss": 1.4368, + "step": 14787 + }, + { + "epoch": 0.49248869981387927, + "grad_norm": 1.3015661239624023, + "learning_rate": 2.8251179638406612e-05, + "loss": 1.466, + "step": 14818 + }, + { + "epoch": 0.493519010901356, + "grad_norm": 1.3215759992599487, + "learning_rate": 2.8167892226539704e-05, + "loss": 1.4486, + "step": 14849 + }, + { + "epoch": 0.4945493219888328, + "grad_norm": 1.2909883260726929, + "learning_rate": 2.8084569065439588e-05, + "loss": 1.4433, + "step": 14880 + }, + { + "epoch": 0.4955796330763095, + "grad_norm": 1.364015817642212, + "learning_rate": 2.8001211095396807e-05, + "loss": 1.4449, + "step": 14911 + }, + { + "epoch": 0.49660994416378623, + "grad_norm": 1.2468819618225098, + "learning_rate": 2.791781925709473e-05, + "loss": 1.4572, + "step": 14942 + }, + { + "epoch": 0.497640255251263, + "grad_norm": 1.2739325761795044, + "learning_rate": 2.7834394491598908e-05, + "loss": 1.4478, + "step": 14973 + }, + { + "epoch": 0.4986705663387397, + "grad_norm": 1.3384937047958374, + "learning_rate": 2.7750937740346485e-05, + "loss": 1.4429, + "step": 15004 + }, + { + "epoch": 0.49970087742621644, + "grad_norm": 1.231088399887085, + "learning_rate": 2.7667449945135564e-05, + "loss": 1.4631, + "step": 15035 + }, + { + "epoch": 0.5007311885136931, + "grad_norm": 1.2262307405471802, + "learning_rate": 2.7583932048114557e-05, + "loss": 1.4508, + "step": 15066 + }, + { + "epoch": 0.5017614996011699, + "grad_norm": 1.3427774906158447, + "learning_rate": 2.7500384991771587e-05, + "loss": 1.4441, + "step": 15097 + }, + { + "epoch": 0.5027918106886466, + "grad_norm": 1.2950241565704346, + "learning_rate": 2.7416809718923825e-05, + "loss": 1.4427, + "step": 15128 + }, + { + "epoch": 0.5038221217761234, + "grad_norm": 1.4129016399383545, + "learning_rate": 2.7333207172706864e-05, + "loss": 1.4562, + "step": 15159 + }, + { + "epoch": 0.5048524328636002, + "grad_norm": 1.2751520872116089, + "learning_rate": 2.7249578296564088e-05, + "loss": 1.4517, + "step": 15190 + }, + { + "epoch": 0.5058827439510768, + "grad_norm": 1.302485466003418, + "learning_rate": 2.7165924034235973e-05, + "loss": 1.4327, + "step": 15221 + }, + { + "epoch": 0.5069130550385536, + "grad_norm": 1.295390009880066, + "learning_rate": 2.708224532974953e-05, + "loss": 1.4455, + "step": 15252 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1319725270111355e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-15260/training_args.bin b/checkpoint-15260/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..974208468b82a3c5684aaa384776477cf21c18ca --- /dev/null +++ b/checkpoint-15260/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a23be0ff07d6d3142f7c0980f91dddba845519c24fcb411cbb4b9ddb1513ff +size 5304 diff --git a/checkpoint-18312/config.json b/checkpoint-18312/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aaa74176892d42e1c7f5979b7ddf8ab15985d3 --- /dev/null +++ b/checkpoint-18312/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "/mnt/parscratch/users/acp23ay/private/models/Llama-3.1-8B-Instruct-ta-madlad-mean/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 138256 +} diff --git a/checkpoint-18312/generation_config.json b/checkpoint-18312/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-18312/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-18312/model-00001-of-00007.safetensors b/checkpoint-18312/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9366ba8c07db45284bb7c25c7ff0ae7971612208 --- /dev/null +++ b/checkpoint-18312/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:176c8b34bed0fe83d7de45fa0883e11f12556007adaec5461125663407ad56e9 +size 4983197184 diff --git a/checkpoint-18312/model-00002-of-00007.safetensors b/checkpoint-18312/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d32d83cb96a109af411b9cf577e7fbfe07ea76fc --- /dev/null +++ b/checkpoint-18312/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168629c67732309d49f60a5ec48a6d160212bc987365e82e183dfbf74ba0c1f3 +size 4899116432 diff --git a/checkpoint-18312/model-00003-of-00007.safetensors b/checkpoint-18312/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-18312/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-18312/model-00004-of-00007.safetensors b/checkpoint-18312/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-18312/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-18312/model-00005-of-00007.safetensors b/checkpoint-18312/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-18312/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-18312/model-00006-of-00007.safetensors b/checkpoint-18312/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ffa6e219d95c37fae98cb85314195e227fb97b1e --- /dev/null +++ b/checkpoint-18312/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fca2d0a06f00b713aa3dad380d1fec95db9cd12c94b4ab55eaf000762605a13f +size 4999813120 diff --git a/checkpoint-18312/model-00007-of-00007.safetensors b/checkpoint-18312/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6ba8a9a7df574b4572b578b0da2534c9a7ae7464 --- /dev/null +++ b/checkpoint-18312/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9be579eb0b874544937eb7b61367b399fdf570f8845569d7e909a3d9d90b5922 +size 2734998184 diff --git a/checkpoint-18312/model.safetensors.index.json b/checkpoint-18312/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..318803c6a3dd771c7f7c3b8038a896af7c8322ae --- /dev/null +++ b/checkpoint-18312/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32448724992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-18312/optimizer.pt b/checkpoint-18312/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c01fd80067214a9f4347554ee04eb7abeb6eae3 --- /dev/null +++ b/checkpoint-18312/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b9f31e5915c8018416928425cb14962b8311870cc229d12289dfdf38b52cb27 +size 16040396334 diff --git a/checkpoint-18312/rng_state.pth b/checkpoint-18312/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-18312/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-18312/scheduler.pt b/checkpoint-18312/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..59a8b46d1ac64fc3cd4c673b6051786fee3ed26d --- /dev/null +++ b/checkpoint-18312/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0e65c3d6f29e706fd941a38280ce5628189a6998eac6d29abbeab00ad838d00 +size 1064 diff --git a/checkpoint-18312/trainer_state.json b/checkpoint-18312/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b4a2d68fccb63e9e2410e243bf1a609a8e23ac7a --- /dev/null +++ b/checkpoint-18312/trainer_state.json @@ -0,0 +1,4163 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6086147301249668, + "eval_steps": 500, + "global_step": 18312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001030311087476735, + "grad_norm": 60.25547409057617, + "learning_rate": 1.0157273918741808e-06, + "loss": 8.8455, + "step": 31 + }, + { + "epoch": 0.00206062217495347, + "grad_norm": 15.669363975524902, + "learning_rate": 2.0314547837483616e-06, + "loss": 7.1553, + "step": 62 + }, + { + "epoch": 0.003090933262430205, + "grad_norm": 15.366345405578613, + "learning_rate": 3.0471821756225426e-06, + "loss": 5.8784, + "step": 93 + }, + { + "epoch": 0.00412124434990694, + "grad_norm": 36.30561828613281, + "learning_rate": 4.062909567496723e-06, + "loss": 4.7708, + "step": 124 + }, + { + "epoch": 0.005151555437383675, + "grad_norm": 27.202678680419922, + "learning_rate": 5.078636959370905e-06, + "loss": 4.1629, + "step": 155 + }, + { + "epoch": 0.00618186652486041, + "grad_norm": 24.30484962463379, + "learning_rate": 6.094364351245085e-06, + "loss": 3.867, + "step": 186 + }, + { + "epoch": 0.007212177612337145, + "grad_norm": 19.916366577148438, + "learning_rate": 7.110091743119267e-06, + "loss": 3.6131, + "step": 217 + }, + { + "epoch": 0.00824248869981388, + "grad_norm": 17.577274322509766, + "learning_rate": 8.125819134993446e-06, + "loss": 3.4772, + "step": 248 + }, + { + "epoch": 0.009272799787290615, + "grad_norm": 12.133190155029297, + "learning_rate": 9.141546526867629e-06, + "loss": 3.3218, + "step": 279 + }, + { + "epoch": 0.01030311087476735, + "grad_norm": 19.79263687133789, + "learning_rate": 1.015727391874181e-05, + "loss": 3.2055, + "step": 310 + }, + { + "epoch": 0.011333421962244085, + "grad_norm": 16.38133430480957, + "learning_rate": 1.117300131061599e-05, + "loss": 3.1062, + "step": 341 + }, + { + "epoch": 0.01236373304972082, + "grad_norm": 12.638299942016602, + "learning_rate": 1.218872870249017e-05, + "loss": 3.0106, + "step": 372 + }, + { + "epoch": 0.013394044137197554, + "grad_norm": 9.46596908569336, + "learning_rate": 1.3204456094364351e-05, + "loss": 2.924, + "step": 403 + }, + { + "epoch": 0.01442435522467429, + "grad_norm": 10.945392608642578, + "learning_rate": 1.4220183486238533e-05, + "loss": 2.844, + "step": 434 + }, + { + "epoch": 0.015454666312151024, + "grad_norm": 8.474015235900879, + "learning_rate": 1.5235910878112714e-05, + "loss": 2.7892, + "step": 465 + }, + { + "epoch": 0.01648497739962776, + "grad_norm": 9.370804786682129, + "learning_rate": 1.6251638269986893e-05, + "loss": 2.7509, + "step": 496 + }, + { + "epoch": 0.017515288487104493, + "grad_norm": 11.63398551940918, + "learning_rate": 1.7267365661861077e-05, + "loss": 2.6999, + "step": 527 + }, + { + "epoch": 0.01854559957458123, + "grad_norm": 9.17713451385498, + "learning_rate": 1.8283093053735257e-05, + "loss": 2.6459, + "step": 558 + }, + { + "epoch": 0.019575910662057962, + "grad_norm": 7.119054794311523, + "learning_rate": 1.9298820445609438e-05, + "loss": 2.603, + "step": 589 + }, + { + "epoch": 0.0206062217495347, + "grad_norm": 6.653646945953369, + "learning_rate": 2.031454783748362e-05, + "loss": 2.5588, + "step": 620 + }, + { + "epoch": 0.021636532837011432, + "grad_norm": 8.332653045654297, + "learning_rate": 2.13302752293578e-05, + "loss": 2.5357, + "step": 651 + }, + { + "epoch": 0.02266684392448817, + "grad_norm": 6.4949116706848145, + "learning_rate": 2.234600262123198e-05, + "loss": 2.4967, + "step": 682 + }, + { + "epoch": 0.023697155011964902, + "grad_norm": 9.41009521484375, + "learning_rate": 2.336173001310616e-05, + "loss": 2.4563, + "step": 713 + }, + { + "epoch": 0.02472746609944164, + "grad_norm": 7.840345859527588, + "learning_rate": 2.437745740498034e-05, + "loss": 2.4383, + "step": 744 + }, + { + "epoch": 0.025757777186918372, + "grad_norm": 6.116458415985107, + "learning_rate": 2.5393184796854525e-05, + "loss": 2.3817, + "step": 775 + }, + { + "epoch": 0.02678808827439511, + "grad_norm": 5.938300609588623, + "learning_rate": 2.6408912188728702e-05, + "loss": 2.3508, + "step": 806 + }, + { + "epoch": 0.027818399361871842, + "grad_norm": 5.4408345222473145, + "learning_rate": 2.7424639580602886e-05, + "loss": 2.3325, + "step": 837 + }, + { + "epoch": 0.02884871044934858, + "grad_norm": 5.375136375427246, + "learning_rate": 2.8440366972477066e-05, + "loss": 2.3101, + "step": 868 + }, + { + "epoch": 0.029879021536825312, + "grad_norm": 5.149726867675781, + "learning_rate": 2.9456094364351244e-05, + "loss": 2.282, + "step": 899 + }, + { + "epoch": 0.03090933262430205, + "grad_norm": 4.591221332550049, + "learning_rate": 3.0471821756225428e-05, + "loss": 2.2427, + "step": 930 + }, + { + "epoch": 0.031939643711778785, + "grad_norm": 4.977034091949463, + "learning_rate": 3.148754914809961e-05, + "loss": 2.2218, + "step": 961 + }, + { + "epoch": 0.03296995479925552, + "grad_norm": 5.038781642913818, + "learning_rate": 3.2503276539973785e-05, + "loss": 2.2044, + "step": 992 + }, + { + "epoch": 0.03400026588673225, + "grad_norm": 4.872281551361084, + "learning_rate": 3.351900393184797e-05, + "loss": 2.1657, + "step": 1023 + }, + { + "epoch": 0.035030576974208985, + "grad_norm": 4.370841979980469, + "learning_rate": 3.453473132372215e-05, + "loss": 2.1365, + "step": 1054 + }, + { + "epoch": 0.036060888061685725, + "grad_norm": 4.087072849273682, + "learning_rate": 3.555045871559633e-05, + "loss": 2.1253, + "step": 1085 + }, + { + "epoch": 0.03709119914916246, + "grad_norm": 4.113957405090332, + "learning_rate": 3.6566186107470514e-05, + "loss": 2.0973, + "step": 1116 + }, + { + "epoch": 0.03812151023663919, + "grad_norm": 4.0119733810424805, + "learning_rate": 3.7581913499344695e-05, + "loss": 2.1024, + "step": 1147 + }, + { + "epoch": 0.039151821324115925, + "grad_norm": 4.247573375701904, + "learning_rate": 3.8597640891218876e-05, + "loss": 2.0722, + "step": 1178 + }, + { + "epoch": 0.04018213241159266, + "grad_norm": 3.5575129985809326, + "learning_rate": 3.9613368283093056e-05, + "loss": 2.056, + "step": 1209 + }, + { + "epoch": 0.0412124434990694, + "grad_norm": 3.8885862827301025, + "learning_rate": 4.062909567496724e-05, + "loss": 2.0389, + "step": 1240 + }, + { + "epoch": 0.04224275458654613, + "grad_norm": 3.680628538131714, + "learning_rate": 4.164482306684142e-05, + "loss": 2.0385, + "step": 1271 + }, + { + "epoch": 0.043273065674022865, + "grad_norm": 3.780876874923706, + "learning_rate": 4.26605504587156e-05, + "loss": 2.0097, + "step": 1302 + }, + { + "epoch": 0.0443033767614996, + "grad_norm": 4.235328674316406, + "learning_rate": 4.367627785058978e-05, + "loss": 2.0024, + "step": 1333 + }, + { + "epoch": 0.04533368784897634, + "grad_norm": 3.326941967010498, + "learning_rate": 4.469200524246396e-05, + "loss": 1.9953, + "step": 1364 + }, + { + "epoch": 0.04636399893645307, + "grad_norm": 3.28456449508667, + "learning_rate": 4.570773263433814e-05, + "loss": 1.9579, + "step": 1395 + }, + { + "epoch": 0.047394310023929805, + "grad_norm": 16.107433319091797, + "learning_rate": 4.672346002621232e-05, + "loss": 1.9701, + "step": 1426 + }, + { + "epoch": 0.04842462111140654, + "grad_norm": 3.5708224773406982, + "learning_rate": 4.77391874180865e-05, + "loss": 1.9621, + "step": 1457 + }, + { + "epoch": 0.04945493219888328, + "grad_norm": 2.9053499698638916, + "learning_rate": 4.875491480996068e-05, + "loss": 1.9458, + "step": 1488 + }, + { + "epoch": 0.05048524328636001, + "grad_norm": 3.0863258838653564, + "learning_rate": 4.977064220183487e-05, + "loss": 1.9483, + "step": 1519 + }, + { + "epoch": 0.051515554373836744, + "grad_norm": 2.9012269973754883, + "learning_rate": 4.9999915451558777e-05, + "loss": 1.928, + "step": 1550 + }, + { + "epoch": 0.05254586546131348, + "grad_norm": 3.0949041843414307, + "learning_rate": 4.999955597496219e-05, + "loss": 1.9229, + "step": 1581 + }, + { + "epoch": 0.05357617654879022, + "grad_norm": 2.8687901496887207, + "learning_rate": 4.9998914381774255e-05, + "loss": 1.915, + "step": 1612 + }, + { + "epoch": 0.05460648763626695, + "grad_norm": 3.2136878967285156, + "learning_rate": 4.999799067923527e-05, + "loss": 1.9197, + "step": 1643 + }, + { + "epoch": 0.055636798723743684, + "grad_norm": 2.590843677520752, + "learning_rate": 4.999678487776908e-05, + "loss": 1.8756, + "step": 1674 + }, + { + "epoch": 0.05666710981122042, + "grad_norm": 2.64634108543396, + "learning_rate": 4.9995296990983006e-05, + "loss": 1.9033, + "step": 1705 + }, + { + "epoch": 0.05769742089869716, + "grad_norm": 3.0151331424713135, + "learning_rate": 4.999352703566763e-05, + "loss": 1.8883, + "step": 1736 + }, + { + "epoch": 0.05872773198617389, + "grad_norm": 2.526806354522705, + "learning_rate": 4.999147503179668e-05, + "loss": 1.8666, + "step": 1767 + }, + { + "epoch": 0.059758043073650624, + "grad_norm": 2.510300397872925, + "learning_rate": 4.998914100252672e-05, + "loss": 1.854, + "step": 1798 + }, + { + "epoch": 0.06078835416112736, + "grad_norm": 2.4867682456970215, + "learning_rate": 4.998652497419696e-05, + "loss": 1.8548, + "step": 1829 + }, + { + "epoch": 0.0618186652486041, + "grad_norm": 2.3920586109161377, + "learning_rate": 4.9983626976328927e-05, + "loss": 1.8495, + "step": 1860 + }, + { + "epoch": 0.06284897633608083, + "grad_norm": 2.714177370071411, + "learning_rate": 4.998044704162613e-05, + "loss": 1.8433, + "step": 1891 + }, + { + "epoch": 0.06387928742355757, + "grad_norm": 2.3094465732574463, + "learning_rate": 4.9976985205973705e-05, + "loss": 1.8382, + "step": 1922 + }, + { + "epoch": 0.0649095985110343, + "grad_norm": 2.47184419631958, + "learning_rate": 4.997324150843799e-05, + "loss": 1.8464, + "step": 1953 + }, + { + "epoch": 0.06593990959851104, + "grad_norm": 2.391841411590576, + "learning_rate": 4.99692159912661e-05, + "loss": 1.8179, + "step": 1984 + }, + { + "epoch": 0.06697022068598776, + "grad_norm": 2.2471864223480225, + "learning_rate": 4.996490869988546e-05, + "loss": 1.8149, + "step": 2015 + }, + { + "epoch": 0.0680005317734645, + "grad_norm": 2.5497376918792725, + "learning_rate": 4.996031968290326e-05, + "loss": 1.8099, + "step": 2046 + }, + { + "epoch": 0.06903084286094124, + "grad_norm": 2.330463409423828, + "learning_rate": 4.995544899210594e-05, + "loss": 1.8267, + "step": 2077 + }, + { + "epoch": 0.07006115394841797, + "grad_norm": 2.3259341716766357, + "learning_rate": 4.9950296682458583e-05, + "loss": 1.7801, + "step": 2108 + }, + { + "epoch": 0.07109146503589471, + "grad_norm": 2.1711952686309814, + "learning_rate": 4.994486281210429e-05, + "loss": 1.7961, + "step": 2139 + }, + { + "epoch": 0.07212177612337145, + "grad_norm": 2.1808884143829346, + "learning_rate": 4.9939147442363566e-05, + "loss": 1.8109, + "step": 2170 + }, + { + "epoch": 0.07315208721084818, + "grad_norm": 2.089256525039673, + "learning_rate": 4.9933150637733574e-05, + "loss": 1.8026, + "step": 2201 + }, + { + "epoch": 0.07418239829832492, + "grad_norm": 2.0864951610565186, + "learning_rate": 4.992687246588743e-05, + "loss": 1.7753, + "step": 2232 + }, + { + "epoch": 0.07521270938580164, + "grad_norm": 2.36157488822937, + "learning_rate": 4.992031299767347e-05, + "loss": 1.7746, + "step": 2263 + }, + { + "epoch": 0.07624302047327838, + "grad_norm": 2.5334439277648926, + "learning_rate": 4.9913472307114386e-05, + "loss": 1.7927, + "step": 2294 + }, + { + "epoch": 0.07727333156075512, + "grad_norm": 2.2565715312957764, + "learning_rate": 4.9906350471406446e-05, + "loss": 1.7668, + "step": 2325 + }, + { + "epoch": 0.07830364264823185, + "grad_norm": 2.1043128967285156, + "learning_rate": 4.989894757091861e-05, + "loss": 1.7771, + "step": 2356 + }, + { + "epoch": 0.07933395373570859, + "grad_norm": 1.9659819602966309, + "learning_rate": 4.989126368919158e-05, + "loss": 1.7666, + "step": 2387 + }, + { + "epoch": 0.08036426482318532, + "grad_norm": 2.0778403282165527, + "learning_rate": 4.988329891293693e-05, + "loss": 1.7405, + "step": 2418 + }, + { + "epoch": 0.08139457591066206, + "grad_norm": 2.1767923831939697, + "learning_rate": 4.987505333203608e-05, + "loss": 1.7495, + "step": 2449 + }, + { + "epoch": 0.0824248869981388, + "grad_norm": 2.260143280029297, + "learning_rate": 4.9866527039539276e-05, + "loss": 1.7504, + "step": 2480 + }, + { + "epoch": 0.08345519808561552, + "grad_norm": 2.18271803855896, + "learning_rate": 4.9857720131664594e-05, + "loss": 1.7456, + "step": 2511 + }, + { + "epoch": 0.08448550917309226, + "grad_norm": 2.209594964981079, + "learning_rate": 4.9848632707796773e-05, + "loss": 1.7528, + "step": 2542 + }, + { + "epoch": 0.085515820260569, + "grad_norm": 2.0666229724884033, + "learning_rate": 4.9839264870486155e-05, + "loss": 1.7517, + "step": 2573 + }, + { + "epoch": 0.08654613134804573, + "grad_norm": 2.1070454120635986, + "learning_rate": 4.9829616725447526e-05, + "loss": 1.7474, + "step": 2604 + }, + { + "epoch": 0.08757644243552247, + "grad_norm": 1.9430303573608398, + "learning_rate": 4.981968838155888e-05, + "loss": 1.7348, + "step": 2635 + }, + { + "epoch": 0.0886067535229992, + "grad_norm": 1.9638925790786743, + "learning_rate": 4.980947995086024e-05, + "loss": 1.7202, + "step": 2666 + }, + { + "epoch": 0.08963706461047594, + "grad_norm": 1.8845652341842651, + "learning_rate": 4.979899154855234e-05, + "loss": 1.7375, + "step": 2697 + }, + { + "epoch": 0.09066737569795268, + "grad_norm": 5.712058067321777, + "learning_rate": 4.9788223292995386e-05, + "loss": 1.7379, + "step": 2728 + }, + { + "epoch": 0.0916976867854294, + "grad_norm": 1.9520670175552368, + "learning_rate": 4.977717530570768e-05, + "loss": 1.7302, + "step": 2759 + }, + { + "epoch": 0.09272799787290614, + "grad_norm": 1.8802224397659302, + "learning_rate": 4.976584771136425e-05, + "loss": 1.74, + "step": 2790 + }, + { + "epoch": 0.09375830896038288, + "grad_norm": 2.1098153591156006, + "learning_rate": 4.975424063779547e-05, + "loss": 1.7024, + "step": 2821 + }, + { + "epoch": 0.09478862004785961, + "grad_norm": 2.1568291187286377, + "learning_rate": 4.974235421598557e-05, + "loss": 1.7131, + "step": 2852 + }, + { + "epoch": 0.09581893113533635, + "grad_norm": 1.8769980669021606, + "learning_rate": 4.973018858007122e-05, + "loss": 1.7008, + "step": 2883 + }, + { + "epoch": 0.09684924222281308, + "grad_norm": 1.8325533866882324, + "learning_rate": 4.9717743867339963e-05, + "loss": 1.7058, + "step": 2914 + }, + { + "epoch": 0.09787955331028982, + "grad_norm": 2.086416721343994, + "learning_rate": 4.9705020218228695e-05, + "loss": 1.711, + "step": 2945 + }, + { + "epoch": 0.09890986439776656, + "grad_norm": 1.8294793367385864, + "learning_rate": 4.969201777632205e-05, + "loss": 1.6998, + "step": 2976 + }, + { + "epoch": 0.09994017548524328, + "grad_norm": 2.0608153343200684, + "learning_rate": 4.9678736688350846e-05, + "loss": 1.6948, + "step": 3007 + }, + { + "epoch": 0.10097048657272002, + "grad_norm": 3.2166008949279785, + "learning_rate": 4.966517710419033e-05, + "loss": 1.6788, + "step": 3038 + }, + { + "epoch": 0.10200079766019676, + "grad_norm": 1.9431313276290894, + "learning_rate": 4.965133917685858e-05, + "loss": 1.7115, + "step": 3069 + }, + { + "epoch": 0.10303110874767349, + "grad_norm": 1.967512845993042, + "learning_rate": 4.9637223062514714e-05, + "loss": 1.7033, + "step": 3100 + }, + { + "epoch": 0.10406141983515023, + "grad_norm": 1.9253389835357666, + "learning_rate": 4.962282892045718e-05, + "loss": 1.6856, + "step": 3131 + }, + { + "epoch": 0.10509173092262696, + "grad_norm": 1.986840009689331, + "learning_rate": 4.9608156913121904e-05, + "loss": 1.723, + "step": 3162 + }, + { + "epoch": 0.1061220420101037, + "grad_norm": 1.83523690700531, + "learning_rate": 4.959320720608049e-05, + "loss": 1.6912, + "step": 3193 + }, + { + "epoch": 0.10715235309758044, + "grad_norm": 2.1271955966949463, + "learning_rate": 4.9577979968038354e-05, + "loss": 1.7032, + "step": 3224 + }, + { + "epoch": 0.10818266418505716, + "grad_norm": 1.8383768796920776, + "learning_rate": 4.956247537083282e-05, + "loss": 1.6726, + "step": 3255 + }, + { + "epoch": 0.1092129752725339, + "grad_norm": 1.8806651830673218, + "learning_rate": 4.9546693589431145e-05, + "loss": 1.6817, + "step": 3286 + }, + { + "epoch": 0.11024328636001064, + "grad_norm": 1.7535260915756226, + "learning_rate": 4.9530634801928595e-05, + "loss": 1.6875, + "step": 3317 + }, + { + "epoch": 0.11127359744748737, + "grad_norm": 1.765906810760498, + "learning_rate": 4.9514299189546395e-05, + "loss": 1.6859, + "step": 3348 + }, + { + "epoch": 0.11230390853496411, + "grad_norm": 1.869828462600708, + "learning_rate": 4.949768693662973e-05, + "loss": 1.6915, + "step": 3379 + }, + { + "epoch": 0.11333421962244083, + "grad_norm": 1.8347504138946533, + "learning_rate": 4.948079823064559e-05, + "loss": 1.6859, + "step": 3410 + }, + { + "epoch": 0.11436453070991758, + "grad_norm": 1.7692474126815796, + "learning_rate": 4.946363326218074e-05, + "loss": 1.6565, + "step": 3441 + }, + { + "epoch": 0.11539484179739432, + "grad_norm": 1.8231885433197021, + "learning_rate": 4.9446192224939525e-05, + "loss": 1.686, + "step": 3472 + }, + { + "epoch": 0.11642515288487104, + "grad_norm": 1.7155958414077759, + "learning_rate": 4.942847531574167e-05, + "loss": 1.6538, + "step": 3503 + }, + { + "epoch": 0.11745546397234778, + "grad_norm": 1.787183403968811, + "learning_rate": 4.941048273452008e-05, + "loss": 1.6776, + "step": 3534 + }, + { + "epoch": 0.11848577505982451, + "grad_norm": 1.741213083267212, + "learning_rate": 4.9392214684318605e-05, + "loss": 1.6784, + "step": 3565 + }, + { + "epoch": 0.11951608614730125, + "grad_norm": 1.7836824655532837, + "learning_rate": 4.93736713712897e-05, + "loss": 1.6557, + "step": 3596 + }, + { + "epoch": 0.12054639723477799, + "grad_norm": 1.7103859186172485, + "learning_rate": 4.9354853004692124e-05, + "loss": 1.6606, + "step": 3627 + }, + { + "epoch": 0.12157670832225471, + "grad_norm": 1.7865506410598755, + "learning_rate": 4.93357597968886e-05, + "loss": 1.6409, + "step": 3658 + }, + { + "epoch": 0.12260701940973145, + "grad_norm": 1.7770143747329712, + "learning_rate": 4.931639196334338e-05, + "loss": 1.6574, + "step": 3689 + }, + { + "epoch": 0.1236373304972082, + "grad_norm": 1.857575535774231, + "learning_rate": 4.9296749722619826e-05, + "loss": 1.6724, + "step": 3720 + }, + { + "epoch": 0.12466764158468492, + "grad_norm": 1.8742581605911255, + "learning_rate": 4.9276833296377966e-05, + "loss": 1.6506, + "step": 3751 + }, + { + "epoch": 0.12569795267216166, + "grad_norm": 1.827668309211731, + "learning_rate": 4.925664290937196e-05, + "loss": 1.6523, + "step": 3782 + }, + { + "epoch": 0.1267282637596384, + "grad_norm": 1.7517486810684204, + "learning_rate": 4.9236178789447576e-05, + "loss": 1.6459, + "step": 3813 + }, + { + "epoch": 0.12775857484711514, + "grad_norm": 1.8109570741653442, + "learning_rate": 4.921544116753962e-05, + "loss": 1.6614, + "step": 3844 + }, + { + "epoch": 0.12878888593459187, + "grad_norm": 1.692597508430481, + "learning_rate": 4.919443027766935e-05, + "loss": 1.6431, + "step": 3875 + }, + { + "epoch": 0.1298191970220686, + "grad_norm": 1.8650025129318237, + "learning_rate": 4.91731463569418e-05, + "loss": 1.6466, + "step": 3906 + }, + { + "epoch": 0.13084950810954532, + "grad_norm": 1.6794081926345825, + "learning_rate": 4.915158964554312e-05, + "loss": 1.6504, + "step": 3937 + }, + { + "epoch": 0.13187981919702207, + "grad_norm": 1.7685374021530151, + "learning_rate": 4.912976038673786e-05, + "loss": 1.6446, + "step": 3968 + }, + { + "epoch": 0.1329101302844988, + "grad_norm": 1.7601110935211182, + "learning_rate": 4.9107658826866254e-05, + "loss": 1.631, + "step": 3999 + }, + { + "epoch": 0.13394044137197553, + "grad_norm": 2.0616064071655273, + "learning_rate": 4.908528521534139e-05, + "loss": 1.6476, + "step": 4030 + }, + { + "epoch": 0.13497075245945228, + "grad_norm": 1.8973504304885864, + "learning_rate": 4.906263980464644e-05, + "loss": 1.6582, + "step": 4061 + }, + { + "epoch": 0.136001063546929, + "grad_norm": 1.7768895626068115, + "learning_rate": 4.903972285033178e-05, + "loss": 1.6159, + "step": 4092 + }, + { + "epoch": 0.13703137463440573, + "grad_norm": 1.8264424800872803, + "learning_rate": 4.901653461101213e-05, + "loss": 1.6289, + "step": 4123 + }, + { + "epoch": 0.1380616857218825, + "grad_norm": 1.7140119075775146, + "learning_rate": 4.8993075348363626e-05, + "loss": 1.6357, + "step": 4154 + }, + { + "epoch": 0.13909199680935921, + "grad_norm": 1.6964486837387085, + "learning_rate": 4.896934532712084e-05, + "loss": 1.6233, + "step": 4185 + }, + { + "epoch": 0.14012230789683594, + "grad_norm": 1.8008025884628296, + "learning_rate": 4.8945344815073846e-05, + "loss": 1.637, + "step": 4216 + }, + { + "epoch": 0.1411526189843127, + "grad_norm": 1.562730073928833, + "learning_rate": 4.892107408306516e-05, + "loss": 1.6379, + "step": 4247 + }, + { + "epoch": 0.14218293007178942, + "grad_norm": 1.8273371458053589, + "learning_rate": 4.889653340498669e-05, + "loss": 1.6246, + "step": 4278 + }, + { + "epoch": 0.14321324115926615, + "grad_norm": 56.33716583251953, + "learning_rate": 4.8871723057776664e-05, + "loss": 1.6457, + "step": 4309 + }, + { + "epoch": 0.1442435522467429, + "grad_norm": 1.746523380279541, + "learning_rate": 4.8846643321416476e-05, + "loss": 1.6343, + "step": 4340 + }, + { + "epoch": 0.14527386333421963, + "grad_norm": 1.7737531661987305, + "learning_rate": 4.882129447892753e-05, + "loss": 1.6447, + "step": 4371 + }, + { + "epoch": 0.14630417442169635, + "grad_norm": 1.660485863685608, + "learning_rate": 4.8795676816368076e-05, + "loss": 1.6192, + "step": 4402 + }, + { + "epoch": 0.14733448550917308, + "grad_norm": 1.6823406219482422, + "learning_rate": 4.876979062282995e-05, + "loss": 1.6253, + "step": 4433 + }, + { + "epoch": 0.14836479659664983, + "grad_norm": 7.78139066696167, + "learning_rate": 4.8743636190435325e-05, + "loss": 1.6234, + "step": 4464 + }, + { + "epoch": 0.14939510768412656, + "grad_norm": 1.7426058053970337, + "learning_rate": 4.871721381433344e-05, + "loss": 1.6337, + "step": 4495 + }, + { + "epoch": 0.1504254187716033, + "grad_norm": 1.6294783353805542, + "learning_rate": 4.869052379269719e-05, + "loss": 1.6217, + "step": 4526 + }, + { + "epoch": 0.15145572985908004, + "grad_norm": 1.6523306369781494, + "learning_rate": 4.866356642671985e-05, + "loss": 1.605, + "step": 4557 + }, + { + "epoch": 0.15248604094655677, + "grad_norm": 1.8571300506591797, + "learning_rate": 4.8636342020611634e-05, + "loss": 1.6218, + "step": 4588 + }, + { + "epoch": 0.1535163520340335, + "grad_norm": 1.7754936218261719, + "learning_rate": 4.860885088159626e-05, + "loss": 1.6171, + "step": 4619 + }, + { + "epoch": 0.15454666312151025, + "grad_norm": 1.91987943649292, + "learning_rate": 4.858109331990751e-05, + "loss": 1.6167, + "step": 4650 + }, + { + "epoch": 0.15557697420898697, + "grad_norm": 1.5994452238082886, + "learning_rate": 4.855306964878567e-05, + "loss": 1.5951, + "step": 4681 + }, + { + "epoch": 0.1566072852964637, + "grad_norm": 1.6490916013717651, + "learning_rate": 4.8524780184474084e-05, + "loss": 1.616, + "step": 4712 + }, + { + "epoch": 0.15763759638394045, + "grad_norm": 1.5921640396118164, + "learning_rate": 4.8496225246215496e-05, + "loss": 1.6346, + "step": 4743 + }, + { + "epoch": 0.15866790747141718, + "grad_norm": 1.6729261875152588, + "learning_rate": 4.8467405156248505e-05, + "loss": 1.6165, + "step": 4774 + }, + { + "epoch": 0.1596982185588939, + "grad_norm": 1.628113031387329, + "learning_rate": 4.843832023980392e-05, + "loss": 1.6119, + "step": 4805 + }, + { + "epoch": 0.16072852964637063, + "grad_norm": 1.651647925376892, + "learning_rate": 4.840897082510106e-05, + "loss": 1.5997, + "step": 4836 + }, + { + "epoch": 0.1617588407338474, + "grad_norm": 1.5297720432281494, + "learning_rate": 4.8379357243344084e-05, + "loss": 1.6242, + "step": 4867 + }, + { + "epoch": 0.1627891518213241, + "grad_norm": 1.5779869556427002, + "learning_rate": 4.8349479828718236e-05, + "loss": 1.6149, + "step": 4898 + }, + { + "epoch": 0.16381946290880084, + "grad_norm": 1.5843939781188965, + "learning_rate": 4.8319338918386075e-05, + "loss": 1.5926, + "step": 4929 + }, + { + "epoch": 0.1648497739962776, + "grad_norm": 2.3762106895446777, + "learning_rate": 4.828893485248369e-05, + "loss": 1.6108, + "step": 4960 + }, + { + "epoch": 0.16588008508375432, + "grad_norm": 1.5871953964233398, + "learning_rate": 4.825826797411682e-05, + "loss": 1.6103, + "step": 4991 + }, + { + "epoch": 0.16691039617123105, + "grad_norm": 1.5934125185012817, + "learning_rate": 4.822733862935702e-05, + "loss": 1.6091, + "step": 5022 + }, + { + "epoch": 0.1679407072587078, + "grad_norm": 1.6997628211975098, + "learning_rate": 4.819614716723775e-05, + "loss": 1.6098, + "step": 5053 + }, + { + "epoch": 0.16897101834618453, + "grad_norm": 1.682849645614624, + "learning_rate": 4.8164693939750425e-05, + "loss": 1.599, + "step": 5084 + }, + { + "epoch": 0.17000132943366125, + "grad_norm": 1.709743857383728, + "learning_rate": 4.813297930184042e-05, + "loss": 1.6194, + "step": 5115 + }, + { + "epoch": 0.171031640521138, + "grad_norm": 1.725879430770874, + "learning_rate": 4.810100361140314e-05, + "loss": 1.6115, + "step": 5146 + }, + { + "epoch": 0.17206195160861473, + "grad_norm": 1.6710290908813477, + "learning_rate": 4.8068767229279885e-05, + "loss": 1.6032, + "step": 5177 + }, + { + "epoch": 0.17309226269609146, + "grad_norm": 1.6156634092330933, + "learning_rate": 4.8036270519253854e-05, + "loss": 1.5973, + "step": 5208 + }, + { + "epoch": 0.1741225737835682, + "grad_norm": 1.5654059648513794, + "learning_rate": 4.8003513848046e-05, + "loss": 1.5817, + "step": 5239 + }, + { + "epoch": 0.17515288487104494, + "grad_norm": 1.5789822340011597, + "learning_rate": 4.79704975853109e-05, + "loss": 1.6138, + "step": 5270 + }, + { + "epoch": 0.17618319595852167, + "grad_norm": 1.6022037267684937, + "learning_rate": 4.793722210363262e-05, + "loss": 1.5998, + "step": 5301 + }, + { + "epoch": 0.1772135070459984, + "grad_norm": 1.5142741203308105, + "learning_rate": 4.7903687778520414e-05, + "loss": 1.6061, + "step": 5332 + }, + { + "epoch": 0.17824381813347515, + "grad_norm": 1.6454212665557861, + "learning_rate": 4.7869894988404593e-05, + "loss": 1.6063, + "step": 5363 + }, + { + "epoch": 0.17927412922095187, + "grad_norm": 1.5250823497772217, + "learning_rate": 4.783584411463221e-05, + "loss": 1.6038, + "step": 5394 + }, + { + "epoch": 0.1803044403084286, + "grad_norm": 1.5829335451126099, + "learning_rate": 4.780153554146274e-05, + "loss": 1.5949, + "step": 5425 + }, + { + "epoch": 0.18133475139590535, + "grad_norm": 1.5342432260513306, + "learning_rate": 4.7766969656063766e-05, + "loss": 1.5913, + "step": 5456 + }, + { + "epoch": 0.18236506248338208, + "grad_norm": 1.6397250890731812, + "learning_rate": 4.773214684850662e-05, + "loss": 1.6102, + "step": 5487 + }, + { + "epoch": 0.1833953735708588, + "grad_norm": 1.5228471755981445, + "learning_rate": 4.769706751176193e-05, + "loss": 1.5885, + "step": 5518 + }, + { + "epoch": 0.18442568465833556, + "grad_norm": 1.6186103820800781, + "learning_rate": 4.7661732041695264e-05, + "loss": 1.6086, + "step": 5549 + }, + { + "epoch": 0.18545599574581229, + "grad_norm": 1.6024582386016846, + "learning_rate": 4.762614083706258e-05, + "loss": 1.6004, + "step": 5580 + }, + { + "epoch": 0.186486306833289, + "grad_norm": 1.5443711280822754, + "learning_rate": 4.759029429950581e-05, + "loss": 1.6048, + "step": 5611 + }, + { + "epoch": 0.18751661792076577, + "grad_norm": 1.4831629991531372, + "learning_rate": 4.7554192833548235e-05, + "loss": 1.5841, + "step": 5642 + }, + { + "epoch": 0.1885469290082425, + "grad_norm": 1.6426068544387817, + "learning_rate": 4.751783684659e-05, + "loss": 1.587, + "step": 5673 + }, + { + "epoch": 0.18957724009571922, + "grad_norm": 1.4609078168869019, + "learning_rate": 4.748122674890348e-05, + "loss": 1.5945, + "step": 5704 + }, + { + "epoch": 0.19060755118319597, + "grad_norm": 1.5365614891052246, + "learning_rate": 4.7444362953628654e-05, + "loss": 1.5737, + "step": 5735 + }, + { + "epoch": 0.1916378622706727, + "grad_norm": 1.5755670070648193, + "learning_rate": 4.7407245876768424e-05, + "loss": 1.5862, + "step": 5766 + }, + { + "epoch": 0.19266817335814942, + "grad_norm": 1.6469846963882446, + "learning_rate": 4.736987593718397e-05, + "loss": 1.5663, + "step": 5797 + }, + { + "epoch": 0.19369848444562615, + "grad_norm": 1.5927278995513916, + "learning_rate": 4.733225355658999e-05, + "loss": 1.5776, + "step": 5828 + }, + { + "epoch": 0.1947287955331029, + "grad_norm": 1.5593287944793701, + "learning_rate": 4.7294379159549926e-05, + "loss": 1.579, + "step": 5859 + }, + { + "epoch": 0.19575910662057963, + "grad_norm": 1.534055233001709, + "learning_rate": 4.725625317347119e-05, + "loss": 1.6017, + "step": 5890 + }, + { + "epoch": 0.19678941770805636, + "grad_norm": 1.5846387147903442, + "learning_rate": 4.7217876028600374e-05, + "loss": 1.5739, + "step": 5921 + }, + { + "epoch": 0.1978197287955331, + "grad_norm": 1.5377682447433472, + "learning_rate": 4.717924815801832e-05, + "loss": 1.57, + "step": 5952 + }, + { + "epoch": 0.19885003988300984, + "grad_norm": 1.467956781387329, + "learning_rate": 4.714036999763532e-05, + "loss": 1.5736, + "step": 5983 + }, + { + "epoch": 0.19988035097048656, + "grad_norm": 1.601070523262024, + "learning_rate": 4.7101241986186116e-05, + "loss": 1.5861, + "step": 6014 + }, + { + "epoch": 0.20091066205796332, + "grad_norm": 1.5051921606063843, + "learning_rate": 4.7061864565225e-05, + "loss": 1.5735, + "step": 6045 + }, + { + "epoch": 0.20194097314544004, + "grad_norm": 1.462843418121338, + "learning_rate": 4.702223817912081e-05, + "loss": 1.582, + "step": 6076 + }, + { + "epoch": 0.20297128423291677, + "grad_norm": 1.5698682069778442, + "learning_rate": 4.698236327505195e-05, + "loss": 1.5647, + "step": 6107 + }, + { + "epoch": 0.20400159532039353, + "grad_norm": 1.5633916854858398, + "learning_rate": 4.694224030300127e-05, + "loss": 1.5741, + "step": 6138 + }, + { + "epoch": 0.20503190640787025, + "grad_norm": 1.6174733638763428, + "learning_rate": 4.690186971575107e-05, + "loss": 1.5634, + "step": 6169 + }, + { + "epoch": 0.20606221749534698, + "grad_norm": 1.4957518577575684, + "learning_rate": 4.6861251968877916e-05, + "loss": 1.575, + "step": 6200 + }, + { + "epoch": 0.2070925285828237, + "grad_norm": 1.670933485031128, + "learning_rate": 4.68203875207476e-05, + "loss": 1.5792, + "step": 6231 + }, + { + "epoch": 0.20812283967030046, + "grad_norm": 1.5676430463790894, + "learning_rate": 4.677927683250983e-05, + "loss": 1.5689, + "step": 6262 + }, + { + "epoch": 0.20915315075777718, + "grad_norm": 1.5753976106643677, + "learning_rate": 4.6737920368093156e-05, + "loss": 1.5594, + "step": 6293 + }, + { + "epoch": 0.2101834618452539, + "grad_norm": 1.4973617792129517, + "learning_rate": 4.669631859419965e-05, + "loss": 1.5593, + "step": 6324 + }, + { + "epoch": 0.21121377293273066, + "grad_norm": 1.4691433906555176, + "learning_rate": 4.6654471980299676e-05, + "loss": 1.5711, + "step": 6355 + }, + { + "epoch": 0.2122440840202074, + "grad_norm": 1.407630443572998, + "learning_rate": 4.661238099862658e-05, + "loss": 1.5787, + "step": 6386 + }, + { + "epoch": 0.21327439510768412, + "grad_norm": 1.5011677742004395, + "learning_rate": 4.657004612417138e-05, + "loss": 1.5751, + "step": 6417 + }, + { + "epoch": 0.21430470619516087, + "grad_norm": 1.509750485420227, + "learning_rate": 4.6527467834677374e-05, + "loss": 1.5583, + "step": 6448 + }, + { + "epoch": 0.2153350172826376, + "grad_norm": 1.3919882774353027, + "learning_rate": 4.648464661063478e-05, + "loss": 1.5712, + "step": 6479 + }, + { + "epoch": 0.21636532837011432, + "grad_norm": 1.4854936599731445, + "learning_rate": 4.6441582935275264e-05, + "loss": 1.5637, + "step": 6510 + }, + { + "epoch": 0.21739563945759108, + "grad_norm": 1.4413583278656006, + "learning_rate": 4.6398277294566586e-05, + "loss": 1.56, + "step": 6541 + }, + { + "epoch": 0.2184259505450678, + "grad_norm": 1.5063883066177368, + "learning_rate": 4.6354730177207e-05, + "loss": 1.5525, + "step": 6572 + }, + { + "epoch": 0.21945626163254453, + "grad_norm": 1.4899688959121704, + "learning_rate": 4.6310942074619787e-05, + "loss": 1.5817, + "step": 6603 + }, + { + "epoch": 0.22048657272002128, + "grad_norm": 1.3927967548370361, + "learning_rate": 4.626691348094777e-05, + "loss": 1.5407, + "step": 6634 + }, + { + "epoch": 0.221516883807498, + "grad_norm": 1.5378398895263672, + "learning_rate": 4.622264489304762e-05, + "loss": 1.5561, + "step": 6665 + }, + { + "epoch": 0.22254719489497474, + "grad_norm": 1.554624319076538, + "learning_rate": 4.617813681048434e-05, + "loss": 1.5859, + "step": 6696 + }, + { + "epoch": 0.22357750598245146, + "grad_norm": 1.5356658697128296, + "learning_rate": 4.61333897355256e-05, + "loss": 1.5531, + "step": 6727 + }, + { + "epoch": 0.22460781706992822, + "grad_norm": 1.5534918308258057, + "learning_rate": 4.608840417313604e-05, + "loss": 1.5774, + "step": 6758 + }, + { + "epoch": 0.22563812815740494, + "grad_norm": 1.5660988092422485, + "learning_rate": 4.6043180630971646e-05, + "loss": 1.5763, + "step": 6789 + }, + { + "epoch": 0.22666843924488167, + "grad_norm": 1.4993386268615723, + "learning_rate": 4.599771961937391e-05, + "loss": 1.5615, + "step": 6820 + }, + { + "epoch": 0.22769875033235842, + "grad_norm": 1.4630553722381592, + "learning_rate": 4.5952021651364204e-05, + "loss": 1.543, + "step": 6851 + }, + { + "epoch": 0.22872906141983515, + "grad_norm": 1.470173954963684, + "learning_rate": 4.590608724263786e-05, + "loss": 1.5674, + "step": 6882 + }, + { + "epoch": 0.22975937250731188, + "grad_norm": 1.5867971181869507, + "learning_rate": 4.585991691155845e-05, + "loss": 1.5702, + "step": 6913 + }, + { + "epoch": 0.23078968359478863, + "grad_norm": 1.44207763671875, + "learning_rate": 4.581351117915188e-05, + "loss": 1.5436, + "step": 6944 + }, + { + "epoch": 0.23181999468226536, + "grad_norm": 1.4691039323806763, + "learning_rate": 4.5766870569100534e-05, + "loss": 1.5465, + "step": 6975 + }, + { + "epoch": 0.23285030576974208, + "grad_norm": 1.4807918071746826, + "learning_rate": 4.571999560773736e-05, + "loss": 1.5564, + "step": 7006 + }, + { + "epoch": 0.23388061685721884, + "grad_norm": 1.481487512588501, + "learning_rate": 4.5672886824039915e-05, + "loss": 1.5466, + "step": 7037 + }, + { + "epoch": 0.23491092794469556, + "grad_norm": 1.4518013000488281, + "learning_rate": 4.5625544749624435e-05, + "loss": 1.5618, + "step": 7068 + }, + { + "epoch": 0.2359412390321723, + "grad_norm": 1.4186676740646362, + "learning_rate": 4.5577969918739794e-05, + "loss": 1.5528, + "step": 7099 + }, + { + "epoch": 0.23697155011964902, + "grad_norm": 1.5287110805511475, + "learning_rate": 4.5530162868261486e-05, + "loss": 1.5457, + "step": 7130 + }, + { + "epoch": 0.23800186120712577, + "grad_norm": 1.5516417026519775, + "learning_rate": 4.548212413768558e-05, + "loss": 1.5467, + "step": 7161 + }, + { + "epoch": 0.2390321722946025, + "grad_norm": 1.4710053205490112, + "learning_rate": 4.543385426912261e-05, + "loss": 1.5431, + "step": 7192 + }, + { + "epoch": 0.24006248338207922, + "grad_norm": 1.5005567073822021, + "learning_rate": 4.53853538072915e-05, + "loss": 1.5592, + "step": 7223 + }, + { + "epoch": 0.24109279446955598, + "grad_norm": 1.5864965915679932, + "learning_rate": 4.533662329951336e-05, + "loss": 1.5694, + "step": 7254 + }, + { + "epoch": 0.2421231055570327, + "grad_norm": 1.4661896228790283, + "learning_rate": 4.528766329570536e-05, + "loss": 1.545, + "step": 7285 + }, + { + "epoch": 0.24315341664450943, + "grad_norm": 1.5157560110092163, + "learning_rate": 4.523847434837447e-05, + "loss": 1.5458, + "step": 7316 + }, + { + "epoch": 0.24418372773198618, + "grad_norm": 1.4033585786819458, + "learning_rate": 4.518905701261128e-05, + "loss": 1.5464, + "step": 7347 + }, + { + "epoch": 0.2452140388194629, + "grad_norm": 1.5357593297958374, + "learning_rate": 4.5139411846083715e-05, + "loss": 1.5497, + "step": 7378 + }, + { + "epoch": 0.24624434990693964, + "grad_norm": 1.419507384300232, + "learning_rate": 4.508953940903073e-05, + "loss": 1.5414, + "step": 7409 + }, + { + "epoch": 0.2472746609944164, + "grad_norm": 1.5201773643493652, + "learning_rate": 4.5039440264255994e-05, + "loss": 1.5503, + "step": 7440 + }, + { + "epoch": 0.24830497208189312, + "grad_norm": 1.8000444173812866, + "learning_rate": 4.498911497712155e-05, + "loss": 1.5448, + "step": 7471 + }, + { + "epoch": 0.24933528316936984, + "grad_norm": 1.4876810312271118, + "learning_rate": 4.493856411554142e-05, + "loss": 1.5524, + "step": 7502 + }, + { + "epoch": 0.25036559425684657, + "grad_norm": 1.5130078792572021, + "learning_rate": 4.4887788249975206e-05, + "loss": 1.5454, + "step": 7533 + }, + { + "epoch": 0.2513959053443233, + "grad_norm": 1.4829351902008057, + "learning_rate": 4.4836787953421656e-05, + "loss": 1.5407, + "step": 7564 + }, + { + "epoch": 0.2524262164318001, + "grad_norm": 1.521550178527832, + "learning_rate": 4.478556380141218e-05, + "loss": 1.5727, + "step": 7595 + }, + { + "epoch": 0.2534565275192768, + "grad_norm": 1.4377928972244263, + "learning_rate": 4.4734116372004375e-05, + "loss": 1.5432, + "step": 7626 + }, + { + "epoch": 0.25448683860675353, + "grad_norm": 1.4101744890213013, + "learning_rate": 4.4682446245775477e-05, + "loss": 1.547, + "step": 7657 + }, + { + "epoch": 0.2555171496942303, + "grad_norm": 1.522524356842041, + "learning_rate": 4.463055400581586e-05, + "loss": 1.5418, + "step": 7688 + }, + { + "epoch": 0.256547460781707, + "grad_norm": 1.4160797595977783, + "learning_rate": 4.4578440237722374e-05, + "loss": 1.5457, + "step": 7719 + }, + { + "epoch": 0.25757777186918374, + "grad_norm": 1.4106636047363281, + "learning_rate": 4.452610552959183e-05, + "loss": 1.5405, + "step": 7750 + }, + { + "epoch": 0.2586080829566605, + "grad_norm": 1.422723650932312, + "learning_rate": 4.447355047201428e-05, + "loss": 1.5423, + "step": 7781 + }, + { + "epoch": 0.2596383940441372, + "grad_norm": 1.4362592697143555, + "learning_rate": 4.4420775658066414e-05, + "loss": 1.5372, + "step": 7812 + }, + { + "epoch": 0.26066870513161394, + "grad_norm": 1.4319696426391602, + "learning_rate": 4.436778168330484e-05, + "loss": 1.5451, + "step": 7843 + }, + { + "epoch": 0.26169901621909064, + "grad_norm": 1.4069257974624634, + "learning_rate": 4.4314569145759353e-05, + "loss": 1.5221, + "step": 7874 + }, + { + "epoch": 0.2627293273065674, + "grad_norm": 1.4424949884414673, + "learning_rate": 4.42611386459262e-05, + "loss": 1.5419, + "step": 7905 + }, + { + "epoch": 0.26375963839404415, + "grad_norm": 1.4579105377197266, + "learning_rate": 4.420749078676133e-05, + "loss": 1.5116, + "step": 7936 + }, + { + "epoch": 0.26478994948152085, + "grad_norm": 1.4563167095184326, + "learning_rate": 4.4153626173673516e-05, + "loss": 1.5296, + "step": 7967 + }, + { + "epoch": 0.2658202605689976, + "grad_norm": 1.4440968036651611, + "learning_rate": 4.409954541451762e-05, + "loss": 1.5548, + "step": 7998 + }, + { + "epoch": 0.26685057165647436, + "grad_norm": 1.5711034536361694, + "learning_rate": 4.404524911958764e-05, + "loss": 1.535, + "step": 8029 + }, + { + "epoch": 0.26788088274395105, + "grad_norm": 1.5221564769744873, + "learning_rate": 4.399073790160989e-05, + "loss": 1.5495, + "step": 8060 + }, + { + "epoch": 0.2689111938314278, + "grad_norm": 1.392699956893921, + "learning_rate": 4.393601237573607e-05, + "loss": 1.546, + "step": 8091 + }, + { + "epoch": 0.26994150491890456, + "grad_norm": 1.5343137979507446, + "learning_rate": 4.388107315953628e-05, + "loss": 1.549, + "step": 8122 + }, + { + "epoch": 0.27097181600638126, + "grad_norm": 1.4483468532562256, + "learning_rate": 4.382592087299212e-05, + "loss": 1.5424, + "step": 8153 + }, + { + "epoch": 0.272002127093858, + "grad_norm": 1.4963489770889282, + "learning_rate": 4.377055613848964e-05, + "loss": 1.508, + "step": 8184 + }, + { + "epoch": 0.27303243818133477, + "grad_norm": 1.4839162826538086, + "learning_rate": 4.3714979580812355e-05, + "loss": 1.5203, + "step": 8215 + }, + { + "epoch": 0.27406274926881147, + "grad_norm": 1.4272018671035767, + "learning_rate": 4.365919182713416e-05, + "loss": 1.5264, + "step": 8246 + }, + { + "epoch": 0.2750930603562882, + "grad_norm": 1.3808270692825317, + "learning_rate": 4.360319350701226e-05, + "loss": 1.5255, + "step": 8277 + }, + { + "epoch": 0.276123371443765, + "grad_norm": 1.4179162979125977, + "learning_rate": 4.3546985252380115e-05, + "loss": 1.535, + "step": 8308 + }, + { + "epoch": 0.2771536825312417, + "grad_norm": 1.3617374897003174, + "learning_rate": 4.349056769754021e-05, + "loss": 1.5295, + "step": 8339 + }, + { + "epoch": 0.27818399361871843, + "grad_norm": 1.4745615720748901, + "learning_rate": 4.3433941479156994e-05, + "loss": 1.5438, + "step": 8370 + }, + { + "epoch": 0.2792143047061952, + "grad_norm": 1.3661375045776367, + "learning_rate": 4.3377107236249647e-05, + "loss": 1.5134, + "step": 8401 + }, + { + "epoch": 0.2802446157936719, + "grad_norm": 1.3907949924468994, + "learning_rate": 4.332006561018488e-05, + "loss": 1.5237, + "step": 8432 + }, + { + "epoch": 0.28127492688114863, + "grad_norm": 1.3575704097747803, + "learning_rate": 4.3262817244669683e-05, + "loss": 1.5226, + "step": 8463 + }, + { + "epoch": 0.2823052379686254, + "grad_norm": 1.3836462497711182, + "learning_rate": 4.3205362785744083e-05, + "loss": 1.5433, + "step": 8494 + }, + { + "epoch": 0.2833355490561021, + "grad_norm": 1.6108276844024658, + "learning_rate": 4.314770288177384e-05, + "loss": 1.5324, + "step": 8525 + }, + { + "epoch": 0.28436586014357884, + "grad_norm": 1.4650689363479614, + "learning_rate": 4.308983818344313e-05, + "loss": 1.535, + "step": 8556 + }, + { + "epoch": 0.2853961712310556, + "grad_norm": 1.5836583375930786, + "learning_rate": 4.3031769343747206e-05, + "loss": 1.5313, + "step": 8587 + }, + { + "epoch": 0.2864264823185323, + "grad_norm": 1.5348492860794067, + "learning_rate": 4.297349701798505e-05, + "loss": 1.5106, + "step": 8618 + }, + { + "epoch": 0.28745679340600905, + "grad_norm": 1.4060319662094116, + "learning_rate": 4.2915021863751916e-05, + "loss": 1.5283, + "step": 8649 + }, + { + "epoch": 0.2884871044934858, + "grad_norm": 1.531657099723816, + "learning_rate": 4.285634454093198e-05, + "loss": 1.5087, + "step": 8680 + }, + { + "epoch": 0.2895174155809625, + "grad_norm": 1.4756299257278442, + "learning_rate": 4.279746571169086e-05, + "loss": 1.5042, + "step": 8711 + }, + { + "epoch": 0.29054772666843925, + "grad_norm": 1.3221153020858765, + "learning_rate": 4.2738386040468136e-05, + "loss": 1.5244, + "step": 8742 + }, + { + "epoch": 0.29157803775591595, + "grad_norm": 1.4067268371582031, + "learning_rate": 4.2679106193969866e-05, + "loss": 1.5012, + "step": 8773 + }, + { + "epoch": 0.2926083488433927, + "grad_norm": 1.5192064046859741, + "learning_rate": 4.261962684116106e-05, + "loss": 1.521, + "step": 8804 + }, + { + "epoch": 0.29363865993086946, + "grad_norm": 1.3847788572311401, + "learning_rate": 4.2559948653258145e-05, + "loss": 1.5128, + "step": 8835 + }, + { + "epoch": 0.29466897101834616, + "grad_norm": 1.4612780809402466, + "learning_rate": 4.250007230372134e-05, + "loss": 1.5371, + "step": 8866 + }, + { + "epoch": 0.2956992821058229, + "grad_norm": 1.468971610069275, + "learning_rate": 4.2439998468247126e-05, + "loss": 1.5199, + "step": 8897 + }, + { + "epoch": 0.29672959319329967, + "grad_norm": 1.386236310005188, + "learning_rate": 4.2379727824760566e-05, + "loss": 1.5273, + "step": 8928 + }, + { + "epoch": 0.29775990428077637, + "grad_norm": 1.3843929767608643, + "learning_rate": 4.231926105340768e-05, + "loss": 1.5011, + "step": 8959 + }, + { + "epoch": 0.2987902153682531, + "grad_norm": 1.4554557800292969, + "learning_rate": 4.225859883654776e-05, + "loss": 1.5311, + "step": 8990 + }, + { + "epoch": 0.2998205264557299, + "grad_norm": 1.3674421310424805, + "learning_rate": 4.219774185874569e-05, + "loss": 1.5302, + "step": 9021 + }, + { + "epoch": 0.3008508375432066, + "grad_norm": 1.3804330825805664, + "learning_rate": 4.213669080676418e-05, + "loss": 1.538, + "step": 9052 + }, + { + "epoch": 0.3018811486306833, + "grad_norm": 1.4643255472183228, + "learning_rate": 4.2075446369556056e-05, + "loss": 1.5172, + "step": 9083 + }, + { + "epoch": 0.3029114597181601, + "grad_norm": 1.3375928401947021, + "learning_rate": 4.201400923825648e-05, + "loss": 1.5123, + "step": 9114 + }, + { + "epoch": 0.3039417708056368, + "grad_norm": 1.4321980476379395, + "learning_rate": 4.195238010617511e-05, + "loss": 1.5196, + "step": 9145 + }, + { + "epoch": 0.30497208189311353, + "grad_norm": 1.4312376976013184, + "learning_rate": 4.1890559668788344e-05, + "loss": 1.5138, + "step": 9176 + }, + { + "epoch": 0.3060023929805903, + "grad_norm": 1.3089646100997925, + "learning_rate": 4.1828548623731405e-05, + "loss": 1.5027, + "step": 9207 + }, + { + "epoch": 0.307032704068067, + "grad_norm": 1.4863250255584717, + "learning_rate": 4.1766347670790506e-05, + "loss": 1.5091, + "step": 9238 + }, + { + "epoch": 0.30806301515554374, + "grad_norm": 1.373666763305664, + "learning_rate": 4.170395751189495e-05, + "loss": 1.5256, + "step": 9269 + }, + { + "epoch": 0.3090933262430205, + "grad_norm": 1.4160584211349487, + "learning_rate": 4.164137885110921e-05, + "loss": 1.4938, + "step": 9300 + }, + { + "epoch": 0.3101236373304972, + "grad_norm": 2.112110137939453, + "learning_rate": 4.157861239462495e-05, + "loss": 1.5106, + "step": 9331 + }, + { + "epoch": 0.31115394841797395, + "grad_norm": 1.337058663368225, + "learning_rate": 4.1515658850753114e-05, + "loss": 1.4999, + "step": 9362 + }, + { + "epoch": 0.3121842595054507, + "grad_norm": 1.3625296354293823, + "learning_rate": 4.145251892991588e-05, + "loss": 1.5136, + "step": 9393 + }, + { + "epoch": 0.3132145705929274, + "grad_norm": 1.399491548538208, + "learning_rate": 4.138919334463868e-05, + "loss": 1.499, + "step": 9424 + }, + { + "epoch": 0.31424488168040415, + "grad_norm": 1.4202344417572021, + "learning_rate": 4.1325682809542124e-05, + "loss": 1.5049, + "step": 9455 + }, + { + "epoch": 0.3152751927678809, + "grad_norm": 1.392248272895813, + "learning_rate": 4.126198804133398e-05, + "loss": 1.5287, + "step": 9486 + }, + { + "epoch": 0.3163055038553576, + "grad_norm": 1.3807618618011475, + "learning_rate": 4.1198109758801055e-05, + "loss": 1.5309, + "step": 9517 + }, + { + "epoch": 0.31733581494283436, + "grad_norm": 1.3117905855178833, + "learning_rate": 4.113404868280107e-05, + "loss": 1.4933, + "step": 9548 + }, + { + "epoch": 0.3183661260303111, + "grad_norm": 1.452086091041565, + "learning_rate": 4.106980553625457e-05, + "loss": 1.5221, + "step": 9579 + }, + { + "epoch": 0.3193964371177878, + "grad_norm": 1.477364182472229, + "learning_rate": 4.100538104413674e-05, + "loss": 1.4904, + "step": 9610 + }, + { + "epoch": 0.32042674820526457, + "grad_norm": 1.3584345579147339, + "learning_rate": 4.09407759334692e-05, + "loss": 1.4953, + "step": 9641 + }, + { + "epoch": 0.32145705929274127, + "grad_norm": 1.3619811534881592, + "learning_rate": 4.087599093331186e-05, + "loss": 1.4956, + "step": 9672 + }, + { + "epoch": 0.322487370380218, + "grad_norm": 1.4507052898406982, + "learning_rate": 4.081102677475462e-05, + "loss": 1.5197, + "step": 9703 + }, + { + "epoch": 0.3235176814676948, + "grad_norm": 1.4229698181152344, + "learning_rate": 4.0745884190909194e-05, + "loss": 1.498, + "step": 9734 + }, + { + "epoch": 0.32454799255517147, + "grad_norm": 1.3074679374694824, + "learning_rate": 4.0680563916900796e-05, + "loss": 1.5146, + "step": 9765 + }, + { + "epoch": 0.3255783036426482, + "grad_norm": 1.397815465927124, + "learning_rate": 4.0615066689859815e-05, + "loss": 1.5291, + "step": 9796 + }, + { + "epoch": 0.326608614730125, + "grad_norm": 1.3196336030960083, + "learning_rate": 4.0549393248913584e-05, + "loss": 1.5077, + "step": 9827 + }, + { + "epoch": 0.3276389258176017, + "grad_norm": 1.3129957914352417, + "learning_rate": 4.048354433517794e-05, + "loss": 1.4965, + "step": 9858 + }, + { + "epoch": 0.32866923690507843, + "grad_norm": 1.4380089044570923, + "learning_rate": 4.0417520691748916e-05, + "loss": 1.5115, + "step": 9889 + }, + { + "epoch": 0.3296995479925552, + "grad_norm": 1.3162370920181274, + "learning_rate": 4.035132306369438e-05, + "loss": 1.5029, + "step": 9920 + }, + { + "epoch": 0.3307298590800319, + "grad_norm": 1.3739668130874634, + "learning_rate": 4.028495219804555e-05, + "loss": 1.5083, + "step": 9951 + }, + { + "epoch": 0.33176017016750864, + "grad_norm": 1.3673723936080933, + "learning_rate": 4.021840884378864e-05, + "loss": 1.5223, + "step": 9982 + }, + { + "epoch": 0.3327904812549854, + "grad_norm": 1.3970317840576172, + "learning_rate": 4.015169375185633e-05, + "loss": 1.5003, + "step": 10013 + }, + { + "epoch": 0.3338207923424621, + "grad_norm": 1.2982394695281982, + "learning_rate": 4.0084807675119396e-05, + "loss": 1.5066, + "step": 10044 + }, + { + "epoch": 0.33485110342993885, + "grad_norm": 1.4548689126968384, + "learning_rate": 4.0017751368378106e-05, + "loss": 1.4993, + "step": 10075 + }, + { + "epoch": 0.3358814145174156, + "grad_norm": 1.3693586587905884, + "learning_rate": 3.995052558835377e-05, + "loss": 1.4987, + "step": 10106 + }, + { + "epoch": 0.3369117256048923, + "grad_norm": 1.4046767950057983, + "learning_rate": 3.988313109368017e-05, + "loss": 1.5098, + "step": 10137 + }, + { + "epoch": 0.33794203669236905, + "grad_norm": 1.3772069215774536, + "learning_rate": 3.981556864489504e-05, + "loss": 1.5165, + "step": 10168 + }, + { + "epoch": 0.3389723477798458, + "grad_norm": 1.471211314201355, + "learning_rate": 3.974783900443142e-05, + "loss": 1.5037, + "step": 10199 + }, + { + "epoch": 0.3400026588673225, + "grad_norm": 1.3990979194641113, + "learning_rate": 3.9679942936609095e-05, + "loss": 1.5096, + "step": 10230 + }, + { + "epoch": 0.34103296995479926, + "grad_norm": 1.3779234886169434, + "learning_rate": 3.961188120762596e-05, + "loss": 1.4914, + "step": 10261 + }, + { + "epoch": 0.342063281042276, + "grad_norm": 1.2866768836975098, + "learning_rate": 3.954365458554938e-05, + "loss": 1.5026, + "step": 10292 + }, + { + "epoch": 0.3430935921297527, + "grad_norm": 1.353468894958496, + "learning_rate": 3.947526384030751e-05, + "loss": 1.5063, + "step": 10323 + }, + { + "epoch": 0.34412390321722947, + "grad_norm": 1.3264256715774536, + "learning_rate": 3.9406709743680624e-05, + "loss": 1.4911, + "step": 10354 + }, + { + "epoch": 0.3451542143047062, + "grad_norm": 1.3496876955032349, + "learning_rate": 3.9337993069292366e-05, + "loss": 1.4921, + "step": 10385 + }, + { + "epoch": 0.3461845253921829, + "grad_norm": 1.3812434673309326, + "learning_rate": 3.926911459260109e-05, + "loss": 1.4826, + "step": 10416 + }, + { + "epoch": 0.34721483647965967, + "grad_norm": 1.4926965236663818, + "learning_rate": 3.920007509089102e-05, + "loss": 1.4994, + "step": 10447 + }, + { + "epoch": 0.3482451475671364, + "grad_norm": 1.3446170091629028, + "learning_rate": 3.913087534326357e-05, + "loss": 1.5114, + "step": 10478 + }, + { + "epoch": 0.3492754586546131, + "grad_norm": 1.3100495338439941, + "learning_rate": 3.9061516130628475e-05, + "loss": 1.5066, + "step": 10509 + }, + { + "epoch": 0.3503057697420899, + "grad_norm": 1.395874261856079, + "learning_rate": 3.8991998235695025e-05, + "loss": 1.4999, + "step": 10540 + }, + { + "epoch": 0.3513360808295666, + "grad_norm": 1.3682137727737427, + "learning_rate": 3.8922322442963224e-05, + "loss": 1.4778, + "step": 10571 + }, + { + "epoch": 0.35236639191704333, + "grad_norm": 1.4196573495864868, + "learning_rate": 3.885248953871491e-05, + "loss": 1.4909, + "step": 10602 + }, + { + "epoch": 0.3533967030045201, + "grad_norm": 1.4299864768981934, + "learning_rate": 3.8782500311004915e-05, + "loss": 1.5025, + "step": 10633 + }, + { + "epoch": 0.3544270140919968, + "grad_norm": 1.39677095413208, + "learning_rate": 3.871235554965218e-05, + "loss": 1.4932, + "step": 10664 + }, + { + "epoch": 0.35545732517947354, + "grad_norm": 1.3219736814498901, + "learning_rate": 3.864205604623078e-05, + "loss": 1.4795, + "step": 10695 + }, + { + "epoch": 0.3564876362669503, + "grad_norm": 1.3649324178695679, + "learning_rate": 3.857160259406107e-05, + "loss": 1.4838, + "step": 10726 + }, + { + "epoch": 0.357517947354427, + "grad_norm": 1.4109989404678345, + "learning_rate": 3.8500995988200674e-05, + "loss": 1.5058, + "step": 10757 + }, + { + "epoch": 0.35854825844190374, + "grad_norm": 1.3625038862228394, + "learning_rate": 3.843023702543556e-05, + "loss": 1.4912, + "step": 10788 + }, + { + "epoch": 0.3595785695293805, + "grad_norm": 1.4725775718688965, + "learning_rate": 3.8359326504270984e-05, + "loss": 1.5012, + "step": 10819 + }, + { + "epoch": 0.3606088806168572, + "grad_norm": 1.4126085042953491, + "learning_rate": 3.828826522492255e-05, + "loss": 1.4977, + "step": 10850 + }, + { + "epoch": 0.36163919170433395, + "grad_norm": 1.3949086666107178, + "learning_rate": 3.821705398930713e-05, + "loss": 1.4903, + "step": 10881 + }, + { + "epoch": 0.3626695027918107, + "grad_norm": 1.286792516708374, + "learning_rate": 3.814569360103385e-05, + "loss": 1.5067, + "step": 10912 + }, + { + "epoch": 0.3636998138792874, + "grad_norm": 1.274703025817871, + "learning_rate": 3.807418486539499e-05, + "loss": 1.4583, + "step": 10943 + }, + { + "epoch": 0.36473012496676416, + "grad_norm": 1.401455283164978, + "learning_rate": 3.80025285893569e-05, + "loss": 1.4834, + "step": 10974 + }, + { + "epoch": 0.3657604360542409, + "grad_norm": 1.308361530303955, + "learning_rate": 3.793072558155093e-05, + "loss": 1.4832, + "step": 11005 + }, + { + "epoch": 0.3667907471417176, + "grad_norm": 1.654733419418335, + "learning_rate": 3.785877665226426e-05, + "loss": 1.4867, + "step": 11036 + }, + { + "epoch": 0.36782105822919436, + "grad_norm": 1.3530856370925903, + "learning_rate": 3.778668261343079e-05, + "loss": 1.4873, + "step": 11067 + }, + { + "epoch": 0.3688513693166711, + "grad_norm": 1.3567407131195068, + "learning_rate": 3.771444427862192e-05, + "loss": 1.4935, + "step": 11098 + }, + { + "epoch": 0.3698816804041478, + "grad_norm": 1.3184572458267212, + "learning_rate": 3.7642062463037465e-05, + "loss": 1.4891, + "step": 11129 + }, + { + "epoch": 0.37091199149162457, + "grad_norm": 1.366489291191101, + "learning_rate": 3.7569537983496373e-05, + "loss": 1.5159, + "step": 11160 + }, + { + "epoch": 0.3719423025791013, + "grad_norm": 1.423258662223816, + "learning_rate": 3.749687165842753e-05, + "loss": 1.4938, + "step": 11191 + }, + { + "epoch": 0.372972613666578, + "grad_norm": 1.3226194381713867, + "learning_rate": 3.7424064307860536e-05, + "loss": 1.499, + "step": 11222 + }, + { + "epoch": 0.3740029247540548, + "grad_norm": 1.350500464439392, + "learning_rate": 3.735111675341645e-05, + "loss": 1.4952, + "step": 11253 + }, + { + "epoch": 0.37503323584153153, + "grad_norm": 1.3667839765548706, + "learning_rate": 3.7278029818298524e-05, + "loss": 1.4763, + "step": 11284 + }, + { + "epoch": 0.37606354692900823, + "grad_norm": 1.4876132011413574, + "learning_rate": 3.720480432728287e-05, + "loss": 1.4913, + "step": 11315 + }, + { + "epoch": 0.377093858016485, + "grad_norm": 1.3927743434906006, + "learning_rate": 3.71314411067092e-05, + "loss": 1.4948, + "step": 11346 + }, + { + "epoch": 0.37812416910396174, + "grad_norm": 1.3752413988113403, + "learning_rate": 3.70579409844715e-05, + "loss": 1.4763, + "step": 11377 + }, + { + "epoch": 0.37915448019143844, + "grad_norm": 1.3530951738357544, + "learning_rate": 3.698430479000865e-05, + "loss": 1.5077, + "step": 11408 + }, + { + "epoch": 0.3801847912789152, + "grad_norm": 1.4309345483779907, + "learning_rate": 3.691053335429509e-05, + "loss": 1.4945, + "step": 11439 + }, + { + "epoch": 0.38121510236639194, + "grad_norm": 1.2874380350112915, + "learning_rate": 3.683662750983147e-05, + "loss": 1.4698, + "step": 11470 + }, + { + "epoch": 0.38224541345386864, + "grad_norm": 1.3356250524520874, + "learning_rate": 3.676258809063518e-05, + "loss": 1.4924, + "step": 11501 + }, + { + "epoch": 0.3832757245413454, + "grad_norm": 1.304559588432312, + "learning_rate": 3.6688415932231004e-05, + "loss": 1.4682, + "step": 11532 + }, + { + "epoch": 0.3843060356288221, + "grad_norm": 1.4153447151184082, + "learning_rate": 3.661411187164166e-05, + "loss": 1.4989, + "step": 11563 + }, + { + "epoch": 0.38533634671629885, + "grad_norm": 1.356992244720459, + "learning_rate": 3.65396767473784e-05, + "loss": 1.4854, + "step": 11594 + }, + { + "epoch": 0.3863666578037756, + "grad_norm": 1.322449803352356, + "learning_rate": 3.6465111399431465e-05, + "loss": 1.4877, + "step": 11625 + }, + { + "epoch": 0.3873969688912523, + "grad_norm": 1.3981350660324097, + "learning_rate": 3.6390416669260674e-05, + "loss": 1.499, + "step": 11656 + }, + { + "epoch": 0.38842727997872906, + "grad_norm": 1.324871301651001, + "learning_rate": 3.63155933997859e-05, + "loss": 1.4814, + "step": 11687 + }, + { + "epoch": 0.3894575910662058, + "grad_norm": 1.3940790891647339, + "learning_rate": 3.624064243537758e-05, + "loss": 1.4754, + "step": 11718 + }, + { + "epoch": 0.3904879021536825, + "grad_norm": 1.2880780696868896, + "learning_rate": 3.616556462184716e-05, + "loss": 1.4832, + "step": 11749 + }, + { + "epoch": 0.39151821324115926, + "grad_norm": 1.315329670906067, + "learning_rate": 3.609036080643755e-05, + "loss": 1.4853, + "step": 11780 + }, + { + "epoch": 0.392548524328636, + "grad_norm": 1.4093523025512695, + "learning_rate": 3.60150318378136e-05, + "loss": 1.4978, + "step": 11811 + }, + { + "epoch": 0.3935788354161127, + "grad_norm": 1.271151065826416, + "learning_rate": 3.5939578566052465e-05, + "loss": 1.4933, + "step": 11842 + }, + { + "epoch": 0.39460914650358947, + "grad_norm": 1.2910923957824707, + "learning_rate": 3.586400184263408e-05, + "loss": 1.4853, + "step": 11873 + }, + { + "epoch": 0.3956394575910662, + "grad_norm": 1.2480064630508423, + "learning_rate": 3.578830252043148e-05, + "loss": 1.4642, + "step": 11904 + }, + { + "epoch": 0.3966697686785429, + "grad_norm": 1.263197422027588, + "learning_rate": 3.571248145370125e-05, + "loss": 1.4812, + "step": 11935 + }, + { + "epoch": 0.3977000797660197, + "grad_norm": 1.3231288194656372, + "learning_rate": 3.5636539498073794e-05, + "loss": 1.4744, + "step": 11966 + }, + { + "epoch": 0.39873039085349643, + "grad_norm": 1.3933110237121582, + "learning_rate": 3.556047751054378e-05, + "loss": 1.4849, + "step": 11997 + }, + { + "epoch": 0.39976070194097313, + "grad_norm": 1.3615801334381104, + "learning_rate": 3.548429634946039e-05, + "loss": 1.4866, + "step": 12028 + }, + { + "epoch": 0.4007910130284499, + "grad_norm": 1.298638939857483, + "learning_rate": 3.540799687451768e-05, + "loss": 1.4664, + "step": 12059 + }, + { + "epoch": 0.40182132411592664, + "grad_norm": 1.29216468334198, + "learning_rate": 3.533157994674485e-05, + "loss": 1.4697, + "step": 12090 + }, + { + "epoch": 0.40285163520340334, + "grad_norm": 1.3759845495224, + "learning_rate": 3.5255046428496546e-05, + "loss": 1.4854, + "step": 12121 + }, + { + "epoch": 0.4038819462908801, + "grad_norm": 1.4045615196228027, + "learning_rate": 3.517839718344311e-05, + "loss": 1.4622, + "step": 12152 + }, + { + "epoch": 0.40491225737835684, + "grad_norm": 1.2979034185409546, + "learning_rate": 3.510163307656086e-05, + "loss": 1.4797, + "step": 12183 + }, + { + "epoch": 0.40594256846583354, + "grad_norm": 1.303139567375183, + "learning_rate": 3.5024754974122324e-05, + "loss": 1.4588, + "step": 12214 + }, + { + "epoch": 0.4069728795533103, + "grad_norm": 1.287781834602356, + "learning_rate": 3.494776374368643e-05, + "loss": 1.4834, + "step": 12245 + }, + { + "epoch": 0.40800319064078705, + "grad_norm": 1.3806688785552979, + "learning_rate": 3.4870660254088724e-05, + "loss": 1.4807, + "step": 12276 + }, + { + "epoch": 0.40903350172826375, + "grad_norm": 1.4059745073318481, + "learning_rate": 3.479344537543164e-05, + "loss": 1.4906, + "step": 12307 + }, + { + "epoch": 0.4100638128157405, + "grad_norm": 1.3052942752838135, + "learning_rate": 3.4716119979074565e-05, + "loss": 1.4801, + "step": 12338 + }, + { + "epoch": 0.41109412390321726, + "grad_norm": 1.3306844234466553, + "learning_rate": 3.463868493762412e-05, + "loss": 1.4911, + "step": 12369 + }, + { + "epoch": 0.41212443499069396, + "grad_norm": 1.3276656866073608, + "learning_rate": 3.456114112492418e-05, + "loss": 1.4678, + "step": 12400 + }, + { + "epoch": 0.4131547460781707, + "grad_norm": 1.3164253234863281, + "learning_rate": 3.4483489416046164e-05, + "loss": 1.4816, + "step": 12431 + }, + { + "epoch": 0.4141850571656474, + "grad_norm": 1.3827886581420898, + "learning_rate": 3.440573068727905e-05, + "loss": 1.481, + "step": 12462 + }, + { + "epoch": 0.41521536825312416, + "grad_norm": 1.2899463176727295, + "learning_rate": 3.4327865816119495e-05, + "loss": 1.4575, + "step": 12493 + }, + { + "epoch": 0.4162456793406009, + "grad_norm": 1.3136677742004395, + "learning_rate": 3.4249895681262025e-05, + "loss": 1.4695, + "step": 12524 + }, + { + "epoch": 0.4172759904280776, + "grad_norm": 1.2920372486114502, + "learning_rate": 3.417182116258899e-05, + "loss": 1.4765, + "step": 12555 + }, + { + "epoch": 0.41830630151555437, + "grad_norm": 1.3285510540008545, + "learning_rate": 3.409364314116074e-05, + "loss": 1.4559, + "step": 12586 + }, + { + "epoch": 0.4193366126030311, + "grad_norm": 1.2834984064102173, + "learning_rate": 3.401536249920559e-05, + "loss": 1.4706, + "step": 12617 + }, + { + "epoch": 0.4203669236905078, + "grad_norm": 1.315942645072937, + "learning_rate": 3.393698012010998e-05, + "loss": 1.4692, + "step": 12648 + }, + { + "epoch": 0.4213972347779846, + "grad_norm": 1.3668091297149658, + "learning_rate": 3.385849688840839e-05, + "loss": 1.4801, + "step": 12679 + }, + { + "epoch": 0.42242754586546133, + "grad_norm": 1.312280297279358, + "learning_rate": 3.3779913689773414e-05, + "loss": 1.4673, + "step": 12710 + }, + { + "epoch": 0.423457856952938, + "grad_norm": 1.3579858541488647, + "learning_rate": 3.370123141100578e-05, + "loss": 1.4578, + "step": 12741 + }, + { + "epoch": 0.4244881680404148, + "grad_norm": 1.4001456499099731, + "learning_rate": 3.3622450940024305e-05, + "loss": 1.4787, + "step": 12772 + }, + { + "epoch": 0.42551847912789154, + "grad_norm": 1.352629542350769, + "learning_rate": 3.35435731658559e-05, + "loss": 1.457, + "step": 12803 + }, + { + "epoch": 0.42654879021536823, + "grad_norm": 1.4044222831726074, + "learning_rate": 3.346459897862552e-05, + "loss": 1.4979, + "step": 12834 + }, + { + "epoch": 0.427579101302845, + "grad_norm": 1.2666436433792114, + "learning_rate": 3.338552926954613e-05, + "loss": 1.4712, + "step": 12865 + }, + { + "epoch": 0.42860941239032174, + "grad_norm": 1.2487694025039673, + "learning_rate": 3.330636493090868e-05, + "loss": 1.4784, + "step": 12896 + }, + { + "epoch": 0.42963972347779844, + "grad_norm": 1.2346290349960327, + "learning_rate": 3.322710685607193e-05, + "loss": 1.4754, + "step": 12927 + }, + { + "epoch": 0.4306700345652752, + "grad_norm": 1.2908893823623657, + "learning_rate": 3.314775593945251e-05, + "loss": 1.4677, + "step": 12958 + }, + { + "epoch": 0.43170034565275195, + "grad_norm": 1.3283506631851196, + "learning_rate": 3.3068313076514714e-05, + "loss": 1.4661, + "step": 12989 + }, + { + "epoch": 0.43273065674022865, + "grad_norm": 1.2982537746429443, + "learning_rate": 3.298877916376047e-05, + "loss": 1.4838, + "step": 13020 + }, + { + "epoch": 0.4337609678277054, + "grad_norm": 1.3566454648971558, + "learning_rate": 3.290915509871915e-05, + "loss": 1.4683, + "step": 13051 + }, + { + "epoch": 0.43479127891518216, + "grad_norm": 1.3470877408981323, + "learning_rate": 3.282944177993753e-05, + "loss": 1.4724, + "step": 13082 + }, + { + "epoch": 0.43582159000265885, + "grad_norm": 1.451150894165039, + "learning_rate": 3.274964010696957e-05, + "loss": 1.4731, + "step": 13113 + }, + { + "epoch": 0.4368519010901356, + "grad_norm": 1.3415958881378174, + "learning_rate": 3.266975098036629e-05, + "loss": 1.4809, + "step": 13144 + }, + { + "epoch": 0.43788221217761236, + "grad_norm": 1.2775352001190186, + "learning_rate": 3.258977530166562e-05, + "loss": 1.4523, + "step": 13175 + }, + { + "epoch": 0.43891252326508906, + "grad_norm": 1.365050196647644, + "learning_rate": 3.250971397338227e-05, + "loss": 1.4611, + "step": 13206 + }, + { + "epoch": 0.4399428343525658, + "grad_norm": 1.3481686115264893, + "learning_rate": 3.2429567898997404e-05, + "loss": 1.4708, + "step": 13237 + }, + { + "epoch": 0.44097314544004257, + "grad_norm": 1.3418121337890625, + "learning_rate": 3.234933798294859e-05, + "loss": 1.485, + "step": 13268 + }, + { + "epoch": 0.44200345652751927, + "grad_norm": 1.3098441362380981, + "learning_rate": 3.2269025130619535e-05, + "loss": 1.472, + "step": 13299 + }, + { + "epoch": 0.443033767614996, + "grad_norm": 1.2792437076568604, + "learning_rate": 3.218863024832985e-05, + "loss": 1.4592, + "step": 13330 + }, + { + "epoch": 0.4440640787024727, + "grad_norm": 1.3804035186767578, + "learning_rate": 3.2108154243324864e-05, + "loss": 1.4546, + "step": 13361 + }, + { + "epoch": 0.4450943897899495, + "grad_norm": 1.287787675857544, + "learning_rate": 3.2027598023765345e-05, + "loss": 1.4477, + "step": 13392 + }, + { + "epoch": 0.44612470087742623, + "grad_norm": 1.5964646339416504, + "learning_rate": 3.194696249871729e-05, + "loss": 1.4468, + "step": 13423 + }, + { + "epoch": 0.4471550119649029, + "grad_norm": 1.3253474235534668, + "learning_rate": 3.186624857814164e-05, + "loss": 1.4588, + "step": 13454 + }, + { + "epoch": 0.4481853230523797, + "grad_norm": 1.288176417350769, + "learning_rate": 3.178545717288401e-05, + "loss": 1.4644, + "step": 13485 + }, + { + "epoch": 0.44921563413985643, + "grad_norm": 1.3357142210006714, + "learning_rate": 3.170458919466444e-05, + "loss": 1.4871, + "step": 13516 + }, + { + "epoch": 0.45024594522733313, + "grad_norm": 1.2954436540603638, + "learning_rate": 3.1623645556067063e-05, + "loss": 1.4571, + "step": 13547 + }, + { + "epoch": 0.4512762563148099, + "grad_norm": 1.344789981842041, + "learning_rate": 3.154262717052985e-05, + "loss": 1.459, + "step": 13578 + }, + { + "epoch": 0.45230656740228664, + "grad_norm": 1.2648475170135498, + "learning_rate": 3.146153495233426e-05, + "loss": 1.4496, + "step": 13609 + }, + { + "epoch": 0.45333687848976334, + "grad_norm": 1.312733769416809, + "learning_rate": 3.1380369816594944e-05, + "loss": 1.4309, + "step": 13640 + }, + { + "epoch": 0.4543671895772401, + "grad_norm": 1.3719325065612793, + "learning_rate": 3.129913267924946e-05, + "loss": 1.4723, + "step": 13671 + }, + { + "epoch": 0.45539750066471685, + "grad_norm": 1.2850617170333862, + "learning_rate": 3.121782445704782e-05, + "loss": 1.4599, + "step": 13702 + }, + { + "epoch": 0.45642781175219355, + "grad_norm": 1.3335177898406982, + "learning_rate": 3.11364460675423e-05, + "loss": 1.4821, + "step": 13733 + }, + { + "epoch": 0.4574581228396703, + "grad_norm": 1.1675069332122803, + "learning_rate": 3.1054998429076934e-05, + "loss": 1.453, + "step": 13764 + }, + { + "epoch": 0.45848843392714705, + "grad_norm": 1.283544898033142, + "learning_rate": 3.097348246077728e-05, + "loss": 1.4545, + "step": 13795 + }, + { + "epoch": 0.45951874501462375, + "grad_norm": 1.4358693361282349, + "learning_rate": 3.0891899082539924e-05, + "loss": 1.4673, + "step": 13826 + }, + { + "epoch": 0.4605490561021005, + "grad_norm": 1.2551497220993042, + "learning_rate": 3.0810249215022233e-05, + "loss": 1.4532, + "step": 13857 + }, + { + "epoch": 0.46157936718957726, + "grad_norm": 1.2574602365493774, + "learning_rate": 3.0728533779631865e-05, + "loss": 1.4762, + "step": 13888 + }, + { + "epoch": 0.46260967827705396, + "grad_norm": 1.2202764749526978, + "learning_rate": 3.064675369851637e-05, + "loss": 1.4461, + "step": 13919 + }, + { + "epoch": 0.4636399893645307, + "grad_norm": 1.2787501811981201, + "learning_rate": 3.056490989455289e-05, + "loss": 1.4607, + "step": 13950 + }, + { + "epoch": 0.46467030045200747, + "grad_norm": 1.2511006593704224, + "learning_rate": 3.0483003291337596e-05, + "loss": 1.4548, + "step": 13981 + }, + { + "epoch": 0.46570061153948417, + "grad_norm": 1.2749834060668945, + "learning_rate": 3.040103481317539e-05, + "loss": 1.4394, + "step": 14012 + }, + { + "epoch": 0.4667309226269609, + "grad_norm": 1.223057746887207, + "learning_rate": 3.03190053850694e-05, + "loss": 1.4684, + "step": 14043 + }, + { + "epoch": 0.4677612337144377, + "grad_norm": 1.39846932888031, + "learning_rate": 3.0236915932710573e-05, + "loss": 1.4657, + "step": 14074 + }, + { + "epoch": 0.4687915448019144, + "grad_norm": 1.5305665731430054, + "learning_rate": 3.0154767382467232e-05, + "loss": 1.4795, + "step": 14105 + }, + { + "epoch": 0.4698218558893911, + "grad_norm": 1.2569035291671753, + "learning_rate": 3.0072560661374582e-05, + "loss": 1.4756, + "step": 14136 + }, + { + "epoch": 0.4708521669768679, + "grad_norm": 1.3472824096679688, + "learning_rate": 2.999029669712431e-05, + "loss": 1.4682, + "step": 14167 + }, + { + "epoch": 0.4718824780643446, + "grad_norm": 1.271714210510254, + "learning_rate": 2.990797641805408e-05, + "loss": 1.4509, + "step": 14198 + }, + { + "epoch": 0.47291278915182133, + "grad_norm": 1.3342047929763794, + "learning_rate": 2.982560075313704e-05, + "loss": 1.4528, + "step": 14229 + }, + { + "epoch": 0.47394310023929803, + "grad_norm": 1.5821506977081299, + "learning_rate": 2.9743170631971368e-05, + "loss": 1.4609, + "step": 14260 + }, + { + "epoch": 0.4749734113267748, + "grad_norm": 1.2598062753677368, + "learning_rate": 2.9660686984769792e-05, + "loss": 1.471, + "step": 14291 + }, + { + "epoch": 0.47600372241425154, + "grad_norm": 1.2648885250091553, + "learning_rate": 2.9578150742349047e-05, + "loss": 1.4708, + "step": 14322 + }, + { + "epoch": 0.47703403350172824, + "grad_norm": 1.559665560722351, + "learning_rate": 2.949556283611942e-05, + "loss": 1.4516, + "step": 14353 + }, + { + "epoch": 0.478064344589205, + "grad_norm": 1.2621581554412842, + "learning_rate": 2.9412924198074206e-05, + "loss": 1.446, + "step": 14384 + }, + { + "epoch": 0.47909465567668175, + "grad_norm": 1.2775017023086548, + "learning_rate": 2.9330235760779208e-05, + "loss": 1.4496, + "step": 14415 + }, + { + "epoch": 0.48012496676415845, + "grad_norm": 1.2010388374328613, + "learning_rate": 2.9247498457362188e-05, + "loss": 1.4606, + "step": 14446 + }, + { + "epoch": 0.4811552778516352, + "grad_norm": 1.3053895235061646, + "learning_rate": 2.9164713221502373e-05, + "loss": 1.4536, + "step": 14477 + }, + { + "epoch": 0.48218558893911195, + "grad_norm": 1.311596155166626, + "learning_rate": 2.9081880987419912e-05, + "loss": 1.4409, + "step": 14508 + }, + { + "epoch": 0.48321590002658865, + "grad_norm": 1.3888933658599854, + "learning_rate": 2.8999002689865296e-05, + "loss": 1.4314, + "step": 14539 + }, + { + "epoch": 0.4842462111140654, + "grad_norm": 1.288619875907898, + "learning_rate": 2.8916079264108852e-05, + "loss": 1.4539, + "step": 14570 + }, + { + "epoch": 0.48527652220154216, + "grad_norm": 1.2974294424057007, + "learning_rate": 2.883311164593017e-05, + "loss": 1.4627, + "step": 14601 + }, + { + "epoch": 0.48630683328901886, + "grad_norm": 1.2057379484176636, + "learning_rate": 2.875010077160754e-05, + "loss": 1.4578, + "step": 14632 + }, + { + "epoch": 0.4873371443764956, + "grad_norm": 1.363971471786499, + "learning_rate": 2.866704757790741e-05, + "loss": 1.4671, + "step": 14663 + }, + { + "epoch": 0.48836745546397237, + "grad_norm": 1.2696925401687622, + "learning_rate": 2.858395300207376e-05, + "loss": 1.4333, + "step": 14694 + }, + { + "epoch": 0.48939776655144906, + "grad_norm": 1.2653478384017944, + "learning_rate": 2.8500817981817607e-05, + "loss": 1.4662, + "step": 14725 + }, + { + "epoch": 0.4904280776389258, + "grad_norm": 1.3011239767074585, + "learning_rate": 2.8417643455306336e-05, + "loss": 1.4589, + "step": 14756 + }, + { + "epoch": 0.4914583887264026, + "grad_norm": 1.3312432765960693, + "learning_rate": 2.8334430361153185e-05, + "loss": 1.4368, + "step": 14787 + }, + { + "epoch": 0.49248869981387927, + "grad_norm": 1.3015661239624023, + "learning_rate": 2.8251179638406612e-05, + "loss": 1.466, + "step": 14818 + }, + { + "epoch": 0.493519010901356, + "grad_norm": 1.3215759992599487, + "learning_rate": 2.8167892226539704e-05, + "loss": 1.4486, + "step": 14849 + }, + { + "epoch": 0.4945493219888328, + "grad_norm": 1.2909883260726929, + "learning_rate": 2.8084569065439588e-05, + "loss": 1.4433, + "step": 14880 + }, + { + "epoch": 0.4955796330763095, + "grad_norm": 1.364015817642212, + "learning_rate": 2.8001211095396807e-05, + "loss": 1.4449, + "step": 14911 + }, + { + "epoch": 0.49660994416378623, + "grad_norm": 1.2468819618225098, + "learning_rate": 2.791781925709473e-05, + "loss": 1.4572, + "step": 14942 + }, + { + "epoch": 0.497640255251263, + "grad_norm": 1.2739325761795044, + "learning_rate": 2.7834394491598908e-05, + "loss": 1.4478, + "step": 14973 + }, + { + "epoch": 0.4986705663387397, + "grad_norm": 1.3384937047958374, + "learning_rate": 2.7750937740346485e-05, + "loss": 1.4429, + "step": 15004 + }, + { + "epoch": 0.49970087742621644, + "grad_norm": 1.231088399887085, + "learning_rate": 2.7667449945135564e-05, + "loss": 1.4631, + "step": 15035 + }, + { + "epoch": 0.5007311885136931, + "grad_norm": 1.2262307405471802, + "learning_rate": 2.7583932048114557e-05, + "loss": 1.4508, + "step": 15066 + }, + { + "epoch": 0.5017614996011699, + "grad_norm": 1.3427774906158447, + "learning_rate": 2.7500384991771587e-05, + "loss": 1.4441, + "step": 15097 + }, + { + "epoch": 0.5027918106886466, + "grad_norm": 1.2950241565704346, + "learning_rate": 2.7416809718923825e-05, + "loss": 1.4427, + "step": 15128 + }, + { + "epoch": 0.5038221217761234, + "grad_norm": 1.4129016399383545, + "learning_rate": 2.7333207172706864e-05, + "loss": 1.4562, + "step": 15159 + }, + { + "epoch": 0.5048524328636002, + "grad_norm": 1.2751520872116089, + "learning_rate": 2.7249578296564088e-05, + "loss": 1.4517, + "step": 15190 + }, + { + "epoch": 0.5058827439510768, + "grad_norm": 1.302485466003418, + "learning_rate": 2.7165924034235973e-05, + "loss": 1.4327, + "step": 15221 + }, + { + "epoch": 0.5069130550385536, + "grad_norm": 1.295390009880066, + "learning_rate": 2.708224532974953e-05, + "loss": 1.4455, + "step": 15252 + }, + { + "epoch": 0.5079433661260303, + "grad_norm": 1.3160103559494019, + "learning_rate": 2.6998543127407538e-05, + "loss": 1.4556, + "step": 15283 + }, + { + "epoch": 0.5089736772135071, + "grad_norm": 1.2997361421585083, + "learning_rate": 2.6914818371777988e-05, + "loss": 1.444, + "step": 15314 + }, + { + "epoch": 0.5100039883009838, + "grad_norm": 1.2427833080291748, + "learning_rate": 2.6831072007683373e-05, + "loss": 1.4501, + "step": 15345 + }, + { + "epoch": 0.5110342993884606, + "grad_norm": 1.2402199506759644, + "learning_rate": 2.6747304980190018e-05, + "loss": 1.4543, + "step": 15376 + }, + { + "epoch": 0.5120646104759372, + "grad_norm": 1.2938770055770874, + "learning_rate": 2.6663518234597453e-05, + "loss": 1.4394, + "step": 15407 + }, + { + "epoch": 0.513094921563414, + "grad_norm": 1.1747736930847168, + "learning_rate": 2.6579712716427696e-05, + "loss": 1.4389, + "step": 15438 + }, + { + "epoch": 0.5141252326508907, + "grad_norm": 1.326824426651001, + "learning_rate": 2.6495889371414652e-05, + "loss": 1.4365, + "step": 15469 + }, + { + "epoch": 0.5151555437383675, + "grad_norm": 1.245665431022644, + "learning_rate": 2.6412049145493367e-05, + "loss": 1.4525, + "step": 15500 + }, + { + "epoch": 0.5161858548258442, + "grad_norm": 1.1753687858581543, + "learning_rate": 2.632819298478939e-05, + "loss": 1.447, + "step": 15531 + }, + { + "epoch": 0.517216165913321, + "grad_norm": 1.3870874643325806, + "learning_rate": 2.6244321835608105e-05, + "loss": 1.4577, + "step": 15562 + }, + { + "epoch": 0.5182464770007976, + "grad_norm": 1.2849411964416504, + "learning_rate": 2.6160436644424024e-05, + "loss": 1.4371, + "step": 15593 + }, + { + "epoch": 0.5192767880882744, + "grad_norm": 1.292443037033081, + "learning_rate": 2.6076538357870133e-05, + "loss": 1.4558, + "step": 15624 + }, + { + "epoch": 0.5203070991757511, + "grad_norm": 1.279961347579956, + "learning_rate": 2.5992627922727196e-05, + "loss": 1.4384, + "step": 15655 + }, + { + "epoch": 0.5213374102632279, + "grad_norm": 1.3141279220581055, + "learning_rate": 2.5908706285913066e-05, + "loss": 1.45, + "step": 15686 + }, + { + "epoch": 0.5223677213507046, + "grad_norm": 1.3931515216827393, + "learning_rate": 2.5824774394472008e-05, + "loss": 1.4403, + "step": 15717 + }, + { + "epoch": 0.5233980324381813, + "grad_norm": 1.2564170360565186, + "learning_rate": 2.5740833195563996e-05, + "loss": 1.4482, + "step": 15748 + }, + { + "epoch": 0.524428343525658, + "grad_norm": 1.5450046062469482, + "learning_rate": 2.5656883636454067e-05, + "loss": 1.4443, + "step": 15779 + }, + { + "epoch": 0.5254586546131348, + "grad_norm": 1.2659518718719482, + "learning_rate": 2.557292666450159e-05, + "loss": 1.4653, + "step": 15810 + }, + { + "epoch": 0.5264889657006115, + "grad_norm": 1.2940540313720703, + "learning_rate": 2.5488963227149566e-05, + "loss": 1.4302, + "step": 15841 + }, + { + "epoch": 0.5275192767880883, + "grad_norm": 1.2514533996582031, + "learning_rate": 2.5404994271913983e-05, + "loss": 1.4412, + "step": 15872 + }, + { + "epoch": 0.528549587875565, + "grad_norm": 1.2681846618652344, + "learning_rate": 2.5321020746373085e-05, + "loss": 1.4411, + "step": 15903 + }, + { + "epoch": 0.5295798989630417, + "grad_norm": 1.2581806182861328, + "learning_rate": 2.52370435981567e-05, + "loss": 1.4503, + "step": 15934 + }, + { + "epoch": 0.5306102100505184, + "grad_norm": 1.3299468755722046, + "learning_rate": 2.5153063774935533e-05, + "loss": 1.4392, + "step": 15965 + }, + { + "epoch": 0.5316405211379952, + "grad_norm": 1.240678310394287, + "learning_rate": 2.506908222441045e-05, + "loss": 1.4412, + "step": 15996 + }, + { + "epoch": 0.532670832225472, + "grad_norm": 1.337936520576477, + "learning_rate": 2.498509989430187e-05, + "loss": 1.4254, + "step": 16027 + }, + { + "epoch": 0.5337011433129487, + "grad_norm": 1.302909016609192, + "learning_rate": 2.4901117732338958e-05, + "loss": 1.4436, + "step": 16058 + }, + { + "epoch": 0.5347314544004255, + "grad_norm": 1.2539550065994263, + "learning_rate": 2.481713668624899e-05, + "loss": 1.4496, + "step": 16089 + }, + { + "epoch": 0.5357617654879021, + "grad_norm": 1.287431001663208, + "learning_rate": 2.4733157703746663e-05, + "loss": 1.424, + "step": 16120 + }, + { + "epoch": 0.5367920765753789, + "grad_norm": 1.5333632230758667, + "learning_rate": 2.4649181732523392e-05, + "loss": 1.4399, + "step": 16151 + }, + { + "epoch": 0.5378223876628556, + "grad_norm": 1.2591406106948853, + "learning_rate": 2.4565209720236582e-05, + "loss": 1.439, + "step": 16182 + }, + { + "epoch": 0.5388526987503324, + "grad_norm": 1.3093276023864746, + "learning_rate": 2.4481242614498975e-05, + "loss": 1.4279, + "step": 16213 + }, + { + "epoch": 0.5398830098378091, + "grad_norm": 1.2824875116348267, + "learning_rate": 2.439728136286796e-05, + "loss": 1.4428, + "step": 16244 + }, + { + "epoch": 0.5409133209252859, + "grad_norm": 1.2775593996047974, + "learning_rate": 2.4313326912834852e-05, + "loss": 1.4352, + "step": 16275 + }, + { + "epoch": 0.5419436320127625, + "grad_norm": 1.4667550325393677, + "learning_rate": 2.4229380211814206e-05, + "loss": 1.4633, + "step": 16306 + }, + { + "epoch": 0.5429739431002393, + "grad_norm": 1.2620900869369507, + "learning_rate": 2.4145442207133124e-05, + "loss": 1.4482, + "step": 16337 + }, + { + "epoch": 0.544004254187716, + "grad_norm": 1.3041224479675293, + "learning_rate": 2.406151384602059e-05, + "loss": 1.4431, + "step": 16368 + }, + { + "epoch": 0.5450345652751928, + "grad_norm": 1.3634989261627197, + "learning_rate": 2.3977596075596747e-05, + "loss": 1.4186, + "step": 16399 + }, + { + "epoch": 0.5460648763626695, + "grad_norm": 1.2322940826416016, + "learning_rate": 2.3893689842862223e-05, + "loss": 1.4322, + "step": 16430 + }, + { + "epoch": 0.5470951874501463, + "grad_norm": 1.5554733276367188, + "learning_rate": 2.3809796094687475e-05, + "loss": 1.4337, + "step": 16461 + }, + { + "epoch": 0.5481254985376229, + "grad_norm": 1.4745500087738037, + "learning_rate": 2.372591577780202e-05, + "loss": 1.4411, + "step": 16492 + }, + { + "epoch": 0.5491558096250997, + "grad_norm": 1.2865196466445923, + "learning_rate": 2.3642049838783838e-05, + "loss": 1.429, + "step": 16523 + }, + { + "epoch": 0.5501861207125764, + "grad_norm": 1.399247407913208, + "learning_rate": 2.3558199224048666e-05, + "loss": 1.4753, + "step": 16554 + }, + { + "epoch": 0.5512164318000532, + "grad_norm": 1.2135406732559204, + "learning_rate": 2.347436487983929e-05, + "loss": 1.4553, + "step": 16585 + }, + { + "epoch": 0.55224674288753, + "grad_norm": 1.164150357246399, + "learning_rate": 2.3390547752214888e-05, + "loss": 1.4268, + "step": 16616 + }, + { + "epoch": 0.5532770539750066, + "grad_norm": 1.2363818883895874, + "learning_rate": 2.330674878704035e-05, + "loss": 1.4381, + "step": 16647 + }, + { + "epoch": 0.5543073650624833, + "grad_norm": 1.286139726638794, + "learning_rate": 2.322296892997561e-05, + "loss": 1.4492, + "step": 16678 + }, + { + "epoch": 0.5553376761499601, + "grad_norm": 1.2836147546768188, + "learning_rate": 2.313920912646497e-05, + "loss": 1.4128, + "step": 16709 + }, + { + "epoch": 0.5563679872374369, + "grad_norm": 1.253727674484253, + "learning_rate": 2.305547032172643e-05, + "loss": 1.4472, + "step": 16740 + }, + { + "epoch": 0.5573982983249136, + "grad_norm": 1.2580201625823975, + "learning_rate": 2.2971753460741014e-05, + "loss": 1.4461, + "step": 16771 + }, + { + "epoch": 0.5584286094123904, + "grad_norm": 1.2446421384811401, + "learning_rate": 2.288805948824212e-05, + "loss": 1.4267, + "step": 16802 + }, + { + "epoch": 0.559458920499867, + "grad_norm": 1.3572150468826294, + "learning_rate": 2.2804389348704858e-05, + "loss": 1.4222, + "step": 16833 + }, + { + "epoch": 0.5604892315873438, + "grad_norm": 1.3694707155227661, + "learning_rate": 2.2720743986335374e-05, + "loss": 1.4624, + "step": 16864 + }, + { + "epoch": 0.5615195426748205, + "grad_norm": 1.2654088735580444, + "learning_rate": 2.2637124345060233e-05, + "loss": 1.4379, + "step": 16895 + }, + { + "epoch": 0.5625498537622973, + "grad_norm": 1.3349469900131226, + "learning_rate": 2.2553531368515695e-05, + "loss": 1.4404, + "step": 16926 + }, + { + "epoch": 0.563580164849774, + "grad_norm": 1.2259774208068848, + "learning_rate": 2.2469966000037144e-05, + "loss": 1.4335, + "step": 16957 + }, + { + "epoch": 0.5646104759372508, + "grad_norm": 1.2973053455352783, + "learning_rate": 2.2386429182648417e-05, + "loss": 1.4397, + "step": 16988 + }, + { + "epoch": 0.5656407870247274, + "grad_norm": 1.2674601078033447, + "learning_rate": 2.230292185905114e-05, + "loss": 1.4256, + "step": 17019 + }, + { + "epoch": 0.5666710981122042, + "grad_norm": 1.243605136871338, + "learning_rate": 2.2219444971614116e-05, + "loss": 1.4404, + "step": 17050 + }, + { + "epoch": 0.5677014091996809, + "grad_norm": 1.2108361721038818, + "learning_rate": 2.2135999462362655e-05, + "loss": 1.4318, + "step": 17081 + }, + { + "epoch": 0.5687317202871577, + "grad_norm": 1.2497962713241577, + "learning_rate": 2.2052586272968003e-05, + "loss": 1.4409, + "step": 17112 + }, + { + "epoch": 0.5697620313746344, + "grad_norm": 1.2269086837768555, + "learning_rate": 2.196920634473666e-05, + "loss": 1.4417, + "step": 17143 + }, + { + "epoch": 0.5707923424621112, + "grad_norm": 1.3165903091430664, + "learning_rate": 2.1885860618599787e-05, + "loss": 1.4541, + "step": 17174 + }, + { + "epoch": 0.5718226535495878, + "grad_norm": 1.2117608785629272, + "learning_rate": 2.1802550035102577e-05, + "loss": 1.4457, + "step": 17205 + }, + { + "epoch": 0.5728529646370646, + "grad_norm": 1.2482073307037354, + "learning_rate": 2.171927553439363e-05, + "loss": 1.4408, + "step": 17236 + }, + { + "epoch": 0.5738832757245413, + "grad_norm": 1.2258682250976562, + "learning_rate": 2.1636038056214376e-05, + "loss": 1.4366, + "step": 17267 + }, + { + "epoch": 0.5749135868120181, + "grad_norm": 1.254062294960022, + "learning_rate": 2.155283853988844e-05, + "loss": 1.4187, + "step": 17298 + }, + { + "epoch": 0.5759438978994948, + "grad_norm": 1.3397905826568604, + "learning_rate": 2.146967792431106e-05, + "loss": 1.4316, + "step": 17329 + }, + { + "epoch": 0.5769742089869716, + "grad_norm": 1.3253263235092163, + "learning_rate": 2.138655714793849e-05, + "loss": 1.4361, + "step": 17360 + }, + { + "epoch": 0.5780045200744482, + "grad_norm": 1.2624903917312622, + "learning_rate": 2.1303477148777367e-05, + "loss": 1.4136, + "step": 17391 + }, + { + "epoch": 0.579034831161925, + "grad_norm": 1.3255977630615234, + "learning_rate": 2.122043886437421e-05, + "loss": 1.4552, + "step": 17422 + }, + { + "epoch": 0.5800651422494018, + "grad_norm": 1.300898790359497, + "learning_rate": 2.1137443231804765e-05, + "loss": 1.4152, + "step": 17453 + }, + { + "epoch": 0.5810954533368785, + "grad_norm": 1.2904343605041504, + "learning_rate": 2.105449118766347e-05, + "loss": 1.4195, + "step": 17484 + }, + { + "epoch": 0.5821257644243553, + "grad_norm": 1.3146878480911255, + "learning_rate": 2.097158366805287e-05, + "loss": 1.426, + "step": 17515 + }, + { + "epoch": 0.5831560755118319, + "grad_norm": 1.2454010248184204, + "learning_rate": 2.0888721608573047e-05, + "loss": 1.4239, + "step": 17546 + }, + { + "epoch": 0.5841863865993087, + "grad_norm": 1.194626808166504, + "learning_rate": 2.0805905944311087e-05, + "loss": 1.4416, + "step": 17577 + }, + { + "epoch": 0.5852166976867854, + "grad_norm": 1.359053373336792, + "learning_rate": 2.0723137609830497e-05, + "loss": 1.4112, + "step": 17608 + }, + { + "epoch": 0.5862470087742622, + "grad_norm": 1.2577933073043823, + "learning_rate": 2.0640417539160686e-05, + "loss": 1.4432, + "step": 17639 + }, + { + "epoch": 0.5872773198617389, + "grad_norm": 1.2604849338531494, + "learning_rate": 2.0557746665786427e-05, + "loss": 1.4184, + "step": 17670 + }, + { + "epoch": 0.5883076309492157, + "grad_norm": 1.2511252164840698, + "learning_rate": 2.0475125922637256e-05, + "loss": 1.4276, + "step": 17701 + }, + { + "epoch": 0.5893379420366923, + "grad_norm": 1.2841278314590454, + "learning_rate": 2.0392556242077047e-05, + "loss": 1.4345, + "step": 17732 + }, + { + "epoch": 0.5903682531241691, + "grad_norm": 1.3342245817184448, + "learning_rate": 2.031003855589343e-05, + "loss": 1.4212, + "step": 17763 + }, + { + "epoch": 0.5913985642116458, + "grad_norm": 1.352387547492981, + "learning_rate": 2.022757379528727e-05, + "loss": 1.4316, + "step": 17794 + }, + { + "epoch": 0.5924288752991226, + "grad_norm": 1.3534374237060547, + "learning_rate": 2.0145162890862184e-05, + "loss": 1.4352, + "step": 17825 + }, + { + "epoch": 0.5934591863865993, + "grad_norm": 1.2957963943481445, + "learning_rate": 2.0062806772614022e-05, + "loss": 1.4057, + "step": 17856 + }, + { + "epoch": 0.5944894974740761, + "grad_norm": 1.3178727626800537, + "learning_rate": 1.9980506369920392e-05, + "loss": 1.4323, + "step": 17887 + }, + { + "epoch": 0.5955198085615527, + "grad_norm": 1.3364850282669067, + "learning_rate": 1.989826261153015e-05, + "loss": 1.4228, + "step": 17918 + }, + { + "epoch": 0.5965501196490295, + "grad_norm": 1.283200979232788, + "learning_rate": 1.9816076425552923e-05, + "loss": 1.4348, + "step": 17949 + }, + { + "epoch": 0.5975804307365062, + "grad_norm": 1.2856223583221436, + "learning_rate": 1.9733948739448676e-05, + "loss": 1.4176, + "step": 17980 + }, + { + "epoch": 0.598610741823983, + "grad_norm": 1.253180742263794, + "learning_rate": 1.9651880480017155e-05, + "loss": 1.4175, + "step": 18011 + }, + { + "epoch": 0.5996410529114597, + "grad_norm": 1.3471016883850098, + "learning_rate": 1.9569872573387516e-05, + "loss": 1.433, + "step": 18042 + }, + { + "epoch": 0.6006713639989365, + "grad_norm": 1.2449748516082764, + "learning_rate": 1.9487925945007854e-05, + "loss": 1.4091, + "step": 18073 + }, + { + "epoch": 0.6017016750864131, + "grad_norm": 1.3311972618103027, + "learning_rate": 1.9406041519634726e-05, + "loss": 1.403, + "step": 18104 + }, + { + "epoch": 0.6027319861738899, + "grad_norm": 1.2645657062530518, + "learning_rate": 1.932422022132275e-05, + "loss": 1.4265, + "step": 18135 + }, + { + "epoch": 0.6037622972613667, + "grad_norm": 1.3313370943069458, + "learning_rate": 1.924246297341414e-05, + "loss": 1.4275, + "step": 18166 + }, + { + "epoch": 0.6047926083488434, + "grad_norm": 1.2827123403549194, + "learning_rate": 1.9160770698528338e-05, + "loss": 1.4277, + "step": 18197 + }, + { + "epoch": 0.6058229194363202, + "grad_norm": 1.2230308055877686, + "learning_rate": 1.907914431855156e-05, + "loss": 1.4391, + "step": 18228 + }, + { + "epoch": 0.6068532305237969, + "grad_norm": 1.2785223722457886, + "learning_rate": 1.8997584754626412e-05, + "loss": 1.4152, + "step": 18259 + }, + { + "epoch": 0.6078835416112736, + "grad_norm": 1.3152620792388916, + "learning_rate": 1.8916092927141486e-05, + "loss": 1.4137, + "step": 18290 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3583670324133626e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-18312/training_args.bin b/checkpoint-18312/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..974208468b82a3c5684aaa384776477cf21c18ca --- /dev/null +++ b/checkpoint-18312/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a23be0ff07d6d3142f7c0980f91dddba845519c24fcb411cbb4b9ddb1513ff +size 5304 diff --git a/checkpoint-21364/config.json b/checkpoint-21364/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aaa74176892d42e1c7f5979b7ddf8ab15985d3 --- /dev/null +++ b/checkpoint-21364/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "/mnt/parscratch/users/acp23ay/private/models/Llama-3.1-8B-Instruct-ta-madlad-mean/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 138256 +} diff --git a/checkpoint-21364/generation_config.json b/checkpoint-21364/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-21364/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-21364/model-00001-of-00007.safetensors b/checkpoint-21364/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..609503b183dfc75af79790a1bf0690e514f64787 --- /dev/null +++ b/checkpoint-21364/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c8ea81152baa18da53fb7a56880bb8364fd5199a4d6623e2f74408a5fc25d11 +size 4983197184 diff --git a/checkpoint-21364/model-00002-of-00007.safetensors b/checkpoint-21364/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d32d83cb96a109af411b9cf577e7fbfe07ea76fc --- /dev/null +++ b/checkpoint-21364/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168629c67732309d49f60a5ec48a6d160212bc987365e82e183dfbf74ba0c1f3 +size 4899116432 diff --git a/checkpoint-21364/model-00003-of-00007.safetensors b/checkpoint-21364/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-21364/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-21364/model-00004-of-00007.safetensors b/checkpoint-21364/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-21364/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-21364/model-00005-of-00007.safetensors b/checkpoint-21364/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-21364/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-21364/model-00006-of-00007.safetensors b/checkpoint-21364/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e98cf1738ecf055c021f242d83c52f152a8a3526 --- /dev/null +++ b/checkpoint-21364/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39f51390ce8c40ca8f432ed03d714b134779fafdcc041db552f79cdb9ddbf365 +size 4999813120 diff --git a/checkpoint-21364/model-00007-of-00007.safetensors b/checkpoint-21364/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bdb0f5d063fd46fef621927548b18ab31f2961e8 --- /dev/null +++ b/checkpoint-21364/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7ab099bb28b949df14f0dbe7c838a4e500023d70707c846f18379d0eaa24107 +size 2734998184 diff --git a/checkpoint-21364/model.safetensors.index.json b/checkpoint-21364/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..318803c6a3dd771c7f7c3b8038a896af7c8322ae --- /dev/null +++ b/checkpoint-21364/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32448724992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-21364/optimizer.pt b/checkpoint-21364/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..846528866864a904dadcebd847d45df71832f275 --- /dev/null +++ b/checkpoint-21364/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7239a9b214bfea5b03f206e53953b001fbe6209063fd48176f2025c774c8a4ac +size 16040396334 diff --git a/checkpoint-21364/rng_state.pth b/checkpoint-21364/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-21364/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-21364/scheduler.pt b/checkpoint-21364/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..17783d26dc88c55a75e7564f8dcbad9eacfa9913 --- /dev/null +++ b/checkpoint-21364/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2827eb82750c76bd3279b469098a24605426f9a47a96b155384bcef2e3f4fe20 +size 1064 diff --git a/checkpoint-21364/trainer_state.json b/checkpoint-21364/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..180f353f17b6b861654d5a9d07dcc7f1a591b2a7 --- /dev/null +++ b/checkpoint-21364/trainer_state.json @@ -0,0 +1,4856 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7100505184791279, + "eval_steps": 500, + "global_step": 21364, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001030311087476735, + "grad_norm": 60.25547409057617, + "learning_rate": 1.0157273918741808e-06, + "loss": 8.8455, + "step": 31 + }, + { + "epoch": 0.00206062217495347, + "grad_norm": 15.669363975524902, + "learning_rate": 2.0314547837483616e-06, + "loss": 7.1553, + "step": 62 + }, + { + "epoch": 0.003090933262430205, + "grad_norm": 15.366345405578613, + "learning_rate": 3.0471821756225426e-06, + "loss": 5.8784, + "step": 93 + }, + { + "epoch": 0.00412124434990694, + "grad_norm": 36.30561828613281, + "learning_rate": 4.062909567496723e-06, + "loss": 4.7708, + "step": 124 + }, + { + "epoch": 0.005151555437383675, + "grad_norm": 27.202678680419922, + "learning_rate": 5.078636959370905e-06, + "loss": 4.1629, + "step": 155 + }, + { + "epoch": 0.00618186652486041, + "grad_norm": 24.30484962463379, + "learning_rate": 6.094364351245085e-06, + "loss": 3.867, + "step": 186 + }, + { + "epoch": 0.007212177612337145, + "grad_norm": 19.916366577148438, + "learning_rate": 7.110091743119267e-06, + "loss": 3.6131, + "step": 217 + }, + { + "epoch": 0.00824248869981388, + "grad_norm": 17.577274322509766, + "learning_rate": 8.125819134993446e-06, + "loss": 3.4772, + "step": 248 + }, + { + "epoch": 0.009272799787290615, + "grad_norm": 12.133190155029297, + "learning_rate": 9.141546526867629e-06, + "loss": 3.3218, + "step": 279 + }, + { + "epoch": 0.01030311087476735, + "grad_norm": 19.79263687133789, + "learning_rate": 1.015727391874181e-05, + "loss": 3.2055, + "step": 310 + }, + { + "epoch": 0.011333421962244085, + "grad_norm": 16.38133430480957, + "learning_rate": 1.117300131061599e-05, + "loss": 3.1062, + "step": 341 + }, + { + "epoch": 0.01236373304972082, + "grad_norm": 12.638299942016602, + "learning_rate": 1.218872870249017e-05, + "loss": 3.0106, + "step": 372 + }, + { + "epoch": 0.013394044137197554, + "grad_norm": 9.46596908569336, + "learning_rate": 1.3204456094364351e-05, + "loss": 2.924, + "step": 403 + }, + { + "epoch": 0.01442435522467429, + "grad_norm": 10.945392608642578, + "learning_rate": 1.4220183486238533e-05, + "loss": 2.844, + "step": 434 + }, + { + "epoch": 0.015454666312151024, + "grad_norm": 8.474015235900879, + "learning_rate": 1.5235910878112714e-05, + "loss": 2.7892, + "step": 465 + }, + { + "epoch": 0.01648497739962776, + "grad_norm": 9.370804786682129, + "learning_rate": 1.6251638269986893e-05, + "loss": 2.7509, + "step": 496 + }, + { + "epoch": 0.017515288487104493, + "grad_norm": 11.63398551940918, + "learning_rate": 1.7267365661861077e-05, + "loss": 2.6999, + "step": 527 + }, + { + "epoch": 0.01854559957458123, + "grad_norm": 9.17713451385498, + "learning_rate": 1.8283093053735257e-05, + "loss": 2.6459, + "step": 558 + }, + { + "epoch": 0.019575910662057962, + "grad_norm": 7.119054794311523, + "learning_rate": 1.9298820445609438e-05, + "loss": 2.603, + "step": 589 + }, + { + "epoch": 0.0206062217495347, + "grad_norm": 6.653646945953369, + "learning_rate": 2.031454783748362e-05, + "loss": 2.5588, + "step": 620 + }, + { + "epoch": 0.021636532837011432, + "grad_norm": 8.332653045654297, + "learning_rate": 2.13302752293578e-05, + "loss": 2.5357, + "step": 651 + }, + { + "epoch": 0.02266684392448817, + "grad_norm": 6.4949116706848145, + "learning_rate": 2.234600262123198e-05, + "loss": 2.4967, + "step": 682 + }, + { + "epoch": 0.023697155011964902, + "grad_norm": 9.41009521484375, + "learning_rate": 2.336173001310616e-05, + "loss": 2.4563, + "step": 713 + }, + { + "epoch": 0.02472746609944164, + "grad_norm": 7.840345859527588, + "learning_rate": 2.437745740498034e-05, + "loss": 2.4383, + "step": 744 + }, + { + "epoch": 0.025757777186918372, + "grad_norm": 6.116458415985107, + "learning_rate": 2.5393184796854525e-05, + "loss": 2.3817, + "step": 775 + }, + { + "epoch": 0.02678808827439511, + "grad_norm": 5.938300609588623, + "learning_rate": 2.6408912188728702e-05, + "loss": 2.3508, + "step": 806 + }, + { + "epoch": 0.027818399361871842, + "grad_norm": 5.4408345222473145, + "learning_rate": 2.7424639580602886e-05, + "loss": 2.3325, + "step": 837 + }, + { + "epoch": 0.02884871044934858, + "grad_norm": 5.375136375427246, + "learning_rate": 2.8440366972477066e-05, + "loss": 2.3101, + "step": 868 + }, + { + "epoch": 0.029879021536825312, + "grad_norm": 5.149726867675781, + "learning_rate": 2.9456094364351244e-05, + "loss": 2.282, + "step": 899 + }, + { + "epoch": 0.03090933262430205, + "grad_norm": 4.591221332550049, + "learning_rate": 3.0471821756225428e-05, + "loss": 2.2427, + "step": 930 + }, + { + "epoch": 0.031939643711778785, + "grad_norm": 4.977034091949463, + "learning_rate": 3.148754914809961e-05, + "loss": 2.2218, + "step": 961 + }, + { + "epoch": 0.03296995479925552, + "grad_norm": 5.038781642913818, + "learning_rate": 3.2503276539973785e-05, + "loss": 2.2044, + "step": 992 + }, + { + "epoch": 0.03400026588673225, + "grad_norm": 4.872281551361084, + "learning_rate": 3.351900393184797e-05, + "loss": 2.1657, + "step": 1023 + }, + { + "epoch": 0.035030576974208985, + "grad_norm": 4.370841979980469, + "learning_rate": 3.453473132372215e-05, + "loss": 2.1365, + "step": 1054 + }, + { + "epoch": 0.036060888061685725, + "grad_norm": 4.087072849273682, + "learning_rate": 3.555045871559633e-05, + "loss": 2.1253, + "step": 1085 + }, + { + "epoch": 0.03709119914916246, + "grad_norm": 4.113957405090332, + "learning_rate": 3.6566186107470514e-05, + "loss": 2.0973, + "step": 1116 + }, + { + "epoch": 0.03812151023663919, + "grad_norm": 4.0119733810424805, + "learning_rate": 3.7581913499344695e-05, + "loss": 2.1024, + "step": 1147 + }, + { + "epoch": 0.039151821324115925, + "grad_norm": 4.247573375701904, + "learning_rate": 3.8597640891218876e-05, + "loss": 2.0722, + "step": 1178 + }, + { + "epoch": 0.04018213241159266, + "grad_norm": 3.5575129985809326, + "learning_rate": 3.9613368283093056e-05, + "loss": 2.056, + "step": 1209 + }, + { + "epoch": 0.0412124434990694, + "grad_norm": 3.8885862827301025, + "learning_rate": 4.062909567496724e-05, + "loss": 2.0389, + "step": 1240 + }, + { + "epoch": 0.04224275458654613, + "grad_norm": 3.680628538131714, + "learning_rate": 4.164482306684142e-05, + "loss": 2.0385, + "step": 1271 + }, + { + "epoch": 0.043273065674022865, + "grad_norm": 3.780876874923706, + "learning_rate": 4.26605504587156e-05, + "loss": 2.0097, + "step": 1302 + }, + { + "epoch": 0.0443033767614996, + "grad_norm": 4.235328674316406, + "learning_rate": 4.367627785058978e-05, + "loss": 2.0024, + "step": 1333 + }, + { + "epoch": 0.04533368784897634, + "grad_norm": 3.326941967010498, + "learning_rate": 4.469200524246396e-05, + "loss": 1.9953, + "step": 1364 + }, + { + "epoch": 0.04636399893645307, + "grad_norm": 3.28456449508667, + "learning_rate": 4.570773263433814e-05, + "loss": 1.9579, + "step": 1395 + }, + { + "epoch": 0.047394310023929805, + "grad_norm": 16.107433319091797, + "learning_rate": 4.672346002621232e-05, + "loss": 1.9701, + "step": 1426 + }, + { + "epoch": 0.04842462111140654, + "grad_norm": 3.5708224773406982, + "learning_rate": 4.77391874180865e-05, + "loss": 1.9621, + "step": 1457 + }, + { + "epoch": 0.04945493219888328, + "grad_norm": 2.9053499698638916, + "learning_rate": 4.875491480996068e-05, + "loss": 1.9458, + "step": 1488 + }, + { + "epoch": 0.05048524328636001, + "grad_norm": 3.0863258838653564, + "learning_rate": 4.977064220183487e-05, + "loss": 1.9483, + "step": 1519 + }, + { + "epoch": 0.051515554373836744, + "grad_norm": 2.9012269973754883, + "learning_rate": 4.9999915451558777e-05, + "loss": 1.928, + "step": 1550 + }, + { + "epoch": 0.05254586546131348, + "grad_norm": 3.0949041843414307, + "learning_rate": 4.999955597496219e-05, + "loss": 1.9229, + "step": 1581 + }, + { + "epoch": 0.05357617654879022, + "grad_norm": 2.8687901496887207, + "learning_rate": 4.9998914381774255e-05, + "loss": 1.915, + "step": 1612 + }, + { + "epoch": 0.05460648763626695, + "grad_norm": 3.2136878967285156, + "learning_rate": 4.999799067923527e-05, + "loss": 1.9197, + "step": 1643 + }, + { + "epoch": 0.055636798723743684, + "grad_norm": 2.590843677520752, + "learning_rate": 4.999678487776908e-05, + "loss": 1.8756, + "step": 1674 + }, + { + "epoch": 0.05666710981122042, + "grad_norm": 2.64634108543396, + "learning_rate": 4.9995296990983006e-05, + "loss": 1.9033, + "step": 1705 + }, + { + "epoch": 0.05769742089869716, + "grad_norm": 3.0151331424713135, + "learning_rate": 4.999352703566763e-05, + "loss": 1.8883, + "step": 1736 + }, + { + "epoch": 0.05872773198617389, + "grad_norm": 2.526806354522705, + "learning_rate": 4.999147503179668e-05, + "loss": 1.8666, + "step": 1767 + }, + { + "epoch": 0.059758043073650624, + "grad_norm": 2.510300397872925, + "learning_rate": 4.998914100252672e-05, + "loss": 1.854, + "step": 1798 + }, + { + "epoch": 0.06078835416112736, + "grad_norm": 2.4867682456970215, + "learning_rate": 4.998652497419696e-05, + "loss": 1.8548, + "step": 1829 + }, + { + "epoch": 0.0618186652486041, + "grad_norm": 2.3920586109161377, + "learning_rate": 4.9983626976328927e-05, + "loss": 1.8495, + "step": 1860 + }, + { + "epoch": 0.06284897633608083, + "grad_norm": 2.714177370071411, + "learning_rate": 4.998044704162613e-05, + "loss": 1.8433, + "step": 1891 + }, + { + "epoch": 0.06387928742355757, + "grad_norm": 2.3094465732574463, + "learning_rate": 4.9976985205973705e-05, + "loss": 1.8382, + "step": 1922 + }, + { + "epoch": 0.0649095985110343, + "grad_norm": 2.47184419631958, + "learning_rate": 4.997324150843799e-05, + "loss": 1.8464, + "step": 1953 + }, + { + "epoch": 0.06593990959851104, + "grad_norm": 2.391841411590576, + "learning_rate": 4.99692159912661e-05, + "loss": 1.8179, + "step": 1984 + }, + { + "epoch": 0.06697022068598776, + "grad_norm": 2.2471864223480225, + "learning_rate": 4.996490869988546e-05, + "loss": 1.8149, + "step": 2015 + }, + { + "epoch": 0.0680005317734645, + "grad_norm": 2.5497376918792725, + "learning_rate": 4.996031968290326e-05, + "loss": 1.8099, + "step": 2046 + }, + { + "epoch": 0.06903084286094124, + "grad_norm": 2.330463409423828, + "learning_rate": 4.995544899210594e-05, + "loss": 1.8267, + "step": 2077 + }, + { + "epoch": 0.07006115394841797, + "grad_norm": 2.3259341716766357, + "learning_rate": 4.9950296682458583e-05, + "loss": 1.7801, + "step": 2108 + }, + { + "epoch": 0.07109146503589471, + "grad_norm": 2.1711952686309814, + "learning_rate": 4.994486281210429e-05, + "loss": 1.7961, + "step": 2139 + }, + { + "epoch": 0.07212177612337145, + "grad_norm": 2.1808884143829346, + "learning_rate": 4.9939147442363566e-05, + "loss": 1.8109, + "step": 2170 + }, + { + "epoch": 0.07315208721084818, + "grad_norm": 2.089256525039673, + "learning_rate": 4.9933150637733574e-05, + "loss": 1.8026, + "step": 2201 + }, + { + "epoch": 0.07418239829832492, + "grad_norm": 2.0864951610565186, + "learning_rate": 4.992687246588743e-05, + "loss": 1.7753, + "step": 2232 + }, + { + "epoch": 0.07521270938580164, + "grad_norm": 2.36157488822937, + "learning_rate": 4.992031299767347e-05, + "loss": 1.7746, + "step": 2263 + }, + { + "epoch": 0.07624302047327838, + "grad_norm": 2.5334439277648926, + "learning_rate": 4.9913472307114386e-05, + "loss": 1.7927, + "step": 2294 + }, + { + "epoch": 0.07727333156075512, + "grad_norm": 2.2565715312957764, + "learning_rate": 4.9906350471406446e-05, + "loss": 1.7668, + "step": 2325 + }, + { + "epoch": 0.07830364264823185, + "grad_norm": 2.1043128967285156, + "learning_rate": 4.989894757091861e-05, + "loss": 1.7771, + "step": 2356 + }, + { + "epoch": 0.07933395373570859, + "grad_norm": 1.9659819602966309, + "learning_rate": 4.989126368919158e-05, + "loss": 1.7666, + "step": 2387 + }, + { + "epoch": 0.08036426482318532, + "grad_norm": 2.0778403282165527, + "learning_rate": 4.988329891293693e-05, + "loss": 1.7405, + "step": 2418 + }, + { + "epoch": 0.08139457591066206, + "grad_norm": 2.1767923831939697, + "learning_rate": 4.987505333203608e-05, + "loss": 1.7495, + "step": 2449 + }, + { + "epoch": 0.0824248869981388, + "grad_norm": 2.260143280029297, + "learning_rate": 4.9866527039539276e-05, + "loss": 1.7504, + "step": 2480 + }, + { + "epoch": 0.08345519808561552, + "grad_norm": 2.18271803855896, + "learning_rate": 4.9857720131664594e-05, + "loss": 1.7456, + "step": 2511 + }, + { + "epoch": 0.08448550917309226, + "grad_norm": 2.209594964981079, + "learning_rate": 4.9848632707796773e-05, + "loss": 1.7528, + "step": 2542 + }, + { + "epoch": 0.085515820260569, + "grad_norm": 2.0666229724884033, + "learning_rate": 4.9839264870486155e-05, + "loss": 1.7517, + "step": 2573 + }, + { + "epoch": 0.08654613134804573, + "grad_norm": 2.1070454120635986, + "learning_rate": 4.9829616725447526e-05, + "loss": 1.7474, + "step": 2604 + }, + { + "epoch": 0.08757644243552247, + "grad_norm": 1.9430303573608398, + "learning_rate": 4.981968838155888e-05, + "loss": 1.7348, + "step": 2635 + }, + { + "epoch": 0.0886067535229992, + "grad_norm": 1.9638925790786743, + "learning_rate": 4.980947995086024e-05, + "loss": 1.7202, + "step": 2666 + }, + { + "epoch": 0.08963706461047594, + "grad_norm": 1.8845652341842651, + "learning_rate": 4.979899154855234e-05, + "loss": 1.7375, + "step": 2697 + }, + { + "epoch": 0.09066737569795268, + "grad_norm": 5.712058067321777, + "learning_rate": 4.9788223292995386e-05, + "loss": 1.7379, + "step": 2728 + }, + { + "epoch": 0.0916976867854294, + "grad_norm": 1.9520670175552368, + "learning_rate": 4.977717530570768e-05, + "loss": 1.7302, + "step": 2759 + }, + { + "epoch": 0.09272799787290614, + "grad_norm": 1.8802224397659302, + "learning_rate": 4.976584771136425e-05, + "loss": 1.74, + "step": 2790 + }, + { + "epoch": 0.09375830896038288, + "grad_norm": 2.1098153591156006, + "learning_rate": 4.975424063779547e-05, + "loss": 1.7024, + "step": 2821 + }, + { + "epoch": 0.09478862004785961, + "grad_norm": 2.1568291187286377, + "learning_rate": 4.974235421598557e-05, + "loss": 1.7131, + "step": 2852 + }, + { + "epoch": 0.09581893113533635, + "grad_norm": 1.8769980669021606, + "learning_rate": 4.973018858007122e-05, + "loss": 1.7008, + "step": 2883 + }, + { + "epoch": 0.09684924222281308, + "grad_norm": 1.8325533866882324, + "learning_rate": 4.9717743867339963e-05, + "loss": 1.7058, + "step": 2914 + }, + { + "epoch": 0.09787955331028982, + "grad_norm": 2.086416721343994, + "learning_rate": 4.9705020218228695e-05, + "loss": 1.711, + "step": 2945 + }, + { + "epoch": 0.09890986439776656, + "grad_norm": 1.8294793367385864, + "learning_rate": 4.969201777632205e-05, + "loss": 1.6998, + "step": 2976 + }, + { + "epoch": 0.09994017548524328, + "grad_norm": 2.0608153343200684, + "learning_rate": 4.9678736688350846e-05, + "loss": 1.6948, + "step": 3007 + }, + { + "epoch": 0.10097048657272002, + "grad_norm": 3.2166008949279785, + "learning_rate": 4.966517710419033e-05, + "loss": 1.6788, + "step": 3038 + }, + { + "epoch": 0.10200079766019676, + "grad_norm": 1.9431313276290894, + "learning_rate": 4.965133917685858e-05, + "loss": 1.7115, + "step": 3069 + }, + { + "epoch": 0.10303110874767349, + "grad_norm": 1.967512845993042, + "learning_rate": 4.9637223062514714e-05, + "loss": 1.7033, + "step": 3100 + }, + { + "epoch": 0.10406141983515023, + "grad_norm": 1.9253389835357666, + "learning_rate": 4.962282892045718e-05, + "loss": 1.6856, + "step": 3131 + }, + { + "epoch": 0.10509173092262696, + "grad_norm": 1.986840009689331, + "learning_rate": 4.9608156913121904e-05, + "loss": 1.723, + "step": 3162 + }, + { + "epoch": 0.1061220420101037, + "grad_norm": 1.83523690700531, + "learning_rate": 4.959320720608049e-05, + "loss": 1.6912, + "step": 3193 + }, + { + "epoch": 0.10715235309758044, + "grad_norm": 2.1271955966949463, + "learning_rate": 4.9577979968038354e-05, + "loss": 1.7032, + "step": 3224 + }, + { + "epoch": 0.10818266418505716, + "grad_norm": 1.8383768796920776, + "learning_rate": 4.956247537083282e-05, + "loss": 1.6726, + "step": 3255 + }, + { + "epoch": 0.1092129752725339, + "grad_norm": 1.8806651830673218, + "learning_rate": 4.9546693589431145e-05, + "loss": 1.6817, + "step": 3286 + }, + { + "epoch": 0.11024328636001064, + "grad_norm": 1.7535260915756226, + "learning_rate": 4.9530634801928595e-05, + "loss": 1.6875, + "step": 3317 + }, + { + "epoch": 0.11127359744748737, + "grad_norm": 1.765906810760498, + "learning_rate": 4.9514299189546395e-05, + "loss": 1.6859, + "step": 3348 + }, + { + "epoch": 0.11230390853496411, + "grad_norm": 1.869828462600708, + "learning_rate": 4.949768693662973e-05, + "loss": 1.6915, + "step": 3379 + }, + { + "epoch": 0.11333421962244083, + "grad_norm": 1.8347504138946533, + "learning_rate": 4.948079823064559e-05, + "loss": 1.6859, + "step": 3410 + }, + { + "epoch": 0.11436453070991758, + "grad_norm": 1.7692474126815796, + "learning_rate": 4.946363326218074e-05, + "loss": 1.6565, + "step": 3441 + }, + { + "epoch": 0.11539484179739432, + "grad_norm": 1.8231885433197021, + "learning_rate": 4.9446192224939525e-05, + "loss": 1.686, + "step": 3472 + }, + { + "epoch": 0.11642515288487104, + "grad_norm": 1.7155958414077759, + "learning_rate": 4.942847531574167e-05, + "loss": 1.6538, + "step": 3503 + }, + { + "epoch": 0.11745546397234778, + "grad_norm": 1.787183403968811, + "learning_rate": 4.941048273452008e-05, + "loss": 1.6776, + "step": 3534 + }, + { + "epoch": 0.11848577505982451, + "grad_norm": 1.741213083267212, + "learning_rate": 4.9392214684318605e-05, + "loss": 1.6784, + "step": 3565 + }, + { + "epoch": 0.11951608614730125, + "grad_norm": 1.7836824655532837, + "learning_rate": 4.93736713712897e-05, + "loss": 1.6557, + "step": 3596 + }, + { + "epoch": 0.12054639723477799, + "grad_norm": 1.7103859186172485, + "learning_rate": 4.9354853004692124e-05, + "loss": 1.6606, + "step": 3627 + }, + { + "epoch": 0.12157670832225471, + "grad_norm": 1.7865506410598755, + "learning_rate": 4.93357597968886e-05, + "loss": 1.6409, + "step": 3658 + }, + { + "epoch": 0.12260701940973145, + "grad_norm": 1.7770143747329712, + "learning_rate": 4.931639196334338e-05, + "loss": 1.6574, + "step": 3689 + }, + { + "epoch": 0.1236373304972082, + "grad_norm": 1.857575535774231, + "learning_rate": 4.9296749722619826e-05, + "loss": 1.6724, + "step": 3720 + }, + { + "epoch": 0.12466764158468492, + "grad_norm": 1.8742581605911255, + "learning_rate": 4.9276833296377966e-05, + "loss": 1.6506, + "step": 3751 + }, + { + "epoch": 0.12569795267216166, + "grad_norm": 1.827668309211731, + "learning_rate": 4.925664290937196e-05, + "loss": 1.6523, + "step": 3782 + }, + { + "epoch": 0.1267282637596384, + "grad_norm": 1.7517486810684204, + "learning_rate": 4.9236178789447576e-05, + "loss": 1.6459, + "step": 3813 + }, + { + "epoch": 0.12775857484711514, + "grad_norm": 1.8109570741653442, + "learning_rate": 4.921544116753962e-05, + "loss": 1.6614, + "step": 3844 + }, + { + "epoch": 0.12878888593459187, + "grad_norm": 1.692597508430481, + "learning_rate": 4.919443027766935e-05, + "loss": 1.6431, + "step": 3875 + }, + { + "epoch": 0.1298191970220686, + "grad_norm": 1.8650025129318237, + "learning_rate": 4.91731463569418e-05, + "loss": 1.6466, + "step": 3906 + }, + { + "epoch": 0.13084950810954532, + "grad_norm": 1.6794081926345825, + "learning_rate": 4.915158964554312e-05, + "loss": 1.6504, + "step": 3937 + }, + { + "epoch": 0.13187981919702207, + "grad_norm": 1.7685374021530151, + "learning_rate": 4.912976038673786e-05, + "loss": 1.6446, + "step": 3968 + }, + { + "epoch": 0.1329101302844988, + "grad_norm": 1.7601110935211182, + "learning_rate": 4.9107658826866254e-05, + "loss": 1.631, + "step": 3999 + }, + { + "epoch": 0.13394044137197553, + "grad_norm": 2.0616064071655273, + "learning_rate": 4.908528521534139e-05, + "loss": 1.6476, + "step": 4030 + }, + { + "epoch": 0.13497075245945228, + "grad_norm": 1.8973504304885864, + "learning_rate": 4.906263980464644e-05, + "loss": 1.6582, + "step": 4061 + }, + { + "epoch": 0.136001063546929, + "grad_norm": 1.7768895626068115, + "learning_rate": 4.903972285033178e-05, + "loss": 1.6159, + "step": 4092 + }, + { + "epoch": 0.13703137463440573, + "grad_norm": 1.8264424800872803, + "learning_rate": 4.901653461101213e-05, + "loss": 1.6289, + "step": 4123 + }, + { + "epoch": 0.1380616857218825, + "grad_norm": 1.7140119075775146, + "learning_rate": 4.8993075348363626e-05, + "loss": 1.6357, + "step": 4154 + }, + { + "epoch": 0.13909199680935921, + "grad_norm": 1.6964486837387085, + "learning_rate": 4.896934532712084e-05, + "loss": 1.6233, + "step": 4185 + }, + { + "epoch": 0.14012230789683594, + "grad_norm": 1.8008025884628296, + "learning_rate": 4.8945344815073846e-05, + "loss": 1.637, + "step": 4216 + }, + { + "epoch": 0.1411526189843127, + "grad_norm": 1.562730073928833, + "learning_rate": 4.892107408306516e-05, + "loss": 1.6379, + "step": 4247 + }, + { + "epoch": 0.14218293007178942, + "grad_norm": 1.8273371458053589, + "learning_rate": 4.889653340498669e-05, + "loss": 1.6246, + "step": 4278 + }, + { + "epoch": 0.14321324115926615, + "grad_norm": 56.33716583251953, + "learning_rate": 4.8871723057776664e-05, + "loss": 1.6457, + "step": 4309 + }, + { + "epoch": 0.1442435522467429, + "grad_norm": 1.746523380279541, + "learning_rate": 4.8846643321416476e-05, + "loss": 1.6343, + "step": 4340 + }, + { + "epoch": 0.14527386333421963, + "grad_norm": 1.7737531661987305, + "learning_rate": 4.882129447892753e-05, + "loss": 1.6447, + "step": 4371 + }, + { + "epoch": 0.14630417442169635, + "grad_norm": 1.660485863685608, + "learning_rate": 4.8795676816368076e-05, + "loss": 1.6192, + "step": 4402 + }, + { + "epoch": 0.14733448550917308, + "grad_norm": 1.6823406219482422, + "learning_rate": 4.876979062282995e-05, + "loss": 1.6253, + "step": 4433 + }, + { + "epoch": 0.14836479659664983, + "grad_norm": 7.78139066696167, + "learning_rate": 4.8743636190435325e-05, + "loss": 1.6234, + "step": 4464 + }, + { + "epoch": 0.14939510768412656, + "grad_norm": 1.7426058053970337, + "learning_rate": 4.871721381433344e-05, + "loss": 1.6337, + "step": 4495 + }, + { + "epoch": 0.1504254187716033, + "grad_norm": 1.6294783353805542, + "learning_rate": 4.869052379269719e-05, + "loss": 1.6217, + "step": 4526 + }, + { + "epoch": 0.15145572985908004, + "grad_norm": 1.6523306369781494, + "learning_rate": 4.866356642671985e-05, + "loss": 1.605, + "step": 4557 + }, + { + "epoch": 0.15248604094655677, + "grad_norm": 1.8571300506591797, + "learning_rate": 4.8636342020611634e-05, + "loss": 1.6218, + "step": 4588 + }, + { + "epoch": 0.1535163520340335, + "grad_norm": 1.7754936218261719, + "learning_rate": 4.860885088159626e-05, + "loss": 1.6171, + "step": 4619 + }, + { + "epoch": 0.15454666312151025, + "grad_norm": 1.91987943649292, + "learning_rate": 4.858109331990751e-05, + "loss": 1.6167, + "step": 4650 + }, + { + "epoch": 0.15557697420898697, + "grad_norm": 1.5994452238082886, + "learning_rate": 4.855306964878567e-05, + "loss": 1.5951, + "step": 4681 + }, + { + "epoch": 0.1566072852964637, + "grad_norm": 1.6490916013717651, + "learning_rate": 4.8524780184474084e-05, + "loss": 1.616, + "step": 4712 + }, + { + "epoch": 0.15763759638394045, + "grad_norm": 1.5921640396118164, + "learning_rate": 4.8496225246215496e-05, + "loss": 1.6346, + "step": 4743 + }, + { + "epoch": 0.15866790747141718, + "grad_norm": 1.6729261875152588, + "learning_rate": 4.8467405156248505e-05, + "loss": 1.6165, + "step": 4774 + }, + { + "epoch": 0.1596982185588939, + "grad_norm": 1.628113031387329, + "learning_rate": 4.843832023980392e-05, + "loss": 1.6119, + "step": 4805 + }, + { + "epoch": 0.16072852964637063, + "grad_norm": 1.651647925376892, + "learning_rate": 4.840897082510106e-05, + "loss": 1.5997, + "step": 4836 + }, + { + "epoch": 0.1617588407338474, + "grad_norm": 1.5297720432281494, + "learning_rate": 4.8379357243344084e-05, + "loss": 1.6242, + "step": 4867 + }, + { + "epoch": 0.1627891518213241, + "grad_norm": 1.5779869556427002, + "learning_rate": 4.8349479828718236e-05, + "loss": 1.6149, + "step": 4898 + }, + { + "epoch": 0.16381946290880084, + "grad_norm": 1.5843939781188965, + "learning_rate": 4.8319338918386075e-05, + "loss": 1.5926, + "step": 4929 + }, + { + "epoch": 0.1648497739962776, + "grad_norm": 2.3762106895446777, + "learning_rate": 4.828893485248369e-05, + "loss": 1.6108, + "step": 4960 + }, + { + "epoch": 0.16588008508375432, + "grad_norm": 1.5871953964233398, + "learning_rate": 4.825826797411682e-05, + "loss": 1.6103, + "step": 4991 + }, + { + "epoch": 0.16691039617123105, + "grad_norm": 1.5934125185012817, + "learning_rate": 4.822733862935702e-05, + "loss": 1.6091, + "step": 5022 + }, + { + "epoch": 0.1679407072587078, + "grad_norm": 1.6997628211975098, + "learning_rate": 4.819614716723775e-05, + "loss": 1.6098, + "step": 5053 + }, + { + "epoch": 0.16897101834618453, + "grad_norm": 1.682849645614624, + "learning_rate": 4.8164693939750425e-05, + "loss": 1.599, + "step": 5084 + }, + { + "epoch": 0.17000132943366125, + "grad_norm": 1.709743857383728, + "learning_rate": 4.813297930184042e-05, + "loss": 1.6194, + "step": 5115 + }, + { + "epoch": 0.171031640521138, + "grad_norm": 1.725879430770874, + "learning_rate": 4.810100361140314e-05, + "loss": 1.6115, + "step": 5146 + }, + { + "epoch": 0.17206195160861473, + "grad_norm": 1.6710290908813477, + "learning_rate": 4.8068767229279885e-05, + "loss": 1.6032, + "step": 5177 + }, + { + "epoch": 0.17309226269609146, + "grad_norm": 1.6156634092330933, + "learning_rate": 4.8036270519253854e-05, + "loss": 1.5973, + "step": 5208 + }, + { + "epoch": 0.1741225737835682, + "grad_norm": 1.5654059648513794, + "learning_rate": 4.8003513848046e-05, + "loss": 1.5817, + "step": 5239 + }, + { + "epoch": 0.17515288487104494, + "grad_norm": 1.5789822340011597, + "learning_rate": 4.79704975853109e-05, + "loss": 1.6138, + "step": 5270 + }, + { + "epoch": 0.17618319595852167, + "grad_norm": 1.6022037267684937, + "learning_rate": 4.793722210363262e-05, + "loss": 1.5998, + "step": 5301 + }, + { + "epoch": 0.1772135070459984, + "grad_norm": 1.5142741203308105, + "learning_rate": 4.7903687778520414e-05, + "loss": 1.6061, + "step": 5332 + }, + { + "epoch": 0.17824381813347515, + "grad_norm": 1.6454212665557861, + "learning_rate": 4.7869894988404593e-05, + "loss": 1.6063, + "step": 5363 + }, + { + "epoch": 0.17927412922095187, + "grad_norm": 1.5250823497772217, + "learning_rate": 4.783584411463221e-05, + "loss": 1.6038, + "step": 5394 + }, + { + "epoch": 0.1803044403084286, + "grad_norm": 1.5829335451126099, + "learning_rate": 4.780153554146274e-05, + "loss": 1.5949, + "step": 5425 + }, + { + "epoch": 0.18133475139590535, + "grad_norm": 1.5342432260513306, + "learning_rate": 4.7766969656063766e-05, + "loss": 1.5913, + "step": 5456 + }, + { + "epoch": 0.18236506248338208, + "grad_norm": 1.6397250890731812, + "learning_rate": 4.773214684850662e-05, + "loss": 1.6102, + "step": 5487 + }, + { + "epoch": 0.1833953735708588, + "grad_norm": 1.5228471755981445, + "learning_rate": 4.769706751176193e-05, + "loss": 1.5885, + "step": 5518 + }, + { + "epoch": 0.18442568465833556, + "grad_norm": 1.6186103820800781, + "learning_rate": 4.7661732041695264e-05, + "loss": 1.6086, + "step": 5549 + }, + { + "epoch": 0.18545599574581229, + "grad_norm": 1.6024582386016846, + "learning_rate": 4.762614083706258e-05, + "loss": 1.6004, + "step": 5580 + }, + { + "epoch": 0.186486306833289, + "grad_norm": 1.5443711280822754, + "learning_rate": 4.759029429950581e-05, + "loss": 1.6048, + "step": 5611 + }, + { + "epoch": 0.18751661792076577, + "grad_norm": 1.4831629991531372, + "learning_rate": 4.7554192833548235e-05, + "loss": 1.5841, + "step": 5642 + }, + { + "epoch": 0.1885469290082425, + "grad_norm": 1.6426068544387817, + "learning_rate": 4.751783684659e-05, + "loss": 1.587, + "step": 5673 + }, + { + "epoch": 0.18957724009571922, + "grad_norm": 1.4609078168869019, + "learning_rate": 4.748122674890348e-05, + "loss": 1.5945, + "step": 5704 + }, + { + "epoch": 0.19060755118319597, + "grad_norm": 1.5365614891052246, + "learning_rate": 4.7444362953628654e-05, + "loss": 1.5737, + "step": 5735 + }, + { + "epoch": 0.1916378622706727, + "grad_norm": 1.5755670070648193, + "learning_rate": 4.7407245876768424e-05, + "loss": 1.5862, + "step": 5766 + }, + { + "epoch": 0.19266817335814942, + "grad_norm": 1.6469846963882446, + "learning_rate": 4.736987593718397e-05, + "loss": 1.5663, + "step": 5797 + }, + { + "epoch": 0.19369848444562615, + "grad_norm": 1.5927278995513916, + "learning_rate": 4.733225355658999e-05, + "loss": 1.5776, + "step": 5828 + }, + { + "epoch": 0.1947287955331029, + "grad_norm": 1.5593287944793701, + "learning_rate": 4.7294379159549926e-05, + "loss": 1.579, + "step": 5859 + }, + { + "epoch": 0.19575910662057963, + "grad_norm": 1.534055233001709, + "learning_rate": 4.725625317347119e-05, + "loss": 1.6017, + "step": 5890 + }, + { + "epoch": 0.19678941770805636, + "grad_norm": 1.5846387147903442, + "learning_rate": 4.7217876028600374e-05, + "loss": 1.5739, + "step": 5921 + }, + { + "epoch": 0.1978197287955331, + "grad_norm": 1.5377682447433472, + "learning_rate": 4.717924815801832e-05, + "loss": 1.57, + "step": 5952 + }, + { + "epoch": 0.19885003988300984, + "grad_norm": 1.467956781387329, + "learning_rate": 4.714036999763532e-05, + "loss": 1.5736, + "step": 5983 + }, + { + "epoch": 0.19988035097048656, + "grad_norm": 1.601070523262024, + "learning_rate": 4.7101241986186116e-05, + "loss": 1.5861, + "step": 6014 + }, + { + "epoch": 0.20091066205796332, + "grad_norm": 1.5051921606063843, + "learning_rate": 4.7061864565225e-05, + "loss": 1.5735, + "step": 6045 + }, + { + "epoch": 0.20194097314544004, + "grad_norm": 1.462843418121338, + "learning_rate": 4.702223817912081e-05, + "loss": 1.582, + "step": 6076 + }, + { + "epoch": 0.20297128423291677, + "grad_norm": 1.5698682069778442, + "learning_rate": 4.698236327505195e-05, + "loss": 1.5647, + "step": 6107 + }, + { + "epoch": 0.20400159532039353, + "grad_norm": 1.5633916854858398, + "learning_rate": 4.694224030300127e-05, + "loss": 1.5741, + "step": 6138 + }, + { + "epoch": 0.20503190640787025, + "grad_norm": 1.6174733638763428, + "learning_rate": 4.690186971575107e-05, + "loss": 1.5634, + "step": 6169 + }, + { + "epoch": 0.20606221749534698, + "grad_norm": 1.4957518577575684, + "learning_rate": 4.6861251968877916e-05, + "loss": 1.575, + "step": 6200 + }, + { + "epoch": 0.2070925285828237, + "grad_norm": 1.670933485031128, + "learning_rate": 4.68203875207476e-05, + "loss": 1.5792, + "step": 6231 + }, + { + "epoch": 0.20812283967030046, + "grad_norm": 1.5676430463790894, + "learning_rate": 4.677927683250983e-05, + "loss": 1.5689, + "step": 6262 + }, + { + "epoch": 0.20915315075777718, + "grad_norm": 1.5753976106643677, + "learning_rate": 4.6737920368093156e-05, + "loss": 1.5594, + "step": 6293 + }, + { + "epoch": 0.2101834618452539, + "grad_norm": 1.4973617792129517, + "learning_rate": 4.669631859419965e-05, + "loss": 1.5593, + "step": 6324 + }, + { + "epoch": 0.21121377293273066, + "grad_norm": 1.4691433906555176, + "learning_rate": 4.6654471980299676e-05, + "loss": 1.5711, + "step": 6355 + }, + { + "epoch": 0.2122440840202074, + "grad_norm": 1.407630443572998, + "learning_rate": 4.661238099862658e-05, + "loss": 1.5787, + "step": 6386 + }, + { + "epoch": 0.21327439510768412, + "grad_norm": 1.5011677742004395, + "learning_rate": 4.657004612417138e-05, + "loss": 1.5751, + "step": 6417 + }, + { + "epoch": 0.21430470619516087, + "grad_norm": 1.509750485420227, + "learning_rate": 4.6527467834677374e-05, + "loss": 1.5583, + "step": 6448 + }, + { + "epoch": 0.2153350172826376, + "grad_norm": 1.3919882774353027, + "learning_rate": 4.648464661063478e-05, + "loss": 1.5712, + "step": 6479 + }, + { + "epoch": 0.21636532837011432, + "grad_norm": 1.4854936599731445, + "learning_rate": 4.6441582935275264e-05, + "loss": 1.5637, + "step": 6510 + }, + { + "epoch": 0.21739563945759108, + "grad_norm": 1.4413583278656006, + "learning_rate": 4.6398277294566586e-05, + "loss": 1.56, + "step": 6541 + }, + { + "epoch": 0.2184259505450678, + "grad_norm": 1.5063883066177368, + "learning_rate": 4.6354730177207e-05, + "loss": 1.5525, + "step": 6572 + }, + { + "epoch": 0.21945626163254453, + "grad_norm": 1.4899688959121704, + "learning_rate": 4.6310942074619787e-05, + "loss": 1.5817, + "step": 6603 + }, + { + "epoch": 0.22048657272002128, + "grad_norm": 1.3927967548370361, + "learning_rate": 4.626691348094777e-05, + "loss": 1.5407, + "step": 6634 + }, + { + "epoch": 0.221516883807498, + "grad_norm": 1.5378398895263672, + "learning_rate": 4.622264489304762e-05, + "loss": 1.5561, + "step": 6665 + }, + { + "epoch": 0.22254719489497474, + "grad_norm": 1.554624319076538, + "learning_rate": 4.617813681048434e-05, + "loss": 1.5859, + "step": 6696 + }, + { + "epoch": 0.22357750598245146, + "grad_norm": 1.5356658697128296, + "learning_rate": 4.61333897355256e-05, + "loss": 1.5531, + "step": 6727 + }, + { + "epoch": 0.22460781706992822, + "grad_norm": 1.5534918308258057, + "learning_rate": 4.608840417313604e-05, + "loss": 1.5774, + "step": 6758 + }, + { + "epoch": 0.22563812815740494, + "grad_norm": 1.5660988092422485, + "learning_rate": 4.6043180630971646e-05, + "loss": 1.5763, + "step": 6789 + }, + { + "epoch": 0.22666843924488167, + "grad_norm": 1.4993386268615723, + "learning_rate": 4.599771961937391e-05, + "loss": 1.5615, + "step": 6820 + }, + { + "epoch": 0.22769875033235842, + "grad_norm": 1.4630553722381592, + "learning_rate": 4.5952021651364204e-05, + "loss": 1.543, + "step": 6851 + }, + { + "epoch": 0.22872906141983515, + "grad_norm": 1.470173954963684, + "learning_rate": 4.590608724263786e-05, + "loss": 1.5674, + "step": 6882 + }, + { + "epoch": 0.22975937250731188, + "grad_norm": 1.5867971181869507, + "learning_rate": 4.585991691155845e-05, + "loss": 1.5702, + "step": 6913 + }, + { + "epoch": 0.23078968359478863, + "grad_norm": 1.44207763671875, + "learning_rate": 4.581351117915188e-05, + "loss": 1.5436, + "step": 6944 + }, + { + "epoch": 0.23181999468226536, + "grad_norm": 1.4691039323806763, + "learning_rate": 4.5766870569100534e-05, + "loss": 1.5465, + "step": 6975 + }, + { + "epoch": 0.23285030576974208, + "grad_norm": 1.4807918071746826, + "learning_rate": 4.571999560773736e-05, + "loss": 1.5564, + "step": 7006 + }, + { + "epoch": 0.23388061685721884, + "grad_norm": 1.481487512588501, + "learning_rate": 4.5672886824039915e-05, + "loss": 1.5466, + "step": 7037 + }, + { + "epoch": 0.23491092794469556, + "grad_norm": 1.4518013000488281, + "learning_rate": 4.5625544749624435e-05, + "loss": 1.5618, + "step": 7068 + }, + { + "epoch": 0.2359412390321723, + "grad_norm": 1.4186676740646362, + "learning_rate": 4.5577969918739794e-05, + "loss": 1.5528, + "step": 7099 + }, + { + "epoch": 0.23697155011964902, + "grad_norm": 1.5287110805511475, + "learning_rate": 4.5530162868261486e-05, + "loss": 1.5457, + "step": 7130 + }, + { + "epoch": 0.23800186120712577, + "grad_norm": 1.5516417026519775, + "learning_rate": 4.548212413768558e-05, + "loss": 1.5467, + "step": 7161 + }, + { + "epoch": 0.2390321722946025, + "grad_norm": 1.4710053205490112, + "learning_rate": 4.543385426912261e-05, + "loss": 1.5431, + "step": 7192 + }, + { + "epoch": 0.24006248338207922, + "grad_norm": 1.5005567073822021, + "learning_rate": 4.53853538072915e-05, + "loss": 1.5592, + "step": 7223 + }, + { + "epoch": 0.24109279446955598, + "grad_norm": 1.5864965915679932, + "learning_rate": 4.533662329951336e-05, + "loss": 1.5694, + "step": 7254 + }, + { + "epoch": 0.2421231055570327, + "grad_norm": 1.4661896228790283, + "learning_rate": 4.528766329570536e-05, + "loss": 1.545, + "step": 7285 + }, + { + "epoch": 0.24315341664450943, + "grad_norm": 1.5157560110092163, + "learning_rate": 4.523847434837447e-05, + "loss": 1.5458, + "step": 7316 + }, + { + "epoch": 0.24418372773198618, + "grad_norm": 1.4033585786819458, + "learning_rate": 4.518905701261128e-05, + "loss": 1.5464, + "step": 7347 + }, + { + "epoch": 0.2452140388194629, + "grad_norm": 1.5357593297958374, + "learning_rate": 4.5139411846083715e-05, + "loss": 1.5497, + "step": 7378 + }, + { + "epoch": 0.24624434990693964, + "grad_norm": 1.419507384300232, + "learning_rate": 4.508953940903073e-05, + "loss": 1.5414, + "step": 7409 + }, + { + "epoch": 0.2472746609944164, + "grad_norm": 1.5201773643493652, + "learning_rate": 4.5039440264255994e-05, + "loss": 1.5503, + "step": 7440 + }, + { + "epoch": 0.24830497208189312, + "grad_norm": 1.8000444173812866, + "learning_rate": 4.498911497712155e-05, + "loss": 1.5448, + "step": 7471 + }, + { + "epoch": 0.24933528316936984, + "grad_norm": 1.4876810312271118, + "learning_rate": 4.493856411554142e-05, + "loss": 1.5524, + "step": 7502 + }, + { + "epoch": 0.25036559425684657, + "grad_norm": 1.5130078792572021, + "learning_rate": 4.4887788249975206e-05, + "loss": 1.5454, + "step": 7533 + }, + { + "epoch": 0.2513959053443233, + "grad_norm": 1.4829351902008057, + "learning_rate": 4.4836787953421656e-05, + "loss": 1.5407, + "step": 7564 + }, + { + "epoch": 0.2524262164318001, + "grad_norm": 1.521550178527832, + "learning_rate": 4.478556380141218e-05, + "loss": 1.5727, + "step": 7595 + }, + { + "epoch": 0.2534565275192768, + "grad_norm": 1.4377928972244263, + "learning_rate": 4.4734116372004375e-05, + "loss": 1.5432, + "step": 7626 + }, + { + "epoch": 0.25448683860675353, + "grad_norm": 1.4101744890213013, + "learning_rate": 4.4682446245775477e-05, + "loss": 1.547, + "step": 7657 + }, + { + "epoch": 0.2555171496942303, + "grad_norm": 1.522524356842041, + "learning_rate": 4.463055400581586e-05, + "loss": 1.5418, + "step": 7688 + }, + { + "epoch": 0.256547460781707, + "grad_norm": 1.4160797595977783, + "learning_rate": 4.4578440237722374e-05, + "loss": 1.5457, + "step": 7719 + }, + { + "epoch": 0.25757777186918374, + "grad_norm": 1.4106636047363281, + "learning_rate": 4.452610552959183e-05, + "loss": 1.5405, + "step": 7750 + }, + { + "epoch": 0.2586080829566605, + "grad_norm": 1.422723650932312, + "learning_rate": 4.447355047201428e-05, + "loss": 1.5423, + "step": 7781 + }, + { + "epoch": 0.2596383940441372, + "grad_norm": 1.4362592697143555, + "learning_rate": 4.4420775658066414e-05, + "loss": 1.5372, + "step": 7812 + }, + { + "epoch": 0.26066870513161394, + "grad_norm": 1.4319696426391602, + "learning_rate": 4.436778168330484e-05, + "loss": 1.5451, + "step": 7843 + }, + { + "epoch": 0.26169901621909064, + "grad_norm": 1.4069257974624634, + "learning_rate": 4.4314569145759353e-05, + "loss": 1.5221, + "step": 7874 + }, + { + "epoch": 0.2627293273065674, + "grad_norm": 1.4424949884414673, + "learning_rate": 4.42611386459262e-05, + "loss": 1.5419, + "step": 7905 + }, + { + "epoch": 0.26375963839404415, + "grad_norm": 1.4579105377197266, + "learning_rate": 4.420749078676133e-05, + "loss": 1.5116, + "step": 7936 + }, + { + "epoch": 0.26478994948152085, + "grad_norm": 1.4563167095184326, + "learning_rate": 4.4153626173673516e-05, + "loss": 1.5296, + "step": 7967 + }, + { + "epoch": 0.2658202605689976, + "grad_norm": 1.4440968036651611, + "learning_rate": 4.409954541451762e-05, + "loss": 1.5548, + "step": 7998 + }, + { + "epoch": 0.26685057165647436, + "grad_norm": 1.5711034536361694, + "learning_rate": 4.404524911958764e-05, + "loss": 1.535, + "step": 8029 + }, + { + "epoch": 0.26788088274395105, + "grad_norm": 1.5221564769744873, + "learning_rate": 4.399073790160989e-05, + "loss": 1.5495, + "step": 8060 + }, + { + "epoch": 0.2689111938314278, + "grad_norm": 1.392699956893921, + "learning_rate": 4.393601237573607e-05, + "loss": 1.546, + "step": 8091 + }, + { + "epoch": 0.26994150491890456, + "grad_norm": 1.5343137979507446, + "learning_rate": 4.388107315953628e-05, + "loss": 1.549, + "step": 8122 + }, + { + "epoch": 0.27097181600638126, + "grad_norm": 1.4483468532562256, + "learning_rate": 4.382592087299212e-05, + "loss": 1.5424, + "step": 8153 + }, + { + "epoch": 0.272002127093858, + "grad_norm": 1.4963489770889282, + "learning_rate": 4.377055613848964e-05, + "loss": 1.508, + "step": 8184 + }, + { + "epoch": 0.27303243818133477, + "grad_norm": 1.4839162826538086, + "learning_rate": 4.3714979580812355e-05, + "loss": 1.5203, + "step": 8215 + }, + { + "epoch": 0.27406274926881147, + "grad_norm": 1.4272018671035767, + "learning_rate": 4.365919182713416e-05, + "loss": 1.5264, + "step": 8246 + }, + { + "epoch": 0.2750930603562882, + "grad_norm": 1.3808270692825317, + "learning_rate": 4.360319350701226e-05, + "loss": 1.5255, + "step": 8277 + }, + { + "epoch": 0.276123371443765, + "grad_norm": 1.4179162979125977, + "learning_rate": 4.3546985252380115e-05, + "loss": 1.535, + "step": 8308 + }, + { + "epoch": 0.2771536825312417, + "grad_norm": 1.3617374897003174, + "learning_rate": 4.349056769754021e-05, + "loss": 1.5295, + "step": 8339 + }, + { + "epoch": 0.27818399361871843, + "grad_norm": 1.4745615720748901, + "learning_rate": 4.3433941479156994e-05, + "loss": 1.5438, + "step": 8370 + }, + { + "epoch": 0.2792143047061952, + "grad_norm": 1.3661375045776367, + "learning_rate": 4.3377107236249647e-05, + "loss": 1.5134, + "step": 8401 + }, + { + "epoch": 0.2802446157936719, + "grad_norm": 1.3907949924468994, + "learning_rate": 4.332006561018488e-05, + "loss": 1.5237, + "step": 8432 + }, + { + "epoch": 0.28127492688114863, + "grad_norm": 1.3575704097747803, + "learning_rate": 4.3262817244669683e-05, + "loss": 1.5226, + "step": 8463 + }, + { + "epoch": 0.2823052379686254, + "grad_norm": 1.3836462497711182, + "learning_rate": 4.3205362785744083e-05, + "loss": 1.5433, + "step": 8494 + }, + { + "epoch": 0.2833355490561021, + "grad_norm": 1.6108276844024658, + "learning_rate": 4.314770288177384e-05, + "loss": 1.5324, + "step": 8525 + }, + { + "epoch": 0.28436586014357884, + "grad_norm": 1.4650689363479614, + "learning_rate": 4.308983818344313e-05, + "loss": 1.535, + "step": 8556 + }, + { + "epoch": 0.2853961712310556, + "grad_norm": 1.5836583375930786, + "learning_rate": 4.3031769343747206e-05, + "loss": 1.5313, + "step": 8587 + }, + { + "epoch": 0.2864264823185323, + "grad_norm": 1.5348492860794067, + "learning_rate": 4.297349701798505e-05, + "loss": 1.5106, + "step": 8618 + }, + { + "epoch": 0.28745679340600905, + "grad_norm": 1.4060319662094116, + "learning_rate": 4.2915021863751916e-05, + "loss": 1.5283, + "step": 8649 + }, + { + "epoch": 0.2884871044934858, + "grad_norm": 1.531657099723816, + "learning_rate": 4.285634454093198e-05, + "loss": 1.5087, + "step": 8680 + }, + { + "epoch": 0.2895174155809625, + "grad_norm": 1.4756299257278442, + "learning_rate": 4.279746571169086e-05, + "loss": 1.5042, + "step": 8711 + }, + { + "epoch": 0.29054772666843925, + "grad_norm": 1.3221153020858765, + "learning_rate": 4.2738386040468136e-05, + "loss": 1.5244, + "step": 8742 + }, + { + "epoch": 0.29157803775591595, + "grad_norm": 1.4067268371582031, + "learning_rate": 4.2679106193969866e-05, + "loss": 1.5012, + "step": 8773 + }, + { + "epoch": 0.2926083488433927, + "grad_norm": 1.5192064046859741, + "learning_rate": 4.261962684116106e-05, + "loss": 1.521, + "step": 8804 + }, + { + "epoch": 0.29363865993086946, + "grad_norm": 1.3847788572311401, + "learning_rate": 4.2559948653258145e-05, + "loss": 1.5128, + "step": 8835 + }, + { + "epoch": 0.29466897101834616, + "grad_norm": 1.4612780809402466, + "learning_rate": 4.250007230372134e-05, + "loss": 1.5371, + "step": 8866 + }, + { + "epoch": 0.2956992821058229, + "grad_norm": 1.468971610069275, + "learning_rate": 4.2439998468247126e-05, + "loss": 1.5199, + "step": 8897 + }, + { + "epoch": 0.29672959319329967, + "grad_norm": 1.386236310005188, + "learning_rate": 4.2379727824760566e-05, + "loss": 1.5273, + "step": 8928 + }, + { + "epoch": 0.29775990428077637, + "grad_norm": 1.3843929767608643, + "learning_rate": 4.231926105340768e-05, + "loss": 1.5011, + "step": 8959 + }, + { + "epoch": 0.2987902153682531, + "grad_norm": 1.4554557800292969, + "learning_rate": 4.225859883654776e-05, + "loss": 1.5311, + "step": 8990 + }, + { + "epoch": 0.2998205264557299, + "grad_norm": 1.3674421310424805, + "learning_rate": 4.219774185874569e-05, + "loss": 1.5302, + "step": 9021 + }, + { + "epoch": 0.3008508375432066, + "grad_norm": 1.3804330825805664, + "learning_rate": 4.213669080676418e-05, + "loss": 1.538, + "step": 9052 + }, + { + "epoch": 0.3018811486306833, + "grad_norm": 1.4643255472183228, + "learning_rate": 4.2075446369556056e-05, + "loss": 1.5172, + "step": 9083 + }, + { + "epoch": 0.3029114597181601, + "grad_norm": 1.3375928401947021, + "learning_rate": 4.201400923825648e-05, + "loss": 1.5123, + "step": 9114 + }, + { + "epoch": 0.3039417708056368, + "grad_norm": 1.4321980476379395, + "learning_rate": 4.195238010617511e-05, + "loss": 1.5196, + "step": 9145 + }, + { + "epoch": 0.30497208189311353, + "grad_norm": 1.4312376976013184, + "learning_rate": 4.1890559668788344e-05, + "loss": 1.5138, + "step": 9176 + }, + { + "epoch": 0.3060023929805903, + "grad_norm": 1.3089646100997925, + "learning_rate": 4.1828548623731405e-05, + "loss": 1.5027, + "step": 9207 + }, + { + "epoch": 0.307032704068067, + "grad_norm": 1.4863250255584717, + "learning_rate": 4.1766347670790506e-05, + "loss": 1.5091, + "step": 9238 + }, + { + "epoch": 0.30806301515554374, + "grad_norm": 1.373666763305664, + "learning_rate": 4.170395751189495e-05, + "loss": 1.5256, + "step": 9269 + }, + { + "epoch": 0.3090933262430205, + "grad_norm": 1.4160584211349487, + "learning_rate": 4.164137885110921e-05, + "loss": 1.4938, + "step": 9300 + }, + { + "epoch": 0.3101236373304972, + "grad_norm": 2.112110137939453, + "learning_rate": 4.157861239462495e-05, + "loss": 1.5106, + "step": 9331 + }, + { + "epoch": 0.31115394841797395, + "grad_norm": 1.337058663368225, + "learning_rate": 4.1515658850753114e-05, + "loss": 1.4999, + "step": 9362 + }, + { + "epoch": 0.3121842595054507, + "grad_norm": 1.3625296354293823, + "learning_rate": 4.145251892991588e-05, + "loss": 1.5136, + "step": 9393 + }, + { + "epoch": 0.3132145705929274, + "grad_norm": 1.399491548538208, + "learning_rate": 4.138919334463868e-05, + "loss": 1.499, + "step": 9424 + }, + { + "epoch": 0.31424488168040415, + "grad_norm": 1.4202344417572021, + "learning_rate": 4.1325682809542124e-05, + "loss": 1.5049, + "step": 9455 + }, + { + "epoch": 0.3152751927678809, + "grad_norm": 1.392248272895813, + "learning_rate": 4.126198804133398e-05, + "loss": 1.5287, + "step": 9486 + }, + { + "epoch": 0.3163055038553576, + "grad_norm": 1.3807618618011475, + "learning_rate": 4.1198109758801055e-05, + "loss": 1.5309, + "step": 9517 + }, + { + "epoch": 0.31733581494283436, + "grad_norm": 1.3117905855178833, + "learning_rate": 4.113404868280107e-05, + "loss": 1.4933, + "step": 9548 + }, + { + "epoch": 0.3183661260303111, + "grad_norm": 1.452086091041565, + "learning_rate": 4.106980553625457e-05, + "loss": 1.5221, + "step": 9579 + }, + { + "epoch": 0.3193964371177878, + "grad_norm": 1.477364182472229, + "learning_rate": 4.100538104413674e-05, + "loss": 1.4904, + "step": 9610 + }, + { + "epoch": 0.32042674820526457, + "grad_norm": 1.3584345579147339, + "learning_rate": 4.09407759334692e-05, + "loss": 1.4953, + "step": 9641 + }, + { + "epoch": 0.32145705929274127, + "grad_norm": 1.3619811534881592, + "learning_rate": 4.087599093331186e-05, + "loss": 1.4956, + "step": 9672 + }, + { + "epoch": 0.322487370380218, + "grad_norm": 1.4507052898406982, + "learning_rate": 4.081102677475462e-05, + "loss": 1.5197, + "step": 9703 + }, + { + "epoch": 0.3235176814676948, + "grad_norm": 1.4229698181152344, + "learning_rate": 4.0745884190909194e-05, + "loss": 1.498, + "step": 9734 + }, + { + "epoch": 0.32454799255517147, + "grad_norm": 1.3074679374694824, + "learning_rate": 4.0680563916900796e-05, + "loss": 1.5146, + "step": 9765 + }, + { + "epoch": 0.3255783036426482, + "grad_norm": 1.397815465927124, + "learning_rate": 4.0615066689859815e-05, + "loss": 1.5291, + "step": 9796 + }, + { + "epoch": 0.326608614730125, + "grad_norm": 1.3196336030960083, + "learning_rate": 4.0549393248913584e-05, + "loss": 1.5077, + "step": 9827 + }, + { + "epoch": 0.3276389258176017, + "grad_norm": 1.3129957914352417, + "learning_rate": 4.048354433517794e-05, + "loss": 1.4965, + "step": 9858 + }, + { + "epoch": 0.32866923690507843, + "grad_norm": 1.4380089044570923, + "learning_rate": 4.0417520691748916e-05, + "loss": 1.5115, + "step": 9889 + }, + { + "epoch": 0.3296995479925552, + "grad_norm": 1.3162370920181274, + "learning_rate": 4.035132306369438e-05, + "loss": 1.5029, + "step": 9920 + }, + { + "epoch": 0.3307298590800319, + "grad_norm": 1.3739668130874634, + "learning_rate": 4.028495219804555e-05, + "loss": 1.5083, + "step": 9951 + }, + { + "epoch": 0.33176017016750864, + "grad_norm": 1.3673723936080933, + "learning_rate": 4.021840884378864e-05, + "loss": 1.5223, + "step": 9982 + }, + { + "epoch": 0.3327904812549854, + "grad_norm": 1.3970317840576172, + "learning_rate": 4.015169375185633e-05, + "loss": 1.5003, + "step": 10013 + }, + { + "epoch": 0.3338207923424621, + "grad_norm": 1.2982394695281982, + "learning_rate": 4.0084807675119396e-05, + "loss": 1.5066, + "step": 10044 + }, + { + "epoch": 0.33485110342993885, + "grad_norm": 1.4548689126968384, + "learning_rate": 4.0017751368378106e-05, + "loss": 1.4993, + "step": 10075 + }, + { + "epoch": 0.3358814145174156, + "grad_norm": 1.3693586587905884, + "learning_rate": 3.995052558835377e-05, + "loss": 1.4987, + "step": 10106 + }, + { + "epoch": 0.3369117256048923, + "grad_norm": 1.4046767950057983, + "learning_rate": 3.988313109368017e-05, + "loss": 1.5098, + "step": 10137 + }, + { + "epoch": 0.33794203669236905, + "grad_norm": 1.3772069215774536, + "learning_rate": 3.981556864489504e-05, + "loss": 1.5165, + "step": 10168 + }, + { + "epoch": 0.3389723477798458, + "grad_norm": 1.471211314201355, + "learning_rate": 3.974783900443142e-05, + "loss": 1.5037, + "step": 10199 + }, + { + "epoch": 0.3400026588673225, + "grad_norm": 1.3990979194641113, + "learning_rate": 3.9679942936609095e-05, + "loss": 1.5096, + "step": 10230 + }, + { + "epoch": 0.34103296995479926, + "grad_norm": 1.3779234886169434, + "learning_rate": 3.961188120762596e-05, + "loss": 1.4914, + "step": 10261 + }, + { + "epoch": 0.342063281042276, + "grad_norm": 1.2866768836975098, + "learning_rate": 3.954365458554938e-05, + "loss": 1.5026, + "step": 10292 + }, + { + "epoch": 0.3430935921297527, + "grad_norm": 1.353468894958496, + "learning_rate": 3.947526384030751e-05, + "loss": 1.5063, + "step": 10323 + }, + { + "epoch": 0.34412390321722947, + "grad_norm": 1.3264256715774536, + "learning_rate": 3.9406709743680624e-05, + "loss": 1.4911, + "step": 10354 + }, + { + "epoch": 0.3451542143047062, + "grad_norm": 1.3496876955032349, + "learning_rate": 3.9337993069292366e-05, + "loss": 1.4921, + "step": 10385 + }, + { + "epoch": 0.3461845253921829, + "grad_norm": 1.3812434673309326, + "learning_rate": 3.926911459260109e-05, + "loss": 1.4826, + "step": 10416 + }, + { + "epoch": 0.34721483647965967, + "grad_norm": 1.4926965236663818, + "learning_rate": 3.920007509089102e-05, + "loss": 1.4994, + "step": 10447 + }, + { + "epoch": 0.3482451475671364, + "grad_norm": 1.3446170091629028, + "learning_rate": 3.913087534326357e-05, + "loss": 1.5114, + "step": 10478 + }, + { + "epoch": 0.3492754586546131, + "grad_norm": 1.3100495338439941, + "learning_rate": 3.9061516130628475e-05, + "loss": 1.5066, + "step": 10509 + }, + { + "epoch": 0.3503057697420899, + "grad_norm": 1.395874261856079, + "learning_rate": 3.8991998235695025e-05, + "loss": 1.4999, + "step": 10540 + }, + { + "epoch": 0.3513360808295666, + "grad_norm": 1.3682137727737427, + "learning_rate": 3.8922322442963224e-05, + "loss": 1.4778, + "step": 10571 + }, + { + "epoch": 0.35236639191704333, + "grad_norm": 1.4196573495864868, + "learning_rate": 3.885248953871491e-05, + "loss": 1.4909, + "step": 10602 + }, + { + "epoch": 0.3533967030045201, + "grad_norm": 1.4299864768981934, + "learning_rate": 3.8782500311004915e-05, + "loss": 1.5025, + "step": 10633 + }, + { + "epoch": 0.3544270140919968, + "grad_norm": 1.39677095413208, + "learning_rate": 3.871235554965218e-05, + "loss": 1.4932, + "step": 10664 + }, + { + "epoch": 0.35545732517947354, + "grad_norm": 1.3219736814498901, + "learning_rate": 3.864205604623078e-05, + "loss": 1.4795, + "step": 10695 + }, + { + "epoch": 0.3564876362669503, + "grad_norm": 1.3649324178695679, + "learning_rate": 3.857160259406107e-05, + "loss": 1.4838, + "step": 10726 + }, + { + "epoch": 0.357517947354427, + "grad_norm": 1.4109989404678345, + "learning_rate": 3.8500995988200674e-05, + "loss": 1.5058, + "step": 10757 + }, + { + "epoch": 0.35854825844190374, + "grad_norm": 1.3625038862228394, + "learning_rate": 3.843023702543556e-05, + "loss": 1.4912, + "step": 10788 + }, + { + "epoch": 0.3595785695293805, + "grad_norm": 1.4725775718688965, + "learning_rate": 3.8359326504270984e-05, + "loss": 1.5012, + "step": 10819 + }, + { + "epoch": 0.3606088806168572, + "grad_norm": 1.4126085042953491, + "learning_rate": 3.828826522492255e-05, + "loss": 1.4977, + "step": 10850 + }, + { + "epoch": 0.36163919170433395, + "grad_norm": 1.3949086666107178, + "learning_rate": 3.821705398930713e-05, + "loss": 1.4903, + "step": 10881 + }, + { + "epoch": 0.3626695027918107, + "grad_norm": 1.286792516708374, + "learning_rate": 3.814569360103385e-05, + "loss": 1.5067, + "step": 10912 + }, + { + "epoch": 0.3636998138792874, + "grad_norm": 1.274703025817871, + "learning_rate": 3.807418486539499e-05, + "loss": 1.4583, + "step": 10943 + }, + { + "epoch": 0.36473012496676416, + "grad_norm": 1.401455283164978, + "learning_rate": 3.80025285893569e-05, + "loss": 1.4834, + "step": 10974 + }, + { + "epoch": 0.3657604360542409, + "grad_norm": 1.308361530303955, + "learning_rate": 3.793072558155093e-05, + "loss": 1.4832, + "step": 11005 + }, + { + "epoch": 0.3667907471417176, + "grad_norm": 1.654733419418335, + "learning_rate": 3.785877665226426e-05, + "loss": 1.4867, + "step": 11036 + }, + { + "epoch": 0.36782105822919436, + "grad_norm": 1.3530856370925903, + "learning_rate": 3.778668261343079e-05, + "loss": 1.4873, + "step": 11067 + }, + { + "epoch": 0.3688513693166711, + "grad_norm": 1.3567407131195068, + "learning_rate": 3.771444427862192e-05, + "loss": 1.4935, + "step": 11098 + }, + { + "epoch": 0.3698816804041478, + "grad_norm": 1.3184572458267212, + "learning_rate": 3.7642062463037465e-05, + "loss": 1.4891, + "step": 11129 + }, + { + "epoch": 0.37091199149162457, + "grad_norm": 1.366489291191101, + "learning_rate": 3.7569537983496373e-05, + "loss": 1.5159, + "step": 11160 + }, + { + "epoch": 0.3719423025791013, + "grad_norm": 1.423258662223816, + "learning_rate": 3.749687165842753e-05, + "loss": 1.4938, + "step": 11191 + }, + { + "epoch": 0.372972613666578, + "grad_norm": 1.3226194381713867, + "learning_rate": 3.7424064307860536e-05, + "loss": 1.499, + "step": 11222 + }, + { + "epoch": 0.3740029247540548, + "grad_norm": 1.350500464439392, + "learning_rate": 3.735111675341645e-05, + "loss": 1.4952, + "step": 11253 + }, + { + "epoch": 0.37503323584153153, + "grad_norm": 1.3667839765548706, + "learning_rate": 3.7278029818298524e-05, + "loss": 1.4763, + "step": 11284 + }, + { + "epoch": 0.37606354692900823, + "grad_norm": 1.4876132011413574, + "learning_rate": 3.720480432728287e-05, + "loss": 1.4913, + "step": 11315 + }, + { + "epoch": 0.377093858016485, + "grad_norm": 1.3927743434906006, + "learning_rate": 3.71314411067092e-05, + "loss": 1.4948, + "step": 11346 + }, + { + "epoch": 0.37812416910396174, + "grad_norm": 1.3752413988113403, + "learning_rate": 3.70579409844715e-05, + "loss": 1.4763, + "step": 11377 + }, + { + "epoch": 0.37915448019143844, + "grad_norm": 1.3530951738357544, + "learning_rate": 3.698430479000865e-05, + "loss": 1.5077, + "step": 11408 + }, + { + "epoch": 0.3801847912789152, + "grad_norm": 1.4309345483779907, + "learning_rate": 3.691053335429509e-05, + "loss": 1.4945, + "step": 11439 + }, + { + "epoch": 0.38121510236639194, + "grad_norm": 1.2874380350112915, + "learning_rate": 3.683662750983147e-05, + "loss": 1.4698, + "step": 11470 + }, + { + "epoch": 0.38224541345386864, + "grad_norm": 1.3356250524520874, + "learning_rate": 3.676258809063518e-05, + "loss": 1.4924, + "step": 11501 + }, + { + "epoch": 0.3832757245413454, + "grad_norm": 1.304559588432312, + "learning_rate": 3.6688415932231004e-05, + "loss": 1.4682, + "step": 11532 + }, + { + "epoch": 0.3843060356288221, + "grad_norm": 1.4153447151184082, + "learning_rate": 3.661411187164166e-05, + "loss": 1.4989, + "step": 11563 + }, + { + "epoch": 0.38533634671629885, + "grad_norm": 1.356992244720459, + "learning_rate": 3.65396767473784e-05, + "loss": 1.4854, + "step": 11594 + }, + { + "epoch": 0.3863666578037756, + "grad_norm": 1.322449803352356, + "learning_rate": 3.6465111399431465e-05, + "loss": 1.4877, + "step": 11625 + }, + { + "epoch": 0.3873969688912523, + "grad_norm": 1.3981350660324097, + "learning_rate": 3.6390416669260674e-05, + "loss": 1.499, + "step": 11656 + }, + { + "epoch": 0.38842727997872906, + "grad_norm": 1.324871301651001, + "learning_rate": 3.63155933997859e-05, + "loss": 1.4814, + "step": 11687 + }, + { + "epoch": 0.3894575910662058, + "grad_norm": 1.3940790891647339, + "learning_rate": 3.624064243537758e-05, + "loss": 1.4754, + "step": 11718 + }, + { + "epoch": 0.3904879021536825, + "grad_norm": 1.2880780696868896, + "learning_rate": 3.616556462184716e-05, + "loss": 1.4832, + "step": 11749 + }, + { + "epoch": 0.39151821324115926, + "grad_norm": 1.315329670906067, + "learning_rate": 3.609036080643755e-05, + "loss": 1.4853, + "step": 11780 + }, + { + "epoch": 0.392548524328636, + "grad_norm": 1.4093523025512695, + "learning_rate": 3.60150318378136e-05, + "loss": 1.4978, + "step": 11811 + }, + { + "epoch": 0.3935788354161127, + "grad_norm": 1.271151065826416, + "learning_rate": 3.5939578566052465e-05, + "loss": 1.4933, + "step": 11842 + }, + { + "epoch": 0.39460914650358947, + "grad_norm": 1.2910923957824707, + "learning_rate": 3.586400184263408e-05, + "loss": 1.4853, + "step": 11873 + }, + { + "epoch": 0.3956394575910662, + "grad_norm": 1.2480064630508423, + "learning_rate": 3.578830252043148e-05, + "loss": 1.4642, + "step": 11904 + }, + { + "epoch": 0.3966697686785429, + "grad_norm": 1.263197422027588, + "learning_rate": 3.571248145370125e-05, + "loss": 1.4812, + "step": 11935 + }, + { + "epoch": 0.3977000797660197, + "grad_norm": 1.3231288194656372, + "learning_rate": 3.5636539498073794e-05, + "loss": 1.4744, + "step": 11966 + }, + { + "epoch": 0.39873039085349643, + "grad_norm": 1.3933110237121582, + "learning_rate": 3.556047751054378e-05, + "loss": 1.4849, + "step": 11997 + }, + { + "epoch": 0.39976070194097313, + "grad_norm": 1.3615801334381104, + "learning_rate": 3.548429634946039e-05, + "loss": 1.4866, + "step": 12028 + }, + { + "epoch": 0.4007910130284499, + "grad_norm": 1.298638939857483, + "learning_rate": 3.540799687451768e-05, + "loss": 1.4664, + "step": 12059 + }, + { + "epoch": 0.40182132411592664, + "grad_norm": 1.29216468334198, + "learning_rate": 3.533157994674485e-05, + "loss": 1.4697, + "step": 12090 + }, + { + "epoch": 0.40285163520340334, + "grad_norm": 1.3759845495224, + "learning_rate": 3.5255046428496546e-05, + "loss": 1.4854, + "step": 12121 + }, + { + "epoch": 0.4038819462908801, + "grad_norm": 1.4045615196228027, + "learning_rate": 3.517839718344311e-05, + "loss": 1.4622, + "step": 12152 + }, + { + "epoch": 0.40491225737835684, + "grad_norm": 1.2979034185409546, + "learning_rate": 3.510163307656086e-05, + "loss": 1.4797, + "step": 12183 + }, + { + "epoch": 0.40594256846583354, + "grad_norm": 1.303139567375183, + "learning_rate": 3.5024754974122324e-05, + "loss": 1.4588, + "step": 12214 + }, + { + "epoch": 0.4069728795533103, + "grad_norm": 1.287781834602356, + "learning_rate": 3.494776374368643e-05, + "loss": 1.4834, + "step": 12245 + }, + { + "epoch": 0.40800319064078705, + "grad_norm": 1.3806688785552979, + "learning_rate": 3.4870660254088724e-05, + "loss": 1.4807, + "step": 12276 + }, + { + "epoch": 0.40903350172826375, + "grad_norm": 1.4059745073318481, + "learning_rate": 3.479344537543164e-05, + "loss": 1.4906, + "step": 12307 + }, + { + "epoch": 0.4100638128157405, + "grad_norm": 1.3052942752838135, + "learning_rate": 3.4716119979074565e-05, + "loss": 1.4801, + "step": 12338 + }, + { + "epoch": 0.41109412390321726, + "grad_norm": 1.3306844234466553, + "learning_rate": 3.463868493762412e-05, + "loss": 1.4911, + "step": 12369 + }, + { + "epoch": 0.41212443499069396, + "grad_norm": 1.3276656866073608, + "learning_rate": 3.456114112492418e-05, + "loss": 1.4678, + "step": 12400 + }, + { + "epoch": 0.4131547460781707, + "grad_norm": 1.3164253234863281, + "learning_rate": 3.4483489416046164e-05, + "loss": 1.4816, + "step": 12431 + }, + { + "epoch": 0.4141850571656474, + "grad_norm": 1.3827886581420898, + "learning_rate": 3.440573068727905e-05, + "loss": 1.481, + "step": 12462 + }, + { + "epoch": 0.41521536825312416, + "grad_norm": 1.2899463176727295, + "learning_rate": 3.4327865816119495e-05, + "loss": 1.4575, + "step": 12493 + }, + { + "epoch": 0.4162456793406009, + "grad_norm": 1.3136677742004395, + "learning_rate": 3.4249895681262025e-05, + "loss": 1.4695, + "step": 12524 + }, + { + "epoch": 0.4172759904280776, + "grad_norm": 1.2920372486114502, + "learning_rate": 3.417182116258899e-05, + "loss": 1.4765, + "step": 12555 + }, + { + "epoch": 0.41830630151555437, + "grad_norm": 1.3285510540008545, + "learning_rate": 3.409364314116074e-05, + "loss": 1.4559, + "step": 12586 + }, + { + "epoch": 0.4193366126030311, + "grad_norm": 1.2834984064102173, + "learning_rate": 3.401536249920559e-05, + "loss": 1.4706, + "step": 12617 + }, + { + "epoch": 0.4203669236905078, + "grad_norm": 1.315942645072937, + "learning_rate": 3.393698012010998e-05, + "loss": 1.4692, + "step": 12648 + }, + { + "epoch": 0.4213972347779846, + "grad_norm": 1.3668091297149658, + "learning_rate": 3.385849688840839e-05, + "loss": 1.4801, + "step": 12679 + }, + { + "epoch": 0.42242754586546133, + "grad_norm": 1.312280297279358, + "learning_rate": 3.3779913689773414e-05, + "loss": 1.4673, + "step": 12710 + }, + { + "epoch": 0.423457856952938, + "grad_norm": 1.3579858541488647, + "learning_rate": 3.370123141100578e-05, + "loss": 1.4578, + "step": 12741 + }, + { + "epoch": 0.4244881680404148, + "grad_norm": 1.4001456499099731, + "learning_rate": 3.3622450940024305e-05, + "loss": 1.4787, + "step": 12772 + }, + { + "epoch": 0.42551847912789154, + "grad_norm": 1.352629542350769, + "learning_rate": 3.35435731658559e-05, + "loss": 1.457, + "step": 12803 + }, + { + "epoch": 0.42654879021536823, + "grad_norm": 1.4044222831726074, + "learning_rate": 3.346459897862552e-05, + "loss": 1.4979, + "step": 12834 + }, + { + "epoch": 0.427579101302845, + "grad_norm": 1.2666436433792114, + "learning_rate": 3.338552926954613e-05, + "loss": 1.4712, + "step": 12865 + }, + { + "epoch": 0.42860941239032174, + "grad_norm": 1.2487694025039673, + "learning_rate": 3.330636493090868e-05, + "loss": 1.4784, + "step": 12896 + }, + { + "epoch": 0.42963972347779844, + "grad_norm": 1.2346290349960327, + "learning_rate": 3.322710685607193e-05, + "loss": 1.4754, + "step": 12927 + }, + { + "epoch": 0.4306700345652752, + "grad_norm": 1.2908893823623657, + "learning_rate": 3.314775593945251e-05, + "loss": 1.4677, + "step": 12958 + }, + { + "epoch": 0.43170034565275195, + "grad_norm": 1.3283506631851196, + "learning_rate": 3.3068313076514714e-05, + "loss": 1.4661, + "step": 12989 + }, + { + "epoch": 0.43273065674022865, + "grad_norm": 1.2982537746429443, + "learning_rate": 3.298877916376047e-05, + "loss": 1.4838, + "step": 13020 + }, + { + "epoch": 0.4337609678277054, + "grad_norm": 1.3566454648971558, + "learning_rate": 3.290915509871915e-05, + "loss": 1.4683, + "step": 13051 + }, + { + "epoch": 0.43479127891518216, + "grad_norm": 1.3470877408981323, + "learning_rate": 3.282944177993753e-05, + "loss": 1.4724, + "step": 13082 + }, + { + "epoch": 0.43582159000265885, + "grad_norm": 1.451150894165039, + "learning_rate": 3.274964010696957e-05, + "loss": 1.4731, + "step": 13113 + }, + { + "epoch": 0.4368519010901356, + "grad_norm": 1.3415958881378174, + "learning_rate": 3.266975098036629e-05, + "loss": 1.4809, + "step": 13144 + }, + { + "epoch": 0.43788221217761236, + "grad_norm": 1.2775352001190186, + "learning_rate": 3.258977530166562e-05, + "loss": 1.4523, + "step": 13175 + }, + { + "epoch": 0.43891252326508906, + "grad_norm": 1.365050196647644, + "learning_rate": 3.250971397338227e-05, + "loss": 1.4611, + "step": 13206 + }, + { + "epoch": 0.4399428343525658, + "grad_norm": 1.3481686115264893, + "learning_rate": 3.2429567898997404e-05, + "loss": 1.4708, + "step": 13237 + }, + { + "epoch": 0.44097314544004257, + "grad_norm": 1.3418121337890625, + "learning_rate": 3.234933798294859e-05, + "loss": 1.485, + "step": 13268 + }, + { + "epoch": 0.44200345652751927, + "grad_norm": 1.3098441362380981, + "learning_rate": 3.2269025130619535e-05, + "loss": 1.472, + "step": 13299 + }, + { + "epoch": 0.443033767614996, + "grad_norm": 1.2792437076568604, + "learning_rate": 3.218863024832985e-05, + "loss": 1.4592, + "step": 13330 + }, + { + "epoch": 0.4440640787024727, + "grad_norm": 1.3804035186767578, + "learning_rate": 3.2108154243324864e-05, + "loss": 1.4546, + "step": 13361 + }, + { + "epoch": 0.4450943897899495, + "grad_norm": 1.287787675857544, + "learning_rate": 3.2027598023765345e-05, + "loss": 1.4477, + "step": 13392 + }, + { + "epoch": 0.44612470087742623, + "grad_norm": 1.5964646339416504, + "learning_rate": 3.194696249871729e-05, + "loss": 1.4468, + "step": 13423 + }, + { + "epoch": 0.4471550119649029, + "grad_norm": 1.3253474235534668, + "learning_rate": 3.186624857814164e-05, + "loss": 1.4588, + "step": 13454 + }, + { + "epoch": 0.4481853230523797, + "grad_norm": 1.288176417350769, + "learning_rate": 3.178545717288401e-05, + "loss": 1.4644, + "step": 13485 + }, + { + "epoch": 0.44921563413985643, + "grad_norm": 1.3357142210006714, + "learning_rate": 3.170458919466444e-05, + "loss": 1.4871, + "step": 13516 + }, + { + "epoch": 0.45024594522733313, + "grad_norm": 1.2954436540603638, + "learning_rate": 3.1623645556067063e-05, + "loss": 1.4571, + "step": 13547 + }, + { + "epoch": 0.4512762563148099, + "grad_norm": 1.344789981842041, + "learning_rate": 3.154262717052985e-05, + "loss": 1.459, + "step": 13578 + }, + { + "epoch": 0.45230656740228664, + "grad_norm": 1.2648475170135498, + "learning_rate": 3.146153495233426e-05, + "loss": 1.4496, + "step": 13609 + }, + { + "epoch": 0.45333687848976334, + "grad_norm": 1.312733769416809, + "learning_rate": 3.1380369816594944e-05, + "loss": 1.4309, + "step": 13640 + }, + { + "epoch": 0.4543671895772401, + "grad_norm": 1.3719325065612793, + "learning_rate": 3.129913267924946e-05, + "loss": 1.4723, + "step": 13671 + }, + { + "epoch": 0.45539750066471685, + "grad_norm": 1.2850617170333862, + "learning_rate": 3.121782445704782e-05, + "loss": 1.4599, + "step": 13702 + }, + { + "epoch": 0.45642781175219355, + "grad_norm": 1.3335177898406982, + "learning_rate": 3.11364460675423e-05, + "loss": 1.4821, + "step": 13733 + }, + { + "epoch": 0.4574581228396703, + "grad_norm": 1.1675069332122803, + "learning_rate": 3.1054998429076934e-05, + "loss": 1.453, + "step": 13764 + }, + { + "epoch": 0.45848843392714705, + "grad_norm": 1.283544898033142, + "learning_rate": 3.097348246077728e-05, + "loss": 1.4545, + "step": 13795 + }, + { + "epoch": 0.45951874501462375, + "grad_norm": 1.4358693361282349, + "learning_rate": 3.0891899082539924e-05, + "loss": 1.4673, + "step": 13826 + }, + { + "epoch": 0.4605490561021005, + "grad_norm": 1.2551497220993042, + "learning_rate": 3.0810249215022233e-05, + "loss": 1.4532, + "step": 13857 + }, + { + "epoch": 0.46157936718957726, + "grad_norm": 1.2574602365493774, + "learning_rate": 3.0728533779631865e-05, + "loss": 1.4762, + "step": 13888 + }, + { + "epoch": 0.46260967827705396, + "grad_norm": 1.2202764749526978, + "learning_rate": 3.064675369851637e-05, + "loss": 1.4461, + "step": 13919 + }, + { + "epoch": 0.4636399893645307, + "grad_norm": 1.2787501811981201, + "learning_rate": 3.056490989455289e-05, + "loss": 1.4607, + "step": 13950 + }, + { + "epoch": 0.46467030045200747, + "grad_norm": 1.2511006593704224, + "learning_rate": 3.0483003291337596e-05, + "loss": 1.4548, + "step": 13981 + }, + { + "epoch": 0.46570061153948417, + "grad_norm": 1.2749834060668945, + "learning_rate": 3.040103481317539e-05, + "loss": 1.4394, + "step": 14012 + }, + { + "epoch": 0.4667309226269609, + "grad_norm": 1.223057746887207, + "learning_rate": 3.03190053850694e-05, + "loss": 1.4684, + "step": 14043 + }, + { + "epoch": 0.4677612337144377, + "grad_norm": 1.39846932888031, + "learning_rate": 3.0236915932710573e-05, + "loss": 1.4657, + "step": 14074 + }, + { + "epoch": 0.4687915448019144, + "grad_norm": 1.5305665731430054, + "learning_rate": 3.0154767382467232e-05, + "loss": 1.4795, + "step": 14105 + }, + { + "epoch": 0.4698218558893911, + "grad_norm": 1.2569035291671753, + "learning_rate": 3.0072560661374582e-05, + "loss": 1.4756, + "step": 14136 + }, + { + "epoch": 0.4708521669768679, + "grad_norm": 1.3472824096679688, + "learning_rate": 2.999029669712431e-05, + "loss": 1.4682, + "step": 14167 + }, + { + "epoch": 0.4718824780643446, + "grad_norm": 1.271714210510254, + "learning_rate": 2.990797641805408e-05, + "loss": 1.4509, + "step": 14198 + }, + { + "epoch": 0.47291278915182133, + "grad_norm": 1.3342047929763794, + "learning_rate": 2.982560075313704e-05, + "loss": 1.4528, + "step": 14229 + }, + { + "epoch": 0.47394310023929803, + "grad_norm": 1.5821506977081299, + "learning_rate": 2.9743170631971368e-05, + "loss": 1.4609, + "step": 14260 + }, + { + "epoch": 0.4749734113267748, + "grad_norm": 1.2598062753677368, + "learning_rate": 2.9660686984769792e-05, + "loss": 1.471, + "step": 14291 + }, + { + "epoch": 0.47600372241425154, + "grad_norm": 1.2648885250091553, + "learning_rate": 2.9578150742349047e-05, + "loss": 1.4708, + "step": 14322 + }, + { + "epoch": 0.47703403350172824, + "grad_norm": 1.559665560722351, + "learning_rate": 2.949556283611942e-05, + "loss": 1.4516, + "step": 14353 + }, + { + "epoch": 0.478064344589205, + "grad_norm": 1.2621581554412842, + "learning_rate": 2.9412924198074206e-05, + "loss": 1.446, + "step": 14384 + }, + { + "epoch": 0.47909465567668175, + "grad_norm": 1.2775017023086548, + "learning_rate": 2.9330235760779208e-05, + "loss": 1.4496, + "step": 14415 + }, + { + "epoch": 0.48012496676415845, + "grad_norm": 1.2010388374328613, + "learning_rate": 2.9247498457362188e-05, + "loss": 1.4606, + "step": 14446 + }, + { + "epoch": 0.4811552778516352, + "grad_norm": 1.3053895235061646, + "learning_rate": 2.9164713221502373e-05, + "loss": 1.4536, + "step": 14477 + }, + { + "epoch": 0.48218558893911195, + "grad_norm": 1.311596155166626, + "learning_rate": 2.9081880987419912e-05, + "loss": 1.4409, + "step": 14508 + }, + { + "epoch": 0.48321590002658865, + "grad_norm": 1.3888933658599854, + "learning_rate": 2.8999002689865296e-05, + "loss": 1.4314, + "step": 14539 + }, + { + "epoch": 0.4842462111140654, + "grad_norm": 1.288619875907898, + "learning_rate": 2.8916079264108852e-05, + "loss": 1.4539, + "step": 14570 + }, + { + "epoch": 0.48527652220154216, + "grad_norm": 1.2974294424057007, + "learning_rate": 2.883311164593017e-05, + "loss": 1.4627, + "step": 14601 + }, + { + "epoch": 0.48630683328901886, + "grad_norm": 1.2057379484176636, + "learning_rate": 2.875010077160754e-05, + "loss": 1.4578, + "step": 14632 + }, + { + "epoch": 0.4873371443764956, + "grad_norm": 1.363971471786499, + "learning_rate": 2.866704757790741e-05, + "loss": 1.4671, + "step": 14663 + }, + { + "epoch": 0.48836745546397237, + "grad_norm": 1.2696925401687622, + "learning_rate": 2.858395300207376e-05, + "loss": 1.4333, + "step": 14694 + }, + { + "epoch": 0.48939776655144906, + "grad_norm": 1.2653478384017944, + "learning_rate": 2.8500817981817607e-05, + "loss": 1.4662, + "step": 14725 + }, + { + "epoch": 0.4904280776389258, + "grad_norm": 1.3011239767074585, + "learning_rate": 2.8417643455306336e-05, + "loss": 1.4589, + "step": 14756 + }, + { + "epoch": 0.4914583887264026, + "grad_norm": 1.3312432765960693, + "learning_rate": 2.8334430361153185e-05, + "loss": 1.4368, + "step": 14787 + }, + { + "epoch": 0.49248869981387927, + "grad_norm": 1.3015661239624023, + "learning_rate": 2.8251179638406612e-05, + "loss": 1.466, + "step": 14818 + }, + { + "epoch": 0.493519010901356, + "grad_norm": 1.3215759992599487, + "learning_rate": 2.8167892226539704e-05, + "loss": 1.4486, + "step": 14849 + }, + { + "epoch": 0.4945493219888328, + "grad_norm": 1.2909883260726929, + "learning_rate": 2.8084569065439588e-05, + "loss": 1.4433, + "step": 14880 + }, + { + "epoch": 0.4955796330763095, + "grad_norm": 1.364015817642212, + "learning_rate": 2.8001211095396807e-05, + "loss": 1.4449, + "step": 14911 + }, + { + "epoch": 0.49660994416378623, + "grad_norm": 1.2468819618225098, + "learning_rate": 2.791781925709473e-05, + "loss": 1.4572, + "step": 14942 + }, + { + "epoch": 0.497640255251263, + "grad_norm": 1.2739325761795044, + "learning_rate": 2.7834394491598908e-05, + "loss": 1.4478, + "step": 14973 + }, + { + "epoch": 0.4986705663387397, + "grad_norm": 1.3384937047958374, + "learning_rate": 2.7750937740346485e-05, + "loss": 1.4429, + "step": 15004 + }, + { + "epoch": 0.49970087742621644, + "grad_norm": 1.231088399887085, + "learning_rate": 2.7667449945135564e-05, + "loss": 1.4631, + "step": 15035 + }, + { + "epoch": 0.5007311885136931, + "grad_norm": 1.2262307405471802, + "learning_rate": 2.7583932048114557e-05, + "loss": 1.4508, + "step": 15066 + }, + { + "epoch": 0.5017614996011699, + "grad_norm": 1.3427774906158447, + "learning_rate": 2.7500384991771587e-05, + "loss": 1.4441, + "step": 15097 + }, + { + "epoch": 0.5027918106886466, + "grad_norm": 1.2950241565704346, + "learning_rate": 2.7416809718923825e-05, + "loss": 1.4427, + "step": 15128 + }, + { + "epoch": 0.5038221217761234, + "grad_norm": 1.4129016399383545, + "learning_rate": 2.7333207172706864e-05, + "loss": 1.4562, + "step": 15159 + }, + { + "epoch": 0.5048524328636002, + "grad_norm": 1.2751520872116089, + "learning_rate": 2.7249578296564088e-05, + "loss": 1.4517, + "step": 15190 + }, + { + "epoch": 0.5058827439510768, + "grad_norm": 1.302485466003418, + "learning_rate": 2.7165924034235973e-05, + "loss": 1.4327, + "step": 15221 + }, + { + "epoch": 0.5069130550385536, + "grad_norm": 1.295390009880066, + "learning_rate": 2.708224532974953e-05, + "loss": 1.4455, + "step": 15252 + }, + { + "epoch": 0.5079433661260303, + "grad_norm": 1.3160103559494019, + "learning_rate": 2.6998543127407538e-05, + "loss": 1.4556, + "step": 15283 + }, + { + "epoch": 0.5089736772135071, + "grad_norm": 1.2997361421585083, + "learning_rate": 2.6914818371777988e-05, + "loss": 1.444, + "step": 15314 + }, + { + "epoch": 0.5100039883009838, + "grad_norm": 1.2427833080291748, + "learning_rate": 2.6831072007683373e-05, + "loss": 1.4501, + "step": 15345 + }, + { + "epoch": 0.5110342993884606, + "grad_norm": 1.2402199506759644, + "learning_rate": 2.6747304980190018e-05, + "loss": 1.4543, + "step": 15376 + }, + { + "epoch": 0.5120646104759372, + "grad_norm": 1.2938770055770874, + "learning_rate": 2.6663518234597453e-05, + "loss": 1.4394, + "step": 15407 + }, + { + "epoch": 0.513094921563414, + "grad_norm": 1.1747736930847168, + "learning_rate": 2.6579712716427696e-05, + "loss": 1.4389, + "step": 15438 + }, + { + "epoch": 0.5141252326508907, + "grad_norm": 1.326824426651001, + "learning_rate": 2.6495889371414652e-05, + "loss": 1.4365, + "step": 15469 + }, + { + "epoch": 0.5151555437383675, + "grad_norm": 1.245665431022644, + "learning_rate": 2.6412049145493367e-05, + "loss": 1.4525, + "step": 15500 + }, + { + "epoch": 0.5161858548258442, + "grad_norm": 1.1753687858581543, + "learning_rate": 2.632819298478939e-05, + "loss": 1.447, + "step": 15531 + }, + { + "epoch": 0.517216165913321, + "grad_norm": 1.3870874643325806, + "learning_rate": 2.6244321835608105e-05, + "loss": 1.4577, + "step": 15562 + }, + { + "epoch": 0.5182464770007976, + "grad_norm": 1.2849411964416504, + "learning_rate": 2.6160436644424024e-05, + "loss": 1.4371, + "step": 15593 + }, + { + "epoch": 0.5192767880882744, + "grad_norm": 1.292443037033081, + "learning_rate": 2.6076538357870133e-05, + "loss": 1.4558, + "step": 15624 + }, + { + "epoch": 0.5203070991757511, + "grad_norm": 1.279961347579956, + "learning_rate": 2.5992627922727196e-05, + "loss": 1.4384, + "step": 15655 + }, + { + "epoch": 0.5213374102632279, + "grad_norm": 1.3141279220581055, + "learning_rate": 2.5908706285913066e-05, + "loss": 1.45, + "step": 15686 + }, + { + "epoch": 0.5223677213507046, + "grad_norm": 1.3931515216827393, + "learning_rate": 2.5824774394472008e-05, + "loss": 1.4403, + "step": 15717 + }, + { + "epoch": 0.5233980324381813, + "grad_norm": 1.2564170360565186, + "learning_rate": 2.5740833195563996e-05, + "loss": 1.4482, + "step": 15748 + }, + { + "epoch": 0.524428343525658, + "grad_norm": 1.5450046062469482, + "learning_rate": 2.5656883636454067e-05, + "loss": 1.4443, + "step": 15779 + }, + { + "epoch": 0.5254586546131348, + "grad_norm": 1.2659518718719482, + "learning_rate": 2.557292666450159e-05, + "loss": 1.4653, + "step": 15810 + }, + { + "epoch": 0.5264889657006115, + "grad_norm": 1.2940540313720703, + "learning_rate": 2.5488963227149566e-05, + "loss": 1.4302, + "step": 15841 + }, + { + "epoch": 0.5275192767880883, + "grad_norm": 1.2514533996582031, + "learning_rate": 2.5404994271913983e-05, + "loss": 1.4412, + "step": 15872 + }, + { + "epoch": 0.528549587875565, + "grad_norm": 1.2681846618652344, + "learning_rate": 2.5321020746373085e-05, + "loss": 1.4411, + "step": 15903 + }, + { + "epoch": 0.5295798989630417, + "grad_norm": 1.2581806182861328, + "learning_rate": 2.52370435981567e-05, + "loss": 1.4503, + "step": 15934 + }, + { + "epoch": 0.5306102100505184, + "grad_norm": 1.3299468755722046, + "learning_rate": 2.5153063774935533e-05, + "loss": 1.4392, + "step": 15965 + }, + { + "epoch": 0.5316405211379952, + "grad_norm": 1.240678310394287, + "learning_rate": 2.506908222441045e-05, + "loss": 1.4412, + "step": 15996 + }, + { + "epoch": 0.532670832225472, + "grad_norm": 1.337936520576477, + "learning_rate": 2.498509989430187e-05, + "loss": 1.4254, + "step": 16027 + }, + { + "epoch": 0.5337011433129487, + "grad_norm": 1.302909016609192, + "learning_rate": 2.4901117732338958e-05, + "loss": 1.4436, + "step": 16058 + }, + { + "epoch": 0.5347314544004255, + "grad_norm": 1.2539550065994263, + "learning_rate": 2.481713668624899e-05, + "loss": 1.4496, + "step": 16089 + }, + { + "epoch": 0.5357617654879021, + "grad_norm": 1.287431001663208, + "learning_rate": 2.4733157703746663e-05, + "loss": 1.424, + "step": 16120 + }, + { + "epoch": 0.5367920765753789, + "grad_norm": 1.5333632230758667, + "learning_rate": 2.4649181732523392e-05, + "loss": 1.4399, + "step": 16151 + }, + { + "epoch": 0.5378223876628556, + "grad_norm": 1.2591406106948853, + "learning_rate": 2.4565209720236582e-05, + "loss": 1.439, + "step": 16182 + }, + { + "epoch": 0.5388526987503324, + "grad_norm": 1.3093276023864746, + "learning_rate": 2.4481242614498975e-05, + "loss": 1.4279, + "step": 16213 + }, + { + "epoch": 0.5398830098378091, + "grad_norm": 1.2824875116348267, + "learning_rate": 2.439728136286796e-05, + "loss": 1.4428, + "step": 16244 + }, + { + "epoch": 0.5409133209252859, + "grad_norm": 1.2775593996047974, + "learning_rate": 2.4313326912834852e-05, + "loss": 1.4352, + "step": 16275 + }, + { + "epoch": 0.5419436320127625, + "grad_norm": 1.4667550325393677, + "learning_rate": 2.4229380211814206e-05, + "loss": 1.4633, + "step": 16306 + }, + { + "epoch": 0.5429739431002393, + "grad_norm": 1.2620900869369507, + "learning_rate": 2.4145442207133124e-05, + "loss": 1.4482, + "step": 16337 + }, + { + "epoch": 0.544004254187716, + "grad_norm": 1.3041224479675293, + "learning_rate": 2.406151384602059e-05, + "loss": 1.4431, + "step": 16368 + }, + { + "epoch": 0.5450345652751928, + "grad_norm": 1.3634989261627197, + "learning_rate": 2.3977596075596747e-05, + "loss": 1.4186, + "step": 16399 + }, + { + "epoch": 0.5460648763626695, + "grad_norm": 1.2322940826416016, + "learning_rate": 2.3893689842862223e-05, + "loss": 1.4322, + "step": 16430 + }, + { + "epoch": 0.5470951874501463, + "grad_norm": 1.5554733276367188, + "learning_rate": 2.3809796094687475e-05, + "loss": 1.4337, + "step": 16461 + }, + { + "epoch": 0.5481254985376229, + "grad_norm": 1.4745500087738037, + "learning_rate": 2.372591577780202e-05, + "loss": 1.4411, + "step": 16492 + }, + { + "epoch": 0.5491558096250997, + "grad_norm": 1.2865196466445923, + "learning_rate": 2.3642049838783838e-05, + "loss": 1.429, + "step": 16523 + }, + { + "epoch": 0.5501861207125764, + "grad_norm": 1.399247407913208, + "learning_rate": 2.3558199224048666e-05, + "loss": 1.4753, + "step": 16554 + }, + { + "epoch": 0.5512164318000532, + "grad_norm": 1.2135406732559204, + "learning_rate": 2.347436487983929e-05, + "loss": 1.4553, + "step": 16585 + }, + { + "epoch": 0.55224674288753, + "grad_norm": 1.164150357246399, + "learning_rate": 2.3390547752214888e-05, + "loss": 1.4268, + "step": 16616 + }, + { + "epoch": 0.5532770539750066, + "grad_norm": 1.2363818883895874, + "learning_rate": 2.330674878704035e-05, + "loss": 1.4381, + "step": 16647 + }, + { + "epoch": 0.5543073650624833, + "grad_norm": 1.286139726638794, + "learning_rate": 2.322296892997561e-05, + "loss": 1.4492, + "step": 16678 + }, + { + "epoch": 0.5553376761499601, + "grad_norm": 1.2836147546768188, + "learning_rate": 2.313920912646497e-05, + "loss": 1.4128, + "step": 16709 + }, + { + "epoch": 0.5563679872374369, + "grad_norm": 1.253727674484253, + "learning_rate": 2.305547032172643e-05, + "loss": 1.4472, + "step": 16740 + }, + { + "epoch": 0.5573982983249136, + "grad_norm": 1.2580201625823975, + "learning_rate": 2.2971753460741014e-05, + "loss": 1.4461, + "step": 16771 + }, + { + "epoch": 0.5584286094123904, + "grad_norm": 1.2446421384811401, + "learning_rate": 2.288805948824212e-05, + "loss": 1.4267, + "step": 16802 + }, + { + "epoch": 0.559458920499867, + "grad_norm": 1.3572150468826294, + "learning_rate": 2.2804389348704858e-05, + "loss": 1.4222, + "step": 16833 + }, + { + "epoch": 0.5604892315873438, + "grad_norm": 1.3694707155227661, + "learning_rate": 2.2720743986335374e-05, + "loss": 1.4624, + "step": 16864 + }, + { + "epoch": 0.5615195426748205, + "grad_norm": 1.2654088735580444, + "learning_rate": 2.2637124345060233e-05, + "loss": 1.4379, + "step": 16895 + }, + { + "epoch": 0.5625498537622973, + "grad_norm": 1.3349469900131226, + "learning_rate": 2.2553531368515695e-05, + "loss": 1.4404, + "step": 16926 + }, + { + "epoch": 0.563580164849774, + "grad_norm": 1.2259774208068848, + "learning_rate": 2.2469966000037144e-05, + "loss": 1.4335, + "step": 16957 + }, + { + "epoch": 0.5646104759372508, + "grad_norm": 1.2973053455352783, + "learning_rate": 2.2386429182648417e-05, + "loss": 1.4397, + "step": 16988 + }, + { + "epoch": 0.5656407870247274, + "grad_norm": 1.2674601078033447, + "learning_rate": 2.230292185905114e-05, + "loss": 1.4256, + "step": 17019 + }, + { + "epoch": 0.5666710981122042, + "grad_norm": 1.243605136871338, + "learning_rate": 2.2219444971614116e-05, + "loss": 1.4404, + "step": 17050 + }, + { + "epoch": 0.5677014091996809, + "grad_norm": 1.2108361721038818, + "learning_rate": 2.2135999462362655e-05, + "loss": 1.4318, + "step": 17081 + }, + { + "epoch": 0.5687317202871577, + "grad_norm": 1.2497962713241577, + "learning_rate": 2.2052586272968003e-05, + "loss": 1.4409, + "step": 17112 + }, + { + "epoch": 0.5697620313746344, + "grad_norm": 1.2269086837768555, + "learning_rate": 2.196920634473666e-05, + "loss": 1.4417, + "step": 17143 + }, + { + "epoch": 0.5707923424621112, + "grad_norm": 1.3165903091430664, + "learning_rate": 2.1885860618599787e-05, + "loss": 1.4541, + "step": 17174 + }, + { + "epoch": 0.5718226535495878, + "grad_norm": 1.2117608785629272, + "learning_rate": 2.1802550035102577e-05, + "loss": 1.4457, + "step": 17205 + }, + { + "epoch": 0.5728529646370646, + "grad_norm": 1.2482073307037354, + "learning_rate": 2.171927553439363e-05, + "loss": 1.4408, + "step": 17236 + }, + { + "epoch": 0.5738832757245413, + "grad_norm": 1.2258682250976562, + "learning_rate": 2.1636038056214376e-05, + "loss": 1.4366, + "step": 17267 + }, + { + "epoch": 0.5749135868120181, + "grad_norm": 1.254062294960022, + "learning_rate": 2.155283853988844e-05, + "loss": 1.4187, + "step": 17298 + }, + { + "epoch": 0.5759438978994948, + "grad_norm": 1.3397905826568604, + "learning_rate": 2.146967792431106e-05, + "loss": 1.4316, + "step": 17329 + }, + { + "epoch": 0.5769742089869716, + "grad_norm": 1.3253263235092163, + "learning_rate": 2.138655714793849e-05, + "loss": 1.4361, + "step": 17360 + }, + { + "epoch": 0.5780045200744482, + "grad_norm": 1.2624903917312622, + "learning_rate": 2.1303477148777367e-05, + "loss": 1.4136, + "step": 17391 + }, + { + "epoch": 0.579034831161925, + "grad_norm": 1.3255977630615234, + "learning_rate": 2.122043886437421e-05, + "loss": 1.4552, + "step": 17422 + }, + { + "epoch": 0.5800651422494018, + "grad_norm": 1.300898790359497, + "learning_rate": 2.1137443231804765e-05, + "loss": 1.4152, + "step": 17453 + }, + { + "epoch": 0.5810954533368785, + "grad_norm": 1.2904343605041504, + "learning_rate": 2.105449118766347e-05, + "loss": 1.4195, + "step": 17484 + }, + { + "epoch": 0.5821257644243553, + "grad_norm": 1.3146878480911255, + "learning_rate": 2.097158366805287e-05, + "loss": 1.426, + "step": 17515 + }, + { + "epoch": 0.5831560755118319, + "grad_norm": 1.2454010248184204, + "learning_rate": 2.0888721608573047e-05, + "loss": 1.4239, + "step": 17546 + }, + { + "epoch": 0.5841863865993087, + "grad_norm": 1.194626808166504, + "learning_rate": 2.0805905944311087e-05, + "loss": 1.4416, + "step": 17577 + }, + { + "epoch": 0.5852166976867854, + "grad_norm": 1.359053373336792, + "learning_rate": 2.0723137609830497e-05, + "loss": 1.4112, + "step": 17608 + }, + { + "epoch": 0.5862470087742622, + "grad_norm": 1.2577933073043823, + "learning_rate": 2.0640417539160686e-05, + "loss": 1.4432, + "step": 17639 + }, + { + "epoch": 0.5872773198617389, + "grad_norm": 1.2604849338531494, + "learning_rate": 2.0557746665786427e-05, + "loss": 1.4184, + "step": 17670 + }, + { + "epoch": 0.5883076309492157, + "grad_norm": 1.2511252164840698, + "learning_rate": 2.0475125922637256e-05, + "loss": 1.4276, + "step": 17701 + }, + { + "epoch": 0.5893379420366923, + "grad_norm": 1.2841278314590454, + "learning_rate": 2.0392556242077047e-05, + "loss": 1.4345, + "step": 17732 + }, + { + "epoch": 0.5903682531241691, + "grad_norm": 1.3342245817184448, + "learning_rate": 2.031003855589343e-05, + "loss": 1.4212, + "step": 17763 + }, + { + "epoch": 0.5913985642116458, + "grad_norm": 1.352387547492981, + "learning_rate": 2.022757379528727e-05, + "loss": 1.4316, + "step": 17794 + }, + { + "epoch": 0.5924288752991226, + "grad_norm": 1.3534374237060547, + "learning_rate": 2.0145162890862184e-05, + "loss": 1.4352, + "step": 17825 + }, + { + "epoch": 0.5934591863865993, + "grad_norm": 1.2957963943481445, + "learning_rate": 2.0062806772614022e-05, + "loss": 1.4057, + "step": 17856 + }, + { + "epoch": 0.5944894974740761, + "grad_norm": 1.3178727626800537, + "learning_rate": 1.9980506369920392e-05, + "loss": 1.4323, + "step": 17887 + }, + { + "epoch": 0.5955198085615527, + "grad_norm": 1.3364850282669067, + "learning_rate": 1.989826261153015e-05, + "loss": 1.4228, + "step": 17918 + }, + { + "epoch": 0.5965501196490295, + "grad_norm": 1.283200979232788, + "learning_rate": 1.9816076425552923e-05, + "loss": 1.4348, + "step": 17949 + }, + { + "epoch": 0.5975804307365062, + "grad_norm": 1.2856223583221436, + "learning_rate": 1.9733948739448676e-05, + "loss": 1.4176, + "step": 17980 + }, + { + "epoch": 0.598610741823983, + "grad_norm": 1.253180742263794, + "learning_rate": 1.9651880480017155e-05, + "loss": 1.4175, + "step": 18011 + }, + { + "epoch": 0.5996410529114597, + "grad_norm": 1.3471016883850098, + "learning_rate": 1.9569872573387516e-05, + "loss": 1.433, + "step": 18042 + }, + { + "epoch": 0.6006713639989365, + "grad_norm": 1.2449748516082764, + "learning_rate": 1.9487925945007854e-05, + "loss": 1.4091, + "step": 18073 + }, + { + "epoch": 0.6017016750864131, + "grad_norm": 1.3311972618103027, + "learning_rate": 1.9406041519634726e-05, + "loss": 1.403, + "step": 18104 + }, + { + "epoch": 0.6027319861738899, + "grad_norm": 1.2645657062530518, + "learning_rate": 1.932422022132275e-05, + "loss": 1.4265, + "step": 18135 + }, + { + "epoch": 0.6037622972613667, + "grad_norm": 1.3313370943069458, + "learning_rate": 1.924246297341414e-05, + "loss": 1.4275, + "step": 18166 + }, + { + "epoch": 0.6047926083488434, + "grad_norm": 1.2827123403549194, + "learning_rate": 1.9160770698528338e-05, + "loss": 1.4277, + "step": 18197 + }, + { + "epoch": 0.6058229194363202, + "grad_norm": 1.2230308055877686, + "learning_rate": 1.907914431855156e-05, + "loss": 1.4391, + "step": 18228 + }, + { + "epoch": 0.6068532305237969, + "grad_norm": 1.2785223722457886, + "learning_rate": 1.8997584754626412e-05, + "loss": 1.4152, + "step": 18259 + }, + { + "epoch": 0.6078835416112736, + "grad_norm": 1.3152620792388916, + "learning_rate": 1.8916092927141486e-05, + "loss": 1.4137, + "step": 18290 + }, + { + "epoch": 0.6089138526987503, + "grad_norm": 1.1842609643936157, + "learning_rate": 1.883466975572098e-05, + "loss": 1.4141, + "step": 18321 + }, + { + "epoch": 0.6099441637862271, + "grad_norm": 1.2319703102111816, + "learning_rate": 1.8753316159214312e-05, + "loss": 1.4216, + "step": 18352 + }, + { + "epoch": 0.6109744748737038, + "grad_norm": 1.3239370584487915, + "learning_rate": 1.8672033055685766e-05, + "loss": 1.4184, + "step": 18383 + }, + { + "epoch": 0.6120047859611806, + "grad_norm": 1.2665941715240479, + "learning_rate": 1.8590821362404116e-05, + "loss": 1.4249, + "step": 18414 + }, + { + "epoch": 0.6130350970486572, + "grad_norm": 1.2569379806518555, + "learning_rate": 1.8509681995832294e-05, + "loss": 1.4242, + "step": 18445 + }, + { + "epoch": 0.614065408136134, + "grad_norm": 1.2848411798477173, + "learning_rate": 1.8428615871617004e-05, + "loss": 1.4166, + "step": 18476 + }, + { + "epoch": 0.6150957192236107, + "grad_norm": 1.2636574506759644, + "learning_rate": 1.8347623904578448e-05, + "loss": 1.4297, + "step": 18507 + }, + { + "epoch": 0.6161260303110875, + "grad_norm": 1.2672234773635864, + "learning_rate": 1.8266707008699975e-05, + "loss": 1.4244, + "step": 18538 + }, + { + "epoch": 0.6171563413985642, + "grad_norm": 1.2299143075942993, + "learning_rate": 1.818586609711774e-05, + "loss": 1.408, + "step": 18569 + }, + { + "epoch": 0.618186652486041, + "grad_norm": 1.2221580743789673, + "learning_rate": 1.8105102082110462e-05, + "loss": 1.4242, + "step": 18600 + }, + { + "epoch": 0.6192169635735176, + "grad_norm": 1.290737509727478, + "learning_rate": 1.8024415875089058e-05, + "loss": 1.4167, + "step": 18631 + }, + { + "epoch": 0.6202472746609944, + "grad_norm": 1.3236243724822998, + "learning_rate": 1.7943808386586407e-05, + "loss": 1.4341, + "step": 18662 + }, + { + "epoch": 0.6212775857484711, + "grad_norm": 1.1983164548873901, + "learning_rate": 1.7863280526247073e-05, + "loss": 1.4171, + "step": 18693 + }, + { + "epoch": 0.6223078968359479, + "grad_norm": 1.2706191539764404, + "learning_rate": 1.7782833202817003e-05, + "loss": 1.4268, + "step": 18724 + }, + { + "epoch": 0.6233382079234246, + "grad_norm": 1.2584494352340698, + "learning_rate": 1.7702467324133327e-05, + "loss": 1.4364, + "step": 18755 + }, + { + "epoch": 0.6243685190109014, + "grad_norm": 1.345226526260376, + "learning_rate": 1.7622183797114042e-05, + "loss": 1.4274, + "step": 18786 + }, + { + "epoch": 0.625398830098378, + "grad_norm": 1.3055671453475952, + "learning_rate": 1.7541983527747838e-05, + "loss": 1.4101, + "step": 18817 + }, + { + "epoch": 0.6264291411858548, + "grad_norm": 1.2878341674804688, + "learning_rate": 1.746186742108387e-05, + "loss": 1.4133, + "step": 18848 + }, + { + "epoch": 0.6274594522733316, + "grad_norm": 1.241191029548645, + "learning_rate": 1.73818363812215e-05, + "loss": 1.4038, + "step": 18879 + }, + { + "epoch": 0.6284897633608083, + "grad_norm": 1.8631796836853027, + "learning_rate": 1.7301891311300153e-05, + "loss": 1.3961, + "step": 18910 + }, + { + "epoch": 0.6295200744482851, + "grad_norm": 1.2781902551651, + "learning_rate": 1.7222033113489055e-05, + "loss": 1.4238, + "step": 18941 + }, + { + "epoch": 0.6305503855357618, + "grad_norm": 1.2679165601730347, + "learning_rate": 1.7142262688977127e-05, + "loss": 1.4236, + "step": 18972 + }, + { + "epoch": 0.6315806966232385, + "grad_norm": 1.257203459739685, + "learning_rate": 1.7062580937962764e-05, + "loss": 1.4156, + "step": 19003 + }, + { + "epoch": 0.6326110077107152, + "grad_norm": 1.284470796585083, + "learning_rate": 1.698298875964369e-05, + "loss": 1.4241, + "step": 19034 + }, + { + "epoch": 0.633641318798192, + "grad_norm": 1.310545802116394, + "learning_rate": 1.690348705220684e-05, + "loss": 1.4205, + "step": 19065 + }, + { + "epoch": 0.6346716298856687, + "grad_norm": 1.2868564128875732, + "learning_rate": 1.6824076712818156e-05, + "loss": 1.4238, + "step": 19096 + }, + { + "epoch": 0.6357019409731455, + "grad_norm": 1.2508702278137207, + "learning_rate": 1.6744758637612533e-05, + "loss": 1.4046, + "step": 19127 + }, + { + "epoch": 0.6367322520606222, + "grad_norm": 1.3149102926254272, + "learning_rate": 1.6665533721683664e-05, + "loss": 1.4211, + "step": 19158 + }, + { + "epoch": 0.6377625631480989, + "grad_norm": 1.3485240936279297, + "learning_rate": 1.6586402859073974e-05, + "loss": 1.4167, + "step": 19189 + }, + { + "epoch": 0.6387928742355756, + "grad_norm": 1.2397938966751099, + "learning_rate": 1.6507366942764463e-05, + "loss": 1.4242, + "step": 19220 + }, + { + "epoch": 0.6398231853230524, + "grad_norm": 1.2909672260284424, + "learning_rate": 1.6428426864664732e-05, + "loss": 1.403, + "step": 19251 + }, + { + "epoch": 0.6408534964105291, + "grad_norm": 1.290385365486145, + "learning_rate": 1.6349583515602816e-05, + "loss": 1.4082, + "step": 19282 + }, + { + "epoch": 0.6418838074980059, + "grad_norm": 1.3623126745224, + "learning_rate": 1.6270837785315208e-05, + "loss": 1.4075, + "step": 19313 + }, + { + "epoch": 0.6429141185854825, + "grad_norm": 1.276903510093689, + "learning_rate": 1.619219056243676e-05, + "loss": 1.4135, + "step": 19344 + }, + { + "epoch": 0.6439444296729593, + "grad_norm": 1.2038910388946533, + "learning_rate": 1.6113642734490698e-05, + "loss": 1.4162, + "step": 19375 + }, + { + "epoch": 0.644974740760436, + "grad_norm": 1.2092891931533813, + "learning_rate": 1.6035195187878577e-05, + "loss": 1.4285, + "step": 19406 + }, + { + "epoch": 0.6460050518479128, + "grad_norm": 1.2983031272888184, + "learning_rate": 1.5956848807870305e-05, + "loss": 1.4128, + "step": 19437 + }, + { + "epoch": 0.6470353629353895, + "grad_norm": 1.279845952987671, + "learning_rate": 1.587860447859413e-05, + "loss": 1.4351, + "step": 19468 + }, + { + "epoch": 0.6480656740228663, + "grad_norm": 1.2781362533569336, + "learning_rate": 1.5800463083026686e-05, + "loss": 1.4118, + "step": 19499 + }, + { + "epoch": 0.6490959851103429, + "grad_norm": 1.2652825117111206, + "learning_rate": 1.572242550298298e-05, + "loss": 1.4195, + "step": 19530 + }, + { + "epoch": 0.6501262961978197, + "grad_norm": 1.3177101612091064, + "learning_rate": 1.56444926191065e-05, + "loss": 1.4307, + "step": 19561 + }, + { + "epoch": 0.6511566072852965, + "grad_norm": 1.2758272886276245, + "learning_rate": 1.5566665310859257e-05, + "loss": 1.4096, + "step": 19592 + }, + { + "epoch": 0.6521869183727732, + "grad_norm": 1.2265219688415527, + "learning_rate": 1.5488944456511846e-05, + "loss": 1.4098, + "step": 19623 + }, + { + "epoch": 0.65321722946025, + "grad_norm": 1.258945345878601, + "learning_rate": 1.5411330933133546e-05, + "loss": 1.4274, + "step": 19654 + }, + { + "epoch": 0.6542475405477267, + "grad_norm": 1.2599055767059326, + "learning_rate": 1.533382561658241e-05, + "loss": 1.4207, + "step": 19685 + }, + { + "epoch": 0.6552778516352034, + "grad_norm": 1.2502135038375854, + "learning_rate": 1.525642938149541e-05, + "loss": 1.4046, + "step": 19716 + }, + { + "epoch": 0.6563081627226801, + "grad_norm": 1.2734349966049194, + "learning_rate": 1.5179143101278536e-05, + "loss": 1.41, + "step": 19747 + }, + { + "epoch": 0.6573384738101569, + "grad_norm": 1.2801038026809692, + "learning_rate": 1.5101967648096955e-05, + "loss": 1.4088, + "step": 19778 + }, + { + "epoch": 0.6583687848976336, + "grad_norm": 1.2488126754760742, + "learning_rate": 1.5024903892865172e-05, + "loss": 1.4111, + "step": 19809 + }, + { + "epoch": 0.6593990959851104, + "grad_norm": 1.2418783903121948, + "learning_rate": 1.4947952705237184e-05, + "loss": 1.384, + "step": 19840 + }, + { + "epoch": 0.6604294070725871, + "grad_norm": 1.2566567659378052, + "learning_rate": 1.4871114953596682e-05, + "loss": 1.4127, + "step": 19871 + }, + { + "epoch": 0.6614597181600638, + "grad_norm": 1.2431600093841553, + "learning_rate": 1.4794391505047256e-05, + "loss": 1.4015, + "step": 19902 + }, + { + "epoch": 0.6624900292475405, + "grad_norm": 1.3174066543579102, + "learning_rate": 1.4717783225402596e-05, + "loss": 1.4113, + "step": 19933 + }, + { + "epoch": 0.6635203403350173, + "grad_norm": 1.3124332427978516, + "learning_rate": 1.4641290979176735e-05, + "loss": 1.421, + "step": 19964 + }, + { + "epoch": 0.664550651422494, + "grad_norm": 1.2595762014389038, + "learning_rate": 1.4564915629574246e-05, + "loss": 1.409, + "step": 19995 + }, + { + "epoch": 0.6655809625099708, + "grad_norm": 1.2872180938720703, + "learning_rate": 1.4488658038480601e-05, + "loss": 1.4082, + "step": 20026 + }, + { + "epoch": 0.6666112735974475, + "grad_norm": 1.27680242061615, + "learning_rate": 1.4412519066452323e-05, + "loss": 1.3979, + "step": 20057 + }, + { + "epoch": 0.6676415846849242, + "grad_norm": 1.2753857374191284, + "learning_rate": 1.4336499572707373e-05, + "loss": 1.4227, + "step": 20088 + }, + { + "epoch": 0.6686718957724009, + "grad_norm": 1.2680202722549438, + "learning_rate": 1.4260600415115433e-05, + "loss": 1.418, + "step": 20119 + }, + { + "epoch": 0.6697022068598777, + "grad_norm": 1.3002320528030396, + "learning_rate": 1.4184822450188137e-05, + "loss": 1.4133, + "step": 20150 + }, + { + "epoch": 0.6707325179473544, + "grad_norm": 1.3236373662948608, + "learning_rate": 1.410916653306954e-05, + "loss": 1.4133, + "step": 20181 + }, + { + "epoch": 0.6717628290348312, + "grad_norm": 1.3784340620040894, + "learning_rate": 1.403363351752639e-05, + "loss": 1.4064, + "step": 20212 + }, + { + "epoch": 0.6727931401223078, + "grad_norm": 1.2793350219726562, + "learning_rate": 1.3958224255938485e-05, + "loss": 1.4203, + "step": 20243 + }, + { + "epoch": 0.6738234512097846, + "grad_norm": 1.3510205745697021, + "learning_rate": 1.388293959928911e-05, + "loss": 1.418, + "step": 20274 + }, + { + "epoch": 0.6748537622972614, + "grad_norm": 1.2981188297271729, + "learning_rate": 1.3807780397155379e-05, + "loss": 1.4019, + "step": 20305 + }, + { + "epoch": 0.6758840733847381, + "grad_norm": 1.2599388360977173, + "learning_rate": 1.3732747497698655e-05, + "loss": 1.4187, + "step": 20336 + }, + { + "epoch": 0.6769143844722149, + "grad_norm": 1.2741434574127197, + "learning_rate": 1.3657841747655038e-05, + "loss": 1.4183, + "step": 20367 + }, + { + "epoch": 0.6779446955596916, + "grad_norm": 1.2376216650009155, + "learning_rate": 1.3583063992325706e-05, + "loss": 1.4208, + "step": 20398 + }, + { + "epoch": 0.6789750066471683, + "grad_norm": 1.341134786605835, + "learning_rate": 1.3508415075567496e-05, + "loss": 1.4015, + "step": 20429 + }, + { + "epoch": 0.680005317734645, + "grad_norm": 1.3483457565307617, + "learning_rate": 1.343389583978327e-05, + "loss": 1.4043, + "step": 20460 + }, + { + "epoch": 0.6810356288221218, + "grad_norm": 1.3255680799484253, + "learning_rate": 1.3359507125912468e-05, + "loss": 1.4162, + "step": 20491 + }, + { + "epoch": 0.6820659399095985, + "grad_norm": 1.211305022239685, + "learning_rate": 1.3285249773421627e-05, + "loss": 1.4043, + "step": 20522 + }, + { + "epoch": 0.6830962509970753, + "grad_norm": 1.3049174547195435, + "learning_rate": 1.3211124620294884e-05, + "loss": 1.4012, + "step": 20553 + }, + { + "epoch": 0.684126562084552, + "grad_norm": 1.2884812355041504, + "learning_rate": 1.313713250302451e-05, + "loss": 1.419, + "step": 20584 + }, + { + "epoch": 0.6851568731720287, + "grad_norm": 1.2465201616287231, + "learning_rate": 1.3063274256601479e-05, + "loss": 1.394, + "step": 20615 + }, + { + "epoch": 0.6861871842595054, + "grad_norm": 1.2868762016296387, + "learning_rate": 1.2989550714506086e-05, + "loss": 1.3975, + "step": 20646 + }, + { + "epoch": 0.6872174953469822, + "grad_norm": 1.2728379964828491, + "learning_rate": 1.291596270869846e-05, + "loss": 1.3918, + "step": 20677 + }, + { + "epoch": 0.6882478064344589, + "grad_norm": 1.265869379043579, + "learning_rate": 1.284251106960927e-05, + "loss": 1.402, + "step": 20708 + }, + { + "epoch": 0.6892781175219357, + "grad_norm": 1.3357373476028442, + "learning_rate": 1.2769196626130263e-05, + "loss": 1.3975, + "step": 20739 + }, + { + "epoch": 0.6903084286094124, + "grad_norm": 1.216797947883606, + "learning_rate": 1.2696020205604969e-05, + "loss": 1.3953, + "step": 20770 + }, + { + "epoch": 0.6913387396968891, + "grad_norm": 1.269227385520935, + "learning_rate": 1.2622982633819359e-05, + "loss": 1.4154, + "step": 20801 + }, + { + "epoch": 0.6923690507843658, + "grad_norm": 1.3336331844329834, + "learning_rate": 1.2550084734992484e-05, + "loss": 1.3992, + "step": 20832 + }, + { + "epoch": 0.6933993618718426, + "grad_norm": 1.2936463356018066, + "learning_rate": 1.247732733176724e-05, + "loss": 1.4147, + "step": 20863 + }, + { + "epoch": 0.6944296729593193, + "grad_norm": 1.344826102256775, + "learning_rate": 1.2404711245201044e-05, + "loss": 1.3878, + "step": 20894 + }, + { + "epoch": 0.6954599840467961, + "grad_norm": 1.2611995935440063, + "learning_rate": 1.2332237294756535e-05, + "loss": 1.4088, + "step": 20925 + }, + { + "epoch": 0.6964902951342729, + "grad_norm": 1.3274885416030884, + "learning_rate": 1.225990629829241e-05, + "loss": 1.4036, + "step": 20956 + }, + { + "epoch": 0.6975206062217495, + "grad_norm": 1.2847373485565186, + "learning_rate": 1.2187719072054136e-05, + "loss": 1.398, + "step": 20987 + }, + { + "epoch": 0.6985509173092262, + "grad_norm": 1.2856248617172241, + "learning_rate": 1.2115676430664735e-05, + "loss": 1.4101, + "step": 21018 + }, + { + "epoch": 0.699581228396703, + "grad_norm": 1.3064154386520386, + "learning_rate": 1.2043779187115647e-05, + "loss": 1.4081, + "step": 21049 + }, + { + "epoch": 0.7006115394841798, + "grad_norm": 1.253602147102356, + "learning_rate": 1.1972028152757476e-05, + "loss": 1.4123, + "step": 21080 + }, + { + "epoch": 0.7016418505716565, + "grad_norm": 1.2678899765014648, + "learning_rate": 1.1900424137290889e-05, + "loss": 1.3969, + "step": 21111 + }, + { + "epoch": 0.7026721616591332, + "grad_norm": 1.2261760234832764, + "learning_rate": 1.1828967948757482e-05, + "loss": 1.4009, + "step": 21142 + }, + { + "epoch": 0.7037024727466099, + "grad_norm": 1.540486216545105, + "learning_rate": 1.175766039353062e-05, + "loss": 1.4215, + "step": 21173 + }, + { + "epoch": 0.7047327838340867, + "grad_norm": 1.2508059740066528, + "learning_rate": 1.1686502276306382e-05, + "loss": 1.4046, + "step": 21204 + }, + { + "epoch": 0.7057630949215634, + "grad_norm": 1.2918591499328613, + "learning_rate": 1.1615494400094445e-05, + "loss": 1.4301, + "step": 21235 + }, + { + "epoch": 0.7067934060090402, + "grad_norm": 1.240178108215332, + "learning_rate": 1.1544637566209029e-05, + "loss": 1.3888, + "step": 21266 + }, + { + "epoch": 0.7078237170965169, + "grad_norm": 1.2358977794647217, + "learning_rate": 1.1473932574259886e-05, + "loss": 1.415, + "step": 21297 + }, + { + "epoch": 0.7088540281839936, + "grad_norm": 1.2963451147079468, + "learning_rate": 1.1403380222143247e-05, + "loss": 1.4002, + "step": 21328 + }, + { + "epoch": 0.7098843392714703, + "grad_norm": 1.3245363235473633, + "learning_rate": 1.1332981306032808e-05, + "loss": 1.3945, + "step": 21359 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5847615378155897e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-21364/training_args.bin b/checkpoint-21364/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..974208468b82a3c5684aaa384776477cf21c18ca --- /dev/null +++ b/checkpoint-21364/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a23be0ff07d6d3142f7c0980f91dddba845519c24fcb411cbb4b9ddb1513ff +size 5304 diff --git a/checkpoint-24416/config.json b/checkpoint-24416/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aaa74176892d42e1c7f5979b7ddf8ab15985d3 --- /dev/null +++ b/checkpoint-24416/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "/mnt/parscratch/users/acp23ay/private/models/Llama-3.1-8B-Instruct-ta-madlad-mean/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 138256 +} diff --git a/checkpoint-24416/generation_config.json b/checkpoint-24416/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-24416/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-24416/model-00001-of-00007.safetensors b/checkpoint-24416/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1213ab9d1bfcf6348311bb54b64c12faf103674b --- /dev/null +++ b/checkpoint-24416/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5aec21cfd7dfe441e013d476d8347477fc7eaf4ede85de62fb93f7c99239e43 +size 4983197184 diff --git a/checkpoint-24416/model-00002-of-00007.safetensors b/checkpoint-24416/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d32d83cb96a109af411b9cf577e7fbfe07ea76fc --- /dev/null +++ b/checkpoint-24416/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168629c67732309d49f60a5ec48a6d160212bc987365e82e183dfbf74ba0c1f3 +size 4899116432 diff --git a/checkpoint-24416/model-00003-of-00007.safetensors b/checkpoint-24416/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-24416/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-24416/model-00004-of-00007.safetensors b/checkpoint-24416/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-24416/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-24416/model-00005-of-00007.safetensors b/checkpoint-24416/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-24416/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-24416/model-00006-of-00007.safetensors b/checkpoint-24416/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..82b9f26c6d2212ec2f89fb9e093d94af89228fad --- /dev/null +++ b/checkpoint-24416/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c298ea2edffdac654f6f104ece2192862cc2d2ce08c007f2f990395cfd5eabcc +size 4999813120 diff --git a/checkpoint-24416/model-00007-of-00007.safetensors b/checkpoint-24416/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a6640d38744a2e71b931e18e72fbc281d292790f --- /dev/null +++ b/checkpoint-24416/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81c325feabc04b61226d770f3dfdec89db0ca36cd69919e6cde0e9f86d905230 +size 2734998184 diff --git a/checkpoint-24416/model.safetensors.index.json b/checkpoint-24416/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..318803c6a3dd771c7f7c3b8038a896af7c8322ae --- /dev/null +++ b/checkpoint-24416/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32448724992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-24416/optimizer.pt b/checkpoint-24416/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0dbf2db3b913591c3e98af5737d5f6b92129a8d --- /dev/null +++ b/checkpoint-24416/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea1ea845f247cc04a841bceca64d6feab88fa5fab970bbb6efdd0ec23e23c956 +size 16040396334 diff --git a/checkpoint-24416/rng_state.pth b/checkpoint-24416/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-24416/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-24416/scheduler.pt b/checkpoint-24416/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c42ab2446b20c095538f06fcf92f01ac58007a07 --- /dev/null +++ b/checkpoint-24416/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:719f421c0e2563868e52a38d7c300a4ceee2dbf15648505f514dae6bb8a5e723 +size 1064 diff --git a/checkpoint-24416/trainer_state.json b/checkpoint-24416/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..23a24f0cbc712674a827493f6c84aa846c566477 --- /dev/null +++ b/checkpoint-24416/trainer_state.json @@ -0,0 +1,5542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.811486306833289, + "eval_steps": 500, + "global_step": 24416, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001030311087476735, + "grad_norm": 60.25547409057617, + "learning_rate": 1.0157273918741808e-06, + "loss": 8.8455, + "step": 31 + }, + { + "epoch": 0.00206062217495347, + "grad_norm": 15.669363975524902, + "learning_rate": 2.0314547837483616e-06, + "loss": 7.1553, + "step": 62 + }, + { + "epoch": 0.003090933262430205, + "grad_norm": 15.366345405578613, + "learning_rate": 3.0471821756225426e-06, + "loss": 5.8784, + "step": 93 + }, + { + "epoch": 0.00412124434990694, + "grad_norm": 36.30561828613281, + "learning_rate": 4.062909567496723e-06, + "loss": 4.7708, + "step": 124 + }, + { + "epoch": 0.005151555437383675, + "grad_norm": 27.202678680419922, + "learning_rate": 5.078636959370905e-06, + "loss": 4.1629, + "step": 155 + }, + { + "epoch": 0.00618186652486041, + "grad_norm": 24.30484962463379, + "learning_rate": 6.094364351245085e-06, + "loss": 3.867, + "step": 186 + }, + { + "epoch": 0.007212177612337145, + "grad_norm": 19.916366577148438, + "learning_rate": 7.110091743119267e-06, + "loss": 3.6131, + "step": 217 + }, + { + "epoch": 0.00824248869981388, + "grad_norm": 17.577274322509766, + "learning_rate": 8.125819134993446e-06, + "loss": 3.4772, + "step": 248 + }, + { + "epoch": 0.009272799787290615, + "grad_norm": 12.133190155029297, + "learning_rate": 9.141546526867629e-06, + "loss": 3.3218, + "step": 279 + }, + { + "epoch": 0.01030311087476735, + "grad_norm": 19.79263687133789, + "learning_rate": 1.015727391874181e-05, + "loss": 3.2055, + "step": 310 + }, + { + "epoch": 0.011333421962244085, + "grad_norm": 16.38133430480957, + "learning_rate": 1.117300131061599e-05, + "loss": 3.1062, + "step": 341 + }, + { + "epoch": 0.01236373304972082, + "grad_norm": 12.638299942016602, + "learning_rate": 1.218872870249017e-05, + "loss": 3.0106, + "step": 372 + }, + { + "epoch": 0.013394044137197554, + "grad_norm": 9.46596908569336, + "learning_rate": 1.3204456094364351e-05, + "loss": 2.924, + "step": 403 + }, + { + "epoch": 0.01442435522467429, + "grad_norm": 10.945392608642578, + "learning_rate": 1.4220183486238533e-05, + "loss": 2.844, + "step": 434 + }, + { + "epoch": 0.015454666312151024, + "grad_norm": 8.474015235900879, + "learning_rate": 1.5235910878112714e-05, + "loss": 2.7892, + "step": 465 + }, + { + "epoch": 0.01648497739962776, + "grad_norm": 9.370804786682129, + "learning_rate": 1.6251638269986893e-05, + "loss": 2.7509, + "step": 496 + }, + { + "epoch": 0.017515288487104493, + "grad_norm": 11.63398551940918, + "learning_rate": 1.7267365661861077e-05, + "loss": 2.6999, + "step": 527 + }, + { + "epoch": 0.01854559957458123, + "grad_norm": 9.17713451385498, + "learning_rate": 1.8283093053735257e-05, + "loss": 2.6459, + "step": 558 + }, + { + "epoch": 0.019575910662057962, + "grad_norm": 7.119054794311523, + "learning_rate": 1.9298820445609438e-05, + "loss": 2.603, + "step": 589 + }, + { + "epoch": 0.0206062217495347, + "grad_norm": 6.653646945953369, + "learning_rate": 2.031454783748362e-05, + "loss": 2.5588, + "step": 620 + }, + { + "epoch": 0.021636532837011432, + "grad_norm": 8.332653045654297, + "learning_rate": 2.13302752293578e-05, + "loss": 2.5357, + "step": 651 + }, + { + "epoch": 0.02266684392448817, + "grad_norm": 6.4949116706848145, + "learning_rate": 2.234600262123198e-05, + "loss": 2.4967, + "step": 682 + }, + { + "epoch": 0.023697155011964902, + "grad_norm": 9.41009521484375, + "learning_rate": 2.336173001310616e-05, + "loss": 2.4563, + "step": 713 + }, + { + "epoch": 0.02472746609944164, + "grad_norm": 7.840345859527588, + "learning_rate": 2.437745740498034e-05, + "loss": 2.4383, + "step": 744 + }, + { + "epoch": 0.025757777186918372, + "grad_norm": 6.116458415985107, + "learning_rate": 2.5393184796854525e-05, + "loss": 2.3817, + "step": 775 + }, + { + "epoch": 0.02678808827439511, + "grad_norm": 5.938300609588623, + "learning_rate": 2.6408912188728702e-05, + "loss": 2.3508, + "step": 806 + }, + { + "epoch": 0.027818399361871842, + "grad_norm": 5.4408345222473145, + "learning_rate": 2.7424639580602886e-05, + "loss": 2.3325, + "step": 837 + }, + { + "epoch": 0.02884871044934858, + "grad_norm": 5.375136375427246, + "learning_rate": 2.8440366972477066e-05, + "loss": 2.3101, + "step": 868 + }, + { + "epoch": 0.029879021536825312, + "grad_norm": 5.149726867675781, + "learning_rate": 2.9456094364351244e-05, + "loss": 2.282, + "step": 899 + }, + { + "epoch": 0.03090933262430205, + "grad_norm": 4.591221332550049, + "learning_rate": 3.0471821756225428e-05, + "loss": 2.2427, + "step": 930 + }, + { + "epoch": 0.031939643711778785, + "grad_norm": 4.977034091949463, + "learning_rate": 3.148754914809961e-05, + "loss": 2.2218, + "step": 961 + }, + { + "epoch": 0.03296995479925552, + "grad_norm": 5.038781642913818, + "learning_rate": 3.2503276539973785e-05, + "loss": 2.2044, + "step": 992 + }, + { + "epoch": 0.03400026588673225, + "grad_norm": 4.872281551361084, + "learning_rate": 3.351900393184797e-05, + "loss": 2.1657, + "step": 1023 + }, + { + "epoch": 0.035030576974208985, + "grad_norm": 4.370841979980469, + "learning_rate": 3.453473132372215e-05, + "loss": 2.1365, + "step": 1054 + }, + { + "epoch": 0.036060888061685725, + "grad_norm": 4.087072849273682, + "learning_rate": 3.555045871559633e-05, + "loss": 2.1253, + "step": 1085 + }, + { + "epoch": 0.03709119914916246, + "grad_norm": 4.113957405090332, + "learning_rate": 3.6566186107470514e-05, + "loss": 2.0973, + "step": 1116 + }, + { + "epoch": 0.03812151023663919, + "grad_norm": 4.0119733810424805, + "learning_rate": 3.7581913499344695e-05, + "loss": 2.1024, + "step": 1147 + }, + { + "epoch": 0.039151821324115925, + "grad_norm": 4.247573375701904, + "learning_rate": 3.8597640891218876e-05, + "loss": 2.0722, + "step": 1178 + }, + { + "epoch": 0.04018213241159266, + "grad_norm": 3.5575129985809326, + "learning_rate": 3.9613368283093056e-05, + "loss": 2.056, + "step": 1209 + }, + { + "epoch": 0.0412124434990694, + "grad_norm": 3.8885862827301025, + "learning_rate": 4.062909567496724e-05, + "loss": 2.0389, + "step": 1240 + }, + { + "epoch": 0.04224275458654613, + "grad_norm": 3.680628538131714, + "learning_rate": 4.164482306684142e-05, + "loss": 2.0385, + "step": 1271 + }, + { + "epoch": 0.043273065674022865, + "grad_norm": 3.780876874923706, + "learning_rate": 4.26605504587156e-05, + "loss": 2.0097, + "step": 1302 + }, + { + "epoch": 0.0443033767614996, + "grad_norm": 4.235328674316406, + "learning_rate": 4.367627785058978e-05, + "loss": 2.0024, + "step": 1333 + }, + { + "epoch": 0.04533368784897634, + "grad_norm": 3.326941967010498, + "learning_rate": 4.469200524246396e-05, + "loss": 1.9953, + "step": 1364 + }, + { + "epoch": 0.04636399893645307, + "grad_norm": 3.28456449508667, + "learning_rate": 4.570773263433814e-05, + "loss": 1.9579, + "step": 1395 + }, + { + "epoch": 0.047394310023929805, + "grad_norm": 16.107433319091797, + "learning_rate": 4.672346002621232e-05, + "loss": 1.9701, + "step": 1426 + }, + { + "epoch": 0.04842462111140654, + "grad_norm": 3.5708224773406982, + "learning_rate": 4.77391874180865e-05, + "loss": 1.9621, + "step": 1457 + }, + { + "epoch": 0.04945493219888328, + "grad_norm": 2.9053499698638916, + "learning_rate": 4.875491480996068e-05, + "loss": 1.9458, + "step": 1488 + }, + { + "epoch": 0.05048524328636001, + "grad_norm": 3.0863258838653564, + "learning_rate": 4.977064220183487e-05, + "loss": 1.9483, + "step": 1519 + }, + { + "epoch": 0.051515554373836744, + "grad_norm": 2.9012269973754883, + "learning_rate": 4.9999915451558777e-05, + "loss": 1.928, + "step": 1550 + }, + { + "epoch": 0.05254586546131348, + "grad_norm": 3.0949041843414307, + "learning_rate": 4.999955597496219e-05, + "loss": 1.9229, + "step": 1581 + }, + { + "epoch": 0.05357617654879022, + "grad_norm": 2.8687901496887207, + "learning_rate": 4.9998914381774255e-05, + "loss": 1.915, + "step": 1612 + }, + { + "epoch": 0.05460648763626695, + "grad_norm": 3.2136878967285156, + "learning_rate": 4.999799067923527e-05, + "loss": 1.9197, + "step": 1643 + }, + { + "epoch": 0.055636798723743684, + "grad_norm": 2.590843677520752, + "learning_rate": 4.999678487776908e-05, + "loss": 1.8756, + "step": 1674 + }, + { + "epoch": 0.05666710981122042, + "grad_norm": 2.64634108543396, + "learning_rate": 4.9995296990983006e-05, + "loss": 1.9033, + "step": 1705 + }, + { + "epoch": 0.05769742089869716, + "grad_norm": 3.0151331424713135, + "learning_rate": 4.999352703566763e-05, + "loss": 1.8883, + "step": 1736 + }, + { + "epoch": 0.05872773198617389, + "grad_norm": 2.526806354522705, + "learning_rate": 4.999147503179668e-05, + "loss": 1.8666, + "step": 1767 + }, + { + "epoch": 0.059758043073650624, + "grad_norm": 2.510300397872925, + "learning_rate": 4.998914100252672e-05, + "loss": 1.854, + "step": 1798 + }, + { + "epoch": 0.06078835416112736, + "grad_norm": 2.4867682456970215, + "learning_rate": 4.998652497419696e-05, + "loss": 1.8548, + "step": 1829 + }, + { + "epoch": 0.0618186652486041, + "grad_norm": 2.3920586109161377, + "learning_rate": 4.9983626976328927e-05, + "loss": 1.8495, + "step": 1860 + }, + { + "epoch": 0.06284897633608083, + "grad_norm": 2.714177370071411, + "learning_rate": 4.998044704162613e-05, + "loss": 1.8433, + "step": 1891 + }, + { + "epoch": 0.06387928742355757, + "grad_norm": 2.3094465732574463, + "learning_rate": 4.9976985205973705e-05, + "loss": 1.8382, + "step": 1922 + }, + { + "epoch": 0.0649095985110343, + "grad_norm": 2.47184419631958, + "learning_rate": 4.997324150843799e-05, + "loss": 1.8464, + "step": 1953 + }, + { + "epoch": 0.06593990959851104, + "grad_norm": 2.391841411590576, + "learning_rate": 4.99692159912661e-05, + "loss": 1.8179, + "step": 1984 + }, + { + "epoch": 0.06697022068598776, + "grad_norm": 2.2471864223480225, + "learning_rate": 4.996490869988546e-05, + "loss": 1.8149, + "step": 2015 + }, + { + "epoch": 0.0680005317734645, + "grad_norm": 2.5497376918792725, + "learning_rate": 4.996031968290326e-05, + "loss": 1.8099, + "step": 2046 + }, + { + "epoch": 0.06903084286094124, + "grad_norm": 2.330463409423828, + "learning_rate": 4.995544899210594e-05, + "loss": 1.8267, + "step": 2077 + }, + { + "epoch": 0.07006115394841797, + "grad_norm": 2.3259341716766357, + "learning_rate": 4.9950296682458583e-05, + "loss": 1.7801, + "step": 2108 + }, + { + "epoch": 0.07109146503589471, + "grad_norm": 2.1711952686309814, + "learning_rate": 4.994486281210429e-05, + "loss": 1.7961, + "step": 2139 + }, + { + "epoch": 0.07212177612337145, + "grad_norm": 2.1808884143829346, + "learning_rate": 4.9939147442363566e-05, + "loss": 1.8109, + "step": 2170 + }, + { + "epoch": 0.07315208721084818, + "grad_norm": 2.089256525039673, + "learning_rate": 4.9933150637733574e-05, + "loss": 1.8026, + "step": 2201 + }, + { + "epoch": 0.07418239829832492, + "grad_norm": 2.0864951610565186, + "learning_rate": 4.992687246588743e-05, + "loss": 1.7753, + "step": 2232 + }, + { + "epoch": 0.07521270938580164, + "grad_norm": 2.36157488822937, + "learning_rate": 4.992031299767347e-05, + "loss": 1.7746, + "step": 2263 + }, + { + "epoch": 0.07624302047327838, + "grad_norm": 2.5334439277648926, + "learning_rate": 4.9913472307114386e-05, + "loss": 1.7927, + "step": 2294 + }, + { + "epoch": 0.07727333156075512, + "grad_norm": 2.2565715312957764, + "learning_rate": 4.9906350471406446e-05, + "loss": 1.7668, + "step": 2325 + }, + { + "epoch": 0.07830364264823185, + "grad_norm": 2.1043128967285156, + "learning_rate": 4.989894757091861e-05, + "loss": 1.7771, + "step": 2356 + }, + { + "epoch": 0.07933395373570859, + "grad_norm": 1.9659819602966309, + "learning_rate": 4.989126368919158e-05, + "loss": 1.7666, + "step": 2387 + }, + { + "epoch": 0.08036426482318532, + "grad_norm": 2.0778403282165527, + "learning_rate": 4.988329891293693e-05, + "loss": 1.7405, + "step": 2418 + }, + { + "epoch": 0.08139457591066206, + "grad_norm": 2.1767923831939697, + "learning_rate": 4.987505333203608e-05, + "loss": 1.7495, + "step": 2449 + }, + { + "epoch": 0.0824248869981388, + "grad_norm": 2.260143280029297, + "learning_rate": 4.9866527039539276e-05, + "loss": 1.7504, + "step": 2480 + }, + { + "epoch": 0.08345519808561552, + "grad_norm": 2.18271803855896, + "learning_rate": 4.9857720131664594e-05, + "loss": 1.7456, + "step": 2511 + }, + { + "epoch": 0.08448550917309226, + "grad_norm": 2.209594964981079, + "learning_rate": 4.9848632707796773e-05, + "loss": 1.7528, + "step": 2542 + }, + { + "epoch": 0.085515820260569, + "grad_norm": 2.0666229724884033, + "learning_rate": 4.9839264870486155e-05, + "loss": 1.7517, + "step": 2573 + }, + { + "epoch": 0.08654613134804573, + "grad_norm": 2.1070454120635986, + "learning_rate": 4.9829616725447526e-05, + "loss": 1.7474, + "step": 2604 + }, + { + "epoch": 0.08757644243552247, + "grad_norm": 1.9430303573608398, + "learning_rate": 4.981968838155888e-05, + "loss": 1.7348, + "step": 2635 + }, + { + "epoch": 0.0886067535229992, + "grad_norm": 1.9638925790786743, + "learning_rate": 4.980947995086024e-05, + "loss": 1.7202, + "step": 2666 + }, + { + "epoch": 0.08963706461047594, + "grad_norm": 1.8845652341842651, + "learning_rate": 4.979899154855234e-05, + "loss": 1.7375, + "step": 2697 + }, + { + "epoch": 0.09066737569795268, + "grad_norm": 5.712058067321777, + "learning_rate": 4.9788223292995386e-05, + "loss": 1.7379, + "step": 2728 + }, + { + "epoch": 0.0916976867854294, + "grad_norm": 1.9520670175552368, + "learning_rate": 4.977717530570768e-05, + "loss": 1.7302, + "step": 2759 + }, + { + "epoch": 0.09272799787290614, + "grad_norm": 1.8802224397659302, + "learning_rate": 4.976584771136425e-05, + "loss": 1.74, + "step": 2790 + }, + { + "epoch": 0.09375830896038288, + "grad_norm": 2.1098153591156006, + "learning_rate": 4.975424063779547e-05, + "loss": 1.7024, + "step": 2821 + }, + { + "epoch": 0.09478862004785961, + "grad_norm": 2.1568291187286377, + "learning_rate": 4.974235421598557e-05, + "loss": 1.7131, + "step": 2852 + }, + { + "epoch": 0.09581893113533635, + "grad_norm": 1.8769980669021606, + "learning_rate": 4.973018858007122e-05, + "loss": 1.7008, + "step": 2883 + }, + { + "epoch": 0.09684924222281308, + "grad_norm": 1.8325533866882324, + "learning_rate": 4.9717743867339963e-05, + "loss": 1.7058, + "step": 2914 + }, + { + "epoch": 0.09787955331028982, + "grad_norm": 2.086416721343994, + "learning_rate": 4.9705020218228695e-05, + "loss": 1.711, + "step": 2945 + }, + { + "epoch": 0.09890986439776656, + "grad_norm": 1.8294793367385864, + "learning_rate": 4.969201777632205e-05, + "loss": 1.6998, + "step": 2976 + }, + { + "epoch": 0.09994017548524328, + "grad_norm": 2.0608153343200684, + "learning_rate": 4.9678736688350846e-05, + "loss": 1.6948, + "step": 3007 + }, + { + "epoch": 0.10097048657272002, + "grad_norm": 3.2166008949279785, + "learning_rate": 4.966517710419033e-05, + "loss": 1.6788, + "step": 3038 + }, + { + "epoch": 0.10200079766019676, + "grad_norm": 1.9431313276290894, + "learning_rate": 4.965133917685858e-05, + "loss": 1.7115, + "step": 3069 + }, + { + "epoch": 0.10303110874767349, + "grad_norm": 1.967512845993042, + "learning_rate": 4.9637223062514714e-05, + "loss": 1.7033, + "step": 3100 + }, + { + "epoch": 0.10406141983515023, + "grad_norm": 1.9253389835357666, + "learning_rate": 4.962282892045718e-05, + "loss": 1.6856, + "step": 3131 + }, + { + "epoch": 0.10509173092262696, + "grad_norm": 1.986840009689331, + "learning_rate": 4.9608156913121904e-05, + "loss": 1.723, + "step": 3162 + }, + { + "epoch": 0.1061220420101037, + "grad_norm": 1.83523690700531, + "learning_rate": 4.959320720608049e-05, + "loss": 1.6912, + "step": 3193 + }, + { + "epoch": 0.10715235309758044, + "grad_norm": 2.1271955966949463, + "learning_rate": 4.9577979968038354e-05, + "loss": 1.7032, + "step": 3224 + }, + { + "epoch": 0.10818266418505716, + "grad_norm": 1.8383768796920776, + "learning_rate": 4.956247537083282e-05, + "loss": 1.6726, + "step": 3255 + }, + { + "epoch": 0.1092129752725339, + "grad_norm": 1.8806651830673218, + "learning_rate": 4.9546693589431145e-05, + "loss": 1.6817, + "step": 3286 + }, + { + "epoch": 0.11024328636001064, + "grad_norm": 1.7535260915756226, + "learning_rate": 4.9530634801928595e-05, + "loss": 1.6875, + "step": 3317 + }, + { + "epoch": 0.11127359744748737, + "grad_norm": 1.765906810760498, + "learning_rate": 4.9514299189546395e-05, + "loss": 1.6859, + "step": 3348 + }, + { + "epoch": 0.11230390853496411, + "grad_norm": 1.869828462600708, + "learning_rate": 4.949768693662973e-05, + "loss": 1.6915, + "step": 3379 + }, + { + "epoch": 0.11333421962244083, + "grad_norm": 1.8347504138946533, + "learning_rate": 4.948079823064559e-05, + "loss": 1.6859, + "step": 3410 + }, + { + "epoch": 0.11436453070991758, + "grad_norm": 1.7692474126815796, + "learning_rate": 4.946363326218074e-05, + "loss": 1.6565, + "step": 3441 + }, + { + "epoch": 0.11539484179739432, + "grad_norm": 1.8231885433197021, + "learning_rate": 4.9446192224939525e-05, + "loss": 1.686, + "step": 3472 + }, + { + "epoch": 0.11642515288487104, + "grad_norm": 1.7155958414077759, + "learning_rate": 4.942847531574167e-05, + "loss": 1.6538, + "step": 3503 + }, + { + "epoch": 0.11745546397234778, + "grad_norm": 1.787183403968811, + "learning_rate": 4.941048273452008e-05, + "loss": 1.6776, + "step": 3534 + }, + { + "epoch": 0.11848577505982451, + "grad_norm": 1.741213083267212, + "learning_rate": 4.9392214684318605e-05, + "loss": 1.6784, + "step": 3565 + }, + { + "epoch": 0.11951608614730125, + "grad_norm": 1.7836824655532837, + "learning_rate": 4.93736713712897e-05, + "loss": 1.6557, + "step": 3596 + }, + { + "epoch": 0.12054639723477799, + "grad_norm": 1.7103859186172485, + "learning_rate": 4.9354853004692124e-05, + "loss": 1.6606, + "step": 3627 + }, + { + "epoch": 0.12157670832225471, + "grad_norm": 1.7865506410598755, + "learning_rate": 4.93357597968886e-05, + "loss": 1.6409, + "step": 3658 + }, + { + "epoch": 0.12260701940973145, + "grad_norm": 1.7770143747329712, + "learning_rate": 4.931639196334338e-05, + "loss": 1.6574, + "step": 3689 + }, + { + "epoch": 0.1236373304972082, + "grad_norm": 1.857575535774231, + "learning_rate": 4.9296749722619826e-05, + "loss": 1.6724, + "step": 3720 + }, + { + "epoch": 0.12466764158468492, + "grad_norm": 1.8742581605911255, + "learning_rate": 4.9276833296377966e-05, + "loss": 1.6506, + "step": 3751 + }, + { + "epoch": 0.12569795267216166, + "grad_norm": 1.827668309211731, + "learning_rate": 4.925664290937196e-05, + "loss": 1.6523, + "step": 3782 + }, + { + "epoch": 0.1267282637596384, + "grad_norm": 1.7517486810684204, + "learning_rate": 4.9236178789447576e-05, + "loss": 1.6459, + "step": 3813 + }, + { + "epoch": 0.12775857484711514, + "grad_norm": 1.8109570741653442, + "learning_rate": 4.921544116753962e-05, + "loss": 1.6614, + "step": 3844 + }, + { + "epoch": 0.12878888593459187, + "grad_norm": 1.692597508430481, + "learning_rate": 4.919443027766935e-05, + "loss": 1.6431, + "step": 3875 + }, + { + "epoch": 0.1298191970220686, + "grad_norm": 1.8650025129318237, + "learning_rate": 4.91731463569418e-05, + "loss": 1.6466, + "step": 3906 + }, + { + "epoch": 0.13084950810954532, + "grad_norm": 1.6794081926345825, + "learning_rate": 4.915158964554312e-05, + "loss": 1.6504, + "step": 3937 + }, + { + "epoch": 0.13187981919702207, + "grad_norm": 1.7685374021530151, + "learning_rate": 4.912976038673786e-05, + "loss": 1.6446, + "step": 3968 + }, + { + "epoch": 0.1329101302844988, + "grad_norm": 1.7601110935211182, + "learning_rate": 4.9107658826866254e-05, + "loss": 1.631, + "step": 3999 + }, + { + "epoch": 0.13394044137197553, + "grad_norm": 2.0616064071655273, + "learning_rate": 4.908528521534139e-05, + "loss": 1.6476, + "step": 4030 + }, + { + "epoch": 0.13497075245945228, + "grad_norm": 1.8973504304885864, + "learning_rate": 4.906263980464644e-05, + "loss": 1.6582, + "step": 4061 + }, + { + "epoch": 0.136001063546929, + "grad_norm": 1.7768895626068115, + "learning_rate": 4.903972285033178e-05, + "loss": 1.6159, + "step": 4092 + }, + { + "epoch": 0.13703137463440573, + "grad_norm": 1.8264424800872803, + "learning_rate": 4.901653461101213e-05, + "loss": 1.6289, + "step": 4123 + }, + { + "epoch": 0.1380616857218825, + "grad_norm": 1.7140119075775146, + "learning_rate": 4.8993075348363626e-05, + "loss": 1.6357, + "step": 4154 + }, + { + "epoch": 0.13909199680935921, + "grad_norm": 1.6964486837387085, + "learning_rate": 4.896934532712084e-05, + "loss": 1.6233, + "step": 4185 + }, + { + "epoch": 0.14012230789683594, + "grad_norm": 1.8008025884628296, + "learning_rate": 4.8945344815073846e-05, + "loss": 1.637, + "step": 4216 + }, + { + "epoch": 0.1411526189843127, + "grad_norm": 1.562730073928833, + "learning_rate": 4.892107408306516e-05, + "loss": 1.6379, + "step": 4247 + }, + { + "epoch": 0.14218293007178942, + "grad_norm": 1.8273371458053589, + "learning_rate": 4.889653340498669e-05, + "loss": 1.6246, + "step": 4278 + }, + { + "epoch": 0.14321324115926615, + "grad_norm": 56.33716583251953, + "learning_rate": 4.8871723057776664e-05, + "loss": 1.6457, + "step": 4309 + }, + { + "epoch": 0.1442435522467429, + "grad_norm": 1.746523380279541, + "learning_rate": 4.8846643321416476e-05, + "loss": 1.6343, + "step": 4340 + }, + { + "epoch": 0.14527386333421963, + "grad_norm": 1.7737531661987305, + "learning_rate": 4.882129447892753e-05, + "loss": 1.6447, + "step": 4371 + }, + { + "epoch": 0.14630417442169635, + "grad_norm": 1.660485863685608, + "learning_rate": 4.8795676816368076e-05, + "loss": 1.6192, + "step": 4402 + }, + { + "epoch": 0.14733448550917308, + "grad_norm": 1.6823406219482422, + "learning_rate": 4.876979062282995e-05, + "loss": 1.6253, + "step": 4433 + }, + { + "epoch": 0.14836479659664983, + "grad_norm": 7.78139066696167, + "learning_rate": 4.8743636190435325e-05, + "loss": 1.6234, + "step": 4464 + }, + { + "epoch": 0.14939510768412656, + "grad_norm": 1.7426058053970337, + "learning_rate": 4.871721381433344e-05, + "loss": 1.6337, + "step": 4495 + }, + { + "epoch": 0.1504254187716033, + "grad_norm": 1.6294783353805542, + "learning_rate": 4.869052379269719e-05, + "loss": 1.6217, + "step": 4526 + }, + { + "epoch": 0.15145572985908004, + "grad_norm": 1.6523306369781494, + "learning_rate": 4.866356642671985e-05, + "loss": 1.605, + "step": 4557 + }, + { + "epoch": 0.15248604094655677, + "grad_norm": 1.8571300506591797, + "learning_rate": 4.8636342020611634e-05, + "loss": 1.6218, + "step": 4588 + }, + { + "epoch": 0.1535163520340335, + "grad_norm": 1.7754936218261719, + "learning_rate": 4.860885088159626e-05, + "loss": 1.6171, + "step": 4619 + }, + { + "epoch": 0.15454666312151025, + "grad_norm": 1.91987943649292, + "learning_rate": 4.858109331990751e-05, + "loss": 1.6167, + "step": 4650 + }, + { + "epoch": 0.15557697420898697, + "grad_norm": 1.5994452238082886, + "learning_rate": 4.855306964878567e-05, + "loss": 1.5951, + "step": 4681 + }, + { + "epoch": 0.1566072852964637, + "grad_norm": 1.6490916013717651, + "learning_rate": 4.8524780184474084e-05, + "loss": 1.616, + "step": 4712 + }, + { + "epoch": 0.15763759638394045, + "grad_norm": 1.5921640396118164, + "learning_rate": 4.8496225246215496e-05, + "loss": 1.6346, + "step": 4743 + }, + { + "epoch": 0.15866790747141718, + "grad_norm": 1.6729261875152588, + "learning_rate": 4.8467405156248505e-05, + "loss": 1.6165, + "step": 4774 + }, + { + "epoch": 0.1596982185588939, + "grad_norm": 1.628113031387329, + "learning_rate": 4.843832023980392e-05, + "loss": 1.6119, + "step": 4805 + }, + { + "epoch": 0.16072852964637063, + "grad_norm": 1.651647925376892, + "learning_rate": 4.840897082510106e-05, + "loss": 1.5997, + "step": 4836 + }, + { + "epoch": 0.1617588407338474, + "grad_norm": 1.5297720432281494, + "learning_rate": 4.8379357243344084e-05, + "loss": 1.6242, + "step": 4867 + }, + { + "epoch": 0.1627891518213241, + "grad_norm": 1.5779869556427002, + "learning_rate": 4.8349479828718236e-05, + "loss": 1.6149, + "step": 4898 + }, + { + "epoch": 0.16381946290880084, + "grad_norm": 1.5843939781188965, + "learning_rate": 4.8319338918386075e-05, + "loss": 1.5926, + "step": 4929 + }, + { + "epoch": 0.1648497739962776, + "grad_norm": 2.3762106895446777, + "learning_rate": 4.828893485248369e-05, + "loss": 1.6108, + "step": 4960 + }, + { + "epoch": 0.16588008508375432, + "grad_norm": 1.5871953964233398, + "learning_rate": 4.825826797411682e-05, + "loss": 1.6103, + "step": 4991 + }, + { + "epoch": 0.16691039617123105, + "grad_norm": 1.5934125185012817, + "learning_rate": 4.822733862935702e-05, + "loss": 1.6091, + "step": 5022 + }, + { + "epoch": 0.1679407072587078, + "grad_norm": 1.6997628211975098, + "learning_rate": 4.819614716723775e-05, + "loss": 1.6098, + "step": 5053 + }, + { + "epoch": 0.16897101834618453, + "grad_norm": 1.682849645614624, + "learning_rate": 4.8164693939750425e-05, + "loss": 1.599, + "step": 5084 + }, + { + "epoch": 0.17000132943366125, + "grad_norm": 1.709743857383728, + "learning_rate": 4.813297930184042e-05, + "loss": 1.6194, + "step": 5115 + }, + { + "epoch": 0.171031640521138, + "grad_norm": 1.725879430770874, + "learning_rate": 4.810100361140314e-05, + "loss": 1.6115, + "step": 5146 + }, + { + "epoch": 0.17206195160861473, + "grad_norm": 1.6710290908813477, + "learning_rate": 4.8068767229279885e-05, + "loss": 1.6032, + "step": 5177 + }, + { + "epoch": 0.17309226269609146, + "grad_norm": 1.6156634092330933, + "learning_rate": 4.8036270519253854e-05, + "loss": 1.5973, + "step": 5208 + }, + { + "epoch": 0.1741225737835682, + "grad_norm": 1.5654059648513794, + "learning_rate": 4.8003513848046e-05, + "loss": 1.5817, + "step": 5239 + }, + { + "epoch": 0.17515288487104494, + "grad_norm": 1.5789822340011597, + "learning_rate": 4.79704975853109e-05, + "loss": 1.6138, + "step": 5270 + }, + { + "epoch": 0.17618319595852167, + "grad_norm": 1.6022037267684937, + "learning_rate": 4.793722210363262e-05, + "loss": 1.5998, + "step": 5301 + }, + { + "epoch": 0.1772135070459984, + "grad_norm": 1.5142741203308105, + "learning_rate": 4.7903687778520414e-05, + "loss": 1.6061, + "step": 5332 + }, + { + "epoch": 0.17824381813347515, + "grad_norm": 1.6454212665557861, + "learning_rate": 4.7869894988404593e-05, + "loss": 1.6063, + "step": 5363 + }, + { + "epoch": 0.17927412922095187, + "grad_norm": 1.5250823497772217, + "learning_rate": 4.783584411463221e-05, + "loss": 1.6038, + "step": 5394 + }, + { + "epoch": 0.1803044403084286, + "grad_norm": 1.5829335451126099, + "learning_rate": 4.780153554146274e-05, + "loss": 1.5949, + "step": 5425 + }, + { + "epoch": 0.18133475139590535, + "grad_norm": 1.5342432260513306, + "learning_rate": 4.7766969656063766e-05, + "loss": 1.5913, + "step": 5456 + }, + { + "epoch": 0.18236506248338208, + "grad_norm": 1.6397250890731812, + "learning_rate": 4.773214684850662e-05, + "loss": 1.6102, + "step": 5487 + }, + { + "epoch": 0.1833953735708588, + "grad_norm": 1.5228471755981445, + "learning_rate": 4.769706751176193e-05, + "loss": 1.5885, + "step": 5518 + }, + { + "epoch": 0.18442568465833556, + "grad_norm": 1.6186103820800781, + "learning_rate": 4.7661732041695264e-05, + "loss": 1.6086, + "step": 5549 + }, + { + "epoch": 0.18545599574581229, + "grad_norm": 1.6024582386016846, + "learning_rate": 4.762614083706258e-05, + "loss": 1.6004, + "step": 5580 + }, + { + "epoch": 0.186486306833289, + "grad_norm": 1.5443711280822754, + "learning_rate": 4.759029429950581e-05, + "loss": 1.6048, + "step": 5611 + }, + { + "epoch": 0.18751661792076577, + "grad_norm": 1.4831629991531372, + "learning_rate": 4.7554192833548235e-05, + "loss": 1.5841, + "step": 5642 + }, + { + "epoch": 0.1885469290082425, + "grad_norm": 1.6426068544387817, + "learning_rate": 4.751783684659e-05, + "loss": 1.587, + "step": 5673 + }, + { + "epoch": 0.18957724009571922, + "grad_norm": 1.4609078168869019, + "learning_rate": 4.748122674890348e-05, + "loss": 1.5945, + "step": 5704 + }, + { + "epoch": 0.19060755118319597, + "grad_norm": 1.5365614891052246, + "learning_rate": 4.7444362953628654e-05, + "loss": 1.5737, + "step": 5735 + }, + { + "epoch": 0.1916378622706727, + "grad_norm": 1.5755670070648193, + "learning_rate": 4.7407245876768424e-05, + "loss": 1.5862, + "step": 5766 + }, + { + "epoch": 0.19266817335814942, + "grad_norm": 1.6469846963882446, + "learning_rate": 4.736987593718397e-05, + "loss": 1.5663, + "step": 5797 + }, + { + "epoch": 0.19369848444562615, + "grad_norm": 1.5927278995513916, + "learning_rate": 4.733225355658999e-05, + "loss": 1.5776, + "step": 5828 + }, + { + "epoch": 0.1947287955331029, + "grad_norm": 1.5593287944793701, + "learning_rate": 4.7294379159549926e-05, + "loss": 1.579, + "step": 5859 + }, + { + "epoch": 0.19575910662057963, + "grad_norm": 1.534055233001709, + "learning_rate": 4.725625317347119e-05, + "loss": 1.6017, + "step": 5890 + }, + { + "epoch": 0.19678941770805636, + "grad_norm": 1.5846387147903442, + "learning_rate": 4.7217876028600374e-05, + "loss": 1.5739, + "step": 5921 + }, + { + "epoch": 0.1978197287955331, + "grad_norm": 1.5377682447433472, + "learning_rate": 4.717924815801832e-05, + "loss": 1.57, + "step": 5952 + }, + { + "epoch": 0.19885003988300984, + "grad_norm": 1.467956781387329, + "learning_rate": 4.714036999763532e-05, + "loss": 1.5736, + "step": 5983 + }, + { + "epoch": 0.19988035097048656, + "grad_norm": 1.601070523262024, + "learning_rate": 4.7101241986186116e-05, + "loss": 1.5861, + "step": 6014 + }, + { + "epoch": 0.20091066205796332, + "grad_norm": 1.5051921606063843, + "learning_rate": 4.7061864565225e-05, + "loss": 1.5735, + "step": 6045 + }, + { + "epoch": 0.20194097314544004, + "grad_norm": 1.462843418121338, + "learning_rate": 4.702223817912081e-05, + "loss": 1.582, + "step": 6076 + }, + { + "epoch": 0.20297128423291677, + "grad_norm": 1.5698682069778442, + "learning_rate": 4.698236327505195e-05, + "loss": 1.5647, + "step": 6107 + }, + { + "epoch": 0.20400159532039353, + "grad_norm": 1.5633916854858398, + "learning_rate": 4.694224030300127e-05, + "loss": 1.5741, + "step": 6138 + }, + { + "epoch": 0.20503190640787025, + "grad_norm": 1.6174733638763428, + "learning_rate": 4.690186971575107e-05, + "loss": 1.5634, + "step": 6169 + }, + { + "epoch": 0.20606221749534698, + "grad_norm": 1.4957518577575684, + "learning_rate": 4.6861251968877916e-05, + "loss": 1.575, + "step": 6200 + }, + { + "epoch": 0.2070925285828237, + "grad_norm": 1.670933485031128, + "learning_rate": 4.68203875207476e-05, + "loss": 1.5792, + "step": 6231 + }, + { + "epoch": 0.20812283967030046, + "grad_norm": 1.5676430463790894, + "learning_rate": 4.677927683250983e-05, + "loss": 1.5689, + "step": 6262 + }, + { + "epoch": 0.20915315075777718, + "grad_norm": 1.5753976106643677, + "learning_rate": 4.6737920368093156e-05, + "loss": 1.5594, + "step": 6293 + }, + { + "epoch": 0.2101834618452539, + "grad_norm": 1.4973617792129517, + "learning_rate": 4.669631859419965e-05, + "loss": 1.5593, + "step": 6324 + }, + { + "epoch": 0.21121377293273066, + "grad_norm": 1.4691433906555176, + "learning_rate": 4.6654471980299676e-05, + "loss": 1.5711, + "step": 6355 + }, + { + "epoch": 0.2122440840202074, + "grad_norm": 1.407630443572998, + "learning_rate": 4.661238099862658e-05, + "loss": 1.5787, + "step": 6386 + }, + { + "epoch": 0.21327439510768412, + "grad_norm": 1.5011677742004395, + "learning_rate": 4.657004612417138e-05, + "loss": 1.5751, + "step": 6417 + }, + { + "epoch": 0.21430470619516087, + "grad_norm": 1.509750485420227, + "learning_rate": 4.6527467834677374e-05, + "loss": 1.5583, + "step": 6448 + }, + { + "epoch": 0.2153350172826376, + "grad_norm": 1.3919882774353027, + "learning_rate": 4.648464661063478e-05, + "loss": 1.5712, + "step": 6479 + }, + { + "epoch": 0.21636532837011432, + "grad_norm": 1.4854936599731445, + "learning_rate": 4.6441582935275264e-05, + "loss": 1.5637, + "step": 6510 + }, + { + "epoch": 0.21739563945759108, + "grad_norm": 1.4413583278656006, + "learning_rate": 4.6398277294566586e-05, + "loss": 1.56, + "step": 6541 + }, + { + "epoch": 0.2184259505450678, + "grad_norm": 1.5063883066177368, + "learning_rate": 4.6354730177207e-05, + "loss": 1.5525, + "step": 6572 + }, + { + "epoch": 0.21945626163254453, + "grad_norm": 1.4899688959121704, + "learning_rate": 4.6310942074619787e-05, + "loss": 1.5817, + "step": 6603 + }, + { + "epoch": 0.22048657272002128, + "grad_norm": 1.3927967548370361, + "learning_rate": 4.626691348094777e-05, + "loss": 1.5407, + "step": 6634 + }, + { + "epoch": 0.221516883807498, + "grad_norm": 1.5378398895263672, + "learning_rate": 4.622264489304762e-05, + "loss": 1.5561, + "step": 6665 + }, + { + "epoch": 0.22254719489497474, + "grad_norm": 1.554624319076538, + "learning_rate": 4.617813681048434e-05, + "loss": 1.5859, + "step": 6696 + }, + { + "epoch": 0.22357750598245146, + "grad_norm": 1.5356658697128296, + "learning_rate": 4.61333897355256e-05, + "loss": 1.5531, + "step": 6727 + }, + { + "epoch": 0.22460781706992822, + "grad_norm": 1.5534918308258057, + "learning_rate": 4.608840417313604e-05, + "loss": 1.5774, + "step": 6758 + }, + { + "epoch": 0.22563812815740494, + "grad_norm": 1.5660988092422485, + "learning_rate": 4.6043180630971646e-05, + "loss": 1.5763, + "step": 6789 + }, + { + "epoch": 0.22666843924488167, + "grad_norm": 1.4993386268615723, + "learning_rate": 4.599771961937391e-05, + "loss": 1.5615, + "step": 6820 + }, + { + "epoch": 0.22769875033235842, + "grad_norm": 1.4630553722381592, + "learning_rate": 4.5952021651364204e-05, + "loss": 1.543, + "step": 6851 + }, + { + "epoch": 0.22872906141983515, + "grad_norm": 1.470173954963684, + "learning_rate": 4.590608724263786e-05, + "loss": 1.5674, + "step": 6882 + }, + { + "epoch": 0.22975937250731188, + "grad_norm": 1.5867971181869507, + "learning_rate": 4.585991691155845e-05, + "loss": 1.5702, + "step": 6913 + }, + { + "epoch": 0.23078968359478863, + "grad_norm": 1.44207763671875, + "learning_rate": 4.581351117915188e-05, + "loss": 1.5436, + "step": 6944 + }, + { + "epoch": 0.23181999468226536, + "grad_norm": 1.4691039323806763, + "learning_rate": 4.5766870569100534e-05, + "loss": 1.5465, + "step": 6975 + }, + { + "epoch": 0.23285030576974208, + "grad_norm": 1.4807918071746826, + "learning_rate": 4.571999560773736e-05, + "loss": 1.5564, + "step": 7006 + }, + { + "epoch": 0.23388061685721884, + "grad_norm": 1.481487512588501, + "learning_rate": 4.5672886824039915e-05, + "loss": 1.5466, + "step": 7037 + }, + { + "epoch": 0.23491092794469556, + "grad_norm": 1.4518013000488281, + "learning_rate": 4.5625544749624435e-05, + "loss": 1.5618, + "step": 7068 + }, + { + "epoch": 0.2359412390321723, + "grad_norm": 1.4186676740646362, + "learning_rate": 4.5577969918739794e-05, + "loss": 1.5528, + "step": 7099 + }, + { + "epoch": 0.23697155011964902, + "grad_norm": 1.5287110805511475, + "learning_rate": 4.5530162868261486e-05, + "loss": 1.5457, + "step": 7130 + }, + { + "epoch": 0.23800186120712577, + "grad_norm": 1.5516417026519775, + "learning_rate": 4.548212413768558e-05, + "loss": 1.5467, + "step": 7161 + }, + { + "epoch": 0.2390321722946025, + "grad_norm": 1.4710053205490112, + "learning_rate": 4.543385426912261e-05, + "loss": 1.5431, + "step": 7192 + }, + { + "epoch": 0.24006248338207922, + "grad_norm": 1.5005567073822021, + "learning_rate": 4.53853538072915e-05, + "loss": 1.5592, + "step": 7223 + }, + { + "epoch": 0.24109279446955598, + "grad_norm": 1.5864965915679932, + "learning_rate": 4.533662329951336e-05, + "loss": 1.5694, + "step": 7254 + }, + { + "epoch": 0.2421231055570327, + "grad_norm": 1.4661896228790283, + "learning_rate": 4.528766329570536e-05, + "loss": 1.545, + "step": 7285 + }, + { + "epoch": 0.24315341664450943, + "grad_norm": 1.5157560110092163, + "learning_rate": 4.523847434837447e-05, + "loss": 1.5458, + "step": 7316 + }, + { + "epoch": 0.24418372773198618, + "grad_norm": 1.4033585786819458, + "learning_rate": 4.518905701261128e-05, + "loss": 1.5464, + "step": 7347 + }, + { + "epoch": 0.2452140388194629, + "grad_norm": 1.5357593297958374, + "learning_rate": 4.5139411846083715e-05, + "loss": 1.5497, + "step": 7378 + }, + { + "epoch": 0.24624434990693964, + "grad_norm": 1.419507384300232, + "learning_rate": 4.508953940903073e-05, + "loss": 1.5414, + "step": 7409 + }, + { + "epoch": 0.2472746609944164, + "grad_norm": 1.5201773643493652, + "learning_rate": 4.5039440264255994e-05, + "loss": 1.5503, + "step": 7440 + }, + { + "epoch": 0.24830497208189312, + "grad_norm": 1.8000444173812866, + "learning_rate": 4.498911497712155e-05, + "loss": 1.5448, + "step": 7471 + }, + { + "epoch": 0.24933528316936984, + "grad_norm": 1.4876810312271118, + "learning_rate": 4.493856411554142e-05, + "loss": 1.5524, + "step": 7502 + }, + { + "epoch": 0.25036559425684657, + "grad_norm": 1.5130078792572021, + "learning_rate": 4.4887788249975206e-05, + "loss": 1.5454, + "step": 7533 + }, + { + "epoch": 0.2513959053443233, + "grad_norm": 1.4829351902008057, + "learning_rate": 4.4836787953421656e-05, + "loss": 1.5407, + "step": 7564 + }, + { + "epoch": 0.2524262164318001, + "grad_norm": 1.521550178527832, + "learning_rate": 4.478556380141218e-05, + "loss": 1.5727, + "step": 7595 + }, + { + "epoch": 0.2534565275192768, + "grad_norm": 1.4377928972244263, + "learning_rate": 4.4734116372004375e-05, + "loss": 1.5432, + "step": 7626 + }, + { + "epoch": 0.25448683860675353, + "grad_norm": 1.4101744890213013, + "learning_rate": 4.4682446245775477e-05, + "loss": 1.547, + "step": 7657 + }, + { + "epoch": 0.2555171496942303, + "grad_norm": 1.522524356842041, + "learning_rate": 4.463055400581586e-05, + "loss": 1.5418, + "step": 7688 + }, + { + "epoch": 0.256547460781707, + "grad_norm": 1.4160797595977783, + "learning_rate": 4.4578440237722374e-05, + "loss": 1.5457, + "step": 7719 + }, + { + "epoch": 0.25757777186918374, + "grad_norm": 1.4106636047363281, + "learning_rate": 4.452610552959183e-05, + "loss": 1.5405, + "step": 7750 + }, + { + "epoch": 0.2586080829566605, + "grad_norm": 1.422723650932312, + "learning_rate": 4.447355047201428e-05, + "loss": 1.5423, + "step": 7781 + }, + { + "epoch": 0.2596383940441372, + "grad_norm": 1.4362592697143555, + "learning_rate": 4.4420775658066414e-05, + "loss": 1.5372, + "step": 7812 + }, + { + "epoch": 0.26066870513161394, + "grad_norm": 1.4319696426391602, + "learning_rate": 4.436778168330484e-05, + "loss": 1.5451, + "step": 7843 + }, + { + "epoch": 0.26169901621909064, + "grad_norm": 1.4069257974624634, + "learning_rate": 4.4314569145759353e-05, + "loss": 1.5221, + "step": 7874 + }, + { + "epoch": 0.2627293273065674, + "grad_norm": 1.4424949884414673, + "learning_rate": 4.42611386459262e-05, + "loss": 1.5419, + "step": 7905 + }, + { + "epoch": 0.26375963839404415, + "grad_norm": 1.4579105377197266, + "learning_rate": 4.420749078676133e-05, + "loss": 1.5116, + "step": 7936 + }, + { + "epoch": 0.26478994948152085, + "grad_norm": 1.4563167095184326, + "learning_rate": 4.4153626173673516e-05, + "loss": 1.5296, + "step": 7967 + }, + { + "epoch": 0.2658202605689976, + "grad_norm": 1.4440968036651611, + "learning_rate": 4.409954541451762e-05, + "loss": 1.5548, + "step": 7998 + }, + { + "epoch": 0.26685057165647436, + "grad_norm": 1.5711034536361694, + "learning_rate": 4.404524911958764e-05, + "loss": 1.535, + "step": 8029 + }, + { + "epoch": 0.26788088274395105, + "grad_norm": 1.5221564769744873, + "learning_rate": 4.399073790160989e-05, + "loss": 1.5495, + "step": 8060 + }, + { + "epoch": 0.2689111938314278, + "grad_norm": 1.392699956893921, + "learning_rate": 4.393601237573607e-05, + "loss": 1.546, + "step": 8091 + }, + { + "epoch": 0.26994150491890456, + "grad_norm": 1.5343137979507446, + "learning_rate": 4.388107315953628e-05, + "loss": 1.549, + "step": 8122 + }, + { + "epoch": 0.27097181600638126, + "grad_norm": 1.4483468532562256, + "learning_rate": 4.382592087299212e-05, + "loss": 1.5424, + "step": 8153 + }, + { + "epoch": 0.272002127093858, + "grad_norm": 1.4963489770889282, + "learning_rate": 4.377055613848964e-05, + "loss": 1.508, + "step": 8184 + }, + { + "epoch": 0.27303243818133477, + "grad_norm": 1.4839162826538086, + "learning_rate": 4.3714979580812355e-05, + "loss": 1.5203, + "step": 8215 + }, + { + "epoch": 0.27406274926881147, + "grad_norm": 1.4272018671035767, + "learning_rate": 4.365919182713416e-05, + "loss": 1.5264, + "step": 8246 + }, + { + "epoch": 0.2750930603562882, + "grad_norm": 1.3808270692825317, + "learning_rate": 4.360319350701226e-05, + "loss": 1.5255, + "step": 8277 + }, + { + "epoch": 0.276123371443765, + "grad_norm": 1.4179162979125977, + "learning_rate": 4.3546985252380115e-05, + "loss": 1.535, + "step": 8308 + }, + { + "epoch": 0.2771536825312417, + "grad_norm": 1.3617374897003174, + "learning_rate": 4.349056769754021e-05, + "loss": 1.5295, + "step": 8339 + }, + { + "epoch": 0.27818399361871843, + "grad_norm": 1.4745615720748901, + "learning_rate": 4.3433941479156994e-05, + "loss": 1.5438, + "step": 8370 + }, + { + "epoch": 0.2792143047061952, + "grad_norm": 1.3661375045776367, + "learning_rate": 4.3377107236249647e-05, + "loss": 1.5134, + "step": 8401 + }, + { + "epoch": 0.2802446157936719, + "grad_norm": 1.3907949924468994, + "learning_rate": 4.332006561018488e-05, + "loss": 1.5237, + "step": 8432 + }, + { + "epoch": 0.28127492688114863, + "grad_norm": 1.3575704097747803, + "learning_rate": 4.3262817244669683e-05, + "loss": 1.5226, + "step": 8463 + }, + { + "epoch": 0.2823052379686254, + "grad_norm": 1.3836462497711182, + "learning_rate": 4.3205362785744083e-05, + "loss": 1.5433, + "step": 8494 + }, + { + "epoch": 0.2833355490561021, + "grad_norm": 1.6108276844024658, + "learning_rate": 4.314770288177384e-05, + "loss": 1.5324, + "step": 8525 + }, + { + "epoch": 0.28436586014357884, + "grad_norm": 1.4650689363479614, + "learning_rate": 4.308983818344313e-05, + "loss": 1.535, + "step": 8556 + }, + { + "epoch": 0.2853961712310556, + "grad_norm": 1.5836583375930786, + "learning_rate": 4.3031769343747206e-05, + "loss": 1.5313, + "step": 8587 + }, + { + "epoch": 0.2864264823185323, + "grad_norm": 1.5348492860794067, + "learning_rate": 4.297349701798505e-05, + "loss": 1.5106, + "step": 8618 + }, + { + "epoch": 0.28745679340600905, + "grad_norm": 1.4060319662094116, + "learning_rate": 4.2915021863751916e-05, + "loss": 1.5283, + "step": 8649 + }, + { + "epoch": 0.2884871044934858, + "grad_norm": 1.531657099723816, + "learning_rate": 4.285634454093198e-05, + "loss": 1.5087, + "step": 8680 + }, + { + "epoch": 0.2895174155809625, + "grad_norm": 1.4756299257278442, + "learning_rate": 4.279746571169086e-05, + "loss": 1.5042, + "step": 8711 + }, + { + "epoch": 0.29054772666843925, + "grad_norm": 1.3221153020858765, + "learning_rate": 4.2738386040468136e-05, + "loss": 1.5244, + "step": 8742 + }, + { + "epoch": 0.29157803775591595, + "grad_norm": 1.4067268371582031, + "learning_rate": 4.2679106193969866e-05, + "loss": 1.5012, + "step": 8773 + }, + { + "epoch": 0.2926083488433927, + "grad_norm": 1.5192064046859741, + "learning_rate": 4.261962684116106e-05, + "loss": 1.521, + "step": 8804 + }, + { + "epoch": 0.29363865993086946, + "grad_norm": 1.3847788572311401, + "learning_rate": 4.2559948653258145e-05, + "loss": 1.5128, + "step": 8835 + }, + { + "epoch": 0.29466897101834616, + "grad_norm": 1.4612780809402466, + "learning_rate": 4.250007230372134e-05, + "loss": 1.5371, + "step": 8866 + }, + { + "epoch": 0.2956992821058229, + "grad_norm": 1.468971610069275, + "learning_rate": 4.2439998468247126e-05, + "loss": 1.5199, + "step": 8897 + }, + { + "epoch": 0.29672959319329967, + "grad_norm": 1.386236310005188, + "learning_rate": 4.2379727824760566e-05, + "loss": 1.5273, + "step": 8928 + }, + { + "epoch": 0.29775990428077637, + "grad_norm": 1.3843929767608643, + "learning_rate": 4.231926105340768e-05, + "loss": 1.5011, + "step": 8959 + }, + { + "epoch": 0.2987902153682531, + "grad_norm": 1.4554557800292969, + "learning_rate": 4.225859883654776e-05, + "loss": 1.5311, + "step": 8990 + }, + { + "epoch": 0.2998205264557299, + "grad_norm": 1.3674421310424805, + "learning_rate": 4.219774185874569e-05, + "loss": 1.5302, + "step": 9021 + }, + { + "epoch": 0.3008508375432066, + "grad_norm": 1.3804330825805664, + "learning_rate": 4.213669080676418e-05, + "loss": 1.538, + "step": 9052 + }, + { + "epoch": 0.3018811486306833, + "grad_norm": 1.4643255472183228, + "learning_rate": 4.2075446369556056e-05, + "loss": 1.5172, + "step": 9083 + }, + { + "epoch": 0.3029114597181601, + "grad_norm": 1.3375928401947021, + "learning_rate": 4.201400923825648e-05, + "loss": 1.5123, + "step": 9114 + }, + { + "epoch": 0.3039417708056368, + "grad_norm": 1.4321980476379395, + "learning_rate": 4.195238010617511e-05, + "loss": 1.5196, + "step": 9145 + }, + { + "epoch": 0.30497208189311353, + "grad_norm": 1.4312376976013184, + "learning_rate": 4.1890559668788344e-05, + "loss": 1.5138, + "step": 9176 + }, + { + "epoch": 0.3060023929805903, + "grad_norm": 1.3089646100997925, + "learning_rate": 4.1828548623731405e-05, + "loss": 1.5027, + "step": 9207 + }, + { + "epoch": 0.307032704068067, + "grad_norm": 1.4863250255584717, + "learning_rate": 4.1766347670790506e-05, + "loss": 1.5091, + "step": 9238 + }, + { + "epoch": 0.30806301515554374, + "grad_norm": 1.373666763305664, + "learning_rate": 4.170395751189495e-05, + "loss": 1.5256, + "step": 9269 + }, + { + "epoch": 0.3090933262430205, + "grad_norm": 1.4160584211349487, + "learning_rate": 4.164137885110921e-05, + "loss": 1.4938, + "step": 9300 + }, + { + "epoch": 0.3101236373304972, + "grad_norm": 2.112110137939453, + "learning_rate": 4.157861239462495e-05, + "loss": 1.5106, + "step": 9331 + }, + { + "epoch": 0.31115394841797395, + "grad_norm": 1.337058663368225, + "learning_rate": 4.1515658850753114e-05, + "loss": 1.4999, + "step": 9362 + }, + { + "epoch": 0.3121842595054507, + "grad_norm": 1.3625296354293823, + "learning_rate": 4.145251892991588e-05, + "loss": 1.5136, + "step": 9393 + }, + { + "epoch": 0.3132145705929274, + "grad_norm": 1.399491548538208, + "learning_rate": 4.138919334463868e-05, + "loss": 1.499, + "step": 9424 + }, + { + "epoch": 0.31424488168040415, + "grad_norm": 1.4202344417572021, + "learning_rate": 4.1325682809542124e-05, + "loss": 1.5049, + "step": 9455 + }, + { + "epoch": 0.3152751927678809, + "grad_norm": 1.392248272895813, + "learning_rate": 4.126198804133398e-05, + "loss": 1.5287, + "step": 9486 + }, + { + "epoch": 0.3163055038553576, + "grad_norm": 1.3807618618011475, + "learning_rate": 4.1198109758801055e-05, + "loss": 1.5309, + "step": 9517 + }, + { + "epoch": 0.31733581494283436, + "grad_norm": 1.3117905855178833, + "learning_rate": 4.113404868280107e-05, + "loss": 1.4933, + "step": 9548 + }, + { + "epoch": 0.3183661260303111, + "grad_norm": 1.452086091041565, + "learning_rate": 4.106980553625457e-05, + "loss": 1.5221, + "step": 9579 + }, + { + "epoch": 0.3193964371177878, + "grad_norm": 1.477364182472229, + "learning_rate": 4.100538104413674e-05, + "loss": 1.4904, + "step": 9610 + }, + { + "epoch": 0.32042674820526457, + "grad_norm": 1.3584345579147339, + "learning_rate": 4.09407759334692e-05, + "loss": 1.4953, + "step": 9641 + }, + { + "epoch": 0.32145705929274127, + "grad_norm": 1.3619811534881592, + "learning_rate": 4.087599093331186e-05, + "loss": 1.4956, + "step": 9672 + }, + { + "epoch": 0.322487370380218, + "grad_norm": 1.4507052898406982, + "learning_rate": 4.081102677475462e-05, + "loss": 1.5197, + "step": 9703 + }, + { + "epoch": 0.3235176814676948, + "grad_norm": 1.4229698181152344, + "learning_rate": 4.0745884190909194e-05, + "loss": 1.498, + "step": 9734 + }, + { + "epoch": 0.32454799255517147, + "grad_norm": 1.3074679374694824, + "learning_rate": 4.0680563916900796e-05, + "loss": 1.5146, + "step": 9765 + }, + { + "epoch": 0.3255783036426482, + "grad_norm": 1.397815465927124, + "learning_rate": 4.0615066689859815e-05, + "loss": 1.5291, + "step": 9796 + }, + { + "epoch": 0.326608614730125, + "grad_norm": 1.3196336030960083, + "learning_rate": 4.0549393248913584e-05, + "loss": 1.5077, + "step": 9827 + }, + { + "epoch": 0.3276389258176017, + "grad_norm": 1.3129957914352417, + "learning_rate": 4.048354433517794e-05, + "loss": 1.4965, + "step": 9858 + }, + { + "epoch": 0.32866923690507843, + "grad_norm": 1.4380089044570923, + "learning_rate": 4.0417520691748916e-05, + "loss": 1.5115, + "step": 9889 + }, + { + "epoch": 0.3296995479925552, + "grad_norm": 1.3162370920181274, + "learning_rate": 4.035132306369438e-05, + "loss": 1.5029, + "step": 9920 + }, + { + "epoch": 0.3307298590800319, + "grad_norm": 1.3739668130874634, + "learning_rate": 4.028495219804555e-05, + "loss": 1.5083, + "step": 9951 + }, + { + "epoch": 0.33176017016750864, + "grad_norm": 1.3673723936080933, + "learning_rate": 4.021840884378864e-05, + "loss": 1.5223, + "step": 9982 + }, + { + "epoch": 0.3327904812549854, + "grad_norm": 1.3970317840576172, + "learning_rate": 4.015169375185633e-05, + "loss": 1.5003, + "step": 10013 + }, + { + "epoch": 0.3338207923424621, + "grad_norm": 1.2982394695281982, + "learning_rate": 4.0084807675119396e-05, + "loss": 1.5066, + "step": 10044 + }, + { + "epoch": 0.33485110342993885, + "grad_norm": 1.4548689126968384, + "learning_rate": 4.0017751368378106e-05, + "loss": 1.4993, + "step": 10075 + }, + { + "epoch": 0.3358814145174156, + "grad_norm": 1.3693586587905884, + "learning_rate": 3.995052558835377e-05, + "loss": 1.4987, + "step": 10106 + }, + { + "epoch": 0.3369117256048923, + "grad_norm": 1.4046767950057983, + "learning_rate": 3.988313109368017e-05, + "loss": 1.5098, + "step": 10137 + }, + { + "epoch": 0.33794203669236905, + "grad_norm": 1.3772069215774536, + "learning_rate": 3.981556864489504e-05, + "loss": 1.5165, + "step": 10168 + }, + { + "epoch": 0.3389723477798458, + "grad_norm": 1.471211314201355, + "learning_rate": 3.974783900443142e-05, + "loss": 1.5037, + "step": 10199 + }, + { + "epoch": 0.3400026588673225, + "grad_norm": 1.3990979194641113, + "learning_rate": 3.9679942936609095e-05, + "loss": 1.5096, + "step": 10230 + }, + { + "epoch": 0.34103296995479926, + "grad_norm": 1.3779234886169434, + "learning_rate": 3.961188120762596e-05, + "loss": 1.4914, + "step": 10261 + }, + { + "epoch": 0.342063281042276, + "grad_norm": 1.2866768836975098, + "learning_rate": 3.954365458554938e-05, + "loss": 1.5026, + "step": 10292 + }, + { + "epoch": 0.3430935921297527, + "grad_norm": 1.353468894958496, + "learning_rate": 3.947526384030751e-05, + "loss": 1.5063, + "step": 10323 + }, + { + "epoch": 0.34412390321722947, + "grad_norm": 1.3264256715774536, + "learning_rate": 3.9406709743680624e-05, + "loss": 1.4911, + "step": 10354 + }, + { + "epoch": 0.3451542143047062, + "grad_norm": 1.3496876955032349, + "learning_rate": 3.9337993069292366e-05, + "loss": 1.4921, + "step": 10385 + }, + { + "epoch": 0.3461845253921829, + "grad_norm": 1.3812434673309326, + "learning_rate": 3.926911459260109e-05, + "loss": 1.4826, + "step": 10416 + }, + { + "epoch": 0.34721483647965967, + "grad_norm": 1.4926965236663818, + "learning_rate": 3.920007509089102e-05, + "loss": 1.4994, + "step": 10447 + }, + { + "epoch": 0.3482451475671364, + "grad_norm": 1.3446170091629028, + "learning_rate": 3.913087534326357e-05, + "loss": 1.5114, + "step": 10478 + }, + { + "epoch": 0.3492754586546131, + "grad_norm": 1.3100495338439941, + "learning_rate": 3.9061516130628475e-05, + "loss": 1.5066, + "step": 10509 + }, + { + "epoch": 0.3503057697420899, + "grad_norm": 1.395874261856079, + "learning_rate": 3.8991998235695025e-05, + "loss": 1.4999, + "step": 10540 + }, + { + "epoch": 0.3513360808295666, + "grad_norm": 1.3682137727737427, + "learning_rate": 3.8922322442963224e-05, + "loss": 1.4778, + "step": 10571 + }, + { + "epoch": 0.35236639191704333, + "grad_norm": 1.4196573495864868, + "learning_rate": 3.885248953871491e-05, + "loss": 1.4909, + "step": 10602 + }, + { + "epoch": 0.3533967030045201, + "grad_norm": 1.4299864768981934, + "learning_rate": 3.8782500311004915e-05, + "loss": 1.5025, + "step": 10633 + }, + { + "epoch": 0.3544270140919968, + "grad_norm": 1.39677095413208, + "learning_rate": 3.871235554965218e-05, + "loss": 1.4932, + "step": 10664 + }, + { + "epoch": 0.35545732517947354, + "grad_norm": 1.3219736814498901, + "learning_rate": 3.864205604623078e-05, + "loss": 1.4795, + "step": 10695 + }, + { + "epoch": 0.3564876362669503, + "grad_norm": 1.3649324178695679, + "learning_rate": 3.857160259406107e-05, + "loss": 1.4838, + "step": 10726 + }, + { + "epoch": 0.357517947354427, + "grad_norm": 1.4109989404678345, + "learning_rate": 3.8500995988200674e-05, + "loss": 1.5058, + "step": 10757 + }, + { + "epoch": 0.35854825844190374, + "grad_norm": 1.3625038862228394, + "learning_rate": 3.843023702543556e-05, + "loss": 1.4912, + "step": 10788 + }, + { + "epoch": 0.3595785695293805, + "grad_norm": 1.4725775718688965, + "learning_rate": 3.8359326504270984e-05, + "loss": 1.5012, + "step": 10819 + }, + { + "epoch": 0.3606088806168572, + "grad_norm": 1.4126085042953491, + "learning_rate": 3.828826522492255e-05, + "loss": 1.4977, + "step": 10850 + }, + { + "epoch": 0.36163919170433395, + "grad_norm": 1.3949086666107178, + "learning_rate": 3.821705398930713e-05, + "loss": 1.4903, + "step": 10881 + }, + { + "epoch": 0.3626695027918107, + "grad_norm": 1.286792516708374, + "learning_rate": 3.814569360103385e-05, + "loss": 1.5067, + "step": 10912 + }, + { + "epoch": 0.3636998138792874, + "grad_norm": 1.274703025817871, + "learning_rate": 3.807418486539499e-05, + "loss": 1.4583, + "step": 10943 + }, + { + "epoch": 0.36473012496676416, + "grad_norm": 1.401455283164978, + "learning_rate": 3.80025285893569e-05, + "loss": 1.4834, + "step": 10974 + }, + { + "epoch": 0.3657604360542409, + "grad_norm": 1.308361530303955, + "learning_rate": 3.793072558155093e-05, + "loss": 1.4832, + "step": 11005 + }, + { + "epoch": 0.3667907471417176, + "grad_norm": 1.654733419418335, + "learning_rate": 3.785877665226426e-05, + "loss": 1.4867, + "step": 11036 + }, + { + "epoch": 0.36782105822919436, + "grad_norm": 1.3530856370925903, + "learning_rate": 3.778668261343079e-05, + "loss": 1.4873, + "step": 11067 + }, + { + "epoch": 0.3688513693166711, + "grad_norm": 1.3567407131195068, + "learning_rate": 3.771444427862192e-05, + "loss": 1.4935, + "step": 11098 + }, + { + "epoch": 0.3698816804041478, + "grad_norm": 1.3184572458267212, + "learning_rate": 3.7642062463037465e-05, + "loss": 1.4891, + "step": 11129 + }, + { + "epoch": 0.37091199149162457, + "grad_norm": 1.366489291191101, + "learning_rate": 3.7569537983496373e-05, + "loss": 1.5159, + "step": 11160 + }, + { + "epoch": 0.3719423025791013, + "grad_norm": 1.423258662223816, + "learning_rate": 3.749687165842753e-05, + "loss": 1.4938, + "step": 11191 + }, + { + "epoch": 0.372972613666578, + "grad_norm": 1.3226194381713867, + "learning_rate": 3.7424064307860536e-05, + "loss": 1.499, + "step": 11222 + }, + { + "epoch": 0.3740029247540548, + "grad_norm": 1.350500464439392, + "learning_rate": 3.735111675341645e-05, + "loss": 1.4952, + "step": 11253 + }, + { + "epoch": 0.37503323584153153, + "grad_norm": 1.3667839765548706, + "learning_rate": 3.7278029818298524e-05, + "loss": 1.4763, + "step": 11284 + }, + { + "epoch": 0.37606354692900823, + "grad_norm": 1.4876132011413574, + "learning_rate": 3.720480432728287e-05, + "loss": 1.4913, + "step": 11315 + }, + { + "epoch": 0.377093858016485, + "grad_norm": 1.3927743434906006, + "learning_rate": 3.71314411067092e-05, + "loss": 1.4948, + "step": 11346 + }, + { + "epoch": 0.37812416910396174, + "grad_norm": 1.3752413988113403, + "learning_rate": 3.70579409844715e-05, + "loss": 1.4763, + "step": 11377 + }, + { + "epoch": 0.37915448019143844, + "grad_norm": 1.3530951738357544, + "learning_rate": 3.698430479000865e-05, + "loss": 1.5077, + "step": 11408 + }, + { + "epoch": 0.3801847912789152, + "grad_norm": 1.4309345483779907, + "learning_rate": 3.691053335429509e-05, + "loss": 1.4945, + "step": 11439 + }, + { + "epoch": 0.38121510236639194, + "grad_norm": 1.2874380350112915, + "learning_rate": 3.683662750983147e-05, + "loss": 1.4698, + "step": 11470 + }, + { + "epoch": 0.38224541345386864, + "grad_norm": 1.3356250524520874, + "learning_rate": 3.676258809063518e-05, + "loss": 1.4924, + "step": 11501 + }, + { + "epoch": 0.3832757245413454, + "grad_norm": 1.304559588432312, + "learning_rate": 3.6688415932231004e-05, + "loss": 1.4682, + "step": 11532 + }, + { + "epoch": 0.3843060356288221, + "grad_norm": 1.4153447151184082, + "learning_rate": 3.661411187164166e-05, + "loss": 1.4989, + "step": 11563 + }, + { + "epoch": 0.38533634671629885, + "grad_norm": 1.356992244720459, + "learning_rate": 3.65396767473784e-05, + "loss": 1.4854, + "step": 11594 + }, + { + "epoch": 0.3863666578037756, + "grad_norm": 1.322449803352356, + "learning_rate": 3.6465111399431465e-05, + "loss": 1.4877, + "step": 11625 + }, + { + "epoch": 0.3873969688912523, + "grad_norm": 1.3981350660324097, + "learning_rate": 3.6390416669260674e-05, + "loss": 1.499, + "step": 11656 + }, + { + "epoch": 0.38842727997872906, + "grad_norm": 1.324871301651001, + "learning_rate": 3.63155933997859e-05, + "loss": 1.4814, + "step": 11687 + }, + { + "epoch": 0.3894575910662058, + "grad_norm": 1.3940790891647339, + "learning_rate": 3.624064243537758e-05, + "loss": 1.4754, + "step": 11718 + }, + { + "epoch": 0.3904879021536825, + "grad_norm": 1.2880780696868896, + "learning_rate": 3.616556462184716e-05, + "loss": 1.4832, + "step": 11749 + }, + { + "epoch": 0.39151821324115926, + "grad_norm": 1.315329670906067, + "learning_rate": 3.609036080643755e-05, + "loss": 1.4853, + "step": 11780 + }, + { + "epoch": 0.392548524328636, + "grad_norm": 1.4093523025512695, + "learning_rate": 3.60150318378136e-05, + "loss": 1.4978, + "step": 11811 + }, + { + "epoch": 0.3935788354161127, + "grad_norm": 1.271151065826416, + "learning_rate": 3.5939578566052465e-05, + "loss": 1.4933, + "step": 11842 + }, + { + "epoch": 0.39460914650358947, + "grad_norm": 1.2910923957824707, + "learning_rate": 3.586400184263408e-05, + "loss": 1.4853, + "step": 11873 + }, + { + "epoch": 0.3956394575910662, + "grad_norm": 1.2480064630508423, + "learning_rate": 3.578830252043148e-05, + "loss": 1.4642, + "step": 11904 + }, + { + "epoch": 0.3966697686785429, + "grad_norm": 1.263197422027588, + "learning_rate": 3.571248145370125e-05, + "loss": 1.4812, + "step": 11935 + }, + { + "epoch": 0.3977000797660197, + "grad_norm": 1.3231288194656372, + "learning_rate": 3.5636539498073794e-05, + "loss": 1.4744, + "step": 11966 + }, + { + "epoch": 0.39873039085349643, + "grad_norm": 1.3933110237121582, + "learning_rate": 3.556047751054378e-05, + "loss": 1.4849, + "step": 11997 + }, + { + "epoch": 0.39976070194097313, + "grad_norm": 1.3615801334381104, + "learning_rate": 3.548429634946039e-05, + "loss": 1.4866, + "step": 12028 + }, + { + "epoch": 0.4007910130284499, + "grad_norm": 1.298638939857483, + "learning_rate": 3.540799687451768e-05, + "loss": 1.4664, + "step": 12059 + }, + { + "epoch": 0.40182132411592664, + "grad_norm": 1.29216468334198, + "learning_rate": 3.533157994674485e-05, + "loss": 1.4697, + "step": 12090 + }, + { + "epoch": 0.40285163520340334, + "grad_norm": 1.3759845495224, + "learning_rate": 3.5255046428496546e-05, + "loss": 1.4854, + "step": 12121 + }, + { + "epoch": 0.4038819462908801, + "grad_norm": 1.4045615196228027, + "learning_rate": 3.517839718344311e-05, + "loss": 1.4622, + "step": 12152 + }, + { + "epoch": 0.40491225737835684, + "grad_norm": 1.2979034185409546, + "learning_rate": 3.510163307656086e-05, + "loss": 1.4797, + "step": 12183 + }, + { + "epoch": 0.40594256846583354, + "grad_norm": 1.303139567375183, + "learning_rate": 3.5024754974122324e-05, + "loss": 1.4588, + "step": 12214 + }, + { + "epoch": 0.4069728795533103, + "grad_norm": 1.287781834602356, + "learning_rate": 3.494776374368643e-05, + "loss": 1.4834, + "step": 12245 + }, + { + "epoch": 0.40800319064078705, + "grad_norm": 1.3806688785552979, + "learning_rate": 3.4870660254088724e-05, + "loss": 1.4807, + "step": 12276 + }, + { + "epoch": 0.40903350172826375, + "grad_norm": 1.4059745073318481, + "learning_rate": 3.479344537543164e-05, + "loss": 1.4906, + "step": 12307 + }, + { + "epoch": 0.4100638128157405, + "grad_norm": 1.3052942752838135, + "learning_rate": 3.4716119979074565e-05, + "loss": 1.4801, + "step": 12338 + }, + { + "epoch": 0.41109412390321726, + "grad_norm": 1.3306844234466553, + "learning_rate": 3.463868493762412e-05, + "loss": 1.4911, + "step": 12369 + }, + { + "epoch": 0.41212443499069396, + "grad_norm": 1.3276656866073608, + "learning_rate": 3.456114112492418e-05, + "loss": 1.4678, + "step": 12400 + }, + { + "epoch": 0.4131547460781707, + "grad_norm": 1.3164253234863281, + "learning_rate": 3.4483489416046164e-05, + "loss": 1.4816, + "step": 12431 + }, + { + "epoch": 0.4141850571656474, + "grad_norm": 1.3827886581420898, + "learning_rate": 3.440573068727905e-05, + "loss": 1.481, + "step": 12462 + }, + { + "epoch": 0.41521536825312416, + "grad_norm": 1.2899463176727295, + "learning_rate": 3.4327865816119495e-05, + "loss": 1.4575, + "step": 12493 + }, + { + "epoch": 0.4162456793406009, + "grad_norm": 1.3136677742004395, + "learning_rate": 3.4249895681262025e-05, + "loss": 1.4695, + "step": 12524 + }, + { + "epoch": 0.4172759904280776, + "grad_norm": 1.2920372486114502, + "learning_rate": 3.417182116258899e-05, + "loss": 1.4765, + "step": 12555 + }, + { + "epoch": 0.41830630151555437, + "grad_norm": 1.3285510540008545, + "learning_rate": 3.409364314116074e-05, + "loss": 1.4559, + "step": 12586 + }, + { + "epoch": 0.4193366126030311, + "grad_norm": 1.2834984064102173, + "learning_rate": 3.401536249920559e-05, + "loss": 1.4706, + "step": 12617 + }, + { + "epoch": 0.4203669236905078, + "grad_norm": 1.315942645072937, + "learning_rate": 3.393698012010998e-05, + "loss": 1.4692, + "step": 12648 + }, + { + "epoch": 0.4213972347779846, + "grad_norm": 1.3668091297149658, + "learning_rate": 3.385849688840839e-05, + "loss": 1.4801, + "step": 12679 + }, + { + "epoch": 0.42242754586546133, + "grad_norm": 1.312280297279358, + "learning_rate": 3.3779913689773414e-05, + "loss": 1.4673, + "step": 12710 + }, + { + "epoch": 0.423457856952938, + "grad_norm": 1.3579858541488647, + "learning_rate": 3.370123141100578e-05, + "loss": 1.4578, + "step": 12741 + }, + { + "epoch": 0.4244881680404148, + "grad_norm": 1.4001456499099731, + "learning_rate": 3.3622450940024305e-05, + "loss": 1.4787, + "step": 12772 + }, + { + "epoch": 0.42551847912789154, + "grad_norm": 1.352629542350769, + "learning_rate": 3.35435731658559e-05, + "loss": 1.457, + "step": 12803 + }, + { + "epoch": 0.42654879021536823, + "grad_norm": 1.4044222831726074, + "learning_rate": 3.346459897862552e-05, + "loss": 1.4979, + "step": 12834 + }, + { + "epoch": 0.427579101302845, + "grad_norm": 1.2666436433792114, + "learning_rate": 3.338552926954613e-05, + "loss": 1.4712, + "step": 12865 + }, + { + "epoch": 0.42860941239032174, + "grad_norm": 1.2487694025039673, + "learning_rate": 3.330636493090868e-05, + "loss": 1.4784, + "step": 12896 + }, + { + "epoch": 0.42963972347779844, + "grad_norm": 1.2346290349960327, + "learning_rate": 3.322710685607193e-05, + "loss": 1.4754, + "step": 12927 + }, + { + "epoch": 0.4306700345652752, + "grad_norm": 1.2908893823623657, + "learning_rate": 3.314775593945251e-05, + "loss": 1.4677, + "step": 12958 + }, + { + "epoch": 0.43170034565275195, + "grad_norm": 1.3283506631851196, + "learning_rate": 3.3068313076514714e-05, + "loss": 1.4661, + "step": 12989 + }, + { + "epoch": 0.43273065674022865, + "grad_norm": 1.2982537746429443, + "learning_rate": 3.298877916376047e-05, + "loss": 1.4838, + "step": 13020 + }, + { + "epoch": 0.4337609678277054, + "grad_norm": 1.3566454648971558, + "learning_rate": 3.290915509871915e-05, + "loss": 1.4683, + "step": 13051 + }, + { + "epoch": 0.43479127891518216, + "grad_norm": 1.3470877408981323, + "learning_rate": 3.282944177993753e-05, + "loss": 1.4724, + "step": 13082 + }, + { + "epoch": 0.43582159000265885, + "grad_norm": 1.451150894165039, + "learning_rate": 3.274964010696957e-05, + "loss": 1.4731, + "step": 13113 + }, + { + "epoch": 0.4368519010901356, + "grad_norm": 1.3415958881378174, + "learning_rate": 3.266975098036629e-05, + "loss": 1.4809, + "step": 13144 + }, + { + "epoch": 0.43788221217761236, + "grad_norm": 1.2775352001190186, + "learning_rate": 3.258977530166562e-05, + "loss": 1.4523, + "step": 13175 + }, + { + "epoch": 0.43891252326508906, + "grad_norm": 1.365050196647644, + "learning_rate": 3.250971397338227e-05, + "loss": 1.4611, + "step": 13206 + }, + { + "epoch": 0.4399428343525658, + "grad_norm": 1.3481686115264893, + "learning_rate": 3.2429567898997404e-05, + "loss": 1.4708, + "step": 13237 + }, + { + "epoch": 0.44097314544004257, + "grad_norm": 1.3418121337890625, + "learning_rate": 3.234933798294859e-05, + "loss": 1.485, + "step": 13268 + }, + { + "epoch": 0.44200345652751927, + "grad_norm": 1.3098441362380981, + "learning_rate": 3.2269025130619535e-05, + "loss": 1.472, + "step": 13299 + }, + { + "epoch": 0.443033767614996, + "grad_norm": 1.2792437076568604, + "learning_rate": 3.218863024832985e-05, + "loss": 1.4592, + "step": 13330 + }, + { + "epoch": 0.4440640787024727, + "grad_norm": 1.3804035186767578, + "learning_rate": 3.2108154243324864e-05, + "loss": 1.4546, + "step": 13361 + }, + { + "epoch": 0.4450943897899495, + "grad_norm": 1.287787675857544, + "learning_rate": 3.2027598023765345e-05, + "loss": 1.4477, + "step": 13392 + }, + { + "epoch": 0.44612470087742623, + "grad_norm": 1.5964646339416504, + "learning_rate": 3.194696249871729e-05, + "loss": 1.4468, + "step": 13423 + }, + { + "epoch": 0.4471550119649029, + "grad_norm": 1.3253474235534668, + "learning_rate": 3.186624857814164e-05, + "loss": 1.4588, + "step": 13454 + }, + { + "epoch": 0.4481853230523797, + "grad_norm": 1.288176417350769, + "learning_rate": 3.178545717288401e-05, + "loss": 1.4644, + "step": 13485 + }, + { + "epoch": 0.44921563413985643, + "grad_norm": 1.3357142210006714, + "learning_rate": 3.170458919466444e-05, + "loss": 1.4871, + "step": 13516 + }, + { + "epoch": 0.45024594522733313, + "grad_norm": 1.2954436540603638, + "learning_rate": 3.1623645556067063e-05, + "loss": 1.4571, + "step": 13547 + }, + { + "epoch": 0.4512762563148099, + "grad_norm": 1.344789981842041, + "learning_rate": 3.154262717052985e-05, + "loss": 1.459, + "step": 13578 + }, + { + "epoch": 0.45230656740228664, + "grad_norm": 1.2648475170135498, + "learning_rate": 3.146153495233426e-05, + "loss": 1.4496, + "step": 13609 + }, + { + "epoch": 0.45333687848976334, + "grad_norm": 1.312733769416809, + "learning_rate": 3.1380369816594944e-05, + "loss": 1.4309, + "step": 13640 + }, + { + "epoch": 0.4543671895772401, + "grad_norm": 1.3719325065612793, + "learning_rate": 3.129913267924946e-05, + "loss": 1.4723, + "step": 13671 + }, + { + "epoch": 0.45539750066471685, + "grad_norm": 1.2850617170333862, + "learning_rate": 3.121782445704782e-05, + "loss": 1.4599, + "step": 13702 + }, + { + "epoch": 0.45642781175219355, + "grad_norm": 1.3335177898406982, + "learning_rate": 3.11364460675423e-05, + "loss": 1.4821, + "step": 13733 + }, + { + "epoch": 0.4574581228396703, + "grad_norm": 1.1675069332122803, + "learning_rate": 3.1054998429076934e-05, + "loss": 1.453, + "step": 13764 + }, + { + "epoch": 0.45848843392714705, + "grad_norm": 1.283544898033142, + "learning_rate": 3.097348246077728e-05, + "loss": 1.4545, + "step": 13795 + }, + { + "epoch": 0.45951874501462375, + "grad_norm": 1.4358693361282349, + "learning_rate": 3.0891899082539924e-05, + "loss": 1.4673, + "step": 13826 + }, + { + "epoch": 0.4605490561021005, + "grad_norm": 1.2551497220993042, + "learning_rate": 3.0810249215022233e-05, + "loss": 1.4532, + "step": 13857 + }, + { + "epoch": 0.46157936718957726, + "grad_norm": 1.2574602365493774, + "learning_rate": 3.0728533779631865e-05, + "loss": 1.4762, + "step": 13888 + }, + { + "epoch": 0.46260967827705396, + "grad_norm": 1.2202764749526978, + "learning_rate": 3.064675369851637e-05, + "loss": 1.4461, + "step": 13919 + }, + { + "epoch": 0.4636399893645307, + "grad_norm": 1.2787501811981201, + "learning_rate": 3.056490989455289e-05, + "loss": 1.4607, + "step": 13950 + }, + { + "epoch": 0.46467030045200747, + "grad_norm": 1.2511006593704224, + "learning_rate": 3.0483003291337596e-05, + "loss": 1.4548, + "step": 13981 + }, + { + "epoch": 0.46570061153948417, + "grad_norm": 1.2749834060668945, + "learning_rate": 3.040103481317539e-05, + "loss": 1.4394, + "step": 14012 + }, + { + "epoch": 0.4667309226269609, + "grad_norm": 1.223057746887207, + "learning_rate": 3.03190053850694e-05, + "loss": 1.4684, + "step": 14043 + }, + { + "epoch": 0.4677612337144377, + "grad_norm": 1.39846932888031, + "learning_rate": 3.0236915932710573e-05, + "loss": 1.4657, + "step": 14074 + }, + { + "epoch": 0.4687915448019144, + "grad_norm": 1.5305665731430054, + "learning_rate": 3.0154767382467232e-05, + "loss": 1.4795, + "step": 14105 + }, + { + "epoch": 0.4698218558893911, + "grad_norm": 1.2569035291671753, + "learning_rate": 3.0072560661374582e-05, + "loss": 1.4756, + "step": 14136 + }, + { + "epoch": 0.4708521669768679, + "grad_norm": 1.3472824096679688, + "learning_rate": 2.999029669712431e-05, + "loss": 1.4682, + "step": 14167 + }, + { + "epoch": 0.4718824780643446, + "grad_norm": 1.271714210510254, + "learning_rate": 2.990797641805408e-05, + "loss": 1.4509, + "step": 14198 + }, + { + "epoch": 0.47291278915182133, + "grad_norm": 1.3342047929763794, + "learning_rate": 2.982560075313704e-05, + "loss": 1.4528, + "step": 14229 + }, + { + "epoch": 0.47394310023929803, + "grad_norm": 1.5821506977081299, + "learning_rate": 2.9743170631971368e-05, + "loss": 1.4609, + "step": 14260 + }, + { + "epoch": 0.4749734113267748, + "grad_norm": 1.2598062753677368, + "learning_rate": 2.9660686984769792e-05, + "loss": 1.471, + "step": 14291 + }, + { + "epoch": 0.47600372241425154, + "grad_norm": 1.2648885250091553, + "learning_rate": 2.9578150742349047e-05, + "loss": 1.4708, + "step": 14322 + }, + { + "epoch": 0.47703403350172824, + "grad_norm": 1.559665560722351, + "learning_rate": 2.949556283611942e-05, + "loss": 1.4516, + "step": 14353 + }, + { + "epoch": 0.478064344589205, + "grad_norm": 1.2621581554412842, + "learning_rate": 2.9412924198074206e-05, + "loss": 1.446, + "step": 14384 + }, + { + "epoch": 0.47909465567668175, + "grad_norm": 1.2775017023086548, + "learning_rate": 2.9330235760779208e-05, + "loss": 1.4496, + "step": 14415 + }, + { + "epoch": 0.48012496676415845, + "grad_norm": 1.2010388374328613, + "learning_rate": 2.9247498457362188e-05, + "loss": 1.4606, + "step": 14446 + }, + { + "epoch": 0.4811552778516352, + "grad_norm": 1.3053895235061646, + "learning_rate": 2.9164713221502373e-05, + "loss": 1.4536, + "step": 14477 + }, + { + "epoch": 0.48218558893911195, + "grad_norm": 1.311596155166626, + "learning_rate": 2.9081880987419912e-05, + "loss": 1.4409, + "step": 14508 + }, + { + "epoch": 0.48321590002658865, + "grad_norm": 1.3888933658599854, + "learning_rate": 2.8999002689865296e-05, + "loss": 1.4314, + "step": 14539 + }, + { + "epoch": 0.4842462111140654, + "grad_norm": 1.288619875907898, + "learning_rate": 2.8916079264108852e-05, + "loss": 1.4539, + "step": 14570 + }, + { + "epoch": 0.48527652220154216, + "grad_norm": 1.2974294424057007, + "learning_rate": 2.883311164593017e-05, + "loss": 1.4627, + "step": 14601 + }, + { + "epoch": 0.48630683328901886, + "grad_norm": 1.2057379484176636, + "learning_rate": 2.875010077160754e-05, + "loss": 1.4578, + "step": 14632 + }, + { + "epoch": 0.4873371443764956, + "grad_norm": 1.363971471786499, + "learning_rate": 2.866704757790741e-05, + "loss": 1.4671, + "step": 14663 + }, + { + "epoch": 0.48836745546397237, + "grad_norm": 1.2696925401687622, + "learning_rate": 2.858395300207376e-05, + "loss": 1.4333, + "step": 14694 + }, + { + "epoch": 0.48939776655144906, + "grad_norm": 1.2653478384017944, + "learning_rate": 2.8500817981817607e-05, + "loss": 1.4662, + "step": 14725 + }, + { + "epoch": 0.4904280776389258, + "grad_norm": 1.3011239767074585, + "learning_rate": 2.8417643455306336e-05, + "loss": 1.4589, + "step": 14756 + }, + { + "epoch": 0.4914583887264026, + "grad_norm": 1.3312432765960693, + "learning_rate": 2.8334430361153185e-05, + "loss": 1.4368, + "step": 14787 + }, + { + "epoch": 0.49248869981387927, + "grad_norm": 1.3015661239624023, + "learning_rate": 2.8251179638406612e-05, + "loss": 1.466, + "step": 14818 + }, + { + "epoch": 0.493519010901356, + "grad_norm": 1.3215759992599487, + "learning_rate": 2.8167892226539704e-05, + "loss": 1.4486, + "step": 14849 + }, + { + "epoch": 0.4945493219888328, + "grad_norm": 1.2909883260726929, + "learning_rate": 2.8084569065439588e-05, + "loss": 1.4433, + "step": 14880 + }, + { + "epoch": 0.4955796330763095, + "grad_norm": 1.364015817642212, + "learning_rate": 2.8001211095396807e-05, + "loss": 1.4449, + "step": 14911 + }, + { + "epoch": 0.49660994416378623, + "grad_norm": 1.2468819618225098, + "learning_rate": 2.791781925709473e-05, + "loss": 1.4572, + "step": 14942 + }, + { + "epoch": 0.497640255251263, + "grad_norm": 1.2739325761795044, + "learning_rate": 2.7834394491598908e-05, + "loss": 1.4478, + "step": 14973 + }, + { + "epoch": 0.4986705663387397, + "grad_norm": 1.3384937047958374, + "learning_rate": 2.7750937740346485e-05, + "loss": 1.4429, + "step": 15004 + }, + { + "epoch": 0.49970087742621644, + "grad_norm": 1.231088399887085, + "learning_rate": 2.7667449945135564e-05, + "loss": 1.4631, + "step": 15035 + }, + { + "epoch": 0.5007311885136931, + "grad_norm": 1.2262307405471802, + "learning_rate": 2.7583932048114557e-05, + "loss": 1.4508, + "step": 15066 + }, + { + "epoch": 0.5017614996011699, + "grad_norm": 1.3427774906158447, + "learning_rate": 2.7500384991771587e-05, + "loss": 1.4441, + "step": 15097 + }, + { + "epoch": 0.5027918106886466, + "grad_norm": 1.2950241565704346, + "learning_rate": 2.7416809718923825e-05, + "loss": 1.4427, + "step": 15128 + }, + { + "epoch": 0.5038221217761234, + "grad_norm": 1.4129016399383545, + "learning_rate": 2.7333207172706864e-05, + "loss": 1.4562, + "step": 15159 + }, + { + "epoch": 0.5048524328636002, + "grad_norm": 1.2751520872116089, + "learning_rate": 2.7249578296564088e-05, + "loss": 1.4517, + "step": 15190 + }, + { + "epoch": 0.5058827439510768, + "grad_norm": 1.302485466003418, + "learning_rate": 2.7165924034235973e-05, + "loss": 1.4327, + "step": 15221 + }, + { + "epoch": 0.5069130550385536, + "grad_norm": 1.295390009880066, + "learning_rate": 2.708224532974953e-05, + "loss": 1.4455, + "step": 15252 + }, + { + "epoch": 0.5079433661260303, + "grad_norm": 1.3160103559494019, + "learning_rate": 2.6998543127407538e-05, + "loss": 1.4556, + "step": 15283 + }, + { + "epoch": 0.5089736772135071, + "grad_norm": 1.2997361421585083, + "learning_rate": 2.6914818371777988e-05, + "loss": 1.444, + "step": 15314 + }, + { + "epoch": 0.5100039883009838, + "grad_norm": 1.2427833080291748, + "learning_rate": 2.6831072007683373e-05, + "loss": 1.4501, + "step": 15345 + }, + { + "epoch": 0.5110342993884606, + "grad_norm": 1.2402199506759644, + "learning_rate": 2.6747304980190018e-05, + "loss": 1.4543, + "step": 15376 + }, + { + "epoch": 0.5120646104759372, + "grad_norm": 1.2938770055770874, + "learning_rate": 2.6663518234597453e-05, + "loss": 1.4394, + "step": 15407 + }, + { + "epoch": 0.513094921563414, + "grad_norm": 1.1747736930847168, + "learning_rate": 2.6579712716427696e-05, + "loss": 1.4389, + "step": 15438 + }, + { + "epoch": 0.5141252326508907, + "grad_norm": 1.326824426651001, + "learning_rate": 2.6495889371414652e-05, + "loss": 1.4365, + "step": 15469 + }, + { + "epoch": 0.5151555437383675, + "grad_norm": 1.245665431022644, + "learning_rate": 2.6412049145493367e-05, + "loss": 1.4525, + "step": 15500 + }, + { + "epoch": 0.5161858548258442, + "grad_norm": 1.1753687858581543, + "learning_rate": 2.632819298478939e-05, + "loss": 1.447, + "step": 15531 + }, + { + "epoch": 0.517216165913321, + "grad_norm": 1.3870874643325806, + "learning_rate": 2.6244321835608105e-05, + "loss": 1.4577, + "step": 15562 + }, + { + "epoch": 0.5182464770007976, + "grad_norm": 1.2849411964416504, + "learning_rate": 2.6160436644424024e-05, + "loss": 1.4371, + "step": 15593 + }, + { + "epoch": 0.5192767880882744, + "grad_norm": 1.292443037033081, + "learning_rate": 2.6076538357870133e-05, + "loss": 1.4558, + "step": 15624 + }, + { + "epoch": 0.5203070991757511, + "grad_norm": 1.279961347579956, + "learning_rate": 2.5992627922727196e-05, + "loss": 1.4384, + "step": 15655 + }, + { + "epoch": 0.5213374102632279, + "grad_norm": 1.3141279220581055, + "learning_rate": 2.5908706285913066e-05, + "loss": 1.45, + "step": 15686 + }, + { + "epoch": 0.5223677213507046, + "grad_norm": 1.3931515216827393, + "learning_rate": 2.5824774394472008e-05, + "loss": 1.4403, + "step": 15717 + }, + { + "epoch": 0.5233980324381813, + "grad_norm": 1.2564170360565186, + "learning_rate": 2.5740833195563996e-05, + "loss": 1.4482, + "step": 15748 + }, + { + "epoch": 0.524428343525658, + "grad_norm": 1.5450046062469482, + "learning_rate": 2.5656883636454067e-05, + "loss": 1.4443, + "step": 15779 + }, + { + "epoch": 0.5254586546131348, + "grad_norm": 1.2659518718719482, + "learning_rate": 2.557292666450159e-05, + "loss": 1.4653, + "step": 15810 + }, + { + "epoch": 0.5264889657006115, + "grad_norm": 1.2940540313720703, + "learning_rate": 2.5488963227149566e-05, + "loss": 1.4302, + "step": 15841 + }, + { + "epoch": 0.5275192767880883, + "grad_norm": 1.2514533996582031, + "learning_rate": 2.5404994271913983e-05, + "loss": 1.4412, + "step": 15872 + }, + { + "epoch": 0.528549587875565, + "grad_norm": 1.2681846618652344, + "learning_rate": 2.5321020746373085e-05, + "loss": 1.4411, + "step": 15903 + }, + { + "epoch": 0.5295798989630417, + "grad_norm": 1.2581806182861328, + "learning_rate": 2.52370435981567e-05, + "loss": 1.4503, + "step": 15934 + }, + { + "epoch": 0.5306102100505184, + "grad_norm": 1.3299468755722046, + "learning_rate": 2.5153063774935533e-05, + "loss": 1.4392, + "step": 15965 + }, + { + "epoch": 0.5316405211379952, + "grad_norm": 1.240678310394287, + "learning_rate": 2.506908222441045e-05, + "loss": 1.4412, + "step": 15996 + }, + { + "epoch": 0.532670832225472, + "grad_norm": 1.337936520576477, + "learning_rate": 2.498509989430187e-05, + "loss": 1.4254, + "step": 16027 + }, + { + "epoch": 0.5337011433129487, + "grad_norm": 1.302909016609192, + "learning_rate": 2.4901117732338958e-05, + "loss": 1.4436, + "step": 16058 + }, + { + "epoch": 0.5347314544004255, + "grad_norm": 1.2539550065994263, + "learning_rate": 2.481713668624899e-05, + "loss": 1.4496, + "step": 16089 + }, + { + "epoch": 0.5357617654879021, + "grad_norm": 1.287431001663208, + "learning_rate": 2.4733157703746663e-05, + "loss": 1.424, + "step": 16120 + }, + { + "epoch": 0.5367920765753789, + "grad_norm": 1.5333632230758667, + "learning_rate": 2.4649181732523392e-05, + "loss": 1.4399, + "step": 16151 + }, + { + "epoch": 0.5378223876628556, + "grad_norm": 1.2591406106948853, + "learning_rate": 2.4565209720236582e-05, + "loss": 1.439, + "step": 16182 + }, + { + "epoch": 0.5388526987503324, + "grad_norm": 1.3093276023864746, + "learning_rate": 2.4481242614498975e-05, + "loss": 1.4279, + "step": 16213 + }, + { + "epoch": 0.5398830098378091, + "grad_norm": 1.2824875116348267, + "learning_rate": 2.439728136286796e-05, + "loss": 1.4428, + "step": 16244 + }, + { + "epoch": 0.5409133209252859, + "grad_norm": 1.2775593996047974, + "learning_rate": 2.4313326912834852e-05, + "loss": 1.4352, + "step": 16275 + }, + { + "epoch": 0.5419436320127625, + "grad_norm": 1.4667550325393677, + "learning_rate": 2.4229380211814206e-05, + "loss": 1.4633, + "step": 16306 + }, + { + "epoch": 0.5429739431002393, + "grad_norm": 1.2620900869369507, + "learning_rate": 2.4145442207133124e-05, + "loss": 1.4482, + "step": 16337 + }, + { + "epoch": 0.544004254187716, + "grad_norm": 1.3041224479675293, + "learning_rate": 2.406151384602059e-05, + "loss": 1.4431, + "step": 16368 + }, + { + "epoch": 0.5450345652751928, + "grad_norm": 1.3634989261627197, + "learning_rate": 2.3977596075596747e-05, + "loss": 1.4186, + "step": 16399 + }, + { + "epoch": 0.5460648763626695, + "grad_norm": 1.2322940826416016, + "learning_rate": 2.3893689842862223e-05, + "loss": 1.4322, + "step": 16430 + }, + { + "epoch": 0.5470951874501463, + "grad_norm": 1.5554733276367188, + "learning_rate": 2.3809796094687475e-05, + "loss": 1.4337, + "step": 16461 + }, + { + "epoch": 0.5481254985376229, + "grad_norm": 1.4745500087738037, + "learning_rate": 2.372591577780202e-05, + "loss": 1.4411, + "step": 16492 + }, + { + "epoch": 0.5491558096250997, + "grad_norm": 1.2865196466445923, + "learning_rate": 2.3642049838783838e-05, + "loss": 1.429, + "step": 16523 + }, + { + "epoch": 0.5501861207125764, + "grad_norm": 1.399247407913208, + "learning_rate": 2.3558199224048666e-05, + "loss": 1.4753, + "step": 16554 + }, + { + "epoch": 0.5512164318000532, + "grad_norm": 1.2135406732559204, + "learning_rate": 2.347436487983929e-05, + "loss": 1.4553, + "step": 16585 + }, + { + "epoch": 0.55224674288753, + "grad_norm": 1.164150357246399, + "learning_rate": 2.3390547752214888e-05, + "loss": 1.4268, + "step": 16616 + }, + { + "epoch": 0.5532770539750066, + "grad_norm": 1.2363818883895874, + "learning_rate": 2.330674878704035e-05, + "loss": 1.4381, + "step": 16647 + }, + { + "epoch": 0.5543073650624833, + "grad_norm": 1.286139726638794, + "learning_rate": 2.322296892997561e-05, + "loss": 1.4492, + "step": 16678 + }, + { + "epoch": 0.5553376761499601, + "grad_norm": 1.2836147546768188, + "learning_rate": 2.313920912646497e-05, + "loss": 1.4128, + "step": 16709 + }, + { + "epoch": 0.5563679872374369, + "grad_norm": 1.253727674484253, + "learning_rate": 2.305547032172643e-05, + "loss": 1.4472, + "step": 16740 + }, + { + "epoch": 0.5573982983249136, + "grad_norm": 1.2580201625823975, + "learning_rate": 2.2971753460741014e-05, + "loss": 1.4461, + "step": 16771 + }, + { + "epoch": 0.5584286094123904, + "grad_norm": 1.2446421384811401, + "learning_rate": 2.288805948824212e-05, + "loss": 1.4267, + "step": 16802 + }, + { + "epoch": 0.559458920499867, + "grad_norm": 1.3572150468826294, + "learning_rate": 2.2804389348704858e-05, + "loss": 1.4222, + "step": 16833 + }, + { + "epoch": 0.5604892315873438, + "grad_norm": 1.3694707155227661, + "learning_rate": 2.2720743986335374e-05, + "loss": 1.4624, + "step": 16864 + }, + { + "epoch": 0.5615195426748205, + "grad_norm": 1.2654088735580444, + "learning_rate": 2.2637124345060233e-05, + "loss": 1.4379, + "step": 16895 + }, + { + "epoch": 0.5625498537622973, + "grad_norm": 1.3349469900131226, + "learning_rate": 2.2553531368515695e-05, + "loss": 1.4404, + "step": 16926 + }, + { + "epoch": 0.563580164849774, + "grad_norm": 1.2259774208068848, + "learning_rate": 2.2469966000037144e-05, + "loss": 1.4335, + "step": 16957 + }, + { + "epoch": 0.5646104759372508, + "grad_norm": 1.2973053455352783, + "learning_rate": 2.2386429182648417e-05, + "loss": 1.4397, + "step": 16988 + }, + { + "epoch": 0.5656407870247274, + "grad_norm": 1.2674601078033447, + "learning_rate": 2.230292185905114e-05, + "loss": 1.4256, + "step": 17019 + }, + { + "epoch": 0.5666710981122042, + "grad_norm": 1.243605136871338, + "learning_rate": 2.2219444971614116e-05, + "loss": 1.4404, + "step": 17050 + }, + { + "epoch": 0.5677014091996809, + "grad_norm": 1.2108361721038818, + "learning_rate": 2.2135999462362655e-05, + "loss": 1.4318, + "step": 17081 + }, + { + "epoch": 0.5687317202871577, + "grad_norm": 1.2497962713241577, + "learning_rate": 2.2052586272968003e-05, + "loss": 1.4409, + "step": 17112 + }, + { + "epoch": 0.5697620313746344, + "grad_norm": 1.2269086837768555, + "learning_rate": 2.196920634473666e-05, + "loss": 1.4417, + "step": 17143 + }, + { + "epoch": 0.5707923424621112, + "grad_norm": 1.3165903091430664, + "learning_rate": 2.1885860618599787e-05, + "loss": 1.4541, + "step": 17174 + }, + { + "epoch": 0.5718226535495878, + "grad_norm": 1.2117608785629272, + "learning_rate": 2.1802550035102577e-05, + "loss": 1.4457, + "step": 17205 + }, + { + "epoch": 0.5728529646370646, + "grad_norm": 1.2482073307037354, + "learning_rate": 2.171927553439363e-05, + "loss": 1.4408, + "step": 17236 + }, + { + "epoch": 0.5738832757245413, + "grad_norm": 1.2258682250976562, + "learning_rate": 2.1636038056214376e-05, + "loss": 1.4366, + "step": 17267 + }, + { + "epoch": 0.5749135868120181, + "grad_norm": 1.254062294960022, + "learning_rate": 2.155283853988844e-05, + "loss": 1.4187, + "step": 17298 + }, + { + "epoch": 0.5759438978994948, + "grad_norm": 1.3397905826568604, + "learning_rate": 2.146967792431106e-05, + "loss": 1.4316, + "step": 17329 + }, + { + "epoch": 0.5769742089869716, + "grad_norm": 1.3253263235092163, + "learning_rate": 2.138655714793849e-05, + "loss": 1.4361, + "step": 17360 + }, + { + "epoch": 0.5780045200744482, + "grad_norm": 1.2624903917312622, + "learning_rate": 2.1303477148777367e-05, + "loss": 1.4136, + "step": 17391 + }, + { + "epoch": 0.579034831161925, + "grad_norm": 1.3255977630615234, + "learning_rate": 2.122043886437421e-05, + "loss": 1.4552, + "step": 17422 + }, + { + "epoch": 0.5800651422494018, + "grad_norm": 1.300898790359497, + "learning_rate": 2.1137443231804765e-05, + "loss": 1.4152, + "step": 17453 + }, + { + "epoch": 0.5810954533368785, + "grad_norm": 1.2904343605041504, + "learning_rate": 2.105449118766347e-05, + "loss": 1.4195, + "step": 17484 + }, + { + "epoch": 0.5821257644243553, + "grad_norm": 1.3146878480911255, + "learning_rate": 2.097158366805287e-05, + "loss": 1.426, + "step": 17515 + }, + { + "epoch": 0.5831560755118319, + "grad_norm": 1.2454010248184204, + "learning_rate": 2.0888721608573047e-05, + "loss": 1.4239, + "step": 17546 + }, + { + "epoch": 0.5841863865993087, + "grad_norm": 1.194626808166504, + "learning_rate": 2.0805905944311087e-05, + "loss": 1.4416, + "step": 17577 + }, + { + "epoch": 0.5852166976867854, + "grad_norm": 1.359053373336792, + "learning_rate": 2.0723137609830497e-05, + "loss": 1.4112, + "step": 17608 + }, + { + "epoch": 0.5862470087742622, + "grad_norm": 1.2577933073043823, + "learning_rate": 2.0640417539160686e-05, + "loss": 1.4432, + "step": 17639 + }, + { + "epoch": 0.5872773198617389, + "grad_norm": 1.2604849338531494, + "learning_rate": 2.0557746665786427e-05, + "loss": 1.4184, + "step": 17670 + }, + { + "epoch": 0.5883076309492157, + "grad_norm": 1.2511252164840698, + "learning_rate": 2.0475125922637256e-05, + "loss": 1.4276, + "step": 17701 + }, + { + "epoch": 0.5893379420366923, + "grad_norm": 1.2841278314590454, + "learning_rate": 2.0392556242077047e-05, + "loss": 1.4345, + "step": 17732 + }, + { + "epoch": 0.5903682531241691, + "grad_norm": 1.3342245817184448, + "learning_rate": 2.031003855589343e-05, + "loss": 1.4212, + "step": 17763 + }, + { + "epoch": 0.5913985642116458, + "grad_norm": 1.352387547492981, + "learning_rate": 2.022757379528727e-05, + "loss": 1.4316, + "step": 17794 + }, + { + "epoch": 0.5924288752991226, + "grad_norm": 1.3534374237060547, + "learning_rate": 2.0145162890862184e-05, + "loss": 1.4352, + "step": 17825 + }, + { + "epoch": 0.5934591863865993, + "grad_norm": 1.2957963943481445, + "learning_rate": 2.0062806772614022e-05, + "loss": 1.4057, + "step": 17856 + }, + { + "epoch": 0.5944894974740761, + "grad_norm": 1.3178727626800537, + "learning_rate": 1.9980506369920392e-05, + "loss": 1.4323, + "step": 17887 + }, + { + "epoch": 0.5955198085615527, + "grad_norm": 1.3364850282669067, + "learning_rate": 1.989826261153015e-05, + "loss": 1.4228, + "step": 17918 + }, + { + "epoch": 0.5965501196490295, + "grad_norm": 1.283200979232788, + "learning_rate": 1.9816076425552923e-05, + "loss": 1.4348, + "step": 17949 + }, + { + "epoch": 0.5975804307365062, + "grad_norm": 1.2856223583221436, + "learning_rate": 1.9733948739448676e-05, + "loss": 1.4176, + "step": 17980 + }, + { + "epoch": 0.598610741823983, + "grad_norm": 1.253180742263794, + "learning_rate": 1.9651880480017155e-05, + "loss": 1.4175, + "step": 18011 + }, + { + "epoch": 0.5996410529114597, + "grad_norm": 1.3471016883850098, + "learning_rate": 1.9569872573387516e-05, + "loss": 1.433, + "step": 18042 + }, + { + "epoch": 0.6006713639989365, + "grad_norm": 1.2449748516082764, + "learning_rate": 1.9487925945007854e-05, + "loss": 1.4091, + "step": 18073 + }, + { + "epoch": 0.6017016750864131, + "grad_norm": 1.3311972618103027, + "learning_rate": 1.9406041519634726e-05, + "loss": 1.403, + "step": 18104 + }, + { + "epoch": 0.6027319861738899, + "grad_norm": 1.2645657062530518, + "learning_rate": 1.932422022132275e-05, + "loss": 1.4265, + "step": 18135 + }, + { + "epoch": 0.6037622972613667, + "grad_norm": 1.3313370943069458, + "learning_rate": 1.924246297341414e-05, + "loss": 1.4275, + "step": 18166 + }, + { + "epoch": 0.6047926083488434, + "grad_norm": 1.2827123403549194, + "learning_rate": 1.9160770698528338e-05, + "loss": 1.4277, + "step": 18197 + }, + { + "epoch": 0.6058229194363202, + "grad_norm": 1.2230308055877686, + "learning_rate": 1.907914431855156e-05, + "loss": 1.4391, + "step": 18228 + }, + { + "epoch": 0.6068532305237969, + "grad_norm": 1.2785223722457886, + "learning_rate": 1.8997584754626412e-05, + "loss": 1.4152, + "step": 18259 + }, + { + "epoch": 0.6078835416112736, + "grad_norm": 1.3152620792388916, + "learning_rate": 1.8916092927141486e-05, + "loss": 1.4137, + "step": 18290 + }, + { + "epoch": 0.6089138526987503, + "grad_norm": 1.1842609643936157, + "learning_rate": 1.883466975572098e-05, + "loss": 1.4141, + "step": 18321 + }, + { + "epoch": 0.6099441637862271, + "grad_norm": 1.2319703102111816, + "learning_rate": 1.8753316159214312e-05, + "loss": 1.4216, + "step": 18352 + }, + { + "epoch": 0.6109744748737038, + "grad_norm": 1.3239370584487915, + "learning_rate": 1.8672033055685766e-05, + "loss": 1.4184, + "step": 18383 + }, + { + "epoch": 0.6120047859611806, + "grad_norm": 1.2665941715240479, + "learning_rate": 1.8590821362404116e-05, + "loss": 1.4249, + "step": 18414 + }, + { + "epoch": 0.6130350970486572, + "grad_norm": 1.2569379806518555, + "learning_rate": 1.8509681995832294e-05, + "loss": 1.4242, + "step": 18445 + }, + { + "epoch": 0.614065408136134, + "grad_norm": 1.2848411798477173, + "learning_rate": 1.8428615871617004e-05, + "loss": 1.4166, + "step": 18476 + }, + { + "epoch": 0.6150957192236107, + "grad_norm": 1.2636574506759644, + "learning_rate": 1.8347623904578448e-05, + "loss": 1.4297, + "step": 18507 + }, + { + "epoch": 0.6161260303110875, + "grad_norm": 1.2672234773635864, + "learning_rate": 1.8266707008699975e-05, + "loss": 1.4244, + "step": 18538 + }, + { + "epoch": 0.6171563413985642, + "grad_norm": 1.2299143075942993, + "learning_rate": 1.818586609711774e-05, + "loss": 1.408, + "step": 18569 + }, + { + "epoch": 0.618186652486041, + "grad_norm": 1.2221580743789673, + "learning_rate": 1.8105102082110462e-05, + "loss": 1.4242, + "step": 18600 + }, + { + "epoch": 0.6192169635735176, + "grad_norm": 1.290737509727478, + "learning_rate": 1.8024415875089058e-05, + "loss": 1.4167, + "step": 18631 + }, + { + "epoch": 0.6202472746609944, + "grad_norm": 1.3236243724822998, + "learning_rate": 1.7943808386586407e-05, + "loss": 1.4341, + "step": 18662 + }, + { + "epoch": 0.6212775857484711, + "grad_norm": 1.1983164548873901, + "learning_rate": 1.7863280526247073e-05, + "loss": 1.4171, + "step": 18693 + }, + { + "epoch": 0.6223078968359479, + "grad_norm": 1.2706191539764404, + "learning_rate": 1.7782833202817003e-05, + "loss": 1.4268, + "step": 18724 + }, + { + "epoch": 0.6233382079234246, + "grad_norm": 1.2584494352340698, + "learning_rate": 1.7702467324133327e-05, + "loss": 1.4364, + "step": 18755 + }, + { + "epoch": 0.6243685190109014, + "grad_norm": 1.345226526260376, + "learning_rate": 1.7622183797114042e-05, + "loss": 1.4274, + "step": 18786 + }, + { + "epoch": 0.625398830098378, + "grad_norm": 1.3055671453475952, + "learning_rate": 1.7541983527747838e-05, + "loss": 1.4101, + "step": 18817 + }, + { + "epoch": 0.6264291411858548, + "grad_norm": 1.2878341674804688, + "learning_rate": 1.746186742108387e-05, + "loss": 1.4133, + "step": 18848 + }, + { + "epoch": 0.6274594522733316, + "grad_norm": 1.241191029548645, + "learning_rate": 1.73818363812215e-05, + "loss": 1.4038, + "step": 18879 + }, + { + "epoch": 0.6284897633608083, + "grad_norm": 1.8631796836853027, + "learning_rate": 1.7301891311300153e-05, + "loss": 1.3961, + "step": 18910 + }, + { + "epoch": 0.6295200744482851, + "grad_norm": 1.2781902551651, + "learning_rate": 1.7222033113489055e-05, + "loss": 1.4238, + "step": 18941 + }, + { + "epoch": 0.6305503855357618, + "grad_norm": 1.2679165601730347, + "learning_rate": 1.7142262688977127e-05, + "loss": 1.4236, + "step": 18972 + }, + { + "epoch": 0.6315806966232385, + "grad_norm": 1.257203459739685, + "learning_rate": 1.7062580937962764e-05, + "loss": 1.4156, + "step": 19003 + }, + { + "epoch": 0.6326110077107152, + "grad_norm": 1.284470796585083, + "learning_rate": 1.698298875964369e-05, + "loss": 1.4241, + "step": 19034 + }, + { + "epoch": 0.633641318798192, + "grad_norm": 1.310545802116394, + "learning_rate": 1.690348705220684e-05, + "loss": 1.4205, + "step": 19065 + }, + { + "epoch": 0.6346716298856687, + "grad_norm": 1.2868564128875732, + "learning_rate": 1.6824076712818156e-05, + "loss": 1.4238, + "step": 19096 + }, + { + "epoch": 0.6357019409731455, + "grad_norm": 1.2508702278137207, + "learning_rate": 1.6744758637612533e-05, + "loss": 1.4046, + "step": 19127 + }, + { + "epoch": 0.6367322520606222, + "grad_norm": 1.3149102926254272, + "learning_rate": 1.6665533721683664e-05, + "loss": 1.4211, + "step": 19158 + }, + { + "epoch": 0.6377625631480989, + "grad_norm": 1.3485240936279297, + "learning_rate": 1.6586402859073974e-05, + "loss": 1.4167, + "step": 19189 + }, + { + "epoch": 0.6387928742355756, + "grad_norm": 1.2397938966751099, + "learning_rate": 1.6507366942764463e-05, + "loss": 1.4242, + "step": 19220 + }, + { + "epoch": 0.6398231853230524, + "grad_norm": 1.2909672260284424, + "learning_rate": 1.6428426864664732e-05, + "loss": 1.403, + "step": 19251 + }, + { + "epoch": 0.6408534964105291, + "grad_norm": 1.290385365486145, + "learning_rate": 1.6349583515602816e-05, + "loss": 1.4082, + "step": 19282 + }, + { + "epoch": 0.6418838074980059, + "grad_norm": 1.3623126745224, + "learning_rate": 1.6270837785315208e-05, + "loss": 1.4075, + "step": 19313 + }, + { + "epoch": 0.6429141185854825, + "grad_norm": 1.276903510093689, + "learning_rate": 1.619219056243676e-05, + "loss": 1.4135, + "step": 19344 + }, + { + "epoch": 0.6439444296729593, + "grad_norm": 1.2038910388946533, + "learning_rate": 1.6113642734490698e-05, + "loss": 1.4162, + "step": 19375 + }, + { + "epoch": 0.644974740760436, + "grad_norm": 1.2092891931533813, + "learning_rate": 1.6035195187878577e-05, + "loss": 1.4285, + "step": 19406 + }, + { + "epoch": 0.6460050518479128, + "grad_norm": 1.2983031272888184, + "learning_rate": 1.5956848807870305e-05, + "loss": 1.4128, + "step": 19437 + }, + { + "epoch": 0.6470353629353895, + "grad_norm": 1.279845952987671, + "learning_rate": 1.587860447859413e-05, + "loss": 1.4351, + "step": 19468 + }, + { + "epoch": 0.6480656740228663, + "grad_norm": 1.2781362533569336, + "learning_rate": 1.5800463083026686e-05, + "loss": 1.4118, + "step": 19499 + }, + { + "epoch": 0.6490959851103429, + "grad_norm": 1.2652825117111206, + "learning_rate": 1.572242550298298e-05, + "loss": 1.4195, + "step": 19530 + }, + { + "epoch": 0.6501262961978197, + "grad_norm": 1.3177101612091064, + "learning_rate": 1.56444926191065e-05, + "loss": 1.4307, + "step": 19561 + }, + { + "epoch": 0.6511566072852965, + "grad_norm": 1.2758272886276245, + "learning_rate": 1.5566665310859257e-05, + "loss": 1.4096, + "step": 19592 + }, + { + "epoch": 0.6521869183727732, + "grad_norm": 1.2265219688415527, + "learning_rate": 1.5488944456511846e-05, + "loss": 1.4098, + "step": 19623 + }, + { + "epoch": 0.65321722946025, + "grad_norm": 1.258945345878601, + "learning_rate": 1.5411330933133546e-05, + "loss": 1.4274, + "step": 19654 + }, + { + "epoch": 0.6542475405477267, + "grad_norm": 1.2599055767059326, + "learning_rate": 1.533382561658241e-05, + "loss": 1.4207, + "step": 19685 + }, + { + "epoch": 0.6552778516352034, + "grad_norm": 1.2502135038375854, + "learning_rate": 1.525642938149541e-05, + "loss": 1.4046, + "step": 19716 + }, + { + "epoch": 0.6563081627226801, + "grad_norm": 1.2734349966049194, + "learning_rate": 1.5179143101278536e-05, + "loss": 1.41, + "step": 19747 + }, + { + "epoch": 0.6573384738101569, + "grad_norm": 1.2801038026809692, + "learning_rate": 1.5101967648096955e-05, + "loss": 1.4088, + "step": 19778 + }, + { + "epoch": 0.6583687848976336, + "grad_norm": 1.2488126754760742, + "learning_rate": 1.5024903892865172e-05, + "loss": 1.4111, + "step": 19809 + }, + { + "epoch": 0.6593990959851104, + "grad_norm": 1.2418783903121948, + "learning_rate": 1.4947952705237184e-05, + "loss": 1.384, + "step": 19840 + }, + { + "epoch": 0.6604294070725871, + "grad_norm": 1.2566567659378052, + "learning_rate": 1.4871114953596682e-05, + "loss": 1.4127, + "step": 19871 + }, + { + "epoch": 0.6614597181600638, + "grad_norm": 1.2431600093841553, + "learning_rate": 1.4794391505047256e-05, + "loss": 1.4015, + "step": 19902 + }, + { + "epoch": 0.6624900292475405, + "grad_norm": 1.3174066543579102, + "learning_rate": 1.4717783225402596e-05, + "loss": 1.4113, + "step": 19933 + }, + { + "epoch": 0.6635203403350173, + "grad_norm": 1.3124332427978516, + "learning_rate": 1.4641290979176735e-05, + "loss": 1.421, + "step": 19964 + }, + { + "epoch": 0.664550651422494, + "grad_norm": 1.2595762014389038, + "learning_rate": 1.4564915629574246e-05, + "loss": 1.409, + "step": 19995 + }, + { + "epoch": 0.6655809625099708, + "grad_norm": 1.2872180938720703, + "learning_rate": 1.4488658038480601e-05, + "loss": 1.4082, + "step": 20026 + }, + { + "epoch": 0.6666112735974475, + "grad_norm": 1.27680242061615, + "learning_rate": 1.4412519066452323e-05, + "loss": 1.3979, + "step": 20057 + }, + { + "epoch": 0.6676415846849242, + "grad_norm": 1.2753857374191284, + "learning_rate": 1.4336499572707373e-05, + "loss": 1.4227, + "step": 20088 + }, + { + "epoch": 0.6686718957724009, + "grad_norm": 1.2680202722549438, + "learning_rate": 1.4260600415115433e-05, + "loss": 1.418, + "step": 20119 + }, + { + "epoch": 0.6697022068598777, + "grad_norm": 1.3002320528030396, + "learning_rate": 1.4184822450188137e-05, + "loss": 1.4133, + "step": 20150 + }, + { + "epoch": 0.6707325179473544, + "grad_norm": 1.3236373662948608, + "learning_rate": 1.410916653306954e-05, + "loss": 1.4133, + "step": 20181 + }, + { + "epoch": 0.6717628290348312, + "grad_norm": 1.3784340620040894, + "learning_rate": 1.403363351752639e-05, + "loss": 1.4064, + "step": 20212 + }, + { + "epoch": 0.6727931401223078, + "grad_norm": 1.2793350219726562, + "learning_rate": 1.3958224255938485e-05, + "loss": 1.4203, + "step": 20243 + }, + { + "epoch": 0.6738234512097846, + "grad_norm": 1.3510205745697021, + "learning_rate": 1.388293959928911e-05, + "loss": 1.418, + "step": 20274 + }, + { + "epoch": 0.6748537622972614, + "grad_norm": 1.2981188297271729, + "learning_rate": 1.3807780397155379e-05, + "loss": 1.4019, + "step": 20305 + }, + { + "epoch": 0.6758840733847381, + "grad_norm": 1.2599388360977173, + "learning_rate": 1.3732747497698655e-05, + "loss": 1.4187, + "step": 20336 + }, + { + "epoch": 0.6769143844722149, + "grad_norm": 1.2741434574127197, + "learning_rate": 1.3657841747655038e-05, + "loss": 1.4183, + "step": 20367 + }, + { + "epoch": 0.6779446955596916, + "grad_norm": 1.2376216650009155, + "learning_rate": 1.3583063992325706e-05, + "loss": 1.4208, + "step": 20398 + }, + { + "epoch": 0.6789750066471683, + "grad_norm": 1.341134786605835, + "learning_rate": 1.3508415075567496e-05, + "loss": 1.4015, + "step": 20429 + }, + { + "epoch": 0.680005317734645, + "grad_norm": 1.3483457565307617, + "learning_rate": 1.343389583978327e-05, + "loss": 1.4043, + "step": 20460 + }, + { + "epoch": 0.6810356288221218, + "grad_norm": 1.3255680799484253, + "learning_rate": 1.3359507125912468e-05, + "loss": 1.4162, + "step": 20491 + }, + { + "epoch": 0.6820659399095985, + "grad_norm": 1.211305022239685, + "learning_rate": 1.3285249773421627e-05, + "loss": 1.4043, + "step": 20522 + }, + { + "epoch": 0.6830962509970753, + "grad_norm": 1.3049174547195435, + "learning_rate": 1.3211124620294884e-05, + "loss": 1.4012, + "step": 20553 + }, + { + "epoch": 0.684126562084552, + "grad_norm": 1.2884812355041504, + "learning_rate": 1.313713250302451e-05, + "loss": 1.419, + "step": 20584 + }, + { + "epoch": 0.6851568731720287, + "grad_norm": 1.2465201616287231, + "learning_rate": 1.3063274256601479e-05, + "loss": 1.394, + "step": 20615 + }, + { + "epoch": 0.6861871842595054, + "grad_norm": 1.2868762016296387, + "learning_rate": 1.2989550714506086e-05, + "loss": 1.3975, + "step": 20646 + }, + { + "epoch": 0.6872174953469822, + "grad_norm": 1.2728379964828491, + "learning_rate": 1.291596270869846e-05, + "loss": 1.3918, + "step": 20677 + }, + { + "epoch": 0.6882478064344589, + "grad_norm": 1.265869379043579, + "learning_rate": 1.284251106960927e-05, + "loss": 1.402, + "step": 20708 + }, + { + "epoch": 0.6892781175219357, + "grad_norm": 1.3357373476028442, + "learning_rate": 1.2769196626130263e-05, + "loss": 1.3975, + "step": 20739 + }, + { + "epoch": 0.6903084286094124, + "grad_norm": 1.216797947883606, + "learning_rate": 1.2696020205604969e-05, + "loss": 1.3953, + "step": 20770 + }, + { + "epoch": 0.6913387396968891, + "grad_norm": 1.269227385520935, + "learning_rate": 1.2622982633819359e-05, + "loss": 1.4154, + "step": 20801 + }, + { + "epoch": 0.6923690507843658, + "grad_norm": 1.3336331844329834, + "learning_rate": 1.2550084734992484e-05, + "loss": 1.3992, + "step": 20832 + }, + { + "epoch": 0.6933993618718426, + "grad_norm": 1.2936463356018066, + "learning_rate": 1.247732733176724e-05, + "loss": 1.4147, + "step": 20863 + }, + { + "epoch": 0.6944296729593193, + "grad_norm": 1.344826102256775, + "learning_rate": 1.2404711245201044e-05, + "loss": 1.3878, + "step": 20894 + }, + { + "epoch": 0.6954599840467961, + "grad_norm": 1.2611995935440063, + "learning_rate": 1.2332237294756535e-05, + "loss": 1.4088, + "step": 20925 + }, + { + "epoch": 0.6964902951342729, + "grad_norm": 1.3274885416030884, + "learning_rate": 1.225990629829241e-05, + "loss": 1.4036, + "step": 20956 + }, + { + "epoch": 0.6975206062217495, + "grad_norm": 1.2847373485565186, + "learning_rate": 1.2187719072054136e-05, + "loss": 1.398, + "step": 20987 + }, + { + "epoch": 0.6985509173092262, + "grad_norm": 1.2856248617172241, + "learning_rate": 1.2115676430664735e-05, + "loss": 1.4101, + "step": 21018 + }, + { + "epoch": 0.699581228396703, + "grad_norm": 1.3064154386520386, + "learning_rate": 1.2043779187115647e-05, + "loss": 1.4081, + "step": 21049 + }, + { + "epoch": 0.7006115394841798, + "grad_norm": 1.253602147102356, + "learning_rate": 1.1972028152757476e-05, + "loss": 1.4123, + "step": 21080 + }, + { + "epoch": 0.7016418505716565, + "grad_norm": 1.2678899765014648, + "learning_rate": 1.1900424137290889e-05, + "loss": 1.3969, + "step": 21111 + }, + { + "epoch": 0.7026721616591332, + "grad_norm": 1.2261760234832764, + "learning_rate": 1.1828967948757482e-05, + "loss": 1.4009, + "step": 21142 + }, + { + "epoch": 0.7037024727466099, + "grad_norm": 1.540486216545105, + "learning_rate": 1.175766039353062e-05, + "loss": 1.4215, + "step": 21173 + }, + { + "epoch": 0.7047327838340867, + "grad_norm": 1.2508059740066528, + "learning_rate": 1.1686502276306382e-05, + "loss": 1.4046, + "step": 21204 + }, + { + "epoch": 0.7057630949215634, + "grad_norm": 1.2918591499328613, + "learning_rate": 1.1615494400094445e-05, + "loss": 1.4301, + "step": 21235 + }, + { + "epoch": 0.7067934060090402, + "grad_norm": 1.240178108215332, + "learning_rate": 1.1544637566209029e-05, + "loss": 1.3888, + "step": 21266 + }, + { + "epoch": 0.7078237170965169, + "grad_norm": 1.2358977794647217, + "learning_rate": 1.1473932574259886e-05, + "loss": 1.415, + "step": 21297 + }, + { + "epoch": 0.7088540281839936, + "grad_norm": 1.2963451147079468, + "learning_rate": 1.1403380222143247e-05, + "loss": 1.4002, + "step": 21328 + }, + { + "epoch": 0.7098843392714703, + "grad_norm": 1.3245363235473633, + "learning_rate": 1.1332981306032808e-05, + "loss": 1.3945, + "step": 21359 + }, + { + "epoch": 0.7109146503589471, + "grad_norm": 1.2833342552185059, + "learning_rate": 1.1262736620370762e-05, + "loss": 1.4054, + "step": 21390 + }, + { + "epoch": 0.7119449614464238, + "grad_norm": 1.3230944871902466, + "learning_rate": 1.1192646957858854e-05, + "loss": 1.398, + "step": 21421 + }, + { + "epoch": 0.7129752725339006, + "grad_norm": 1.2515650987625122, + "learning_rate": 1.1122713109449381e-05, + "loss": 1.3958, + "step": 21452 + }, + { + "epoch": 0.7140055836213773, + "grad_norm": 1.313057780265808, + "learning_rate": 1.105293586433634e-05, + "loss": 1.3909, + "step": 21483 + }, + { + "epoch": 0.715035894708854, + "grad_norm": 1.2700668573379517, + "learning_rate": 1.0983316009946446e-05, + "loss": 1.3939, + "step": 21514 + }, + { + "epoch": 0.7160662057963307, + "grad_norm": 1.2487835884094238, + "learning_rate": 1.0913854331930282e-05, + "loss": 1.4162, + "step": 21545 + }, + { + "epoch": 0.7170965168838075, + "grad_norm": 1.2748737335205078, + "learning_rate": 1.0844551614153456e-05, + "loss": 1.3984, + "step": 21576 + }, + { + "epoch": 0.7181268279712842, + "grad_norm": 1.24228036403656, + "learning_rate": 1.0775408638687725e-05, + "loss": 1.4002, + "step": 21607 + }, + { + "epoch": 0.719157139058761, + "grad_norm": 1.3365492820739746, + "learning_rate": 1.0706426185802165e-05, + "loss": 1.4091, + "step": 21638 + }, + { + "epoch": 0.7201874501462378, + "grad_norm": 1.2073006629943848, + "learning_rate": 1.0637605033954371e-05, + "loss": 1.4034, + "step": 21669 + }, + { + "epoch": 0.7212177612337144, + "grad_norm": 1.2873163223266602, + "learning_rate": 1.05689459597817e-05, + "loss": 1.3994, + "step": 21700 + }, + { + "epoch": 0.7222480723211911, + "grad_norm": 1.3623207807540894, + "learning_rate": 1.050044973809246e-05, + "loss": 1.3827, + "step": 21731 + }, + { + "epoch": 0.7232783834086679, + "grad_norm": 1.256643533706665, + "learning_rate": 1.043211714185722e-05, + "loss": 1.3989, + "step": 21762 + }, + { + "epoch": 0.7243086944961447, + "grad_norm": 1.201434850692749, + "learning_rate": 1.036394894220003e-05, + "loss": 1.3892, + "step": 21793 + }, + { + "epoch": 0.7253390055836214, + "grad_norm": 1.335642695426941, + "learning_rate": 1.0295945908389751e-05, + "loss": 1.4077, + "step": 21824 + }, + { + "epoch": 0.7263693166710982, + "grad_norm": 1.252847671508789, + "learning_rate": 1.0228108807831393e-05, + "loss": 1.4077, + "step": 21855 + }, + { + "epoch": 0.7273996277585748, + "grad_norm": 1.3838329315185547, + "learning_rate": 1.01604384060574e-05, + "loss": 1.3944, + "step": 21886 + }, + { + "epoch": 0.7284299388460516, + "grad_norm": 1.3425817489624023, + "learning_rate": 1.009293546671907e-05, + "loss": 1.4067, + "step": 21917 + }, + { + "epoch": 0.7294602499335283, + "grad_norm": 1.3198227882385254, + "learning_rate": 1.002560075157791e-05, + "loss": 1.4043, + "step": 21948 + }, + { + "epoch": 0.7304905610210051, + "grad_norm": 1.3169294595718384, + "learning_rate": 9.958435020496995e-06, + "loss": 1.3743, + "step": 21979 + }, + { + "epoch": 0.7315208721084818, + "grad_norm": 1.2145452499389648, + "learning_rate": 9.89143903143249e-06, + "loss": 1.3875, + "step": 22010 + }, + { + "epoch": 0.7325511831959585, + "grad_norm": 1.368464469909668, + "learning_rate": 9.824613540425038e-06, + "loss": 1.3939, + "step": 22041 + }, + { + "epoch": 0.7335814942834352, + "grad_norm": 1.2481716871261597, + "learning_rate": 9.757959301591197e-06, + "loss": 1.4032, + "step": 22072 + }, + { + "epoch": 0.734611805370912, + "grad_norm": 1.225689172744751, + "learning_rate": 9.691477067115017e-06, + "loss": 1.4057, + "step": 22103 + }, + { + "epoch": 0.7356421164583887, + "grad_norm": 1.2322176694869995, + "learning_rate": 9.625167587239467e-06, + "loss": 1.3983, + "step": 22134 + }, + { + "epoch": 0.7366724275458655, + "grad_norm": 1.2423603534698486, + "learning_rate": 9.559031610258007e-06, + "loss": 1.4246, + "step": 22165 + }, + { + "epoch": 0.7377027386333422, + "grad_norm": 1.2707546949386597, + "learning_rate": 9.493069882506164e-06, + "loss": 1.4033, + "step": 22196 + }, + { + "epoch": 0.7387330497208189, + "grad_norm": 1.2819782495498657, + "learning_rate": 9.427283148353056e-06, + "loss": 1.3942, + "step": 22227 + }, + { + "epoch": 0.7397633608082956, + "grad_norm": 1.278111219406128, + "learning_rate": 9.361672150193052e-06, + "loss": 1.4124, + "step": 22258 + }, + { + "epoch": 0.7407936718957724, + "grad_norm": 1.2402000427246094, + "learning_rate": 9.29623762843734e-06, + "loss": 1.3784, + "step": 22289 + }, + { + "epoch": 0.7418239829832491, + "grad_norm": 1.2294648885726929, + "learning_rate": 9.230980321505594e-06, + "loss": 1.3998, + "step": 22320 + }, + { + "epoch": 0.7428542940707259, + "grad_norm": 1.3570529222488403, + "learning_rate": 9.165900965817668e-06, + "loss": 1.3867, + "step": 22351 + }, + { + "epoch": 0.7438846051582026, + "grad_norm": 1.2765589952468872, + "learning_rate": 9.101000295785245e-06, + "loss": 1.3848, + "step": 22382 + }, + { + "epoch": 0.7449149162456793, + "grad_norm": 1.301269292831421, + "learning_rate": 9.036279043803565e-06, + "loss": 1.3976, + "step": 22413 + }, + { + "epoch": 0.745945227333156, + "grad_norm": 1.3582361936569214, + "learning_rate": 8.971737940243147e-06, + "loss": 1.398, + "step": 22444 + }, + { + "epoch": 0.7469755384206328, + "grad_norm": 1.3054485321044922, + "learning_rate": 8.907377713441592e-06, + "loss": 1.402, + "step": 22475 + }, + { + "epoch": 0.7480058495081096, + "grad_norm": 1.2361812591552734, + "learning_rate": 8.843199089695293e-06, + "loss": 1.4097, + "step": 22506 + }, + { + "epoch": 0.7490361605955863, + "grad_norm": 1.2720493078231812, + "learning_rate": 8.779202793251311e-06, + "loss": 1.4046, + "step": 22537 + }, + { + "epoch": 0.7500664716830631, + "grad_norm": 1.2494639158248901, + "learning_rate": 8.715389546299149e-06, + "loss": 1.3858, + "step": 22568 + }, + { + "epoch": 0.7510967827705397, + "grad_norm": 1.2343871593475342, + "learning_rate": 8.651760068962617e-06, + "loss": 1.3896, + "step": 22599 + }, + { + "epoch": 0.7521270938580165, + "grad_norm": 1.1934345960617065, + "learning_rate": 8.588315079291733e-06, + "loss": 1.4095, + "step": 22630 + }, + { + "epoch": 0.7531574049454932, + "grad_norm": 1.2811630964279175, + "learning_rate": 8.52505529325457e-06, + "loss": 1.3954, + "step": 22661 + }, + { + "epoch": 0.75418771603297, + "grad_norm": 1.2676504850387573, + "learning_rate": 8.461981424729216e-06, + "loss": 1.3901, + "step": 22692 + }, + { + "epoch": 0.7552180271204467, + "grad_norm": 1.3221408128738403, + "learning_rate": 8.399094185495725e-06, + "loss": 1.4057, + "step": 22723 + }, + { + "epoch": 0.7562483382079235, + "grad_norm": 1.2741389274597168, + "learning_rate": 8.336394285228017e-06, + "loss": 1.3964, + "step": 22754 + }, + { + "epoch": 0.7572786492954001, + "grad_norm": 1.329860806465149, + "learning_rate": 8.273882431485952e-06, + "loss": 1.3946, + "step": 22785 + }, + { + "epoch": 0.7583089603828769, + "grad_norm": 1.3073118925094604, + "learning_rate": 8.211559329707316e-06, + "loss": 1.3937, + "step": 22816 + }, + { + "epoch": 0.7593392714703536, + "grad_norm": 1.2866522073745728, + "learning_rate": 8.149425683199823e-06, + "loss": 1.3999, + "step": 22847 + }, + { + "epoch": 0.7603695825578304, + "grad_norm": 1.2539178133010864, + "learning_rate": 8.08748219313325e-06, + "loss": 1.398, + "step": 22878 + }, + { + "epoch": 0.7613998936453071, + "grad_norm": 1.279863715171814, + "learning_rate": 8.025729558531453e-06, + "loss": 1.4155, + "step": 22909 + }, + { + "epoch": 0.7624302047327839, + "grad_norm": 1.2936811447143555, + "learning_rate": 7.964168476264508e-06, + "loss": 1.4036, + "step": 22940 + }, + { + "epoch": 0.7634605158202605, + "grad_norm": 1.2729599475860596, + "learning_rate": 7.902799641040884e-06, + "loss": 1.4003, + "step": 22971 + }, + { + "epoch": 0.7644908269077373, + "grad_norm": 1.2257497310638428, + "learning_rate": 7.841623745399523e-06, + "loss": 1.408, + "step": 23002 + }, + { + "epoch": 0.765521137995214, + "grad_norm": 1.254761815071106, + "learning_rate": 7.780641479702114e-06, + "loss": 1.3925, + "step": 23033 + }, + { + "epoch": 0.7665514490826908, + "grad_norm": 1.2740334272384644, + "learning_rate": 7.719853532125227e-06, + "loss": 1.3996, + "step": 23064 + }, + { + "epoch": 0.7675817601701675, + "grad_norm": 1.2421025037765503, + "learning_rate": 7.65926058865258e-06, + "loss": 1.3852, + "step": 23095 + }, + { + "epoch": 0.7686120712576442, + "grad_norm": 1.3271669149398804, + "learning_rate": 7.598863333067313e-06, + "loss": 1.408, + "step": 23126 + }, + { + "epoch": 0.769642382345121, + "grad_norm": 1.3040279150009155, + "learning_rate": 7.538662446944253e-06, + "loss": 1.3718, + "step": 23157 + }, + { + "epoch": 0.7706726934325977, + "grad_norm": 1.230797290802002, + "learning_rate": 7.478658609642211e-06, + "loss": 1.3776, + "step": 23188 + }, + { + "epoch": 0.7717030045200745, + "grad_norm": 1.2709274291992188, + "learning_rate": 7.418852498296327e-06, + "loss": 1.3975, + "step": 23219 + }, + { + "epoch": 0.7727333156075512, + "grad_norm": 1.227398157119751, + "learning_rate": 7.359244787810457e-06, + "loss": 1.382, + "step": 23250 + }, + { + "epoch": 0.773763626695028, + "grad_norm": 1.242308259010315, + "learning_rate": 7.299836150849493e-06, + "loss": 1.3792, + "step": 23281 + }, + { + "epoch": 0.7747939377825046, + "grad_norm": 1.2658405303955078, + "learning_rate": 7.240627257831847e-06, + "loss": 1.3699, + "step": 23312 + }, + { + "epoch": 0.7758242488699814, + "grad_norm": 1.3357101678848267, + "learning_rate": 7.1816187769218195e-06, + "loss": 1.3972, + "step": 23343 + }, + { + "epoch": 0.7768545599574581, + "grad_norm": 1.2248833179473877, + "learning_rate": 7.1228113740220895e-06, + "loss": 1.3875, + "step": 23374 + }, + { + "epoch": 0.7778848710449349, + "grad_norm": 1.2615251541137695, + "learning_rate": 7.064205712766226e-06, + "loss": 1.3947, + "step": 23405 + }, + { + "epoch": 0.7789151821324116, + "grad_norm": 1.2719477415084839, + "learning_rate": 7.005802454511129e-06, + "loss": 1.3943, + "step": 23436 + }, + { + "epoch": 0.7799454932198884, + "grad_norm": 1.2429877519607544, + "learning_rate": 6.947602258329639e-06, + "loss": 1.3924, + "step": 23467 + }, + { + "epoch": 0.780975804307365, + "grad_norm": 1.3180112838745117, + "learning_rate": 6.889605781003078e-06, + "loss": 1.4095, + "step": 23498 + }, + { + "epoch": 0.7820061153948418, + "grad_norm": 1.3340109586715698, + "learning_rate": 6.831813677013776e-06, + "loss": 1.3873, + "step": 23529 + }, + { + "epoch": 0.7830364264823185, + "grad_norm": 1.2713093757629395, + "learning_rate": 6.774226598537792e-06, + "loss": 1.3882, + "step": 23560 + }, + { + "epoch": 0.7840667375697953, + "grad_norm": 1.2504241466522217, + "learning_rate": 6.716845195437482e-06, + "loss": 1.3795, + "step": 23591 + }, + { + "epoch": 0.785097048657272, + "grad_norm": 1.273703694343567, + "learning_rate": 6.659670115254168e-06, + "loss": 1.3819, + "step": 23622 + }, + { + "epoch": 0.7861273597447488, + "grad_norm": 1.3121949434280396, + "learning_rate": 6.602702003200872e-06, + "loss": 1.3827, + "step": 23653 + }, + { + "epoch": 0.7871576708322254, + "grad_norm": 1.2552127838134766, + "learning_rate": 6.545941502154992e-06, + "loss": 1.3935, + "step": 23684 + }, + { + "epoch": 0.7881879819197022, + "grad_norm": 1.2457008361816406, + "learning_rate": 6.489389252651057e-06, + "loss": 1.3847, + "step": 23715 + }, + { + "epoch": 0.7892182930071789, + "grad_norm": 1.2819870710372925, + "learning_rate": 6.4330458928735325e-06, + "loss": 1.3965, + "step": 23746 + }, + { + "epoch": 0.7902486040946557, + "grad_norm": 1.2543584108352661, + "learning_rate": 6.376912058649559e-06, + "loss": 1.4025, + "step": 23777 + }, + { + "epoch": 0.7912789151821324, + "grad_norm": 1.2502461671829224, + "learning_rate": 6.320988383441845e-06, + "loss": 1.3799, + "step": 23808 + }, + { + "epoch": 0.7923092262696092, + "grad_norm": 1.2568906545639038, + "learning_rate": 6.265275498341452e-06, + "loss": 1.3887, + "step": 23839 + }, + { + "epoch": 0.7933395373570858, + "grad_norm": 1.2879040241241455, + "learning_rate": 6.209774032060714e-06, + "loss": 1.3922, + "step": 23870 + }, + { + "epoch": 0.7943698484445626, + "grad_norm": 1.2547533512115479, + "learning_rate": 6.1544846109261365e-06, + "loss": 1.3891, + "step": 23901 + }, + { + "epoch": 0.7954001595320394, + "grad_norm": 1.2941306829452515, + "learning_rate": 6.099407858871342e-06, + "loss": 1.3914, + "step": 23932 + }, + { + "epoch": 0.7964304706195161, + "grad_norm": 1.3194507360458374, + "learning_rate": 6.044544397429958e-06, + "loss": 1.3857, + "step": 23963 + }, + { + "epoch": 0.7974607817069929, + "grad_norm": 1.2143921852111816, + "learning_rate": 5.989894845728708e-06, + "loss": 1.4041, + "step": 23994 + }, + { + "epoch": 0.7984910927944695, + "grad_norm": 1.2587990760803223, + "learning_rate": 5.9354598204803605e-06, + "loss": 1.3901, + "step": 24025 + }, + { + "epoch": 0.7995214038819463, + "grad_norm": 1.2482203245162964, + "learning_rate": 5.881239935976762e-06, + "loss": 1.384, + "step": 24056 + }, + { + "epoch": 0.800551714969423, + "grad_norm": 1.2880163192749023, + "learning_rate": 5.827235804081954e-06, + "loss": 1.3876, + "step": 24087 + }, + { + "epoch": 0.8015820260568998, + "grad_norm": 1.2727841138839722, + "learning_rate": 5.773448034225221e-06, + "loss": 1.3752, + "step": 24118 + }, + { + "epoch": 0.8026123371443765, + "grad_norm": 1.2767062187194824, + "learning_rate": 5.719877233394228e-06, + "loss": 1.4, + "step": 24149 + }, + { + "epoch": 0.8036426482318533, + "grad_norm": 1.2654463052749634, + "learning_rate": 5.666524006128191e-06, + "loss": 1.39, + "step": 24180 + }, + { + "epoch": 0.8046729593193299, + "grad_norm": 1.2623034715652466, + "learning_rate": 5.613388954511015e-06, + "loss": 1.3885, + "step": 24211 + }, + { + "epoch": 0.8057032704068067, + "grad_norm": 1.303368330001831, + "learning_rate": 5.560472678164552e-06, + "loss": 1.3933, + "step": 24242 + }, + { + "epoch": 0.8067335814942834, + "grad_norm": 1.232909917831421, + "learning_rate": 5.507775774241775e-06, + "loss": 1.3897, + "step": 24273 + }, + { + "epoch": 0.8077638925817602, + "grad_norm": 1.3074171543121338, + "learning_rate": 5.4552988374200945e-06, + "loss": 1.3836, + "step": 24304 + }, + { + "epoch": 0.8087942036692369, + "grad_norm": 1.287463903427124, + "learning_rate": 5.403042459894597e-06, + "loss": 1.3889, + "step": 24335 + }, + { + "epoch": 0.8098245147567137, + "grad_norm": 1.2616747617721558, + "learning_rate": 5.3510072313714135e-06, + "loss": 1.3978, + "step": 24366 + }, + { + "epoch": 0.8108548258441903, + "grad_norm": 1.2531288862228394, + "learning_rate": 5.2991937390610205e-06, + "loss": 1.4116, + "step": 24397 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8111560432178168e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-24416/training_args.bin b/checkpoint-24416/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..974208468b82a3c5684aaa384776477cf21c18ca --- /dev/null +++ b/checkpoint-24416/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a23be0ff07d6d3142f7c0980f91dddba845519c24fcb411cbb4b9ddb1513ff +size 5304 diff --git a/checkpoint-27468/config.json b/checkpoint-27468/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aaa74176892d42e1c7f5979b7ddf8ab15985d3 --- /dev/null +++ b/checkpoint-27468/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "/mnt/parscratch/users/acp23ay/private/models/Llama-3.1-8B-Instruct-ta-madlad-mean/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 138256 +} diff --git a/checkpoint-27468/generation_config.json b/checkpoint-27468/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-27468/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-27468/model-00001-of-00007.safetensors b/checkpoint-27468/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8363011e37d45397bdafb7583c37c44701ef085d --- /dev/null +++ b/checkpoint-27468/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c4b0a6de37b7dcc73983d06e47fd3f42535dfcc1b82389bf50345ed20c40c70 +size 4983197184 diff --git a/checkpoint-27468/model-00002-of-00007.safetensors b/checkpoint-27468/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d32d83cb96a109af411b9cf577e7fbfe07ea76fc --- /dev/null +++ b/checkpoint-27468/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168629c67732309d49f60a5ec48a6d160212bc987365e82e183dfbf74ba0c1f3 +size 4899116432 diff --git a/checkpoint-27468/model-00003-of-00007.safetensors b/checkpoint-27468/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-27468/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-27468/model-00004-of-00007.safetensors b/checkpoint-27468/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-27468/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-27468/model-00005-of-00007.safetensors b/checkpoint-27468/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-27468/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-27468/model-00006-of-00007.safetensors b/checkpoint-27468/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4040407da7515f4aa192160d68343b0ee56ff4b8 --- /dev/null +++ b/checkpoint-27468/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93cc4b82af2ec836f7c9485d57eeae4b2724b04b563e51c55d51177725e06670 +size 4999813120 diff --git a/checkpoint-27468/model-00007-of-00007.safetensors b/checkpoint-27468/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..06732d9699ac151c39fc4bc221926f5e0982ef58 --- /dev/null +++ b/checkpoint-27468/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ba375f5912b918d21a925664ae53d94a999a6a32c49fac6e8a79ecf62b21a07 +size 2734998184 diff --git a/checkpoint-27468/model.safetensors.index.json b/checkpoint-27468/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..318803c6a3dd771c7f7c3b8038a896af7c8322ae --- /dev/null +++ b/checkpoint-27468/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32448724992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-27468/optimizer.pt b/checkpoint-27468/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d668172113bc427ea817fe972d629f6d78d63cdd --- /dev/null +++ b/checkpoint-27468/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a61038582ddc1003ef3f4b318f77cae1b128704a7a053e2ea9b54a9f2e46421a +size 16040396334 diff --git a/checkpoint-27468/rng_state.pth b/checkpoint-27468/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-27468/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-27468/scheduler.pt b/checkpoint-27468/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec6a33a1e16fa727d72d8610d56b97fd04ba15e3 --- /dev/null +++ b/checkpoint-27468/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d47b007e64bffbb0dc51c02560ea2fea14f1ab5035228332be1bd00a38697eb +size 1064 diff --git a/checkpoint-27468/trainer_state.json b/checkpoint-27468/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6e4f18d67a8627de38f584deba5d8ee7e2e8c82a --- /dev/null +++ b/checkpoint-27468/trainer_state.json @@ -0,0 +1,6235 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9129220951874502, + "eval_steps": 500, + "global_step": 27468, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001030311087476735, + "grad_norm": 60.25547409057617, + "learning_rate": 1.0157273918741808e-06, + "loss": 8.8455, + "step": 31 + }, + { + "epoch": 0.00206062217495347, + "grad_norm": 15.669363975524902, + "learning_rate": 2.0314547837483616e-06, + "loss": 7.1553, + "step": 62 + }, + { + "epoch": 0.003090933262430205, + "grad_norm": 15.366345405578613, + "learning_rate": 3.0471821756225426e-06, + "loss": 5.8784, + "step": 93 + }, + { + "epoch": 0.00412124434990694, + "grad_norm": 36.30561828613281, + "learning_rate": 4.062909567496723e-06, + "loss": 4.7708, + "step": 124 + }, + { + "epoch": 0.005151555437383675, + "grad_norm": 27.202678680419922, + "learning_rate": 5.078636959370905e-06, + "loss": 4.1629, + "step": 155 + }, + { + "epoch": 0.00618186652486041, + "grad_norm": 24.30484962463379, + "learning_rate": 6.094364351245085e-06, + "loss": 3.867, + "step": 186 + }, + { + "epoch": 0.007212177612337145, + "grad_norm": 19.916366577148438, + "learning_rate": 7.110091743119267e-06, + "loss": 3.6131, + "step": 217 + }, + { + "epoch": 0.00824248869981388, + "grad_norm": 17.577274322509766, + "learning_rate": 8.125819134993446e-06, + "loss": 3.4772, + "step": 248 + }, + { + "epoch": 0.009272799787290615, + "grad_norm": 12.133190155029297, + "learning_rate": 9.141546526867629e-06, + "loss": 3.3218, + "step": 279 + }, + { + "epoch": 0.01030311087476735, + "grad_norm": 19.79263687133789, + "learning_rate": 1.015727391874181e-05, + "loss": 3.2055, + "step": 310 + }, + { + "epoch": 0.011333421962244085, + "grad_norm": 16.38133430480957, + "learning_rate": 1.117300131061599e-05, + "loss": 3.1062, + "step": 341 + }, + { + "epoch": 0.01236373304972082, + "grad_norm": 12.638299942016602, + "learning_rate": 1.218872870249017e-05, + "loss": 3.0106, + "step": 372 + }, + { + "epoch": 0.013394044137197554, + "grad_norm": 9.46596908569336, + "learning_rate": 1.3204456094364351e-05, + "loss": 2.924, + "step": 403 + }, + { + "epoch": 0.01442435522467429, + "grad_norm": 10.945392608642578, + "learning_rate": 1.4220183486238533e-05, + "loss": 2.844, + "step": 434 + }, + { + "epoch": 0.015454666312151024, + "grad_norm": 8.474015235900879, + "learning_rate": 1.5235910878112714e-05, + "loss": 2.7892, + "step": 465 + }, + { + "epoch": 0.01648497739962776, + "grad_norm": 9.370804786682129, + "learning_rate": 1.6251638269986893e-05, + "loss": 2.7509, + "step": 496 + }, + { + "epoch": 0.017515288487104493, + "grad_norm": 11.63398551940918, + "learning_rate": 1.7267365661861077e-05, + "loss": 2.6999, + "step": 527 + }, + { + "epoch": 0.01854559957458123, + "grad_norm": 9.17713451385498, + "learning_rate": 1.8283093053735257e-05, + "loss": 2.6459, + "step": 558 + }, + { + "epoch": 0.019575910662057962, + "grad_norm": 7.119054794311523, + "learning_rate": 1.9298820445609438e-05, + "loss": 2.603, + "step": 589 + }, + { + "epoch": 0.0206062217495347, + "grad_norm": 6.653646945953369, + "learning_rate": 2.031454783748362e-05, + "loss": 2.5588, + "step": 620 + }, + { + "epoch": 0.021636532837011432, + "grad_norm": 8.332653045654297, + "learning_rate": 2.13302752293578e-05, + "loss": 2.5357, + "step": 651 + }, + { + "epoch": 0.02266684392448817, + "grad_norm": 6.4949116706848145, + "learning_rate": 2.234600262123198e-05, + "loss": 2.4967, + "step": 682 + }, + { + "epoch": 0.023697155011964902, + "grad_norm": 9.41009521484375, + "learning_rate": 2.336173001310616e-05, + "loss": 2.4563, + "step": 713 + }, + { + "epoch": 0.02472746609944164, + "grad_norm": 7.840345859527588, + "learning_rate": 2.437745740498034e-05, + "loss": 2.4383, + "step": 744 + }, + { + "epoch": 0.025757777186918372, + "grad_norm": 6.116458415985107, + "learning_rate": 2.5393184796854525e-05, + "loss": 2.3817, + "step": 775 + }, + { + "epoch": 0.02678808827439511, + "grad_norm": 5.938300609588623, + "learning_rate": 2.6408912188728702e-05, + "loss": 2.3508, + "step": 806 + }, + { + "epoch": 0.027818399361871842, + "grad_norm": 5.4408345222473145, + "learning_rate": 2.7424639580602886e-05, + "loss": 2.3325, + "step": 837 + }, + { + "epoch": 0.02884871044934858, + "grad_norm": 5.375136375427246, + "learning_rate": 2.8440366972477066e-05, + "loss": 2.3101, + "step": 868 + }, + { + "epoch": 0.029879021536825312, + "grad_norm": 5.149726867675781, + "learning_rate": 2.9456094364351244e-05, + "loss": 2.282, + "step": 899 + }, + { + "epoch": 0.03090933262430205, + "grad_norm": 4.591221332550049, + "learning_rate": 3.0471821756225428e-05, + "loss": 2.2427, + "step": 930 + }, + { + "epoch": 0.031939643711778785, + "grad_norm": 4.977034091949463, + "learning_rate": 3.148754914809961e-05, + "loss": 2.2218, + "step": 961 + }, + { + "epoch": 0.03296995479925552, + "grad_norm": 5.038781642913818, + "learning_rate": 3.2503276539973785e-05, + "loss": 2.2044, + "step": 992 + }, + { + "epoch": 0.03400026588673225, + "grad_norm": 4.872281551361084, + "learning_rate": 3.351900393184797e-05, + "loss": 2.1657, + "step": 1023 + }, + { + "epoch": 0.035030576974208985, + "grad_norm": 4.370841979980469, + "learning_rate": 3.453473132372215e-05, + "loss": 2.1365, + "step": 1054 + }, + { + "epoch": 0.036060888061685725, + "grad_norm": 4.087072849273682, + "learning_rate": 3.555045871559633e-05, + "loss": 2.1253, + "step": 1085 + }, + { + "epoch": 0.03709119914916246, + "grad_norm": 4.113957405090332, + "learning_rate": 3.6566186107470514e-05, + "loss": 2.0973, + "step": 1116 + }, + { + "epoch": 0.03812151023663919, + "grad_norm": 4.0119733810424805, + "learning_rate": 3.7581913499344695e-05, + "loss": 2.1024, + "step": 1147 + }, + { + "epoch": 0.039151821324115925, + "grad_norm": 4.247573375701904, + "learning_rate": 3.8597640891218876e-05, + "loss": 2.0722, + "step": 1178 + }, + { + "epoch": 0.04018213241159266, + "grad_norm": 3.5575129985809326, + "learning_rate": 3.9613368283093056e-05, + "loss": 2.056, + "step": 1209 + }, + { + "epoch": 0.0412124434990694, + "grad_norm": 3.8885862827301025, + "learning_rate": 4.062909567496724e-05, + "loss": 2.0389, + "step": 1240 + }, + { + "epoch": 0.04224275458654613, + "grad_norm": 3.680628538131714, + "learning_rate": 4.164482306684142e-05, + "loss": 2.0385, + "step": 1271 + }, + { + "epoch": 0.043273065674022865, + "grad_norm": 3.780876874923706, + "learning_rate": 4.26605504587156e-05, + "loss": 2.0097, + "step": 1302 + }, + { + "epoch": 0.0443033767614996, + "grad_norm": 4.235328674316406, + "learning_rate": 4.367627785058978e-05, + "loss": 2.0024, + "step": 1333 + }, + { + "epoch": 0.04533368784897634, + "grad_norm": 3.326941967010498, + "learning_rate": 4.469200524246396e-05, + "loss": 1.9953, + "step": 1364 + }, + { + "epoch": 0.04636399893645307, + "grad_norm": 3.28456449508667, + "learning_rate": 4.570773263433814e-05, + "loss": 1.9579, + "step": 1395 + }, + { + "epoch": 0.047394310023929805, + "grad_norm": 16.107433319091797, + "learning_rate": 4.672346002621232e-05, + "loss": 1.9701, + "step": 1426 + }, + { + "epoch": 0.04842462111140654, + "grad_norm": 3.5708224773406982, + "learning_rate": 4.77391874180865e-05, + "loss": 1.9621, + "step": 1457 + }, + { + "epoch": 0.04945493219888328, + "grad_norm": 2.9053499698638916, + "learning_rate": 4.875491480996068e-05, + "loss": 1.9458, + "step": 1488 + }, + { + "epoch": 0.05048524328636001, + "grad_norm": 3.0863258838653564, + "learning_rate": 4.977064220183487e-05, + "loss": 1.9483, + "step": 1519 + }, + { + "epoch": 0.051515554373836744, + "grad_norm": 2.9012269973754883, + "learning_rate": 4.9999915451558777e-05, + "loss": 1.928, + "step": 1550 + }, + { + "epoch": 0.05254586546131348, + "grad_norm": 3.0949041843414307, + "learning_rate": 4.999955597496219e-05, + "loss": 1.9229, + "step": 1581 + }, + { + "epoch": 0.05357617654879022, + "grad_norm": 2.8687901496887207, + "learning_rate": 4.9998914381774255e-05, + "loss": 1.915, + "step": 1612 + }, + { + "epoch": 0.05460648763626695, + "grad_norm": 3.2136878967285156, + "learning_rate": 4.999799067923527e-05, + "loss": 1.9197, + "step": 1643 + }, + { + "epoch": 0.055636798723743684, + "grad_norm": 2.590843677520752, + "learning_rate": 4.999678487776908e-05, + "loss": 1.8756, + "step": 1674 + }, + { + "epoch": 0.05666710981122042, + "grad_norm": 2.64634108543396, + "learning_rate": 4.9995296990983006e-05, + "loss": 1.9033, + "step": 1705 + }, + { + "epoch": 0.05769742089869716, + "grad_norm": 3.0151331424713135, + "learning_rate": 4.999352703566763e-05, + "loss": 1.8883, + "step": 1736 + }, + { + "epoch": 0.05872773198617389, + "grad_norm": 2.526806354522705, + "learning_rate": 4.999147503179668e-05, + "loss": 1.8666, + "step": 1767 + }, + { + "epoch": 0.059758043073650624, + "grad_norm": 2.510300397872925, + "learning_rate": 4.998914100252672e-05, + "loss": 1.854, + "step": 1798 + }, + { + "epoch": 0.06078835416112736, + "grad_norm": 2.4867682456970215, + "learning_rate": 4.998652497419696e-05, + "loss": 1.8548, + "step": 1829 + }, + { + "epoch": 0.0618186652486041, + "grad_norm": 2.3920586109161377, + "learning_rate": 4.9983626976328927e-05, + "loss": 1.8495, + "step": 1860 + }, + { + "epoch": 0.06284897633608083, + "grad_norm": 2.714177370071411, + "learning_rate": 4.998044704162613e-05, + "loss": 1.8433, + "step": 1891 + }, + { + "epoch": 0.06387928742355757, + "grad_norm": 2.3094465732574463, + "learning_rate": 4.9976985205973705e-05, + "loss": 1.8382, + "step": 1922 + }, + { + "epoch": 0.0649095985110343, + "grad_norm": 2.47184419631958, + "learning_rate": 4.997324150843799e-05, + "loss": 1.8464, + "step": 1953 + }, + { + "epoch": 0.06593990959851104, + "grad_norm": 2.391841411590576, + "learning_rate": 4.99692159912661e-05, + "loss": 1.8179, + "step": 1984 + }, + { + "epoch": 0.06697022068598776, + "grad_norm": 2.2471864223480225, + "learning_rate": 4.996490869988546e-05, + "loss": 1.8149, + "step": 2015 + }, + { + "epoch": 0.0680005317734645, + "grad_norm": 2.5497376918792725, + "learning_rate": 4.996031968290326e-05, + "loss": 1.8099, + "step": 2046 + }, + { + "epoch": 0.06903084286094124, + "grad_norm": 2.330463409423828, + "learning_rate": 4.995544899210594e-05, + "loss": 1.8267, + "step": 2077 + }, + { + "epoch": 0.07006115394841797, + "grad_norm": 2.3259341716766357, + "learning_rate": 4.9950296682458583e-05, + "loss": 1.7801, + "step": 2108 + }, + { + "epoch": 0.07109146503589471, + "grad_norm": 2.1711952686309814, + "learning_rate": 4.994486281210429e-05, + "loss": 1.7961, + "step": 2139 + }, + { + "epoch": 0.07212177612337145, + "grad_norm": 2.1808884143829346, + "learning_rate": 4.9939147442363566e-05, + "loss": 1.8109, + "step": 2170 + }, + { + "epoch": 0.07315208721084818, + "grad_norm": 2.089256525039673, + "learning_rate": 4.9933150637733574e-05, + "loss": 1.8026, + "step": 2201 + }, + { + "epoch": 0.07418239829832492, + "grad_norm": 2.0864951610565186, + "learning_rate": 4.992687246588743e-05, + "loss": 1.7753, + "step": 2232 + }, + { + "epoch": 0.07521270938580164, + "grad_norm": 2.36157488822937, + "learning_rate": 4.992031299767347e-05, + "loss": 1.7746, + "step": 2263 + }, + { + "epoch": 0.07624302047327838, + "grad_norm": 2.5334439277648926, + "learning_rate": 4.9913472307114386e-05, + "loss": 1.7927, + "step": 2294 + }, + { + "epoch": 0.07727333156075512, + "grad_norm": 2.2565715312957764, + "learning_rate": 4.9906350471406446e-05, + "loss": 1.7668, + "step": 2325 + }, + { + "epoch": 0.07830364264823185, + "grad_norm": 2.1043128967285156, + "learning_rate": 4.989894757091861e-05, + "loss": 1.7771, + "step": 2356 + }, + { + "epoch": 0.07933395373570859, + "grad_norm": 1.9659819602966309, + "learning_rate": 4.989126368919158e-05, + "loss": 1.7666, + "step": 2387 + }, + { + "epoch": 0.08036426482318532, + "grad_norm": 2.0778403282165527, + "learning_rate": 4.988329891293693e-05, + "loss": 1.7405, + "step": 2418 + }, + { + "epoch": 0.08139457591066206, + "grad_norm": 2.1767923831939697, + "learning_rate": 4.987505333203608e-05, + "loss": 1.7495, + "step": 2449 + }, + { + "epoch": 0.0824248869981388, + "grad_norm": 2.260143280029297, + "learning_rate": 4.9866527039539276e-05, + "loss": 1.7504, + "step": 2480 + }, + { + "epoch": 0.08345519808561552, + "grad_norm": 2.18271803855896, + "learning_rate": 4.9857720131664594e-05, + "loss": 1.7456, + "step": 2511 + }, + { + "epoch": 0.08448550917309226, + "grad_norm": 2.209594964981079, + "learning_rate": 4.9848632707796773e-05, + "loss": 1.7528, + "step": 2542 + }, + { + "epoch": 0.085515820260569, + "grad_norm": 2.0666229724884033, + "learning_rate": 4.9839264870486155e-05, + "loss": 1.7517, + "step": 2573 + }, + { + "epoch": 0.08654613134804573, + "grad_norm": 2.1070454120635986, + "learning_rate": 4.9829616725447526e-05, + "loss": 1.7474, + "step": 2604 + }, + { + "epoch": 0.08757644243552247, + "grad_norm": 1.9430303573608398, + "learning_rate": 4.981968838155888e-05, + "loss": 1.7348, + "step": 2635 + }, + { + "epoch": 0.0886067535229992, + "grad_norm": 1.9638925790786743, + "learning_rate": 4.980947995086024e-05, + "loss": 1.7202, + "step": 2666 + }, + { + "epoch": 0.08963706461047594, + "grad_norm": 1.8845652341842651, + "learning_rate": 4.979899154855234e-05, + "loss": 1.7375, + "step": 2697 + }, + { + "epoch": 0.09066737569795268, + "grad_norm": 5.712058067321777, + "learning_rate": 4.9788223292995386e-05, + "loss": 1.7379, + "step": 2728 + }, + { + "epoch": 0.0916976867854294, + "grad_norm": 1.9520670175552368, + "learning_rate": 4.977717530570768e-05, + "loss": 1.7302, + "step": 2759 + }, + { + "epoch": 0.09272799787290614, + "grad_norm": 1.8802224397659302, + "learning_rate": 4.976584771136425e-05, + "loss": 1.74, + "step": 2790 + }, + { + "epoch": 0.09375830896038288, + "grad_norm": 2.1098153591156006, + "learning_rate": 4.975424063779547e-05, + "loss": 1.7024, + "step": 2821 + }, + { + "epoch": 0.09478862004785961, + "grad_norm": 2.1568291187286377, + "learning_rate": 4.974235421598557e-05, + "loss": 1.7131, + "step": 2852 + }, + { + "epoch": 0.09581893113533635, + "grad_norm": 1.8769980669021606, + "learning_rate": 4.973018858007122e-05, + "loss": 1.7008, + "step": 2883 + }, + { + "epoch": 0.09684924222281308, + "grad_norm": 1.8325533866882324, + "learning_rate": 4.9717743867339963e-05, + "loss": 1.7058, + "step": 2914 + }, + { + "epoch": 0.09787955331028982, + "grad_norm": 2.086416721343994, + "learning_rate": 4.9705020218228695e-05, + "loss": 1.711, + "step": 2945 + }, + { + "epoch": 0.09890986439776656, + "grad_norm": 1.8294793367385864, + "learning_rate": 4.969201777632205e-05, + "loss": 1.6998, + "step": 2976 + }, + { + "epoch": 0.09994017548524328, + "grad_norm": 2.0608153343200684, + "learning_rate": 4.9678736688350846e-05, + "loss": 1.6948, + "step": 3007 + }, + { + "epoch": 0.10097048657272002, + "grad_norm": 3.2166008949279785, + "learning_rate": 4.966517710419033e-05, + "loss": 1.6788, + "step": 3038 + }, + { + "epoch": 0.10200079766019676, + "grad_norm": 1.9431313276290894, + "learning_rate": 4.965133917685858e-05, + "loss": 1.7115, + "step": 3069 + }, + { + "epoch": 0.10303110874767349, + "grad_norm": 1.967512845993042, + "learning_rate": 4.9637223062514714e-05, + "loss": 1.7033, + "step": 3100 + }, + { + "epoch": 0.10406141983515023, + "grad_norm": 1.9253389835357666, + "learning_rate": 4.962282892045718e-05, + "loss": 1.6856, + "step": 3131 + }, + { + "epoch": 0.10509173092262696, + "grad_norm": 1.986840009689331, + "learning_rate": 4.9608156913121904e-05, + "loss": 1.723, + "step": 3162 + }, + { + "epoch": 0.1061220420101037, + "grad_norm": 1.83523690700531, + "learning_rate": 4.959320720608049e-05, + "loss": 1.6912, + "step": 3193 + }, + { + "epoch": 0.10715235309758044, + "grad_norm": 2.1271955966949463, + "learning_rate": 4.9577979968038354e-05, + "loss": 1.7032, + "step": 3224 + }, + { + "epoch": 0.10818266418505716, + "grad_norm": 1.8383768796920776, + "learning_rate": 4.956247537083282e-05, + "loss": 1.6726, + "step": 3255 + }, + { + "epoch": 0.1092129752725339, + "grad_norm": 1.8806651830673218, + "learning_rate": 4.9546693589431145e-05, + "loss": 1.6817, + "step": 3286 + }, + { + "epoch": 0.11024328636001064, + "grad_norm": 1.7535260915756226, + "learning_rate": 4.9530634801928595e-05, + "loss": 1.6875, + "step": 3317 + }, + { + "epoch": 0.11127359744748737, + "grad_norm": 1.765906810760498, + "learning_rate": 4.9514299189546395e-05, + "loss": 1.6859, + "step": 3348 + }, + { + "epoch": 0.11230390853496411, + "grad_norm": 1.869828462600708, + "learning_rate": 4.949768693662973e-05, + "loss": 1.6915, + "step": 3379 + }, + { + "epoch": 0.11333421962244083, + "grad_norm": 1.8347504138946533, + "learning_rate": 4.948079823064559e-05, + "loss": 1.6859, + "step": 3410 + }, + { + "epoch": 0.11436453070991758, + "grad_norm": 1.7692474126815796, + "learning_rate": 4.946363326218074e-05, + "loss": 1.6565, + "step": 3441 + }, + { + "epoch": 0.11539484179739432, + "grad_norm": 1.8231885433197021, + "learning_rate": 4.9446192224939525e-05, + "loss": 1.686, + "step": 3472 + }, + { + "epoch": 0.11642515288487104, + "grad_norm": 1.7155958414077759, + "learning_rate": 4.942847531574167e-05, + "loss": 1.6538, + "step": 3503 + }, + { + "epoch": 0.11745546397234778, + "grad_norm": 1.787183403968811, + "learning_rate": 4.941048273452008e-05, + "loss": 1.6776, + "step": 3534 + }, + { + "epoch": 0.11848577505982451, + "grad_norm": 1.741213083267212, + "learning_rate": 4.9392214684318605e-05, + "loss": 1.6784, + "step": 3565 + }, + { + "epoch": 0.11951608614730125, + "grad_norm": 1.7836824655532837, + "learning_rate": 4.93736713712897e-05, + "loss": 1.6557, + "step": 3596 + }, + { + "epoch": 0.12054639723477799, + "grad_norm": 1.7103859186172485, + "learning_rate": 4.9354853004692124e-05, + "loss": 1.6606, + "step": 3627 + }, + { + "epoch": 0.12157670832225471, + "grad_norm": 1.7865506410598755, + "learning_rate": 4.93357597968886e-05, + "loss": 1.6409, + "step": 3658 + }, + { + "epoch": 0.12260701940973145, + "grad_norm": 1.7770143747329712, + "learning_rate": 4.931639196334338e-05, + "loss": 1.6574, + "step": 3689 + }, + { + "epoch": 0.1236373304972082, + "grad_norm": 1.857575535774231, + "learning_rate": 4.9296749722619826e-05, + "loss": 1.6724, + "step": 3720 + }, + { + "epoch": 0.12466764158468492, + "grad_norm": 1.8742581605911255, + "learning_rate": 4.9276833296377966e-05, + "loss": 1.6506, + "step": 3751 + }, + { + "epoch": 0.12569795267216166, + "grad_norm": 1.827668309211731, + "learning_rate": 4.925664290937196e-05, + "loss": 1.6523, + "step": 3782 + }, + { + "epoch": 0.1267282637596384, + "grad_norm": 1.7517486810684204, + "learning_rate": 4.9236178789447576e-05, + "loss": 1.6459, + "step": 3813 + }, + { + "epoch": 0.12775857484711514, + "grad_norm": 1.8109570741653442, + "learning_rate": 4.921544116753962e-05, + "loss": 1.6614, + "step": 3844 + }, + { + "epoch": 0.12878888593459187, + "grad_norm": 1.692597508430481, + "learning_rate": 4.919443027766935e-05, + "loss": 1.6431, + "step": 3875 + }, + { + "epoch": 0.1298191970220686, + "grad_norm": 1.8650025129318237, + "learning_rate": 4.91731463569418e-05, + "loss": 1.6466, + "step": 3906 + }, + { + "epoch": 0.13084950810954532, + "grad_norm": 1.6794081926345825, + "learning_rate": 4.915158964554312e-05, + "loss": 1.6504, + "step": 3937 + }, + { + "epoch": 0.13187981919702207, + "grad_norm": 1.7685374021530151, + "learning_rate": 4.912976038673786e-05, + "loss": 1.6446, + "step": 3968 + }, + { + "epoch": 0.1329101302844988, + "grad_norm": 1.7601110935211182, + "learning_rate": 4.9107658826866254e-05, + "loss": 1.631, + "step": 3999 + }, + { + "epoch": 0.13394044137197553, + "grad_norm": 2.0616064071655273, + "learning_rate": 4.908528521534139e-05, + "loss": 1.6476, + "step": 4030 + }, + { + "epoch": 0.13497075245945228, + "grad_norm": 1.8973504304885864, + "learning_rate": 4.906263980464644e-05, + "loss": 1.6582, + "step": 4061 + }, + { + "epoch": 0.136001063546929, + "grad_norm": 1.7768895626068115, + "learning_rate": 4.903972285033178e-05, + "loss": 1.6159, + "step": 4092 + }, + { + "epoch": 0.13703137463440573, + "grad_norm": 1.8264424800872803, + "learning_rate": 4.901653461101213e-05, + "loss": 1.6289, + "step": 4123 + }, + { + "epoch": 0.1380616857218825, + "grad_norm": 1.7140119075775146, + "learning_rate": 4.8993075348363626e-05, + "loss": 1.6357, + "step": 4154 + }, + { + "epoch": 0.13909199680935921, + "grad_norm": 1.6964486837387085, + "learning_rate": 4.896934532712084e-05, + "loss": 1.6233, + "step": 4185 + }, + { + "epoch": 0.14012230789683594, + "grad_norm": 1.8008025884628296, + "learning_rate": 4.8945344815073846e-05, + "loss": 1.637, + "step": 4216 + }, + { + "epoch": 0.1411526189843127, + "grad_norm": 1.562730073928833, + "learning_rate": 4.892107408306516e-05, + "loss": 1.6379, + "step": 4247 + }, + { + "epoch": 0.14218293007178942, + "grad_norm": 1.8273371458053589, + "learning_rate": 4.889653340498669e-05, + "loss": 1.6246, + "step": 4278 + }, + { + "epoch": 0.14321324115926615, + "grad_norm": 56.33716583251953, + "learning_rate": 4.8871723057776664e-05, + "loss": 1.6457, + "step": 4309 + }, + { + "epoch": 0.1442435522467429, + "grad_norm": 1.746523380279541, + "learning_rate": 4.8846643321416476e-05, + "loss": 1.6343, + "step": 4340 + }, + { + "epoch": 0.14527386333421963, + "grad_norm": 1.7737531661987305, + "learning_rate": 4.882129447892753e-05, + "loss": 1.6447, + "step": 4371 + }, + { + "epoch": 0.14630417442169635, + "grad_norm": 1.660485863685608, + "learning_rate": 4.8795676816368076e-05, + "loss": 1.6192, + "step": 4402 + }, + { + "epoch": 0.14733448550917308, + "grad_norm": 1.6823406219482422, + "learning_rate": 4.876979062282995e-05, + "loss": 1.6253, + "step": 4433 + }, + { + "epoch": 0.14836479659664983, + "grad_norm": 7.78139066696167, + "learning_rate": 4.8743636190435325e-05, + "loss": 1.6234, + "step": 4464 + }, + { + "epoch": 0.14939510768412656, + "grad_norm": 1.7426058053970337, + "learning_rate": 4.871721381433344e-05, + "loss": 1.6337, + "step": 4495 + }, + { + "epoch": 0.1504254187716033, + "grad_norm": 1.6294783353805542, + "learning_rate": 4.869052379269719e-05, + "loss": 1.6217, + "step": 4526 + }, + { + "epoch": 0.15145572985908004, + "grad_norm": 1.6523306369781494, + "learning_rate": 4.866356642671985e-05, + "loss": 1.605, + "step": 4557 + }, + { + "epoch": 0.15248604094655677, + "grad_norm": 1.8571300506591797, + "learning_rate": 4.8636342020611634e-05, + "loss": 1.6218, + "step": 4588 + }, + { + "epoch": 0.1535163520340335, + "grad_norm": 1.7754936218261719, + "learning_rate": 4.860885088159626e-05, + "loss": 1.6171, + "step": 4619 + }, + { + "epoch": 0.15454666312151025, + "grad_norm": 1.91987943649292, + "learning_rate": 4.858109331990751e-05, + "loss": 1.6167, + "step": 4650 + }, + { + "epoch": 0.15557697420898697, + "grad_norm": 1.5994452238082886, + "learning_rate": 4.855306964878567e-05, + "loss": 1.5951, + "step": 4681 + }, + { + "epoch": 0.1566072852964637, + "grad_norm": 1.6490916013717651, + "learning_rate": 4.8524780184474084e-05, + "loss": 1.616, + "step": 4712 + }, + { + "epoch": 0.15763759638394045, + "grad_norm": 1.5921640396118164, + "learning_rate": 4.8496225246215496e-05, + "loss": 1.6346, + "step": 4743 + }, + { + "epoch": 0.15866790747141718, + "grad_norm": 1.6729261875152588, + "learning_rate": 4.8467405156248505e-05, + "loss": 1.6165, + "step": 4774 + }, + { + "epoch": 0.1596982185588939, + "grad_norm": 1.628113031387329, + "learning_rate": 4.843832023980392e-05, + "loss": 1.6119, + "step": 4805 + }, + { + "epoch": 0.16072852964637063, + "grad_norm": 1.651647925376892, + "learning_rate": 4.840897082510106e-05, + "loss": 1.5997, + "step": 4836 + }, + { + "epoch": 0.1617588407338474, + "grad_norm": 1.5297720432281494, + "learning_rate": 4.8379357243344084e-05, + "loss": 1.6242, + "step": 4867 + }, + { + "epoch": 0.1627891518213241, + "grad_norm": 1.5779869556427002, + "learning_rate": 4.8349479828718236e-05, + "loss": 1.6149, + "step": 4898 + }, + { + "epoch": 0.16381946290880084, + "grad_norm": 1.5843939781188965, + "learning_rate": 4.8319338918386075e-05, + "loss": 1.5926, + "step": 4929 + }, + { + "epoch": 0.1648497739962776, + "grad_norm": 2.3762106895446777, + "learning_rate": 4.828893485248369e-05, + "loss": 1.6108, + "step": 4960 + }, + { + "epoch": 0.16588008508375432, + "grad_norm": 1.5871953964233398, + "learning_rate": 4.825826797411682e-05, + "loss": 1.6103, + "step": 4991 + }, + { + "epoch": 0.16691039617123105, + "grad_norm": 1.5934125185012817, + "learning_rate": 4.822733862935702e-05, + "loss": 1.6091, + "step": 5022 + }, + { + "epoch": 0.1679407072587078, + "grad_norm": 1.6997628211975098, + "learning_rate": 4.819614716723775e-05, + "loss": 1.6098, + "step": 5053 + }, + { + "epoch": 0.16897101834618453, + "grad_norm": 1.682849645614624, + "learning_rate": 4.8164693939750425e-05, + "loss": 1.599, + "step": 5084 + }, + { + "epoch": 0.17000132943366125, + "grad_norm": 1.709743857383728, + "learning_rate": 4.813297930184042e-05, + "loss": 1.6194, + "step": 5115 + }, + { + "epoch": 0.171031640521138, + "grad_norm": 1.725879430770874, + "learning_rate": 4.810100361140314e-05, + "loss": 1.6115, + "step": 5146 + }, + { + "epoch": 0.17206195160861473, + "grad_norm": 1.6710290908813477, + "learning_rate": 4.8068767229279885e-05, + "loss": 1.6032, + "step": 5177 + }, + { + "epoch": 0.17309226269609146, + "grad_norm": 1.6156634092330933, + "learning_rate": 4.8036270519253854e-05, + "loss": 1.5973, + "step": 5208 + }, + { + "epoch": 0.1741225737835682, + "grad_norm": 1.5654059648513794, + "learning_rate": 4.8003513848046e-05, + "loss": 1.5817, + "step": 5239 + }, + { + "epoch": 0.17515288487104494, + "grad_norm": 1.5789822340011597, + "learning_rate": 4.79704975853109e-05, + "loss": 1.6138, + "step": 5270 + }, + { + "epoch": 0.17618319595852167, + "grad_norm": 1.6022037267684937, + "learning_rate": 4.793722210363262e-05, + "loss": 1.5998, + "step": 5301 + }, + { + "epoch": 0.1772135070459984, + "grad_norm": 1.5142741203308105, + "learning_rate": 4.7903687778520414e-05, + "loss": 1.6061, + "step": 5332 + }, + { + "epoch": 0.17824381813347515, + "grad_norm": 1.6454212665557861, + "learning_rate": 4.7869894988404593e-05, + "loss": 1.6063, + "step": 5363 + }, + { + "epoch": 0.17927412922095187, + "grad_norm": 1.5250823497772217, + "learning_rate": 4.783584411463221e-05, + "loss": 1.6038, + "step": 5394 + }, + { + "epoch": 0.1803044403084286, + "grad_norm": 1.5829335451126099, + "learning_rate": 4.780153554146274e-05, + "loss": 1.5949, + "step": 5425 + }, + { + "epoch": 0.18133475139590535, + "grad_norm": 1.5342432260513306, + "learning_rate": 4.7766969656063766e-05, + "loss": 1.5913, + "step": 5456 + }, + { + "epoch": 0.18236506248338208, + "grad_norm": 1.6397250890731812, + "learning_rate": 4.773214684850662e-05, + "loss": 1.6102, + "step": 5487 + }, + { + "epoch": 0.1833953735708588, + "grad_norm": 1.5228471755981445, + "learning_rate": 4.769706751176193e-05, + "loss": 1.5885, + "step": 5518 + }, + { + "epoch": 0.18442568465833556, + "grad_norm": 1.6186103820800781, + "learning_rate": 4.7661732041695264e-05, + "loss": 1.6086, + "step": 5549 + }, + { + "epoch": 0.18545599574581229, + "grad_norm": 1.6024582386016846, + "learning_rate": 4.762614083706258e-05, + "loss": 1.6004, + "step": 5580 + }, + { + "epoch": 0.186486306833289, + "grad_norm": 1.5443711280822754, + "learning_rate": 4.759029429950581e-05, + "loss": 1.6048, + "step": 5611 + }, + { + "epoch": 0.18751661792076577, + "grad_norm": 1.4831629991531372, + "learning_rate": 4.7554192833548235e-05, + "loss": 1.5841, + "step": 5642 + }, + { + "epoch": 0.1885469290082425, + "grad_norm": 1.6426068544387817, + "learning_rate": 4.751783684659e-05, + "loss": 1.587, + "step": 5673 + }, + { + "epoch": 0.18957724009571922, + "grad_norm": 1.4609078168869019, + "learning_rate": 4.748122674890348e-05, + "loss": 1.5945, + "step": 5704 + }, + { + "epoch": 0.19060755118319597, + "grad_norm": 1.5365614891052246, + "learning_rate": 4.7444362953628654e-05, + "loss": 1.5737, + "step": 5735 + }, + { + "epoch": 0.1916378622706727, + "grad_norm": 1.5755670070648193, + "learning_rate": 4.7407245876768424e-05, + "loss": 1.5862, + "step": 5766 + }, + { + "epoch": 0.19266817335814942, + "grad_norm": 1.6469846963882446, + "learning_rate": 4.736987593718397e-05, + "loss": 1.5663, + "step": 5797 + }, + { + "epoch": 0.19369848444562615, + "grad_norm": 1.5927278995513916, + "learning_rate": 4.733225355658999e-05, + "loss": 1.5776, + "step": 5828 + }, + { + "epoch": 0.1947287955331029, + "grad_norm": 1.5593287944793701, + "learning_rate": 4.7294379159549926e-05, + "loss": 1.579, + "step": 5859 + }, + { + "epoch": 0.19575910662057963, + "grad_norm": 1.534055233001709, + "learning_rate": 4.725625317347119e-05, + "loss": 1.6017, + "step": 5890 + }, + { + "epoch": 0.19678941770805636, + "grad_norm": 1.5846387147903442, + "learning_rate": 4.7217876028600374e-05, + "loss": 1.5739, + "step": 5921 + }, + { + "epoch": 0.1978197287955331, + "grad_norm": 1.5377682447433472, + "learning_rate": 4.717924815801832e-05, + "loss": 1.57, + "step": 5952 + }, + { + "epoch": 0.19885003988300984, + "grad_norm": 1.467956781387329, + "learning_rate": 4.714036999763532e-05, + "loss": 1.5736, + "step": 5983 + }, + { + "epoch": 0.19988035097048656, + "grad_norm": 1.601070523262024, + "learning_rate": 4.7101241986186116e-05, + "loss": 1.5861, + "step": 6014 + }, + { + "epoch": 0.20091066205796332, + "grad_norm": 1.5051921606063843, + "learning_rate": 4.7061864565225e-05, + "loss": 1.5735, + "step": 6045 + }, + { + "epoch": 0.20194097314544004, + "grad_norm": 1.462843418121338, + "learning_rate": 4.702223817912081e-05, + "loss": 1.582, + "step": 6076 + }, + { + "epoch": 0.20297128423291677, + "grad_norm": 1.5698682069778442, + "learning_rate": 4.698236327505195e-05, + "loss": 1.5647, + "step": 6107 + }, + { + "epoch": 0.20400159532039353, + "grad_norm": 1.5633916854858398, + "learning_rate": 4.694224030300127e-05, + "loss": 1.5741, + "step": 6138 + }, + { + "epoch": 0.20503190640787025, + "grad_norm": 1.6174733638763428, + "learning_rate": 4.690186971575107e-05, + "loss": 1.5634, + "step": 6169 + }, + { + "epoch": 0.20606221749534698, + "grad_norm": 1.4957518577575684, + "learning_rate": 4.6861251968877916e-05, + "loss": 1.575, + "step": 6200 + }, + { + "epoch": 0.2070925285828237, + "grad_norm": 1.670933485031128, + "learning_rate": 4.68203875207476e-05, + "loss": 1.5792, + "step": 6231 + }, + { + "epoch": 0.20812283967030046, + "grad_norm": 1.5676430463790894, + "learning_rate": 4.677927683250983e-05, + "loss": 1.5689, + "step": 6262 + }, + { + "epoch": 0.20915315075777718, + "grad_norm": 1.5753976106643677, + "learning_rate": 4.6737920368093156e-05, + "loss": 1.5594, + "step": 6293 + }, + { + "epoch": 0.2101834618452539, + "grad_norm": 1.4973617792129517, + "learning_rate": 4.669631859419965e-05, + "loss": 1.5593, + "step": 6324 + }, + { + "epoch": 0.21121377293273066, + "grad_norm": 1.4691433906555176, + "learning_rate": 4.6654471980299676e-05, + "loss": 1.5711, + "step": 6355 + }, + { + "epoch": 0.2122440840202074, + "grad_norm": 1.407630443572998, + "learning_rate": 4.661238099862658e-05, + "loss": 1.5787, + "step": 6386 + }, + { + "epoch": 0.21327439510768412, + "grad_norm": 1.5011677742004395, + "learning_rate": 4.657004612417138e-05, + "loss": 1.5751, + "step": 6417 + }, + { + "epoch": 0.21430470619516087, + "grad_norm": 1.509750485420227, + "learning_rate": 4.6527467834677374e-05, + "loss": 1.5583, + "step": 6448 + }, + { + "epoch": 0.2153350172826376, + "grad_norm": 1.3919882774353027, + "learning_rate": 4.648464661063478e-05, + "loss": 1.5712, + "step": 6479 + }, + { + "epoch": 0.21636532837011432, + "grad_norm": 1.4854936599731445, + "learning_rate": 4.6441582935275264e-05, + "loss": 1.5637, + "step": 6510 + }, + { + "epoch": 0.21739563945759108, + "grad_norm": 1.4413583278656006, + "learning_rate": 4.6398277294566586e-05, + "loss": 1.56, + "step": 6541 + }, + { + "epoch": 0.2184259505450678, + "grad_norm": 1.5063883066177368, + "learning_rate": 4.6354730177207e-05, + "loss": 1.5525, + "step": 6572 + }, + { + "epoch": 0.21945626163254453, + "grad_norm": 1.4899688959121704, + "learning_rate": 4.6310942074619787e-05, + "loss": 1.5817, + "step": 6603 + }, + { + "epoch": 0.22048657272002128, + "grad_norm": 1.3927967548370361, + "learning_rate": 4.626691348094777e-05, + "loss": 1.5407, + "step": 6634 + }, + { + "epoch": 0.221516883807498, + "grad_norm": 1.5378398895263672, + "learning_rate": 4.622264489304762e-05, + "loss": 1.5561, + "step": 6665 + }, + { + "epoch": 0.22254719489497474, + "grad_norm": 1.554624319076538, + "learning_rate": 4.617813681048434e-05, + "loss": 1.5859, + "step": 6696 + }, + { + "epoch": 0.22357750598245146, + "grad_norm": 1.5356658697128296, + "learning_rate": 4.61333897355256e-05, + "loss": 1.5531, + "step": 6727 + }, + { + "epoch": 0.22460781706992822, + "grad_norm": 1.5534918308258057, + "learning_rate": 4.608840417313604e-05, + "loss": 1.5774, + "step": 6758 + }, + { + "epoch": 0.22563812815740494, + "grad_norm": 1.5660988092422485, + "learning_rate": 4.6043180630971646e-05, + "loss": 1.5763, + "step": 6789 + }, + { + "epoch": 0.22666843924488167, + "grad_norm": 1.4993386268615723, + "learning_rate": 4.599771961937391e-05, + "loss": 1.5615, + "step": 6820 + }, + { + "epoch": 0.22769875033235842, + "grad_norm": 1.4630553722381592, + "learning_rate": 4.5952021651364204e-05, + "loss": 1.543, + "step": 6851 + }, + { + "epoch": 0.22872906141983515, + "grad_norm": 1.470173954963684, + "learning_rate": 4.590608724263786e-05, + "loss": 1.5674, + "step": 6882 + }, + { + "epoch": 0.22975937250731188, + "grad_norm": 1.5867971181869507, + "learning_rate": 4.585991691155845e-05, + "loss": 1.5702, + "step": 6913 + }, + { + "epoch": 0.23078968359478863, + "grad_norm": 1.44207763671875, + "learning_rate": 4.581351117915188e-05, + "loss": 1.5436, + "step": 6944 + }, + { + "epoch": 0.23181999468226536, + "grad_norm": 1.4691039323806763, + "learning_rate": 4.5766870569100534e-05, + "loss": 1.5465, + "step": 6975 + }, + { + "epoch": 0.23285030576974208, + "grad_norm": 1.4807918071746826, + "learning_rate": 4.571999560773736e-05, + "loss": 1.5564, + "step": 7006 + }, + { + "epoch": 0.23388061685721884, + "grad_norm": 1.481487512588501, + "learning_rate": 4.5672886824039915e-05, + "loss": 1.5466, + "step": 7037 + }, + { + "epoch": 0.23491092794469556, + "grad_norm": 1.4518013000488281, + "learning_rate": 4.5625544749624435e-05, + "loss": 1.5618, + "step": 7068 + }, + { + "epoch": 0.2359412390321723, + "grad_norm": 1.4186676740646362, + "learning_rate": 4.5577969918739794e-05, + "loss": 1.5528, + "step": 7099 + }, + { + "epoch": 0.23697155011964902, + "grad_norm": 1.5287110805511475, + "learning_rate": 4.5530162868261486e-05, + "loss": 1.5457, + "step": 7130 + }, + { + "epoch": 0.23800186120712577, + "grad_norm": 1.5516417026519775, + "learning_rate": 4.548212413768558e-05, + "loss": 1.5467, + "step": 7161 + }, + { + "epoch": 0.2390321722946025, + "grad_norm": 1.4710053205490112, + "learning_rate": 4.543385426912261e-05, + "loss": 1.5431, + "step": 7192 + }, + { + "epoch": 0.24006248338207922, + "grad_norm": 1.5005567073822021, + "learning_rate": 4.53853538072915e-05, + "loss": 1.5592, + "step": 7223 + }, + { + "epoch": 0.24109279446955598, + "grad_norm": 1.5864965915679932, + "learning_rate": 4.533662329951336e-05, + "loss": 1.5694, + "step": 7254 + }, + { + "epoch": 0.2421231055570327, + "grad_norm": 1.4661896228790283, + "learning_rate": 4.528766329570536e-05, + "loss": 1.545, + "step": 7285 + }, + { + "epoch": 0.24315341664450943, + "grad_norm": 1.5157560110092163, + "learning_rate": 4.523847434837447e-05, + "loss": 1.5458, + "step": 7316 + }, + { + "epoch": 0.24418372773198618, + "grad_norm": 1.4033585786819458, + "learning_rate": 4.518905701261128e-05, + "loss": 1.5464, + "step": 7347 + }, + { + "epoch": 0.2452140388194629, + "grad_norm": 1.5357593297958374, + "learning_rate": 4.5139411846083715e-05, + "loss": 1.5497, + "step": 7378 + }, + { + "epoch": 0.24624434990693964, + "grad_norm": 1.419507384300232, + "learning_rate": 4.508953940903073e-05, + "loss": 1.5414, + "step": 7409 + }, + { + "epoch": 0.2472746609944164, + "grad_norm": 1.5201773643493652, + "learning_rate": 4.5039440264255994e-05, + "loss": 1.5503, + "step": 7440 + }, + { + "epoch": 0.24830497208189312, + "grad_norm": 1.8000444173812866, + "learning_rate": 4.498911497712155e-05, + "loss": 1.5448, + "step": 7471 + }, + { + "epoch": 0.24933528316936984, + "grad_norm": 1.4876810312271118, + "learning_rate": 4.493856411554142e-05, + "loss": 1.5524, + "step": 7502 + }, + { + "epoch": 0.25036559425684657, + "grad_norm": 1.5130078792572021, + "learning_rate": 4.4887788249975206e-05, + "loss": 1.5454, + "step": 7533 + }, + { + "epoch": 0.2513959053443233, + "grad_norm": 1.4829351902008057, + "learning_rate": 4.4836787953421656e-05, + "loss": 1.5407, + "step": 7564 + }, + { + "epoch": 0.2524262164318001, + "grad_norm": 1.521550178527832, + "learning_rate": 4.478556380141218e-05, + "loss": 1.5727, + "step": 7595 + }, + { + "epoch": 0.2534565275192768, + "grad_norm": 1.4377928972244263, + "learning_rate": 4.4734116372004375e-05, + "loss": 1.5432, + "step": 7626 + }, + { + "epoch": 0.25448683860675353, + "grad_norm": 1.4101744890213013, + "learning_rate": 4.4682446245775477e-05, + "loss": 1.547, + "step": 7657 + }, + { + "epoch": 0.2555171496942303, + "grad_norm": 1.522524356842041, + "learning_rate": 4.463055400581586e-05, + "loss": 1.5418, + "step": 7688 + }, + { + "epoch": 0.256547460781707, + "grad_norm": 1.4160797595977783, + "learning_rate": 4.4578440237722374e-05, + "loss": 1.5457, + "step": 7719 + }, + { + "epoch": 0.25757777186918374, + "grad_norm": 1.4106636047363281, + "learning_rate": 4.452610552959183e-05, + "loss": 1.5405, + "step": 7750 + }, + { + "epoch": 0.2586080829566605, + "grad_norm": 1.422723650932312, + "learning_rate": 4.447355047201428e-05, + "loss": 1.5423, + "step": 7781 + }, + { + "epoch": 0.2596383940441372, + "grad_norm": 1.4362592697143555, + "learning_rate": 4.4420775658066414e-05, + "loss": 1.5372, + "step": 7812 + }, + { + "epoch": 0.26066870513161394, + "grad_norm": 1.4319696426391602, + "learning_rate": 4.436778168330484e-05, + "loss": 1.5451, + "step": 7843 + }, + { + "epoch": 0.26169901621909064, + "grad_norm": 1.4069257974624634, + "learning_rate": 4.4314569145759353e-05, + "loss": 1.5221, + "step": 7874 + }, + { + "epoch": 0.2627293273065674, + "grad_norm": 1.4424949884414673, + "learning_rate": 4.42611386459262e-05, + "loss": 1.5419, + "step": 7905 + }, + { + "epoch": 0.26375963839404415, + "grad_norm": 1.4579105377197266, + "learning_rate": 4.420749078676133e-05, + "loss": 1.5116, + "step": 7936 + }, + { + "epoch": 0.26478994948152085, + "grad_norm": 1.4563167095184326, + "learning_rate": 4.4153626173673516e-05, + "loss": 1.5296, + "step": 7967 + }, + { + "epoch": 0.2658202605689976, + "grad_norm": 1.4440968036651611, + "learning_rate": 4.409954541451762e-05, + "loss": 1.5548, + "step": 7998 + }, + { + "epoch": 0.26685057165647436, + "grad_norm": 1.5711034536361694, + "learning_rate": 4.404524911958764e-05, + "loss": 1.535, + "step": 8029 + }, + { + "epoch": 0.26788088274395105, + "grad_norm": 1.5221564769744873, + "learning_rate": 4.399073790160989e-05, + "loss": 1.5495, + "step": 8060 + }, + { + "epoch": 0.2689111938314278, + "grad_norm": 1.392699956893921, + "learning_rate": 4.393601237573607e-05, + "loss": 1.546, + "step": 8091 + }, + { + "epoch": 0.26994150491890456, + "grad_norm": 1.5343137979507446, + "learning_rate": 4.388107315953628e-05, + "loss": 1.549, + "step": 8122 + }, + { + "epoch": 0.27097181600638126, + "grad_norm": 1.4483468532562256, + "learning_rate": 4.382592087299212e-05, + "loss": 1.5424, + "step": 8153 + }, + { + "epoch": 0.272002127093858, + "grad_norm": 1.4963489770889282, + "learning_rate": 4.377055613848964e-05, + "loss": 1.508, + "step": 8184 + }, + { + "epoch": 0.27303243818133477, + "grad_norm": 1.4839162826538086, + "learning_rate": 4.3714979580812355e-05, + "loss": 1.5203, + "step": 8215 + }, + { + "epoch": 0.27406274926881147, + "grad_norm": 1.4272018671035767, + "learning_rate": 4.365919182713416e-05, + "loss": 1.5264, + "step": 8246 + }, + { + "epoch": 0.2750930603562882, + "grad_norm": 1.3808270692825317, + "learning_rate": 4.360319350701226e-05, + "loss": 1.5255, + "step": 8277 + }, + { + "epoch": 0.276123371443765, + "grad_norm": 1.4179162979125977, + "learning_rate": 4.3546985252380115e-05, + "loss": 1.535, + "step": 8308 + }, + { + "epoch": 0.2771536825312417, + "grad_norm": 1.3617374897003174, + "learning_rate": 4.349056769754021e-05, + "loss": 1.5295, + "step": 8339 + }, + { + "epoch": 0.27818399361871843, + "grad_norm": 1.4745615720748901, + "learning_rate": 4.3433941479156994e-05, + "loss": 1.5438, + "step": 8370 + }, + { + "epoch": 0.2792143047061952, + "grad_norm": 1.3661375045776367, + "learning_rate": 4.3377107236249647e-05, + "loss": 1.5134, + "step": 8401 + }, + { + "epoch": 0.2802446157936719, + "grad_norm": 1.3907949924468994, + "learning_rate": 4.332006561018488e-05, + "loss": 1.5237, + "step": 8432 + }, + { + "epoch": 0.28127492688114863, + "grad_norm": 1.3575704097747803, + "learning_rate": 4.3262817244669683e-05, + "loss": 1.5226, + "step": 8463 + }, + { + "epoch": 0.2823052379686254, + "grad_norm": 1.3836462497711182, + "learning_rate": 4.3205362785744083e-05, + "loss": 1.5433, + "step": 8494 + }, + { + "epoch": 0.2833355490561021, + "grad_norm": 1.6108276844024658, + "learning_rate": 4.314770288177384e-05, + "loss": 1.5324, + "step": 8525 + }, + { + "epoch": 0.28436586014357884, + "grad_norm": 1.4650689363479614, + "learning_rate": 4.308983818344313e-05, + "loss": 1.535, + "step": 8556 + }, + { + "epoch": 0.2853961712310556, + "grad_norm": 1.5836583375930786, + "learning_rate": 4.3031769343747206e-05, + "loss": 1.5313, + "step": 8587 + }, + { + "epoch": 0.2864264823185323, + "grad_norm": 1.5348492860794067, + "learning_rate": 4.297349701798505e-05, + "loss": 1.5106, + "step": 8618 + }, + { + "epoch": 0.28745679340600905, + "grad_norm": 1.4060319662094116, + "learning_rate": 4.2915021863751916e-05, + "loss": 1.5283, + "step": 8649 + }, + { + "epoch": 0.2884871044934858, + "grad_norm": 1.531657099723816, + "learning_rate": 4.285634454093198e-05, + "loss": 1.5087, + "step": 8680 + }, + { + "epoch": 0.2895174155809625, + "grad_norm": 1.4756299257278442, + "learning_rate": 4.279746571169086e-05, + "loss": 1.5042, + "step": 8711 + }, + { + "epoch": 0.29054772666843925, + "grad_norm": 1.3221153020858765, + "learning_rate": 4.2738386040468136e-05, + "loss": 1.5244, + "step": 8742 + }, + { + "epoch": 0.29157803775591595, + "grad_norm": 1.4067268371582031, + "learning_rate": 4.2679106193969866e-05, + "loss": 1.5012, + "step": 8773 + }, + { + "epoch": 0.2926083488433927, + "grad_norm": 1.5192064046859741, + "learning_rate": 4.261962684116106e-05, + "loss": 1.521, + "step": 8804 + }, + { + "epoch": 0.29363865993086946, + "grad_norm": 1.3847788572311401, + "learning_rate": 4.2559948653258145e-05, + "loss": 1.5128, + "step": 8835 + }, + { + "epoch": 0.29466897101834616, + "grad_norm": 1.4612780809402466, + "learning_rate": 4.250007230372134e-05, + "loss": 1.5371, + "step": 8866 + }, + { + "epoch": 0.2956992821058229, + "grad_norm": 1.468971610069275, + "learning_rate": 4.2439998468247126e-05, + "loss": 1.5199, + "step": 8897 + }, + { + "epoch": 0.29672959319329967, + "grad_norm": 1.386236310005188, + "learning_rate": 4.2379727824760566e-05, + "loss": 1.5273, + "step": 8928 + }, + { + "epoch": 0.29775990428077637, + "grad_norm": 1.3843929767608643, + "learning_rate": 4.231926105340768e-05, + "loss": 1.5011, + "step": 8959 + }, + { + "epoch": 0.2987902153682531, + "grad_norm": 1.4554557800292969, + "learning_rate": 4.225859883654776e-05, + "loss": 1.5311, + "step": 8990 + }, + { + "epoch": 0.2998205264557299, + "grad_norm": 1.3674421310424805, + "learning_rate": 4.219774185874569e-05, + "loss": 1.5302, + "step": 9021 + }, + { + "epoch": 0.3008508375432066, + "grad_norm": 1.3804330825805664, + "learning_rate": 4.213669080676418e-05, + "loss": 1.538, + "step": 9052 + }, + { + "epoch": 0.3018811486306833, + "grad_norm": 1.4643255472183228, + "learning_rate": 4.2075446369556056e-05, + "loss": 1.5172, + "step": 9083 + }, + { + "epoch": 0.3029114597181601, + "grad_norm": 1.3375928401947021, + "learning_rate": 4.201400923825648e-05, + "loss": 1.5123, + "step": 9114 + }, + { + "epoch": 0.3039417708056368, + "grad_norm": 1.4321980476379395, + "learning_rate": 4.195238010617511e-05, + "loss": 1.5196, + "step": 9145 + }, + { + "epoch": 0.30497208189311353, + "grad_norm": 1.4312376976013184, + "learning_rate": 4.1890559668788344e-05, + "loss": 1.5138, + "step": 9176 + }, + { + "epoch": 0.3060023929805903, + "grad_norm": 1.3089646100997925, + "learning_rate": 4.1828548623731405e-05, + "loss": 1.5027, + "step": 9207 + }, + { + "epoch": 0.307032704068067, + "grad_norm": 1.4863250255584717, + "learning_rate": 4.1766347670790506e-05, + "loss": 1.5091, + "step": 9238 + }, + { + "epoch": 0.30806301515554374, + "grad_norm": 1.373666763305664, + "learning_rate": 4.170395751189495e-05, + "loss": 1.5256, + "step": 9269 + }, + { + "epoch": 0.3090933262430205, + "grad_norm": 1.4160584211349487, + "learning_rate": 4.164137885110921e-05, + "loss": 1.4938, + "step": 9300 + }, + { + "epoch": 0.3101236373304972, + "grad_norm": 2.112110137939453, + "learning_rate": 4.157861239462495e-05, + "loss": 1.5106, + "step": 9331 + }, + { + "epoch": 0.31115394841797395, + "grad_norm": 1.337058663368225, + "learning_rate": 4.1515658850753114e-05, + "loss": 1.4999, + "step": 9362 + }, + { + "epoch": 0.3121842595054507, + "grad_norm": 1.3625296354293823, + "learning_rate": 4.145251892991588e-05, + "loss": 1.5136, + "step": 9393 + }, + { + "epoch": 0.3132145705929274, + "grad_norm": 1.399491548538208, + "learning_rate": 4.138919334463868e-05, + "loss": 1.499, + "step": 9424 + }, + { + "epoch": 0.31424488168040415, + "grad_norm": 1.4202344417572021, + "learning_rate": 4.1325682809542124e-05, + "loss": 1.5049, + "step": 9455 + }, + { + "epoch": 0.3152751927678809, + "grad_norm": 1.392248272895813, + "learning_rate": 4.126198804133398e-05, + "loss": 1.5287, + "step": 9486 + }, + { + "epoch": 0.3163055038553576, + "grad_norm": 1.3807618618011475, + "learning_rate": 4.1198109758801055e-05, + "loss": 1.5309, + "step": 9517 + }, + { + "epoch": 0.31733581494283436, + "grad_norm": 1.3117905855178833, + "learning_rate": 4.113404868280107e-05, + "loss": 1.4933, + "step": 9548 + }, + { + "epoch": 0.3183661260303111, + "grad_norm": 1.452086091041565, + "learning_rate": 4.106980553625457e-05, + "loss": 1.5221, + "step": 9579 + }, + { + "epoch": 0.3193964371177878, + "grad_norm": 1.477364182472229, + "learning_rate": 4.100538104413674e-05, + "loss": 1.4904, + "step": 9610 + }, + { + "epoch": 0.32042674820526457, + "grad_norm": 1.3584345579147339, + "learning_rate": 4.09407759334692e-05, + "loss": 1.4953, + "step": 9641 + }, + { + "epoch": 0.32145705929274127, + "grad_norm": 1.3619811534881592, + "learning_rate": 4.087599093331186e-05, + "loss": 1.4956, + "step": 9672 + }, + { + "epoch": 0.322487370380218, + "grad_norm": 1.4507052898406982, + "learning_rate": 4.081102677475462e-05, + "loss": 1.5197, + "step": 9703 + }, + { + "epoch": 0.3235176814676948, + "grad_norm": 1.4229698181152344, + "learning_rate": 4.0745884190909194e-05, + "loss": 1.498, + "step": 9734 + }, + { + "epoch": 0.32454799255517147, + "grad_norm": 1.3074679374694824, + "learning_rate": 4.0680563916900796e-05, + "loss": 1.5146, + "step": 9765 + }, + { + "epoch": 0.3255783036426482, + "grad_norm": 1.397815465927124, + "learning_rate": 4.0615066689859815e-05, + "loss": 1.5291, + "step": 9796 + }, + { + "epoch": 0.326608614730125, + "grad_norm": 1.3196336030960083, + "learning_rate": 4.0549393248913584e-05, + "loss": 1.5077, + "step": 9827 + }, + { + "epoch": 0.3276389258176017, + "grad_norm": 1.3129957914352417, + "learning_rate": 4.048354433517794e-05, + "loss": 1.4965, + "step": 9858 + }, + { + "epoch": 0.32866923690507843, + "grad_norm": 1.4380089044570923, + "learning_rate": 4.0417520691748916e-05, + "loss": 1.5115, + "step": 9889 + }, + { + "epoch": 0.3296995479925552, + "grad_norm": 1.3162370920181274, + "learning_rate": 4.035132306369438e-05, + "loss": 1.5029, + "step": 9920 + }, + { + "epoch": 0.3307298590800319, + "grad_norm": 1.3739668130874634, + "learning_rate": 4.028495219804555e-05, + "loss": 1.5083, + "step": 9951 + }, + { + "epoch": 0.33176017016750864, + "grad_norm": 1.3673723936080933, + "learning_rate": 4.021840884378864e-05, + "loss": 1.5223, + "step": 9982 + }, + { + "epoch": 0.3327904812549854, + "grad_norm": 1.3970317840576172, + "learning_rate": 4.015169375185633e-05, + "loss": 1.5003, + "step": 10013 + }, + { + "epoch": 0.3338207923424621, + "grad_norm": 1.2982394695281982, + "learning_rate": 4.0084807675119396e-05, + "loss": 1.5066, + "step": 10044 + }, + { + "epoch": 0.33485110342993885, + "grad_norm": 1.4548689126968384, + "learning_rate": 4.0017751368378106e-05, + "loss": 1.4993, + "step": 10075 + }, + { + "epoch": 0.3358814145174156, + "grad_norm": 1.3693586587905884, + "learning_rate": 3.995052558835377e-05, + "loss": 1.4987, + "step": 10106 + }, + { + "epoch": 0.3369117256048923, + "grad_norm": 1.4046767950057983, + "learning_rate": 3.988313109368017e-05, + "loss": 1.5098, + "step": 10137 + }, + { + "epoch": 0.33794203669236905, + "grad_norm": 1.3772069215774536, + "learning_rate": 3.981556864489504e-05, + "loss": 1.5165, + "step": 10168 + }, + { + "epoch": 0.3389723477798458, + "grad_norm": 1.471211314201355, + "learning_rate": 3.974783900443142e-05, + "loss": 1.5037, + "step": 10199 + }, + { + "epoch": 0.3400026588673225, + "grad_norm": 1.3990979194641113, + "learning_rate": 3.9679942936609095e-05, + "loss": 1.5096, + "step": 10230 + }, + { + "epoch": 0.34103296995479926, + "grad_norm": 1.3779234886169434, + "learning_rate": 3.961188120762596e-05, + "loss": 1.4914, + "step": 10261 + }, + { + "epoch": 0.342063281042276, + "grad_norm": 1.2866768836975098, + "learning_rate": 3.954365458554938e-05, + "loss": 1.5026, + "step": 10292 + }, + { + "epoch": 0.3430935921297527, + "grad_norm": 1.353468894958496, + "learning_rate": 3.947526384030751e-05, + "loss": 1.5063, + "step": 10323 + }, + { + "epoch": 0.34412390321722947, + "grad_norm": 1.3264256715774536, + "learning_rate": 3.9406709743680624e-05, + "loss": 1.4911, + "step": 10354 + }, + { + "epoch": 0.3451542143047062, + "grad_norm": 1.3496876955032349, + "learning_rate": 3.9337993069292366e-05, + "loss": 1.4921, + "step": 10385 + }, + { + "epoch": 0.3461845253921829, + "grad_norm": 1.3812434673309326, + "learning_rate": 3.926911459260109e-05, + "loss": 1.4826, + "step": 10416 + }, + { + "epoch": 0.34721483647965967, + "grad_norm": 1.4926965236663818, + "learning_rate": 3.920007509089102e-05, + "loss": 1.4994, + "step": 10447 + }, + { + "epoch": 0.3482451475671364, + "grad_norm": 1.3446170091629028, + "learning_rate": 3.913087534326357e-05, + "loss": 1.5114, + "step": 10478 + }, + { + "epoch": 0.3492754586546131, + "grad_norm": 1.3100495338439941, + "learning_rate": 3.9061516130628475e-05, + "loss": 1.5066, + "step": 10509 + }, + { + "epoch": 0.3503057697420899, + "grad_norm": 1.395874261856079, + "learning_rate": 3.8991998235695025e-05, + "loss": 1.4999, + "step": 10540 + }, + { + "epoch": 0.3513360808295666, + "grad_norm": 1.3682137727737427, + "learning_rate": 3.8922322442963224e-05, + "loss": 1.4778, + "step": 10571 + }, + { + "epoch": 0.35236639191704333, + "grad_norm": 1.4196573495864868, + "learning_rate": 3.885248953871491e-05, + "loss": 1.4909, + "step": 10602 + }, + { + "epoch": 0.3533967030045201, + "grad_norm": 1.4299864768981934, + "learning_rate": 3.8782500311004915e-05, + "loss": 1.5025, + "step": 10633 + }, + { + "epoch": 0.3544270140919968, + "grad_norm": 1.39677095413208, + "learning_rate": 3.871235554965218e-05, + "loss": 1.4932, + "step": 10664 + }, + { + "epoch": 0.35545732517947354, + "grad_norm": 1.3219736814498901, + "learning_rate": 3.864205604623078e-05, + "loss": 1.4795, + "step": 10695 + }, + { + "epoch": 0.3564876362669503, + "grad_norm": 1.3649324178695679, + "learning_rate": 3.857160259406107e-05, + "loss": 1.4838, + "step": 10726 + }, + { + "epoch": 0.357517947354427, + "grad_norm": 1.4109989404678345, + "learning_rate": 3.8500995988200674e-05, + "loss": 1.5058, + "step": 10757 + }, + { + "epoch": 0.35854825844190374, + "grad_norm": 1.3625038862228394, + "learning_rate": 3.843023702543556e-05, + "loss": 1.4912, + "step": 10788 + }, + { + "epoch": 0.3595785695293805, + "grad_norm": 1.4725775718688965, + "learning_rate": 3.8359326504270984e-05, + "loss": 1.5012, + "step": 10819 + }, + { + "epoch": 0.3606088806168572, + "grad_norm": 1.4126085042953491, + "learning_rate": 3.828826522492255e-05, + "loss": 1.4977, + "step": 10850 + }, + { + "epoch": 0.36163919170433395, + "grad_norm": 1.3949086666107178, + "learning_rate": 3.821705398930713e-05, + "loss": 1.4903, + "step": 10881 + }, + { + "epoch": 0.3626695027918107, + "grad_norm": 1.286792516708374, + "learning_rate": 3.814569360103385e-05, + "loss": 1.5067, + "step": 10912 + }, + { + "epoch": 0.3636998138792874, + "grad_norm": 1.274703025817871, + "learning_rate": 3.807418486539499e-05, + "loss": 1.4583, + "step": 10943 + }, + { + "epoch": 0.36473012496676416, + "grad_norm": 1.401455283164978, + "learning_rate": 3.80025285893569e-05, + "loss": 1.4834, + "step": 10974 + }, + { + "epoch": 0.3657604360542409, + "grad_norm": 1.308361530303955, + "learning_rate": 3.793072558155093e-05, + "loss": 1.4832, + "step": 11005 + }, + { + "epoch": 0.3667907471417176, + "grad_norm": 1.654733419418335, + "learning_rate": 3.785877665226426e-05, + "loss": 1.4867, + "step": 11036 + }, + { + "epoch": 0.36782105822919436, + "grad_norm": 1.3530856370925903, + "learning_rate": 3.778668261343079e-05, + "loss": 1.4873, + "step": 11067 + }, + { + "epoch": 0.3688513693166711, + "grad_norm": 1.3567407131195068, + "learning_rate": 3.771444427862192e-05, + "loss": 1.4935, + "step": 11098 + }, + { + "epoch": 0.3698816804041478, + "grad_norm": 1.3184572458267212, + "learning_rate": 3.7642062463037465e-05, + "loss": 1.4891, + "step": 11129 + }, + { + "epoch": 0.37091199149162457, + "grad_norm": 1.366489291191101, + "learning_rate": 3.7569537983496373e-05, + "loss": 1.5159, + "step": 11160 + }, + { + "epoch": 0.3719423025791013, + "grad_norm": 1.423258662223816, + "learning_rate": 3.749687165842753e-05, + "loss": 1.4938, + "step": 11191 + }, + { + "epoch": 0.372972613666578, + "grad_norm": 1.3226194381713867, + "learning_rate": 3.7424064307860536e-05, + "loss": 1.499, + "step": 11222 + }, + { + "epoch": 0.3740029247540548, + "grad_norm": 1.350500464439392, + "learning_rate": 3.735111675341645e-05, + "loss": 1.4952, + "step": 11253 + }, + { + "epoch": 0.37503323584153153, + "grad_norm": 1.3667839765548706, + "learning_rate": 3.7278029818298524e-05, + "loss": 1.4763, + "step": 11284 + }, + { + "epoch": 0.37606354692900823, + "grad_norm": 1.4876132011413574, + "learning_rate": 3.720480432728287e-05, + "loss": 1.4913, + "step": 11315 + }, + { + "epoch": 0.377093858016485, + "grad_norm": 1.3927743434906006, + "learning_rate": 3.71314411067092e-05, + "loss": 1.4948, + "step": 11346 + }, + { + "epoch": 0.37812416910396174, + "grad_norm": 1.3752413988113403, + "learning_rate": 3.70579409844715e-05, + "loss": 1.4763, + "step": 11377 + }, + { + "epoch": 0.37915448019143844, + "grad_norm": 1.3530951738357544, + "learning_rate": 3.698430479000865e-05, + "loss": 1.5077, + "step": 11408 + }, + { + "epoch": 0.3801847912789152, + "grad_norm": 1.4309345483779907, + "learning_rate": 3.691053335429509e-05, + "loss": 1.4945, + "step": 11439 + }, + { + "epoch": 0.38121510236639194, + "grad_norm": 1.2874380350112915, + "learning_rate": 3.683662750983147e-05, + "loss": 1.4698, + "step": 11470 + }, + { + "epoch": 0.38224541345386864, + "grad_norm": 1.3356250524520874, + "learning_rate": 3.676258809063518e-05, + "loss": 1.4924, + "step": 11501 + }, + { + "epoch": 0.3832757245413454, + "grad_norm": 1.304559588432312, + "learning_rate": 3.6688415932231004e-05, + "loss": 1.4682, + "step": 11532 + }, + { + "epoch": 0.3843060356288221, + "grad_norm": 1.4153447151184082, + "learning_rate": 3.661411187164166e-05, + "loss": 1.4989, + "step": 11563 + }, + { + "epoch": 0.38533634671629885, + "grad_norm": 1.356992244720459, + "learning_rate": 3.65396767473784e-05, + "loss": 1.4854, + "step": 11594 + }, + { + "epoch": 0.3863666578037756, + "grad_norm": 1.322449803352356, + "learning_rate": 3.6465111399431465e-05, + "loss": 1.4877, + "step": 11625 + }, + { + "epoch": 0.3873969688912523, + "grad_norm": 1.3981350660324097, + "learning_rate": 3.6390416669260674e-05, + "loss": 1.499, + "step": 11656 + }, + { + "epoch": 0.38842727997872906, + "grad_norm": 1.324871301651001, + "learning_rate": 3.63155933997859e-05, + "loss": 1.4814, + "step": 11687 + }, + { + "epoch": 0.3894575910662058, + "grad_norm": 1.3940790891647339, + "learning_rate": 3.624064243537758e-05, + "loss": 1.4754, + "step": 11718 + }, + { + "epoch": 0.3904879021536825, + "grad_norm": 1.2880780696868896, + "learning_rate": 3.616556462184716e-05, + "loss": 1.4832, + "step": 11749 + }, + { + "epoch": 0.39151821324115926, + "grad_norm": 1.315329670906067, + "learning_rate": 3.609036080643755e-05, + "loss": 1.4853, + "step": 11780 + }, + { + "epoch": 0.392548524328636, + "grad_norm": 1.4093523025512695, + "learning_rate": 3.60150318378136e-05, + "loss": 1.4978, + "step": 11811 + }, + { + "epoch": 0.3935788354161127, + "grad_norm": 1.271151065826416, + "learning_rate": 3.5939578566052465e-05, + "loss": 1.4933, + "step": 11842 + }, + { + "epoch": 0.39460914650358947, + "grad_norm": 1.2910923957824707, + "learning_rate": 3.586400184263408e-05, + "loss": 1.4853, + "step": 11873 + }, + { + "epoch": 0.3956394575910662, + "grad_norm": 1.2480064630508423, + "learning_rate": 3.578830252043148e-05, + "loss": 1.4642, + "step": 11904 + }, + { + "epoch": 0.3966697686785429, + "grad_norm": 1.263197422027588, + "learning_rate": 3.571248145370125e-05, + "loss": 1.4812, + "step": 11935 + }, + { + "epoch": 0.3977000797660197, + "grad_norm": 1.3231288194656372, + "learning_rate": 3.5636539498073794e-05, + "loss": 1.4744, + "step": 11966 + }, + { + "epoch": 0.39873039085349643, + "grad_norm": 1.3933110237121582, + "learning_rate": 3.556047751054378e-05, + "loss": 1.4849, + "step": 11997 + }, + { + "epoch": 0.39976070194097313, + "grad_norm": 1.3615801334381104, + "learning_rate": 3.548429634946039e-05, + "loss": 1.4866, + "step": 12028 + }, + { + "epoch": 0.4007910130284499, + "grad_norm": 1.298638939857483, + "learning_rate": 3.540799687451768e-05, + "loss": 1.4664, + "step": 12059 + }, + { + "epoch": 0.40182132411592664, + "grad_norm": 1.29216468334198, + "learning_rate": 3.533157994674485e-05, + "loss": 1.4697, + "step": 12090 + }, + { + "epoch": 0.40285163520340334, + "grad_norm": 1.3759845495224, + "learning_rate": 3.5255046428496546e-05, + "loss": 1.4854, + "step": 12121 + }, + { + "epoch": 0.4038819462908801, + "grad_norm": 1.4045615196228027, + "learning_rate": 3.517839718344311e-05, + "loss": 1.4622, + "step": 12152 + }, + { + "epoch": 0.40491225737835684, + "grad_norm": 1.2979034185409546, + "learning_rate": 3.510163307656086e-05, + "loss": 1.4797, + "step": 12183 + }, + { + "epoch": 0.40594256846583354, + "grad_norm": 1.303139567375183, + "learning_rate": 3.5024754974122324e-05, + "loss": 1.4588, + "step": 12214 + }, + { + "epoch": 0.4069728795533103, + "grad_norm": 1.287781834602356, + "learning_rate": 3.494776374368643e-05, + "loss": 1.4834, + "step": 12245 + }, + { + "epoch": 0.40800319064078705, + "grad_norm": 1.3806688785552979, + "learning_rate": 3.4870660254088724e-05, + "loss": 1.4807, + "step": 12276 + }, + { + "epoch": 0.40903350172826375, + "grad_norm": 1.4059745073318481, + "learning_rate": 3.479344537543164e-05, + "loss": 1.4906, + "step": 12307 + }, + { + "epoch": 0.4100638128157405, + "grad_norm": 1.3052942752838135, + "learning_rate": 3.4716119979074565e-05, + "loss": 1.4801, + "step": 12338 + }, + { + "epoch": 0.41109412390321726, + "grad_norm": 1.3306844234466553, + "learning_rate": 3.463868493762412e-05, + "loss": 1.4911, + "step": 12369 + }, + { + "epoch": 0.41212443499069396, + "grad_norm": 1.3276656866073608, + "learning_rate": 3.456114112492418e-05, + "loss": 1.4678, + "step": 12400 + }, + { + "epoch": 0.4131547460781707, + "grad_norm": 1.3164253234863281, + "learning_rate": 3.4483489416046164e-05, + "loss": 1.4816, + "step": 12431 + }, + { + "epoch": 0.4141850571656474, + "grad_norm": 1.3827886581420898, + "learning_rate": 3.440573068727905e-05, + "loss": 1.481, + "step": 12462 + }, + { + "epoch": 0.41521536825312416, + "grad_norm": 1.2899463176727295, + "learning_rate": 3.4327865816119495e-05, + "loss": 1.4575, + "step": 12493 + }, + { + "epoch": 0.4162456793406009, + "grad_norm": 1.3136677742004395, + "learning_rate": 3.4249895681262025e-05, + "loss": 1.4695, + "step": 12524 + }, + { + "epoch": 0.4172759904280776, + "grad_norm": 1.2920372486114502, + "learning_rate": 3.417182116258899e-05, + "loss": 1.4765, + "step": 12555 + }, + { + "epoch": 0.41830630151555437, + "grad_norm": 1.3285510540008545, + "learning_rate": 3.409364314116074e-05, + "loss": 1.4559, + "step": 12586 + }, + { + "epoch": 0.4193366126030311, + "grad_norm": 1.2834984064102173, + "learning_rate": 3.401536249920559e-05, + "loss": 1.4706, + "step": 12617 + }, + { + "epoch": 0.4203669236905078, + "grad_norm": 1.315942645072937, + "learning_rate": 3.393698012010998e-05, + "loss": 1.4692, + "step": 12648 + }, + { + "epoch": 0.4213972347779846, + "grad_norm": 1.3668091297149658, + "learning_rate": 3.385849688840839e-05, + "loss": 1.4801, + "step": 12679 + }, + { + "epoch": 0.42242754586546133, + "grad_norm": 1.312280297279358, + "learning_rate": 3.3779913689773414e-05, + "loss": 1.4673, + "step": 12710 + }, + { + "epoch": 0.423457856952938, + "grad_norm": 1.3579858541488647, + "learning_rate": 3.370123141100578e-05, + "loss": 1.4578, + "step": 12741 + }, + { + "epoch": 0.4244881680404148, + "grad_norm": 1.4001456499099731, + "learning_rate": 3.3622450940024305e-05, + "loss": 1.4787, + "step": 12772 + }, + { + "epoch": 0.42551847912789154, + "grad_norm": 1.352629542350769, + "learning_rate": 3.35435731658559e-05, + "loss": 1.457, + "step": 12803 + }, + { + "epoch": 0.42654879021536823, + "grad_norm": 1.4044222831726074, + "learning_rate": 3.346459897862552e-05, + "loss": 1.4979, + "step": 12834 + }, + { + "epoch": 0.427579101302845, + "grad_norm": 1.2666436433792114, + "learning_rate": 3.338552926954613e-05, + "loss": 1.4712, + "step": 12865 + }, + { + "epoch": 0.42860941239032174, + "grad_norm": 1.2487694025039673, + "learning_rate": 3.330636493090868e-05, + "loss": 1.4784, + "step": 12896 + }, + { + "epoch": 0.42963972347779844, + "grad_norm": 1.2346290349960327, + "learning_rate": 3.322710685607193e-05, + "loss": 1.4754, + "step": 12927 + }, + { + "epoch": 0.4306700345652752, + "grad_norm": 1.2908893823623657, + "learning_rate": 3.314775593945251e-05, + "loss": 1.4677, + "step": 12958 + }, + { + "epoch": 0.43170034565275195, + "grad_norm": 1.3283506631851196, + "learning_rate": 3.3068313076514714e-05, + "loss": 1.4661, + "step": 12989 + }, + { + "epoch": 0.43273065674022865, + "grad_norm": 1.2982537746429443, + "learning_rate": 3.298877916376047e-05, + "loss": 1.4838, + "step": 13020 + }, + { + "epoch": 0.4337609678277054, + "grad_norm": 1.3566454648971558, + "learning_rate": 3.290915509871915e-05, + "loss": 1.4683, + "step": 13051 + }, + { + "epoch": 0.43479127891518216, + "grad_norm": 1.3470877408981323, + "learning_rate": 3.282944177993753e-05, + "loss": 1.4724, + "step": 13082 + }, + { + "epoch": 0.43582159000265885, + "grad_norm": 1.451150894165039, + "learning_rate": 3.274964010696957e-05, + "loss": 1.4731, + "step": 13113 + }, + { + "epoch": 0.4368519010901356, + "grad_norm": 1.3415958881378174, + "learning_rate": 3.266975098036629e-05, + "loss": 1.4809, + "step": 13144 + }, + { + "epoch": 0.43788221217761236, + "grad_norm": 1.2775352001190186, + "learning_rate": 3.258977530166562e-05, + "loss": 1.4523, + "step": 13175 + }, + { + "epoch": 0.43891252326508906, + "grad_norm": 1.365050196647644, + "learning_rate": 3.250971397338227e-05, + "loss": 1.4611, + "step": 13206 + }, + { + "epoch": 0.4399428343525658, + "grad_norm": 1.3481686115264893, + "learning_rate": 3.2429567898997404e-05, + "loss": 1.4708, + "step": 13237 + }, + { + "epoch": 0.44097314544004257, + "grad_norm": 1.3418121337890625, + "learning_rate": 3.234933798294859e-05, + "loss": 1.485, + "step": 13268 + }, + { + "epoch": 0.44200345652751927, + "grad_norm": 1.3098441362380981, + "learning_rate": 3.2269025130619535e-05, + "loss": 1.472, + "step": 13299 + }, + { + "epoch": 0.443033767614996, + "grad_norm": 1.2792437076568604, + "learning_rate": 3.218863024832985e-05, + "loss": 1.4592, + "step": 13330 + }, + { + "epoch": 0.4440640787024727, + "grad_norm": 1.3804035186767578, + "learning_rate": 3.2108154243324864e-05, + "loss": 1.4546, + "step": 13361 + }, + { + "epoch": 0.4450943897899495, + "grad_norm": 1.287787675857544, + "learning_rate": 3.2027598023765345e-05, + "loss": 1.4477, + "step": 13392 + }, + { + "epoch": 0.44612470087742623, + "grad_norm": 1.5964646339416504, + "learning_rate": 3.194696249871729e-05, + "loss": 1.4468, + "step": 13423 + }, + { + "epoch": 0.4471550119649029, + "grad_norm": 1.3253474235534668, + "learning_rate": 3.186624857814164e-05, + "loss": 1.4588, + "step": 13454 + }, + { + "epoch": 0.4481853230523797, + "grad_norm": 1.288176417350769, + "learning_rate": 3.178545717288401e-05, + "loss": 1.4644, + "step": 13485 + }, + { + "epoch": 0.44921563413985643, + "grad_norm": 1.3357142210006714, + "learning_rate": 3.170458919466444e-05, + "loss": 1.4871, + "step": 13516 + }, + { + "epoch": 0.45024594522733313, + "grad_norm": 1.2954436540603638, + "learning_rate": 3.1623645556067063e-05, + "loss": 1.4571, + "step": 13547 + }, + { + "epoch": 0.4512762563148099, + "grad_norm": 1.344789981842041, + "learning_rate": 3.154262717052985e-05, + "loss": 1.459, + "step": 13578 + }, + { + "epoch": 0.45230656740228664, + "grad_norm": 1.2648475170135498, + "learning_rate": 3.146153495233426e-05, + "loss": 1.4496, + "step": 13609 + }, + { + "epoch": 0.45333687848976334, + "grad_norm": 1.312733769416809, + "learning_rate": 3.1380369816594944e-05, + "loss": 1.4309, + "step": 13640 + }, + { + "epoch": 0.4543671895772401, + "grad_norm": 1.3719325065612793, + "learning_rate": 3.129913267924946e-05, + "loss": 1.4723, + "step": 13671 + }, + { + "epoch": 0.45539750066471685, + "grad_norm": 1.2850617170333862, + "learning_rate": 3.121782445704782e-05, + "loss": 1.4599, + "step": 13702 + }, + { + "epoch": 0.45642781175219355, + "grad_norm": 1.3335177898406982, + "learning_rate": 3.11364460675423e-05, + "loss": 1.4821, + "step": 13733 + }, + { + "epoch": 0.4574581228396703, + "grad_norm": 1.1675069332122803, + "learning_rate": 3.1054998429076934e-05, + "loss": 1.453, + "step": 13764 + }, + { + "epoch": 0.45848843392714705, + "grad_norm": 1.283544898033142, + "learning_rate": 3.097348246077728e-05, + "loss": 1.4545, + "step": 13795 + }, + { + "epoch": 0.45951874501462375, + "grad_norm": 1.4358693361282349, + "learning_rate": 3.0891899082539924e-05, + "loss": 1.4673, + "step": 13826 + }, + { + "epoch": 0.4605490561021005, + "grad_norm": 1.2551497220993042, + "learning_rate": 3.0810249215022233e-05, + "loss": 1.4532, + "step": 13857 + }, + { + "epoch": 0.46157936718957726, + "grad_norm": 1.2574602365493774, + "learning_rate": 3.0728533779631865e-05, + "loss": 1.4762, + "step": 13888 + }, + { + "epoch": 0.46260967827705396, + "grad_norm": 1.2202764749526978, + "learning_rate": 3.064675369851637e-05, + "loss": 1.4461, + "step": 13919 + }, + { + "epoch": 0.4636399893645307, + "grad_norm": 1.2787501811981201, + "learning_rate": 3.056490989455289e-05, + "loss": 1.4607, + "step": 13950 + }, + { + "epoch": 0.46467030045200747, + "grad_norm": 1.2511006593704224, + "learning_rate": 3.0483003291337596e-05, + "loss": 1.4548, + "step": 13981 + }, + { + "epoch": 0.46570061153948417, + "grad_norm": 1.2749834060668945, + "learning_rate": 3.040103481317539e-05, + "loss": 1.4394, + "step": 14012 + }, + { + "epoch": 0.4667309226269609, + "grad_norm": 1.223057746887207, + "learning_rate": 3.03190053850694e-05, + "loss": 1.4684, + "step": 14043 + }, + { + "epoch": 0.4677612337144377, + "grad_norm": 1.39846932888031, + "learning_rate": 3.0236915932710573e-05, + "loss": 1.4657, + "step": 14074 + }, + { + "epoch": 0.4687915448019144, + "grad_norm": 1.5305665731430054, + "learning_rate": 3.0154767382467232e-05, + "loss": 1.4795, + "step": 14105 + }, + { + "epoch": 0.4698218558893911, + "grad_norm": 1.2569035291671753, + "learning_rate": 3.0072560661374582e-05, + "loss": 1.4756, + "step": 14136 + }, + { + "epoch": 0.4708521669768679, + "grad_norm": 1.3472824096679688, + "learning_rate": 2.999029669712431e-05, + "loss": 1.4682, + "step": 14167 + }, + { + "epoch": 0.4718824780643446, + "grad_norm": 1.271714210510254, + "learning_rate": 2.990797641805408e-05, + "loss": 1.4509, + "step": 14198 + }, + { + "epoch": 0.47291278915182133, + "grad_norm": 1.3342047929763794, + "learning_rate": 2.982560075313704e-05, + "loss": 1.4528, + "step": 14229 + }, + { + "epoch": 0.47394310023929803, + "grad_norm": 1.5821506977081299, + "learning_rate": 2.9743170631971368e-05, + "loss": 1.4609, + "step": 14260 + }, + { + "epoch": 0.4749734113267748, + "grad_norm": 1.2598062753677368, + "learning_rate": 2.9660686984769792e-05, + "loss": 1.471, + "step": 14291 + }, + { + "epoch": 0.47600372241425154, + "grad_norm": 1.2648885250091553, + "learning_rate": 2.9578150742349047e-05, + "loss": 1.4708, + "step": 14322 + }, + { + "epoch": 0.47703403350172824, + "grad_norm": 1.559665560722351, + "learning_rate": 2.949556283611942e-05, + "loss": 1.4516, + "step": 14353 + }, + { + "epoch": 0.478064344589205, + "grad_norm": 1.2621581554412842, + "learning_rate": 2.9412924198074206e-05, + "loss": 1.446, + "step": 14384 + }, + { + "epoch": 0.47909465567668175, + "grad_norm": 1.2775017023086548, + "learning_rate": 2.9330235760779208e-05, + "loss": 1.4496, + "step": 14415 + }, + { + "epoch": 0.48012496676415845, + "grad_norm": 1.2010388374328613, + "learning_rate": 2.9247498457362188e-05, + "loss": 1.4606, + "step": 14446 + }, + { + "epoch": 0.4811552778516352, + "grad_norm": 1.3053895235061646, + "learning_rate": 2.9164713221502373e-05, + "loss": 1.4536, + "step": 14477 + }, + { + "epoch": 0.48218558893911195, + "grad_norm": 1.311596155166626, + "learning_rate": 2.9081880987419912e-05, + "loss": 1.4409, + "step": 14508 + }, + { + "epoch": 0.48321590002658865, + "grad_norm": 1.3888933658599854, + "learning_rate": 2.8999002689865296e-05, + "loss": 1.4314, + "step": 14539 + }, + { + "epoch": 0.4842462111140654, + "grad_norm": 1.288619875907898, + "learning_rate": 2.8916079264108852e-05, + "loss": 1.4539, + "step": 14570 + }, + { + "epoch": 0.48527652220154216, + "grad_norm": 1.2974294424057007, + "learning_rate": 2.883311164593017e-05, + "loss": 1.4627, + "step": 14601 + }, + { + "epoch": 0.48630683328901886, + "grad_norm": 1.2057379484176636, + "learning_rate": 2.875010077160754e-05, + "loss": 1.4578, + "step": 14632 + }, + { + "epoch": 0.4873371443764956, + "grad_norm": 1.363971471786499, + "learning_rate": 2.866704757790741e-05, + "loss": 1.4671, + "step": 14663 + }, + { + "epoch": 0.48836745546397237, + "grad_norm": 1.2696925401687622, + "learning_rate": 2.858395300207376e-05, + "loss": 1.4333, + "step": 14694 + }, + { + "epoch": 0.48939776655144906, + "grad_norm": 1.2653478384017944, + "learning_rate": 2.8500817981817607e-05, + "loss": 1.4662, + "step": 14725 + }, + { + "epoch": 0.4904280776389258, + "grad_norm": 1.3011239767074585, + "learning_rate": 2.8417643455306336e-05, + "loss": 1.4589, + "step": 14756 + }, + { + "epoch": 0.4914583887264026, + "grad_norm": 1.3312432765960693, + "learning_rate": 2.8334430361153185e-05, + "loss": 1.4368, + "step": 14787 + }, + { + "epoch": 0.49248869981387927, + "grad_norm": 1.3015661239624023, + "learning_rate": 2.8251179638406612e-05, + "loss": 1.466, + "step": 14818 + }, + { + "epoch": 0.493519010901356, + "grad_norm": 1.3215759992599487, + "learning_rate": 2.8167892226539704e-05, + "loss": 1.4486, + "step": 14849 + }, + { + "epoch": 0.4945493219888328, + "grad_norm": 1.2909883260726929, + "learning_rate": 2.8084569065439588e-05, + "loss": 1.4433, + "step": 14880 + }, + { + "epoch": 0.4955796330763095, + "grad_norm": 1.364015817642212, + "learning_rate": 2.8001211095396807e-05, + "loss": 1.4449, + "step": 14911 + }, + { + "epoch": 0.49660994416378623, + "grad_norm": 1.2468819618225098, + "learning_rate": 2.791781925709473e-05, + "loss": 1.4572, + "step": 14942 + }, + { + "epoch": 0.497640255251263, + "grad_norm": 1.2739325761795044, + "learning_rate": 2.7834394491598908e-05, + "loss": 1.4478, + "step": 14973 + }, + { + "epoch": 0.4986705663387397, + "grad_norm": 1.3384937047958374, + "learning_rate": 2.7750937740346485e-05, + "loss": 1.4429, + "step": 15004 + }, + { + "epoch": 0.49970087742621644, + "grad_norm": 1.231088399887085, + "learning_rate": 2.7667449945135564e-05, + "loss": 1.4631, + "step": 15035 + }, + { + "epoch": 0.5007311885136931, + "grad_norm": 1.2262307405471802, + "learning_rate": 2.7583932048114557e-05, + "loss": 1.4508, + "step": 15066 + }, + { + "epoch": 0.5017614996011699, + "grad_norm": 1.3427774906158447, + "learning_rate": 2.7500384991771587e-05, + "loss": 1.4441, + "step": 15097 + }, + { + "epoch": 0.5027918106886466, + "grad_norm": 1.2950241565704346, + "learning_rate": 2.7416809718923825e-05, + "loss": 1.4427, + "step": 15128 + }, + { + "epoch": 0.5038221217761234, + "grad_norm": 1.4129016399383545, + "learning_rate": 2.7333207172706864e-05, + "loss": 1.4562, + "step": 15159 + }, + { + "epoch": 0.5048524328636002, + "grad_norm": 1.2751520872116089, + "learning_rate": 2.7249578296564088e-05, + "loss": 1.4517, + "step": 15190 + }, + { + "epoch": 0.5058827439510768, + "grad_norm": 1.302485466003418, + "learning_rate": 2.7165924034235973e-05, + "loss": 1.4327, + "step": 15221 + }, + { + "epoch": 0.5069130550385536, + "grad_norm": 1.295390009880066, + "learning_rate": 2.708224532974953e-05, + "loss": 1.4455, + "step": 15252 + }, + { + "epoch": 0.5079433661260303, + "grad_norm": 1.3160103559494019, + "learning_rate": 2.6998543127407538e-05, + "loss": 1.4556, + "step": 15283 + }, + { + "epoch": 0.5089736772135071, + "grad_norm": 1.2997361421585083, + "learning_rate": 2.6914818371777988e-05, + "loss": 1.444, + "step": 15314 + }, + { + "epoch": 0.5100039883009838, + "grad_norm": 1.2427833080291748, + "learning_rate": 2.6831072007683373e-05, + "loss": 1.4501, + "step": 15345 + }, + { + "epoch": 0.5110342993884606, + "grad_norm": 1.2402199506759644, + "learning_rate": 2.6747304980190018e-05, + "loss": 1.4543, + "step": 15376 + }, + { + "epoch": 0.5120646104759372, + "grad_norm": 1.2938770055770874, + "learning_rate": 2.6663518234597453e-05, + "loss": 1.4394, + "step": 15407 + }, + { + "epoch": 0.513094921563414, + "grad_norm": 1.1747736930847168, + "learning_rate": 2.6579712716427696e-05, + "loss": 1.4389, + "step": 15438 + }, + { + "epoch": 0.5141252326508907, + "grad_norm": 1.326824426651001, + "learning_rate": 2.6495889371414652e-05, + "loss": 1.4365, + "step": 15469 + }, + { + "epoch": 0.5151555437383675, + "grad_norm": 1.245665431022644, + "learning_rate": 2.6412049145493367e-05, + "loss": 1.4525, + "step": 15500 + }, + { + "epoch": 0.5161858548258442, + "grad_norm": 1.1753687858581543, + "learning_rate": 2.632819298478939e-05, + "loss": 1.447, + "step": 15531 + }, + { + "epoch": 0.517216165913321, + "grad_norm": 1.3870874643325806, + "learning_rate": 2.6244321835608105e-05, + "loss": 1.4577, + "step": 15562 + }, + { + "epoch": 0.5182464770007976, + "grad_norm": 1.2849411964416504, + "learning_rate": 2.6160436644424024e-05, + "loss": 1.4371, + "step": 15593 + }, + { + "epoch": 0.5192767880882744, + "grad_norm": 1.292443037033081, + "learning_rate": 2.6076538357870133e-05, + "loss": 1.4558, + "step": 15624 + }, + { + "epoch": 0.5203070991757511, + "grad_norm": 1.279961347579956, + "learning_rate": 2.5992627922727196e-05, + "loss": 1.4384, + "step": 15655 + }, + { + "epoch": 0.5213374102632279, + "grad_norm": 1.3141279220581055, + "learning_rate": 2.5908706285913066e-05, + "loss": 1.45, + "step": 15686 + }, + { + "epoch": 0.5223677213507046, + "grad_norm": 1.3931515216827393, + "learning_rate": 2.5824774394472008e-05, + "loss": 1.4403, + "step": 15717 + }, + { + "epoch": 0.5233980324381813, + "grad_norm": 1.2564170360565186, + "learning_rate": 2.5740833195563996e-05, + "loss": 1.4482, + "step": 15748 + }, + { + "epoch": 0.524428343525658, + "grad_norm": 1.5450046062469482, + "learning_rate": 2.5656883636454067e-05, + "loss": 1.4443, + "step": 15779 + }, + { + "epoch": 0.5254586546131348, + "grad_norm": 1.2659518718719482, + "learning_rate": 2.557292666450159e-05, + "loss": 1.4653, + "step": 15810 + }, + { + "epoch": 0.5264889657006115, + "grad_norm": 1.2940540313720703, + "learning_rate": 2.5488963227149566e-05, + "loss": 1.4302, + "step": 15841 + }, + { + "epoch": 0.5275192767880883, + "grad_norm": 1.2514533996582031, + "learning_rate": 2.5404994271913983e-05, + "loss": 1.4412, + "step": 15872 + }, + { + "epoch": 0.528549587875565, + "grad_norm": 1.2681846618652344, + "learning_rate": 2.5321020746373085e-05, + "loss": 1.4411, + "step": 15903 + }, + { + "epoch": 0.5295798989630417, + "grad_norm": 1.2581806182861328, + "learning_rate": 2.52370435981567e-05, + "loss": 1.4503, + "step": 15934 + }, + { + "epoch": 0.5306102100505184, + "grad_norm": 1.3299468755722046, + "learning_rate": 2.5153063774935533e-05, + "loss": 1.4392, + "step": 15965 + }, + { + "epoch": 0.5316405211379952, + "grad_norm": 1.240678310394287, + "learning_rate": 2.506908222441045e-05, + "loss": 1.4412, + "step": 15996 + }, + { + "epoch": 0.532670832225472, + "grad_norm": 1.337936520576477, + "learning_rate": 2.498509989430187e-05, + "loss": 1.4254, + "step": 16027 + }, + { + "epoch": 0.5337011433129487, + "grad_norm": 1.302909016609192, + "learning_rate": 2.4901117732338958e-05, + "loss": 1.4436, + "step": 16058 + }, + { + "epoch": 0.5347314544004255, + "grad_norm": 1.2539550065994263, + "learning_rate": 2.481713668624899e-05, + "loss": 1.4496, + "step": 16089 + }, + { + "epoch": 0.5357617654879021, + "grad_norm": 1.287431001663208, + "learning_rate": 2.4733157703746663e-05, + "loss": 1.424, + "step": 16120 + }, + { + "epoch": 0.5367920765753789, + "grad_norm": 1.5333632230758667, + "learning_rate": 2.4649181732523392e-05, + "loss": 1.4399, + "step": 16151 + }, + { + "epoch": 0.5378223876628556, + "grad_norm": 1.2591406106948853, + "learning_rate": 2.4565209720236582e-05, + "loss": 1.439, + "step": 16182 + }, + { + "epoch": 0.5388526987503324, + "grad_norm": 1.3093276023864746, + "learning_rate": 2.4481242614498975e-05, + "loss": 1.4279, + "step": 16213 + }, + { + "epoch": 0.5398830098378091, + "grad_norm": 1.2824875116348267, + "learning_rate": 2.439728136286796e-05, + "loss": 1.4428, + "step": 16244 + }, + { + "epoch": 0.5409133209252859, + "grad_norm": 1.2775593996047974, + "learning_rate": 2.4313326912834852e-05, + "loss": 1.4352, + "step": 16275 + }, + { + "epoch": 0.5419436320127625, + "grad_norm": 1.4667550325393677, + "learning_rate": 2.4229380211814206e-05, + "loss": 1.4633, + "step": 16306 + }, + { + "epoch": 0.5429739431002393, + "grad_norm": 1.2620900869369507, + "learning_rate": 2.4145442207133124e-05, + "loss": 1.4482, + "step": 16337 + }, + { + "epoch": 0.544004254187716, + "grad_norm": 1.3041224479675293, + "learning_rate": 2.406151384602059e-05, + "loss": 1.4431, + "step": 16368 + }, + { + "epoch": 0.5450345652751928, + "grad_norm": 1.3634989261627197, + "learning_rate": 2.3977596075596747e-05, + "loss": 1.4186, + "step": 16399 + }, + { + "epoch": 0.5460648763626695, + "grad_norm": 1.2322940826416016, + "learning_rate": 2.3893689842862223e-05, + "loss": 1.4322, + "step": 16430 + }, + { + "epoch": 0.5470951874501463, + "grad_norm": 1.5554733276367188, + "learning_rate": 2.3809796094687475e-05, + "loss": 1.4337, + "step": 16461 + }, + { + "epoch": 0.5481254985376229, + "grad_norm": 1.4745500087738037, + "learning_rate": 2.372591577780202e-05, + "loss": 1.4411, + "step": 16492 + }, + { + "epoch": 0.5491558096250997, + "grad_norm": 1.2865196466445923, + "learning_rate": 2.3642049838783838e-05, + "loss": 1.429, + "step": 16523 + }, + { + "epoch": 0.5501861207125764, + "grad_norm": 1.399247407913208, + "learning_rate": 2.3558199224048666e-05, + "loss": 1.4753, + "step": 16554 + }, + { + "epoch": 0.5512164318000532, + "grad_norm": 1.2135406732559204, + "learning_rate": 2.347436487983929e-05, + "loss": 1.4553, + "step": 16585 + }, + { + "epoch": 0.55224674288753, + "grad_norm": 1.164150357246399, + "learning_rate": 2.3390547752214888e-05, + "loss": 1.4268, + "step": 16616 + }, + { + "epoch": 0.5532770539750066, + "grad_norm": 1.2363818883895874, + "learning_rate": 2.330674878704035e-05, + "loss": 1.4381, + "step": 16647 + }, + { + "epoch": 0.5543073650624833, + "grad_norm": 1.286139726638794, + "learning_rate": 2.322296892997561e-05, + "loss": 1.4492, + "step": 16678 + }, + { + "epoch": 0.5553376761499601, + "grad_norm": 1.2836147546768188, + "learning_rate": 2.313920912646497e-05, + "loss": 1.4128, + "step": 16709 + }, + { + "epoch": 0.5563679872374369, + "grad_norm": 1.253727674484253, + "learning_rate": 2.305547032172643e-05, + "loss": 1.4472, + "step": 16740 + }, + { + "epoch": 0.5573982983249136, + "grad_norm": 1.2580201625823975, + "learning_rate": 2.2971753460741014e-05, + "loss": 1.4461, + "step": 16771 + }, + { + "epoch": 0.5584286094123904, + "grad_norm": 1.2446421384811401, + "learning_rate": 2.288805948824212e-05, + "loss": 1.4267, + "step": 16802 + }, + { + "epoch": 0.559458920499867, + "grad_norm": 1.3572150468826294, + "learning_rate": 2.2804389348704858e-05, + "loss": 1.4222, + "step": 16833 + }, + { + "epoch": 0.5604892315873438, + "grad_norm": 1.3694707155227661, + "learning_rate": 2.2720743986335374e-05, + "loss": 1.4624, + "step": 16864 + }, + { + "epoch": 0.5615195426748205, + "grad_norm": 1.2654088735580444, + "learning_rate": 2.2637124345060233e-05, + "loss": 1.4379, + "step": 16895 + }, + { + "epoch": 0.5625498537622973, + "grad_norm": 1.3349469900131226, + "learning_rate": 2.2553531368515695e-05, + "loss": 1.4404, + "step": 16926 + }, + { + "epoch": 0.563580164849774, + "grad_norm": 1.2259774208068848, + "learning_rate": 2.2469966000037144e-05, + "loss": 1.4335, + "step": 16957 + }, + { + "epoch": 0.5646104759372508, + "grad_norm": 1.2973053455352783, + "learning_rate": 2.2386429182648417e-05, + "loss": 1.4397, + "step": 16988 + }, + { + "epoch": 0.5656407870247274, + "grad_norm": 1.2674601078033447, + "learning_rate": 2.230292185905114e-05, + "loss": 1.4256, + "step": 17019 + }, + { + "epoch": 0.5666710981122042, + "grad_norm": 1.243605136871338, + "learning_rate": 2.2219444971614116e-05, + "loss": 1.4404, + "step": 17050 + }, + { + "epoch": 0.5677014091996809, + "grad_norm": 1.2108361721038818, + "learning_rate": 2.2135999462362655e-05, + "loss": 1.4318, + "step": 17081 + }, + { + "epoch": 0.5687317202871577, + "grad_norm": 1.2497962713241577, + "learning_rate": 2.2052586272968003e-05, + "loss": 1.4409, + "step": 17112 + }, + { + "epoch": 0.5697620313746344, + "grad_norm": 1.2269086837768555, + "learning_rate": 2.196920634473666e-05, + "loss": 1.4417, + "step": 17143 + }, + { + "epoch": 0.5707923424621112, + "grad_norm": 1.3165903091430664, + "learning_rate": 2.1885860618599787e-05, + "loss": 1.4541, + "step": 17174 + }, + { + "epoch": 0.5718226535495878, + "grad_norm": 1.2117608785629272, + "learning_rate": 2.1802550035102577e-05, + "loss": 1.4457, + "step": 17205 + }, + { + "epoch": 0.5728529646370646, + "grad_norm": 1.2482073307037354, + "learning_rate": 2.171927553439363e-05, + "loss": 1.4408, + "step": 17236 + }, + { + "epoch": 0.5738832757245413, + "grad_norm": 1.2258682250976562, + "learning_rate": 2.1636038056214376e-05, + "loss": 1.4366, + "step": 17267 + }, + { + "epoch": 0.5749135868120181, + "grad_norm": 1.254062294960022, + "learning_rate": 2.155283853988844e-05, + "loss": 1.4187, + "step": 17298 + }, + { + "epoch": 0.5759438978994948, + "grad_norm": 1.3397905826568604, + "learning_rate": 2.146967792431106e-05, + "loss": 1.4316, + "step": 17329 + }, + { + "epoch": 0.5769742089869716, + "grad_norm": 1.3253263235092163, + "learning_rate": 2.138655714793849e-05, + "loss": 1.4361, + "step": 17360 + }, + { + "epoch": 0.5780045200744482, + "grad_norm": 1.2624903917312622, + "learning_rate": 2.1303477148777367e-05, + "loss": 1.4136, + "step": 17391 + }, + { + "epoch": 0.579034831161925, + "grad_norm": 1.3255977630615234, + "learning_rate": 2.122043886437421e-05, + "loss": 1.4552, + "step": 17422 + }, + { + "epoch": 0.5800651422494018, + "grad_norm": 1.300898790359497, + "learning_rate": 2.1137443231804765e-05, + "loss": 1.4152, + "step": 17453 + }, + { + "epoch": 0.5810954533368785, + "grad_norm": 1.2904343605041504, + "learning_rate": 2.105449118766347e-05, + "loss": 1.4195, + "step": 17484 + }, + { + "epoch": 0.5821257644243553, + "grad_norm": 1.3146878480911255, + "learning_rate": 2.097158366805287e-05, + "loss": 1.426, + "step": 17515 + }, + { + "epoch": 0.5831560755118319, + "grad_norm": 1.2454010248184204, + "learning_rate": 2.0888721608573047e-05, + "loss": 1.4239, + "step": 17546 + }, + { + "epoch": 0.5841863865993087, + "grad_norm": 1.194626808166504, + "learning_rate": 2.0805905944311087e-05, + "loss": 1.4416, + "step": 17577 + }, + { + "epoch": 0.5852166976867854, + "grad_norm": 1.359053373336792, + "learning_rate": 2.0723137609830497e-05, + "loss": 1.4112, + "step": 17608 + }, + { + "epoch": 0.5862470087742622, + "grad_norm": 1.2577933073043823, + "learning_rate": 2.0640417539160686e-05, + "loss": 1.4432, + "step": 17639 + }, + { + "epoch": 0.5872773198617389, + "grad_norm": 1.2604849338531494, + "learning_rate": 2.0557746665786427e-05, + "loss": 1.4184, + "step": 17670 + }, + { + "epoch": 0.5883076309492157, + "grad_norm": 1.2511252164840698, + "learning_rate": 2.0475125922637256e-05, + "loss": 1.4276, + "step": 17701 + }, + { + "epoch": 0.5893379420366923, + "grad_norm": 1.2841278314590454, + "learning_rate": 2.0392556242077047e-05, + "loss": 1.4345, + "step": 17732 + }, + { + "epoch": 0.5903682531241691, + "grad_norm": 1.3342245817184448, + "learning_rate": 2.031003855589343e-05, + "loss": 1.4212, + "step": 17763 + }, + { + "epoch": 0.5913985642116458, + "grad_norm": 1.352387547492981, + "learning_rate": 2.022757379528727e-05, + "loss": 1.4316, + "step": 17794 + }, + { + "epoch": 0.5924288752991226, + "grad_norm": 1.3534374237060547, + "learning_rate": 2.0145162890862184e-05, + "loss": 1.4352, + "step": 17825 + }, + { + "epoch": 0.5934591863865993, + "grad_norm": 1.2957963943481445, + "learning_rate": 2.0062806772614022e-05, + "loss": 1.4057, + "step": 17856 + }, + { + "epoch": 0.5944894974740761, + "grad_norm": 1.3178727626800537, + "learning_rate": 1.9980506369920392e-05, + "loss": 1.4323, + "step": 17887 + }, + { + "epoch": 0.5955198085615527, + "grad_norm": 1.3364850282669067, + "learning_rate": 1.989826261153015e-05, + "loss": 1.4228, + "step": 17918 + }, + { + "epoch": 0.5965501196490295, + "grad_norm": 1.283200979232788, + "learning_rate": 1.9816076425552923e-05, + "loss": 1.4348, + "step": 17949 + }, + { + "epoch": 0.5975804307365062, + "grad_norm": 1.2856223583221436, + "learning_rate": 1.9733948739448676e-05, + "loss": 1.4176, + "step": 17980 + }, + { + "epoch": 0.598610741823983, + "grad_norm": 1.253180742263794, + "learning_rate": 1.9651880480017155e-05, + "loss": 1.4175, + "step": 18011 + }, + { + "epoch": 0.5996410529114597, + "grad_norm": 1.3471016883850098, + "learning_rate": 1.9569872573387516e-05, + "loss": 1.433, + "step": 18042 + }, + { + "epoch": 0.6006713639989365, + "grad_norm": 1.2449748516082764, + "learning_rate": 1.9487925945007854e-05, + "loss": 1.4091, + "step": 18073 + }, + { + "epoch": 0.6017016750864131, + "grad_norm": 1.3311972618103027, + "learning_rate": 1.9406041519634726e-05, + "loss": 1.403, + "step": 18104 + }, + { + "epoch": 0.6027319861738899, + "grad_norm": 1.2645657062530518, + "learning_rate": 1.932422022132275e-05, + "loss": 1.4265, + "step": 18135 + }, + { + "epoch": 0.6037622972613667, + "grad_norm": 1.3313370943069458, + "learning_rate": 1.924246297341414e-05, + "loss": 1.4275, + "step": 18166 + }, + { + "epoch": 0.6047926083488434, + "grad_norm": 1.2827123403549194, + "learning_rate": 1.9160770698528338e-05, + "loss": 1.4277, + "step": 18197 + }, + { + "epoch": 0.6058229194363202, + "grad_norm": 1.2230308055877686, + "learning_rate": 1.907914431855156e-05, + "loss": 1.4391, + "step": 18228 + }, + { + "epoch": 0.6068532305237969, + "grad_norm": 1.2785223722457886, + "learning_rate": 1.8997584754626412e-05, + "loss": 1.4152, + "step": 18259 + }, + { + "epoch": 0.6078835416112736, + "grad_norm": 1.3152620792388916, + "learning_rate": 1.8916092927141486e-05, + "loss": 1.4137, + "step": 18290 + }, + { + "epoch": 0.6089138526987503, + "grad_norm": 1.1842609643936157, + "learning_rate": 1.883466975572098e-05, + "loss": 1.4141, + "step": 18321 + }, + { + "epoch": 0.6099441637862271, + "grad_norm": 1.2319703102111816, + "learning_rate": 1.8753316159214312e-05, + "loss": 1.4216, + "step": 18352 + }, + { + "epoch": 0.6109744748737038, + "grad_norm": 1.3239370584487915, + "learning_rate": 1.8672033055685766e-05, + "loss": 1.4184, + "step": 18383 + }, + { + "epoch": 0.6120047859611806, + "grad_norm": 1.2665941715240479, + "learning_rate": 1.8590821362404116e-05, + "loss": 1.4249, + "step": 18414 + }, + { + "epoch": 0.6130350970486572, + "grad_norm": 1.2569379806518555, + "learning_rate": 1.8509681995832294e-05, + "loss": 1.4242, + "step": 18445 + }, + { + "epoch": 0.614065408136134, + "grad_norm": 1.2848411798477173, + "learning_rate": 1.8428615871617004e-05, + "loss": 1.4166, + "step": 18476 + }, + { + "epoch": 0.6150957192236107, + "grad_norm": 1.2636574506759644, + "learning_rate": 1.8347623904578448e-05, + "loss": 1.4297, + "step": 18507 + }, + { + "epoch": 0.6161260303110875, + "grad_norm": 1.2672234773635864, + "learning_rate": 1.8266707008699975e-05, + "loss": 1.4244, + "step": 18538 + }, + { + "epoch": 0.6171563413985642, + "grad_norm": 1.2299143075942993, + "learning_rate": 1.818586609711774e-05, + "loss": 1.408, + "step": 18569 + }, + { + "epoch": 0.618186652486041, + "grad_norm": 1.2221580743789673, + "learning_rate": 1.8105102082110462e-05, + "loss": 1.4242, + "step": 18600 + }, + { + "epoch": 0.6192169635735176, + "grad_norm": 1.290737509727478, + "learning_rate": 1.8024415875089058e-05, + "loss": 1.4167, + "step": 18631 + }, + { + "epoch": 0.6202472746609944, + "grad_norm": 1.3236243724822998, + "learning_rate": 1.7943808386586407e-05, + "loss": 1.4341, + "step": 18662 + }, + { + "epoch": 0.6212775857484711, + "grad_norm": 1.1983164548873901, + "learning_rate": 1.7863280526247073e-05, + "loss": 1.4171, + "step": 18693 + }, + { + "epoch": 0.6223078968359479, + "grad_norm": 1.2706191539764404, + "learning_rate": 1.7782833202817003e-05, + "loss": 1.4268, + "step": 18724 + }, + { + "epoch": 0.6233382079234246, + "grad_norm": 1.2584494352340698, + "learning_rate": 1.7702467324133327e-05, + "loss": 1.4364, + "step": 18755 + }, + { + "epoch": 0.6243685190109014, + "grad_norm": 1.345226526260376, + "learning_rate": 1.7622183797114042e-05, + "loss": 1.4274, + "step": 18786 + }, + { + "epoch": 0.625398830098378, + "grad_norm": 1.3055671453475952, + "learning_rate": 1.7541983527747838e-05, + "loss": 1.4101, + "step": 18817 + }, + { + "epoch": 0.6264291411858548, + "grad_norm": 1.2878341674804688, + "learning_rate": 1.746186742108387e-05, + "loss": 1.4133, + "step": 18848 + }, + { + "epoch": 0.6274594522733316, + "grad_norm": 1.241191029548645, + "learning_rate": 1.73818363812215e-05, + "loss": 1.4038, + "step": 18879 + }, + { + "epoch": 0.6284897633608083, + "grad_norm": 1.8631796836853027, + "learning_rate": 1.7301891311300153e-05, + "loss": 1.3961, + "step": 18910 + }, + { + "epoch": 0.6295200744482851, + "grad_norm": 1.2781902551651, + "learning_rate": 1.7222033113489055e-05, + "loss": 1.4238, + "step": 18941 + }, + { + "epoch": 0.6305503855357618, + "grad_norm": 1.2679165601730347, + "learning_rate": 1.7142262688977127e-05, + "loss": 1.4236, + "step": 18972 + }, + { + "epoch": 0.6315806966232385, + "grad_norm": 1.257203459739685, + "learning_rate": 1.7062580937962764e-05, + "loss": 1.4156, + "step": 19003 + }, + { + "epoch": 0.6326110077107152, + "grad_norm": 1.284470796585083, + "learning_rate": 1.698298875964369e-05, + "loss": 1.4241, + "step": 19034 + }, + { + "epoch": 0.633641318798192, + "grad_norm": 1.310545802116394, + "learning_rate": 1.690348705220684e-05, + "loss": 1.4205, + "step": 19065 + }, + { + "epoch": 0.6346716298856687, + "grad_norm": 1.2868564128875732, + "learning_rate": 1.6824076712818156e-05, + "loss": 1.4238, + "step": 19096 + }, + { + "epoch": 0.6357019409731455, + "grad_norm": 1.2508702278137207, + "learning_rate": 1.6744758637612533e-05, + "loss": 1.4046, + "step": 19127 + }, + { + "epoch": 0.6367322520606222, + "grad_norm": 1.3149102926254272, + "learning_rate": 1.6665533721683664e-05, + "loss": 1.4211, + "step": 19158 + }, + { + "epoch": 0.6377625631480989, + "grad_norm": 1.3485240936279297, + "learning_rate": 1.6586402859073974e-05, + "loss": 1.4167, + "step": 19189 + }, + { + "epoch": 0.6387928742355756, + "grad_norm": 1.2397938966751099, + "learning_rate": 1.6507366942764463e-05, + "loss": 1.4242, + "step": 19220 + }, + { + "epoch": 0.6398231853230524, + "grad_norm": 1.2909672260284424, + "learning_rate": 1.6428426864664732e-05, + "loss": 1.403, + "step": 19251 + }, + { + "epoch": 0.6408534964105291, + "grad_norm": 1.290385365486145, + "learning_rate": 1.6349583515602816e-05, + "loss": 1.4082, + "step": 19282 + }, + { + "epoch": 0.6418838074980059, + "grad_norm": 1.3623126745224, + "learning_rate": 1.6270837785315208e-05, + "loss": 1.4075, + "step": 19313 + }, + { + "epoch": 0.6429141185854825, + "grad_norm": 1.276903510093689, + "learning_rate": 1.619219056243676e-05, + "loss": 1.4135, + "step": 19344 + }, + { + "epoch": 0.6439444296729593, + "grad_norm": 1.2038910388946533, + "learning_rate": 1.6113642734490698e-05, + "loss": 1.4162, + "step": 19375 + }, + { + "epoch": 0.644974740760436, + "grad_norm": 1.2092891931533813, + "learning_rate": 1.6035195187878577e-05, + "loss": 1.4285, + "step": 19406 + }, + { + "epoch": 0.6460050518479128, + "grad_norm": 1.2983031272888184, + "learning_rate": 1.5956848807870305e-05, + "loss": 1.4128, + "step": 19437 + }, + { + "epoch": 0.6470353629353895, + "grad_norm": 1.279845952987671, + "learning_rate": 1.587860447859413e-05, + "loss": 1.4351, + "step": 19468 + }, + { + "epoch": 0.6480656740228663, + "grad_norm": 1.2781362533569336, + "learning_rate": 1.5800463083026686e-05, + "loss": 1.4118, + "step": 19499 + }, + { + "epoch": 0.6490959851103429, + "grad_norm": 1.2652825117111206, + "learning_rate": 1.572242550298298e-05, + "loss": 1.4195, + "step": 19530 + }, + { + "epoch": 0.6501262961978197, + "grad_norm": 1.3177101612091064, + "learning_rate": 1.56444926191065e-05, + "loss": 1.4307, + "step": 19561 + }, + { + "epoch": 0.6511566072852965, + "grad_norm": 1.2758272886276245, + "learning_rate": 1.5566665310859257e-05, + "loss": 1.4096, + "step": 19592 + }, + { + "epoch": 0.6521869183727732, + "grad_norm": 1.2265219688415527, + "learning_rate": 1.5488944456511846e-05, + "loss": 1.4098, + "step": 19623 + }, + { + "epoch": 0.65321722946025, + "grad_norm": 1.258945345878601, + "learning_rate": 1.5411330933133546e-05, + "loss": 1.4274, + "step": 19654 + }, + { + "epoch": 0.6542475405477267, + "grad_norm": 1.2599055767059326, + "learning_rate": 1.533382561658241e-05, + "loss": 1.4207, + "step": 19685 + }, + { + "epoch": 0.6552778516352034, + "grad_norm": 1.2502135038375854, + "learning_rate": 1.525642938149541e-05, + "loss": 1.4046, + "step": 19716 + }, + { + "epoch": 0.6563081627226801, + "grad_norm": 1.2734349966049194, + "learning_rate": 1.5179143101278536e-05, + "loss": 1.41, + "step": 19747 + }, + { + "epoch": 0.6573384738101569, + "grad_norm": 1.2801038026809692, + "learning_rate": 1.5101967648096955e-05, + "loss": 1.4088, + "step": 19778 + }, + { + "epoch": 0.6583687848976336, + "grad_norm": 1.2488126754760742, + "learning_rate": 1.5024903892865172e-05, + "loss": 1.4111, + "step": 19809 + }, + { + "epoch": 0.6593990959851104, + "grad_norm": 1.2418783903121948, + "learning_rate": 1.4947952705237184e-05, + "loss": 1.384, + "step": 19840 + }, + { + "epoch": 0.6604294070725871, + "grad_norm": 1.2566567659378052, + "learning_rate": 1.4871114953596682e-05, + "loss": 1.4127, + "step": 19871 + }, + { + "epoch": 0.6614597181600638, + "grad_norm": 1.2431600093841553, + "learning_rate": 1.4794391505047256e-05, + "loss": 1.4015, + "step": 19902 + }, + { + "epoch": 0.6624900292475405, + "grad_norm": 1.3174066543579102, + "learning_rate": 1.4717783225402596e-05, + "loss": 1.4113, + "step": 19933 + }, + { + "epoch": 0.6635203403350173, + "grad_norm": 1.3124332427978516, + "learning_rate": 1.4641290979176735e-05, + "loss": 1.421, + "step": 19964 + }, + { + "epoch": 0.664550651422494, + "grad_norm": 1.2595762014389038, + "learning_rate": 1.4564915629574246e-05, + "loss": 1.409, + "step": 19995 + }, + { + "epoch": 0.6655809625099708, + "grad_norm": 1.2872180938720703, + "learning_rate": 1.4488658038480601e-05, + "loss": 1.4082, + "step": 20026 + }, + { + "epoch": 0.6666112735974475, + "grad_norm": 1.27680242061615, + "learning_rate": 1.4412519066452323e-05, + "loss": 1.3979, + "step": 20057 + }, + { + "epoch": 0.6676415846849242, + "grad_norm": 1.2753857374191284, + "learning_rate": 1.4336499572707373e-05, + "loss": 1.4227, + "step": 20088 + }, + { + "epoch": 0.6686718957724009, + "grad_norm": 1.2680202722549438, + "learning_rate": 1.4260600415115433e-05, + "loss": 1.418, + "step": 20119 + }, + { + "epoch": 0.6697022068598777, + "grad_norm": 1.3002320528030396, + "learning_rate": 1.4184822450188137e-05, + "loss": 1.4133, + "step": 20150 + }, + { + "epoch": 0.6707325179473544, + "grad_norm": 1.3236373662948608, + "learning_rate": 1.410916653306954e-05, + "loss": 1.4133, + "step": 20181 + }, + { + "epoch": 0.6717628290348312, + "grad_norm": 1.3784340620040894, + "learning_rate": 1.403363351752639e-05, + "loss": 1.4064, + "step": 20212 + }, + { + "epoch": 0.6727931401223078, + "grad_norm": 1.2793350219726562, + "learning_rate": 1.3958224255938485e-05, + "loss": 1.4203, + "step": 20243 + }, + { + "epoch": 0.6738234512097846, + "grad_norm": 1.3510205745697021, + "learning_rate": 1.388293959928911e-05, + "loss": 1.418, + "step": 20274 + }, + { + "epoch": 0.6748537622972614, + "grad_norm": 1.2981188297271729, + "learning_rate": 1.3807780397155379e-05, + "loss": 1.4019, + "step": 20305 + }, + { + "epoch": 0.6758840733847381, + "grad_norm": 1.2599388360977173, + "learning_rate": 1.3732747497698655e-05, + "loss": 1.4187, + "step": 20336 + }, + { + "epoch": 0.6769143844722149, + "grad_norm": 1.2741434574127197, + "learning_rate": 1.3657841747655038e-05, + "loss": 1.4183, + "step": 20367 + }, + { + "epoch": 0.6779446955596916, + "grad_norm": 1.2376216650009155, + "learning_rate": 1.3583063992325706e-05, + "loss": 1.4208, + "step": 20398 + }, + { + "epoch": 0.6789750066471683, + "grad_norm": 1.341134786605835, + "learning_rate": 1.3508415075567496e-05, + "loss": 1.4015, + "step": 20429 + }, + { + "epoch": 0.680005317734645, + "grad_norm": 1.3483457565307617, + "learning_rate": 1.343389583978327e-05, + "loss": 1.4043, + "step": 20460 + }, + { + "epoch": 0.6810356288221218, + "grad_norm": 1.3255680799484253, + "learning_rate": 1.3359507125912468e-05, + "loss": 1.4162, + "step": 20491 + }, + { + "epoch": 0.6820659399095985, + "grad_norm": 1.211305022239685, + "learning_rate": 1.3285249773421627e-05, + "loss": 1.4043, + "step": 20522 + }, + { + "epoch": 0.6830962509970753, + "grad_norm": 1.3049174547195435, + "learning_rate": 1.3211124620294884e-05, + "loss": 1.4012, + "step": 20553 + }, + { + "epoch": 0.684126562084552, + "grad_norm": 1.2884812355041504, + "learning_rate": 1.313713250302451e-05, + "loss": 1.419, + "step": 20584 + }, + { + "epoch": 0.6851568731720287, + "grad_norm": 1.2465201616287231, + "learning_rate": 1.3063274256601479e-05, + "loss": 1.394, + "step": 20615 + }, + { + "epoch": 0.6861871842595054, + "grad_norm": 1.2868762016296387, + "learning_rate": 1.2989550714506086e-05, + "loss": 1.3975, + "step": 20646 + }, + { + "epoch": 0.6872174953469822, + "grad_norm": 1.2728379964828491, + "learning_rate": 1.291596270869846e-05, + "loss": 1.3918, + "step": 20677 + }, + { + "epoch": 0.6882478064344589, + "grad_norm": 1.265869379043579, + "learning_rate": 1.284251106960927e-05, + "loss": 1.402, + "step": 20708 + }, + { + "epoch": 0.6892781175219357, + "grad_norm": 1.3357373476028442, + "learning_rate": 1.2769196626130263e-05, + "loss": 1.3975, + "step": 20739 + }, + { + "epoch": 0.6903084286094124, + "grad_norm": 1.216797947883606, + "learning_rate": 1.2696020205604969e-05, + "loss": 1.3953, + "step": 20770 + }, + { + "epoch": 0.6913387396968891, + "grad_norm": 1.269227385520935, + "learning_rate": 1.2622982633819359e-05, + "loss": 1.4154, + "step": 20801 + }, + { + "epoch": 0.6923690507843658, + "grad_norm": 1.3336331844329834, + "learning_rate": 1.2550084734992484e-05, + "loss": 1.3992, + "step": 20832 + }, + { + "epoch": 0.6933993618718426, + "grad_norm": 1.2936463356018066, + "learning_rate": 1.247732733176724e-05, + "loss": 1.4147, + "step": 20863 + }, + { + "epoch": 0.6944296729593193, + "grad_norm": 1.344826102256775, + "learning_rate": 1.2404711245201044e-05, + "loss": 1.3878, + "step": 20894 + }, + { + "epoch": 0.6954599840467961, + "grad_norm": 1.2611995935440063, + "learning_rate": 1.2332237294756535e-05, + "loss": 1.4088, + "step": 20925 + }, + { + "epoch": 0.6964902951342729, + "grad_norm": 1.3274885416030884, + "learning_rate": 1.225990629829241e-05, + "loss": 1.4036, + "step": 20956 + }, + { + "epoch": 0.6975206062217495, + "grad_norm": 1.2847373485565186, + "learning_rate": 1.2187719072054136e-05, + "loss": 1.398, + "step": 20987 + }, + { + "epoch": 0.6985509173092262, + "grad_norm": 1.2856248617172241, + "learning_rate": 1.2115676430664735e-05, + "loss": 1.4101, + "step": 21018 + }, + { + "epoch": 0.699581228396703, + "grad_norm": 1.3064154386520386, + "learning_rate": 1.2043779187115647e-05, + "loss": 1.4081, + "step": 21049 + }, + { + "epoch": 0.7006115394841798, + "grad_norm": 1.253602147102356, + "learning_rate": 1.1972028152757476e-05, + "loss": 1.4123, + "step": 21080 + }, + { + "epoch": 0.7016418505716565, + "grad_norm": 1.2678899765014648, + "learning_rate": 1.1900424137290889e-05, + "loss": 1.3969, + "step": 21111 + }, + { + "epoch": 0.7026721616591332, + "grad_norm": 1.2261760234832764, + "learning_rate": 1.1828967948757482e-05, + "loss": 1.4009, + "step": 21142 + }, + { + "epoch": 0.7037024727466099, + "grad_norm": 1.540486216545105, + "learning_rate": 1.175766039353062e-05, + "loss": 1.4215, + "step": 21173 + }, + { + "epoch": 0.7047327838340867, + "grad_norm": 1.2508059740066528, + "learning_rate": 1.1686502276306382e-05, + "loss": 1.4046, + "step": 21204 + }, + { + "epoch": 0.7057630949215634, + "grad_norm": 1.2918591499328613, + "learning_rate": 1.1615494400094445e-05, + "loss": 1.4301, + "step": 21235 + }, + { + "epoch": 0.7067934060090402, + "grad_norm": 1.240178108215332, + "learning_rate": 1.1544637566209029e-05, + "loss": 1.3888, + "step": 21266 + }, + { + "epoch": 0.7078237170965169, + "grad_norm": 1.2358977794647217, + "learning_rate": 1.1473932574259886e-05, + "loss": 1.415, + "step": 21297 + }, + { + "epoch": 0.7088540281839936, + "grad_norm": 1.2963451147079468, + "learning_rate": 1.1403380222143247e-05, + "loss": 1.4002, + "step": 21328 + }, + { + "epoch": 0.7098843392714703, + "grad_norm": 1.3245363235473633, + "learning_rate": 1.1332981306032808e-05, + "loss": 1.3945, + "step": 21359 + }, + { + "epoch": 0.7109146503589471, + "grad_norm": 1.2833342552185059, + "learning_rate": 1.1262736620370762e-05, + "loss": 1.4054, + "step": 21390 + }, + { + "epoch": 0.7119449614464238, + "grad_norm": 1.3230944871902466, + "learning_rate": 1.1192646957858854e-05, + "loss": 1.398, + "step": 21421 + }, + { + "epoch": 0.7129752725339006, + "grad_norm": 1.2515650987625122, + "learning_rate": 1.1122713109449381e-05, + "loss": 1.3958, + "step": 21452 + }, + { + "epoch": 0.7140055836213773, + "grad_norm": 1.313057780265808, + "learning_rate": 1.105293586433634e-05, + "loss": 1.3909, + "step": 21483 + }, + { + "epoch": 0.715035894708854, + "grad_norm": 1.2700668573379517, + "learning_rate": 1.0983316009946446e-05, + "loss": 1.3939, + "step": 21514 + }, + { + "epoch": 0.7160662057963307, + "grad_norm": 1.2487835884094238, + "learning_rate": 1.0913854331930282e-05, + "loss": 1.4162, + "step": 21545 + }, + { + "epoch": 0.7170965168838075, + "grad_norm": 1.2748737335205078, + "learning_rate": 1.0844551614153456e-05, + "loss": 1.3984, + "step": 21576 + }, + { + "epoch": 0.7181268279712842, + "grad_norm": 1.24228036403656, + "learning_rate": 1.0775408638687725e-05, + "loss": 1.4002, + "step": 21607 + }, + { + "epoch": 0.719157139058761, + "grad_norm": 1.3365492820739746, + "learning_rate": 1.0706426185802165e-05, + "loss": 1.4091, + "step": 21638 + }, + { + "epoch": 0.7201874501462378, + "grad_norm": 1.2073006629943848, + "learning_rate": 1.0637605033954371e-05, + "loss": 1.4034, + "step": 21669 + }, + { + "epoch": 0.7212177612337144, + "grad_norm": 1.2873163223266602, + "learning_rate": 1.05689459597817e-05, + "loss": 1.3994, + "step": 21700 + }, + { + "epoch": 0.7222480723211911, + "grad_norm": 1.3623207807540894, + "learning_rate": 1.050044973809246e-05, + "loss": 1.3827, + "step": 21731 + }, + { + "epoch": 0.7232783834086679, + "grad_norm": 1.256643533706665, + "learning_rate": 1.043211714185722e-05, + "loss": 1.3989, + "step": 21762 + }, + { + "epoch": 0.7243086944961447, + "grad_norm": 1.201434850692749, + "learning_rate": 1.036394894220003e-05, + "loss": 1.3892, + "step": 21793 + }, + { + "epoch": 0.7253390055836214, + "grad_norm": 1.335642695426941, + "learning_rate": 1.0295945908389751e-05, + "loss": 1.4077, + "step": 21824 + }, + { + "epoch": 0.7263693166710982, + "grad_norm": 1.252847671508789, + "learning_rate": 1.0228108807831393e-05, + "loss": 1.4077, + "step": 21855 + }, + { + "epoch": 0.7273996277585748, + "grad_norm": 1.3838329315185547, + "learning_rate": 1.01604384060574e-05, + "loss": 1.3944, + "step": 21886 + }, + { + "epoch": 0.7284299388460516, + "grad_norm": 1.3425817489624023, + "learning_rate": 1.009293546671907e-05, + "loss": 1.4067, + "step": 21917 + }, + { + "epoch": 0.7294602499335283, + "grad_norm": 1.3198227882385254, + "learning_rate": 1.002560075157791e-05, + "loss": 1.4043, + "step": 21948 + }, + { + "epoch": 0.7304905610210051, + "grad_norm": 1.3169294595718384, + "learning_rate": 9.958435020496995e-06, + "loss": 1.3743, + "step": 21979 + }, + { + "epoch": 0.7315208721084818, + "grad_norm": 1.2145452499389648, + "learning_rate": 9.89143903143249e-06, + "loss": 1.3875, + "step": 22010 + }, + { + "epoch": 0.7325511831959585, + "grad_norm": 1.368464469909668, + "learning_rate": 9.824613540425038e-06, + "loss": 1.3939, + "step": 22041 + }, + { + "epoch": 0.7335814942834352, + "grad_norm": 1.2481716871261597, + "learning_rate": 9.757959301591197e-06, + "loss": 1.4032, + "step": 22072 + }, + { + "epoch": 0.734611805370912, + "grad_norm": 1.225689172744751, + "learning_rate": 9.691477067115017e-06, + "loss": 1.4057, + "step": 22103 + }, + { + "epoch": 0.7356421164583887, + "grad_norm": 1.2322176694869995, + "learning_rate": 9.625167587239467e-06, + "loss": 1.3983, + "step": 22134 + }, + { + "epoch": 0.7366724275458655, + "grad_norm": 1.2423603534698486, + "learning_rate": 9.559031610258007e-06, + "loss": 1.4246, + "step": 22165 + }, + { + "epoch": 0.7377027386333422, + "grad_norm": 1.2707546949386597, + "learning_rate": 9.493069882506164e-06, + "loss": 1.4033, + "step": 22196 + }, + { + "epoch": 0.7387330497208189, + "grad_norm": 1.2819782495498657, + "learning_rate": 9.427283148353056e-06, + "loss": 1.3942, + "step": 22227 + }, + { + "epoch": 0.7397633608082956, + "grad_norm": 1.278111219406128, + "learning_rate": 9.361672150193052e-06, + "loss": 1.4124, + "step": 22258 + }, + { + "epoch": 0.7407936718957724, + "grad_norm": 1.2402000427246094, + "learning_rate": 9.29623762843734e-06, + "loss": 1.3784, + "step": 22289 + }, + { + "epoch": 0.7418239829832491, + "grad_norm": 1.2294648885726929, + "learning_rate": 9.230980321505594e-06, + "loss": 1.3998, + "step": 22320 + }, + { + "epoch": 0.7428542940707259, + "grad_norm": 1.3570529222488403, + "learning_rate": 9.165900965817668e-06, + "loss": 1.3867, + "step": 22351 + }, + { + "epoch": 0.7438846051582026, + "grad_norm": 1.2765589952468872, + "learning_rate": 9.101000295785245e-06, + "loss": 1.3848, + "step": 22382 + }, + { + "epoch": 0.7449149162456793, + "grad_norm": 1.301269292831421, + "learning_rate": 9.036279043803565e-06, + "loss": 1.3976, + "step": 22413 + }, + { + "epoch": 0.745945227333156, + "grad_norm": 1.3582361936569214, + "learning_rate": 8.971737940243147e-06, + "loss": 1.398, + "step": 22444 + }, + { + "epoch": 0.7469755384206328, + "grad_norm": 1.3054485321044922, + "learning_rate": 8.907377713441592e-06, + "loss": 1.402, + "step": 22475 + }, + { + "epoch": 0.7480058495081096, + "grad_norm": 1.2361812591552734, + "learning_rate": 8.843199089695293e-06, + "loss": 1.4097, + "step": 22506 + }, + { + "epoch": 0.7490361605955863, + "grad_norm": 1.2720493078231812, + "learning_rate": 8.779202793251311e-06, + "loss": 1.4046, + "step": 22537 + }, + { + "epoch": 0.7500664716830631, + "grad_norm": 1.2494639158248901, + "learning_rate": 8.715389546299149e-06, + "loss": 1.3858, + "step": 22568 + }, + { + "epoch": 0.7510967827705397, + "grad_norm": 1.2343871593475342, + "learning_rate": 8.651760068962617e-06, + "loss": 1.3896, + "step": 22599 + }, + { + "epoch": 0.7521270938580165, + "grad_norm": 1.1934345960617065, + "learning_rate": 8.588315079291733e-06, + "loss": 1.4095, + "step": 22630 + }, + { + "epoch": 0.7531574049454932, + "grad_norm": 1.2811630964279175, + "learning_rate": 8.52505529325457e-06, + "loss": 1.3954, + "step": 22661 + }, + { + "epoch": 0.75418771603297, + "grad_norm": 1.2676504850387573, + "learning_rate": 8.461981424729216e-06, + "loss": 1.3901, + "step": 22692 + }, + { + "epoch": 0.7552180271204467, + "grad_norm": 1.3221408128738403, + "learning_rate": 8.399094185495725e-06, + "loss": 1.4057, + "step": 22723 + }, + { + "epoch": 0.7562483382079235, + "grad_norm": 1.2741389274597168, + "learning_rate": 8.336394285228017e-06, + "loss": 1.3964, + "step": 22754 + }, + { + "epoch": 0.7572786492954001, + "grad_norm": 1.329860806465149, + "learning_rate": 8.273882431485952e-06, + "loss": 1.3946, + "step": 22785 + }, + { + "epoch": 0.7583089603828769, + "grad_norm": 1.3073118925094604, + "learning_rate": 8.211559329707316e-06, + "loss": 1.3937, + "step": 22816 + }, + { + "epoch": 0.7593392714703536, + "grad_norm": 1.2866522073745728, + "learning_rate": 8.149425683199823e-06, + "loss": 1.3999, + "step": 22847 + }, + { + "epoch": 0.7603695825578304, + "grad_norm": 1.2539178133010864, + "learning_rate": 8.08748219313325e-06, + "loss": 1.398, + "step": 22878 + }, + { + "epoch": 0.7613998936453071, + "grad_norm": 1.279863715171814, + "learning_rate": 8.025729558531453e-06, + "loss": 1.4155, + "step": 22909 + }, + { + "epoch": 0.7624302047327839, + "grad_norm": 1.2936811447143555, + "learning_rate": 7.964168476264508e-06, + "loss": 1.4036, + "step": 22940 + }, + { + "epoch": 0.7634605158202605, + "grad_norm": 1.2729599475860596, + "learning_rate": 7.902799641040884e-06, + "loss": 1.4003, + "step": 22971 + }, + { + "epoch": 0.7644908269077373, + "grad_norm": 1.2257497310638428, + "learning_rate": 7.841623745399523e-06, + "loss": 1.408, + "step": 23002 + }, + { + "epoch": 0.765521137995214, + "grad_norm": 1.254761815071106, + "learning_rate": 7.780641479702114e-06, + "loss": 1.3925, + "step": 23033 + }, + { + "epoch": 0.7665514490826908, + "grad_norm": 1.2740334272384644, + "learning_rate": 7.719853532125227e-06, + "loss": 1.3996, + "step": 23064 + }, + { + "epoch": 0.7675817601701675, + "grad_norm": 1.2421025037765503, + "learning_rate": 7.65926058865258e-06, + "loss": 1.3852, + "step": 23095 + }, + { + "epoch": 0.7686120712576442, + "grad_norm": 1.3271669149398804, + "learning_rate": 7.598863333067313e-06, + "loss": 1.408, + "step": 23126 + }, + { + "epoch": 0.769642382345121, + "grad_norm": 1.3040279150009155, + "learning_rate": 7.538662446944253e-06, + "loss": 1.3718, + "step": 23157 + }, + { + "epoch": 0.7706726934325977, + "grad_norm": 1.230797290802002, + "learning_rate": 7.478658609642211e-06, + "loss": 1.3776, + "step": 23188 + }, + { + "epoch": 0.7717030045200745, + "grad_norm": 1.2709274291992188, + "learning_rate": 7.418852498296327e-06, + "loss": 1.3975, + "step": 23219 + }, + { + "epoch": 0.7727333156075512, + "grad_norm": 1.227398157119751, + "learning_rate": 7.359244787810457e-06, + "loss": 1.382, + "step": 23250 + }, + { + "epoch": 0.773763626695028, + "grad_norm": 1.242308259010315, + "learning_rate": 7.299836150849493e-06, + "loss": 1.3792, + "step": 23281 + }, + { + "epoch": 0.7747939377825046, + "grad_norm": 1.2658405303955078, + "learning_rate": 7.240627257831847e-06, + "loss": 1.3699, + "step": 23312 + }, + { + "epoch": 0.7758242488699814, + "grad_norm": 1.3357101678848267, + "learning_rate": 7.1816187769218195e-06, + "loss": 1.3972, + "step": 23343 + }, + { + "epoch": 0.7768545599574581, + "grad_norm": 1.2248833179473877, + "learning_rate": 7.1228113740220895e-06, + "loss": 1.3875, + "step": 23374 + }, + { + "epoch": 0.7778848710449349, + "grad_norm": 1.2615251541137695, + "learning_rate": 7.064205712766226e-06, + "loss": 1.3947, + "step": 23405 + }, + { + "epoch": 0.7789151821324116, + "grad_norm": 1.2719477415084839, + "learning_rate": 7.005802454511129e-06, + "loss": 1.3943, + "step": 23436 + }, + { + "epoch": 0.7799454932198884, + "grad_norm": 1.2429877519607544, + "learning_rate": 6.947602258329639e-06, + "loss": 1.3924, + "step": 23467 + }, + { + "epoch": 0.780975804307365, + "grad_norm": 1.3180112838745117, + "learning_rate": 6.889605781003078e-06, + "loss": 1.4095, + "step": 23498 + }, + { + "epoch": 0.7820061153948418, + "grad_norm": 1.3340109586715698, + "learning_rate": 6.831813677013776e-06, + "loss": 1.3873, + "step": 23529 + }, + { + "epoch": 0.7830364264823185, + "grad_norm": 1.2713093757629395, + "learning_rate": 6.774226598537792e-06, + "loss": 1.3882, + "step": 23560 + }, + { + "epoch": 0.7840667375697953, + "grad_norm": 1.2504241466522217, + "learning_rate": 6.716845195437482e-06, + "loss": 1.3795, + "step": 23591 + }, + { + "epoch": 0.785097048657272, + "grad_norm": 1.273703694343567, + "learning_rate": 6.659670115254168e-06, + "loss": 1.3819, + "step": 23622 + }, + { + "epoch": 0.7861273597447488, + "grad_norm": 1.3121949434280396, + "learning_rate": 6.602702003200872e-06, + "loss": 1.3827, + "step": 23653 + }, + { + "epoch": 0.7871576708322254, + "grad_norm": 1.2552127838134766, + "learning_rate": 6.545941502154992e-06, + "loss": 1.3935, + "step": 23684 + }, + { + "epoch": 0.7881879819197022, + "grad_norm": 1.2457008361816406, + "learning_rate": 6.489389252651057e-06, + "loss": 1.3847, + "step": 23715 + }, + { + "epoch": 0.7892182930071789, + "grad_norm": 1.2819870710372925, + "learning_rate": 6.4330458928735325e-06, + "loss": 1.3965, + "step": 23746 + }, + { + "epoch": 0.7902486040946557, + "grad_norm": 1.2543584108352661, + "learning_rate": 6.376912058649559e-06, + "loss": 1.4025, + "step": 23777 + }, + { + "epoch": 0.7912789151821324, + "grad_norm": 1.2502461671829224, + "learning_rate": 6.320988383441845e-06, + "loss": 1.3799, + "step": 23808 + }, + { + "epoch": 0.7923092262696092, + "grad_norm": 1.2568906545639038, + "learning_rate": 6.265275498341452e-06, + "loss": 1.3887, + "step": 23839 + }, + { + "epoch": 0.7933395373570858, + "grad_norm": 1.2879040241241455, + "learning_rate": 6.209774032060714e-06, + "loss": 1.3922, + "step": 23870 + }, + { + "epoch": 0.7943698484445626, + "grad_norm": 1.2547533512115479, + "learning_rate": 6.1544846109261365e-06, + "loss": 1.3891, + "step": 23901 + }, + { + "epoch": 0.7954001595320394, + "grad_norm": 1.2941306829452515, + "learning_rate": 6.099407858871342e-06, + "loss": 1.3914, + "step": 23932 + }, + { + "epoch": 0.7964304706195161, + "grad_norm": 1.3194507360458374, + "learning_rate": 6.044544397429958e-06, + "loss": 1.3857, + "step": 23963 + }, + { + "epoch": 0.7974607817069929, + "grad_norm": 1.2143921852111816, + "learning_rate": 5.989894845728708e-06, + "loss": 1.4041, + "step": 23994 + }, + { + "epoch": 0.7984910927944695, + "grad_norm": 1.2587990760803223, + "learning_rate": 5.9354598204803605e-06, + "loss": 1.3901, + "step": 24025 + }, + { + "epoch": 0.7995214038819463, + "grad_norm": 1.2482203245162964, + "learning_rate": 5.881239935976762e-06, + "loss": 1.384, + "step": 24056 + }, + { + "epoch": 0.800551714969423, + "grad_norm": 1.2880163192749023, + "learning_rate": 5.827235804081954e-06, + "loss": 1.3876, + "step": 24087 + }, + { + "epoch": 0.8015820260568998, + "grad_norm": 1.2727841138839722, + "learning_rate": 5.773448034225221e-06, + "loss": 1.3752, + "step": 24118 + }, + { + "epoch": 0.8026123371443765, + "grad_norm": 1.2767062187194824, + "learning_rate": 5.719877233394228e-06, + "loss": 1.4, + "step": 24149 + }, + { + "epoch": 0.8036426482318533, + "grad_norm": 1.2654463052749634, + "learning_rate": 5.666524006128191e-06, + "loss": 1.39, + "step": 24180 + }, + { + "epoch": 0.8046729593193299, + "grad_norm": 1.2623034715652466, + "learning_rate": 5.613388954511015e-06, + "loss": 1.3885, + "step": 24211 + }, + { + "epoch": 0.8057032704068067, + "grad_norm": 1.303368330001831, + "learning_rate": 5.560472678164552e-06, + "loss": 1.3933, + "step": 24242 + }, + { + "epoch": 0.8067335814942834, + "grad_norm": 1.232909917831421, + "learning_rate": 5.507775774241775e-06, + "loss": 1.3897, + "step": 24273 + }, + { + "epoch": 0.8077638925817602, + "grad_norm": 1.3074171543121338, + "learning_rate": 5.4552988374200945e-06, + "loss": 1.3836, + "step": 24304 + }, + { + "epoch": 0.8087942036692369, + "grad_norm": 1.287463903427124, + "learning_rate": 5.403042459894597e-06, + "loss": 1.3889, + "step": 24335 + }, + { + "epoch": 0.8098245147567137, + "grad_norm": 1.2616747617721558, + "learning_rate": 5.3510072313714135e-06, + "loss": 1.3978, + "step": 24366 + }, + { + "epoch": 0.8108548258441903, + "grad_norm": 1.2531288862228394, + "learning_rate": 5.2991937390610205e-06, + "loss": 1.4116, + "step": 24397 + }, + { + "epoch": 0.8118851369316671, + "grad_norm": 1.2136998176574707, + "learning_rate": 5.247602567671625e-06, + "loss": 1.3794, + "step": 24428 + }, + { + "epoch": 0.8129154480191438, + "grad_norm": 1.3023301362991333, + "learning_rate": 5.196234299402603e-06, + "loss": 1.3868, + "step": 24459 + }, + { + "epoch": 0.8139457591066206, + "grad_norm": 1.2590848207473755, + "learning_rate": 5.145089513937865e-06, + "loss": 1.3865, + "step": 24490 + }, + { + "epoch": 0.8149760701940973, + "grad_norm": 1.2516260147094727, + "learning_rate": 5.094168788439369e-06, + "loss": 1.3923, + "step": 24521 + }, + { + "epoch": 0.8160063812815741, + "grad_norm": 1.2341543436050415, + "learning_rate": 5.043472697540594e-06, + "loss": 1.3824, + "step": 24552 + }, + { + "epoch": 0.8170366923690507, + "grad_norm": 1.3493062257766724, + "learning_rate": 4.993001813340012e-06, + "loss": 1.4024, + "step": 24583 + }, + { + "epoch": 0.8180670034565275, + "grad_norm": 1.271795392036438, + "learning_rate": 4.942756705394702e-06, + "loss": 1.3821, + "step": 24614 + }, + { + "epoch": 0.8190973145440043, + "grad_norm": 1.3145335912704468, + "learning_rate": 4.892737940713884e-06, + "loss": 1.3786, + "step": 24645 + }, + { + "epoch": 0.820127625631481, + "grad_norm": 1.3532222509384155, + "learning_rate": 4.842946083752511e-06, + "loss": 1.3981, + "step": 24676 + }, + { + "epoch": 0.8211579367189578, + "grad_norm": 1.3181504011154175, + "learning_rate": 4.79338169640493e-06, + "loss": 1.3916, + "step": 24707 + }, + { + "epoch": 0.8221882478064345, + "grad_norm": 1.267794132232666, + "learning_rate": 4.74404533799851e-06, + "loss": 1.3768, + "step": 24738 + }, + { + "epoch": 0.8232185588939112, + "grad_norm": 1.2763338088989258, + "learning_rate": 4.694937565287344e-06, + "loss": 1.3972, + "step": 24769 + }, + { + "epoch": 0.8242488699813879, + "grad_norm": 1.2626184225082397, + "learning_rate": 4.646058932445985e-06, + "loss": 1.3815, + "step": 24800 + }, + { + "epoch": 0.8252791810688647, + "grad_norm": 1.1800566911697388, + "learning_rate": 4.597409991063148e-06, + "loss": 1.3949, + "step": 24831 + }, + { + "epoch": 0.8263094921563414, + "grad_norm": 1.2157528400421143, + "learning_rate": 4.5489912901355375e-06, + "loss": 1.3783, + "step": 24862 + }, + { + "epoch": 0.8273398032438182, + "grad_norm": 1.3244526386260986, + "learning_rate": 4.500803376061608e-06, + "loss": 1.3861, + "step": 24893 + }, + { + "epoch": 0.8283701143312948, + "grad_norm": 1.2245334386825562, + "learning_rate": 4.45284679263541e-06, + "loss": 1.3817, + "step": 24924 + }, + { + "epoch": 0.8294004254187716, + "grad_norm": 1.2566081285476685, + "learning_rate": 4.4051220810404775e-06, + "loss": 1.3979, + "step": 24955 + }, + { + "epoch": 0.8304307365062483, + "grad_norm": 1.2556860446929932, + "learning_rate": 4.3576297798437025e-06, + "loss": 1.3826, + "step": 24986 + }, + { + "epoch": 0.8314610475937251, + "grad_norm": 1.2634494304656982, + "learning_rate": 4.3103704249892436e-06, + "loss": 1.3733, + "step": 25017 + }, + { + "epoch": 0.8324913586812018, + "grad_norm": 1.234903335571289, + "learning_rate": 4.263344549792487e-06, + "loss": 1.3815, + "step": 25048 + }, + { + "epoch": 0.8335216697686786, + "grad_norm": 1.3948299884796143, + "learning_rate": 4.216552684934056e-06, + "loss": 1.402, + "step": 25079 + }, + { + "epoch": 0.8345519808561552, + "grad_norm": 1.363745093345642, + "learning_rate": 4.169995358453777e-06, + "loss": 1.3872, + "step": 25110 + }, + { + "epoch": 0.835582291943632, + "grad_norm": 1.354319453239441, + "learning_rate": 4.123673095744757e-06, + "loss": 1.3817, + "step": 25141 + }, + { + "epoch": 0.8366126030311087, + "grad_norm": 1.2999165058135986, + "learning_rate": 4.077586419547435e-06, + "loss": 1.3806, + "step": 25172 + }, + { + "epoch": 0.8376429141185855, + "grad_norm": 1.2431261539459229, + "learning_rate": 4.03173584994368e-06, + "loss": 1.3724, + "step": 25203 + }, + { + "epoch": 0.8386732252060622, + "grad_norm": 1.2831732034683228, + "learning_rate": 3.986121904350948e-06, + "loss": 1.4055, + "step": 25234 + }, + { + "epoch": 0.839703536293539, + "grad_norm": 1.2473969459533691, + "learning_rate": 3.940745097516407e-06, + "loss": 1.3804, + "step": 25265 + }, + { + "epoch": 0.8407338473810156, + "grad_norm": 1.2680081129074097, + "learning_rate": 3.89560594151116e-06, + "loss": 1.3971, + "step": 25296 + }, + { + "epoch": 0.8417641584684924, + "grad_norm": 1.3049360513687134, + "learning_rate": 3.850704945724456e-06, + "loss": 1.3883, + "step": 25327 + }, + { + "epoch": 0.8427944695559692, + "grad_norm": 1.3096522092819214, + "learning_rate": 3.8060426168579077e-06, + "loss": 1.3932, + "step": 25358 + }, + { + "epoch": 0.8438247806434459, + "grad_norm": 1.2855119705200195, + "learning_rate": 3.7616194589198407e-06, + "loss": 1.3953, + "step": 25389 + }, + { + "epoch": 0.8448550917309227, + "grad_norm": 1.2272716760635376, + "learning_rate": 3.7174359732195574e-06, + "loss": 1.3924, + "step": 25420 + }, + { + "epoch": 0.8458854028183994, + "grad_norm": 1.2750498056411743, + "learning_rate": 3.673492658361677e-06, + "loss": 1.3857, + "step": 25451 + }, + { + "epoch": 0.846915713905876, + "grad_norm": 1.2702478170394897, + "learning_rate": 3.6297900102405467e-06, + "loss": 1.3833, + "step": 25482 + }, + { + "epoch": 0.8479460249933528, + "grad_norm": 1.3162232637405396, + "learning_rate": 3.586328522034607e-06, + "loss": 1.3936, + "step": 25513 + }, + { + "epoch": 0.8489763360808296, + "grad_norm": 1.228898048400879, + "learning_rate": 3.543108684200838e-06, + "loss": 1.376, + "step": 25544 + }, + { + "epoch": 0.8500066471683063, + "grad_norm": 1.2657815217971802, + "learning_rate": 3.5001309844692464e-06, + "loss": 1.3827, + "step": 25575 + }, + { + "epoch": 0.8510369582557831, + "grad_norm": 1.252999186515808, + "learning_rate": 3.4573959078373215e-06, + "loss": 1.3706, + "step": 25606 + }, + { + "epoch": 0.8520672693432598, + "grad_norm": 1.4515488147735596, + "learning_rate": 3.4149039365646063e-06, + "loss": 1.3928, + "step": 25637 + }, + { + "epoch": 0.8530975804307365, + "grad_norm": 1.2513251304626465, + "learning_rate": 3.3726555501672143e-06, + "loss": 1.3763, + "step": 25668 + }, + { + "epoch": 0.8541278915182132, + "grad_norm": 1.311325192451477, + "learning_rate": 3.33065122541244e-06, + "loss": 1.3807, + "step": 25699 + }, + { + "epoch": 0.85515820260569, + "grad_norm": 1.2587943077087402, + "learning_rate": 3.288891436313385e-06, + "loss": 1.3802, + "step": 25730 + }, + { + "epoch": 0.8561885136931667, + "grad_norm": 1.2624818086624146, + "learning_rate": 3.2473766541235963e-06, + "loss": 1.3915, + "step": 25761 + }, + { + "epoch": 0.8572188247806435, + "grad_norm": 1.2625864744186401, + "learning_rate": 3.2061073473317466e-06, + "loss": 1.3855, + "step": 25792 + }, + { + "epoch": 0.8582491358681201, + "grad_norm": 1.2889775037765503, + "learning_rate": 3.1650839816563444e-06, + "loss": 1.3942, + "step": 25823 + }, + { + "epoch": 0.8592794469555969, + "grad_norm": 1.2399699687957764, + "learning_rate": 3.1243070200405093e-06, + "loss": 1.3929, + "step": 25854 + }, + { + "epoch": 0.8603097580430736, + "grad_norm": 1.2660589218139648, + "learning_rate": 3.0837769226467e-06, + "loss": 1.3647, + "step": 25885 + }, + { + "epoch": 0.8613400691305504, + "grad_norm": 1.2619723081588745, + "learning_rate": 3.0434941468515666e-06, + "loss": 1.3804, + "step": 25916 + }, + { + "epoch": 0.8623703802180271, + "grad_norm": 1.3124239444732666, + "learning_rate": 3.003459147240753e-06, + "loss": 1.368, + "step": 25947 + }, + { + "epoch": 0.8634006913055039, + "grad_norm": 1.2878339290618896, + "learning_rate": 2.9636723756037875e-06, + "loss": 1.3835, + "step": 25978 + }, + { + "epoch": 0.8644310023929805, + "grad_norm": 1.2607743740081787, + "learning_rate": 2.9241342809289833e-06, + "loss": 1.3933, + "step": 26009 + }, + { + "epoch": 0.8654613134804573, + "grad_norm": 1.2619109153747559, + "learning_rate": 2.8848453093983594e-06, + "loss": 1.3881, + "step": 26040 + }, + { + "epoch": 0.866491624567934, + "grad_norm": 1.3922829627990723, + "learning_rate": 2.8458059043826257e-06, + "loss": 1.3741, + "step": 26071 + }, + { + "epoch": 0.8675219356554108, + "grad_norm": 1.3063180446624756, + "learning_rate": 2.807016506436172e-06, + "loss": 1.3987, + "step": 26102 + }, + { + "epoch": 0.8685522467428876, + "grad_norm": 1.3027793169021606, + "learning_rate": 2.7684775532920566e-06, + "loss": 1.3833, + "step": 26133 + }, + { + "epoch": 0.8695825578303643, + "grad_norm": 1.286738395690918, + "learning_rate": 2.7301894798571425e-06, + "loss": 1.3983, + "step": 26164 + }, + { + "epoch": 0.870612868917841, + "grad_norm": 1.2284873723983765, + "learning_rate": 2.6921527182071386e-06, + "loss": 1.3806, + "step": 26195 + }, + { + "epoch": 0.8716431800053177, + "grad_norm": 1.282870888710022, + "learning_rate": 2.654367697581725e-06, + "loss": 1.3864, + "step": 26226 + }, + { + "epoch": 0.8726734910927945, + "grad_norm": 1.2854727506637573, + "learning_rate": 2.6168348443797175e-06, + "loss": 1.3615, + "step": 26257 + }, + { + "epoch": 0.8737038021802712, + "grad_norm": 1.2982513904571533, + "learning_rate": 2.5795545821542757e-06, + "loss": 1.3636, + "step": 26288 + }, + { + "epoch": 0.874734113267748, + "grad_norm": 1.3433053493499756, + "learning_rate": 2.54252733160808e-06, + "loss": 1.3792, + "step": 26319 + }, + { + "epoch": 0.8757644243552247, + "grad_norm": 1.2748687267303467, + "learning_rate": 2.5057535105886294e-06, + "loss": 1.3822, + "step": 26350 + }, + { + "epoch": 0.8767947354427014, + "grad_norm": 1.1860417127609253, + "learning_rate": 2.4692335340834953e-06, + "loss": 1.3812, + "step": 26381 + }, + { + "epoch": 0.8778250465301781, + "grad_norm": 1.3041021823883057, + "learning_rate": 2.432967814215639e-06, + "loss": 1.3919, + "step": 26412 + }, + { + "epoch": 0.8788553576176549, + "grad_norm": 1.3307725191116333, + "learning_rate": 2.396956760238794e-06, + "loss": 1.3737, + "step": 26443 + }, + { + "epoch": 0.8798856687051316, + "grad_norm": 1.3257086277008057, + "learning_rate": 2.361200778532796e-06, + "loss": 1.3729, + "step": 26474 + }, + { + "epoch": 0.8809159797926084, + "grad_norm": 1.4235222339630127, + "learning_rate": 2.325700272599049e-06, + "loss": 1.3892, + "step": 26505 + }, + { + "epoch": 0.8819462908800851, + "grad_norm": 1.3324629068374634, + "learning_rate": 2.2904556430559415e-06, + "loss": 1.3958, + "step": 26536 + }, + { + "epoch": 0.8829766019675618, + "grad_norm": 1.2705706357955933, + "learning_rate": 2.2554672876343106e-06, + "loss": 1.369, + "step": 26567 + }, + { + "epoch": 0.8840069130550385, + "grad_norm": 1.2910141944885254, + "learning_rate": 2.220735601173002e-06, + "loss": 1.3764, + "step": 26598 + }, + { + "epoch": 0.8850372241425153, + "grad_norm": 1.312762975692749, + "learning_rate": 2.186260975614382e-06, + "loss": 1.3689, + "step": 26629 + }, + { + "epoch": 0.886067535229992, + "grad_norm": 1.279833197593689, + "learning_rate": 2.1520437999999034e-06, + "loss": 1.3816, + "step": 26660 + }, + { + "epoch": 0.8870978463174688, + "grad_norm": 1.312485933303833, + "learning_rate": 2.1180844604657526e-06, + "loss": 1.3935, + "step": 26691 + }, + { + "epoch": 0.8881281574049454, + "grad_norm": 1.2287721633911133, + "learning_rate": 2.084383340238455e-06, + "loss": 1.3721, + "step": 26722 + }, + { + "epoch": 0.8891584684924222, + "grad_norm": 1.2619805335998535, + "learning_rate": 2.0509408196305704e-06, + "loss": 1.3874, + "step": 26753 + }, + { + "epoch": 0.890188779579899, + "grad_norm": 1.3075838088989258, + "learning_rate": 2.017757276036403e-06, + "loss": 1.3888, + "step": 26784 + }, + { + "epoch": 0.8912190906673757, + "grad_norm": 1.257625937461853, + "learning_rate": 1.984833083927726e-06, + "loss": 1.3814, + "step": 26815 + }, + { + "epoch": 0.8922494017548525, + "grad_norm": 1.2962384223937988, + "learning_rate": 1.952168614849581e-06, + "loss": 1.3762, + "step": 26846 + }, + { + "epoch": 0.8932797128423292, + "grad_norm": 1.277114748954773, + "learning_rate": 1.919764237416058e-06, + "loss": 1.3709, + "step": 26877 + }, + { + "epoch": 0.8943100239298059, + "grad_norm": 1.3202005624771118, + "learning_rate": 1.8876203173061463e-06, + "loss": 1.3974, + "step": 26908 + }, + { + "epoch": 0.8953403350172826, + "grad_norm": 1.2782710790634155, + "learning_rate": 1.8557372172596206e-06, + "loss": 1.3882, + "step": 26939 + }, + { + "epoch": 0.8963706461047594, + "grad_norm": 1.1860815286636353, + "learning_rate": 1.8241152970729341e-06, + "loss": 1.359, + "step": 26970 + }, + { + "epoch": 0.8974009571922361, + "grad_norm": 1.2500203847885132, + "learning_rate": 1.7927549135951572e-06, + "loss": 1.3858, + "step": 27001 + }, + { + "epoch": 0.8984312682797129, + "grad_norm": 1.264669418334961, + "learning_rate": 1.7616564207239477e-06, + "loss": 1.3849, + "step": 27032 + }, + { + "epoch": 0.8994615793671896, + "grad_norm": 1.2666518688201904, + "learning_rate": 1.730820169401584e-06, + "loss": 1.4078, + "step": 27063 + }, + { + "epoch": 0.9004918904546663, + "grad_norm": 1.2911863327026367, + "learning_rate": 1.7002465076109558e-06, + "loss": 1.3714, + "step": 27094 + }, + { + "epoch": 0.901522201542143, + "grad_norm": 1.3311351537704468, + "learning_rate": 1.6699357803716898e-06, + "loss": 1.3852, + "step": 27125 + }, + { + "epoch": 0.9025525126296198, + "grad_norm": 1.2619616985321045, + "learning_rate": 1.6398883297362305e-06, + "loss": 1.3778, + "step": 27156 + }, + { + "epoch": 0.9035828237170965, + "grad_norm": 1.245505452156067, + "learning_rate": 1.6101044947859606e-06, + "loss": 1.3928, + "step": 27187 + }, + { + "epoch": 0.9046131348045733, + "grad_norm": 1.2463428974151611, + "learning_rate": 1.5805846116274114e-06, + "loss": 1.373, + "step": 27218 + }, + { + "epoch": 0.90564344589205, + "grad_norm": 1.2582367658615112, + "learning_rate": 1.5513290133884611e-06, + "loss": 1.3829, + "step": 27249 + }, + { + "epoch": 0.9066737569795267, + "grad_norm": 1.3230143785476685, + "learning_rate": 1.5223380302145512e-06, + "loss": 1.3705, + "step": 27280 + }, + { + "epoch": 0.9077040680670034, + "grad_norm": 1.2450213432312012, + "learning_rate": 1.4936119892649925e-06, + "loss": 1.3825, + "step": 27311 + }, + { + "epoch": 0.9087343791544802, + "grad_norm": 1.3045326471328735, + "learning_rate": 1.4651512147092482e-06, + "loss": 1.3619, + "step": 27342 + }, + { + "epoch": 0.9097646902419569, + "grad_norm": 1.3278846740722656, + "learning_rate": 1.4369560277232908e-06, + "loss": 1.3791, + "step": 27373 + }, + { + "epoch": 0.9107950013294337, + "grad_norm": 1.355610728263855, + "learning_rate": 1.409026746485978e-06, + "loss": 1.3973, + "step": 27404 + }, + { + "epoch": 0.9118253124169104, + "grad_norm": 1.219814658164978, + "learning_rate": 1.3813636861754464e-06, + "loss": 1.3874, + "step": 27435 + }, + { + "epoch": 0.9128556235043871, + "grad_norm": 1.27649986743927, + "learning_rate": 1.3539671589655773e-06, + "loss": 1.3752, + "step": 27466 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.037550548620044e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-27468/training_args.bin b/checkpoint-27468/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..974208468b82a3c5684aaa384776477cf21c18ca --- /dev/null +++ b/checkpoint-27468/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a23be0ff07d6d3142f7c0980f91dddba845519c24fcb411cbb4b9ddb1513ff +size 5304 diff --git a/checkpoint-30517/config.json b/checkpoint-30517/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aaa74176892d42e1c7f5979b7ddf8ab15985d3 --- /dev/null +++ b/checkpoint-30517/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "/mnt/parscratch/users/acp23ay/private/models/Llama-3.1-8B-Instruct-ta-madlad-mean/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 138256 +} diff --git a/checkpoint-30517/generation_config.json b/checkpoint-30517/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-30517/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-30517/model-00001-of-00007.safetensors b/checkpoint-30517/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2daf804d7e6e513fbf46d8ab1552516bca5fe6cb --- /dev/null +++ b/checkpoint-30517/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46d49734b69cc2edb5667850ac9486f5fd2b23d7175f2aa4edbdab66f483dcff +size 4983197184 diff --git a/checkpoint-30517/model-00002-of-00007.safetensors b/checkpoint-30517/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d32d83cb96a109af411b9cf577e7fbfe07ea76fc --- /dev/null +++ b/checkpoint-30517/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168629c67732309d49f60a5ec48a6d160212bc987365e82e183dfbf74ba0c1f3 +size 4899116432 diff --git a/checkpoint-30517/model-00003-of-00007.safetensors b/checkpoint-30517/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-30517/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-30517/model-00004-of-00007.safetensors b/checkpoint-30517/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-30517/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-30517/model-00005-of-00007.safetensors b/checkpoint-30517/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-30517/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-30517/model-00006-of-00007.safetensors b/checkpoint-30517/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4007112580e058a26d0427e97b8ff6f856ba66bc --- /dev/null +++ b/checkpoint-30517/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0cfa5a25a356ba23308aab1ccf95e47e7b15c0420612d1d30a063442a2c57d +size 4999813120 diff --git a/checkpoint-30517/model-00007-of-00007.safetensors b/checkpoint-30517/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..062439748e773713d4fac56147dc9c0d9ee41736 --- /dev/null +++ b/checkpoint-30517/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:477a55c32130f695f1b4ca9b7aa96ca1bd9aa24b439b5233ee9fe5662f217d42 +size 2734998184 diff --git a/checkpoint-30517/model.safetensors.index.json b/checkpoint-30517/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..318803c6a3dd771c7f7c3b8038a896af7c8322ae --- /dev/null +++ b/checkpoint-30517/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32448724992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-30517/optimizer.pt b/checkpoint-30517/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..159793f552dc491720ee84d2e83dfb218eb0ebc1 --- /dev/null +++ b/checkpoint-30517/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84233f5fa6b1c6faa7bbfc2cdb9fb64f9c4643179a5956376e2fc60296523a88 +size 16040396334 diff --git a/checkpoint-30517/rng_state.pth b/checkpoint-30517/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ece67508ffa7f89d1f8e8b4e514d0551447e32a9 --- /dev/null +++ b/checkpoint-30517/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b66e3cc7c452b707ddac5caf0aa17618afb9bc1a0333600a22c4afb353f3165 +size 14244 diff --git a/checkpoint-30517/scheduler.pt b/checkpoint-30517/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d585b62f4a9bc8119a4f1d03cf2bb269e99411b --- /dev/null +++ b/checkpoint-30517/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d91c597306f26392e8da0d33a09fdcca77cfdeb5ad42248045521772fa3d64e +size 1064 diff --git a/checkpoint-30517/trainer_state.json b/checkpoint-30517/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bce034ebbdd893c56e4e4b1dca27d32fb8b4f348 --- /dev/null +++ b/checkpoint-30517/trainer_state.json @@ -0,0 +1,6921 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0142581760170168, + "eval_steps": 500, + "global_step": 30517, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001030311087476735, + "grad_norm": 60.25547409057617, + "learning_rate": 1.0157273918741808e-06, + "loss": 8.8455, + "step": 31 + }, + { + "epoch": 0.00206062217495347, + "grad_norm": 15.669363975524902, + "learning_rate": 2.0314547837483616e-06, + "loss": 7.1553, + "step": 62 + }, + { + "epoch": 0.003090933262430205, + "grad_norm": 15.366345405578613, + "learning_rate": 3.0471821756225426e-06, + "loss": 5.8784, + "step": 93 + }, + { + "epoch": 0.00412124434990694, + "grad_norm": 36.30561828613281, + "learning_rate": 4.062909567496723e-06, + "loss": 4.7708, + "step": 124 + }, + { + "epoch": 0.005151555437383675, + "grad_norm": 27.202678680419922, + "learning_rate": 5.078636959370905e-06, + "loss": 4.1629, + "step": 155 + }, + { + "epoch": 0.00618186652486041, + "grad_norm": 24.30484962463379, + "learning_rate": 6.094364351245085e-06, + "loss": 3.867, + "step": 186 + }, + { + "epoch": 0.007212177612337145, + "grad_norm": 19.916366577148438, + "learning_rate": 7.110091743119267e-06, + "loss": 3.6131, + "step": 217 + }, + { + "epoch": 0.00824248869981388, + "grad_norm": 17.577274322509766, + "learning_rate": 8.125819134993446e-06, + "loss": 3.4772, + "step": 248 + }, + { + "epoch": 0.009272799787290615, + "grad_norm": 12.133190155029297, + "learning_rate": 9.141546526867629e-06, + "loss": 3.3218, + "step": 279 + }, + { + "epoch": 0.01030311087476735, + "grad_norm": 19.79263687133789, + "learning_rate": 1.015727391874181e-05, + "loss": 3.2055, + "step": 310 + }, + { + "epoch": 0.011333421962244085, + "grad_norm": 16.38133430480957, + "learning_rate": 1.117300131061599e-05, + "loss": 3.1062, + "step": 341 + }, + { + "epoch": 0.01236373304972082, + "grad_norm": 12.638299942016602, + "learning_rate": 1.218872870249017e-05, + "loss": 3.0106, + "step": 372 + }, + { + "epoch": 0.013394044137197554, + "grad_norm": 9.46596908569336, + "learning_rate": 1.3204456094364351e-05, + "loss": 2.924, + "step": 403 + }, + { + "epoch": 0.01442435522467429, + "grad_norm": 10.945392608642578, + "learning_rate": 1.4220183486238533e-05, + "loss": 2.844, + "step": 434 + }, + { + "epoch": 0.015454666312151024, + "grad_norm": 8.474015235900879, + "learning_rate": 1.5235910878112714e-05, + "loss": 2.7892, + "step": 465 + }, + { + "epoch": 0.01648497739962776, + "grad_norm": 9.370804786682129, + "learning_rate": 1.6251638269986893e-05, + "loss": 2.7509, + "step": 496 + }, + { + "epoch": 0.017515288487104493, + "grad_norm": 11.63398551940918, + "learning_rate": 1.7267365661861077e-05, + "loss": 2.6999, + "step": 527 + }, + { + "epoch": 0.01854559957458123, + "grad_norm": 9.17713451385498, + "learning_rate": 1.8283093053735257e-05, + "loss": 2.6459, + "step": 558 + }, + { + "epoch": 0.019575910662057962, + "grad_norm": 7.119054794311523, + "learning_rate": 1.9298820445609438e-05, + "loss": 2.603, + "step": 589 + }, + { + "epoch": 0.0206062217495347, + "grad_norm": 6.653646945953369, + "learning_rate": 2.031454783748362e-05, + "loss": 2.5588, + "step": 620 + }, + { + "epoch": 0.021636532837011432, + "grad_norm": 8.332653045654297, + "learning_rate": 2.13302752293578e-05, + "loss": 2.5357, + "step": 651 + }, + { + "epoch": 0.02266684392448817, + "grad_norm": 6.4949116706848145, + "learning_rate": 2.234600262123198e-05, + "loss": 2.4967, + "step": 682 + }, + { + "epoch": 0.023697155011964902, + "grad_norm": 9.41009521484375, + "learning_rate": 2.336173001310616e-05, + "loss": 2.4563, + "step": 713 + }, + { + "epoch": 0.02472746609944164, + "grad_norm": 7.840345859527588, + "learning_rate": 2.437745740498034e-05, + "loss": 2.4383, + "step": 744 + }, + { + "epoch": 0.025757777186918372, + "grad_norm": 6.116458415985107, + "learning_rate": 2.5393184796854525e-05, + "loss": 2.3817, + "step": 775 + }, + { + "epoch": 0.02678808827439511, + "grad_norm": 5.938300609588623, + "learning_rate": 2.6408912188728702e-05, + "loss": 2.3508, + "step": 806 + }, + { + "epoch": 0.027818399361871842, + "grad_norm": 5.4408345222473145, + "learning_rate": 2.7424639580602886e-05, + "loss": 2.3325, + "step": 837 + }, + { + "epoch": 0.02884871044934858, + "grad_norm": 5.375136375427246, + "learning_rate": 2.8440366972477066e-05, + "loss": 2.3101, + "step": 868 + }, + { + "epoch": 0.029879021536825312, + "grad_norm": 5.149726867675781, + "learning_rate": 2.9456094364351244e-05, + "loss": 2.282, + "step": 899 + }, + { + "epoch": 0.03090933262430205, + "grad_norm": 4.591221332550049, + "learning_rate": 3.0471821756225428e-05, + "loss": 2.2427, + "step": 930 + }, + { + "epoch": 0.031939643711778785, + "grad_norm": 4.977034091949463, + "learning_rate": 3.148754914809961e-05, + "loss": 2.2218, + "step": 961 + }, + { + "epoch": 0.03296995479925552, + "grad_norm": 5.038781642913818, + "learning_rate": 3.2503276539973785e-05, + "loss": 2.2044, + "step": 992 + }, + { + "epoch": 0.03400026588673225, + "grad_norm": 4.872281551361084, + "learning_rate": 3.351900393184797e-05, + "loss": 2.1657, + "step": 1023 + }, + { + "epoch": 0.035030576974208985, + "grad_norm": 4.370841979980469, + "learning_rate": 3.453473132372215e-05, + "loss": 2.1365, + "step": 1054 + }, + { + "epoch": 0.036060888061685725, + "grad_norm": 4.087072849273682, + "learning_rate": 3.555045871559633e-05, + "loss": 2.1253, + "step": 1085 + }, + { + "epoch": 0.03709119914916246, + "grad_norm": 4.113957405090332, + "learning_rate": 3.6566186107470514e-05, + "loss": 2.0973, + "step": 1116 + }, + { + "epoch": 0.03812151023663919, + "grad_norm": 4.0119733810424805, + "learning_rate": 3.7581913499344695e-05, + "loss": 2.1024, + "step": 1147 + }, + { + "epoch": 0.039151821324115925, + "grad_norm": 4.247573375701904, + "learning_rate": 3.8597640891218876e-05, + "loss": 2.0722, + "step": 1178 + }, + { + "epoch": 0.04018213241159266, + "grad_norm": 3.5575129985809326, + "learning_rate": 3.9613368283093056e-05, + "loss": 2.056, + "step": 1209 + }, + { + "epoch": 0.0412124434990694, + "grad_norm": 3.8885862827301025, + "learning_rate": 4.062909567496724e-05, + "loss": 2.0389, + "step": 1240 + }, + { + "epoch": 0.04224275458654613, + "grad_norm": 3.680628538131714, + "learning_rate": 4.164482306684142e-05, + "loss": 2.0385, + "step": 1271 + }, + { + "epoch": 0.043273065674022865, + "grad_norm": 3.780876874923706, + "learning_rate": 4.26605504587156e-05, + "loss": 2.0097, + "step": 1302 + }, + { + "epoch": 0.0443033767614996, + "grad_norm": 4.235328674316406, + "learning_rate": 4.367627785058978e-05, + "loss": 2.0024, + "step": 1333 + }, + { + "epoch": 0.04533368784897634, + "grad_norm": 3.326941967010498, + "learning_rate": 4.469200524246396e-05, + "loss": 1.9953, + "step": 1364 + }, + { + "epoch": 0.04636399893645307, + "grad_norm": 3.28456449508667, + "learning_rate": 4.570773263433814e-05, + "loss": 1.9579, + "step": 1395 + }, + { + "epoch": 0.047394310023929805, + "grad_norm": 16.107433319091797, + "learning_rate": 4.672346002621232e-05, + "loss": 1.9701, + "step": 1426 + }, + { + "epoch": 0.04842462111140654, + "grad_norm": 3.5708224773406982, + "learning_rate": 4.77391874180865e-05, + "loss": 1.9621, + "step": 1457 + }, + { + "epoch": 0.04945493219888328, + "grad_norm": 2.9053499698638916, + "learning_rate": 4.875491480996068e-05, + "loss": 1.9458, + "step": 1488 + }, + { + "epoch": 0.05048524328636001, + "grad_norm": 3.0863258838653564, + "learning_rate": 4.977064220183487e-05, + "loss": 1.9483, + "step": 1519 + }, + { + "epoch": 0.051515554373836744, + "grad_norm": 2.9012269973754883, + "learning_rate": 4.9999915451558777e-05, + "loss": 1.928, + "step": 1550 + }, + { + "epoch": 0.05254586546131348, + "grad_norm": 3.0949041843414307, + "learning_rate": 4.999955597496219e-05, + "loss": 1.9229, + "step": 1581 + }, + { + "epoch": 0.05357617654879022, + "grad_norm": 2.8687901496887207, + "learning_rate": 4.9998914381774255e-05, + "loss": 1.915, + "step": 1612 + }, + { + "epoch": 0.05460648763626695, + "grad_norm": 3.2136878967285156, + "learning_rate": 4.999799067923527e-05, + "loss": 1.9197, + "step": 1643 + }, + { + "epoch": 0.055636798723743684, + "grad_norm": 2.590843677520752, + "learning_rate": 4.999678487776908e-05, + "loss": 1.8756, + "step": 1674 + }, + { + "epoch": 0.05666710981122042, + "grad_norm": 2.64634108543396, + "learning_rate": 4.9995296990983006e-05, + "loss": 1.9033, + "step": 1705 + }, + { + "epoch": 0.05769742089869716, + "grad_norm": 3.0151331424713135, + "learning_rate": 4.999352703566763e-05, + "loss": 1.8883, + "step": 1736 + }, + { + "epoch": 0.05872773198617389, + "grad_norm": 2.526806354522705, + "learning_rate": 4.999147503179668e-05, + "loss": 1.8666, + "step": 1767 + }, + { + "epoch": 0.059758043073650624, + "grad_norm": 2.510300397872925, + "learning_rate": 4.998914100252672e-05, + "loss": 1.854, + "step": 1798 + }, + { + "epoch": 0.06078835416112736, + "grad_norm": 2.4867682456970215, + "learning_rate": 4.998652497419696e-05, + "loss": 1.8548, + "step": 1829 + }, + { + "epoch": 0.0618186652486041, + "grad_norm": 2.3920586109161377, + "learning_rate": 4.9983626976328927e-05, + "loss": 1.8495, + "step": 1860 + }, + { + "epoch": 0.06284897633608083, + "grad_norm": 2.714177370071411, + "learning_rate": 4.998044704162613e-05, + "loss": 1.8433, + "step": 1891 + }, + { + "epoch": 0.06387928742355757, + "grad_norm": 2.3094465732574463, + "learning_rate": 4.9976985205973705e-05, + "loss": 1.8382, + "step": 1922 + }, + { + "epoch": 0.0649095985110343, + "grad_norm": 2.47184419631958, + "learning_rate": 4.997324150843799e-05, + "loss": 1.8464, + "step": 1953 + }, + { + "epoch": 0.06593990959851104, + "grad_norm": 2.391841411590576, + "learning_rate": 4.99692159912661e-05, + "loss": 1.8179, + "step": 1984 + }, + { + "epoch": 0.06697022068598776, + "grad_norm": 2.2471864223480225, + "learning_rate": 4.996490869988546e-05, + "loss": 1.8149, + "step": 2015 + }, + { + "epoch": 0.0680005317734645, + "grad_norm": 2.5497376918792725, + "learning_rate": 4.996031968290326e-05, + "loss": 1.8099, + "step": 2046 + }, + { + "epoch": 0.06903084286094124, + "grad_norm": 2.330463409423828, + "learning_rate": 4.995544899210594e-05, + "loss": 1.8267, + "step": 2077 + }, + { + "epoch": 0.07006115394841797, + "grad_norm": 2.3259341716766357, + "learning_rate": 4.9950296682458583e-05, + "loss": 1.7801, + "step": 2108 + }, + { + "epoch": 0.07109146503589471, + "grad_norm": 2.1711952686309814, + "learning_rate": 4.994486281210429e-05, + "loss": 1.7961, + "step": 2139 + }, + { + "epoch": 0.07212177612337145, + "grad_norm": 2.1808884143829346, + "learning_rate": 4.9939147442363566e-05, + "loss": 1.8109, + "step": 2170 + }, + { + "epoch": 0.07315208721084818, + "grad_norm": 2.089256525039673, + "learning_rate": 4.9933150637733574e-05, + "loss": 1.8026, + "step": 2201 + }, + { + "epoch": 0.07418239829832492, + "grad_norm": 2.0864951610565186, + "learning_rate": 4.992687246588743e-05, + "loss": 1.7753, + "step": 2232 + }, + { + "epoch": 0.07521270938580164, + "grad_norm": 2.36157488822937, + "learning_rate": 4.992031299767347e-05, + "loss": 1.7746, + "step": 2263 + }, + { + "epoch": 0.07624302047327838, + "grad_norm": 2.5334439277648926, + "learning_rate": 4.9913472307114386e-05, + "loss": 1.7927, + "step": 2294 + }, + { + "epoch": 0.07727333156075512, + "grad_norm": 2.2565715312957764, + "learning_rate": 4.9906350471406446e-05, + "loss": 1.7668, + "step": 2325 + }, + { + "epoch": 0.07830364264823185, + "grad_norm": 2.1043128967285156, + "learning_rate": 4.989894757091861e-05, + "loss": 1.7771, + "step": 2356 + }, + { + "epoch": 0.07933395373570859, + "grad_norm": 1.9659819602966309, + "learning_rate": 4.989126368919158e-05, + "loss": 1.7666, + "step": 2387 + }, + { + "epoch": 0.08036426482318532, + "grad_norm": 2.0778403282165527, + "learning_rate": 4.988329891293693e-05, + "loss": 1.7405, + "step": 2418 + }, + { + "epoch": 0.08139457591066206, + "grad_norm": 2.1767923831939697, + "learning_rate": 4.987505333203608e-05, + "loss": 1.7495, + "step": 2449 + }, + { + "epoch": 0.0824248869981388, + "grad_norm": 2.260143280029297, + "learning_rate": 4.9866527039539276e-05, + "loss": 1.7504, + "step": 2480 + }, + { + "epoch": 0.08345519808561552, + "grad_norm": 2.18271803855896, + "learning_rate": 4.9857720131664594e-05, + "loss": 1.7456, + "step": 2511 + }, + { + "epoch": 0.08448550917309226, + "grad_norm": 2.209594964981079, + "learning_rate": 4.9848632707796773e-05, + "loss": 1.7528, + "step": 2542 + }, + { + "epoch": 0.085515820260569, + "grad_norm": 2.0666229724884033, + "learning_rate": 4.9839264870486155e-05, + "loss": 1.7517, + "step": 2573 + }, + { + "epoch": 0.08654613134804573, + "grad_norm": 2.1070454120635986, + "learning_rate": 4.9829616725447526e-05, + "loss": 1.7474, + "step": 2604 + }, + { + "epoch": 0.08757644243552247, + "grad_norm": 1.9430303573608398, + "learning_rate": 4.981968838155888e-05, + "loss": 1.7348, + "step": 2635 + }, + { + "epoch": 0.0886067535229992, + "grad_norm": 1.9638925790786743, + "learning_rate": 4.980947995086024e-05, + "loss": 1.7202, + "step": 2666 + }, + { + "epoch": 0.08963706461047594, + "grad_norm": 1.8845652341842651, + "learning_rate": 4.979899154855234e-05, + "loss": 1.7375, + "step": 2697 + }, + { + "epoch": 0.09066737569795268, + "grad_norm": 5.712058067321777, + "learning_rate": 4.9788223292995386e-05, + "loss": 1.7379, + "step": 2728 + }, + { + "epoch": 0.0916976867854294, + "grad_norm": 1.9520670175552368, + "learning_rate": 4.977717530570768e-05, + "loss": 1.7302, + "step": 2759 + }, + { + "epoch": 0.09272799787290614, + "grad_norm": 1.8802224397659302, + "learning_rate": 4.976584771136425e-05, + "loss": 1.74, + "step": 2790 + }, + { + "epoch": 0.09375830896038288, + "grad_norm": 2.1098153591156006, + "learning_rate": 4.975424063779547e-05, + "loss": 1.7024, + "step": 2821 + }, + { + "epoch": 0.09478862004785961, + "grad_norm": 2.1568291187286377, + "learning_rate": 4.974235421598557e-05, + "loss": 1.7131, + "step": 2852 + }, + { + "epoch": 0.09581893113533635, + "grad_norm": 1.8769980669021606, + "learning_rate": 4.973018858007122e-05, + "loss": 1.7008, + "step": 2883 + }, + { + "epoch": 0.09684924222281308, + "grad_norm": 1.8325533866882324, + "learning_rate": 4.9717743867339963e-05, + "loss": 1.7058, + "step": 2914 + }, + { + "epoch": 0.09787955331028982, + "grad_norm": 2.086416721343994, + "learning_rate": 4.9705020218228695e-05, + "loss": 1.711, + "step": 2945 + }, + { + "epoch": 0.09890986439776656, + "grad_norm": 1.8294793367385864, + "learning_rate": 4.969201777632205e-05, + "loss": 1.6998, + "step": 2976 + }, + { + "epoch": 0.09994017548524328, + "grad_norm": 2.0608153343200684, + "learning_rate": 4.9678736688350846e-05, + "loss": 1.6948, + "step": 3007 + }, + { + "epoch": 0.10097048657272002, + "grad_norm": 3.2166008949279785, + "learning_rate": 4.966517710419033e-05, + "loss": 1.6788, + "step": 3038 + }, + { + "epoch": 0.10200079766019676, + "grad_norm": 1.9431313276290894, + "learning_rate": 4.965133917685858e-05, + "loss": 1.7115, + "step": 3069 + }, + { + "epoch": 0.10303110874767349, + "grad_norm": 1.967512845993042, + "learning_rate": 4.9637223062514714e-05, + "loss": 1.7033, + "step": 3100 + }, + { + "epoch": 0.10406141983515023, + "grad_norm": 1.9253389835357666, + "learning_rate": 4.962282892045718e-05, + "loss": 1.6856, + "step": 3131 + }, + { + "epoch": 0.10509173092262696, + "grad_norm": 1.986840009689331, + "learning_rate": 4.9608156913121904e-05, + "loss": 1.723, + "step": 3162 + }, + { + "epoch": 0.1061220420101037, + "grad_norm": 1.83523690700531, + "learning_rate": 4.959320720608049e-05, + "loss": 1.6912, + "step": 3193 + }, + { + "epoch": 0.10715235309758044, + "grad_norm": 2.1271955966949463, + "learning_rate": 4.9577979968038354e-05, + "loss": 1.7032, + "step": 3224 + }, + { + "epoch": 0.10818266418505716, + "grad_norm": 1.8383768796920776, + "learning_rate": 4.956247537083282e-05, + "loss": 1.6726, + "step": 3255 + }, + { + "epoch": 0.1092129752725339, + "grad_norm": 1.8806651830673218, + "learning_rate": 4.9546693589431145e-05, + "loss": 1.6817, + "step": 3286 + }, + { + "epoch": 0.11024328636001064, + "grad_norm": 1.7535260915756226, + "learning_rate": 4.9530634801928595e-05, + "loss": 1.6875, + "step": 3317 + }, + { + "epoch": 0.11127359744748737, + "grad_norm": 1.765906810760498, + "learning_rate": 4.9514299189546395e-05, + "loss": 1.6859, + "step": 3348 + }, + { + "epoch": 0.11230390853496411, + "grad_norm": 1.869828462600708, + "learning_rate": 4.949768693662973e-05, + "loss": 1.6915, + "step": 3379 + }, + { + "epoch": 0.11333421962244083, + "grad_norm": 1.8347504138946533, + "learning_rate": 4.948079823064559e-05, + "loss": 1.6859, + "step": 3410 + }, + { + "epoch": 0.11436453070991758, + "grad_norm": 1.7692474126815796, + "learning_rate": 4.946363326218074e-05, + "loss": 1.6565, + "step": 3441 + }, + { + "epoch": 0.11539484179739432, + "grad_norm": 1.8231885433197021, + "learning_rate": 4.9446192224939525e-05, + "loss": 1.686, + "step": 3472 + }, + { + "epoch": 0.11642515288487104, + "grad_norm": 1.7155958414077759, + "learning_rate": 4.942847531574167e-05, + "loss": 1.6538, + "step": 3503 + }, + { + "epoch": 0.11745546397234778, + "grad_norm": 1.787183403968811, + "learning_rate": 4.941048273452008e-05, + "loss": 1.6776, + "step": 3534 + }, + { + "epoch": 0.11848577505982451, + "grad_norm": 1.741213083267212, + "learning_rate": 4.9392214684318605e-05, + "loss": 1.6784, + "step": 3565 + }, + { + "epoch": 0.11951608614730125, + "grad_norm": 1.7836824655532837, + "learning_rate": 4.93736713712897e-05, + "loss": 1.6557, + "step": 3596 + }, + { + "epoch": 0.12054639723477799, + "grad_norm": 1.7103859186172485, + "learning_rate": 4.9354853004692124e-05, + "loss": 1.6606, + "step": 3627 + }, + { + "epoch": 0.12157670832225471, + "grad_norm": 1.7865506410598755, + "learning_rate": 4.93357597968886e-05, + "loss": 1.6409, + "step": 3658 + }, + { + "epoch": 0.12260701940973145, + "grad_norm": 1.7770143747329712, + "learning_rate": 4.931639196334338e-05, + "loss": 1.6574, + "step": 3689 + }, + { + "epoch": 0.1236373304972082, + "grad_norm": 1.857575535774231, + "learning_rate": 4.9296749722619826e-05, + "loss": 1.6724, + "step": 3720 + }, + { + "epoch": 0.12466764158468492, + "grad_norm": 1.8742581605911255, + "learning_rate": 4.9276833296377966e-05, + "loss": 1.6506, + "step": 3751 + }, + { + "epoch": 0.12569795267216166, + "grad_norm": 1.827668309211731, + "learning_rate": 4.925664290937196e-05, + "loss": 1.6523, + "step": 3782 + }, + { + "epoch": 0.1267282637596384, + "grad_norm": 1.7517486810684204, + "learning_rate": 4.9236178789447576e-05, + "loss": 1.6459, + "step": 3813 + }, + { + "epoch": 0.12775857484711514, + "grad_norm": 1.8109570741653442, + "learning_rate": 4.921544116753962e-05, + "loss": 1.6614, + "step": 3844 + }, + { + "epoch": 0.12878888593459187, + "grad_norm": 1.692597508430481, + "learning_rate": 4.919443027766935e-05, + "loss": 1.6431, + "step": 3875 + }, + { + "epoch": 0.1298191970220686, + "grad_norm": 1.8650025129318237, + "learning_rate": 4.91731463569418e-05, + "loss": 1.6466, + "step": 3906 + }, + { + "epoch": 0.13084950810954532, + "grad_norm": 1.6794081926345825, + "learning_rate": 4.915158964554312e-05, + "loss": 1.6504, + "step": 3937 + }, + { + "epoch": 0.13187981919702207, + "grad_norm": 1.7685374021530151, + "learning_rate": 4.912976038673786e-05, + "loss": 1.6446, + "step": 3968 + }, + { + "epoch": 0.1329101302844988, + "grad_norm": 1.7601110935211182, + "learning_rate": 4.9107658826866254e-05, + "loss": 1.631, + "step": 3999 + }, + { + "epoch": 0.13394044137197553, + "grad_norm": 2.0616064071655273, + "learning_rate": 4.908528521534139e-05, + "loss": 1.6476, + "step": 4030 + }, + { + "epoch": 0.13497075245945228, + "grad_norm": 1.8973504304885864, + "learning_rate": 4.906263980464644e-05, + "loss": 1.6582, + "step": 4061 + }, + { + "epoch": 0.136001063546929, + "grad_norm": 1.7768895626068115, + "learning_rate": 4.903972285033178e-05, + "loss": 1.6159, + "step": 4092 + }, + { + "epoch": 0.13703137463440573, + "grad_norm": 1.8264424800872803, + "learning_rate": 4.901653461101213e-05, + "loss": 1.6289, + "step": 4123 + }, + { + "epoch": 0.1380616857218825, + "grad_norm": 1.7140119075775146, + "learning_rate": 4.8993075348363626e-05, + "loss": 1.6357, + "step": 4154 + }, + { + "epoch": 0.13909199680935921, + "grad_norm": 1.6964486837387085, + "learning_rate": 4.896934532712084e-05, + "loss": 1.6233, + "step": 4185 + }, + { + "epoch": 0.14012230789683594, + "grad_norm": 1.8008025884628296, + "learning_rate": 4.8945344815073846e-05, + "loss": 1.637, + "step": 4216 + }, + { + "epoch": 0.1411526189843127, + "grad_norm": 1.562730073928833, + "learning_rate": 4.892107408306516e-05, + "loss": 1.6379, + "step": 4247 + }, + { + "epoch": 0.14218293007178942, + "grad_norm": 1.8273371458053589, + "learning_rate": 4.889653340498669e-05, + "loss": 1.6246, + "step": 4278 + }, + { + "epoch": 0.14321324115926615, + "grad_norm": 56.33716583251953, + "learning_rate": 4.8871723057776664e-05, + "loss": 1.6457, + "step": 4309 + }, + { + "epoch": 0.1442435522467429, + "grad_norm": 1.746523380279541, + "learning_rate": 4.8846643321416476e-05, + "loss": 1.6343, + "step": 4340 + }, + { + "epoch": 0.14527386333421963, + "grad_norm": 1.7737531661987305, + "learning_rate": 4.882129447892753e-05, + "loss": 1.6447, + "step": 4371 + }, + { + "epoch": 0.14630417442169635, + "grad_norm": 1.660485863685608, + "learning_rate": 4.8795676816368076e-05, + "loss": 1.6192, + "step": 4402 + }, + { + "epoch": 0.14733448550917308, + "grad_norm": 1.6823406219482422, + "learning_rate": 4.876979062282995e-05, + "loss": 1.6253, + "step": 4433 + }, + { + "epoch": 0.14836479659664983, + "grad_norm": 7.78139066696167, + "learning_rate": 4.8743636190435325e-05, + "loss": 1.6234, + "step": 4464 + }, + { + "epoch": 0.14939510768412656, + "grad_norm": 1.7426058053970337, + "learning_rate": 4.871721381433344e-05, + "loss": 1.6337, + "step": 4495 + }, + { + "epoch": 0.1504254187716033, + "grad_norm": 1.6294783353805542, + "learning_rate": 4.869052379269719e-05, + "loss": 1.6217, + "step": 4526 + }, + { + "epoch": 0.15145572985908004, + "grad_norm": 1.6523306369781494, + "learning_rate": 4.866356642671985e-05, + "loss": 1.605, + "step": 4557 + }, + { + "epoch": 0.15248604094655677, + "grad_norm": 1.8571300506591797, + "learning_rate": 4.8636342020611634e-05, + "loss": 1.6218, + "step": 4588 + }, + { + "epoch": 0.1535163520340335, + "grad_norm": 1.7754936218261719, + "learning_rate": 4.860885088159626e-05, + "loss": 1.6171, + "step": 4619 + }, + { + "epoch": 0.15454666312151025, + "grad_norm": 1.91987943649292, + "learning_rate": 4.858109331990751e-05, + "loss": 1.6167, + "step": 4650 + }, + { + "epoch": 0.15557697420898697, + "grad_norm": 1.5994452238082886, + "learning_rate": 4.855306964878567e-05, + "loss": 1.5951, + "step": 4681 + }, + { + "epoch": 0.1566072852964637, + "grad_norm": 1.6490916013717651, + "learning_rate": 4.8524780184474084e-05, + "loss": 1.616, + "step": 4712 + }, + { + "epoch": 0.15763759638394045, + "grad_norm": 1.5921640396118164, + "learning_rate": 4.8496225246215496e-05, + "loss": 1.6346, + "step": 4743 + }, + { + "epoch": 0.15866790747141718, + "grad_norm": 1.6729261875152588, + "learning_rate": 4.8467405156248505e-05, + "loss": 1.6165, + "step": 4774 + }, + { + "epoch": 0.1596982185588939, + "grad_norm": 1.628113031387329, + "learning_rate": 4.843832023980392e-05, + "loss": 1.6119, + "step": 4805 + }, + { + "epoch": 0.16072852964637063, + "grad_norm": 1.651647925376892, + "learning_rate": 4.840897082510106e-05, + "loss": 1.5997, + "step": 4836 + }, + { + "epoch": 0.1617588407338474, + "grad_norm": 1.5297720432281494, + "learning_rate": 4.8379357243344084e-05, + "loss": 1.6242, + "step": 4867 + }, + { + "epoch": 0.1627891518213241, + "grad_norm": 1.5779869556427002, + "learning_rate": 4.8349479828718236e-05, + "loss": 1.6149, + "step": 4898 + }, + { + "epoch": 0.16381946290880084, + "grad_norm": 1.5843939781188965, + "learning_rate": 4.8319338918386075e-05, + "loss": 1.5926, + "step": 4929 + }, + { + "epoch": 0.1648497739962776, + "grad_norm": 2.3762106895446777, + "learning_rate": 4.828893485248369e-05, + "loss": 1.6108, + "step": 4960 + }, + { + "epoch": 0.16588008508375432, + "grad_norm": 1.5871953964233398, + "learning_rate": 4.825826797411682e-05, + "loss": 1.6103, + "step": 4991 + }, + { + "epoch": 0.16691039617123105, + "grad_norm": 1.5934125185012817, + "learning_rate": 4.822733862935702e-05, + "loss": 1.6091, + "step": 5022 + }, + { + "epoch": 0.1679407072587078, + "grad_norm": 1.6997628211975098, + "learning_rate": 4.819614716723775e-05, + "loss": 1.6098, + "step": 5053 + }, + { + "epoch": 0.16897101834618453, + "grad_norm": 1.682849645614624, + "learning_rate": 4.8164693939750425e-05, + "loss": 1.599, + "step": 5084 + }, + { + "epoch": 0.17000132943366125, + "grad_norm": 1.709743857383728, + "learning_rate": 4.813297930184042e-05, + "loss": 1.6194, + "step": 5115 + }, + { + "epoch": 0.171031640521138, + "grad_norm": 1.725879430770874, + "learning_rate": 4.810100361140314e-05, + "loss": 1.6115, + "step": 5146 + }, + { + "epoch": 0.17206195160861473, + "grad_norm": 1.6710290908813477, + "learning_rate": 4.8068767229279885e-05, + "loss": 1.6032, + "step": 5177 + }, + { + "epoch": 0.17309226269609146, + "grad_norm": 1.6156634092330933, + "learning_rate": 4.8036270519253854e-05, + "loss": 1.5973, + "step": 5208 + }, + { + "epoch": 0.1741225737835682, + "grad_norm": 1.5654059648513794, + "learning_rate": 4.8003513848046e-05, + "loss": 1.5817, + "step": 5239 + }, + { + "epoch": 0.17515288487104494, + "grad_norm": 1.5789822340011597, + "learning_rate": 4.79704975853109e-05, + "loss": 1.6138, + "step": 5270 + }, + { + "epoch": 0.17618319595852167, + "grad_norm": 1.6022037267684937, + "learning_rate": 4.793722210363262e-05, + "loss": 1.5998, + "step": 5301 + }, + { + "epoch": 0.1772135070459984, + "grad_norm": 1.5142741203308105, + "learning_rate": 4.7903687778520414e-05, + "loss": 1.6061, + "step": 5332 + }, + { + "epoch": 0.17824381813347515, + "grad_norm": 1.6454212665557861, + "learning_rate": 4.7869894988404593e-05, + "loss": 1.6063, + "step": 5363 + }, + { + "epoch": 0.17927412922095187, + "grad_norm": 1.5250823497772217, + "learning_rate": 4.783584411463221e-05, + "loss": 1.6038, + "step": 5394 + }, + { + "epoch": 0.1803044403084286, + "grad_norm": 1.5829335451126099, + "learning_rate": 4.780153554146274e-05, + "loss": 1.5949, + "step": 5425 + }, + { + "epoch": 0.18133475139590535, + "grad_norm": 1.5342432260513306, + "learning_rate": 4.7766969656063766e-05, + "loss": 1.5913, + "step": 5456 + }, + { + "epoch": 0.18236506248338208, + "grad_norm": 1.6397250890731812, + "learning_rate": 4.773214684850662e-05, + "loss": 1.6102, + "step": 5487 + }, + { + "epoch": 0.1833953735708588, + "grad_norm": 1.5228471755981445, + "learning_rate": 4.769706751176193e-05, + "loss": 1.5885, + "step": 5518 + }, + { + "epoch": 0.18442568465833556, + "grad_norm": 1.6186103820800781, + "learning_rate": 4.7661732041695264e-05, + "loss": 1.6086, + "step": 5549 + }, + { + "epoch": 0.18545599574581229, + "grad_norm": 1.6024582386016846, + "learning_rate": 4.762614083706258e-05, + "loss": 1.6004, + "step": 5580 + }, + { + "epoch": 0.186486306833289, + "grad_norm": 1.5443711280822754, + "learning_rate": 4.759029429950581e-05, + "loss": 1.6048, + "step": 5611 + }, + { + "epoch": 0.18751661792076577, + "grad_norm": 1.4831629991531372, + "learning_rate": 4.7554192833548235e-05, + "loss": 1.5841, + "step": 5642 + }, + { + "epoch": 0.1885469290082425, + "grad_norm": 1.6426068544387817, + "learning_rate": 4.751783684659e-05, + "loss": 1.587, + "step": 5673 + }, + { + "epoch": 0.18957724009571922, + "grad_norm": 1.4609078168869019, + "learning_rate": 4.748122674890348e-05, + "loss": 1.5945, + "step": 5704 + }, + { + "epoch": 0.19060755118319597, + "grad_norm": 1.5365614891052246, + "learning_rate": 4.7444362953628654e-05, + "loss": 1.5737, + "step": 5735 + }, + { + "epoch": 0.1916378622706727, + "grad_norm": 1.5755670070648193, + "learning_rate": 4.7407245876768424e-05, + "loss": 1.5862, + "step": 5766 + }, + { + "epoch": 0.19266817335814942, + "grad_norm": 1.6469846963882446, + "learning_rate": 4.736987593718397e-05, + "loss": 1.5663, + "step": 5797 + }, + { + "epoch": 0.19369848444562615, + "grad_norm": 1.5927278995513916, + "learning_rate": 4.733225355658999e-05, + "loss": 1.5776, + "step": 5828 + }, + { + "epoch": 0.1947287955331029, + "grad_norm": 1.5593287944793701, + "learning_rate": 4.7294379159549926e-05, + "loss": 1.579, + "step": 5859 + }, + { + "epoch": 0.19575910662057963, + "grad_norm": 1.534055233001709, + "learning_rate": 4.725625317347119e-05, + "loss": 1.6017, + "step": 5890 + }, + { + "epoch": 0.19678941770805636, + "grad_norm": 1.5846387147903442, + "learning_rate": 4.7217876028600374e-05, + "loss": 1.5739, + "step": 5921 + }, + { + "epoch": 0.1978197287955331, + "grad_norm": 1.5377682447433472, + "learning_rate": 4.717924815801832e-05, + "loss": 1.57, + "step": 5952 + }, + { + "epoch": 0.19885003988300984, + "grad_norm": 1.467956781387329, + "learning_rate": 4.714036999763532e-05, + "loss": 1.5736, + "step": 5983 + }, + { + "epoch": 0.19988035097048656, + "grad_norm": 1.601070523262024, + "learning_rate": 4.7101241986186116e-05, + "loss": 1.5861, + "step": 6014 + }, + { + "epoch": 0.20091066205796332, + "grad_norm": 1.5051921606063843, + "learning_rate": 4.7061864565225e-05, + "loss": 1.5735, + "step": 6045 + }, + { + "epoch": 0.20194097314544004, + "grad_norm": 1.462843418121338, + "learning_rate": 4.702223817912081e-05, + "loss": 1.582, + "step": 6076 + }, + { + "epoch": 0.20297128423291677, + "grad_norm": 1.5698682069778442, + "learning_rate": 4.698236327505195e-05, + "loss": 1.5647, + "step": 6107 + }, + { + "epoch": 0.20400159532039353, + "grad_norm": 1.5633916854858398, + "learning_rate": 4.694224030300127e-05, + "loss": 1.5741, + "step": 6138 + }, + { + "epoch": 0.20503190640787025, + "grad_norm": 1.6174733638763428, + "learning_rate": 4.690186971575107e-05, + "loss": 1.5634, + "step": 6169 + }, + { + "epoch": 0.20606221749534698, + "grad_norm": 1.4957518577575684, + "learning_rate": 4.6861251968877916e-05, + "loss": 1.575, + "step": 6200 + }, + { + "epoch": 0.2070925285828237, + "grad_norm": 1.670933485031128, + "learning_rate": 4.68203875207476e-05, + "loss": 1.5792, + "step": 6231 + }, + { + "epoch": 0.20812283967030046, + "grad_norm": 1.5676430463790894, + "learning_rate": 4.677927683250983e-05, + "loss": 1.5689, + "step": 6262 + }, + { + "epoch": 0.20915315075777718, + "grad_norm": 1.5753976106643677, + "learning_rate": 4.6737920368093156e-05, + "loss": 1.5594, + "step": 6293 + }, + { + "epoch": 0.2101834618452539, + "grad_norm": 1.4973617792129517, + "learning_rate": 4.669631859419965e-05, + "loss": 1.5593, + "step": 6324 + }, + { + "epoch": 0.21121377293273066, + "grad_norm": 1.4691433906555176, + "learning_rate": 4.6654471980299676e-05, + "loss": 1.5711, + "step": 6355 + }, + { + "epoch": 0.2122440840202074, + "grad_norm": 1.407630443572998, + "learning_rate": 4.661238099862658e-05, + "loss": 1.5787, + "step": 6386 + }, + { + "epoch": 0.21327439510768412, + "grad_norm": 1.5011677742004395, + "learning_rate": 4.657004612417138e-05, + "loss": 1.5751, + "step": 6417 + }, + { + "epoch": 0.21430470619516087, + "grad_norm": 1.509750485420227, + "learning_rate": 4.6527467834677374e-05, + "loss": 1.5583, + "step": 6448 + }, + { + "epoch": 0.2153350172826376, + "grad_norm": 1.3919882774353027, + "learning_rate": 4.648464661063478e-05, + "loss": 1.5712, + "step": 6479 + }, + { + "epoch": 0.21636532837011432, + "grad_norm": 1.4854936599731445, + "learning_rate": 4.6441582935275264e-05, + "loss": 1.5637, + "step": 6510 + }, + { + "epoch": 0.21739563945759108, + "grad_norm": 1.4413583278656006, + "learning_rate": 4.6398277294566586e-05, + "loss": 1.56, + "step": 6541 + }, + { + "epoch": 0.2184259505450678, + "grad_norm": 1.5063883066177368, + "learning_rate": 4.6354730177207e-05, + "loss": 1.5525, + "step": 6572 + }, + { + "epoch": 0.21945626163254453, + "grad_norm": 1.4899688959121704, + "learning_rate": 4.6310942074619787e-05, + "loss": 1.5817, + "step": 6603 + }, + { + "epoch": 0.22048657272002128, + "grad_norm": 1.3927967548370361, + "learning_rate": 4.626691348094777e-05, + "loss": 1.5407, + "step": 6634 + }, + { + "epoch": 0.221516883807498, + "grad_norm": 1.5378398895263672, + "learning_rate": 4.622264489304762e-05, + "loss": 1.5561, + "step": 6665 + }, + { + "epoch": 0.22254719489497474, + "grad_norm": 1.554624319076538, + "learning_rate": 4.617813681048434e-05, + "loss": 1.5859, + "step": 6696 + }, + { + "epoch": 0.22357750598245146, + "grad_norm": 1.5356658697128296, + "learning_rate": 4.61333897355256e-05, + "loss": 1.5531, + "step": 6727 + }, + { + "epoch": 0.22460781706992822, + "grad_norm": 1.5534918308258057, + "learning_rate": 4.608840417313604e-05, + "loss": 1.5774, + "step": 6758 + }, + { + "epoch": 0.22563812815740494, + "grad_norm": 1.5660988092422485, + "learning_rate": 4.6043180630971646e-05, + "loss": 1.5763, + "step": 6789 + }, + { + "epoch": 0.22666843924488167, + "grad_norm": 1.4993386268615723, + "learning_rate": 4.599771961937391e-05, + "loss": 1.5615, + "step": 6820 + }, + { + "epoch": 0.22769875033235842, + "grad_norm": 1.4630553722381592, + "learning_rate": 4.5952021651364204e-05, + "loss": 1.543, + "step": 6851 + }, + { + "epoch": 0.22872906141983515, + "grad_norm": 1.470173954963684, + "learning_rate": 4.590608724263786e-05, + "loss": 1.5674, + "step": 6882 + }, + { + "epoch": 0.22975937250731188, + "grad_norm": 1.5867971181869507, + "learning_rate": 4.585991691155845e-05, + "loss": 1.5702, + "step": 6913 + }, + { + "epoch": 0.23078968359478863, + "grad_norm": 1.44207763671875, + "learning_rate": 4.581351117915188e-05, + "loss": 1.5436, + "step": 6944 + }, + { + "epoch": 0.23181999468226536, + "grad_norm": 1.4691039323806763, + "learning_rate": 4.5766870569100534e-05, + "loss": 1.5465, + "step": 6975 + }, + { + "epoch": 0.23285030576974208, + "grad_norm": 1.4807918071746826, + "learning_rate": 4.571999560773736e-05, + "loss": 1.5564, + "step": 7006 + }, + { + "epoch": 0.23388061685721884, + "grad_norm": 1.481487512588501, + "learning_rate": 4.5672886824039915e-05, + "loss": 1.5466, + "step": 7037 + }, + { + "epoch": 0.23491092794469556, + "grad_norm": 1.4518013000488281, + "learning_rate": 4.5625544749624435e-05, + "loss": 1.5618, + "step": 7068 + }, + { + "epoch": 0.2359412390321723, + "grad_norm": 1.4186676740646362, + "learning_rate": 4.5577969918739794e-05, + "loss": 1.5528, + "step": 7099 + }, + { + "epoch": 0.23697155011964902, + "grad_norm": 1.5287110805511475, + "learning_rate": 4.5530162868261486e-05, + "loss": 1.5457, + "step": 7130 + }, + { + "epoch": 0.23800186120712577, + "grad_norm": 1.5516417026519775, + "learning_rate": 4.548212413768558e-05, + "loss": 1.5467, + "step": 7161 + }, + { + "epoch": 0.2390321722946025, + "grad_norm": 1.4710053205490112, + "learning_rate": 4.543385426912261e-05, + "loss": 1.5431, + "step": 7192 + }, + { + "epoch": 0.24006248338207922, + "grad_norm": 1.5005567073822021, + "learning_rate": 4.53853538072915e-05, + "loss": 1.5592, + "step": 7223 + }, + { + "epoch": 0.24109279446955598, + "grad_norm": 1.5864965915679932, + "learning_rate": 4.533662329951336e-05, + "loss": 1.5694, + "step": 7254 + }, + { + "epoch": 0.2421231055570327, + "grad_norm": 1.4661896228790283, + "learning_rate": 4.528766329570536e-05, + "loss": 1.545, + "step": 7285 + }, + { + "epoch": 0.24315341664450943, + "grad_norm": 1.5157560110092163, + "learning_rate": 4.523847434837447e-05, + "loss": 1.5458, + "step": 7316 + }, + { + "epoch": 0.24418372773198618, + "grad_norm": 1.4033585786819458, + "learning_rate": 4.518905701261128e-05, + "loss": 1.5464, + "step": 7347 + }, + { + "epoch": 0.2452140388194629, + "grad_norm": 1.5357593297958374, + "learning_rate": 4.5139411846083715e-05, + "loss": 1.5497, + "step": 7378 + }, + { + "epoch": 0.24624434990693964, + "grad_norm": 1.419507384300232, + "learning_rate": 4.508953940903073e-05, + "loss": 1.5414, + "step": 7409 + }, + { + "epoch": 0.2472746609944164, + "grad_norm": 1.5201773643493652, + "learning_rate": 4.5039440264255994e-05, + "loss": 1.5503, + "step": 7440 + }, + { + "epoch": 0.24830497208189312, + "grad_norm": 1.8000444173812866, + "learning_rate": 4.498911497712155e-05, + "loss": 1.5448, + "step": 7471 + }, + { + "epoch": 0.24933528316936984, + "grad_norm": 1.4876810312271118, + "learning_rate": 4.493856411554142e-05, + "loss": 1.5524, + "step": 7502 + }, + { + "epoch": 0.25036559425684657, + "grad_norm": 1.5130078792572021, + "learning_rate": 4.4887788249975206e-05, + "loss": 1.5454, + "step": 7533 + }, + { + "epoch": 0.2513959053443233, + "grad_norm": 1.4829351902008057, + "learning_rate": 4.4836787953421656e-05, + "loss": 1.5407, + "step": 7564 + }, + { + "epoch": 0.2524262164318001, + "grad_norm": 1.521550178527832, + "learning_rate": 4.478556380141218e-05, + "loss": 1.5727, + "step": 7595 + }, + { + "epoch": 0.2534565275192768, + "grad_norm": 1.4377928972244263, + "learning_rate": 4.4734116372004375e-05, + "loss": 1.5432, + "step": 7626 + }, + { + "epoch": 0.25448683860675353, + "grad_norm": 1.4101744890213013, + "learning_rate": 4.4682446245775477e-05, + "loss": 1.547, + "step": 7657 + }, + { + "epoch": 0.2555171496942303, + "grad_norm": 1.522524356842041, + "learning_rate": 4.463055400581586e-05, + "loss": 1.5418, + "step": 7688 + }, + { + "epoch": 0.256547460781707, + "grad_norm": 1.4160797595977783, + "learning_rate": 4.4578440237722374e-05, + "loss": 1.5457, + "step": 7719 + }, + { + "epoch": 0.25757777186918374, + "grad_norm": 1.4106636047363281, + "learning_rate": 4.452610552959183e-05, + "loss": 1.5405, + "step": 7750 + }, + { + "epoch": 0.2586080829566605, + "grad_norm": 1.422723650932312, + "learning_rate": 4.447355047201428e-05, + "loss": 1.5423, + "step": 7781 + }, + { + "epoch": 0.2596383940441372, + "grad_norm": 1.4362592697143555, + "learning_rate": 4.4420775658066414e-05, + "loss": 1.5372, + "step": 7812 + }, + { + "epoch": 0.26066870513161394, + "grad_norm": 1.4319696426391602, + "learning_rate": 4.436778168330484e-05, + "loss": 1.5451, + "step": 7843 + }, + { + "epoch": 0.26169901621909064, + "grad_norm": 1.4069257974624634, + "learning_rate": 4.4314569145759353e-05, + "loss": 1.5221, + "step": 7874 + }, + { + "epoch": 0.2627293273065674, + "grad_norm": 1.4424949884414673, + "learning_rate": 4.42611386459262e-05, + "loss": 1.5419, + "step": 7905 + }, + { + "epoch": 0.26375963839404415, + "grad_norm": 1.4579105377197266, + "learning_rate": 4.420749078676133e-05, + "loss": 1.5116, + "step": 7936 + }, + { + "epoch": 0.26478994948152085, + "grad_norm": 1.4563167095184326, + "learning_rate": 4.4153626173673516e-05, + "loss": 1.5296, + "step": 7967 + }, + { + "epoch": 0.2658202605689976, + "grad_norm": 1.4440968036651611, + "learning_rate": 4.409954541451762e-05, + "loss": 1.5548, + "step": 7998 + }, + { + "epoch": 0.26685057165647436, + "grad_norm": 1.5711034536361694, + "learning_rate": 4.404524911958764e-05, + "loss": 1.535, + "step": 8029 + }, + { + "epoch": 0.26788088274395105, + "grad_norm": 1.5221564769744873, + "learning_rate": 4.399073790160989e-05, + "loss": 1.5495, + "step": 8060 + }, + { + "epoch": 0.2689111938314278, + "grad_norm": 1.392699956893921, + "learning_rate": 4.393601237573607e-05, + "loss": 1.546, + "step": 8091 + }, + { + "epoch": 0.26994150491890456, + "grad_norm": 1.5343137979507446, + "learning_rate": 4.388107315953628e-05, + "loss": 1.549, + "step": 8122 + }, + { + "epoch": 0.27097181600638126, + "grad_norm": 1.4483468532562256, + "learning_rate": 4.382592087299212e-05, + "loss": 1.5424, + "step": 8153 + }, + { + "epoch": 0.272002127093858, + "grad_norm": 1.4963489770889282, + "learning_rate": 4.377055613848964e-05, + "loss": 1.508, + "step": 8184 + }, + { + "epoch": 0.27303243818133477, + "grad_norm": 1.4839162826538086, + "learning_rate": 4.3714979580812355e-05, + "loss": 1.5203, + "step": 8215 + }, + { + "epoch": 0.27406274926881147, + "grad_norm": 1.4272018671035767, + "learning_rate": 4.365919182713416e-05, + "loss": 1.5264, + "step": 8246 + }, + { + "epoch": 0.2750930603562882, + "grad_norm": 1.3808270692825317, + "learning_rate": 4.360319350701226e-05, + "loss": 1.5255, + "step": 8277 + }, + { + "epoch": 0.276123371443765, + "grad_norm": 1.4179162979125977, + "learning_rate": 4.3546985252380115e-05, + "loss": 1.535, + "step": 8308 + }, + { + "epoch": 0.2771536825312417, + "grad_norm": 1.3617374897003174, + "learning_rate": 4.349056769754021e-05, + "loss": 1.5295, + "step": 8339 + }, + { + "epoch": 0.27818399361871843, + "grad_norm": 1.4745615720748901, + "learning_rate": 4.3433941479156994e-05, + "loss": 1.5438, + "step": 8370 + }, + { + "epoch": 0.2792143047061952, + "grad_norm": 1.3661375045776367, + "learning_rate": 4.3377107236249647e-05, + "loss": 1.5134, + "step": 8401 + }, + { + "epoch": 0.2802446157936719, + "grad_norm": 1.3907949924468994, + "learning_rate": 4.332006561018488e-05, + "loss": 1.5237, + "step": 8432 + }, + { + "epoch": 0.28127492688114863, + "grad_norm": 1.3575704097747803, + "learning_rate": 4.3262817244669683e-05, + "loss": 1.5226, + "step": 8463 + }, + { + "epoch": 0.2823052379686254, + "grad_norm": 1.3836462497711182, + "learning_rate": 4.3205362785744083e-05, + "loss": 1.5433, + "step": 8494 + }, + { + "epoch": 0.2833355490561021, + "grad_norm": 1.6108276844024658, + "learning_rate": 4.314770288177384e-05, + "loss": 1.5324, + "step": 8525 + }, + { + "epoch": 0.28436586014357884, + "grad_norm": 1.4650689363479614, + "learning_rate": 4.308983818344313e-05, + "loss": 1.535, + "step": 8556 + }, + { + "epoch": 0.2853961712310556, + "grad_norm": 1.5836583375930786, + "learning_rate": 4.3031769343747206e-05, + "loss": 1.5313, + "step": 8587 + }, + { + "epoch": 0.2864264823185323, + "grad_norm": 1.5348492860794067, + "learning_rate": 4.297349701798505e-05, + "loss": 1.5106, + "step": 8618 + }, + { + "epoch": 0.28745679340600905, + "grad_norm": 1.4060319662094116, + "learning_rate": 4.2915021863751916e-05, + "loss": 1.5283, + "step": 8649 + }, + { + "epoch": 0.2884871044934858, + "grad_norm": 1.531657099723816, + "learning_rate": 4.285634454093198e-05, + "loss": 1.5087, + "step": 8680 + }, + { + "epoch": 0.2895174155809625, + "grad_norm": 1.4756299257278442, + "learning_rate": 4.279746571169086e-05, + "loss": 1.5042, + "step": 8711 + }, + { + "epoch": 0.29054772666843925, + "grad_norm": 1.3221153020858765, + "learning_rate": 4.2738386040468136e-05, + "loss": 1.5244, + "step": 8742 + }, + { + "epoch": 0.29157803775591595, + "grad_norm": 1.4067268371582031, + "learning_rate": 4.2679106193969866e-05, + "loss": 1.5012, + "step": 8773 + }, + { + "epoch": 0.2926083488433927, + "grad_norm": 1.5192064046859741, + "learning_rate": 4.261962684116106e-05, + "loss": 1.521, + "step": 8804 + }, + { + "epoch": 0.29363865993086946, + "grad_norm": 1.3847788572311401, + "learning_rate": 4.2559948653258145e-05, + "loss": 1.5128, + "step": 8835 + }, + { + "epoch": 0.29466897101834616, + "grad_norm": 1.4612780809402466, + "learning_rate": 4.250007230372134e-05, + "loss": 1.5371, + "step": 8866 + }, + { + "epoch": 0.2956992821058229, + "grad_norm": 1.468971610069275, + "learning_rate": 4.2439998468247126e-05, + "loss": 1.5199, + "step": 8897 + }, + { + "epoch": 0.29672959319329967, + "grad_norm": 1.386236310005188, + "learning_rate": 4.2379727824760566e-05, + "loss": 1.5273, + "step": 8928 + }, + { + "epoch": 0.29775990428077637, + "grad_norm": 1.3843929767608643, + "learning_rate": 4.231926105340768e-05, + "loss": 1.5011, + "step": 8959 + }, + { + "epoch": 0.2987902153682531, + "grad_norm": 1.4554557800292969, + "learning_rate": 4.225859883654776e-05, + "loss": 1.5311, + "step": 8990 + }, + { + "epoch": 0.2998205264557299, + "grad_norm": 1.3674421310424805, + "learning_rate": 4.219774185874569e-05, + "loss": 1.5302, + "step": 9021 + }, + { + "epoch": 0.3008508375432066, + "grad_norm": 1.3804330825805664, + "learning_rate": 4.213669080676418e-05, + "loss": 1.538, + "step": 9052 + }, + { + "epoch": 0.3018811486306833, + "grad_norm": 1.4643255472183228, + "learning_rate": 4.2075446369556056e-05, + "loss": 1.5172, + "step": 9083 + }, + { + "epoch": 0.3029114597181601, + "grad_norm": 1.3375928401947021, + "learning_rate": 4.201400923825648e-05, + "loss": 1.5123, + "step": 9114 + }, + { + "epoch": 0.3039417708056368, + "grad_norm": 1.4321980476379395, + "learning_rate": 4.195238010617511e-05, + "loss": 1.5196, + "step": 9145 + }, + { + "epoch": 0.30497208189311353, + "grad_norm": 1.4312376976013184, + "learning_rate": 4.1890559668788344e-05, + "loss": 1.5138, + "step": 9176 + }, + { + "epoch": 0.3060023929805903, + "grad_norm": 1.3089646100997925, + "learning_rate": 4.1828548623731405e-05, + "loss": 1.5027, + "step": 9207 + }, + { + "epoch": 0.307032704068067, + "grad_norm": 1.4863250255584717, + "learning_rate": 4.1766347670790506e-05, + "loss": 1.5091, + "step": 9238 + }, + { + "epoch": 0.30806301515554374, + "grad_norm": 1.373666763305664, + "learning_rate": 4.170395751189495e-05, + "loss": 1.5256, + "step": 9269 + }, + { + "epoch": 0.3090933262430205, + "grad_norm": 1.4160584211349487, + "learning_rate": 4.164137885110921e-05, + "loss": 1.4938, + "step": 9300 + }, + { + "epoch": 0.3101236373304972, + "grad_norm": 2.112110137939453, + "learning_rate": 4.157861239462495e-05, + "loss": 1.5106, + "step": 9331 + }, + { + "epoch": 0.31115394841797395, + "grad_norm": 1.337058663368225, + "learning_rate": 4.1515658850753114e-05, + "loss": 1.4999, + "step": 9362 + }, + { + "epoch": 0.3121842595054507, + "grad_norm": 1.3625296354293823, + "learning_rate": 4.145251892991588e-05, + "loss": 1.5136, + "step": 9393 + }, + { + "epoch": 0.3132145705929274, + "grad_norm": 1.399491548538208, + "learning_rate": 4.138919334463868e-05, + "loss": 1.499, + "step": 9424 + }, + { + "epoch": 0.31424488168040415, + "grad_norm": 1.4202344417572021, + "learning_rate": 4.1325682809542124e-05, + "loss": 1.5049, + "step": 9455 + }, + { + "epoch": 0.3152751927678809, + "grad_norm": 1.392248272895813, + "learning_rate": 4.126198804133398e-05, + "loss": 1.5287, + "step": 9486 + }, + { + "epoch": 0.3163055038553576, + "grad_norm": 1.3807618618011475, + "learning_rate": 4.1198109758801055e-05, + "loss": 1.5309, + "step": 9517 + }, + { + "epoch": 0.31733581494283436, + "grad_norm": 1.3117905855178833, + "learning_rate": 4.113404868280107e-05, + "loss": 1.4933, + "step": 9548 + }, + { + "epoch": 0.3183661260303111, + "grad_norm": 1.452086091041565, + "learning_rate": 4.106980553625457e-05, + "loss": 1.5221, + "step": 9579 + }, + { + "epoch": 0.3193964371177878, + "grad_norm": 1.477364182472229, + "learning_rate": 4.100538104413674e-05, + "loss": 1.4904, + "step": 9610 + }, + { + "epoch": 0.32042674820526457, + "grad_norm": 1.3584345579147339, + "learning_rate": 4.09407759334692e-05, + "loss": 1.4953, + "step": 9641 + }, + { + "epoch": 0.32145705929274127, + "grad_norm": 1.3619811534881592, + "learning_rate": 4.087599093331186e-05, + "loss": 1.4956, + "step": 9672 + }, + { + "epoch": 0.322487370380218, + "grad_norm": 1.4507052898406982, + "learning_rate": 4.081102677475462e-05, + "loss": 1.5197, + "step": 9703 + }, + { + "epoch": 0.3235176814676948, + "grad_norm": 1.4229698181152344, + "learning_rate": 4.0745884190909194e-05, + "loss": 1.498, + "step": 9734 + }, + { + "epoch": 0.32454799255517147, + "grad_norm": 1.3074679374694824, + "learning_rate": 4.0680563916900796e-05, + "loss": 1.5146, + "step": 9765 + }, + { + "epoch": 0.3255783036426482, + "grad_norm": 1.397815465927124, + "learning_rate": 4.0615066689859815e-05, + "loss": 1.5291, + "step": 9796 + }, + { + "epoch": 0.326608614730125, + "grad_norm": 1.3196336030960083, + "learning_rate": 4.0549393248913584e-05, + "loss": 1.5077, + "step": 9827 + }, + { + "epoch": 0.3276389258176017, + "grad_norm": 1.3129957914352417, + "learning_rate": 4.048354433517794e-05, + "loss": 1.4965, + "step": 9858 + }, + { + "epoch": 0.32866923690507843, + "grad_norm": 1.4380089044570923, + "learning_rate": 4.0417520691748916e-05, + "loss": 1.5115, + "step": 9889 + }, + { + "epoch": 0.3296995479925552, + "grad_norm": 1.3162370920181274, + "learning_rate": 4.035132306369438e-05, + "loss": 1.5029, + "step": 9920 + }, + { + "epoch": 0.3307298590800319, + "grad_norm": 1.3739668130874634, + "learning_rate": 4.028495219804555e-05, + "loss": 1.5083, + "step": 9951 + }, + { + "epoch": 0.33176017016750864, + "grad_norm": 1.3673723936080933, + "learning_rate": 4.021840884378864e-05, + "loss": 1.5223, + "step": 9982 + }, + { + "epoch": 0.3327904812549854, + "grad_norm": 1.3970317840576172, + "learning_rate": 4.015169375185633e-05, + "loss": 1.5003, + "step": 10013 + }, + { + "epoch": 0.3338207923424621, + "grad_norm": 1.2982394695281982, + "learning_rate": 4.0084807675119396e-05, + "loss": 1.5066, + "step": 10044 + }, + { + "epoch": 0.33485110342993885, + "grad_norm": 1.4548689126968384, + "learning_rate": 4.0017751368378106e-05, + "loss": 1.4993, + "step": 10075 + }, + { + "epoch": 0.3358814145174156, + "grad_norm": 1.3693586587905884, + "learning_rate": 3.995052558835377e-05, + "loss": 1.4987, + "step": 10106 + }, + { + "epoch": 0.3369117256048923, + "grad_norm": 1.4046767950057983, + "learning_rate": 3.988313109368017e-05, + "loss": 1.5098, + "step": 10137 + }, + { + "epoch": 0.33794203669236905, + "grad_norm": 1.3772069215774536, + "learning_rate": 3.981556864489504e-05, + "loss": 1.5165, + "step": 10168 + }, + { + "epoch": 0.3389723477798458, + "grad_norm": 1.471211314201355, + "learning_rate": 3.974783900443142e-05, + "loss": 1.5037, + "step": 10199 + }, + { + "epoch": 0.3400026588673225, + "grad_norm": 1.3990979194641113, + "learning_rate": 3.9679942936609095e-05, + "loss": 1.5096, + "step": 10230 + }, + { + "epoch": 0.34103296995479926, + "grad_norm": 1.3779234886169434, + "learning_rate": 3.961188120762596e-05, + "loss": 1.4914, + "step": 10261 + }, + { + "epoch": 0.342063281042276, + "grad_norm": 1.2866768836975098, + "learning_rate": 3.954365458554938e-05, + "loss": 1.5026, + "step": 10292 + }, + { + "epoch": 0.3430935921297527, + "grad_norm": 1.353468894958496, + "learning_rate": 3.947526384030751e-05, + "loss": 1.5063, + "step": 10323 + }, + { + "epoch": 0.34412390321722947, + "grad_norm": 1.3264256715774536, + "learning_rate": 3.9406709743680624e-05, + "loss": 1.4911, + "step": 10354 + }, + { + "epoch": 0.3451542143047062, + "grad_norm": 1.3496876955032349, + "learning_rate": 3.9337993069292366e-05, + "loss": 1.4921, + "step": 10385 + }, + { + "epoch": 0.3461845253921829, + "grad_norm": 1.3812434673309326, + "learning_rate": 3.926911459260109e-05, + "loss": 1.4826, + "step": 10416 + }, + { + "epoch": 0.34721483647965967, + "grad_norm": 1.4926965236663818, + "learning_rate": 3.920007509089102e-05, + "loss": 1.4994, + "step": 10447 + }, + { + "epoch": 0.3482451475671364, + "grad_norm": 1.3446170091629028, + "learning_rate": 3.913087534326357e-05, + "loss": 1.5114, + "step": 10478 + }, + { + "epoch": 0.3492754586546131, + "grad_norm": 1.3100495338439941, + "learning_rate": 3.9061516130628475e-05, + "loss": 1.5066, + "step": 10509 + }, + { + "epoch": 0.3503057697420899, + "grad_norm": 1.395874261856079, + "learning_rate": 3.8991998235695025e-05, + "loss": 1.4999, + "step": 10540 + }, + { + "epoch": 0.3513360808295666, + "grad_norm": 1.3682137727737427, + "learning_rate": 3.8922322442963224e-05, + "loss": 1.4778, + "step": 10571 + }, + { + "epoch": 0.35236639191704333, + "grad_norm": 1.4196573495864868, + "learning_rate": 3.885248953871491e-05, + "loss": 1.4909, + "step": 10602 + }, + { + "epoch": 0.3533967030045201, + "grad_norm": 1.4299864768981934, + "learning_rate": 3.8782500311004915e-05, + "loss": 1.5025, + "step": 10633 + }, + { + "epoch": 0.3544270140919968, + "grad_norm": 1.39677095413208, + "learning_rate": 3.871235554965218e-05, + "loss": 1.4932, + "step": 10664 + }, + { + "epoch": 0.35545732517947354, + "grad_norm": 1.3219736814498901, + "learning_rate": 3.864205604623078e-05, + "loss": 1.4795, + "step": 10695 + }, + { + "epoch": 0.3564876362669503, + "grad_norm": 1.3649324178695679, + "learning_rate": 3.857160259406107e-05, + "loss": 1.4838, + "step": 10726 + }, + { + "epoch": 0.357517947354427, + "grad_norm": 1.4109989404678345, + "learning_rate": 3.8500995988200674e-05, + "loss": 1.5058, + "step": 10757 + }, + { + "epoch": 0.35854825844190374, + "grad_norm": 1.3625038862228394, + "learning_rate": 3.843023702543556e-05, + "loss": 1.4912, + "step": 10788 + }, + { + "epoch": 0.3595785695293805, + "grad_norm": 1.4725775718688965, + "learning_rate": 3.8359326504270984e-05, + "loss": 1.5012, + "step": 10819 + }, + { + "epoch": 0.3606088806168572, + "grad_norm": 1.4126085042953491, + "learning_rate": 3.828826522492255e-05, + "loss": 1.4977, + "step": 10850 + }, + { + "epoch": 0.36163919170433395, + "grad_norm": 1.3949086666107178, + "learning_rate": 3.821705398930713e-05, + "loss": 1.4903, + "step": 10881 + }, + { + "epoch": 0.3626695027918107, + "grad_norm": 1.286792516708374, + "learning_rate": 3.814569360103385e-05, + "loss": 1.5067, + "step": 10912 + }, + { + "epoch": 0.3636998138792874, + "grad_norm": 1.274703025817871, + "learning_rate": 3.807418486539499e-05, + "loss": 1.4583, + "step": 10943 + }, + { + "epoch": 0.36473012496676416, + "grad_norm": 1.401455283164978, + "learning_rate": 3.80025285893569e-05, + "loss": 1.4834, + "step": 10974 + }, + { + "epoch": 0.3657604360542409, + "grad_norm": 1.308361530303955, + "learning_rate": 3.793072558155093e-05, + "loss": 1.4832, + "step": 11005 + }, + { + "epoch": 0.3667907471417176, + "grad_norm": 1.654733419418335, + "learning_rate": 3.785877665226426e-05, + "loss": 1.4867, + "step": 11036 + }, + { + "epoch": 0.36782105822919436, + "grad_norm": 1.3530856370925903, + "learning_rate": 3.778668261343079e-05, + "loss": 1.4873, + "step": 11067 + }, + { + "epoch": 0.3688513693166711, + "grad_norm": 1.3567407131195068, + "learning_rate": 3.771444427862192e-05, + "loss": 1.4935, + "step": 11098 + }, + { + "epoch": 0.3698816804041478, + "grad_norm": 1.3184572458267212, + "learning_rate": 3.7642062463037465e-05, + "loss": 1.4891, + "step": 11129 + }, + { + "epoch": 0.37091199149162457, + "grad_norm": 1.366489291191101, + "learning_rate": 3.7569537983496373e-05, + "loss": 1.5159, + "step": 11160 + }, + { + "epoch": 0.3719423025791013, + "grad_norm": 1.423258662223816, + "learning_rate": 3.749687165842753e-05, + "loss": 1.4938, + "step": 11191 + }, + { + "epoch": 0.372972613666578, + "grad_norm": 1.3226194381713867, + "learning_rate": 3.7424064307860536e-05, + "loss": 1.499, + "step": 11222 + }, + { + "epoch": 0.3740029247540548, + "grad_norm": 1.350500464439392, + "learning_rate": 3.735111675341645e-05, + "loss": 1.4952, + "step": 11253 + }, + { + "epoch": 0.37503323584153153, + "grad_norm": 1.3667839765548706, + "learning_rate": 3.7278029818298524e-05, + "loss": 1.4763, + "step": 11284 + }, + { + "epoch": 0.37606354692900823, + "grad_norm": 1.4876132011413574, + "learning_rate": 3.720480432728287e-05, + "loss": 1.4913, + "step": 11315 + }, + { + "epoch": 0.377093858016485, + "grad_norm": 1.3927743434906006, + "learning_rate": 3.71314411067092e-05, + "loss": 1.4948, + "step": 11346 + }, + { + "epoch": 0.37812416910396174, + "grad_norm": 1.3752413988113403, + "learning_rate": 3.70579409844715e-05, + "loss": 1.4763, + "step": 11377 + }, + { + "epoch": 0.37915448019143844, + "grad_norm": 1.3530951738357544, + "learning_rate": 3.698430479000865e-05, + "loss": 1.5077, + "step": 11408 + }, + { + "epoch": 0.3801847912789152, + "grad_norm": 1.4309345483779907, + "learning_rate": 3.691053335429509e-05, + "loss": 1.4945, + "step": 11439 + }, + { + "epoch": 0.38121510236639194, + "grad_norm": 1.2874380350112915, + "learning_rate": 3.683662750983147e-05, + "loss": 1.4698, + "step": 11470 + }, + { + "epoch": 0.38224541345386864, + "grad_norm": 1.3356250524520874, + "learning_rate": 3.676258809063518e-05, + "loss": 1.4924, + "step": 11501 + }, + { + "epoch": 0.3832757245413454, + "grad_norm": 1.304559588432312, + "learning_rate": 3.6688415932231004e-05, + "loss": 1.4682, + "step": 11532 + }, + { + "epoch": 0.3843060356288221, + "grad_norm": 1.4153447151184082, + "learning_rate": 3.661411187164166e-05, + "loss": 1.4989, + "step": 11563 + }, + { + "epoch": 0.38533634671629885, + "grad_norm": 1.356992244720459, + "learning_rate": 3.65396767473784e-05, + "loss": 1.4854, + "step": 11594 + }, + { + "epoch": 0.3863666578037756, + "grad_norm": 1.322449803352356, + "learning_rate": 3.6465111399431465e-05, + "loss": 1.4877, + "step": 11625 + }, + { + "epoch": 0.3873969688912523, + "grad_norm": 1.3981350660324097, + "learning_rate": 3.6390416669260674e-05, + "loss": 1.499, + "step": 11656 + }, + { + "epoch": 0.38842727997872906, + "grad_norm": 1.324871301651001, + "learning_rate": 3.63155933997859e-05, + "loss": 1.4814, + "step": 11687 + }, + { + "epoch": 0.3894575910662058, + "grad_norm": 1.3940790891647339, + "learning_rate": 3.624064243537758e-05, + "loss": 1.4754, + "step": 11718 + }, + { + "epoch": 0.3904879021536825, + "grad_norm": 1.2880780696868896, + "learning_rate": 3.616556462184716e-05, + "loss": 1.4832, + "step": 11749 + }, + { + "epoch": 0.39151821324115926, + "grad_norm": 1.315329670906067, + "learning_rate": 3.609036080643755e-05, + "loss": 1.4853, + "step": 11780 + }, + { + "epoch": 0.392548524328636, + "grad_norm": 1.4093523025512695, + "learning_rate": 3.60150318378136e-05, + "loss": 1.4978, + "step": 11811 + }, + { + "epoch": 0.3935788354161127, + "grad_norm": 1.271151065826416, + "learning_rate": 3.5939578566052465e-05, + "loss": 1.4933, + "step": 11842 + }, + { + "epoch": 0.39460914650358947, + "grad_norm": 1.2910923957824707, + "learning_rate": 3.586400184263408e-05, + "loss": 1.4853, + "step": 11873 + }, + { + "epoch": 0.3956394575910662, + "grad_norm": 1.2480064630508423, + "learning_rate": 3.578830252043148e-05, + "loss": 1.4642, + "step": 11904 + }, + { + "epoch": 0.3966697686785429, + "grad_norm": 1.263197422027588, + "learning_rate": 3.571248145370125e-05, + "loss": 1.4812, + "step": 11935 + }, + { + "epoch": 0.3977000797660197, + "grad_norm": 1.3231288194656372, + "learning_rate": 3.5636539498073794e-05, + "loss": 1.4744, + "step": 11966 + }, + { + "epoch": 0.39873039085349643, + "grad_norm": 1.3933110237121582, + "learning_rate": 3.556047751054378e-05, + "loss": 1.4849, + "step": 11997 + }, + { + "epoch": 0.39976070194097313, + "grad_norm": 1.3615801334381104, + "learning_rate": 3.548429634946039e-05, + "loss": 1.4866, + "step": 12028 + }, + { + "epoch": 0.4007910130284499, + "grad_norm": 1.298638939857483, + "learning_rate": 3.540799687451768e-05, + "loss": 1.4664, + "step": 12059 + }, + { + "epoch": 0.40182132411592664, + "grad_norm": 1.29216468334198, + "learning_rate": 3.533157994674485e-05, + "loss": 1.4697, + "step": 12090 + }, + { + "epoch": 0.40285163520340334, + "grad_norm": 1.3759845495224, + "learning_rate": 3.5255046428496546e-05, + "loss": 1.4854, + "step": 12121 + }, + { + "epoch": 0.4038819462908801, + "grad_norm": 1.4045615196228027, + "learning_rate": 3.517839718344311e-05, + "loss": 1.4622, + "step": 12152 + }, + { + "epoch": 0.40491225737835684, + "grad_norm": 1.2979034185409546, + "learning_rate": 3.510163307656086e-05, + "loss": 1.4797, + "step": 12183 + }, + { + "epoch": 0.40594256846583354, + "grad_norm": 1.303139567375183, + "learning_rate": 3.5024754974122324e-05, + "loss": 1.4588, + "step": 12214 + }, + { + "epoch": 0.4069728795533103, + "grad_norm": 1.287781834602356, + "learning_rate": 3.494776374368643e-05, + "loss": 1.4834, + "step": 12245 + }, + { + "epoch": 0.40800319064078705, + "grad_norm": 1.3806688785552979, + "learning_rate": 3.4870660254088724e-05, + "loss": 1.4807, + "step": 12276 + }, + { + "epoch": 0.40903350172826375, + "grad_norm": 1.4059745073318481, + "learning_rate": 3.479344537543164e-05, + "loss": 1.4906, + "step": 12307 + }, + { + "epoch": 0.4100638128157405, + "grad_norm": 1.3052942752838135, + "learning_rate": 3.4716119979074565e-05, + "loss": 1.4801, + "step": 12338 + }, + { + "epoch": 0.41109412390321726, + "grad_norm": 1.3306844234466553, + "learning_rate": 3.463868493762412e-05, + "loss": 1.4911, + "step": 12369 + }, + { + "epoch": 0.41212443499069396, + "grad_norm": 1.3276656866073608, + "learning_rate": 3.456114112492418e-05, + "loss": 1.4678, + "step": 12400 + }, + { + "epoch": 0.4131547460781707, + "grad_norm": 1.3164253234863281, + "learning_rate": 3.4483489416046164e-05, + "loss": 1.4816, + "step": 12431 + }, + { + "epoch": 0.4141850571656474, + "grad_norm": 1.3827886581420898, + "learning_rate": 3.440573068727905e-05, + "loss": 1.481, + "step": 12462 + }, + { + "epoch": 0.41521536825312416, + "grad_norm": 1.2899463176727295, + "learning_rate": 3.4327865816119495e-05, + "loss": 1.4575, + "step": 12493 + }, + { + "epoch": 0.4162456793406009, + "grad_norm": 1.3136677742004395, + "learning_rate": 3.4249895681262025e-05, + "loss": 1.4695, + "step": 12524 + }, + { + "epoch": 0.4172759904280776, + "grad_norm": 1.2920372486114502, + "learning_rate": 3.417182116258899e-05, + "loss": 1.4765, + "step": 12555 + }, + { + "epoch": 0.41830630151555437, + "grad_norm": 1.3285510540008545, + "learning_rate": 3.409364314116074e-05, + "loss": 1.4559, + "step": 12586 + }, + { + "epoch": 0.4193366126030311, + "grad_norm": 1.2834984064102173, + "learning_rate": 3.401536249920559e-05, + "loss": 1.4706, + "step": 12617 + }, + { + "epoch": 0.4203669236905078, + "grad_norm": 1.315942645072937, + "learning_rate": 3.393698012010998e-05, + "loss": 1.4692, + "step": 12648 + }, + { + "epoch": 0.4213972347779846, + "grad_norm": 1.3668091297149658, + "learning_rate": 3.385849688840839e-05, + "loss": 1.4801, + "step": 12679 + }, + { + "epoch": 0.42242754586546133, + "grad_norm": 1.312280297279358, + "learning_rate": 3.3779913689773414e-05, + "loss": 1.4673, + "step": 12710 + }, + { + "epoch": 0.423457856952938, + "grad_norm": 1.3579858541488647, + "learning_rate": 3.370123141100578e-05, + "loss": 1.4578, + "step": 12741 + }, + { + "epoch": 0.4244881680404148, + "grad_norm": 1.4001456499099731, + "learning_rate": 3.3622450940024305e-05, + "loss": 1.4787, + "step": 12772 + }, + { + "epoch": 0.42551847912789154, + "grad_norm": 1.352629542350769, + "learning_rate": 3.35435731658559e-05, + "loss": 1.457, + "step": 12803 + }, + { + "epoch": 0.42654879021536823, + "grad_norm": 1.4044222831726074, + "learning_rate": 3.346459897862552e-05, + "loss": 1.4979, + "step": 12834 + }, + { + "epoch": 0.427579101302845, + "grad_norm": 1.2666436433792114, + "learning_rate": 3.338552926954613e-05, + "loss": 1.4712, + "step": 12865 + }, + { + "epoch": 0.42860941239032174, + "grad_norm": 1.2487694025039673, + "learning_rate": 3.330636493090868e-05, + "loss": 1.4784, + "step": 12896 + }, + { + "epoch": 0.42963972347779844, + "grad_norm": 1.2346290349960327, + "learning_rate": 3.322710685607193e-05, + "loss": 1.4754, + "step": 12927 + }, + { + "epoch": 0.4306700345652752, + "grad_norm": 1.2908893823623657, + "learning_rate": 3.314775593945251e-05, + "loss": 1.4677, + "step": 12958 + }, + { + "epoch": 0.43170034565275195, + "grad_norm": 1.3283506631851196, + "learning_rate": 3.3068313076514714e-05, + "loss": 1.4661, + "step": 12989 + }, + { + "epoch": 0.43273065674022865, + "grad_norm": 1.2982537746429443, + "learning_rate": 3.298877916376047e-05, + "loss": 1.4838, + "step": 13020 + }, + { + "epoch": 0.4337609678277054, + "grad_norm": 1.3566454648971558, + "learning_rate": 3.290915509871915e-05, + "loss": 1.4683, + "step": 13051 + }, + { + "epoch": 0.43479127891518216, + "grad_norm": 1.3470877408981323, + "learning_rate": 3.282944177993753e-05, + "loss": 1.4724, + "step": 13082 + }, + { + "epoch": 0.43582159000265885, + "grad_norm": 1.451150894165039, + "learning_rate": 3.274964010696957e-05, + "loss": 1.4731, + "step": 13113 + }, + { + "epoch": 0.4368519010901356, + "grad_norm": 1.3415958881378174, + "learning_rate": 3.266975098036629e-05, + "loss": 1.4809, + "step": 13144 + }, + { + "epoch": 0.43788221217761236, + "grad_norm": 1.2775352001190186, + "learning_rate": 3.258977530166562e-05, + "loss": 1.4523, + "step": 13175 + }, + { + "epoch": 0.43891252326508906, + "grad_norm": 1.365050196647644, + "learning_rate": 3.250971397338227e-05, + "loss": 1.4611, + "step": 13206 + }, + { + "epoch": 0.4399428343525658, + "grad_norm": 1.3481686115264893, + "learning_rate": 3.2429567898997404e-05, + "loss": 1.4708, + "step": 13237 + }, + { + "epoch": 0.44097314544004257, + "grad_norm": 1.3418121337890625, + "learning_rate": 3.234933798294859e-05, + "loss": 1.485, + "step": 13268 + }, + { + "epoch": 0.44200345652751927, + "grad_norm": 1.3098441362380981, + "learning_rate": 3.2269025130619535e-05, + "loss": 1.472, + "step": 13299 + }, + { + "epoch": 0.443033767614996, + "grad_norm": 1.2792437076568604, + "learning_rate": 3.218863024832985e-05, + "loss": 1.4592, + "step": 13330 + }, + { + "epoch": 0.4440640787024727, + "grad_norm": 1.3804035186767578, + "learning_rate": 3.2108154243324864e-05, + "loss": 1.4546, + "step": 13361 + }, + { + "epoch": 0.4450943897899495, + "grad_norm": 1.287787675857544, + "learning_rate": 3.2027598023765345e-05, + "loss": 1.4477, + "step": 13392 + }, + { + "epoch": 0.44612470087742623, + "grad_norm": 1.5964646339416504, + "learning_rate": 3.194696249871729e-05, + "loss": 1.4468, + "step": 13423 + }, + { + "epoch": 0.4471550119649029, + "grad_norm": 1.3253474235534668, + "learning_rate": 3.186624857814164e-05, + "loss": 1.4588, + "step": 13454 + }, + { + "epoch": 0.4481853230523797, + "grad_norm": 1.288176417350769, + "learning_rate": 3.178545717288401e-05, + "loss": 1.4644, + "step": 13485 + }, + { + "epoch": 0.44921563413985643, + "grad_norm": 1.3357142210006714, + "learning_rate": 3.170458919466444e-05, + "loss": 1.4871, + "step": 13516 + }, + { + "epoch": 0.45024594522733313, + "grad_norm": 1.2954436540603638, + "learning_rate": 3.1623645556067063e-05, + "loss": 1.4571, + "step": 13547 + }, + { + "epoch": 0.4512762563148099, + "grad_norm": 1.344789981842041, + "learning_rate": 3.154262717052985e-05, + "loss": 1.459, + "step": 13578 + }, + { + "epoch": 0.45230656740228664, + "grad_norm": 1.2648475170135498, + "learning_rate": 3.146153495233426e-05, + "loss": 1.4496, + "step": 13609 + }, + { + "epoch": 0.45333687848976334, + "grad_norm": 1.312733769416809, + "learning_rate": 3.1380369816594944e-05, + "loss": 1.4309, + "step": 13640 + }, + { + "epoch": 0.4543671895772401, + "grad_norm": 1.3719325065612793, + "learning_rate": 3.129913267924946e-05, + "loss": 1.4723, + "step": 13671 + }, + { + "epoch": 0.45539750066471685, + "grad_norm": 1.2850617170333862, + "learning_rate": 3.121782445704782e-05, + "loss": 1.4599, + "step": 13702 + }, + { + "epoch": 0.45642781175219355, + "grad_norm": 1.3335177898406982, + "learning_rate": 3.11364460675423e-05, + "loss": 1.4821, + "step": 13733 + }, + { + "epoch": 0.4574581228396703, + "grad_norm": 1.1675069332122803, + "learning_rate": 3.1054998429076934e-05, + "loss": 1.453, + "step": 13764 + }, + { + "epoch": 0.45848843392714705, + "grad_norm": 1.283544898033142, + "learning_rate": 3.097348246077728e-05, + "loss": 1.4545, + "step": 13795 + }, + { + "epoch": 0.45951874501462375, + "grad_norm": 1.4358693361282349, + "learning_rate": 3.0891899082539924e-05, + "loss": 1.4673, + "step": 13826 + }, + { + "epoch": 0.4605490561021005, + "grad_norm": 1.2551497220993042, + "learning_rate": 3.0810249215022233e-05, + "loss": 1.4532, + "step": 13857 + }, + { + "epoch": 0.46157936718957726, + "grad_norm": 1.2574602365493774, + "learning_rate": 3.0728533779631865e-05, + "loss": 1.4762, + "step": 13888 + }, + { + "epoch": 0.46260967827705396, + "grad_norm": 1.2202764749526978, + "learning_rate": 3.064675369851637e-05, + "loss": 1.4461, + "step": 13919 + }, + { + "epoch": 0.4636399893645307, + "grad_norm": 1.2787501811981201, + "learning_rate": 3.056490989455289e-05, + "loss": 1.4607, + "step": 13950 + }, + { + "epoch": 0.46467030045200747, + "grad_norm": 1.2511006593704224, + "learning_rate": 3.0483003291337596e-05, + "loss": 1.4548, + "step": 13981 + }, + { + "epoch": 0.46570061153948417, + "grad_norm": 1.2749834060668945, + "learning_rate": 3.040103481317539e-05, + "loss": 1.4394, + "step": 14012 + }, + { + "epoch": 0.4667309226269609, + "grad_norm": 1.223057746887207, + "learning_rate": 3.03190053850694e-05, + "loss": 1.4684, + "step": 14043 + }, + { + "epoch": 0.4677612337144377, + "grad_norm": 1.39846932888031, + "learning_rate": 3.0236915932710573e-05, + "loss": 1.4657, + "step": 14074 + }, + { + "epoch": 0.4687915448019144, + "grad_norm": 1.5305665731430054, + "learning_rate": 3.0154767382467232e-05, + "loss": 1.4795, + "step": 14105 + }, + { + "epoch": 0.4698218558893911, + "grad_norm": 1.2569035291671753, + "learning_rate": 3.0072560661374582e-05, + "loss": 1.4756, + "step": 14136 + }, + { + "epoch": 0.4708521669768679, + "grad_norm": 1.3472824096679688, + "learning_rate": 2.999029669712431e-05, + "loss": 1.4682, + "step": 14167 + }, + { + "epoch": 0.4718824780643446, + "grad_norm": 1.271714210510254, + "learning_rate": 2.990797641805408e-05, + "loss": 1.4509, + "step": 14198 + }, + { + "epoch": 0.47291278915182133, + "grad_norm": 1.3342047929763794, + "learning_rate": 2.982560075313704e-05, + "loss": 1.4528, + "step": 14229 + }, + { + "epoch": 0.47394310023929803, + "grad_norm": 1.5821506977081299, + "learning_rate": 2.9743170631971368e-05, + "loss": 1.4609, + "step": 14260 + }, + { + "epoch": 0.4749734113267748, + "grad_norm": 1.2598062753677368, + "learning_rate": 2.9660686984769792e-05, + "loss": 1.471, + "step": 14291 + }, + { + "epoch": 0.47600372241425154, + "grad_norm": 1.2648885250091553, + "learning_rate": 2.9578150742349047e-05, + "loss": 1.4708, + "step": 14322 + }, + { + "epoch": 0.47703403350172824, + "grad_norm": 1.559665560722351, + "learning_rate": 2.949556283611942e-05, + "loss": 1.4516, + "step": 14353 + }, + { + "epoch": 0.478064344589205, + "grad_norm": 1.2621581554412842, + "learning_rate": 2.9412924198074206e-05, + "loss": 1.446, + "step": 14384 + }, + { + "epoch": 0.47909465567668175, + "grad_norm": 1.2775017023086548, + "learning_rate": 2.9330235760779208e-05, + "loss": 1.4496, + "step": 14415 + }, + { + "epoch": 0.48012496676415845, + "grad_norm": 1.2010388374328613, + "learning_rate": 2.9247498457362188e-05, + "loss": 1.4606, + "step": 14446 + }, + { + "epoch": 0.4811552778516352, + "grad_norm": 1.3053895235061646, + "learning_rate": 2.9164713221502373e-05, + "loss": 1.4536, + "step": 14477 + }, + { + "epoch": 0.48218558893911195, + "grad_norm": 1.311596155166626, + "learning_rate": 2.9081880987419912e-05, + "loss": 1.4409, + "step": 14508 + }, + { + "epoch": 0.48321590002658865, + "grad_norm": 1.3888933658599854, + "learning_rate": 2.8999002689865296e-05, + "loss": 1.4314, + "step": 14539 + }, + { + "epoch": 0.4842462111140654, + "grad_norm": 1.288619875907898, + "learning_rate": 2.8916079264108852e-05, + "loss": 1.4539, + "step": 14570 + }, + { + "epoch": 0.48527652220154216, + "grad_norm": 1.2974294424057007, + "learning_rate": 2.883311164593017e-05, + "loss": 1.4627, + "step": 14601 + }, + { + "epoch": 0.48630683328901886, + "grad_norm": 1.2057379484176636, + "learning_rate": 2.875010077160754e-05, + "loss": 1.4578, + "step": 14632 + }, + { + "epoch": 0.4873371443764956, + "grad_norm": 1.363971471786499, + "learning_rate": 2.866704757790741e-05, + "loss": 1.4671, + "step": 14663 + }, + { + "epoch": 0.48836745546397237, + "grad_norm": 1.2696925401687622, + "learning_rate": 2.858395300207376e-05, + "loss": 1.4333, + "step": 14694 + }, + { + "epoch": 0.48939776655144906, + "grad_norm": 1.2653478384017944, + "learning_rate": 2.8500817981817607e-05, + "loss": 1.4662, + "step": 14725 + }, + { + "epoch": 0.4904280776389258, + "grad_norm": 1.3011239767074585, + "learning_rate": 2.8417643455306336e-05, + "loss": 1.4589, + "step": 14756 + }, + { + "epoch": 0.4914583887264026, + "grad_norm": 1.3312432765960693, + "learning_rate": 2.8334430361153185e-05, + "loss": 1.4368, + "step": 14787 + }, + { + "epoch": 0.49248869981387927, + "grad_norm": 1.3015661239624023, + "learning_rate": 2.8251179638406612e-05, + "loss": 1.466, + "step": 14818 + }, + { + "epoch": 0.493519010901356, + "grad_norm": 1.3215759992599487, + "learning_rate": 2.8167892226539704e-05, + "loss": 1.4486, + "step": 14849 + }, + { + "epoch": 0.4945493219888328, + "grad_norm": 1.2909883260726929, + "learning_rate": 2.8084569065439588e-05, + "loss": 1.4433, + "step": 14880 + }, + { + "epoch": 0.4955796330763095, + "grad_norm": 1.364015817642212, + "learning_rate": 2.8001211095396807e-05, + "loss": 1.4449, + "step": 14911 + }, + { + "epoch": 0.49660994416378623, + "grad_norm": 1.2468819618225098, + "learning_rate": 2.791781925709473e-05, + "loss": 1.4572, + "step": 14942 + }, + { + "epoch": 0.497640255251263, + "grad_norm": 1.2739325761795044, + "learning_rate": 2.7834394491598908e-05, + "loss": 1.4478, + "step": 14973 + }, + { + "epoch": 0.4986705663387397, + "grad_norm": 1.3384937047958374, + "learning_rate": 2.7750937740346485e-05, + "loss": 1.4429, + "step": 15004 + }, + { + "epoch": 0.49970087742621644, + "grad_norm": 1.231088399887085, + "learning_rate": 2.7667449945135564e-05, + "loss": 1.4631, + "step": 15035 + }, + { + "epoch": 0.5007311885136931, + "grad_norm": 1.2262307405471802, + "learning_rate": 2.7583932048114557e-05, + "loss": 1.4508, + "step": 15066 + }, + { + "epoch": 0.5017614996011699, + "grad_norm": 1.3427774906158447, + "learning_rate": 2.7500384991771587e-05, + "loss": 1.4441, + "step": 15097 + }, + { + "epoch": 0.5027918106886466, + "grad_norm": 1.2950241565704346, + "learning_rate": 2.7416809718923825e-05, + "loss": 1.4427, + "step": 15128 + }, + { + "epoch": 0.5038221217761234, + "grad_norm": 1.4129016399383545, + "learning_rate": 2.7333207172706864e-05, + "loss": 1.4562, + "step": 15159 + }, + { + "epoch": 0.5048524328636002, + "grad_norm": 1.2751520872116089, + "learning_rate": 2.7249578296564088e-05, + "loss": 1.4517, + "step": 15190 + }, + { + "epoch": 0.5058827439510768, + "grad_norm": 1.302485466003418, + "learning_rate": 2.7165924034235973e-05, + "loss": 1.4327, + "step": 15221 + }, + { + "epoch": 0.5069130550385536, + "grad_norm": 1.295390009880066, + "learning_rate": 2.708224532974953e-05, + "loss": 1.4455, + "step": 15252 + }, + { + "epoch": 0.5079433661260303, + "grad_norm": 1.3160103559494019, + "learning_rate": 2.6998543127407538e-05, + "loss": 1.4556, + "step": 15283 + }, + { + "epoch": 0.5089736772135071, + "grad_norm": 1.2997361421585083, + "learning_rate": 2.6914818371777988e-05, + "loss": 1.444, + "step": 15314 + }, + { + "epoch": 0.5100039883009838, + "grad_norm": 1.2427833080291748, + "learning_rate": 2.6831072007683373e-05, + "loss": 1.4501, + "step": 15345 + }, + { + "epoch": 0.5110342993884606, + "grad_norm": 1.2402199506759644, + "learning_rate": 2.6747304980190018e-05, + "loss": 1.4543, + "step": 15376 + }, + { + "epoch": 0.5120646104759372, + "grad_norm": 1.2938770055770874, + "learning_rate": 2.6663518234597453e-05, + "loss": 1.4394, + "step": 15407 + }, + { + "epoch": 0.513094921563414, + "grad_norm": 1.1747736930847168, + "learning_rate": 2.6579712716427696e-05, + "loss": 1.4389, + "step": 15438 + }, + { + "epoch": 0.5141252326508907, + "grad_norm": 1.326824426651001, + "learning_rate": 2.6495889371414652e-05, + "loss": 1.4365, + "step": 15469 + }, + { + "epoch": 0.5151555437383675, + "grad_norm": 1.245665431022644, + "learning_rate": 2.6412049145493367e-05, + "loss": 1.4525, + "step": 15500 + }, + { + "epoch": 0.5161858548258442, + "grad_norm": 1.1753687858581543, + "learning_rate": 2.632819298478939e-05, + "loss": 1.447, + "step": 15531 + }, + { + "epoch": 0.517216165913321, + "grad_norm": 1.3870874643325806, + "learning_rate": 2.6244321835608105e-05, + "loss": 1.4577, + "step": 15562 + }, + { + "epoch": 0.5182464770007976, + "grad_norm": 1.2849411964416504, + "learning_rate": 2.6160436644424024e-05, + "loss": 1.4371, + "step": 15593 + }, + { + "epoch": 0.5192767880882744, + "grad_norm": 1.292443037033081, + "learning_rate": 2.6076538357870133e-05, + "loss": 1.4558, + "step": 15624 + }, + { + "epoch": 0.5203070991757511, + "grad_norm": 1.279961347579956, + "learning_rate": 2.5992627922727196e-05, + "loss": 1.4384, + "step": 15655 + }, + { + "epoch": 0.5213374102632279, + "grad_norm": 1.3141279220581055, + "learning_rate": 2.5908706285913066e-05, + "loss": 1.45, + "step": 15686 + }, + { + "epoch": 0.5223677213507046, + "grad_norm": 1.3931515216827393, + "learning_rate": 2.5824774394472008e-05, + "loss": 1.4403, + "step": 15717 + }, + { + "epoch": 0.5233980324381813, + "grad_norm": 1.2564170360565186, + "learning_rate": 2.5740833195563996e-05, + "loss": 1.4482, + "step": 15748 + }, + { + "epoch": 0.524428343525658, + "grad_norm": 1.5450046062469482, + "learning_rate": 2.5656883636454067e-05, + "loss": 1.4443, + "step": 15779 + }, + { + "epoch": 0.5254586546131348, + "grad_norm": 1.2659518718719482, + "learning_rate": 2.557292666450159e-05, + "loss": 1.4653, + "step": 15810 + }, + { + "epoch": 0.5264889657006115, + "grad_norm": 1.2940540313720703, + "learning_rate": 2.5488963227149566e-05, + "loss": 1.4302, + "step": 15841 + }, + { + "epoch": 0.5275192767880883, + "grad_norm": 1.2514533996582031, + "learning_rate": 2.5404994271913983e-05, + "loss": 1.4412, + "step": 15872 + }, + { + "epoch": 0.528549587875565, + "grad_norm": 1.2681846618652344, + "learning_rate": 2.5321020746373085e-05, + "loss": 1.4411, + "step": 15903 + }, + { + "epoch": 0.5295798989630417, + "grad_norm": 1.2581806182861328, + "learning_rate": 2.52370435981567e-05, + "loss": 1.4503, + "step": 15934 + }, + { + "epoch": 0.5306102100505184, + "grad_norm": 1.3299468755722046, + "learning_rate": 2.5153063774935533e-05, + "loss": 1.4392, + "step": 15965 + }, + { + "epoch": 0.5316405211379952, + "grad_norm": 1.240678310394287, + "learning_rate": 2.506908222441045e-05, + "loss": 1.4412, + "step": 15996 + }, + { + "epoch": 0.532670832225472, + "grad_norm": 1.337936520576477, + "learning_rate": 2.498509989430187e-05, + "loss": 1.4254, + "step": 16027 + }, + { + "epoch": 0.5337011433129487, + "grad_norm": 1.302909016609192, + "learning_rate": 2.4901117732338958e-05, + "loss": 1.4436, + "step": 16058 + }, + { + "epoch": 0.5347314544004255, + "grad_norm": 1.2539550065994263, + "learning_rate": 2.481713668624899e-05, + "loss": 1.4496, + "step": 16089 + }, + { + "epoch": 0.5357617654879021, + "grad_norm": 1.287431001663208, + "learning_rate": 2.4733157703746663e-05, + "loss": 1.424, + "step": 16120 + }, + { + "epoch": 0.5367920765753789, + "grad_norm": 1.5333632230758667, + "learning_rate": 2.4649181732523392e-05, + "loss": 1.4399, + "step": 16151 + }, + { + "epoch": 0.5378223876628556, + "grad_norm": 1.2591406106948853, + "learning_rate": 2.4565209720236582e-05, + "loss": 1.439, + "step": 16182 + }, + { + "epoch": 0.5388526987503324, + "grad_norm": 1.3093276023864746, + "learning_rate": 2.4481242614498975e-05, + "loss": 1.4279, + "step": 16213 + }, + { + "epoch": 0.5398830098378091, + "grad_norm": 1.2824875116348267, + "learning_rate": 2.439728136286796e-05, + "loss": 1.4428, + "step": 16244 + }, + { + "epoch": 0.5409133209252859, + "grad_norm": 1.2775593996047974, + "learning_rate": 2.4313326912834852e-05, + "loss": 1.4352, + "step": 16275 + }, + { + "epoch": 0.5419436320127625, + "grad_norm": 1.4667550325393677, + "learning_rate": 2.4229380211814206e-05, + "loss": 1.4633, + "step": 16306 + }, + { + "epoch": 0.5429739431002393, + "grad_norm": 1.2620900869369507, + "learning_rate": 2.4145442207133124e-05, + "loss": 1.4482, + "step": 16337 + }, + { + "epoch": 0.544004254187716, + "grad_norm": 1.3041224479675293, + "learning_rate": 2.406151384602059e-05, + "loss": 1.4431, + "step": 16368 + }, + { + "epoch": 0.5450345652751928, + "grad_norm": 1.3634989261627197, + "learning_rate": 2.3977596075596747e-05, + "loss": 1.4186, + "step": 16399 + }, + { + "epoch": 0.5460648763626695, + "grad_norm": 1.2322940826416016, + "learning_rate": 2.3893689842862223e-05, + "loss": 1.4322, + "step": 16430 + }, + { + "epoch": 0.5470951874501463, + "grad_norm": 1.5554733276367188, + "learning_rate": 2.3809796094687475e-05, + "loss": 1.4337, + "step": 16461 + }, + { + "epoch": 0.5481254985376229, + "grad_norm": 1.4745500087738037, + "learning_rate": 2.372591577780202e-05, + "loss": 1.4411, + "step": 16492 + }, + { + "epoch": 0.5491558096250997, + "grad_norm": 1.2865196466445923, + "learning_rate": 2.3642049838783838e-05, + "loss": 1.429, + "step": 16523 + }, + { + "epoch": 0.5501861207125764, + "grad_norm": 1.399247407913208, + "learning_rate": 2.3558199224048666e-05, + "loss": 1.4753, + "step": 16554 + }, + { + "epoch": 0.5512164318000532, + "grad_norm": 1.2135406732559204, + "learning_rate": 2.347436487983929e-05, + "loss": 1.4553, + "step": 16585 + }, + { + "epoch": 0.55224674288753, + "grad_norm": 1.164150357246399, + "learning_rate": 2.3390547752214888e-05, + "loss": 1.4268, + "step": 16616 + }, + { + "epoch": 0.5532770539750066, + "grad_norm": 1.2363818883895874, + "learning_rate": 2.330674878704035e-05, + "loss": 1.4381, + "step": 16647 + }, + { + "epoch": 0.5543073650624833, + "grad_norm": 1.286139726638794, + "learning_rate": 2.322296892997561e-05, + "loss": 1.4492, + "step": 16678 + }, + { + "epoch": 0.5553376761499601, + "grad_norm": 1.2836147546768188, + "learning_rate": 2.313920912646497e-05, + "loss": 1.4128, + "step": 16709 + }, + { + "epoch": 0.5563679872374369, + "grad_norm": 1.253727674484253, + "learning_rate": 2.305547032172643e-05, + "loss": 1.4472, + "step": 16740 + }, + { + "epoch": 0.5573982983249136, + "grad_norm": 1.2580201625823975, + "learning_rate": 2.2971753460741014e-05, + "loss": 1.4461, + "step": 16771 + }, + { + "epoch": 0.5584286094123904, + "grad_norm": 1.2446421384811401, + "learning_rate": 2.288805948824212e-05, + "loss": 1.4267, + "step": 16802 + }, + { + "epoch": 0.559458920499867, + "grad_norm": 1.3572150468826294, + "learning_rate": 2.2804389348704858e-05, + "loss": 1.4222, + "step": 16833 + }, + { + "epoch": 0.5604892315873438, + "grad_norm": 1.3694707155227661, + "learning_rate": 2.2720743986335374e-05, + "loss": 1.4624, + "step": 16864 + }, + { + "epoch": 0.5615195426748205, + "grad_norm": 1.2654088735580444, + "learning_rate": 2.2637124345060233e-05, + "loss": 1.4379, + "step": 16895 + }, + { + "epoch": 0.5625498537622973, + "grad_norm": 1.3349469900131226, + "learning_rate": 2.2553531368515695e-05, + "loss": 1.4404, + "step": 16926 + }, + { + "epoch": 0.563580164849774, + "grad_norm": 1.2259774208068848, + "learning_rate": 2.2469966000037144e-05, + "loss": 1.4335, + "step": 16957 + }, + { + "epoch": 0.5646104759372508, + "grad_norm": 1.2973053455352783, + "learning_rate": 2.2386429182648417e-05, + "loss": 1.4397, + "step": 16988 + }, + { + "epoch": 0.5656407870247274, + "grad_norm": 1.2674601078033447, + "learning_rate": 2.230292185905114e-05, + "loss": 1.4256, + "step": 17019 + }, + { + "epoch": 0.5666710981122042, + "grad_norm": 1.243605136871338, + "learning_rate": 2.2219444971614116e-05, + "loss": 1.4404, + "step": 17050 + }, + { + "epoch": 0.5677014091996809, + "grad_norm": 1.2108361721038818, + "learning_rate": 2.2135999462362655e-05, + "loss": 1.4318, + "step": 17081 + }, + { + "epoch": 0.5687317202871577, + "grad_norm": 1.2497962713241577, + "learning_rate": 2.2052586272968003e-05, + "loss": 1.4409, + "step": 17112 + }, + { + "epoch": 0.5697620313746344, + "grad_norm": 1.2269086837768555, + "learning_rate": 2.196920634473666e-05, + "loss": 1.4417, + "step": 17143 + }, + { + "epoch": 0.5707923424621112, + "grad_norm": 1.3165903091430664, + "learning_rate": 2.1885860618599787e-05, + "loss": 1.4541, + "step": 17174 + }, + { + "epoch": 0.5718226535495878, + "grad_norm": 1.2117608785629272, + "learning_rate": 2.1802550035102577e-05, + "loss": 1.4457, + "step": 17205 + }, + { + "epoch": 0.5728529646370646, + "grad_norm": 1.2482073307037354, + "learning_rate": 2.171927553439363e-05, + "loss": 1.4408, + "step": 17236 + }, + { + "epoch": 0.5738832757245413, + "grad_norm": 1.2258682250976562, + "learning_rate": 2.1636038056214376e-05, + "loss": 1.4366, + "step": 17267 + }, + { + "epoch": 0.5749135868120181, + "grad_norm": 1.254062294960022, + "learning_rate": 2.155283853988844e-05, + "loss": 1.4187, + "step": 17298 + }, + { + "epoch": 0.5759438978994948, + "grad_norm": 1.3397905826568604, + "learning_rate": 2.146967792431106e-05, + "loss": 1.4316, + "step": 17329 + }, + { + "epoch": 0.5769742089869716, + "grad_norm": 1.3253263235092163, + "learning_rate": 2.138655714793849e-05, + "loss": 1.4361, + "step": 17360 + }, + { + "epoch": 0.5780045200744482, + "grad_norm": 1.2624903917312622, + "learning_rate": 2.1303477148777367e-05, + "loss": 1.4136, + "step": 17391 + }, + { + "epoch": 0.579034831161925, + "grad_norm": 1.3255977630615234, + "learning_rate": 2.122043886437421e-05, + "loss": 1.4552, + "step": 17422 + }, + { + "epoch": 0.5800651422494018, + "grad_norm": 1.300898790359497, + "learning_rate": 2.1137443231804765e-05, + "loss": 1.4152, + "step": 17453 + }, + { + "epoch": 0.5810954533368785, + "grad_norm": 1.2904343605041504, + "learning_rate": 2.105449118766347e-05, + "loss": 1.4195, + "step": 17484 + }, + { + "epoch": 0.5821257644243553, + "grad_norm": 1.3146878480911255, + "learning_rate": 2.097158366805287e-05, + "loss": 1.426, + "step": 17515 + }, + { + "epoch": 0.5831560755118319, + "grad_norm": 1.2454010248184204, + "learning_rate": 2.0888721608573047e-05, + "loss": 1.4239, + "step": 17546 + }, + { + "epoch": 0.5841863865993087, + "grad_norm": 1.194626808166504, + "learning_rate": 2.0805905944311087e-05, + "loss": 1.4416, + "step": 17577 + }, + { + "epoch": 0.5852166976867854, + "grad_norm": 1.359053373336792, + "learning_rate": 2.0723137609830497e-05, + "loss": 1.4112, + "step": 17608 + }, + { + "epoch": 0.5862470087742622, + "grad_norm": 1.2577933073043823, + "learning_rate": 2.0640417539160686e-05, + "loss": 1.4432, + "step": 17639 + }, + { + "epoch": 0.5872773198617389, + "grad_norm": 1.2604849338531494, + "learning_rate": 2.0557746665786427e-05, + "loss": 1.4184, + "step": 17670 + }, + { + "epoch": 0.5883076309492157, + "grad_norm": 1.2511252164840698, + "learning_rate": 2.0475125922637256e-05, + "loss": 1.4276, + "step": 17701 + }, + { + "epoch": 0.5893379420366923, + "grad_norm": 1.2841278314590454, + "learning_rate": 2.0392556242077047e-05, + "loss": 1.4345, + "step": 17732 + }, + { + "epoch": 0.5903682531241691, + "grad_norm": 1.3342245817184448, + "learning_rate": 2.031003855589343e-05, + "loss": 1.4212, + "step": 17763 + }, + { + "epoch": 0.5913985642116458, + "grad_norm": 1.352387547492981, + "learning_rate": 2.022757379528727e-05, + "loss": 1.4316, + "step": 17794 + }, + { + "epoch": 0.5924288752991226, + "grad_norm": 1.3534374237060547, + "learning_rate": 2.0145162890862184e-05, + "loss": 1.4352, + "step": 17825 + }, + { + "epoch": 0.5934591863865993, + "grad_norm": 1.2957963943481445, + "learning_rate": 2.0062806772614022e-05, + "loss": 1.4057, + "step": 17856 + }, + { + "epoch": 0.5944894974740761, + "grad_norm": 1.3178727626800537, + "learning_rate": 1.9980506369920392e-05, + "loss": 1.4323, + "step": 17887 + }, + { + "epoch": 0.5955198085615527, + "grad_norm": 1.3364850282669067, + "learning_rate": 1.989826261153015e-05, + "loss": 1.4228, + "step": 17918 + }, + { + "epoch": 0.5965501196490295, + "grad_norm": 1.283200979232788, + "learning_rate": 1.9816076425552923e-05, + "loss": 1.4348, + "step": 17949 + }, + { + "epoch": 0.5975804307365062, + "grad_norm": 1.2856223583221436, + "learning_rate": 1.9733948739448676e-05, + "loss": 1.4176, + "step": 17980 + }, + { + "epoch": 0.598610741823983, + "grad_norm": 1.253180742263794, + "learning_rate": 1.9651880480017155e-05, + "loss": 1.4175, + "step": 18011 + }, + { + "epoch": 0.5996410529114597, + "grad_norm": 1.3471016883850098, + "learning_rate": 1.9569872573387516e-05, + "loss": 1.433, + "step": 18042 + }, + { + "epoch": 0.6006713639989365, + "grad_norm": 1.2449748516082764, + "learning_rate": 1.9487925945007854e-05, + "loss": 1.4091, + "step": 18073 + }, + { + "epoch": 0.6017016750864131, + "grad_norm": 1.3311972618103027, + "learning_rate": 1.9406041519634726e-05, + "loss": 1.403, + "step": 18104 + }, + { + "epoch": 0.6027319861738899, + "grad_norm": 1.2645657062530518, + "learning_rate": 1.932422022132275e-05, + "loss": 1.4265, + "step": 18135 + }, + { + "epoch": 0.6037622972613667, + "grad_norm": 1.3313370943069458, + "learning_rate": 1.924246297341414e-05, + "loss": 1.4275, + "step": 18166 + }, + { + "epoch": 0.6047926083488434, + "grad_norm": 1.2827123403549194, + "learning_rate": 1.9160770698528338e-05, + "loss": 1.4277, + "step": 18197 + }, + { + "epoch": 0.6058229194363202, + "grad_norm": 1.2230308055877686, + "learning_rate": 1.907914431855156e-05, + "loss": 1.4391, + "step": 18228 + }, + { + "epoch": 0.6068532305237969, + "grad_norm": 1.2785223722457886, + "learning_rate": 1.8997584754626412e-05, + "loss": 1.4152, + "step": 18259 + }, + { + "epoch": 0.6078835416112736, + "grad_norm": 1.3152620792388916, + "learning_rate": 1.8916092927141486e-05, + "loss": 1.4137, + "step": 18290 + }, + { + "epoch": 0.6089138526987503, + "grad_norm": 1.1842609643936157, + "learning_rate": 1.883466975572098e-05, + "loss": 1.4141, + "step": 18321 + }, + { + "epoch": 0.6099441637862271, + "grad_norm": 1.2319703102111816, + "learning_rate": 1.8753316159214312e-05, + "loss": 1.4216, + "step": 18352 + }, + { + "epoch": 0.6109744748737038, + "grad_norm": 1.3239370584487915, + "learning_rate": 1.8672033055685766e-05, + "loss": 1.4184, + "step": 18383 + }, + { + "epoch": 0.6120047859611806, + "grad_norm": 1.2665941715240479, + "learning_rate": 1.8590821362404116e-05, + "loss": 1.4249, + "step": 18414 + }, + { + "epoch": 0.6130350970486572, + "grad_norm": 1.2569379806518555, + "learning_rate": 1.8509681995832294e-05, + "loss": 1.4242, + "step": 18445 + }, + { + "epoch": 0.614065408136134, + "grad_norm": 1.2848411798477173, + "learning_rate": 1.8428615871617004e-05, + "loss": 1.4166, + "step": 18476 + }, + { + "epoch": 0.6150957192236107, + "grad_norm": 1.2636574506759644, + "learning_rate": 1.8347623904578448e-05, + "loss": 1.4297, + "step": 18507 + }, + { + "epoch": 0.6161260303110875, + "grad_norm": 1.2672234773635864, + "learning_rate": 1.8266707008699975e-05, + "loss": 1.4244, + "step": 18538 + }, + { + "epoch": 0.6171563413985642, + "grad_norm": 1.2299143075942993, + "learning_rate": 1.818586609711774e-05, + "loss": 1.408, + "step": 18569 + }, + { + "epoch": 0.618186652486041, + "grad_norm": 1.2221580743789673, + "learning_rate": 1.8105102082110462e-05, + "loss": 1.4242, + "step": 18600 + }, + { + "epoch": 0.6192169635735176, + "grad_norm": 1.290737509727478, + "learning_rate": 1.8024415875089058e-05, + "loss": 1.4167, + "step": 18631 + }, + { + "epoch": 0.6202472746609944, + "grad_norm": 1.3236243724822998, + "learning_rate": 1.7943808386586407e-05, + "loss": 1.4341, + "step": 18662 + }, + { + "epoch": 0.6212775857484711, + "grad_norm": 1.1983164548873901, + "learning_rate": 1.7863280526247073e-05, + "loss": 1.4171, + "step": 18693 + }, + { + "epoch": 0.6223078968359479, + "grad_norm": 1.2706191539764404, + "learning_rate": 1.7782833202817003e-05, + "loss": 1.4268, + "step": 18724 + }, + { + "epoch": 0.6233382079234246, + "grad_norm": 1.2584494352340698, + "learning_rate": 1.7702467324133327e-05, + "loss": 1.4364, + "step": 18755 + }, + { + "epoch": 0.6243685190109014, + "grad_norm": 1.345226526260376, + "learning_rate": 1.7622183797114042e-05, + "loss": 1.4274, + "step": 18786 + }, + { + "epoch": 0.625398830098378, + "grad_norm": 1.3055671453475952, + "learning_rate": 1.7541983527747838e-05, + "loss": 1.4101, + "step": 18817 + }, + { + "epoch": 0.6264291411858548, + "grad_norm": 1.2878341674804688, + "learning_rate": 1.746186742108387e-05, + "loss": 1.4133, + "step": 18848 + }, + { + "epoch": 0.6274594522733316, + "grad_norm": 1.241191029548645, + "learning_rate": 1.73818363812215e-05, + "loss": 1.4038, + "step": 18879 + }, + { + "epoch": 0.6284897633608083, + "grad_norm": 1.8631796836853027, + "learning_rate": 1.7301891311300153e-05, + "loss": 1.3961, + "step": 18910 + }, + { + "epoch": 0.6295200744482851, + "grad_norm": 1.2781902551651, + "learning_rate": 1.7222033113489055e-05, + "loss": 1.4238, + "step": 18941 + }, + { + "epoch": 0.6305503855357618, + "grad_norm": 1.2679165601730347, + "learning_rate": 1.7142262688977127e-05, + "loss": 1.4236, + "step": 18972 + }, + { + "epoch": 0.6315806966232385, + "grad_norm": 1.257203459739685, + "learning_rate": 1.7062580937962764e-05, + "loss": 1.4156, + "step": 19003 + }, + { + "epoch": 0.6326110077107152, + "grad_norm": 1.284470796585083, + "learning_rate": 1.698298875964369e-05, + "loss": 1.4241, + "step": 19034 + }, + { + "epoch": 0.633641318798192, + "grad_norm": 1.310545802116394, + "learning_rate": 1.690348705220684e-05, + "loss": 1.4205, + "step": 19065 + }, + { + "epoch": 0.6346716298856687, + "grad_norm": 1.2868564128875732, + "learning_rate": 1.6824076712818156e-05, + "loss": 1.4238, + "step": 19096 + }, + { + "epoch": 0.6357019409731455, + "grad_norm": 1.2508702278137207, + "learning_rate": 1.6744758637612533e-05, + "loss": 1.4046, + "step": 19127 + }, + { + "epoch": 0.6367322520606222, + "grad_norm": 1.3149102926254272, + "learning_rate": 1.6665533721683664e-05, + "loss": 1.4211, + "step": 19158 + }, + { + "epoch": 0.6377625631480989, + "grad_norm": 1.3485240936279297, + "learning_rate": 1.6586402859073974e-05, + "loss": 1.4167, + "step": 19189 + }, + { + "epoch": 0.6387928742355756, + "grad_norm": 1.2397938966751099, + "learning_rate": 1.6507366942764463e-05, + "loss": 1.4242, + "step": 19220 + }, + { + "epoch": 0.6398231853230524, + "grad_norm": 1.2909672260284424, + "learning_rate": 1.6428426864664732e-05, + "loss": 1.403, + "step": 19251 + }, + { + "epoch": 0.6408534964105291, + "grad_norm": 1.290385365486145, + "learning_rate": 1.6349583515602816e-05, + "loss": 1.4082, + "step": 19282 + }, + { + "epoch": 0.6418838074980059, + "grad_norm": 1.3623126745224, + "learning_rate": 1.6270837785315208e-05, + "loss": 1.4075, + "step": 19313 + }, + { + "epoch": 0.6429141185854825, + "grad_norm": 1.276903510093689, + "learning_rate": 1.619219056243676e-05, + "loss": 1.4135, + "step": 19344 + }, + { + "epoch": 0.6439444296729593, + "grad_norm": 1.2038910388946533, + "learning_rate": 1.6113642734490698e-05, + "loss": 1.4162, + "step": 19375 + }, + { + "epoch": 0.644974740760436, + "grad_norm": 1.2092891931533813, + "learning_rate": 1.6035195187878577e-05, + "loss": 1.4285, + "step": 19406 + }, + { + "epoch": 0.6460050518479128, + "grad_norm": 1.2983031272888184, + "learning_rate": 1.5956848807870305e-05, + "loss": 1.4128, + "step": 19437 + }, + { + "epoch": 0.6470353629353895, + "grad_norm": 1.279845952987671, + "learning_rate": 1.587860447859413e-05, + "loss": 1.4351, + "step": 19468 + }, + { + "epoch": 0.6480656740228663, + "grad_norm": 1.2781362533569336, + "learning_rate": 1.5800463083026686e-05, + "loss": 1.4118, + "step": 19499 + }, + { + "epoch": 0.6490959851103429, + "grad_norm": 1.2652825117111206, + "learning_rate": 1.572242550298298e-05, + "loss": 1.4195, + "step": 19530 + }, + { + "epoch": 0.6501262961978197, + "grad_norm": 1.3177101612091064, + "learning_rate": 1.56444926191065e-05, + "loss": 1.4307, + "step": 19561 + }, + { + "epoch": 0.6511566072852965, + "grad_norm": 1.2758272886276245, + "learning_rate": 1.5566665310859257e-05, + "loss": 1.4096, + "step": 19592 + }, + { + "epoch": 0.6521869183727732, + "grad_norm": 1.2265219688415527, + "learning_rate": 1.5488944456511846e-05, + "loss": 1.4098, + "step": 19623 + }, + { + "epoch": 0.65321722946025, + "grad_norm": 1.258945345878601, + "learning_rate": 1.5411330933133546e-05, + "loss": 1.4274, + "step": 19654 + }, + { + "epoch": 0.6542475405477267, + "grad_norm": 1.2599055767059326, + "learning_rate": 1.533382561658241e-05, + "loss": 1.4207, + "step": 19685 + }, + { + "epoch": 0.6552778516352034, + "grad_norm": 1.2502135038375854, + "learning_rate": 1.525642938149541e-05, + "loss": 1.4046, + "step": 19716 + }, + { + "epoch": 0.6563081627226801, + "grad_norm": 1.2734349966049194, + "learning_rate": 1.5179143101278536e-05, + "loss": 1.41, + "step": 19747 + }, + { + "epoch": 0.6573384738101569, + "grad_norm": 1.2801038026809692, + "learning_rate": 1.5101967648096955e-05, + "loss": 1.4088, + "step": 19778 + }, + { + "epoch": 0.6583687848976336, + "grad_norm": 1.2488126754760742, + "learning_rate": 1.5024903892865172e-05, + "loss": 1.4111, + "step": 19809 + }, + { + "epoch": 0.6593990959851104, + "grad_norm": 1.2418783903121948, + "learning_rate": 1.4947952705237184e-05, + "loss": 1.384, + "step": 19840 + }, + { + "epoch": 0.6604294070725871, + "grad_norm": 1.2566567659378052, + "learning_rate": 1.4871114953596682e-05, + "loss": 1.4127, + "step": 19871 + }, + { + "epoch": 0.6614597181600638, + "grad_norm": 1.2431600093841553, + "learning_rate": 1.4794391505047256e-05, + "loss": 1.4015, + "step": 19902 + }, + { + "epoch": 0.6624900292475405, + "grad_norm": 1.3174066543579102, + "learning_rate": 1.4717783225402596e-05, + "loss": 1.4113, + "step": 19933 + }, + { + "epoch": 0.6635203403350173, + "grad_norm": 1.3124332427978516, + "learning_rate": 1.4641290979176735e-05, + "loss": 1.421, + "step": 19964 + }, + { + "epoch": 0.664550651422494, + "grad_norm": 1.2595762014389038, + "learning_rate": 1.4564915629574246e-05, + "loss": 1.409, + "step": 19995 + }, + { + "epoch": 0.6655809625099708, + "grad_norm": 1.2872180938720703, + "learning_rate": 1.4488658038480601e-05, + "loss": 1.4082, + "step": 20026 + }, + { + "epoch": 0.6666112735974475, + "grad_norm": 1.27680242061615, + "learning_rate": 1.4412519066452323e-05, + "loss": 1.3979, + "step": 20057 + }, + { + "epoch": 0.6676415846849242, + "grad_norm": 1.2753857374191284, + "learning_rate": 1.4336499572707373e-05, + "loss": 1.4227, + "step": 20088 + }, + { + "epoch": 0.6686718957724009, + "grad_norm": 1.2680202722549438, + "learning_rate": 1.4260600415115433e-05, + "loss": 1.418, + "step": 20119 + }, + { + "epoch": 0.6697022068598777, + "grad_norm": 1.3002320528030396, + "learning_rate": 1.4184822450188137e-05, + "loss": 1.4133, + "step": 20150 + }, + { + "epoch": 0.6707325179473544, + "grad_norm": 1.3236373662948608, + "learning_rate": 1.410916653306954e-05, + "loss": 1.4133, + "step": 20181 + }, + { + "epoch": 0.6717628290348312, + "grad_norm": 1.3784340620040894, + "learning_rate": 1.403363351752639e-05, + "loss": 1.4064, + "step": 20212 + }, + { + "epoch": 0.6727931401223078, + "grad_norm": 1.2793350219726562, + "learning_rate": 1.3958224255938485e-05, + "loss": 1.4203, + "step": 20243 + }, + { + "epoch": 0.6738234512097846, + "grad_norm": 1.3510205745697021, + "learning_rate": 1.388293959928911e-05, + "loss": 1.418, + "step": 20274 + }, + { + "epoch": 0.6748537622972614, + "grad_norm": 1.2981188297271729, + "learning_rate": 1.3807780397155379e-05, + "loss": 1.4019, + "step": 20305 + }, + { + "epoch": 0.6758840733847381, + "grad_norm": 1.2599388360977173, + "learning_rate": 1.3732747497698655e-05, + "loss": 1.4187, + "step": 20336 + }, + { + "epoch": 0.6769143844722149, + "grad_norm": 1.2741434574127197, + "learning_rate": 1.3657841747655038e-05, + "loss": 1.4183, + "step": 20367 + }, + { + "epoch": 0.6779446955596916, + "grad_norm": 1.2376216650009155, + "learning_rate": 1.3583063992325706e-05, + "loss": 1.4208, + "step": 20398 + }, + { + "epoch": 0.6789750066471683, + "grad_norm": 1.341134786605835, + "learning_rate": 1.3508415075567496e-05, + "loss": 1.4015, + "step": 20429 + }, + { + "epoch": 0.680005317734645, + "grad_norm": 1.3483457565307617, + "learning_rate": 1.343389583978327e-05, + "loss": 1.4043, + "step": 20460 + }, + { + "epoch": 0.6810356288221218, + "grad_norm": 1.3255680799484253, + "learning_rate": 1.3359507125912468e-05, + "loss": 1.4162, + "step": 20491 + }, + { + "epoch": 0.6820659399095985, + "grad_norm": 1.211305022239685, + "learning_rate": 1.3285249773421627e-05, + "loss": 1.4043, + "step": 20522 + }, + { + "epoch": 0.6830962509970753, + "grad_norm": 1.3049174547195435, + "learning_rate": 1.3211124620294884e-05, + "loss": 1.4012, + "step": 20553 + }, + { + "epoch": 0.684126562084552, + "grad_norm": 1.2884812355041504, + "learning_rate": 1.313713250302451e-05, + "loss": 1.419, + "step": 20584 + }, + { + "epoch": 0.6851568731720287, + "grad_norm": 1.2465201616287231, + "learning_rate": 1.3063274256601479e-05, + "loss": 1.394, + "step": 20615 + }, + { + "epoch": 0.6861871842595054, + "grad_norm": 1.2868762016296387, + "learning_rate": 1.2989550714506086e-05, + "loss": 1.3975, + "step": 20646 + }, + { + "epoch": 0.6872174953469822, + "grad_norm": 1.2728379964828491, + "learning_rate": 1.291596270869846e-05, + "loss": 1.3918, + "step": 20677 + }, + { + "epoch": 0.6882478064344589, + "grad_norm": 1.265869379043579, + "learning_rate": 1.284251106960927e-05, + "loss": 1.402, + "step": 20708 + }, + { + "epoch": 0.6892781175219357, + "grad_norm": 1.3357373476028442, + "learning_rate": 1.2769196626130263e-05, + "loss": 1.3975, + "step": 20739 + }, + { + "epoch": 0.6903084286094124, + "grad_norm": 1.216797947883606, + "learning_rate": 1.2696020205604969e-05, + "loss": 1.3953, + "step": 20770 + }, + { + "epoch": 0.6913387396968891, + "grad_norm": 1.269227385520935, + "learning_rate": 1.2622982633819359e-05, + "loss": 1.4154, + "step": 20801 + }, + { + "epoch": 0.6923690507843658, + "grad_norm": 1.3336331844329834, + "learning_rate": 1.2550084734992484e-05, + "loss": 1.3992, + "step": 20832 + }, + { + "epoch": 0.6933993618718426, + "grad_norm": 1.2936463356018066, + "learning_rate": 1.247732733176724e-05, + "loss": 1.4147, + "step": 20863 + }, + { + "epoch": 0.6944296729593193, + "grad_norm": 1.344826102256775, + "learning_rate": 1.2404711245201044e-05, + "loss": 1.3878, + "step": 20894 + }, + { + "epoch": 0.6954599840467961, + "grad_norm": 1.2611995935440063, + "learning_rate": 1.2332237294756535e-05, + "loss": 1.4088, + "step": 20925 + }, + { + "epoch": 0.6964902951342729, + "grad_norm": 1.3274885416030884, + "learning_rate": 1.225990629829241e-05, + "loss": 1.4036, + "step": 20956 + }, + { + "epoch": 0.6975206062217495, + "grad_norm": 1.2847373485565186, + "learning_rate": 1.2187719072054136e-05, + "loss": 1.398, + "step": 20987 + }, + { + "epoch": 0.6985509173092262, + "grad_norm": 1.2856248617172241, + "learning_rate": 1.2115676430664735e-05, + "loss": 1.4101, + "step": 21018 + }, + { + "epoch": 0.699581228396703, + "grad_norm": 1.3064154386520386, + "learning_rate": 1.2043779187115647e-05, + "loss": 1.4081, + "step": 21049 + }, + { + "epoch": 0.7006115394841798, + "grad_norm": 1.253602147102356, + "learning_rate": 1.1972028152757476e-05, + "loss": 1.4123, + "step": 21080 + }, + { + "epoch": 0.7016418505716565, + "grad_norm": 1.2678899765014648, + "learning_rate": 1.1900424137290889e-05, + "loss": 1.3969, + "step": 21111 + }, + { + "epoch": 0.7026721616591332, + "grad_norm": 1.2261760234832764, + "learning_rate": 1.1828967948757482e-05, + "loss": 1.4009, + "step": 21142 + }, + { + "epoch": 0.7037024727466099, + "grad_norm": 1.540486216545105, + "learning_rate": 1.175766039353062e-05, + "loss": 1.4215, + "step": 21173 + }, + { + "epoch": 0.7047327838340867, + "grad_norm": 1.2508059740066528, + "learning_rate": 1.1686502276306382e-05, + "loss": 1.4046, + "step": 21204 + }, + { + "epoch": 0.7057630949215634, + "grad_norm": 1.2918591499328613, + "learning_rate": 1.1615494400094445e-05, + "loss": 1.4301, + "step": 21235 + }, + { + "epoch": 0.7067934060090402, + "grad_norm": 1.240178108215332, + "learning_rate": 1.1544637566209029e-05, + "loss": 1.3888, + "step": 21266 + }, + { + "epoch": 0.7078237170965169, + "grad_norm": 1.2358977794647217, + "learning_rate": 1.1473932574259886e-05, + "loss": 1.415, + "step": 21297 + }, + { + "epoch": 0.7088540281839936, + "grad_norm": 1.2963451147079468, + "learning_rate": 1.1403380222143247e-05, + "loss": 1.4002, + "step": 21328 + }, + { + "epoch": 0.7098843392714703, + "grad_norm": 1.3245363235473633, + "learning_rate": 1.1332981306032808e-05, + "loss": 1.3945, + "step": 21359 + }, + { + "epoch": 0.7109146503589471, + "grad_norm": 1.2833342552185059, + "learning_rate": 1.1262736620370762e-05, + "loss": 1.4054, + "step": 21390 + }, + { + "epoch": 0.7119449614464238, + "grad_norm": 1.3230944871902466, + "learning_rate": 1.1192646957858854e-05, + "loss": 1.398, + "step": 21421 + }, + { + "epoch": 0.7129752725339006, + "grad_norm": 1.2515650987625122, + "learning_rate": 1.1122713109449381e-05, + "loss": 1.3958, + "step": 21452 + }, + { + "epoch": 0.7140055836213773, + "grad_norm": 1.313057780265808, + "learning_rate": 1.105293586433634e-05, + "loss": 1.3909, + "step": 21483 + }, + { + "epoch": 0.715035894708854, + "grad_norm": 1.2700668573379517, + "learning_rate": 1.0983316009946446e-05, + "loss": 1.3939, + "step": 21514 + }, + { + "epoch": 0.7160662057963307, + "grad_norm": 1.2487835884094238, + "learning_rate": 1.0913854331930282e-05, + "loss": 1.4162, + "step": 21545 + }, + { + "epoch": 0.7170965168838075, + "grad_norm": 1.2748737335205078, + "learning_rate": 1.0844551614153456e-05, + "loss": 1.3984, + "step": 21576 + }, + { + "epoch": 0.7181268279712842, + "grad_norm": 1.24228036403656, + "learning_rate": 1.0775408638687725e-05, + "loss": 1.4002, + "step": 21607 + }, + { + "epoch": 0.719157139058761, + "grad_norm": 1.3365492820739746, + "learning_rate": 1.0706426185802165e-05, + "loss": 1.4091, + "step": 21638 + }, + { + "epoch": 0.7201874501462378, + "grad_norm": 1.2073006629943848, + "learning_rate": 1.0637605033954371e-05, + "loss": 1.4034, + "step": 21669 + }, + { + "epoch": 0.7212177612337144, + "grad_norm": 1.2873163223266602, + "learning_rate": 1.05689459597817e-05, + "loss": 1.3994, + "step": 21700 + }, + { + "epoch": 0.7222480723211911, + "grad_norm": 1.3623207807540894, + "learning_rate": 1.050044973809246e-05, + "loss": 1.3827, + "step": 21731 + }, + { + "epoch": 0.7232783834086679, + "grad_norm": 1.256643533706665, + "learning_rate": 1.043211714185722e-05, + "loss": 1.3989, + "step": 21762 + }, + { + "epoch": 0.7243086944961447, + "grad_norm": 1.201434850692749, + "learning_rate": 1.036394894220003e-05, + "loss": 1.3892, + "step": 21793 + }, + { + "epoch": 0.7253390055836214, + "grad_norm": 1.335642695426941, + "learning_rate": 1.0295945908389751e-05, + "loss": 1.4077, + "step": 21824 + }, + { + "epoch": 0.7263693166710982, + "grad_norm": 1.252847671508789, + "learning_rate": 1.0228108807831393e-05, + "loss": 1.4077, + "step": 21855 + }, + { + "epoch": 0.7273996277585748, + "grad_norm": 1.3838329315185547, + "learning_rate": 1.01604384060574e-05, + "loss": 1.3944, + "step": 21886 + }, + { + "epoch": 0.7284299388460516, + "grad_norm": 1.3425817489624023, + "learning_rate": 1.009293546671907e-05, + "loss": 1.4067, + "step": 21917 + }, + { + "epoch": 0.7294602499335283, + "grad_norm": 1.3198227882385254, + "learning_rate": 1.002560075157791e-05, + "loss": 1.4043, + "step": 21948 + }, + { + "epoch": 0.7304905610210051, + "grad_norm": 1.3169294595718384, + "learning_rate": 9.958435020496995e-06, + "loss": 1.3743, + "step": 21979 + }, + { + "epoch": 0.7315208721084818, + "grad_norm": 1.2145452499389648, + "learning_rate": 9.89143903143249e-06, + "loss": 1.3875, + "step": 22010 + }, + { + "epoch": 0.7325511831959585, + "grad_norm": 1.368464469909668, + "learning_rate": 9.824613540425038e-06, + "loss": 1.3939, + "step": 22041 + }, + { + "epoch": 0.7335814942834352, + "grad_norm": 1.2481716871261597, + "learning_rate": 9.757959301591197e-06, + "loss": 1.4032, + "step": 22072 + }, + { + "epoch": 0.734611805370912, + "grad_norm": 1.225689172744751, + "learning_rate": 9.691477067115017e-06, + "loss": 1.4057, + "step": 22103 + }, + { + "epoch": 0.7356421164583887, + "grad_norm": 1.2322176694869995, + "learning_rate": 9.625167587239467e-06, + "loss": 1.3983, + "step": 22134 + }, + { + "epoch": 0.7366724275458655, + "grad_norm": 1.2423603534698486, + "learning_rate": 9.559031610258007e-06, + "loss": 1.4246, + "step": 22165 + }, + { + "epoch": 0.7377027386333422, + "grad_norm": 1.2707546949386597, + "learning_rate": 9.493069882506164e-06, + "loss": 1.4033, + "step": 22196 + }, + { + "epoch": 0.7387330497208189, + "grad_norm": 1.2819782495498657, + "learning_rate": 9.427283148353056e-06, + "loss": 1.3942, + "step": 22227 + }, + { + "epoch": 0.7397633608082956, + "grad_norm": 1.278111219406128, + "learning_rate": 9.361672150193052e-06, + "loss": 1.4124, + "step": 22258 + }, + { + "epoch": 0.7407936718957724, + "grad_norm": 1.2402000427246094, + "learning_rate": 9.29623762843734e-06, + "loss": 1.3784, + "step": 22289 + }, + { + "epoch": 0.7418239829832491, + "grad_norm": 1.2294648885726929, + "learning_rate": 9.230980321505594e-06, + "loss": 1.3998, + "step": 22320 + }, + { + "epoch": 0.7428542940707259, + "grad_norm": 1.3570529222488403, + "learning_rate": 9.165900965817668e-06, + "loss": 1.3867, + "step": 22351 + }, + { + "epoch": 0.7438846051582026, + "grad_norm": 1.2765589952468872, + "learning_rate": 9.101000295785245e-06, + "loss": 1.3848, + "step": 22382 + }, + { + "epoch": 0.7449149162456793, + "grad_norm": 1.301269292831421, + "learning_rate": 9.036279043803565e-06, + "loss": 1.3976, + "step": 22413 + }, + { + "epoch": 0.745945227333156, + "grad_norm": 1.3582361936569214, + "learning_rate": 8.971737940243147e-06, + "loss": 1.398, + "step": 22444 + }, + { + "epoch": 0.7469755384206328, + "grad_norm": 1.3054485321044922, + "learning_rate": 8.907377713441592e-06, + "loss": 1.402, + "step": 22475 + }, + { + "epoch": 0.7480058495081096, + "grad_norm": 1.2361812591552734, + "learning_rate": 8.843199089695293e-06, + "loss": 1.4097, + "step": 22506 + }, + { + "epoch": 0.7490361605955863, + "grad_norm": 1.2720493078231812, + "learning_rate": 8.779202793251311e-06, + "loss": 1.4046, + "step": 22537 + }, + { + "epoch": 0.7500664716830631, + "grad_norm": 1.2494639158248901, + "learning_rate": 8.715389546299149e-06, + "loss": 1.3858, + "step": 22568 + }, + { + "epoch": 0.7510967827705397, + "grad_norm": 1.2343871593475342, + "learning_rate": 8.651760068962617e-06, + "loss": 1.3896, + "step": 22599 + }, + { + "epoch": 0.7521270938580165, + "grad_norm": 1.1934345960617065, + "learning_rate": 8.588315079291733e-06, + "loss": 1.4095, + "step": 22630 + }, + { + "epoch": 0.7531574049454932, + "grad_norm": 1.2811630964279175, + "learning_rate": 8.52505529325457e-06, + "loss": 1.3954, + "step": 22661 + }, + { + "epoch": 0.75418771603297, + "grad_norm": 1.2676504850387573, + "learning_rate": 8.461981424729216e-06, + "loss": 1.3901, + "step": 22692 + }, + { + "epoch": 0.7552180271204467, + "grad_norm": 1.3221408128738403, + "learning_rate": 8.399094185495725e-06, + "loss": 1.4057, + "step": 22723 + }, + { + "epoch": 0.7562483382079235, + "grad_norm": 1.2741389274597168, + "learning_rate": 8.336394285228017e-06, + "loss": 1.3964, + "step": 22754 + }, + { + "epoch": 0.7572786492954001, + "grad_norm": 1.329860806465149, + "learning_rate": 8.273882431485952e-06, + "loss": 1.3946, + "step": 22785 + }, + { + "epoch": 0.7583089603828769, + "grad_norm": 1.3073118925094604, + "learning_rate": 8.211559329707316e-06, + "loss": 1.3937, + "step": 22816 + }, + { + "epoch": 0.7593392714703536, + "grad_norm": 1.2866522073745728, + "learning_rate": 8.149425683199823e-06, + "loss": 1.3999, + "step": 22847 + }, + { + "epoch": 0.7603695825578304, + "grad_norm": 1.2539178133010864, + "learning_rate": 8.08748219313325e-06, + "loss": 1.398, + "step": 22878 + }, + { + "epoch": 0.7613998936453071, + "grad_norm": 1.279863715171814, + "learning_rate": 8.025729558531453e-06, + "loss": 1.4155, + "step": 22909 + }, + { + "epoch": 0.7624302047327839, + "grad_norm": 1.2936811447143555, + "learning_rate": 7.964168476264508e-06, + "loss": 1.4036, + "step": 22940 + }, + { + "epoch": 0.7634605158202605, + "grad_norm": 1.2729599475860596, + "learning_rate": 7.902799641040884e-06, + "loss": 1.4003, + "step": 22971 + }, + { + "epoch": 0.7644908269077373, + "grad_norm": 1.2257497310638428, + "learning_rate": 7.841623745399523e-06, + "loss": 1.408, + "step": 23002 + }, + { + "epoch": 0.765521137995214, + "grad_norm": 1.254761815071106, + "learning_rate": 7.780641479702114e-06, + "loss": 1.3925, + "step": 23033 + }, + { + "epoch": 0.7665514490826908, + "grad_norm": 1.2740334272384644, + "learning_rate": 7.719853532125227e-06, + "loss": 1.3996, + "step": 23064 + }, + { + "epoch": 0.7675817601701675, + "grad_norm": 1.2421025037765503, + "learning_rate": 7.65926058865258e-06, + "loss": 1.3852, + "step": 23095 + }, + { + "epoch": 0.7686120712576442, + "grad_norm": 1.3271669149398804, + "learning_rate": 7.598863333067313e-06, + "loss": 1.408, + "step": 23126 + }, + { + "epoch": 0.769642382345121, + "grad_norm": 1.3040279150009155, + "learning_rate": 7.538662446944253e-06, + "loss": 1.3718, + "step": 23157 + }, + { + "epoch": 0.7706726934325977, + "grad_norm": 1.230797290802002, + "learning_rate": 7.478658609642211e-06, + "loss": 1.3776, + "step": 23188 + }, + { + "epoch": 0.7717030045200745, + "grad_norm": 1.2709274291992188, + "learning_rate": 7.418852498296327e-06, + "loss": 1.3975, + "step": 23219 + }, + { + "epoch": 0.7727333156075512, + "grad_norm": 1.227398157119751, + "learning_rate": 7.359244787810457e-06, + "loss": 1.382, + "step": 23250 + }, + { + "epoch": 0.773763626695028, + "grad_norm": 1.242308259010315, + "learning_rate": 7.299836150849493e-06, + "loss": 1.3792, + "step": 23281 + }, + { + "epoch": 0.7747939377825046, + "grad_norm": 1.2658405303955078, + "learning_rate": 7.240627257831847e-06, + "loss": 1.3699, + "step": 23312 + }, + { + "epoch": 0.7758242488699814, + "grad_norm": 1.3357101678848267, + "learning_rate": 7.1816187769218195e-06, + "loss": 1.3972, + "step": 23343 + }, + { + "epoch": 0.7768545599574581, + "grad_norm": 1.2248833179473877, + "learning_rate": 7.1228113740220895e-06, + "loss": 1.3875, + "step": 23374 + }, + { + "epoch": 0.7778848710449349, + "grad_norm": 1.2615251541137695, + "learning_rate": 7.064205712766226e-06, + "loss": 1.3947, + "step": 23405 + }, + { + "epoch": 0.7789151821324116, + "grad_norm": 1.2719477415084839, + "learning_rate": 7.005802454511129e-06, + "loss": 1.3943, + "step": 23436 + }, + { + "epoch": 0.7799454932198884, + "grad_norm": 1.2429877519607544, + "learning_rate": 6.947602258329639e-06, + "loss": 1.3924, + "step": 23467 + }, + { + "epoch": 0.780975804307365, + "grad_norm": 1.3180112838745117, + "learning_rate": 6.889605781003078e-06, + "loss": 1.4095, + "step": 23498 + }, + { + "epoch": 0.7820061153948418, + "grad_norm": 1.3340109586715698, + "learning_rate": 6.831813677013776e-06, + "loss": 1.3873, + "step": 23529 + }, + { + "epoch": 0.7830364264823185, + "grad_norm": 1.2713093757629395, + "learning_rate": 6.774226598537792e-06, + "loss": 1.3882, + "step": 23560 + }, + { + "epoch": 0.7840667375697953, + "grad_norm": 1.2504241466522217, + "learning_rate": 6.716845195437482e-06, + "loss": 1.3795, + "step": 23591 + }, + { + "epoch": 0.785097048657272, + "grad_norm": 1.273703694343567, + "learning_rate": 6.659670115254168e-06, + "loss": 1.3819, + "step": 23622 + }, + { + "epoch": 0.7861273597447488, + "grad_norm": 1.3121949434280396, + "learning_rate": 6.602702003200872e-06, + "loss": 1.3827, + "step": 23653 + }, + { + "epoch": 0.7871576708322254, + "grad_norm": 1.2552127838134766, + "learning_rate": 6.545941502154992e-06, + "loss": 1.3935, + "step": 23684 + }, + { + "epoch": 0.7881879819197022, + "grad_norm": 1.2457008361816406, + "learning_rate": 6.489389252651057e-06, + "loss": 1.3847, + "step": 23715 + }, + { + "epoch": 0.7892182930071789, + "grad_norm": 1.2819870710372925, + "learning_rate": 6.4330458928735325e-06, + "loss": 1.3965, + "step": 23746 + }, + { + "epoch": 0.7902486040946557, + "grad_norm": 1.2543584108352661, + "learning_rate": 6.376912058649559e-06, + "loss": 1.4025, + "step": 23777 + }, + { + "epoch": 0.7912789151821324, + "grad_norm": 1.2502461671829224, + "learning_rate": 6.320988383441845e-06, + "loss": 1.3799, + "step": 23808 + }, + { + "epoch": 0.7923092262696092, + "grad_norm": 1.2568906545639038, + "learning_rate": 6.265275498341452e-06, + "loss": 1.3887, + "step": 23839 + }, + { + "epoch": 0.7933395373570858, + "grad_norm": 1.2879040241241455, + "learning_rate": 6.209774032060714e-06, + "loss": 1.3922, + "step": 23870 + }, + { + "epoch": 0.7943698484445626, + "grad_norm": 1.2547533512115479, + "learning_rate": 6.1544846109261365e-06, + "loss": 1.3891, + "step": 23901 + }, + { + "epoch": 0.7954001595320394, + "grad_norm": 1.2941306829452515, + "learning_rate": 6.099407858871342e-06, + "loss": 1.3914, + "step": 23932 + }, + { + "epoch": 0.7964304706195161, + "grad_norm": 1.3194507360458374, + "learning_rate": 6.044544397429958e-06, + "loss": 1.3857, + "step": 23963 + }, + { + "epoch": 0.7974607817069929, + "grad_norm": 1.2143921852111816, + "learning_rate": 5.989894845728708e-06, + "loss": 1.4041, + "step": 23994 + }, + { + "epoch": 0.7984910927944695, + "grad_norm": 1.2587990760803223, + "learning_rate": 5.9354598204803605e-06, + "loss": 1.3901, + "step": 24025 + }, + { + "epoch": 0.7995214038819463, + "grad_norm": 1.2482203245162964, + "learning_rate": 5.881239935976762e-06, + "loss": 1.384, + "step": 24056 + }, + { + "epoch": 0.800551714969423, + "grad_norm": 1.2880163192749023, + "learning_rate": 5.827235804081954e-06, + "loss": 1.3876, + "step": 24087 + }, + { + "epoch": 0.8015820260568998, + "grad_norm": 1.2727841138839722, + "learning_rate": 5.773448034225221e-06, + "loss": 1.3752, + "step": 24118 + }, + { + "epoch": 0.8026123371443765, + "grad_norm": 1.2767062187194824, + "learning_rate": 5.719877233394228e-06, + "loss": 1.4, + "step": 24149 + }, + { + "epoch": 0.8036426482318533, + "grad_norm": 1.2654463052749634, + "learning_rate": 5.666524006128191e-06, + "loss": 1.39, + "step": 24180 + }, + { + "epoch": 0.8046729593193299, + "grad_norm": 1.2623034715652466, + "learning_rate": 5.613388954511015e-06, + "loss": 1.3885, + "step": 24211 + }, + { + "epoch": 0.8057032704068067, + "grad_norm": 1.303368330001831, + "learning_rate": 5.560472678164552e-06, + "loss": 1.3933, + "step": 24242 + }, + { + "epoch": 0.8067335814942834, + "grad_norm": 1.232909917831421, + "learning_rate": 5.507775774241775e-06, + "loss": 1.3897, + "step": 24273 + }, + { + "epoch": 0.8077638925817602, + "grad_norm": 1.3074171543121338, + "learning_rate": 5.4552988374200945e-06, + "loss": 1.3836, + "step": 24304 + }, + { + "epoch": 0.8087942036692369, + "grad_norm": 1.287463903427124, + "learning_rate": 5.403042459894597e-06, + "loss": 1.3889, + "step": 24335 + }, + { + "epoch": 0.8098245147567137, + "grad_norm": 1.2616747617721558, + "learning_rate": 5.3510072313714135e-06, + "loss": 1.3978, + "step": 24366 + }, + { + "epoch": 0.8108548258441903, + "grad_norm": 1.2531288862228394, + "learning_rate": 5.2991937390610205e-06, + "loss": 1.4116, + "step": 24397 + }, + { + "epoch": 0.8118851369316671, + "grad_norm": 1.2136998176574707, + "learning_rate": 5.247602567671625e-06, + "loss": 1.3794, + "step": 24428 + }, + { + "epoch": 0.8129154480191438, + "grad_norm": 1.3023301362991333, + "learning_rate": 5.196234299402603e-06, + "loss": 1.3868, + "step": 24459 + }, + { + "epoch": 0.8139457591066206, + "grad_norm": 1.2590848207473755, + "learning_rate": 5.145089513937865e-06, + "loss": 1.3865, + "step": 24490 + }, + { + "epoch": 0.8149760701940973, + "grad_norm": 1.2516260147094727, + "learning_rate": 5.094168788439369e-06, + "loss": 1.3923, + "step": 24521 + }, + { + "epoch": 0.8160063812815741, + "grad_norm": 1.2341543436050415, + "learning_rate": 5.043472697540594e-06, + "loss": 1.3824, + "step": 24552 + }, + { + "epoch": 0.8170366923690507, + "grad_norm": 1.3493062257766724, + "learning_rate": 4.993001813340012e-06, + "loss": 1.4024, + "step": 24583 + }, + { + "epoch": 0.8180670034565275, + "grad_norm": 1.271795392036438, + "learning_rate": 4.942756705394702e-06, + "loss": 1.3821, + "step": 24614 + }, + { + "epoch": 0.8190973145440043, + "grad_norm": 1.3145335912704468, + "learning_rate": 4.892737940713884e-06, + "loss": 1.3786, + "step": 24645 + }, + { + "epoch": 0.820127625631481, + "grad_norm": 1.3532222509384155, + "learning_rate": 4.842946083752511e-06, + "loss": 1.3981, + "step": 24676 + }, + { + "epoch": 0.8211579367189578, + "grad_norm": 1.3181504011154175, + "learning_rate": 4.79338169640493e-06, + "loss": 1.3916, + "step": 24707 + }, + { + "epoch": 0.8221882478064345, + "grad_norm": 1.267794132232666, + "learning_rate": 4.74404533799851e-06, + "loss": 1.3768, + "step": 24738 + }, + { + "epoch": 0.8232185588939112, + "grad_norm": 1.2763338088989258, + "learning_rate": 4.694937565287344e-06, + "loss": 1.3972, + "step": 24769 + }, + { + "epoch": 0.8242488699813879, + "grad_norm": 1.2626184225082397, + "learning_rate": 4.646058932445985e-06, + "loss": 1.3815, + "step": 24800 + }, + { + "epoch": 0.8252791810688647, + "grad_norm": 1.1800566911697388, + "learning_rate": 4.597409991063148e-06, + "loss": 1.3949, + "step": 24831 + }, + { + "epoch": 0.8263094921563414, + "grad_norm": 1.2157528400421143, + "learning_rate": 4.5489912901355375e-06, + "loss": 1.3783, + "step": 24862 + }, + { + "epoch": 0.8273398032438182, + "grad_norm": 1.3244526386260986, + "learning_rate": 4.500803376061608e-06, + "loss": 1.3861, + "step": 24893 + }, + { + "epoch": 0.8283701143312948, + "grad_norm": 1.2245334386825562, + "learning_rate": 4.45284679263541e-06, + "loss": 1.3817, + "step": 24924 + }, + { + "epoch": 0.8294004254187716, + "grad_norm": 1.2566081285476685, + "learning_rate": 4.4051220810404775e-06, + "loss": 1.3979, + "step": 24955 + }, + { + "epoch": 0.8304307365062483, + "grad_norm": 1.2556860446929932, + "learning_rate": 4.3576297798437025e-06, + "loss": 1.3826, + "step": 24986 + }, + { + "epoch": 0.8314610475937251, + "grad_norm": 1.2634494304656982, + "learning_rate": 4.3103704249892436e-06, + "loss": 1.3733, + "step": 25017 + }, + { + "epoch": 0.8324913586812018, + "grad_norm": 1.234903335571289, + "learning_rate": 4.263344549792487e-06, + "loss": 1.3815, + "step": 25048 + }, + { + "epoch": 0.8335216697686786, + "grad_norm": 1.3948299884796143, + "learning_rate": 4.216552684934056e-06, + "loss": 1.402, + "step": 25079 + }, + { + "epoch": 0.8345519808561552, + "grad_norm": 1.363745093345642, + "learning_rate": 4.169995358453777e-06, + "loss": 1.3872, + "step": 25110 + }, + { + "epoch": 0.835582291943632, + "grad_norm": 1.354319453239441, + "learning_rate": 4.123673095744757e-06, + "loss": 1.3817, + "step": 25141 + }, + { + "epoch": 0.8366126030311087, + "grad_norm": 1.2999165058135986, + "learning_rate": 4.077586419547435e-06, + "loss": 1.3806, + "step": 25172 + }, + { + "epoch": 0.8376429141185855, + "grad_norm": 1.2431261539459229, + "learning_rate": 4.03173584994368e-06, + "loss": 1.3724, + "step": 25203 + }, + { + "epoch": 0.8386732252060622, + "grad_norm": 1.2831732034683228, + "learning_rate": 3.986121904350948e-06, + "loss": 1.4055, + "step": 25234 + }, + { + "epoch": 0.839703536293539, + "grad_norm": 1.2473969459533691, + "learning_rate": 3.940745097516407e-06, + "loss": 1.3804, + "step": 25265 + }, + { + "epoch": 0.8407338473810156, + "grad_norm": 1.2680081129074097, + "learning_rate": 3.89560594151116e-06, + "loss": 1.3971, + "step": 25296 + }, + { + "epoch": 0.8417641584684924, + "grad_norm": 1.3049360513687134, + "learning_rate": 3.850704945724456e-06, + "loss": 1.3883, + "step": 25327 + }, + { + "epoch": 0.8427944695559692, + "grad_norm": 1.3096522092819214, + "learning_rate": 3.8060426168579077e-06, + "loss": 1.3932, + "step": 25358 + }, + { + "epoch": 0.8438247806434459, + "grad_norm": 1.2855119705200195, + "learning_rate": 3.7616194589198407e-06, + "loss": 1.3953, + "step": 25389 + }, + { + "epoch": 0.8448550917309227, + "grad_norm": 1.2272716760635376, + "learning_rate": 3.7174359732195574e-06, + "loss": 1.3924, + "step": 25420 + }, + { + "epoch": 0.8458854028183994, + "grad_norm": 1.2750498056411743, + "learning_rate": 3.673492658361677e-06, + "loss": 1.3857, + "step": 25451 + }, + { + "epoch": 0.846915713905876, + "grad_norm": 1.2702478170394897, + "learning_rate": 3.6297900102405467e-06, + "loss": 1.3833, + "step": 25482 + }, + { + "epoch": 0.8479460249933528, + "grad_norm": 1.3162232637405396, + "learning_rate": 3.586328522034607e-06, + "loss": 1.3936, + "step": 25513 + }, + { + "epoch": 0.8489763360808296, + "grad_norm": 1.228898048400879, + "learning_rate": 3.543108684200838e-06, + "loss": 1.376, + "step": 25544 + }, + { + "epoch": 0.8500066471683063, + "grad_norm": 1.2657815217971802, + "learning_rate": 3.5001309844692464e-06, + "loss": 1.3827, + "step": 25575 + }, + { + "epoch": 0.8510369582557831, + "grad_norm": 1.252999186515808, + "learning_rate": 3.4573959078373215e-06, + "loss": 1.3706, + "step": 25606 + }, + { + "epoch": 0.8520672693432598, + "grad_norm": 1.4515488147735596, + "learning_rate": 3.4149039365646063e-06, + "loss": 1.3928, + "step": 25637 + }, + { + "epoch": 0.8530975804307365, + "grad_norm": 1.2513251304626465, + "learning_rate": 3.3726555501672143e-06, + "loss": 1.3763, + "step": 25668 + }, + { + "epoch": 0.8541278915182132, + "grad_norm": 1.311325192451477, + "learning_rate": 3.33065122541244e-06, + "loss": 1.3807, + "step": 25699 + }, + { + "epoch": 0.85515820260569, + "grad_norm": 1.2587943077087402, + "learning_rate": 3.288891436313385e-06, + "loss": 1.3802, + "step": 25730 + }, + { + "epoch": 0.8561885136931667, + "grad_norm": 1.2624818086624146, + "learning_rate": 3.2473766541235963e-06, + "loss": 1.3915, + "step": 25761 + }, + { + "epoch": 0.8572188247806435, + "grad_norm": 1.2625864744186401, + "learning_rate": 3.2061073473317466e-06, + "loss": 1.3855, + "step": 25792 + }, + { + "epoch": 0.8582491358681201, + "grad_norm": 1.2889775037765503, + "learning_rate": 3.1650839816563444e-06, + "loss": 1.3942, + "step": 25823 + }, + { + "epoch": 0.8592794469555969, + "grad_norm": 1.2399699687957764, + "learning_rate": 3.1243070200405093e-06, + "loss": 1.3929, + "step": 25854 + }, + { + "epoch": 0.8603097580430736, + "grad_norm": 1.2660589218139648, + "learning_rate": 3.0837769226467e-06, + "loss": 1.3647, + "step": 25885 + }, + { + "epoch": 0.8613400691305504, + "grad_norm": 1.2619723081588745, + "learning_rate": 3.0434941468515666e-06, + "loss": 1.3804, + "step": 25916 + }, + { + "epoch": 0.8623703802180271, + "grad_norm": 1.3124239444732666, + "learning_rate": 3.003459147240753e-06, + "loss": 1.368, + "step": 25947 + }, + { + "epoch": 0.8634006913055039, + "grad_norm": 1.2878339290618896, + "learning_rate": 2.9636723756037875e-06, + "loss": 1.3835, + "step": 25978 + }, + { + "epoch": 0.8644310023929805, + "grad_norm": 1.2607743740081787, + "learning_rate": 2.9241342809289833e-06, + "loss": 1.3933, + "step": 26009 + }, + { + "epoch": 0.8654613134804573, + "grad_norm": 1.2619109153747559, + "learning_rate": 2.8848453093983594e-06, + "loss": 1.3881, + "step": 26040 + }, + { + "epoch": 0.866491624567934, + "grad_norm": 1.3922829627990723, + "learning_rate": 2.8458059043826257e-06, + "loss": 1.3741, + "step": 26071 + }, + { + "epoch": 0.8675219356554108, + "grad_norm": 1.3063180446624756, + "learning_rate": 2.807016506436172e-06, + "loss": 1.3987, + "step": 26102 + }, + { + "epoch": 0.8685522467428876, + "grad_norm": 1.3027793169021606, + "learning_rate": 2.7684775532920566e-06, + "loss": 1.3833, + "step": 26133 + }, + { + "epoch": 0.8695825578303643, + "grad_norm": 1.286738395690918, + "learning_rate": 2.7301894798571425e-06, + "loss": 1.3983, + "step": 26164 + }, + { + "epoch": 0.870612868917841, + "grad_norm": 1.2284873723983765, + "learning_rate": 2.6921527182071386e-06, + "loss": 1.3806, + "step": 26195 + }, + { + "epoch": 0.8716431800053177, + "grad_norm": 1.282870888710022, + "learning_rate": 2.654367697581725e-06, + "loss": 1.3864, + "step": 26226 + }, + { + "epoch": 0.8726734910927945, + "grad_norm": 1.2854727506637573, + "learning_rate": 2.6168348443797175e-06, + "loss": 1.3615, + "step": 26257 + }, + { + "epoch": 0.8737038021802712, + "grad_norm": 1.2982513904571533, + "learning_rate": 2.5795545821542757e-06, + "loss": 1.3636, + "step": 26288 + }, + { + "epoch": 0.874734113267748, + "grad_norm": 1.3433053493499756, + "learning_rate": 2.54252733160808e-06, + "loss": 1.3792, + "step": 26319 + }, + { + "epoch": 0.8757644243552247, + "grad_norm": 1.2748687267303467, + "learning_rate": 2.5057535105886294e-06, + "loss": 1.3822, + "step": 26350 + }, + { + "epoch": 0.8767947354427014, + "grad_norm": 1.1860417127609253, + "learning_rate": 2.4692335340834953e-06, + "loss": 1.3812, + "step": 26381 + }, + { + "epoch": 0.8778250465301781, + "grad_norm": 1.3041021823883057, + "learning_rate": 2.432967814215639e-06, + "loss": 1.3919, + "step": 26412 + }, + { + "epoch": 0.8788553576176549, + "grad_norm": 1.3307725191116333, + "learning_rate": 2.396956760238794e-06, + "loss": 1.3737, + "step": 26443 + }, + { + "epoch": 0.8798856687051316, + "grad_norm": 1.3257086277008057, + "learning_rate": 2.361200778532796e-06, + "loss": 1.3729, + "step": 26474 + }, + { + "epoch": 0.8809159797926084, + "grad_norm": 1.4235222339630127, + "learning_rate": 2.325700272599049e-06, + "loss": 1.3892, + "step": 26505 + }, + { + "epoch": 0.8819462908800851, + "grad_norm": 1.3324629068374634, + "learning_rate": 2.2904556430559415e-06, + "loss": 1.3958, + "step": 26536 + }, + { + "epoch": 0.8829766019675618, + "grad_norm": 1.2705706357955933, + "learning_rate": 2.2554672876343106e-06, + "loss": 1.369, + "step": 26567 + }, + { + "epoch": 0.8840069130550385, + "grad_norm": 1.2910141944885254, + "learning_rate": 2.220735601173002e-06, + "loss": 1.3764, + "step": 26598 + }, + { + "epoch": 0.8850372241425153, + "grad_norm": 1.312762975692749, + "learning_rate": 2.186260975614382e-06, + "loss": 1.3689, + "step": 26629 + }, + { + "epoch": 0.886067535229992, + "grad_norm": 1.279833197593689, + "learning_rate": 2.1520437999999034e-06, + "loss": 1.3816, + "step": 26660 + }, + { + "epoch": 0.8870978463174688, + "grad_norm": 1.312485933303833, + "learning_rate": 2.1180844604657526e-06, + "loss": 1.3935, + "step": 26691 + }, + { + "epoch": 0.8881281574049454, + "grad_norm": 1.2287721633911133, + "learning_rate": 2.084383340238455e-06, + "loss": 1.3721, + "step": 26722 + }, + { + "epoch": 0.8891584684924222, + "grad_norm": 1.2619805335998535, + "learning_rate": 2.0509408196305704e-06, + "loss": 1.3874, + "step": 26753 + }, + { + "epoch": 0.890188779579899, + "grad_norm": 1.3075838088989258, + "learning_rate": 2.017757276036403e-06, + "loss": 1.3888, + "step": 26784 + }, + { + "epoch": 0.8912190906673757, + "grad_norm": 1.257625937461853, + "learning_rate": 1.984833083927726e-06, + "loss": 1.3814, + "step": 26815 + }, + { + "epoch": 0.8922494017548525, + "grad_norm": 1.2962384223937988, + "learning_rate": 1.952168614849581e-06, + "loss": 1.3762, + "step": 26846 + }, + { + "epoch": 0.8932797128423292, + "grad_norm": 1.277114748954773, + "learning_rate": 1.919764237416058e-06, + "loss": 1.3709, + "step": 26877 + }, + { + "epoch": 0.8943100239298059, + "grad_norm": 1.3202005624771118, + "learning_rate": 1.8876203173061463e-06, + "loss": 1.3974, + "step": 26908 + }, + { + "epoch": 0.8953403350172826, + "grad_norm": 1.2782710790634155, + "learning_rate": 1.8557372172596206e-06, + "loss": 1.3882, + "step": 26939 + }, + { + "epoch": 0.8963706461047594, + "grad_norm": 1.1860815286636353, + "learning_rate": 1.8241152970729341e-06, + "loss": 1.359, + "step": 26970 + }, + { + "epoch": 0.8974009571922361, + "grad_norm": 1.2500203847885132, + "learning_rate": 1.7927549135951572e-06, + "loss": 1.3858, + "step": 27001 + }, + { + "epoch": 0.8984312682797129, + "grad_norm": 1.264669418334961, + "learning_rate": 1.7616564207239477e-06, + "loss": 1.3849, + "step": 27032 + }, + { + "epoch": 0.8994615793671896, + "grad_norm": 1.2666518688201904, + "learning_rate": 1.730820169401584e-06, + "loss": 1.4078, + "step": 27063 + }, + { + "epoch": 0.9004918904546663, + "grad_norm": 1.2911863327026367, + "learning_rate": 1.7002465076109558e-06, + "loss": 1.3714, + "step": 27094 + }, + { + "epoch": 0.901522201542143, + "grad_norm": 1.3311351537704468, + "learning_rate": 1.6699357803716898e-06, + "loss": 1.3852, + "step": 27125 + }, + { + "epoch": 0.9025525126296198, + "grad_norm": 1.2619616985321045, + "learning_rate": 1.6398883297362305e-06, + "loss": 1.3778, + "step": 27156 + }, + { + "epoch": 0.9035828237170965, + "grad_norm": 1.245505452156067, + "learning_rate": 1.6101044947859606e-06, + "loss": 1.3928, + "step": 27187 + }, + { + "epoch": 0.9046131348045733, + "grad_norm": 1.2463428974151611, + "learning_rate": 1.5805846116274114e-06, + "loss": 1.373, + "step": 27218 + }, + { + "epoch": 0.90564344589205, + "grad_norm": 1.2582367658615112, + "learning_rate": 1.5513290133884611e-06, + "loss": 1.3829, + "step": 27249 + }, + { + "epoch": 0.9066737569795267, + "grad_norm": 1.3230143785476685, + "learning_rate": 1.5223380302145512e-06, + "loss": 1.3705, + "step": 27280 + }, + { + "epoch": 0.9077040680670034, + "grad_norm": 1.2450213432312012, + "learning_rate": 1.4936119892649925e-06, + "loss": 1.3825, + "step": 27311 + }, + { + "epoch": 0.9087343791544802, + "grad_norm": 1.3045326471328735, + "learning_rate": 1.4651512147092482e-06, + "loss": 1.3619, + "step": 27342 + }, + { + "epoch": 0.9097646902419569, + "grad_norm": 1.3278846740722656, + "learning_rate": 1.4369560277232908e-06, + "loss": 1.3791, + "step": 27373 + }, + { + "epoch": 0.9107950013294337, + "grad_norm": 1.355610728263855, + "learning_rate": 1.409026746485978e-06, + "loss": 1.3973, + "step": 27404 + }, + { + "epoch": 0.9118253124169104, + "grad_norm": 1.219814658164978, + "learning_rate": 1.3813636861754464e-06, + "loss": 1.3874, + "step": 27435 + }, + { + "epoch": 0.9128556235043871, + "grad_norm": 1.27649986743927, + "learning_rate": 1.3539671589655773e-06, + "loss": 1.3752, + "step": 27466 + }, + { + "epoch": 0.9138859345918638, + "grad_norm": 1.2818325757980347, + "learning_rate": 1.3268374740224548e-06, + "loss": 1.3947, + "step": 27497 + }, + { + "epoch": 0.9149162456793406, + "grad_norm": 1.2845216989517212, + "learning_rate": 1.2999749375008807e-06, + "loss": 1.3841, + "step": 27528 + }, + { + "epoch": 0.9159465567668174, + "grad_norm": 1.267153263092041, + "learning_rate": 1.2733798525409346e-06, + "loss": 1.3727, + "step": 27559 + }, + { + "epoch": 0.9169768678542941, + "grad_norm": 1.3620883226394653, + "learning_rate": 1.2470525192645383e-06, + "loss": 1.3962, + "step": 27590 + }, + { + "epoch": 0.9180071789417708, + "grad_norm": 1.273087501525879, + "learning_rate": 1.2209932347720666e-06, + "loss": 1.3821, + "step": 27621 + }, + { + "epoch": 0.9190374900292475, + "grad_norm": 1.274993896484375, + "learning_rate": 1.1952022931389972e-06, + "loss": 1.3867, + "step": 27652 + }, + { + "epoch": 0.9200678011167243, + "grad_norm": 1.3041173219680786, + "learning_rate": 1.1696799854126083e-06, + "loss": 1.3791, + "step": 27683 + }, + { + "epoch": 0.921098112204201, + "grad_norm": 1.2610989809036255, + "learning_rate": 1.1444265996086694e-06, + "loss": 1.3663, + "step": 27714 + }, + { + "epoch": 0.9221284232916778, + "grad_norm": 1.2655807733535767, + "learning_rate": 1.119442420708211e-06, + "loss": 1.3824, + "step": 27745 + }, + { + "epoch": 0.9231587343791545, + "grad_norm": 1.3172816038131714, + "learning_rate": 1.0947277306542964e-06, + "loss": 1.3693, + "step": 27776 + }, + { + "epoch": 0.9241890454666312, + "grad_norm": 1.291439414024353, + "learning_rate": 1.0702828083488353e-06, + "loss": 1.3678, + "step": 27807 + }, + { + "epoch": 0.9252193565541079, + "grad_norm": 1.29994535446167, + "learning_rate": 1.0461079296494647e-06, + "loss": 1.3873, + "step": 27838 + }, + { + "epoch": 0.9262496676415847, + "grad_norm": 1.3204083442687988, + "learning_rate": 1.0222033673663978e-06, + "loss": 1.3667, + "step": 27869 + }, + { + "epoch": 0.9272799787290614, + "grad_norm": 1.3328169584274292, + "learning_rate": 9.985693912593713e-07, + "loss": 1.3922, + "step": 27900 + }, + { + "epoch": 0.9283102898165382, + "grad_norm": 1.3002989292144775, + "learning_rate": 9.752062680346035e-07, + "loss": 1.3782, + "step": 27931 + }, + { + "epoch": 0.9293406009040149, + "grad_norm": 1.2758339643478394, + "learning_rate": 9.521142613417494e-07, + "loss": 1.377, + "step": 27962 + }, + { + "epoch": 0.9303709119914916, + "grad_norm": 1.288136601448059, + "learning_rate": 9.292936317709722e-07, + "loss": 1.3868, + "step": 27993 + }, + { + "epoch": 0.9314012230789683, + "grad_norm": 1.3093862533569336, + "learning_rate": 9.067446368499793e-07, + "loss": 1.3829, + "step": 28024 + }, + { + "epoch": 0.9324315341664451, + "grad_norm": 1.3423957824707031, + "learning_rate": 8.844675310411055e-07, + "loss": 1.3744, + "step": 28055 + }, + { + "epoch": 0.9334618452539218, + "grad_norm": 1.2059671878814697, + "learning_rate": 8.6246256573847e-07, + "loss": 1.3862, + "step": 28086 + }, + { + "epoch": 0.9344921563413986, + "grad_norm": 1.2304048538208008, + "learning_rate": 8.407299892651127e-07, + "loss": 1.3893, + "step": 28117 + }, + { + "epoch": 0.9355224674288753, + "grad_norm": 1.2521936893463135, + "learning_rate": 8.19270046870202e-07, + "loss": 1.3962, + "step": 28148 + }, + { + "epoch": 0.936552778516352, + "grad_norm": 1.3047491312026978, + "learning_rate": 7.980829807262752e-07, + "loss": 1.383, + "step": 28179 + }, + { + "epoch": 0.9375830896038287, + "grad_norm": 1.274664282798767, + "learning_rate": 7.771690299264889e-07, + "loss": 1.3706, + "step": 28210 + }, + { + "epoch": 0.9386134006913055, + "grad_norm": 1.27972412109375, + "learning_rate": 7.565284304819426e-07, + "loss": 1.3849, + "step": 28241 + }, + { + "epoch": 0.9396437117787823, + "grad_norm": 1.3737306594848633, + "learning_rate": 7.361614153189922e-07, + "loss": 1.3777, + "step": 28272 + }, + { + "epoch": 0.940674022866259, + "grad_norm": 1.3274649381637573, + "learning_rate": 7.160682142766328e-07, + "loss": 1.3671, + "step": 28303 + }, + { + "epoch": 0.9417043339537358, + "grad_norm": 1.244835615158081, + "learning_rate": 6.962490541039091e-07, + "loss": 1.3833, + "step": 28334 + }, + { + "epoch": 0.9427346450412124, + "grad_norm": 1.2732888460159302, + "learning_rate": 6.767041584573531e-07, + "loss": 1.4046, + "step": 28365 + }, + { + "epoch": 0.9437649561286892, + "grad_norm": 1.416917085647583, + "learning_rate": 6.574337478984532e-07, + "loss": 1.3771, + "step": 28396 + }, + { + "epoch": 0.9447952672161659, + "grad_norm": 1.285765528678894, + "learning_rate": 6.384380398911732e-07, + "loss": 1.3631, + "step": 28427 + }, + { + "epoch": 0.9458255783036427, + "grad_norm": 1.3602466583251953, + "learning_rate": 6.197172487994951e-07, + "loss": 1.3869, + "step": 28458 + }, + { + "epoch": 0.9468558893911194, + "grad_norm": 1.229366660118103, + "learning_rate": 6.012715858850021e-07, + "loss": 1.3676, + "step": 28489 + }, + { + "epoch": 0.9478862004785961, + "grad_norm": 1.2718359231948853, + "learning_rate": 5.831012593044971e-07, + "loss": 1.3704, + "step": 28520 + }, + { + "epoch": 0.9489165115660728, + "grad_norm": 1.3188321590423584, + "learning_rate": 5.652064741076435e-07, + "loss": 1.3845, + "step": 28551 + }, + { + "epoch": 0.9499468226535496, + "grad_norm": 1.2386760711669922, + "learning_rate": 5.475874322346558e-07, + "loss": 1.3734, + "step": 28582 + }, + { + "epoch": 0.9509771337410263, + "grad_norm": 1.2667484283447266, + "learning_rate": 5.30244332514035e-07, + "loss": 1.3857, + "step": 28613 + }, + { + "epoch": 0.9520074448285031, + "grad_norm": 1.3065145015716553, + "learning_rate": 5.131773706602977e-07, + "loss": 1.3874, + "step": 28644 + }, + { + "epoch": 0.9530377559159798, + "grad_norm": 1.2271697521209717, + "learning_rate": 4.963867392717897e-07, + "loss": 1.3747, + "step": 28675 + }, + { + "epoch": 0.9540680670034565, + "grad_norm": 1.2699921131134033, + "learning_rate": 4.798726278285093e-07, + "loss": 1.364, + "step": 28706 + }, + { + "epoch": 0.9550983780909332, + "grad_norm": 1.2270889282226562, + "learning_rate": 4.6363522268995097e-07, + "loss": 1.3812, + "step": 28737 + }, + { + "epoch": 0.95612868917841, + "grad_norm": 1.2237279415130615, + "learning_rate": 4.4767470709302927e-07, + "loss": 1.3789, + "step": 28768 + }, + { + "epoch": 0.9571590002658867, + "grad_norm": 1.3381733894348145, + "learning_rate": 4.319912611499971e-07, + "loss": 1.378, + "step": 28799 + }, + { + "epoch": 0.9581893113533635, + "grad_norm": 1.2563492059707642, + "learning_rate": 4.1658506184640564e-07, + "loss": 1.3667, + "step": 28830 + }, + { + "epoch": 0.9592196224408402, + "grad_norm": 1.2401185035705566, + "learning_rate": 4.0145628303911996e-07, + "loss": 1.3608, + "step": 28861 + }, + { + "epoch": 0.9602499335283169, + "grad_norm": 1.2803692817687988, + "learning_rate": 3.866050954543565e-07, + "loss": 1.3673, + "step": 28892 + }, + { + "epoch": 0.9612802446157936, + "grad_norm": 1.297766923904419, + "learning_rate": 3.720316666857432e-07, + "loss": 1.3805, + "step": 28923 + }, + { + "epoch": 0.9623105557032704, + "grad_norm": 1.324561357498169, + "learning_rate": 3.5773616119244845e-07, + "loss": 1.3887, + "step": 28954 + }, + { + "epoch": 0.9633408667907472, + "grad_norm": 1.260181188583374, + "learning_rate": 3.437187402973052e-07, + "loss": 1.3622, + "step": 28985 + }, + { + "epoch": 0.9643711778782239, + "grad_norm": 1.3247239589691162, + "learning_rate": 3.2997956218500104e-07, + "loss": 1.3786, + "step": 29016 + }, + { + "epoch": 0.9654014889657007, + "grad_norm": 1.3164467811584473, + "learning_rate": 3.165187819003018e-07, + "loss": 1.3757, + "step": 29047 + }, + { + "epoch": 0.9664318000531773, + "grad_norm": 1.2644035816192627, + "learning_rate": 3.033365513462755e-07, + "loss": 1.364, + "step": 29078 + }, + { + "epoch": 0.9674621111406541, + "grad_norm": 1.3035861253738403, + "learning_rate": 2.9043301928260437e-07, + "loss": 1.394, + "step": 29109 + }, + { + "epoch": 0.9684924222281308, + "grad_norm": 1.2896157503128052, + "learning_rate": 2.7780833132389773e-07, + "loss": 1.3757, + "step": 29140 + }, + { + "epoch": 0.9695227333156076, + "grad_norm": 1.2223542928695679, + "learning_rate": 2.6546262993803473e-07, + "loss": 1.3864, + "step": 29171 + }, + { + "epoch": 0.9705530444030843, + "grad_norm": 1.2267515659332275, + "learning_rate": 2.533960544445879e-07, + "loss": 1.3675, + "step": 29202 + }, + { + "epoch": 0.9715833554905611, + "grad_norm": 1.356759786605835, + "learning_rate": 2.416087410132134e-07, + "loss": 1.3773, + "step": 29233 + }, + { + "epoch": 0.9726136665780377, + "grad_norm": 1.3377565145492554, + "learning_rate": 2.301008226621465e-07, + "loss": 1.379, + "step": 29264 + }, + { + "epoch": 0.9736439776655145, + "grad_norm": 1.3110448122024536, + "learning_rate": 2.1887242925668073e-07, + "loss": 1.3694, + "step": 29295 + }, + { + "epoch": 0.9746742887529912, + "grad_norm": 1.6680659055709839, + "learning_rate": 2.0792368750770785e-07, + "loss": 1.3863, + "step": 29326 + }, + { + "epoch": 0.975704599840468, + "grad_norm": 1.2342867851257324, + "learning_rate": 1.9725472097028851e-07, + "loss": 1.3682, + "step": 29357 + }, + { + "epoch": 0.9767349109279447, + "grad_norm": 1.241080403327942, + "learning_rate": 1.8686565004226718e-07, + "loss": 1.3788, + "step": 29388 + }, + { + "epoch": 0.9777652220154214, + "grad_norm": 1.2744892835617065, + "learning_rate": 1.7675659196288995e-07, + "loss": 1.37, + "step": 29419 + }, + { + "epoch": 0.9787955331028981, + "grad_norm": 1.2974997758865356, + "learning_rate": 1.6692766081150556e-07, + "loss": 1.3879, + "step": 29450 + }, + { + "epoch": 0.9798258441903749, + "grad_norm": 1.2312657833099365, + "learning_rate": 1.5737896750626647e-07, + "loss": 1.3869, + "step": 29481 + }, + { + "epoch": 0.9808561552778516, + "grad_norm": 1.345173954963684, + "learning_rate": 1.4811061980287976e-07, + "loss": 1.3805, + "step": 29512 + }, + { + "epoch": 0.9818864663653284, + "grad_norm": 1.2451786994934082, + "learning_rate": 1.3912272229338886e-07, + "loss": 1.344, + "step": 29543 + }, + { + "epoch": 0.9829167774528051, + "grad_norm": 1.3506325483322144, + "learning_rate": 1.3041537640499645e-07, + "loss": 1.3599, + "step": 29574 + }, + { + "epoch": 0.9839470885402818, + "grad_norm": 1.2373636960983276, + "learning_rate": 1.2198868039891564e-07, + "loss": 1.3783, + "step": 29605 + }, + { + "epoch": 0.9849773996277585, + "grad_norm": 1.2550500631332397, + "learning_rate": 1.138427293692651e-07, + "loss": 1.3775, + "step": 29636 + }, + { + "epoch": 0.9860077107152353, + "grad_norm": 1.2327601909637451, + "learning_rate": 1.0597761524199778e-07, + "loss": 1.3832, + "step": 29667 + }, + { + "epoch": 0.987038021802712, + "grad_norm": 1.3128068447113037, + "learning_rate": 9.839342677385455e-08, + "loss": 1.3747, + "step": 29698 + }, + { + "epoch": 0.9880683328901888, + "grad_norm": 1.3429338932037354, + "learning_rate": 9.109024955137325e-08, + "loss": 1.396, + "step": 29729 + }, + { + "epoch": 0.9890986439776656, + "grad_norm": 1.2928231954574585, + "learning_rate": 8.406816598991729e-08, + "loss": 1.3897, + "step": 29760 + }, + { + "epoch": 0.9901289550651422, + "grad_norm": 1.3004258871078491, + "learning_rate": 7.73272553327431e-08, + "loss": 1.3724, + "step": 29791 + }, + { + "epoch": 0.991159266152619, + "grad_norm": 1.226399302482605, + "learning_rate": 7.086759365011186e-08, + "loss": 1.378, + "step": 29822 + }, + { + "epoch": 0.9921895772400957, + "grad_norm": 1.376007080078125, + "learning_rate": 6.468925383842639e-08, + "loss": 1.3777, + "step": 29853 + }, + { + "epoch": 0.9932198883275725, + "grad_norm": 1.2747802734375, + "learning_rate": 5.8792305619415067e-08, + "loss": 1.3874, + "step": 29884 + }, + { + "epoch": 0.9942501994150492, + "grad_norm": 1.4200111627578735, + "learning_rate": 5.317681553933529e-08, + "loss": 1.3629, + "step": 29915 + }, + { + "epoch": 0.995280510502526, + "grad_norm": 1.256399154663086, + "learning_rate": 4.78428469682296e-08, + "loss": 1.372, + "step": 29946 + }, + { + "epoch": 0.9963108215900026, + "grad_norm": 1.273094654083252, + "learning_rate": 4.2790460099206844e-08, + "loss": 1.3951, + "step": 29977 + }, + { + "epoch": 0.9973411326774794, + "grad_norm": 1.2665611505508423, + "learning_rate": 3.801971194777043e-08, + "loss": 1.3802, + "step": 30008 + }, + { + "epoch": 0.9983714437649561, + "grad_norm": 1.2919692993164062, + "learning_rate": 3.353065635115782e-08, + "loss": 1.3767, + "step": 30039 + }, + { + "epoch": 0.9994017548524329, + "grad_norm": 1.3227587938308716, + "learning_rate": 2.93233439677576e-08, + "loss": 1.3737, + "step": 30070 + }, + { + "epoch": 1.0004320659399095, + "grad_norm": 1.265411615371704, + "learning_rate": 2.539782227651555e-08, + "loss": 1.3559, + "step": 30101 + }, + { + "epoch": 1.0014623770273863, + "grad_norm": 1.3115289211273193, + "learning_rate": 2.175413557641004e-08, + "loss": 1.316, + "step": 30132 + }, + { + "epoch": 1.002492688114863, + "grad_norm": 1.3192830085754395, + "learning_rate": 1.839232498594967e-08, + "loss": 1.3074, + "step": 30163 + }, + { + "epoch": 1.0035229992023398, + "grad_norm": 1.4131948947906494, + "learning_rate": 1.5312428442712522e-08, + "loss": 1.3404, + "step": 30194 + }, + { + "epoch": 1.0045533102898165, + "grad_norm": 1.2529557943344116, + "learning_rate": 1.2514480702913168e-08, + "loss": 1.3232, + "step": 30225 + }, + { + "epoch": 1.0055836213772933, + "grad_norm": 1.3203102350234985, + "learning_rate": 9.998513341005766e-09, + "loss": 1.3345, + "step": 30256 + }, + { + "epoch": 1.00661393246477, + "grad_norm": 1.3507790565490723, + "learning_rate": 7.764554749345454e-09, + "loss": 1.3182, + "step": 30287 + }, + { + "epoch": 1.0076442435522468, + "grad_norm": 1.3324298858642578, + "learning_rate": 5.812630137849717e-09, + "loss": 1.3259, + "step": 30318 + }, + { + "epoch": 1.0086745546397236, + "grad_norm": 1.4362479448318481, + "learning_rate": 4.142761533723616e-09, + "loss": 1.3115, + "step": 30349 + }, + { + "epoch": 1.0097048657272003, + "grad_norm": 1.2116522789001465, + "learning_rate": 2.7549677812044317e-09, + "loss": 1.3282, + "step": 30380 + }, + { + "epoch": 1.0107351768146768, + "grad_norm": 1.279630184173584, + "learning_rate": 1.6492645413590525e-09, + "loss": 1.3183, + "step": 30411 + }, + { + "epoch": 1.0117654879021536, + "grad_norm": 1.4043036699295044, + "learning_rate": 8.256642918980096e-10, + "loss": 1.3294, + "step": 30442 + }, + { + "epoch": 1.0127957989896303, + "grad_norm": 1.2335171699523926, + "learning_rate": 2.841763270367004e-10, + "loss": 1.3185, + "step": 30473 + }, + { + "epoch": 1.013826110077107, + "grad_norm": 1.3882181644439697, + "learning_rate": 2.480675739269245e-11, + "loss": 1.336, + "step": 30504 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.2637062901578334e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-30517/training_args.bin b/checkpoint-30517/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..974208468b82a3c5684aaa384776477cf21c18ca --- /dev/null +++ b/checkpoint-30517/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a23be0ff07d6d3142f7c0980f91dddba845519c24fcb411cbb4b9ddb1513ff +size 5304 diff --git a/checkpoint-3052/config.json b/checkpoint-3052/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aaa74176892d42e1c7f5979b7ddf8ab15985d3 --- /dev/null +++ b/checkpoint-3052/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "/mnt/parscratch/users/acp23ay/private/models/Llama-3.1-8B-Instruct-ta-madlad-mean/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 138256 +} diff --git a/checkpoint-3052/generation_config.json b/checkpoint-3052/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-3052/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-3052/model-00001-of-00007.safetensors b/checkpoint-3052/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..14565d65266b239a4f3f25894c1b2669a4abf0f7 --- /dev/null +++ b/checkpoint-3052/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a9d212fcb20dcb1bac45369843094968fcb43003901fd71566bd07da60b04c2 +size 4983197184 diff --git a/checkpoint-3052/model-00002-of-00007.safetensors b/checkpoint-3052/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d32d83cb96a109af411b9cf577e7fbfe07ea76fc --- /dev/null +++ b/checkpoint-3052/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168629c67732309d49f60a5ec48a6d160212bc987365e82e183dfbf74ba0c1f3 +size 4899116432 diff --git a/checkpoint-3052/model-00003-of-00007.safetensors b/checkpoint-3052/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-3052/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-3052/model-00004-of-00007.safetensors b/checkpoint-3052/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-3052/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-3052/model-00005-of-00007.safetensors b/checkpoint-3052/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-3052/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-3052/model-00006-of-00007.safetensors b/checkpoint-3052/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cf0ec08260119be9458e6624718ac91b85a3a106 --- /dev/null +++ b/checkpoint-3052/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df4261cc6a3d85201bff656a048f7dfece0f4e00308cbc6efe0b3360e1a4e829 +size 4999813120 diff --git a/checkpoint-3052/model-00007-of-00007.safetensors b/checkpoint-3052/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e6b007a4bb661f4b3b16c238156730c46c768a75 --- /dev/null +++ b/checkpoint-3052/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aad39c3ff92c0ee16ccef75a2dd057983291febc474640228eb09f68904e18c +size 2734998184 diff --git a/checkpoint-3052/model.safetensors.index.json b/checkpoint-3052/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..318803c6a3dd771c7f7c3b8038a896af7c8322ae --- /dev/null +++ b/checkpoint-3052/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32448724992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-3052/optimizer.pt b/checkpoint-3052/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d6ccdaa59506d83ee327e452dd6c6709f1d1825 --- /dev/null +++ b/checkpoint-3052/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a23fa50181073107e2296b0eb3e09628fbf7520a27e54d943b75b4181feb60b +size 16040396334 diff --git a/checkpoint-3052/rng_state.pth b/checkpoint-3052/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-3052/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-3052/scheduler.pt b/checkpoint-3052/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b2454d919340cd4d989697a74a27016c58dc3aa --- /dev/null +++ b/checkpoint-3052/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ed9d7fea0b9f468b8c97fd491e0f5a211b8ff197e5f8111c42fc974ecafed4c +size 1064 diff --git a/checkpoint-3052/trainer_state.json b/checkpoint-3052/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..819d7b5b3444c8292e14524a073850cff744dbaa --- /dev/null +++ b/checkpoint-3052/trainer_state.json @@ -0,0 +1,719 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.10143578835416113, + "eval_steps": 500, + "global_step": 3052, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001030311087476735, + "grad_norm": 60.25547409057617, + "learning_rate": 1.0157273918741808e-06, + "loss": 8.8455, + "step": 31 + }, + { + "epoch": 0.00206062217495347, + "grad_norm": 15.669363975524902, + "learning_rate": 2.0314547837483616e-06, + "loss": 7.1553, + "step": 62 + }, + { + "epoch": 0.003090933262430205, + "grad_norm": 15.366345405578613, + "learning_rate": 3.0471821756225426e-06, + "loss": 5.8784, + "step": 93 + }, + { + "epoch": 0.00412124434990694, + "grad_norm": 36.30561828613281, + "learning_rate": 4.062909567496723e-06, + "loss": 4.7708, + "step": 124 + }, + { + "epoch": 0.005151555437383675, + "grad_norm": 27.202678680419922, + "learning_rate": 5.078636959370905e-06, + "loss": 4.1629, + "step": 155 + }, + { + "epoch": 0.00618186652486041, + "grad_norm": 24.30484962463379, + "learning_rate": 6.094364351245085e-06, + "loss": 3.867, + "step": 186 + }, + { + "epoch": 0.007212177612337145, + "grad_norm": 19.916366577148438, + "learning_rate": 7.110091743119267e-06, + "loss": 3.6131, + "step": 217 + }, + { + "epoch": 0.00824248869981388, + "grad_norm": 17.577274322509766, + "learning_rate": 8.125819134993446e-06, + "loss": 3.4772, + "step": 248 + }, + { + "epoch": 0.009272799787290615, + "grad_norm": 12.133190155029297, + "learning_rate": 9.141546526867629e-06, + "loss": 3.3218, + "step": 279 + }, + { + "epoch": 0.01030311087476735, + "grad_norm": 19.79263687133789, + "learning_rate": 1.015727391874181e-05, + "loss": 3.2055, + "step": 310 + }, + { + "epoch": 0.011333421962244085, + "grad_norm": 16.38133430480957, + "learning_rate": 1.117300131061599e-05, + "loss": 3.1062, + "step": 341 + }, + { + "epoch": 0.01236373304972082, + "grad_norm": 12.638299942016602, + "learning_rate": 1.218872870249017e-05, + "loss": 3.0106, + "step": 372 + }, + { + "epoch": 0.013394044137197554, + "grad_norm": 9.46596908569336, + "learning_rate": 1.3204456094364351e-05, + "loss": 2.924, + "step": 403 + }, + { + "epoch": 0.01442435522467429, + "grad_norm": 10.945392608642578, + "learning_rate": 1.4220183486238533e-05, + "loss": 2.844, + "step": 434 + }, + { + "epoch": 0.015454666312151024, + "grad_norm": 8.474015235900879, + "learning_rate": 1.5235910878112714e-05, + "loss": 2.7892, + "step": 465 + }, + { + "epoch": 0.01648497739962776, + "grad_norm": 9.370804786682129, + "learning_rate": 1.6251638269986893e-05, + "loss": 2.7509, + "step": 496 + }, + { + "epoch": 0.017515288487104493, + "grad_norm": 11.63398551940918, + "learning_rate": 1.7267365661861077e-05, + "loss": 2.6999, + "step": 527 + }, + { + "epoch": 0.01854559957458123, + "grad_norm": 9.17713451385498, + "learning_rate": 1.8283093053735257e-05, + "loss": 2.6459, + "step": 558 + }, + { + "epoch": 0.019575910662057962, + "grad_norm": 7.119054794311523, + "learning_rate": 1.9298820445609438e-05, + "loss": 2.603, + "step": 589 + }, + { + "epoch": 0.0206062217495347, + "grad_norm": 6.653646945953369, + "learning_rate": 2.031454783748362e-05, + "loss": 2.5588, + "step": 620 + }, + { + "epoch": 0.021636532837011432, + "grad_norm": 8.332653045654297, + "learning_rate": 2.13302752293578e-05, + "loss": 2.5357, + "step": 651 + }, + { + "epoch": 0.02266684392448817, + "grad_norm": 6.4949116706848145, + "learning_rate": 2.234600262123198e-05, + "loss": 2.4967, + "step": 682 + }, + { + "epoch": 0.023697155011964902, + "grad_norm": 9.41009521484375, + "learning_rate": 2.336173001310616e-05, + "loss": 2.4563, + "step": 713 + }, + { + "epoch": 0.02472746609944164, + "grad_norm": 7.840345859527588, + "learning_rate": 2.437745740498034e-05, + "loss": 2.4383, + "step": 744 + }, + { + "epoch": 0.025757777186918372, + "grad_norm": 6.116458415985107, + "learning_rate": 2.5393184796854525e-05, + "loss": 2.3817, + "step": 775 + }, + { + "epoch": 0.02678808827439511, + "grad_norm": 5.938300609588623, + "learning_rate": 2.6408912188728702e-05, + "loss": 2.3508, + "step": 806 + }, + { + "epoch": 0.027818399361871842, + "grad_norm": 5.4408345222473145, + "learning_rate": 2.7424639580602886e-05, + "loss": 2.3325, + "step": 837 + }, + { + "epoch": 0.02884871044934858, + "grad_norm": 5.375136375427246, + "learning_rate": 2.8440366972477066e-05, + "loss": 2.3101, + "step": 868 + }, + { + "epoch": 0.029879021536825312, + "grad_norm": 5.149726867675781, + "learning_rate": 2.9456094364351244e-05, + "loss": 2.282, + "step": 899 + }, + { + "epoch": 0.03090933262430205, + "grad_norm": 4.591221332550049, + "learning_rate": 3.0471821756225428e-05, + "loss": 2.2427, + "step": 930 + }, + { + "epoch": 0.031939643711778785, + "grad_norm": 4.977034091949463, + "learning_rate": 3.148754914809961e-05, + "loss": 2.2218, + "step": 961 + }, + { + "epoch": 0.03296995479925552, + "grad_norm": 5.038781642913818, + "learning_rate": 3.2503276539973785e-05, + "loss": 2.2044, + "step": 992 + }, + { + "epoch": 0.03400026588673225, + "grad_norm": 4.872281551361084, + "learning_rate": 3.351900393184797e-05, + "loss": 2.1657, + "step": 1023 + }, + { + "epoch": 0.035030576974208985, + "grad_norm": 4.370841979980469, + "learning_rate": 3.453473132372215e-05, + "loss": 2.1365, + "step": 1054 + }, + { + "epoch": 0.036060888061685725, + "grad_norm": 4.087072849273682, + "learning_rate": 3.555045871559633e-05, + "loss": 2.1253, + "step": 1085 + }, + { + "epoch": 0.03709119914916246, + "grad_norm": 4.113957405090332, + "learning_rate": 3.6566186107470514e-05, + "loss": 2.0973, + "step": 1116 + }, + { + "epoch": 0.03812151023663919, + "grad_norm": 4.0119733810424805, + "learning_rate": 3.7581913499344695e-05, + "loss": 2.1024, + "step": 1147 + }, + { + "epoch": 0.039151821324115925, + "grad_norm": 4.247573375701904, + "learning_rate": 3.8597640891218876e-05, + "loss": 2.0722, + "step": 1178 + }, + { + "epoch": 0.04018213241159266, + "grad_norm": 3.5575129985809326, + "learning_rate": 3.9613368283093056e-05, + "loss": 2.056, + "step": 1209 + }, + { + "epoch": 0.0412124434990694, + "grad_norm": 3.8885862827301025, + "learning_rate": 4.062909567496724e-05, + "loss": 2.0389, + "step": 1240 + }, + { + "epoch": 0.04224275458654613, + "grad_norm": 3.680628538131714, + "learning_rate": 4.164482306684142e-05, + "loss": 2.0385, + "step": 1271 + }, + { + "epoch": 0.043273065674022865, + "grad_norm": 3.780876874923706, + "learning_rate": 4.26605504587156e-05, + "loss": 2.0097, + "step": 1302 + }, + { + "epoch": 0.0443033767614996, + "grad_norm": 4.235328674316406, + "learning_rate": 4.367627785058978e-05, + "loss": 2.0024, + "step": 1333 + }, + { + "epoch": 0.04533368784897634, + "grad_norm": 3.326941967010498, + "learning_rate": 4.469200524246396e-05, + "loss": 1.9953, + "step": 1364 + }, + { + "epoch": 0.04636399893645307, + "grad_norm": 3.28456449508667, + "learning_rate": 4.570773263433814e-05, + "loss": 1.9579, + "step": 1395 + }, + { + "epoch": 0.047394310023929805, + "grad_norm": 16.107433319091797, + "learning_rate": 4.672346002621232e-05, + "loss": 1.9701, + "step": 1426 + }, + { + "epoch": 0.04842462111140654, + "grad_norm": 3.5708224773406982, + "learning_rate": 4.77391874180865e-05, + "loss": 1.9621, + "step": 1457 + }, + { + "epoch": 0.04945493219888328, + "grad_norm": 2.9053499698638916, + "learning_rate": 4.875491480996068e-05, + "loss": 1.9458, + "step": 1488 + }, + { + "epoch": 0.05048524328636001, + "grad_norm": 3.0863258838653564, + "learning_rate": 4.977064220183487e-05, + "loss": 1.9483, + "step": 1519 + }, + { + "epoch": 0.051515554373836744, + "grad_norm": 2.9012269973754883, + "learning_rate": 4.9999915451558777e-05, + "loss": 1.928, + "step": 1550 + }, + { + "epoch": 0.05254586546131348, + "grad_norm": 3.0949041843414307, + "learning_rate": 4.999955597496219e-05, + "loss": 1.9229, + "step": 1581 + }, + { + "epoch": 0.05357617654879022, + "grad_norm": 2.8687901496887207, + "learning_rate": 4.9998914381774255e-05, + "loss": 1.915, + "step": 1612 + }, + { + "epoch": 0.05460648763626695, + "grad_norm": 3.2136878967285156, + "learning_rate": 4.999799067923527e-05, + "loss": 1.9197, + "step": 1643 + }, + { + "epoch": 0.055636798723743684, + "grad_norm": 2.590843677520752, + "learning_rate": 4.999678487776908e-05, + "loss": 1.8756, + "step": 1674 + }, + { + "epoch": 0.05666710981122042, + "grad_norm": 2.64634108543396, + "learning_rate": 4.9995296990983006e-05, + "loss": 1.9033, + "step": 1705 + }, + { + "epoch": 0.05769742089869716, + "grad_norm": 3.0151331424713135, + "learning_rate": 4.999352703566763e-05, + "loss": 1.8883, + "step": 1736 + }, + { + "epoch": 0.05872773198617389, + "grad_norm": 2.526806354522705, + "learning_rate": 4.999147503179668e-05, + "loss": 1.8666, + "step": 1767 + }, + { + "epoch": 0.059758043073650624, + "grad_norm": 2.510300397872925, + "learning_rate": 4.998914100252672e-05, + "loss": 1.854, + "step": 1798 + }, + { + "epoch": 0.06078835416112736, + "grad_norm": 2.4867682456970215, + "learning_rate": 4.998652497419696e-05, + "loss": 1.8548, + "step": 1829 + }, + { + "epoch": 0.0618186652486041, + "grad_norm": 2.3920586109161377, + "learning_rate": 4.9983626976328927e-05, + "loss": 1.8495, + "step": 1860 + }, + { + "epoch": 0.06284897633608083, + "grad_norm": 2.714177370071411, + "learning_rate": 4.998044704162613e-05, + "loss": 1.8433, + "step": 1891 + }, + { + "epoch": 0.06387928742355757, + "grad_norm": 2.3094465732574463, + "learning_rate": 4.9976985205973705e-05, + "loss": 1.8382, + "step": 1922 + }, + { + "epoch": 0.0649095985110343, + "grad_norm": 2.47184419631958, + "learning_rate": 4.997324150843799e-05, + "loss": 1.8464, + "step": 1953 + }, + { + "epoch": 0.06593990959851104, + "grad_norm": 2.391841411590576, + "learning_rate": 4.99692159912661e-05, + "loss": 1.8179, + "step": 1984 + }, + { + "epoch": 0.06697022068598776, + "grad_norm": 2.2471864223480225, + "learning_rate": 4.996490869988546e-05, + "loss": 1.8149, + "step": 2015 + }, + { + "epoch": 0.0680005317734645, + "grad_norm": 2.5497376918792725, + "learning_rate": 4.996031968290326e-05, + "loss": 1.8099, + "step": 2046 + }, + { + "epoch": 0.06903084286094124, + "grad_norm": 2.330463409423828, + "learning_rate": 4.995544899210594e-05, + "loss": 1.8267, + "step": 2077 + }, + { + "epoch": 0.07006115394841797, + "grad_norm": 2.3259341716766357, + "learning_rate": 4.9950296682458583e-05, + "loss": 1.7801, + "step": 2108 + }, + { + "epoch": 0.07109146503589471, + "grad_norm": 2.1711952686309814, + "learning_rate": 4.994486281210429e-05, + "loss": 1.7961, + "step": 2139 + }, + { + "epoch": 0.07212177612337145, + "grad_norm": 2.1808884143829346, + "learning_rate": 4.9939147442363566e-05, + "loss": 1.8109, + "step": 2170 + }, + { + "epoch": 0.07315208721084818, + "grad_norm": 2.089256525039673, + "learning_rate": 4.9933150637733574e-05, + "loss": 1.8026, + "step": 2201 + }, + { + "epoch": 0.07418239829832492, + "grad_norm": 2.0864951610565186, + "learning_rate": 4.992687246588743e-05, + "loss": 1.7753, + "step": 2232 + }, + { + "epoch": 0.07521270938580164, + "grad_norm": 2.36157488822937, + "learning_rate": 4.992031299767347e-05, + "loss": 1.7746, + "step": 2263 + }, + { + "epoch": 0.07624302047327838, + "grad_norm": 2.5334439277648926, + "learning_rate": 4.9913472307114386e-05, + "loss": 1.7927, + "step": 2294 + }, + { + "epoch": 0.07727333156075512, + "grad_norm": 2.2565715312957764, + "learning_rate": 4.9906350471406446e-05, + "loss": 1.7668, + "step": 2325 + }, + { + "epoch": 0.07830364264823185, + "grad_norm": 2.1043128967285156, + "learning_rate": 4.989894757091861e-05, + "loss": 1.7771, + "step": 2356 + }, + { + "epoch": 0.07933395373570859, + "grad_norm": 1.9659819602966309, + "learning_rate": 4.989126368919158e-05, + "loss": 1.7666, + "step": 2387 + }, + { + "epoch": 0.08036426482318532, + "grad_norm": 2.0778403282165527, + "learning_rate": 4.988329891293693e-05, + "loss": 1.7405, + "step": 2418 + }, + { + "epoch": 0.08139457591066206, + "grad_norm": 2.1767923831939697, + "learning_rate": 4.987505333203608e-05, + "loss": 1.7495, + "step": 2449 + }, + { + "epoch": 0.0824248869981388, + "grad_norm": 2.260143280029297, + "learning_rate": 4.9866527039539276e-05, + "loss": 1.7504, + "step": 2480 + }, + { + "epoch": 0.08345519808561552, + "grad_norm": 2.18271803855896, + "learning_rate": 4.9857720131664594e-05, + "loss": 1.7456, + "step": 2511 + }, + { + "epoch": 0.08448550917309226, + "grad_norm": 2.209594964981079, + "learning_rate": 4.9848632707796773e-05, + "loss": 1.7528, + "step": 2542 + }, + { + "epoch": 0.085515820260569, + "grad_norm": 2.0666229724884033, + "learning_rate": 4.9839264870486155e-05, + "loss": 1.7517, + "step": 2573 + }, + { + "epoch": 0.08654613134804573, + "grad_norm": 2.1070454120635986, + "learning_rate": 4.9829616725447526e-05, + "loss": 1.7474, + "step": 2604 + }, + { + "epoch": 0.08757644243552247, + "grad_norm": 1.9430303573608398, + "learning_rate": 4.981968838155888e-05, + "loss": 1.7348, + "step": 2635 + }, + { + "epoch": 0.0886067535229992, + "grad_norm": 1.9638925790786743, + "learning_rate": 4.980947995086024e-05, + "loss": 1.7202, + "step": 2666 + }, + { + "epoch": 0.08963706461047594, + "grad_norm": 1.8845652341842651, + "learning_rate": 4.979899154855234e-05, + "loss": 1.7375, + "step": 2697 + }, + { + "epoch": 0.09066737569795268, + "grad_norm": 5.712058067321777, + "learning_rate": 4.9788223292995386e-05, + "loss": 1.7379, + "step": 2728 + }, + { + "epoch": 0.0916976867854294, + "grad_norm": 1.9520670175552368, + "learning_rate": 4.977717530570768e-05, + "loss": 1.7302, + "step": 2759 + }, + { + "epoch": 0.09272799787290614, + "grad_norm": 1.8802224397659302, + "learning_rate": 4.976584771136425e-05, + "loss": 1.74, + "step": 2790 + }, + { + "epoch": 0.09375830896038288, + "grad_norm": 2.1098153591156006, + "learning_rate": 4.975424063779547e-05, + "loss": 1.7024, + "step": 2821 + }, + { + "epoch": 0.09478862004785961, + "grad_norm": 2.1568291187286377, + "learning_rate": 4.974235421598557e-05, + "loss": 1.7131, + "step": 2852 + }, + { + "epoch": 0.09581893113533635, + "grad_norm": 1.8769980669021606, + "learning_rate": 4.973018858007122e-05, + "loss": 1.7008, + "step": 2883 + }, + { + "epoch": 0.09684924222281308, + "grad_norm": 1.8325533866882324, + "learning_rate": 4.9717743867339963e-05, + "loss": 1.7058, + "step": 2914 + }, + { + "epoch": 0.09787955331028982, + "grad_norm": 2.086416721343994, + "learning_rate": 4.9705020218228695e-05, + "loss": 1.711, + "step": 2945 + }, + { + "epoch": 0.09890986439776656, + "grad_norm": 1.8294793367385864, + "learning_rate": 4.969201777632205e-05, + "loss": 1.6998, + "step": 2976 + }, + { + "epoch": 0.09994017548524328, + "grad_norm": 2.0608153343200684, + "learning_rate": 4.9678736688350846e-05, + "loss": 1.6948, + "step": 3007 + }, + { + "epoch": 0.10097048657272002, + "grad_norm": 3.2166008949279785, + "learning_rate": 4.966517710419033e-05, + "loss": 1.6788, + "step": 3038 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.263945054022271e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3052/training_args.bin b/checkpoint-3052/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..974208468b82a3c5684aaa384776477cf21c18ca --- /dev/null +++ b/checkpoint-3052/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a23be0ff07d6d3142f7c0980f91dddba845519c24fcb411cbb4b9ddb1513ff +size 5304 diff --git a/checkpoint-6104/config.json b/checkpoint-6104/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aaa74176892d42e1c7f5979b7ddf8ab15985d3 --- /dev/null +++ b/checkpoint-6104/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "/mnt/parscratch/users/acp23ay/private/models/Llama-3.1-8B-Instruct-ta-madlad-mean/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 138256 +} diff --git a/checkpoint-6104/generation_config.json b/checkpoint-6104/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-6104/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-6104/model-00001-of-00007.safetensors b/checkpoint-6104/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..69b275ca8c07ea1a9e91150fccffdf67f22ca916 --- /dev/null +++ b/checkpoint-6104/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b8bb51216c4b3d45d3e11feae96d6922209ca476bfb5962d157bf85f8d4195d +size 4983197184 diff --git a/checkpoint-6104/model-00002-of-00007.safetensors b/checkpoint-6104/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d32d83cb96a109af411b9cf577e7fbfe07ea76fc --- /dev/null +++ b/checkpoint-6104/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168629c67732309d49f60a5ec48a6d160212bc987365e82e183dfbf74ba0c1f3 +size 4899116432 diff --git a/checkpoint-6104/model-00003-of-00007.safetensors b/checkpoint-6104/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-6104/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-6104/model-00004-of-00007.safetensors b/checkpoint-6104/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-6104/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-6104/model-00005-of-00007.safetensors b/checkpoint-6104/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-6104/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-6104/model-00006-of-00007.safetensors b/checkpoint-6104/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..051901761d55b32441ead32046695512a493ae65 --- /dev/null +++ b/checkpoint-6104/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9756983fc95b180b1c63c6504c2bdcba25fb2a44751fa08c96566ade399229d +size 4999813120 diff --git a/checkpoint-6104/model-00007-of-00007.safetensors b/checkpoint-6104/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..678ec804ce8a18391749dc35d0f83a935157cce2 --- /dev/null +++ b/checkpoint-6104/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:553fb64fa8a1ef5b0d2e8d3ae106511e477e0caec986f33bc58eb74ca049f1d4 +size 2734998184 diff --git a/checkpoint-6104/model.safetensors.index.json b/checkpoint-6104/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..318803c6a3dd771c7f7c3b8038a896af7c8322ae --- /dev/null +++ b/checkpoint-6104/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32448724992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-6104/optimizer.pt b/checkpoint-6104/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..68fcfe37bb26f448ef5a99edfe7751d294d0f073 --- /dev/null +++ b/checkpoint-6104/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b57e5878c1070e8412c93f14fcabdb722a51838fa63772d9b3e3d021b84c176 +size 16040396334 diff --git a/checkpoint-6104/rng_state.pth b/checkpoint-6104/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-6104/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-6104/scheduler.pt b/checkpoint-6104/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f5e5514ba898102fcdb5731bba1ae1c2957e6e5 --- /dev/null +++ b/checkpoint-6104/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:107e0617754026d870a7da422dabb716a8dc7d3a550066ff507e37f8f0818429 +size 1064 diff --git a/checkpoint-6104/trainer_state.json b/checkpoint-6104/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..eb1499014531fabaff14b20594db8a706a25c5a0 --- /dev/null +++ b/checkpoint-6104/trainer_state.json @@ -0,0 +1,1405 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.20287157670832226, + "eval_steps": 500, + "global_step": 6104, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001030311087476735, + "grad_norm": 60.25547409057617, + "learning_rate": 1.0157273918741808e-06, + "loss": 8.8455, + "step": 31 + }, + { + "epoch": 0.00206062217495347, + "grad_norm": 15.669363975524902, + "learning_rate": 2.0314547837483616e-06, + "loss": 7.1553, + "step": 62 + }, + { + "epoch": 0.003090933262430205, + "grad_norm": 15.366345405578613, + "learning_rate": 3.0471821756225426e-06, + "loss": 5.8784, + "step": 93 + }, + { + "epoch": 0.00412124434990694, + "grad_norm": 36.30561828613281, + "learning_rate": 4.062909567496723e-06, + "loss": 4.7708, + "step": 124 + }, + { + "epoch": 0.005151555437383675, + "grad_norm": 27.202678680419922, + "learning_rate": 5.078636959370905e-06, + "loss": 4.1629, + "step": 155 + }, + { + "epoch": 0.00618186652486041, + "grad_norm": 24.30484962463379, + "learning_rate": 6.094364351245085e-06, + "loss": 3.867, + "step": 186 + }, + { + "epoch": 0.007212177612337145, + "grad_norm": 19.916366577148438, + "learning_rate": 7.110091743119267e-06, + "loss": 3.6131, + "step": 217 + }, + { + "epoch": 0.00824248869981388, + "grad_norm": 17.577274322509766, + "learning_rate": 8.125819134993446e-06, + "loss": 3.4772, + "step": 248 + }, + { + "epoch": 0.009272799787290615, + "grad_norm": 12.133190155029297, + "learning_rate": 9.141546526867629e-06, + "loss": 3.3218, + "step": 279 + }, + { + "epoch": 0.01030311087476735, + "grad_norm": 19.79263687133789, + "learning_rate": 1.015727391874181e-05, + "loss": 3.2055, + "step": 310 + }, + { + "epoch": 0.011333421962244085, + "grad_norm": 16.38133430480957, + "learning_rate": 1.117300131061599e-05, + "loss": 3.1062, + "step": 341 + }, + { + "epoch": 0.01236373304972082, + "grad_norm": 12.638299942016602, + "learning_rate": 1.218872870249017e-05, + "loss": 3.0106, + "step": 372 + }, + { + "epoch": 0.013394044137197554, + "grad_norm": 9.46596908569336, + "learning_rate": 1.3204456094364351e-05, + "loss": 2.924, + "step": 403 + }, + { + "epoch": 0.01442435522467429, + "grad_norm": 10.945392608642578, + "learning_rate": 1.4220183486238533e-05, + "loss": 2.844, + "step": 434 + }, + { + "epoch": 0.015454666312151024, + "grad_norm": 8.474015235900879, + "learning_rate": 1.5235910878112714e-05, + "loss": 2.7892, + "step": 465 + }, + { + "epoch": 0.01648497739962776, + "grad_norm": 9.370804786682129, + "learning_rate": 1.6251638269986893e-05, + "loss": 2.7509, + "step": 496 + }, + { + "epoch": 0.017515288487104493, + "grad_norm": 11.63398551940918, + "learning_rate": 1.7267365661861077e-05, + "loss": 2.6999, + "step": 527 + }, + { + "epoch": 0.01854559957458123, + "grad_norm": 9.17713451385498, + "learning_rate": 1.8283093053735257e-05, + "loss": 2.6459, + "step": 558 + }, + { + "epoch": 0.019575910662057962, + "grad_norm": 7.119054794311523, + "learning_rate": 1.9298820445609438e-05, + "loss": 2.603, + "step": 589 + }, + { + "epoch": 0.0206062217495347, + "grad_norm": 6.653646945953369, + "learning_rate": 2.031454783748362e-05, + "loss": 2.5588, + "step": 620 + }, + { + "epoch": 0.021636532837011432, + "grad_norm": 8.332653045654297, + "learning_rate": 2.13302752293578e-05, + "loss": 2.5357, + "step": 651 + }, + { + "epoch": 0.02266684392448817, + "grad_norm": 6.4949116706848145, + "learning_rate": 2.234600262123198e-05, + "loss": 2.4967, + "step": 682 + }, + { + "epoch": 0.023697155011964902, + "grad_norm": 9.41009521484375, + "learning_rate": 2.336173001310616e-05, + "loss": 2.4563, + "step": 713 + }, + { + "epoch": 0.02472746609944164, + "grad_norm": 7.840345859527588, + "learning_rate": 2.437745740498034e-05, + "loss": 2.4383, + "step": 744 + }, + { + "epoch": 0.025757777186918372, + "grad_norm": 6.116458415985107, + "learning_rate": 2.5393184796854525e-05, + "loss": 2.3817, + "step": 775 + }, + { + "epoch": 0.02678808827439511, + "grad_norm": 5.938300609588623, + "learning_rate": 2.6408912188728702e-05, + "loss": 2.3508, + "step": 806 + }, + { + "epoch": 0.027818399361871842, + "grad_norm": 5.4408345222473145, + "learning_rate": 2.7424639580602886e-05, + "loss": 2.3325, + "step": 837 + }, + { + "epoch": 0.02884871044934858, + "grad_norm": 5.375136375427246, + "learning_rate": 2.8440366972477066e-05, + "loss": 2.3101, + "step": 868 + }, + { + "epoch": 0.029879021536825312, + "grad_norm": 5.149726867675781, + "learning_rate": 2.9456094364351244e-05, + "loss": 2.282, + "step": 899 + }, + { + "epoch": 0.03090933262430205, + "grad_norm": 4.591221332550049, + "learning_rate": 3.0471821756225428e-05, + "loss": 2.2427, + "step": 930 + }, + { + "epoch": 0.031939643711778785, + "grad_norm": 4.977034091949463, + "learning_rate": 3.148754914809961e-05, + "loss": 2.2218, + "step": 961 + }, + { + "epoch": 0.03296995479925552, + "grad_norm": 5.038781642913818, + "learning_rate": 3.2503276539973785e-05, + "loss": 2.2044, + "step": 992 + }, + { + "epoch": 0.03400026588673225, + "grad_norm": 4.872281551361084, + "learning_rate": 3.351900393184797e-05, + "loss": 2.1657, + "step": 1023 + }, + { + "epoch": 0.035030576974208985, + "grad_norm": 4.370841979980469, + "learning_rate": 3.453473132372215e-05, + "loss": 2.1365, + "step": 1054 + }, + { + "epoch": 0.036060888061685725, + "grad_norm": 4.087072849273682, + "learning_rate": 3.555045871559633e-05, + "loss": 2.1253, + "step": 1085 + }, + { + "epoch": 0.03709119914916246, + "grad_norm": 4.113957405090332, + "learning_rate": 3.6566186107470514e-05, + "loss": 2.0973, + "step": 1116 + }, + { + "epoch": 0.03812151023663919, + "grad_norm": 4.0119733810424805, + "learning_rate": 3.7581913499344695e-05, + "loss": 2.1024, + "step": 1147 + }, + { + "epoch": 0.039151821324115925, + "grad_norm": 4.247573375701904, + "learning_rate": 3.8597640891218876e-05, + "loss": 2.0722, + "step": 1178 + }, + { + "epoch": 0.04018213241159266, + "grad_norm": 3.5575129985809326, + "learning_rate": 3.9613368283093056e-05, + "loss": 2.056, + "step": 1209 + }, + { + "epoch": 0.0412124434990694, + "grad_norm": 3.8885862827301025, + "learning_rate": 4.062909567496724e-05, + "loss": 2.0389, + "step": 1240 + }, + { + "epoch": 0.04224275458654613, + "grad_norm": 3.680628538131714, + "learning_rate": 4.164482306684142e-05, + "loss": 2.0385, + "step": 1271 + }, + { + "epoch": 0.043273065674022865, + "grad_norm": 3.780876874923706, + "learning_rate": 4.26605504587156e-05, + "loss": 2.0097, + "step": 1302 + }, + { + "epoch": 0.0443033767614996, + "grad_norm": 4.235328674316406, + "learning_rate": 4.367627785058978e-05, + "loss": 2.0024, + "step": 1333 + }, + { + "epoch": 0.04533368784897634, + "grad_norm": 3.326941967010498, + "learning_rate": 4.469200524246396e-05, + "loss": 1.9953, + "step": 1364 + }, + { + "epoch": 0.04636399893645307, + "grad_norm": 3.28456449508667, + "learning_rate": 4.570773263433814e-05, + "loss": 1.9579, + "step": 1395 + }, + { + "epoch": 0.047394310023929805, + "grad_norm": 16.107433319091797, + "learning_rate": 4.672346002621232e-05, + "loss": 1.9701, + "step": 1426 + }, + { + "epoch": 0.04842462111140654, + "grad_norm": 3.5708224773406982, + "learning_rate": 4.77391874180865e-05, + "loss": 1.9621, + "step": 1457 + }, + { + "epoch": 0.04945493219888328, + "grad_norm": 2.9053499698638916, + "learning_rate": 4.875491480996068e-05, + "loss": 1.9458, + "step": 1488 + }, + { + "epoch": 0.05048524328636001, + "grad_norm": 3.0863258838653564, + "learning_rate": 4.977064220183487e-05, + "loss": 1.9483, + "step": 1519 + }, + { + "epoch": 0.051515554373836744, + "grad_norm": 2.9012269973754883, + "learning_rate": 4.9999915451558777e-05, + "loss": 1.928, + "step": 1550 + }, + { + "epoch": 0.05254586546131348, + "grad_norm": 3.0949041843414307, + "learning_rate": 4.999955597496219e-05, + "loss": 1.9229, + "step": 1581 + }, + { + "epoch": 0.05357617654879022, + "grad_norm": 2.8687901496887207, + "learning_rate": 4.9998914381774255e-05, + "loss": 1.915, + "step": 1612 + }, + { + "epoch": 0.05460648763626695, + "grad_norm": 3.2136878967285156, + "learning_rate": 4.999799067923527e-05, + "loss": 1.9197, + "step": 1643 + }, + { + "epoch": 0.055636798723743684, + "grad_norm": 2.590843677520752, + "learning_rate": 4.999678487776908e-05, + "loss": 1.8756, + "step": 1674 + }, + { + "epoch": 0.05666710981122042, + "grad_norm": 2.64634108543396, + "learning_rate": 4.9995296990983006e-05, + "loss": 1.9033, + "step": 1705 + }, + { + "epoch": 0.05769742089869716, + "grad_norm": 3.0151331424713135, + "learning_rate": 4.999352703566763e-05, + "loss": 1.8883, + "step": 1736 + }, + { + "epoch": 0.05872773198617389, + "grad_norm": 2.526806354522705, + "learning_rate": 4.999147503179668e-05, + "loss": 1.8666, + "step": 1767 + }, + { + "epoch": 0.059758043073650624, + "grad_norm": 2.510300397872925, + "learning_rate": 4.998914100252672e-05, + "loss": 1.854, + "step": 1798 + }, + { + "epoch": 0.06078835416112736, + "grad_norm": 2.4867682456970215, + "learning_rate": 4.998652497419696e-05, + "loss": 1.8548, + "step": 1829 + }, + { + "epoch": 0.0618186652486041, + "grad_norm": 2.3920586109161377, + "learning_rate": 4.9983626976328927e-05, + "loss": 1.8495, + "step": 1860 + }, + { + "epoch": 0.06284897633608083, + "grad_norm": 2.714177370071411, + "learning_rate": 4.998044704162613e-05, + "loss": 1.8433, + "step": 1891 + }, + { + "epoch": 0.06387928742355757, + "grad_norm": 2.3094465732574463, + "learning_rate": 4.9976985205973705e-05, + "loss": 1.8382, + "step": 1922 + }, + { + "epoch": 0.0649095985110343, + "grad_norm": 2.47184419631958, + "learning_rate": 4.997324150843799e-05, + "loss": 1.8464, + "step": 1953 + }, + { + "epoch": 0.06593990959851104, + "grad_norm": 2.391841411590576, + "learning_rate": 4.99692159912661e-05, + "loss": 1.8179, + "step": 1984 + }, + { + "epoch": 0.06697022068598776, + "grad_norm": 2.2471864223480225, + "learning_rate": 4.996490869988546e-05, + "loss": 1.8149, + "step": 2015 + }, + { + "epoch": 0.0680005317734645, + "grad_norm": 2.5497376918792725, + "learning_rate": 4.996031968290326e-05, + "loss": 1.8099, + "step": 2046 + }, + { + "epoch": 0.06903084286094124, + "grad_norm": 2.330463409423828, + "learning_rate": 4.995544899210594e-05, + "loss": 1.8267, + "step": 2077 + }, + { + "epoch": 0.07006115394841797, + "grad_norm": 2.3259341716766357, + "learning_rate": 4.9950296682458583e-05, + "loss": 1.7801, + "step": 2108 + }, + { + "epoch": 0.07109146503589471, + "grad_norm": 2.1711952686309814, + "learning_rate": 4.994486281210429e-05, + "loss": 1.7961, + "step": 2139 + }, + { + "epoch": 0.07212177612337145, + "grad_norm": 2.1808884143829346, + "learning_rate": 4.9939147442363566e-05, + "loss": 1.8109, + "step": 2170 + }, + { + "epoch": 0.07315208721084818, + "grad_norm": 2.089256525039673, + "learning_rate": 4.9933150637733574e-05, + "loss": 1.8026, + "step": 2201 + }, + { + "epoch": 0.07418239829832492, + "grad_norm": 2.0864951610565186, + "learning_rate": 4.992687246588743e-05, + "loss": 1.7753, + "step": 2232 + }, + { + "epoch": 0.07521270938580164, + "grad_norm": 2.36157488822937, + "learning_rate": 4.992031299767347e-05, + "loss": 1.7746, + "step": 2263 + }, + { + "epoch": 0.07624302047327838, + "grad_norm": 2.5334439277648926, + "learning_rate": 4.9913472307114386e-05, + "loss": 1.7927, + "step": 2294 + }, + { + "epoch": 0.07727333156075512, + "grad_norm": 2.2565715312957764, + "learning_rate": 4.9906350471406446e-05, + "loss": 1.7668, + "step": 2325 + }, + { + "epoch": 0.07830364264823185, + "grad_norm": 2.1043128967285156, + "learning_rate": 4.989894757091861e-05, + "loss": 1.7771, + "step": 2356 + }, + { + "epoch": 0.07933395373570859, + "grad_norm": 1.9659819602966309, + "learning_rate": 4.989126368919158e-05, + "loss": 1.7666, + "step": 2387 + }, + { + "epoch": 0.08036426482318532, + "grad_norm": 2.0778403282165527, + "learning_rate": 4.988329891293693e-05, + "loss": 1.7405, + "step": 2418 + }, + { + "epoch": 0.08139457591066206, + "grad_norm": 2.1767923831939697, + "learning_rate": 4.987505333203608e-05, + "loss": 1.7495, + "step": 2449 + }, + { + "epoch": 0.0824248869981388, + "grad_norm": 2.260143280029297, + "learning_rate": 4.9866527039539276e-05, + "loss": 1.7504, + "step": 2480 + }, + { + "epoch": 0.08345519808561552, + "grad_norm": 2.18271803855896, + "learning_rate": 4.9857720131664594e-05, + "loss": 1.7456, + "step": 2511 + }, + { + "epoch": 0.08448550917309226, + "grad_norm": 2.209594964981079, + "learning_rate": 4.9848632707796773e-05, + "loss": 1.7528, + "step": 2542 + }, + { + "epoch": 0.085515820260569, + "grad_norm": 2.0666229724884033, + "learning_rate": 4.9839264870486155e-05, + "loss": 1.7517, + "step": 2573 + }, + { + "epoch": 0.08654613134804573, + "grad_norm": 2.1070454120635986, + "learning_rate": 4.9829616725447526e-05, + "loss": 1.7474, + "step": 2604 + }, + { + "epoch": 0.08757644243552247, + "grad_norm": 1.9430303573608398, + "learning_rate": 4.981968838155888e-05, + "loss": 1.7348, + "step": 2635 + }, + { + "epoch": 0.0886067535229992, + "grad_norm": 1.9638925790786743, + "learning_rate": 4.980947995086024e-05, + "loss": 1.7202, + "step": 2666 + }, + { + "epoch": 0.08963706461047594, + "grad_norm": 1.8845652341842651, + "learning_rate": 4.979899154855234e-05, + "loss": 1.7375, + "step": 2697 + }, + { + "epoch": 0.09066737569795268, + "grad_norm": 5.712058067321777, + "learning_rate": 4.9788223292995386e-05, + "loss": 1.7379, + "step": 2728 + }, + { + "epoch": 0.0916976867854294, + "grad_norm": 1.9520670175552368, + "learning_rate": 4.977717530570768e-05, + "loss": 1.7302, + "step": 2759 + }, + { + "epoch": 0.09272799787290614, + "grad_norm": 1.8802224397659302, + "learning_rate": 4.976584771136425e-05, + "loss": 1.74, + "step": 2790 + }, + { + "epoch": 0.09375830896038288, + "grad_norm": 2.1098153591156006, + "learning_rate": 4.975424063779547e-05, + "loss": 1.7024, + "step": 2821 + }, + { + "epoch": 0.09478862004785961, + "grad_norm": 2.1568291187286377, + "learning_rate": 4.974235421598557e-05, + "loss": 1.7131, + "step": 2852 + }, + { + "epoch": 0.09581893113533635, + "grad_norm": 1.8769980669021606, + "learning_rate": 4.973018858007122e-05, + "loss": 1.7008, + "step": 2883 + }, + { + "epoch": 0.09684924222281308, + "grad_norm": 1.8325533866882324, + "learning_rate": 4.9717743867339963e-05, + "loss": 1.7058, + "step": 2914 + }, + { + "epoch": 0.09787955331028982, + "grad_norm": 2.086416721343994, + "learning_rate": 4.9705020218228695e-05, + "loss": 1.711, + "step": 2945 + }, + { + "epoch": 0.09890986439776656, + "grad_norm": 1.8294793367385864, + "learning_rate": 4.969201777632205e-05, + "loss": 1.6998, + "step": 2976 + }, + { + "epoch": 0.09994017548524328, + "grad_norm": 2.0608153343200684, + "learning_rate": 4.9678736688350846e-05, + "loss": 1.6948, + "step": 3007 + }, + { + "epoch": 0.10097048657272002, + "grad_norm": 3.2166008949279785, + "learning_rate": 4.966517710419033e-05, + "loss": 1.6788, + "step": 3038 + }, + { + "epoch": 0.10200079766019676, + "grad_norm": 1.9431313276290894, + "learning_rate": 4.965133917685858e-05, + "loss": 1.7115, + "step": 3069 + }, + { + "epoch": 0.10303110874767349, + "grad_norm": 1.967512845993042, + "learning_rate": 4.9637223062514714e-05, + "loss": 1.7033, + "step": 3100 + }, + { + "epoch": 0.10406141983515023, + "grad_norm": 1.9253389835357666, + "learning_rate": 4.962282892045718e-05, + "loss": 1.6856, + "step": 3131 + }, + { + "epoch": 0.10509173092262696, + "grad_norm": 1.986840009689331, + "learning_rate": 4.9608156913121904e-05, + "loss": 1.723, + "step": 3162 + }, + { + "epoch": 0.1061220420101037, + "grad_norm": 1.83523690700531, + "learning_rate": 4.959320720608049e-05, + "loss": 1.6912, + "step": 3193 + }, + { + "epoch": 0.10715235309758044, + "grad_norm": 2.1271955966949463, + "learning_rate": 4.9577979968038354e-05, + "loss": 1.7032, + "step": 3224 + }, + { + "epoch": 0.10818266418505716, + "grad_norm": 1.8383768796920776, + "learning_rate": 4.956247537083282e-05, + "loss": 1.6726, + "step": 3255 + }, + { + "epoch": 0.1092129752725339, + "grad_norm": 1.8806651830673218, + "learning_rate": 4.9546693589431145e-05, + "loss": 1.6817, + "step": 3286 + }, + { + "epoch": 0.11024328636001064, + "grad_norm": 1.7535260915756226, + "learning_rate": 4.9530634801928595e-05, + "loss": 1.6875, + "step": 3317 + }, + { + "epoch": 0.11127359744748737, + "grad_norm": 1.765906810760498, + "learning_rate": 4.9514299189546395e-05, + "loss": 1.6859, + "step": 3348 + }, + { + "epoch": 0.11230390853496411, + "grad_norm": 1.869828462600708, + "learning_rate": 4.949768693662973e-05, + "loss": 1.6915, + "step": 3379 + }, + { + "epoch": 0.11333421962244083, + "grad_norm": 1.8347504138946533, + "learning_rate": 4.948079823064559e-05, + "loss": 1.6859, + "step": 3410 + }, + { + "epoch": 0.11436453070991758, + "grad_norm": 1.7692474126815796, + "learning_rate": 4.946363326218074e-05, + "loss": 1.6565, + "step": 3441 + }, + { + "epoch": 0.11539484179739432, + "grad_norm": 1.8231885433197021, + "learning_rate": 4.9446192224939525e-05, + "loss": 1.686, + "step": 3472 + }, + { + "epoch": 0.11642515288487104, + "grad_norm": 1.7155958414077759, + "learning_rate": 4.942847531574167e-05, + "loss": 1.6538, + "step": 3503 + }, + { + "epoch": 0.11745546397234778, + "grad_norm": 1.787183403968811, + "learning_rate": 4.941048273452008e-05, + "loss": 1.6776, + "step": 3534 + }, + { + "epoch": 0.11848577505982451, + "grad_norm": 1.741213083267212, + "learning_rate": 4.9392214684318605e-05, + "loss": 1.6784, + "step": 3565 + }, + { + "epoch": 0.11951608614730125, + "grad_norm": 1.7836824655532837, + "learning_rate": 4.93736713712897e-05, + "loss": 1.6557, + "step": 3596 + }, + { + "epoch": 0.12054639723477799, + "grad_norm": 1.7103859186172485, + "learning_rate": 4.9354853004692124e-05, + "loss": 1.6606, + "step": 3627 + }, + { + "epoch": 0.12157670832225471, + "grad_norm": 1.7865506410598755, + "learning_rate": 4.93357597968886e-05, + "loss": 1.6409, + "step": 3658 + }, + { + "epoch": 0.12260701940973145, + "grad_norm": 1.7770143747329712, + "learning_rate": 4.931639196334338e-05, + "loss": 1.6574, + "step": 3689 + }, + { + "epoch": 0.1236373304972082, + "grad_norm": 1.857575535774231, + "learning_rate": 4.9296749722619826e-05, + "loss": 1.6724, + "step": 3720 + }, + { + "epoch": 0.12466764158468492, + "grad_norm": 1.8742581605911255, + "learning_rate": 4.9276833296377966e-05, + "loss": 1.6506, + "step": 3751 + }, + { + "epoch": 0.12569795267216166, + "grad_norm": 1.827668309211731, + "learning_rate": 4.925664290937196e-05, + "loss": 1.6523, + "step": 3782 + }, + { + "epoch": 0.1267282637596384, + "grad_norm": 1.7517486810684204, + "learning_rate": 4.9236178789447576e-05, + "loss": 1.6459, + "step": 3813 + }, + { + "epoch": 0.12775857484711514, + "grad_norm": 1.8109570741653442, + "learning_rate": 4.921544116753962e-05, + "loss": 1.6614, + "step": 3844 + }, + { + "epoch": 0.12878888593459187, + "grad_norm": 1.692597508430481, + "learning_rate": 4.919443027766935e-05, + "loss": 1.6431, + "step": 3875 + }, + { + "epoch": 0.1298191970220686, + "grad_norm": 1.8650025129318237, + "learning_rate": 4.91731463569418e-05, + "loss": 1.6466, + "step": 3906 + }, + { + "epoch": 0.13084950810954532, + "grad_norm": 1.6794081926345825, + "learning_rate": 4.915158964554312e-05, + "loss": 1.6504, + "step": 3937 + }, + { + "epoch": 0.13187981919702207, + "grad_norm": 1.7685374021530151, + "learning_rate": 4.912976038673786e-05, + "loss": 1.6446, + "step": 3968 + }, + { + "epoch": 0.1329101302844988, + "grad_norm": 1.7601110935211182, + "learning_rate": 4.9107658826866254e-05, + "loss": 1.631, + "step": 3999 + }, + { + "epoch": 0.13394044137197553, + "grad_norm": 2.0616064071655273, + "learning_rate": 4.908528521534139e-05, + "loss": 1.6476, + "step": 4030 + }, + { + "epoch": 0.13497075245945228, + "grad_norm": 1.8973504304885864, + "learning_rate": 4.906263980464644e-05, + "loss": 1.6582, + "step": 4061 + }, + { + "epoch": 0.136001063546929, + "grad_norm": 1.7768895626068115, + "learning_rate": 4.903972285033178e-05, + "loss": 1.6159, + "step": 4092 + }, + { + "epoch": 0.13703137463440573, + "grad_norm": 1.8264424800872803, + "learning_rate": 4.901653461101213e-05, + "loss": 1.6289, + "step": 4123 + }, + { + "epoch": 0.1380616857218825, + "grad_norm": 1.7140119075775146, + "learning_rate": 4.8993075348363626e-05, + "loss": 1.6357, + "step": 4154 + }, + { + "epoch": 0.13909199680935921, + "grad_norm": 1.6964486837387085, + "learning_rate": 4.896934532712084e-05, + "loss": 1.6233, + "step": 4185 + }, + { + "epoch": 0.14012230789683594, + "grad_norm": 1.8008025884628296, + "learning_rate": 4.8945344815073846e-05, + "loss": 1.637, + "step": 4216 + }, + { + "epoch": 0.1411526189843127, + "grad_norm": 1.562730073928833, + "learning_rate": 4.892107408306516e-05, + "loss": 1.6379, + "step": 4247 + }, + { + "epoch": 0.14218293007178942, + "grad_norm": 1.8273371458053589, + "learning_rate": 4.889653340498669e-05, + "loss": 1.6246, + "step": 4278 + }, + { + "epoch": 0.14321324115926615, + "grad_norm": 56.33716583251953, + "learning_rate": 4.8871723057776664e-05, + "loss": 1.6457, + "step": 4309 + }, + { + "epoch": 0.1442435522467429, + "grad_norm": 1.746523380279541, + "learning_rate": 4.8846643321416476e-05, + "loss": 1.6343, + "step": 4340 + }, + { + "epoch": 0.14527386333421963, + "grad_norm": 1.7737531661987305, + "learning_rate": 4.882129447892753e-05, + "loss": 1.6447, + "step": 4371 + }, + { + "epoch": 0.14630417442169635, + "grad_norm": 1.660485863685608, + "learning_rate": 4.8795676816368076e-05, + "loss": 1.6192, + "step": 4402 + }, + { + "epoch": 0.14733448550917308, + "grad_norm": 1.6823406219482422, + "learning_rate": 4.876979062282995e-05, + "loss": 1.6253, + "step": 4433 + }, + { + "epoch": 0.14836479659664983, + "grad_norm": 7.78139066696167, + "learning_rate": 4.8743636190435325e-05, + "loss": 1.6234, + "step": 4464 + }, + { + "epoch": 0.14939510768412656, + "grad_norm": 1.7426058053970337, + "learning_rate": 4.871721381433344e-05, + "loss": 1.6337, + "step": 4495 + }, + { + "epoch": 0.1504254187716033, + "grad_norm": 1.6294783353805542, + "learning_rate": 4.869052379269719e-05, + "loss": 1.6217, + "step": 4526 + }, + { + "epoch": 0.15145572985908004, + "grad_norm": 1.6523306369781494, + "learning_rate": 4.866356642671985e-05, + "loss": 1.605, + "step": 4557 + }, + { + "epoch": 0.15248604094655677, + "grad_norm": 1.8571300506591797, + "learning_rate": 4.8636342020611634e-05, + "loss": 1.6218, + "step": 4588 + }, + { + "epoch": 0.1535163520340335, + "grad_norm": 1.7754936218261719, + "learning_rate": 4.860885088159626e-05, + "loss": 1.6171, + "step": 4619 + }, + { + "epoch": 0.15454666312151025, + "grad_norm": 1.91987943649292, + "learning_rate": 4.858109331990751e-05, + "loss": 1.6167, + "step": 4650 + }, + { + "epoch": 0.15557697420898697, + "grad_norm": 1.5994452238082886, + "learning_rate": 4.855306964878567e-05, + "loss": 1.5951, + "step": 4681 + }, + { + "epoch": 0.1566072852964637, + "grad_norm": 1.6490916013717651, + "learning_rate": 4.8524780184474084e-05, + "loss": 1.616, + "step": 4712 + }, + { + "epoch": 0.15763759638394045, + "grad_norm": 1.5921640396118164, + "learning_rate": 4.8496225246215496e-05, + "loss": 1.6346, + "step": 4743 + }, + { + "epoch": 0.15866790747141718, + "grad_norm": 1.6729261875152588, + "learning_rate": 4.8467405156248505e-05, + "loss": 1.6165, + "step": 4774 + }, + { + "epoch": 0.1596982185588939, + "grad_norm": 1.628113031387329, + "learning_rate": 4.843832023980392e-05, + "loss": 1.6119, + "step": 4805 + }, + { + "epoch": 0.16072852964637063, + "grad_norm": 1.651647925376892, + "learning_rate": 4.840897082510106e-05, + "loss": 1.5997, + "step": 4836 + }, + { + "epoch": 0.1617588407338474, + "grad_norm": 1.5297720432281494, + "learning_rate": 4.8379357243344084e-05, + "loss": 1.6242, + "step": 4867 + }, + { + "epoch": 0.1627891518213241, + "grad_norm": 1.5779869556427002, + "learning_rate": 4.8349479828718236e-05, + "loss": 1.6149, + "step": 4898 + }, + { + "epoch": 0.16381946290880084, + "grad_norm": 1.5843939781188965, + "learning_rate": 4.8319338918386075e-05, + "loss": 1.5926, + "step": 4929 + }, + { + "epoch": 0.1648497739962776, + "grad_norm": 2.3762106895446777, + "learning_rate": 4.828893485248369e-05, + "loss": 1.6108, + "step": 4960 + }, + { + "epoch": 0.16588008508375432, + "grad_norm": 1.5871953964233398, + "learning_rate": 4.825826797411682e-05, + "loss": 1.6103, + "step": 4991 + }, + { + "epoch": 0.16691039617123105, + "grad_norm": 1.5934125185012817, + "learning_rate": 4.822733862935702e-05, + "loss": 1.6091, + "step": 5022 + }, + { + "epoch": 0.1679407072587078, + "grad_norm": 1.6997628211975098, + "learning_rate": 4.819614716723775e-05, + "loss": 1.6098, + "step": 5053 + }, + { + "epoch": 0.16897101834618453, + "grad_norm": 1.682849645614624, + "learning_rate": 4.8164693939750425e-05, + "loss": 1.599, + "step": 5084 + }, + { + "epoch": 0.17000132943366125, + "grad_norm": 1.709743857383728, + "learning_rate": 4.813297930184042e-05, + "loss": 1.6194, + "step": 5115 + }, + { + "epoch": 0.171031640521138, + "grad_norm": 1.725879430770874, + "learning_rate": 4.810100361140314e-05, + "loss": 1.6115, + "step": 5146 + }, + { + "epoch": 0.17206195160861473, + "grad_norm": 1.6710290908813477, + "learning_rate": 4.8068767229279885e-05, + "loss": 1.6032, + "step": 5177 + }, + { + "epoch": 0.17309226269609146, + "grad_norm": 1.6156634092330933, + "learning_rate": 4.8036270519253854e-05, + "loss": 1.5973, + "step": 5208 + }, + { + "epoch": 0.1741225737835682, + "grad_norm": 1.5654059648513794, + "learning_rate": 4.8003513848046e-05, + "loss": 1.5817, + "step": 5239 + }, + { + "epoch": 0.17515288487104494, + "grad_norm": 1.5789822340011597, + "learning_rate": 4.79704975853109e-05, + "loss": 1.6138, + "step": 5270 + }, + { + "epoch": 0.17618319595852167, + "grad_norm": 1.6022037267684937, + "learning_rate": 4.793722210363262e-05, + "loss": 1.5998, + "step": 5301 + }, + { + "epoch": 0.1772135070459984, + "grad_norm": 1.5142741203308105, + "learning_rate": 4.7903687778520414e-05, + "loss": 1.6061, + "step": 5332 + }, + { + "epoch": 0.17824381813347515, + "grad_norm": 1.6454212665557861, + "learning_rate": 4.7869894988404593e-05, + "loss": 1.6063, + "step": 5363 + }, + { + "epoch": 0.17927412922095187, + "grad_norm": 1.5250823497772217, + "learning_rate": 4.783584411463221e-05, + "loss": 1.6038, + "step": 5394 + }, + { + "epoch": 0.1803044403084286, + "grad_norm": 1.5829335451126099, + "learning_rate": 4.780153554146274e-05, + "loss": 1.5949, + "step": 5425 + }, + { + "epoch": 0.18133475139590535, + "grad_norm": 1.5342432260513306, + "learning_rate": 4.7766969656063766e-05, + "loss": 1.5913, + "step": 5456 + }, + { + "epoch": 0.18236506248338208, + "grad_norm": 1.6397250890731812, + "learning_rate": 4.773214684850662e-05, + "loss": 1.6102, + "step": 5487 + }, + { + "epoch": 0.1833953735708588, + "grad_norm": 1.5228471755981445, + "learning_rate": 4.769706751176193e-05, + "loss": 1.5885, + "step": 5518 + }, + { + "epoch": 0.18442568465833556, + "grad_norm": 1.6186103820800781, + "learning_rate": 4.7661732041695264e-05, + "loss": 1.6086, + "step": 5549 + }, + { + "epoch": 0.18545599574581229, + "grad_norm": 1.6024582386016846, + "learning_rate": 4.762614083706258e-05, + "loss": 1.6004, + "step": 5580 + }, + { + "epoch": 0.186486306833289, + "grad_norm": 1.5443711280822754, + "learning_rate": 4.759029429950581e-05, + "loss": 1.6048, + "step": 5611 + }, + { + "epoch": 0.18751661792076577, + "grad_norm": 1.4831629991531372, + "learning_rate": 4.7554192833548235e-05, + "loss": 1.5841, + "step": 5642 + }, + { + "epoch": 0.1885469290082425, + "grad_norm": 1.6426068544387817, + "learning_rate": 4.751783684659e-05, + "loss": 1.587, + "step": 5673 + }, + { + "epoch": 0.18957724009571922, + "grad_norm": 1.4609078168869019, + "learning_rate": 4.748122674890348e-05, + "loss": 1.5945, + "step": 5704 + }, + { + "epoch": 0.19060755118319597, + "grad_norm": 1.5365614891052246, + "learning_rate": 4.7444362953628654e-05, + "loss": 1.5737, + "step": 5735 + }, + { + "epoch": 0.1916378622706727, + "grad_norm": 1.5755670070648193, + "learning_rate": 4.7407245876768424e-05, + "loss": 1.5862, + "step": 5766 + }, + { + "epoch": 0.19266817335814942, + "grad_norm": 1.6469846963882446, + "learning_rate": 4.736987593718397e-05, + "loss": 1.5663, + "step": 5797 + }, + { + "epoch": 0.19369848444562615, + "grad_norm": 1.5927278995513916, + "learning_rate": 4.733225355658999e-05, + "loss": 1.5776, + "step": 5828 + }, + { + "epoch": 0.1947287955331029, + "grad_norm": 1.5593287944793701, + "learning_rate": 4.7294379159549926e-05, + "loss": 1.579, + "step": 5859 + }, + { + "epoch": 0.19575910662057963, + "grad_norm": 1.534055233001709, + "learning_rate": 4.725625317347119e-05, + "loss": 1.6017, + "step": 5890 + }, + { + "epoch": 0.19678941770805636, + "grad_norm": 1.5846387147903442, + "learning_rate": 4.7217876028600374e-05, + "loss": 1.5739, + "step": 5921 + }, + { + "epoch": 0.1978197287955331, + "grad_norm": 1.5377682447433472, + "learning_rate": 4.717924815801832e-05, + "loss": 1.57, + "step": 5952 + }, + { + "epoch": 0.19885003988300984, + "grad_norm": 1.467956781387329, + "learning_rate": 4.714036999763532e-05, + "loss": 1.5736, + "step": 5983 + }, + { + "epoch": 0.19988035097048656, + "grad_norm": 1.601070523262024, + "learning_rate": 4.7101241986186116e-05, + "loss": 1.5861, + "step": 6014 + }, + { + "epoch": 0.20091066205796332, + "grad_norm": 1.5051921606063843, + "learning_rate": 4.7061864565225e-05, + "loss": 1.5735, + "step": 6045 + }, + { + "epoch": 0.20194097314544004, + "grad_norm": 1.462843418121338, + "learning_rate": 4.702223817912081e-05, + "loss": 1.582, + "step": 6076 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.527890108044542e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6104/training_args.bin b/checkpoint-6104/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..974208468b82a3c5684aaa384776477cf21c18ca --- /dev/null +++ b/checkpoint-6104/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a23be0ff07d6d3142f7c0980f91dddba845519c24fcb411cbb4b9ddb1513ff +size 5304 diff --git a/checkpoint-9156/config.json b/checkpoint-9156/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aaa74176892d42e1c7f5979b7ddf8ab15985d3 --- /dev/null +++ b/checkpoint-9156/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "/mnt/parscratch/users/acp23ay/private/models/Llama-3.1-8B-Instruct-ta-madlad-mean/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 138256 +} diff --git a/checkpoint-9156/generation_config.json b/checkpoint-9156/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-9156/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-9156/model-00001-of-00007.safetensors b/checkpoint-9156/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9ad36e83f19ceadc99a3c66feadd5e507328850a --- /dev/null +++ b/checkpoint-9156/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20383d672b8d86abd7410b7b9191b1106039aeba71474d4a480a75fd67e09008 +size 4983197184 diff --git a/checkpoint-9156/model-00002-of-00007.safetensors b/checkpoint-9156/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d32d83cb96a109af411b9cf577e7fbfe07ea76fc --- /dev/null +++ b/checkpoint-9156/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168629c67732309d49f60a5ec48a6d160212bc987365e82e183dfbf74ba0c1f3 +size 4899116432 diff --git a/checkpoint-9156/model-00003-of-00007.safetensors b/checkpoint-9156/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-9156/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-9156/model-00004-of-00007.safetensors b/checkpoint-9156/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-9156/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-9156/model-00005-of-00007.safetensors b/checkpoint-9156/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-9156/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-9156/model-00006-of-00007.safetensors b/checkpoint-9156/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..298a224b2d1680237a90af675e45635d6a19bb3f --- /dev/null +++ b/checkpoint-9156/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:479dc777226ef049a224892cb770e1dd8f44e0824c24bf8b109829875238fb77 +size 4999813120 diff --git a/checkpoint-9156/model-00007-of-00007.safetensors b/checkpoint-9156/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d0dfe6d9c4b8393a06cdcd42230ee5979fafb4c0 --- /dev/null +++ b/checkpoint-9156/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:792345c45b7df6d0b78fa8853045e8f9494d6ed4f0afbcc09192330a98c4dcb4 +size 2734998184 diff --git a/checkpoint-9156/model.safetensors.index.json b/checkpoint-9156/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..318803c6a3dd771c7f7c3b8038a896af7c8322ae --- /dev/null +++ b/checkpoint-9156/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32448724992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-9156/optimizer.pt b/checkpoint-9156/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..59062dcd8ac715505d5c4d947fa590d5c39013b3 --- /dev/null +++ b/checkpoint-9156/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ed06ff2d1bec3bf400b1817b0ecd8c973b7088b77cd26f54a03c9221720e5ca +size 16040396334 diff --git a/checkpoint-9156/rng_state.pth b/checkpoint-9156/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-9156/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-9156/scheduler.pt b/checkpoint-9156/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5456a295e7e9e24785bebf5e96ccb62dbbac4f62 --- /dev/null +++ b/checkpoint-9156/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c3f410c61b11096714461ebc2a4aa1b4573d0d0c3eb997bda14fafb34cdc922 +size 1064 diff --git a/checkpoint-9156/trainer_state.json b/checkpoint-9156/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c55865d5a74390282449860ff6cf6a782d8f0e61 --- /dev/null +++ b/checkpoint-9156/trainer_state.json @@ -0,0 +1,2098 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3043073650624834, + "eval_steps": 500, + "global_step": 9156, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001030311087476735, + "grad_norm": 60.25547409057617, + "learning_rate": 1.0157273918741808e-06, + "loss": 8.8455, + "step": 31 + }, + { + "epoch": 0.00206062217495347, + "grad_norm": 15.669363975524902, + "learning_rate": 2.0314547837483616e-06, + "loss": 7.1553, + "step": 62 + }, + { + "epoch": 0.003090933262430205, + "grad_norm": 15.366345405578613, + "learning_rate": 3.0471821756225426e-06, + "loss": 5.8784, + "step": 93 + }, + { + "epoch": 0.00412124434990694, + "grad_norm": 36.30561828613281, + "learning_rate": 4.062909567496723e-06, + "loss": 4.7708, + "step": 124 + }, + { + "epoch": 0.005151555437383675, + "grad_norm": 27.202678680419922, + "learning_rate": 5.078636959370905e-06, + "loss": 4.1629, + "step": 155 + }, + { + "epoch": 0.00618186652486041, + "grad_norm": 24.30484962463379, + "learning_rate": 6.094364351245085e-06, + "loss": 3.867, + "step": 186 + }, + { + "epoch": 0.007212177612337145, + "grad_norm": 19.916366577148438, + "learning_rate": 7.110091743119267e-06, + "loss": 3.6131, + "step": 217 + }, + { + "epoch": 0.00824248869981388, + "grad_norm": 17.577274322509766, + "learning_rate": 8.125819134993446e-06, + "loss": 3.4772, + "step": 248 + }, + { + "epoch": 0.009272799787290615, + "grad_norm": 12.133190155029297, + "learning_rate": 9.141546526867629e-06, + "loss": 3.3218, + "step": 279 + }, + { + "epoch": 0.01030311087476735, + "grad_norm": 19.79263687133789, + "learning_rate": 1.015727391874181e-05, + "loss": 3.2055, + "step": 310 + }, + { + "epoch": 0.011333421962244085, + "grad_norm": 16.38133430480957, + "learning_rate": 1.117300131061599e-05, + "loss": 3.1062, + "step": 341 + }, + { + "epoch": 0.01236373304972082, + "grad_norm": 12.638299942016602, + "learning_rate": 1.218872870249017e-05, + "loss": 3.0106, + "step": 372 + }, + { + "epoch": 0.013394044137197554, + "grad_norm": 9.46596908569336, + "learning_rate": 1.3204456094364351e-05, + "loss": 2.924, + "step": 403 + }, + { + "epoch": 0.01442435522467429, + "grad_norm": 10.945392608642578, + "learning_rate": 1.4220183486238533e-05, + "loss": 2.844, + "step": 434 + }, + { + "epoch": 0.015454666312151024, + "grad_norm": 8.474015235900879, + "learning_rate": 1.5235910878112714e-05, + "loss": 2.7892, + "step": 465 + }, + { + "epoch": 0.01648497739962776, + "grad_norm": 9.370804786682129, + "learning_rate": 1.6251638269986893e-05, + "loss": 2.7509, + "step": 496 + }, + { + "epoch": 0.017515288487104493, + "grad_norm": 11.63398551940918, + "learning_rate": 1.7267365661861077e-05, + "loss": 2.6999, + "step": 527 + }, + { + "epoch": 0.01854559957458123, + "grad_norm": 9.17713451385498, + "learning_rate": 1.8283093053735257e-05, + "loss": 2.6459, + "step": 558 + }, + { + "epoch": 0.019575910662057962, + "grad_norm": 7.119054794311523, + "learning_rate": 1.9298820445609438e-05, + "loss": 2.603, + "step": 589 + }, + { + "epoch": 0.0206062217495347, + "grad_norm": 6.653646945953369, + "learning_rate": 2.031454783748362e-05, + "loss": 2.5588, + "step": 620 + }, + { + "epoch": 0.021636532837011432, + "grad_norm": 8.332653045654297, + "learning_rate": 2.13302752293578e-05, + "loss": 2.5357, + "step": 651 + }, + { + "epoch": 0.02266684392448817, + "grad_norm": 6.4949116706848145, + "learning_rate": 2.234600262123198e-05, + "loss": 2.4967, + "step": 682 + }, + { + "epoch": 0.023697155011964902, + "grad_norm": 9.41009521484375, + "learning_rate": 2.336173001310616e-05, + "loss": 2.4563, + "step": 713 + }, + { + "epoch": 0.02472746609944164, + "grad_norm": 7.840345859527588, + "learning_rate": 2.437745740498034e-05, + "loss": 2.4383, + "step": 744 + }, + { + "epoch": 0.025757777186918372, + "grad_norm": 6.116458415985107, + "learning_rate": 2.5393184796854525e-05, + "loss": 2.3817, + "step": 775 + }, + { + "epoch": 0.02678808827439511, + "grad_norm": 5.938300609588623, + "learning_rate": 2.6408912188728702e-05, + "loss": 2.3508, + "step": 806 + }, + { + "epoch": 0.027818399361871842, + "grad_norm": 5.4408345222473145, + "learning_rate": 2.7424639580602886e-05, + "loss": 2.3325, + "step": 837 + }, + { + "epoch": 0.02884871044934858, + "grad_norm": 5.375136375427246, + "learning_rate": 2.8440366972477066e-05, + "loss": 2.3101, + "step": 868 + }, + { + "epoch": 0.029879021536825312, + "grad_norm": 5.149726867675781, + "learning_rate": 2.9456094364351244e-05, + "loss": 2.282, + "step": 899 + }, + { + "epoch": 0.03090933262430205, + "grad_norm": 4.591221332550049, + "learning_rate": 3.0471821756225428e-05, + "loss": 2.2427, + "step": 930 + }, + { + "epoch": 0.031939643711778785, + "grad_norm": 4.977034091949463, + "learning_rate": 3.148754914809961e-05, + "loss": 2.2218, + "step": 961 + }, + { + "epoch": 0.03296995479925552, + "grad_norm": 5.038781642913818, + "learning_rate": 3.2503276539973785e-05, + "loss": 2.2044, + "step": 992 + }, + { + "epoch": 0.03400026588673225, + "grad_norm": 4.872281551361084, + "learning_rate": 3.351900393184797e-05, + "loss": 2.1657, + "step": 1023 + }, + { + "epoch": 0.035030576974208985, + "grad_norm": 4.370841979980469, + "learning_rate": 3.453473132372215e-05, + "loss": 2.1365, + "step": 1054 + }, + { + "epoch": 0.036060888061685725, + "grad_norm": 4.087072849273682, + "learning_rate": 3.555045871559633e-05, + "loss": 2.1253, + "step": 1085 + }, + { + "epoch": 0.03709119914916246, + "grad_norm": 4.113957405090332, + "learning_rate": 3.6566186107470514e-05, + "loss": 2.0973, + "step": 1116 + }, + { + "epoch": 0.03812151023663919, + "grad_norm": 4.0119733810424805, + "learning_rate": 3.7581913499344695e-05, + "loss": 2.1024, + "step": 1147 + }, + { + "epoch": 0.039151821324115925, + "grad_norm": 4.247573375701904, + "learning_rate": 3.8597640891218876e-05, + "loss": 2.0722, + "step": 1178 + }, + { + "epoch": 0.04018213241159266, + "grad_norm": 3.5575129985809326, + "learning_rate": 3.9613368283093056e-05, + "loss": 2.056, + "step": 1209 + }, + { + "epoch": 0.0412124434990694, + "grad_norm": 3.8885862827301025, + "learning_rate": 4.062909567496724e-05, + "loss": 2.0389, + "step": 1240 + }, + { + "epoch": 0.04224275458654613, + "grad_norm": 3.680628538131714, + "learning_rate": 4.164482306684142e-05, + "loss": 2.0385, + "step": 1271 + }, + { + "epoch": 0.043273065674022865, + "grad_norm": 3.780876874923706, + "learning_rate": 4.26605504587156e-05, + "loss": 2.0097, + "step": 1302 + }, + { + "epoch": 0.0443033767614996, + "grad_norm": 4.235328674316406, + "learning_rate": 4.367627785058978e-05, + "loss": 2.0024, + "step": 1333 + }, + { + "epoch": 0.04533368784897634, + "grad_norm": 3.326941967010498, + "learning_rate": 4.469200524246396e-05, + "loss": 1.9953, + "step": 1364 + }, + { + "epoch": 0.04636399893645307, + "grad_norm": 3.28456449508667, + "learning_rate": 4.570773263433814e-05, + "loss": 1.9579, + "step": 1395 + }, + { + "epoch": 0.047394310023929805, + "grad_norm": 16.107433319091797, + "learning_rate": 4.672346002621232e-05, + "loss": 1.9701, + "step": 1426 + }, + { + "epoch": 0.04842462111140654, + "grad_norm": 3.5708224773406982, + "learning_rate": 4.77391874180865e-05, + "loss": 1.9621, + "step": 1457 + }, + { + "epoch": 0.04945493219888328, + "grad_norm": 2.9053499698638916, + "learning_rate": 4.875491480996068e-05, + "loss": 1.9458, + "step": 1488 + }, + { + "epoch": 0.05048524328636001, + "grad_norm": 3.0863258838653564, + "learning_rate": 4.977064220183487e-05, + "loss": 1.9483, + "step": 1519 + }, + { + "epoch": 0.051515554373836744, + "grad_norm": 2.9012269973754883, + "learning_rate": 4.9999915451558777e-05, + "loss": 1.928, + "step": 1550 + }, + { + "epoch": 0.05254586546131348, + "grad_norm": 3.0949041843414307, + "learning_rate": 4.999955597496219e-05, + "loss": 1.9229, + "step": 1581 + }, + { + "epoch": 0.05357617654879022, + "grad_norm": 2.8687901496887207, + "learning_rate": 4.9998914381774255e-05, + "loss": 1.915, + "step": 1612 + }, + { + "epoch": 0.05460648763626695, + "grad_norm": 3.2136878967285156, + "learning_rate": 4.999799067923527e-05, + "loss": 1.9197, + "step": 1643 + }, + { + "epoch": 0.055636798723743684, + "grad_norm": 2.590843677520752, + "learning_rate": 4.999678487776908e-05, + "loss": 1.8756, + "step": 1674 + }, + { + "epoch": 0.05666710981122042, + "grad_norm": 2.64634108543396, + "learning_rate": 4.9995296990983006e-05, + "loss": 1.9033, + "step": 1705 + }, + { + "epoch": 0.05769742089869716, + "grad_norm": 3.0151331424713135, + "learning_rate": 4.999352703566763e-05, + "loss": 1.8883, + "step": 1736 + }, + { + "epoch": 0.05872773198617389, + "grad_norm": 2.526806354522705, + "learning_rate": 4.999147503179668e-05, + "loss": 1.8666, + "step": 1767 + }, + { + "epoch": 0.059758043073650624, + "grad_norm": 2.510300397872925, + "learning_rate": 4.998914100252672e-05, + "loss": 1.854, + "step": 1798 + }, + { + "epoch": 0.06078835416112736, + "grad_norm": 2.4867682456970215, + "learning_rate": 4.998652497419696e-05, + "loss": 1.8548, + "step": 1829 + }, + { + "epoch": 0.0618186652486041, + "grad_norm": 2.3920586109161377, + "learning_rate": 4.9983626976328927e-05, + "loss": 1.8495, + "step": 1860 + }, + { + "epoch": 0.06284897633608083, + "grad_norm": 2.714177370071411, + "learning_rate": 4.998044704162613e-05, + "loss": 1.8433, + "step": 1891 + }, + { + "epoch": 0.06387928742355757, + "grad_norm": 2.3094465732574463, + "learning_rate": 4.9976985205973705e-05, + "loss": 1.8382, + "step": 1922 + }, + { + "epoch": 0.0649095985110343, + "grad_norm": 2.47184419631958, + "learning_rate": 4.997324150843799e-05, + "loss": 1.8464, + "step": 1953 + }, + { + "epoch": 0.06593990959851104, + "grad_norm": 2.391841411590576, + "learning_rate": 4.99692159912661e-05, + "loss": 1.8179, + "step": 1984 + }, + { + "epoch": 0.06697022068598776, + "grad_norm": 2.2471864223480225, + "learning_rate": 4.996490869988546e-05, + "loss": 1.8149, + "step": 2015 + }, + { + "epoch": 0.0680005317734645, + "grad_norm": 2.5497376918792725, + "learning_rate": 4.996031968290326e-05, + "loss": 1.8099, + "step": 2046 + }, + { + "epoch": 0.06903084286094124, + "grad_norm": 2.330463409423828, + "learning_rate": 4.995544899210594e-05, + "loss": 1.8267, + "step": 2077 + }, + { + "epoch": 0.07006115394841797, + "grad_norm": 2.3259341716766357, + "learning_rate": 4.9950296682458583e-05, + "loss": 1.7801, + "step": 2108 + }, + { + "epoch": 0.07109146503589471, + "grad_norm": 2.1711952686309814, + "learning_rate": 4.994486281210429e-05, + "loss": 1.7961, + "step": 2139 + }, + { + "epoch": 0.07212177612337145, + "grad_norm": 2.1808884143829346, + "learning_rate": 4.9939147442363566e-05, + "loss": 1.8109, + "step": 2170 + }, + { + "epoch": 0.07315208721084818, + "grad_norm": 2.089256525039673, + "learning_rate": 4.9933150637733574e-05, + "loss": 1.8026, + "step": 2201 + }, + { + "epoch": 0.07418239829832492, + "grad_norm": 2.0864951610565186, + "learning_rate": 4.992687246588743e-05, + "loss": 1.7753, + "step": 2232 + }, + { + "epoch": 0.07521270938580164, + "grad_norm": 2.36157488822937, + "learning_rate": 4.992031299767347e-05, + "loss": 1.7746, + "step": 2263 + }, + { + "epoch": 0.07624302047327838, + "grad_norm": 2.5334439277648926, + "learning_rate": 4.9913472307114386e-05, + "loss": 1.7927, + "step": 2294 + }, + { + "epoch": 0.07727333156075512, + "grad_norm": 2.2565715312957764, + "learning_rate": 4.9906350471406446e-05, + "loss": 1.7668, + "step": 2325 + }, + { + "epoch": 0.07830364264823185, + "grad_norm": 2.1043128967285156, + "learning_rate": 4.989894757091861e-05, + "loss": 1.7771, + "step": 2356 + }, + { + "epoch": 0.07933395373570859, + "grad_norm": 1.9659819602966309, + "learning_rate": 4.989126368919158e-05, + "loss": 1.7666, + "step": 2387 + }, + { + "epoch": 0.08036426482318532, + "grad_norm": 2.0778403282165527, + "learning_rate": 4.988329891293693e-05, + "loss": 1.7405, + "step": 2418 + }, + { + "epoch": 0.08139457591066206, + "grad_norm": 2.1767923831939697, + "learning_rate": 4.987505333203608e-05, + "loss": 1.7495, + "step": 2449 + }, + { + "epoch": 0.0824248869981388, + "grad_norm": 2.260143280029297, + "learning_rate": 4.9866527039539276e-05, + "loss": 1.7504, + "step": 2480 + }, + { + "epoch": 0.08345519808561552, + "grad_norm": 2.18271803855896, + "learning_rate": 4.9857720131664594e-05, + "loss": 1.7456, + "step": 2511 + }, + { + "epoch": 0.08448550917309226, + "grad_norm": 2.209594964981079, + "learning_rate": 4.9848632707796773e-05, + "loss": 1.7528, + "step": 2542 + }, + { + "epoch": 0.085515820260569, + "grad_norm": 2.0666229724884033, + "learning_rate": 4.9839264870486155e-05, + "loss": 1.7517, + "step": 2573 + }, + { + "epoch": 0.08654613134804573, + "grad_norm": 2.1070454120635986, + "learning_rate": 4.9829616725447526e-05, + "loss": 1.7474, + "step": 2604 + }, + { + "epoch": 0.08757644243552247, + "grad_norm": 1.9430303573608398, + "learning_rate": 4.981968838155888e-05, + "loss": 1.7348, + "step": 2635 + }, + { + "epoch": 0.0886067535229992, + "grad_norm": 1.9638925790786743, + "learning_rate": 4.980947995086024e-05, + "loss": 1.7202, + "step": 2666 + }, + { + "epoch": 0.08963706461047594, + "grad_norm": 1.8845652341842651, + "learning_rate": 4.979899154855234e-05, + "loss": 1.7375, + "step": 2697 + }, + { + "epoch": 0.09066737569795268, + "grad_norm": 5.712058067321777, + "learning_rate": 4.9788223292995386e-05, + "loss": 1.7379, + "step": 2728 + }, + { + "epoch": 0.0916976867854294, + "grad_norm": 1.9520670175552368, + "learning_rate": 4.977717530570768e-05, + "loss": 1.7302, + "step": 2759 + }, + { + "epoch": 0.09272799787290614, + "grad_norm": 1.8802224397659302, + "learning_rate": 4.976584771136425e-05, + "loss": 1.74, + "step": 2790 + }, + { + "epoch": 0.09375830896038288, + "grad_norm": 2.1098153591156006, + "learning_rate": 4.975424063779547e-05, + "loss": 1.7024, + "step": 2821 + }, + { + "epoch": 0.09478862004785961, + "grad_norm": 2.1568291187286377, + "learning_rate": 4.974235421598557e-05, + "loss": 1.7131, + "step": 2852 + }, + { + "epoch": 0.09581893113533635, + "grad_norm": 1.8769980669021606, + "learning_rate": 4.973018858007122e-05, + "loss": 1.7008, + "step": 2883 + }, + { + "epoch": 0.09684924222281308, + "grad_norm": 1.8325533866882324, + "learning_rate": 4.9717743867339963e-05, + "loss": 1.7058, + "step": 2914 + }, + { + "epoch": 0.09787955331028982, + "grad_norm": 2.086416721343994, + "learning_rate": 4.9705020218228695e-05, + "loss": 1.711, + "step": 2945 + }, + { + "epoch": 0.09890986439776656, + "grad_norm": 1.8294793367385864, + "learning_rate": 4.969201777632205e-05, + "loss": 1.6998, + "step": 2976 + }, + { + "epoch": 0.09994017548524328, + "grad_norm": 2.0608153343200684, + "learning_rate": 4.9678736688350846e-05, + "loss": 1.6948, + "step": 3007 + }, + { + "epoch": 0.10097048657272002, + "grad_norm": 3.2166008949279785, + "learning_rate": 4.966517710419033e-05, + "loss": 1.6788, + "step": 3038 + }, + { + "epoch": 0.10200079766019676, + "grad_norm": 1.9431313276290894, + "learning_rate": 4.965133917685858e-05, + "loss": 1.7115, + "step": 3069 + }, + { + "epoch": 0.10303110874767349, + "grad_norm": 1.967512845993042, + "learning_rate": 4.9637223062514714e-05, + "loss": 1.7033, + "step": 3100 + }, + { + "epoch": 0.10406141983515023, + "grad_norm": 1.9253389835357666, + "learning_rate": 4.962282892045718e-05, + "loss": 1.6856, + "step": 3131 + }, + { + "epoch": 0.10509173092262696, + "grad_norm": 1.986840009689331, + "learning_rate": 4.9608156913121904e-05, + "loss": 1.723, + "step": 3162 + }, + { + "epoch": 0.1061220420101037, + "grad_norm": 1.83523690700531, + "learning_rate": 4.959320720608049e-05, + "loss": 1.6912, + "step": 3193 + }, + { + "epoch": 0.10715235309758044, + "grad_norm": 2.1271955966949463, + "learning_rate": 4.9577979968038354e-05, + "loss": 1.7032, + "step": 3224 + }, + { + "epoch": 0.10818266418505716, + "grad_norm": 1.8383768796920776, + "learning_rate": 4.956247537083282e-05, + "loss": 1.6726, + "step": 3255 + }, + { + "epoch": 0.1092129752725339, + "grad_norm": 1.8806651830673218, + "learning_rate": 4.9546693589431145e-05, + "loss": 1.6817, + "step": 3286 + }, + { + "epoch": 0.11024328636001064, + "grad_norm": 1.7535260915756226, + "learning_rate": 4.9530634801928595e-05, + "loss": 1.6875, + "step": 3317 + }, + { + "epoch": 0.11127359744748737, + "grad_norm": 1.765906810760498, + "learning_rate": 4.9514299189546395e-05, + "loss": 1.6859, + "step": 3348 + }, + { + "epoch": 0.11230390853496411, + "grad_norm": 1.869828462600708, + "learning_rate": 4.949768693662973e-05, + "loss": 1.6915, + "step": 3379 + }, + { + "epoch": 0.11333421962244083, + "grad_norm": 1.8347504138946533, + "learning_rate": 4.948079823064559e-05, + "loss": 1.6859, + "step": 3410 + }, + { + "epoch": 0.11436453070991758, + "grad_norm": 1.7692474126815796, + "learning_rate": 4.946363326218074e-05, + "loss": 1.6565, + "step": 3441 + }, + { + "epoch": 0.11539484179739432, + "grad_norm": 1.8231885433197021, + "learning_rate": 4.9446192224939525e-05, + "loss": 1.686, + "step": 3472 + }, + { + "epoch": 0.11642515288487104, + "grad_norm": 1.7155958414077759, + "learning_rate": 4.942847531574167e-05, + "loss": 1.6538, + "step": 3503 + }, + { + "epoch": 0.11745546397234778, + "grad_norm": 1.787183403968811, + "learning_rate": 4.941048273452008e-05, + "loss": 1.6776, + "step": 3534 + }, + { + "epoch": 0.11848577505982451, + "grad_norm": 1.741213083267212, + "learning_rate": 4.9392214684318605e-05, + "loss": 1.6784, + "step": 3565 + }, + { + "epoch": 0.11951608614730125, + "grad_norm": 1.7836824655532837, + "learning_rate": 4.93736713712897e-05, + "loss": 1.6557, + "step": 3596 + }, + { + "epoch": 0.12054639723477799, + "grad_norm": 1.7103859186172485, + "learning_rate": 4.9354853004692124e-05, + "loss": 1.6606, + "step": 3627 + }, + { + "epoch": 0.12157670832225471, + "grad_norm": 1.7865506410598755, + "learning_rate": 4.93357597968886e-05, + "loss": 1.6409, + "step": 3658 + }, + { + "epoch": 0.12260701940973145, + "grad_norm": 1.7770143747329712, + "learning_rate": 4.931639196334338e-05, + "loss": 1.6574, + "step": 3689 + }, + { + "epoch": 0.1236373304972082, + "grad_norm": 1.857575535774231, + "learning_rate": 4.9296749722619826e-05, + "loss": 1.6724, + "step": 3720 + }, + { + "epoch": 0.12466764158468492, + "grad_norm": 1.8742581605911255, + "learning_rate": 4.9276833296377966e-05, + "loss": 1.6506, + "step": 3751 + }, + { + "epoch": 0.12569795267216166, + "grad_norm": 1.827668309211731, + "learning_rate": 4.925664290937196e-05, + "loss": 1.6523, + "step": 3782 + }, + { + "epoch": 0.1267282637596384, + "grad_norm": 1.7517486810684204, + "learning_rate": 4.9236178789447576e-05, + "loss": 1.6459, + "step": 3813 + }, + { + "epoch": 0.12775857484711514, + "grad_norm": 1.8109570741653442, + "learning_rate": 4.921544116753962e-05, + "loss": 1.6614, + "step": 3844 + }, + { + "epoch": 0.12878888593459187, + "grad_norm": 1.692597508430481, + "learning_rate": 4.919443027766935e-05, + "loss": 1.6431, + "step": 3875 + }, + { + "epoch": 0.1298191970220686, + "grad_norm": 1.8650025129318237, + "learning_rate": 4.91731463569418e-05, + "loss": 1.6466, + "step": 3906 + }, + { + "epoch": 0.13084950810954532, + "grad_norm": 1.6794081926345825, + "learning_rate": 4.915158964554312e-05, + "loss": 1.6504, + "step": 3937 + }, + { + "epoch": 0.13187981919702207, + "grad_norm": 1.7685374021530151, + "learning_rate": 4.912976038673786e-05, + "loss": 1.6446, + "step": 3968 + }, + { + "epoch": 0.1329101302844988, + "grad_norm": 1.7601110935211182, + "learning_rate": 4.9107658826866254e-05, + "loss": 1.631, + "step": 3999 + }, + { + "epoch": 0.13394044137197553, + "grad_norm": 2.0616064071655273, + "learning_rate": 4.908528521534139e-05, + "loss": 1.6476, + "step": 4030 + }, + { + "epoch": 0.13497075245945228, + "grad_norm": 1.8973504304885864, + "learning_rate": 4.906263980464644e-05, + "loss": 1.6582, + "step": 4061 + }, + { + "epoch": 0.136001063546929, + "grad_norm": 1.7768895626068115, + "learning_rate": 4.903972285033178e-05, + "loss": 1.6159, + "step": 4092 + }, + { + "epoch": 0.13703137463440573, + "grad_norm": 1.8264424800872803, + "learning_rate": 4.901653461101213e-05, + "loss": 1.6289, + "step": 4123 + }, + { + "epoch": 0.1380616857218825, + "grad_norm": 1.7140119075775146, + "learning_rate": 4.8993075348363626e-05, + "loss": 1.6357, + "step": 4154 + }, + { + "epoch": 0.13909199680935921, + "grad_norm": 1.6964486837387085, + "learning_rate": 4.896934532712084e-05, + "loss": 1.6233, + "step": 4185 + }, + { + "epoch": 0.14012230789683594, + "grad_norm": 1.8008025884628296, + "learning_rate": 4.8945344815073846e-05, + "loss": 1.637, + "step": 4216 + }, + { + "epoch": 0.1411526189843127, + "grad_norm": 1.562730073928833, + "learning_rate": 4.892107408306516e-05, + "loss": 1.6379, + "step": 4247 + }, + { + "epoch": 0.14218293007178942, + "grad_norm": 1.8273371458053589, + "learning_rate": 4.889653340498669e-05, + "loss": 1.6246, + "step": 4278 + }, + { + "epoch": 0.14321324115926615, + "grad_norm": 56.33716583251953, + "learning_rate": 4.8871723057776664e-05, + "loss": 1.6457, + "step": 4309 + }, + { + "epoch": 0.1442435522467429, + "grad_norm": 1.746523380279541, + "learning_rate": 4.8846643321416476e-05, + "loss": 1.6343, + "step": 4340 + }, + { + "epoch": 0.14527386333421963, + "grad_norm": 1.7737531661987305, + "learning_rate": 4.882129447892753e-05, + "loss": 1.6447, + "step": 4371 + }, + { + "epoch": 0.14630417442169635, + "grad_norm": 1.660485863685608, + "learning_rate": 4.8795676816368076e-05, + "loss": 1.6192, + "step": 4402 + }, + { + "epoch": 0.14733448550917308, + "grad_norm": 1.6823406219482422, + "learning_rate": 4.876979062282995e-05, + "loss": 1.6253, + "step": 4433 + }, + { + "epoch": 0.14836479659664983, + "grad_norm": 7.78139066696167, + "learning_rate": 4.8743636190435325e-05, + "loss": 1.6234, + "step": 4464 + }, + { + "epoch": 0.14939510768412656, + "grad_norm": 1.7426058053970337, + "learning_rate": 4.871721381433344e-05, + "loss": 1.6337, + "step": 4495 + }, + { + "epoch": 0.1504254187716033, + "grad_norm": 1.6294783353805542, + "learning_rate": 4.869052379269719e-05, + "loss": 1.6217, + "step": 4526 + }, + { + "epoch": 0.15145572985908004, + "grad_norm": 1.6523306369781494, + "learning_rate": 4.866356642671985e-05, + "loss": 1.605, + "step": 4557 + }, + { + "epoch": 0.15248604094655677, + "grad_norm": 1.8571300506591797, + "learning_rate": 4.8636342020611634e-05, + "loss": 1.6218, + "step": 4588 + }, + { + "epoch": 0.1535163520340335, + "grad_norm": 1.7754936218261719, + "learning_rate": 4.860885088159626e-05, + "loss": 1.6171, + "step": 4619 + }, + { + "epoch": 0.15454666312151025, + "grad_norm": 1.91987943649292, + "learning_rate": 4.858109331990751e-05, + "loss": 1.6167, + "step": 4650 + }, + { + "epoch": 0.15557697420898697, + "grad_norm": 1.5994452238082886, + "learning_rate": 4.855306964878567e-05, + "loss": 1.5951, + "step": 4681 + }, + { + "epoch": 0.1566072852964637, + "grad_norm": 1.6490916013717651, + "learning_rate": 4.8524780184474084e-05, + "loss": 1.616, + "step": 4712 + }, + { + "epoch": 0.15763759638394045, + "grad_norm": 1.5921640396118164, + "learning_rate": 4.8496225246215496e-05, + "loss": 1.6346, + "step": 4743 + }, + { + "epoch": 0.15866790747141718, + "grad_norm": 1.6729261875152588, + "learning_rate": 4.8467405156248505e-05, + "loss": 1.6165, + "step": 4774 + }, + { + "epoch": 0.1596982185588939, + "grad_norm": 1.628113031387329, + "learning_rate": 4.843832023980392e-05, + "loss": 1.6119, + "step": 4805 + }, + { + "epoch": 0.16072852964637063, + "grad_norm": 1.651647925376892, + "learning_rate": 4.840897082510106e-05, + "loss": 1.5997, + "step": 4836 + }, + { + "epoch": 0.1617588407338474, + "grad_norm": 1.5297720432281494, + "learning_rate": 4.8379357243344084e-05, + "loss": 1.6242, + "step": 4867 + }, + { + "epoch": 0.1627891518213241, + "grad_norm": 1.5779869556427002, + "learning_rate": 4.8349479828718236e-05, + "loss": 1.6149, + "step": 4898 + }, + { + "epoch": 0.16381946290880084, + "grad_norm": 1.5843939781188965, + "learning_rate": 4.8319338918386075e-05, + "loss": 1.5926, + "step": 4929 + }, + { + "epoch": 0.1648497739962776, + "grad_norm": 2.3762106895446777, + "learning_rate": 4.828893485248369e-05, + "loss": 1.6108, + "step": 4960 + }, + { + "epoch": 0.16588008508375432, + "grad_norm": 1.5871953964233398, + "learning_rate": 4.825826797411682e-05, + "loss": 1.6103, + "step": 4991 + }, + { + "epoch": 0.16691039617123105, + "grad_norm": 1.5934125185012817, + "learning_rate": 4.822733862935702e-05, + "loss": 1.6091, + "step": 5022 + }, + { + "epoch": 0.1679407072587078, + "grad_norm": 1.6997628211975098, + "learning_rate": 4.819614716723775e-05, + "loss": 1.6098, + "step": 5053 + }, + { + "epoch": 0.16897101834618453, + "grad_norm": 1.682849645614624, + "learning_rate": 4.8164693939750425e-05, + "loss": 1.599, + "step": 5084 + }, + { + "epoch": 0.17000132943366125, + "grad_norm": 1.709743857383728, + "learning_rate": 4.813297930184042e-05, + "loss": 1.6194, + "step": 5115 + }, + { + "epoch": 0.171031640521138, + "grad_norm": 1.725879430770874, + "learning_rate": 4.810100361140314e-05, + "loss": 1.6115, + "step": 5146 + }, + { + "epoch": 0.17206195160861473, + "grad_norm": 1.6710290908813477, + "learning_rate": 4.8068767229279885e-05, + "loss": 1.6032, + "step": 5177 + }, + { + "epoch": 0.17309226269609146, + "grad_norm": 1.6156634092330933, + "learning_rate": 4.8036270519253854e-05, + "loss": 1.5973, + "step": 5208 + }, + { + "epoch": 0.1741225737835682, + "grad_norm": 1.5654059648513794, + "learning_rate": 4.8003513848046e-05, + "loss": 1.5817, + "step": 5239 + }, + { + "epoch": 0.17515288487104494, + "grad_norm": 1.5789822340011597, + "learning_rate": 4.79704975853109e-05, + "loss": 1.6138, + "step": 5270 + }, + { + "epoch": 0.17618319595852167, + "grad_norm": 1.6022037267684937, + "learning_rate": 4.793722210363262e-05, + "loss": 1.5998, + "step": 5301 + }, + { + "epoch": 0.1772135070459984, + "grad_norm": 1.5142741203308105, + "learning_rate": 4.7903687778520414e-05, + "loss": 1.6061, + "step": 5332 + }, + { + "epoch": 0.17824381813347515, + "grad_norm": 1.6454212665557861, + "learning_rate": 4.7869894988404593e-05, + "loss": 1.6063, + "step": 5363 + }, + { + "epoch": 0.17927412922095187, + "grad_norm": 1.5250823497772217, + "learning_rate": 4.783584411463221e-05, + "loss": 1.6038, + "step": 5394 + }, + { + "epoch": 0.1803044403084286, + "grad_norm": 1.5829335451126099, + "learning_rate": 4.780153554146274e-05, + "loss": 1.5949, + "step": 5425 + }, + { + "epoch": 0.18133475139590535, + "grad_norm": 1.5342432260513306, + "learning_rate": 4.7766969656063766e-05, + "loss": 1.5913, + "step": 5456 + }, + { + "epoch": 0.18236506248338208, + "grad_norm": 1.6397250890731812, + "learning_rate": 4.773214684850662e-05, + "loss": 1.6102, + "step": 5487 + }, + { + "epoch": 0.1833953735708588, + "grad_norm": 1.5228471755981445, + "learning_rate": 4.769706751176193e-05, + "loss": 1.5885, + "step": 5518 + }, + { + "epoch": 0.18442568465833556, + "grad_norm": 1.6186103820800781, + "learning_rate": 4.7661732041695264e-05, + "loss": 1.6086, + "step": 5549 + }, + { + "epoch": 0.18545599574581229, + "grad_norm": 1.6024582386016846, + "learning_rate": 4.762614083706258e-05, + "loss": 1.6004, + "step": 5580 + }, + { + "epoch": 0.186486306833289, + "grad_norm": 1.5443711280822754, + "learning_rate": 4.759029429950581e-05, + "loss": 1.6048, + "step": 5611 + }, + { + "epoch": 0.18751661792076577, + "grad_norm": 1.4831629991531372, + "learning_rate": 4.7554192833548235e-05, + "loss": 1.5841, + "step": 5642 + }, + { + "epoch": 0.1885469290082425, + "grad_norm": 1.6426068544387817, + "learning_rate": 4.751783684659e-05, + "loss": 1.587, + "step": 5673 + }, + { + "epoch": 0.18957724009571922, + "grad_norm": 1.4609078168869019, + "learning_rate": 4.748122674890348e-05, + "loss": 1.5945, + "step": 5704 + }, + { + "epoch": 0.19060755118319597, + "grad_norm": 1.5365614891052246, + "learning_rate": 4.7444362953628654e-05, + "loss": 1.5737, + "step": 5735 + }, + { + "epoch": 0.1916378622706727, + "grad_norm": 1.5755670070648193, + "learning_rate": 4.7407245876768424e-05, + "loss": 1.5862, + "step": 5766 + }, + { + "epoch": 0.19266817335814942, + "grad_norm": 1.6469846963882446, + "learning_rate": 4.736987593718397e-05, + "loss": 1.5663, + "step": 5797 + }, + { + "epoch": 0.19369848444562615, + "grad_norm": 1.5927278995513916, + "learning_rate": 4.733225355658999e-05, + "loss": 1.5776, + "step": 5828 + }, + { + "epoch": 0.1947287955331029, + "grad_norm": 1.5593287944793701, + "learning_rate": 4.7294379159549926e-05, + "loss": 1.579, + "step": 5859 + }, + { + "epoch": 0.19575910662057963, + "grad_norm": 1.534055233001709, + "learning_rate": 4.725625317347119e-05, + "loss": 1.6017, + "step": 5890 + }, + { + "epoch": 0.19678941770805636, + "grad_norm": 1.5846387147903442, + "learning_rate": 4.7217876028600374e-05, + "loss": 1.5739, + "step": 5921 + }, + { + "epoch": 0.1978197287955331, + "grad_norm": 1.5377682447433472, + "learning_rate": 4.717924815801832e-05, + "loss": 1.57, + "step": 5952 + }, + { + "epoch": 0.19885003988300984, + "grad_norm": 1.467956781387329, + "learning_rate": 4.714036999763532e-05, + "loss": 1.5736, + "step": 5983 + }, + { + "epoch": 0.19988035097048656, + "grad_norm": 1.601070523262024, + "learning_rate": 4.7101241986186116e-05, + "loss": 1.5861, + "step": 6014 + }, + { + "epoch": 0.20091066205796332, + "grad_norm": 1.5051921606063843, + "learning_rate": 4.7061864565225e-05, + "loss": 1.5735, + "step": 6045 + }, + { + "epoch": 0.20194097314544004, + "grad_norm": 1.462843418121338, + "learning_rate": 4.702223817912081e-05, + "loss": 1.582, + "step": 6076 + }, + { + "epoch": 0.20297128423291677, + "grad_norm": 1.5698682069778442, + "learning_rate": 4.698236327505195e-05, + "loss": 1.5647, + "step": 6107 + }, + { + "epoch": 0.20400159532039353, + "grad_norm": 1.5633916854858398, + "learning_rate": 4.694224030300127e-05, + "loss": 1.5741, + "step": 6138 + }, + { + "epoch": 0.20503190640787025, + "grad_norm": 1.6174733638763428, + "learning_rate": 4.690186971575107e-05, + "loss": 1.5634, + "step": 6169 + }, + { + "epoch": 0.20606221749534698, + "grad_norm": 1.4957518577575684, + "learning_rate": 4.6861251968877916e-05, + "loss": 1.575, + "step": 6200 + }, + { + "epoch": 0.2070925285828237, + "grad_norm": 1.670933485031128, + "learning_rate": 4.68203875207476e-05, + "loss": 1.5792, + "step": 6231 + }, + { + "epoch": 0.20812283967030046, + "grad_norm": 1.5676430463790894, + "learning_rate": 4.677927683250983e-05, + "loss": 1.5689, + "step": 6262 + }, + { + "epoch": 0.20915315075777718, + "grad_norm": 1.5753976106643677, + "learning_rate": 4.6737920368093156e-05, + "loss": 1.5594, + "step": 6293 + }, + { + "epoch": 0.2101834618452539, + "grad_norm": 1.4973617792129517, + "learning_rate": 4.669631859419965e-05, + "loss": 1.5593, + "step": 6324 + }, + { + "epoch": 0.21121377293273066, + "grad_norm": 1.4691433906555176, + "learning_rate": 4.6654471980299676e-05, + "loss": 1.5711, + "step": 6355 + }, + { + "epoch": 0.2122440840202074, + "grad_norm": 1.407630443572998, + "learning_rate": 4.661238099862658e-05, + "loss": 1.5787, + "step": 6386 + }, + { + "epoch": 0.21327439510768412, + "grad_norm": 1.5011677742004395, + "learning_rate": 4.657004612417138e-05, + "loss": 1.5751, + "step": 6417 + }, + { + "epoch": 0.21430470619516087, + "grad_norm": 1.509750485420227, + "learning_rate": 4.6527467834677374e-05, + "loss": 1.5583, + "step": 6448 + }, + { + "epoch": 0.2153350172826376, + "grad_norm": 1.3919882774353027, + "learning_rate": 4.648464661063478e-05, + "loss": 1.5712, + "step": 6479 + }, + { + "epoch": 0.21636532837011432, + "grad_norm": 1.4854936599731445, + "learning_rate": 4.6441582935275264e-05, + "loss": 1.5637, + "step": 6510 + }, + { + "epoch": 0.21739563945759108, + "grad_norm": 1.4413583278656006, + "learning_rate": 4.6398277294566586e-05, + "loss": 1.56, + "step": 6541 + }, + { + "epoch": 0.2184259505450678, + "grad_norm": 1.5063883066177368, + "learning_rate": 4.6354730177207e-05, + "loss": 1.5525, + "step": 6572 + }, + { + "epoch": 0.21945626163254453, + "grad_norm": 1.4899688959121704, + "learning_rate": 4.6310942074619787e-05, + "loss": 1.5817, + "step": 6603 + }, + { + "epoch": 0.22048657272002128, + "grad_norm": 1.3927967548370361, + "learning_rate": 4.626691348094777e-05, + "loss": 1.5407, + "step": 6634 + }, + { + "epoch": 0.221516883807498, + "grad_norm": 1.5378398895263672, + "learning_rate": 4.622264489304762e-05, + "loss": 1.5561, + "step": 6665 + }, + { + "epoch": 0.22254719489497474, + "grad_norm": 1.554624319076538, + "learning_rate": 4.617813681048434e-05, + "loss": 1.5859, + "step": 6696 + }, + { + "epoch": 0.22357750598245146, + "grad_norm": 1.5356658697128296, + "learning_rate": 4.61333897355256e-05, + "loss": 1.5531, + "step": 6727 + }, + { + "epoch": 0.22460781706992822, + "grad_norm": 1.5534918308258057, + "learning_rate": 4.608840417313604e-05, + "loss": 1.5774, + "step": 6758 + }, + { + "epoch": 0.22563812815740494, + "grad_norm": 1.5660988092422485, + "learning_rate": 4.6043180630971646e-05, + "loss": 1.5763, + "step": 6789 + }, + { + "epoch": 0.22666843924488167, + "grad_norm": 1.4993386268615723, + "learning_rate": 4.599771961937391e-05, + "loss": 1.5615, + "step": 6820 + }, + { + "epoch": 0.22769875033235842, + "grad_norm": 1.4630553722381592, + "learning_rate": 4.5952021651364204e-05, + "loss": 1.543, + "step": 6851 + }, + { + "epoch": 0.22872906141983515, + "grad_norm": 1.470173954963684, + "learning_rate": 4.590608724263786e-05, + "loss": 1.5674, + "step": 6882 + }, + { + "epoch": 0.22975937250731188, + "grad_norm": 1.5867971181869507, + "learning_rate": 4.585991691155845e-05, + "loss": 1.5702, + "step": 6913 + }, + { + "epoch": 0.23078968359478863, + "grad_norm": 1.44207763671875, + "learning_rate": 4.581351117915188e-05, + "loss": 1.5436, + "step": 6944 + }, + { + "epoch": 0.23181999468226536, + "grad_norm": 1.4691039323806763, + "learning_rate": 4.5766870569100534e-05, + "loss": 1.5465, + "step": 6975 + }, + { + "epoch": 0.23285030576974208, + "grad_norm": 1.4807918071746826, + "learning_rate": 4.571999560773736e-05, + "loss": 1.5564, + "step": 7006 + }, + { + "epoch": 0.23388061685721884, + "grad_norm": 1.481487512588501, + "learning_rate": 4.5672886824039915e-05, + "loss": 1.5466, + "step": 7037 + }, + { + "epoch": 0.23491092794469556, + "grad_norm": 1.4518013000488281, + "learning_rate": 4.5625544749624435e-05, + "loss": 1.5618, + "step": 7068 + }, + { + "epoch": 0.2359412390321723, + "grad_norm": 1.4186676740646362, + "learning_rate": 4.5577969918739794e-05, + "loss": 1.5528, + "step": 7099 + }, + { + "epoch": 0.23697155011964902, + "grad_norm": 1.5287110805511475, + "learning_rate": 4.5530162868261486e-05, + "loss": 1.5457, + "step": 7130 + }, + { + "epoch": 0.23800186120712577, + "grad_norm": 1.5516417026519775, + "learning_rate": 4.548212413768558e-05, + "loss": 1.5467, + "step": 7161 + }, + { + "epoch": 0.2390321722946025, + "grad_norm": 1.4710053205490112, + "learning_rate": 4.543385426912261e-05, + "loss": 1.5431, + "step": 7192 + }, + { + "epoch": 0.24006248338207922, + "grad_norm": 1.5005567073822021, + "learning_rate": 4.53853538072915e-05, + "loss": 1.5592, + "step": 7223 + }, + { + "epoch": 0.24109279446955598, + "grad_norm": 1.5864965915679932, + "learning_rate": 4.533662329951336e-05, + "loss": 1.5694, + "step": 7254 + }, + { + "epoch": 0.2421231055570327, + "grad_norm": 1.4661896228790283, + "learning_rate": 4.528766329570536e-05, + "loss": 1.545, + "step": 7285 + }, + { + "epoch": 0.24315341664450943, + "grad_norm": 1.5157560110092163, + "learning_rate": 4.523847434837447e-05, + "loss": 1.5458, + "step": 7316 + }, + { + "epoch": 0.24418372773198618, + "grad_norm": 1.4033585786819458, + "learning_rate": 4.518905701261128e-05, + "loss": 1.5464, + "step": 7347 + }, + { + "epoch": 0.2452140388194629, + "grad_norm": 1.5357593297958374, + "learning_rate": 4.5139411846083715e-05, + "loss": 1.5497, + "step": 7378 + }, + { + "epoch": 0.24624434990693964, + "grad_norm": 1.419507384300232, + "learning_rate": 4.508953940903073e-05, + "loss": 1.5414, + "step": 7409 + }, + { + "epoch": 0.2472746609944164, + "grad_norm": 1.5201773643493652, + "learning_rate": 4.5039440264255994e-05, + "loss": 1.5503, + "step": 7440 + }, + { + "epoch": 0.24830497208189312, + "grad_norm": 1.8000444173812866, + "learning_rate": 4.498911497712155e-05, + "loss": 1.5448, + "step": 7471 + }, + { + "epoch": 0.24933528316936984, + "grad_norm": 1.4876810312271118, + "learning_rate": 4.493856411554142e-05, + "loss": 1.5524, + "step": 7502 + }, + { + "epoch": 0.25036559425684657, + "grad_norm": 1.5130078792572021, + "learning_rate": 4.4887788249975206e-05, + "loss": 1.5454, + "step": 7533 + }, + { + "epoch": 0.2513959053443233, + "grad_norm": 1.4829351902008057, + "learning_rate": 4.4836787953421656e-05, + "loss": 1.5407, + "step": 7564 + }, + { + "epoch": 0.2524262164318001, + "grad_norm": 1.521550178527832, + "learning_rate": 4.478556380141218e-05, + "loss": 1.5727, + "step": 7595 + }, + { + "epoch": 0.2534565275192768, + "grad_norm": 1.4377928972244263, + "learning_rate": 4.4734116372004375e-05, + "loss": 1.5432, + "step": 7626 + }, + { + "epoch": 0.25448683860675353, + "grad_norm": 1.4101744890213013, + "learning_rate": 4.4682446245775477e-05, + "loss": 1.547, + "step": 7657 + }, + { + "epoch": 0.2555171496942303, + "grad_norm": 1.522524356842041, + "learning_rate": 4.463055400581586e-05, + "loss": 1.5418, + "step": 7688 + }, + { + "epoch": 0.256547460781707, + "grad_norm": 1.4160797595977783, + "learning_rate": 4.4578440237722374e-05, + "loss": 1.5457, + "step": 7719 + }, + { + "epoch": 0.25757777186918374, + "grad_norm": 1.4106636047363281, + "learning_rate": 4.452610552959183e-05, + "loss": 1.5405, + "step": 7750 + }, + { + "epoch": 0.2586080829566605, + "grad_norm": 1.422723650932312, + "learning_rate": 4.447355047201428e-05, + "loss": 1.5423, + "step": 7781 + }, + { + "epoch": 0.2596383940441372, + "grad_norm": 1.4362592697143555, + "learning_rate": 4.4420775658066414e-05, + "loss": 1.5372, + "step": 7812 + }, + { + "epoch": 0.26066870513161394, + "grad_norm": 1.4319696426391602, + "learning_rate": 4.436778168330484e-05, + "loss": 1.5451, + "step": 7843 + }, + { + "epoch": 0.26169901621909064, + "grad_norm": 1.4069257974624634, + "learning_rate": 4.4314569145759353e-05, + "loss": 1.5221, + "step": 7874 + }, + { + "epoch": 0.2627293273065674, + "grad_norm": 1.4424949884414673, + "learning_rate": 4.42611386459262e-05, + "loss": 1.5419, + "step": 7905 + }, + { + "epoch": 0.26375963839404415, + "grad_norm": 1.4579105377197266, + "learning_rate": 4.420749078676133e-05, + "loss": 1.5116, + "step": 7936 + }, + { + "epoch": 0.26478994948152085, + "grad_norm": 1.4563167095184326, + "learning_rate": 4.4153626173673516e-05, + "loss": 1.5296, + "step": 7967 + }, + { + "epoch": 0.2658202605689976, + "grad_norm": 1.4440968036651611, + "learning_rate": 4.409954541451762e-05, + "loss": 1.5548, + "step": 7998 + }, + { + "epoch": 0.26685057165647436, + "grad_norm": 1.5711034536361694, + "learning_rate": 4.404524911958764e-05, + "loss": 1.535, + "step": 8029 + }, + { + "epoch": 0.26788088274395105, + "grad_norm": 1.5221564769744873, + "learning_rate": 4.399073790160989e-05, + "loss": 1.5495, + "step": 8060 + }, + { + "epoch": 0.2689111938314278, + "grad_norm": 1.392699956893921, + "learning_rate": 4.393601237573607e-05, + "loss": 1.546, + "step": 8091 + }, + { + "epoch": 0.26994150491890456, + "grad_norm": 1.5343137979507446, + "learning_rate": 4.388107315953628e-05, + "loss": 1.549, + "step": 8122 + }, + { + "epoch": 0.27097181600638126, + "grad_norm": 1.4483468532562256, + "learning_rate": 4.382592087299212e-05, + "loss": 1.5424, + "step": 8153 + }, + { + "epoch": 0.272002127093858, + "grad_norm": 1.4963489770889282, + "learning_rate": 4.377055613848964e-05, + "loss": 1.508, + "step": 8184 + }, + { + "epoch": 0.27303243818133477, + "grad_norm": 1.4839162826538086, + "learning_rate": 4.3714979580812355e-05, + "loss": 1.5203, + "step": 8215 + }, + { + "epoch": 0.27406274926881147, + "grad_norm": 1.4272018671035767, + "learning_rate": 4.365919182713416e-05, + "loss": 1.5264, + "step": 8246 + }, + { + "epoch": 0.2750930603562882, + "grad_norm": 1.3808270692825317, + "learning_rate": 4.360319350701226e-05, + "loss": 1.5255, + "step": 8277 + }, + { + "epoch": 0.276123371443765, + "grad_norm": 1.4179162979125977, + "learning_rate": 4.3546985252380115e-05, + "loss": 1.535, + "step": 8308 + }, + { + "epoch": 0.2771536825312417, + "grad_norm": 1.3617374897003174, + "learning_rate": 4.349056769754021e-05, + "loss": 1.5295, + "step": 8339 + }, + { + "epoch": 0.27818399361871843, + "grad_norm": 1.4745615720748901, + "learning_rate": 4.3433941479156994e-05, + "loss": 1.5438, + "step": 8370 + }, + { + "epoch": 0.2792143047061952, + "grad_norm": 1.3661375045776367, + "learning_rate": 4.3377107236249647e-05, + "loss": 1.5134, + "step": 8401 + }, + { + "epoch": 0.2802446157936719, + "grad_norm": 1.3907949924468994, + "learning_rate": 4.332006561018488e-05, + "loss": 1.5237, + "step": 8432 + }, + { + "epoch": 0.28127492688114863, + "grad_norm": 1.3575704097747803, + "learning_rate": 4.3262817244669683e-05, + "loss": 1.5226, + "step": 8463 + }, + { + "epoch": 0.2823052379686254, + "grad_norm": 1.3836462497711182, + "learning_rate": 4.3205362785744083e-05, + "loss": 1.5433, + "step": 8494 + }, + { + "epoch": 0.2833355490561021, + "grad_norm": 1.6108276844024658, + "learning_rate": 4.314770288177384e-05, + "loss": 1.5324, + "step": 8525 + }, + { + "epoch": 0.28436586014357884, + "grad_norm": 1.4650689363479614, + "learning_rate": 4.308983818344313e-05, + "loss": 1.535, + "step": 8556 + }, + { + "epoch": 0.2853961712310556, + "grad_norm": 1.5836583375930786, + "learning_rate": 4.3031769343747206e-05, + "loss": 1.5313, + "step": 8587 + }, + { + "epoch": 0.2864264823185323, + "grad_norm": 1.5348492860794067, + "learning_rate": 4.297349701798505e-05, + "loss": 1.5106, + "step": 8618 + }, + { + "epoch": 0.28745679340600905, + "grad_norm": 1.4060319662094116, + "learning_rate": 4.2915021863751916e-05, + "loss": 1.5283, + "step": 8649 + }, + { + "epoch": 0.2884871044934858, + "grad_norm": 1.531657099723816, + "learning_rate": 4.285634454093198e-05, + "loss": 1.5087, + "step": 8680 + }, + { + "epoch": 0.2895174155809625, + "grad_norm": 1.4756299257278442, + "learning_rate": 4.279746571169086e-05, + "loss": 1.5042, + "step": 8711 + }, + { + "epoch": 0.29054772666843925, + "grad_norm": 1.3221153020858765, + "learning_rate": 4.2738386040468136e-05, + "loss": 1.5244, + "step": 8742 + }, + { + "epoch": 0.29157803775591595, + "grad_norm": 1.4067268371582031, + "learning_rate": 4.2679106193969866e-05, + "loss": 1.5012, + "step": 8773 + }, + { + "epoch": 0.2926083488433927, + "grad_norm": 1.5192064046859741, + "learning_rate": 4.261962684116106e-05, + "loss": 1.521, + "step": 8804 + }, + { + "epoch": 0.29363865993086946, + "grad_norm": 1.3847788572311401, + "learning_rate": 4.2559948653258145e-05, + "loss": 1.5128, + "step": 8835 + }, + { + "epoch": 0.29466897101834616, + "grad_norm": 1.4612780809402466, + "learning_rate": 4.250007230372134e-05, + "loss": 1.5371, + "step": 8866 + }, + { + "epoch": 0.2956992821058229, + "grad_norm": 1.468971610069275, + "learning_rate": 4.2439998468247126e-05, + "loss": 1.5199, + "step": 8897 + }, + { + "epoch": 0.29672959319329967, + "grad_norm": 1.386236310005188, + "learning_rate": 4.2379727824760566e-05, + "loss": 1.5273, + "step": 8928 + }, + { + "epoch": 0.29775990428077637, + "grad_norm": 1.3843929767608643, + "learning_rate": 4.231926105340768e-05, + "loss": 1.5011, + "step": 8959 + }, + { + "epoch": 0.2987902153682531, + "grad_norm": 1.4554557800292969, + "learning_rate": 4.225859883654776e-05, + "loss": 1.5311, + "step": 8990 + }, + { + "epoch": 0.2998205264557299, + "grad_norm": 1.3674421310424805, + "learning_rate": 4.219774185874569e-05, + "loss": 1.5302, + "step": 9021 + }, + { + "epoch": 0.3008508375432066, + "grad_norm": 1.3804330825805664, + "learning_rate": 4.213669080676418e-05, + "loss": 1.538, + "step": 9052 + }, + { + "epoch": 0.3018811486306833, + "grad_norm": 1.4643255472183228, + "learning_rate": 4.2075446369556056e-05, + "loss": 1.5172, + "step": 9083 + }, + { + "epoch": 0.3029114597181601, + "grad_norm": 1.3375928401947021, + "learning_rate": 4.201400923825648e-05, + "loss": 1.5123, + "step": 9114 + }, + { + "epoch": 0.3039417708056368, + "grad_norm": 1.4321980476379395, + "learning_rate": 4.195238010617511e-05, + "loss": 1.5196, + "step": 9145 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.791835162066813e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-9156/training_args.bin b/checkpoint-9156/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..974208468b82a3c5684aaa384776477cf21c18ca --- /dev/null +++ b/checkpoint-9156/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a23be0ff07d6d3142f7c0980f91dddba845519c24fcb411cbb4b9ddb1513ff +size 5304 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aaa74176892d42e1c7f5979b7ddf8ab15985d3 --- /dev/null +++ b/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "/mnt/parscratch/users/acp23ay/private/models/Llama-3.1-8B-Instruct-ta-madlad-mean/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 138256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/model-00001-of-00007.safetensors b/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2daf804d7e6e513fbf46d8ab1552516bca5fe6cb --- /dev/null +++ b/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46d49734b69cc2edb5667850ac9486f5fd2b23d7175f2aa4edbdab66f483dcff +size 4983197184 diff --git a/model-00002-of-00007.safetensors b/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d32d83cb96a109af411b9cf577e7fbfe07ea76fc --- /dev/null +++ b/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168629c67732309d49f60a5ec48a6d160212bc987365e82e183dfbf74ba0c1f3 +size 4899116432 diff --git a/model-00003-of-00007.safetensors b/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/model-00004-of-00007.safetensors b/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/model-00005-of-00007.safetensors b/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/model-00006-of-00007.safetensors b/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4007112580e058a26d0427e97b8ff6f856ba66bc --- /dev/null +++ b/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0cfa5a25a356ba23308aab1ccf95e47e7b15c0420612d1d30a063442a2c57d +size 4999813120 diff --git a/model-00007-of-00007.safetensors b/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..062439748e773713d4fac56147dc9c0d9ee41736 --- /dev/null +++ b/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:477a55c32130f695f1b4ca9b7aa96ca1bd9aa24b439b5233ee9fe5662f217d42 +size 2734998184 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..318803c6a3dd771c7f7c3b8038a896af7c8322ae --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32448724992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..04829afa78a2d2df203ac846968db37269b01f7f --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|end_of_text|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..b59082a85971d01421cdbee83fcd3c3c26d98bc6 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,431379 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 128000, + "content": "<|begin_of_text|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128001, + "content": "<|end_of_text|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128002, + "content": "<|reserved_special_token_0|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128003, + "content": "<|reserved_special_token_1|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128004, + "content": "<|finetune_right_pad_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128005, + "content": "<|reserved_special_token_2|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128006, + "content": "<|start_header_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128007, + "content": "<|end_header_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128008, + "content": "<|eom_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128009, + "content": "<|eot_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128010, + "content": "<|python_tag|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128011, + "content": "<|reserved_special_token_3|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128012, + "content": "<|reserved_special_token_4|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128013, + "content": "<|reserved_special_token_5|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128014, + "content": "<|reserved_special_token_6|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128015, + "content": "<|reserved_special_token_7|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128016, + "content": "<|reserved_special_token_8|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128017, + "content": "<|reserved_special_token_9|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128018, + "content": "<|reserved_special_token_10|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128019, + "content": "<|reserved_special_token_11|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128020, + "content": "<|reserved_special_token_12|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128021, + "content": "<|reserved_special_token_13|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128022, + "content": "<|reserved_special_token_14|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128023, + "content": "<|reserved_special_token_15|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128024, + "content": "<|reserved_special_token_16|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128025, + "content": "<|reserved_special_token_17|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128026, + "content": "<|reserved_special_token_18|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128027, + "content": "<|reserved_special_token_19|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128028, + "content": "<|reserved_special_token_20|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128029, + "content": "<|reserved_special_token_21|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128030, + "content": "<|reserved_special_token_22|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128031, + "content": "<|reserved_special_token_23|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128032, + "content": "<|reserved_special_token_24|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128033, + "content": "<|reserved_special_token_25|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128034, + "content": "<|reserved_special_token_26|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128035, + "content": "<|reserved_special_token_27|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128036, + "content": "<|reserved_special_token_28|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128037, + "content": "<|reserved_special_token_29|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128038, + "content": "<|reserved_special_token_30|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128039, + "content": "<|reserved_special_token_31|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128040, + "content": "<|reserved_special_token_32|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128041, + "content": "<|reserved_special_token_33|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128042, + "content": "<|reserved_special_token_34|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128043, + "content": "<|reserved_special_token_35|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128044, + "content": "<|reserved_special_token_36|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128045, + "content": "<|reserved_special_token_37|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128046, + "content": "<|reserved_special_token_38|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128047, + "content": "<|reserved_special_token_39|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128048, + "content": "<|reserved_special_token_40|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128049, + "content": "<|reserved_special_token_41|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128050, + "content": "<|reserved_special_token_42|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128051, + "content": "<|reserved_special_token_43|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128052, + "content": "<|reserved_special_token_44|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128053, + "content": "<|reserved_special_token_45|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128054, + "content": "<|reserved_special_token_46|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128055, + "content": "<|reserved_special_token_47|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128056, + "content": "<|reserved_special_token_48|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128057, + "content": "<|reserved_special_token_49|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128058, + "content": "<|reserved_special_token_50|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128059, + "content": "<|reserved_special_token_51|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128060, + "content": "<|reserved_special_token_52|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128061, + "content": "<|reserved_special_token_53|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128062, + "content": "<|reserved_special_token_54|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128063, + "content": "<|reserved_special_token_55|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128064, + "content": "<|reserved_special_token_56|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128065, + "content": "<|reserved_special_token_57|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128066, + "content": "<|reserved_special_token_58|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128067, + "content": "<|reserved_special_token_59|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128068, + "content": "<|reserved_special_token_60|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128069, + "content": "<|reserved_special_token_61|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128070, + "content": "<|reserved_special_token_62|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128071, + "content": "<|reserved_special_token_63|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128072, + "content": "<|reserved_special_token_64|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128073, + "content": "<|reserved_special_token_65|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128074, + "content": "<|reserved_special_token_66|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128075, + "content": "<|reserved_special_token_67|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128076, + "content": "<|reserved_special_token_68|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128077, + "content": "<|reserved_special_token_69|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128078, + "content": "<|reserved_special_token_70|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128079, + "content": "<|reserved_special_token_71|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128080, + "content": "<|reserved_special_token_72|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128081, + "content": "<|reserved_special_token_73|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128082, + "content": "<|reserved_special_token_74|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128083, + "content": "<|reserved_special_token_75|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128084, + "content": "<|reserved_special_token_76|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128085, + "content": "<|reserved_special_token_77|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128086, + "content": "<|reserved_special_token_78|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128087, + "content": "<|reserved_special_token_79|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128088, + "content": "<|reserved_special_token_80|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128089, + "content": "<|reserved_special_token_81|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128090, + "content": "<|reserved_special_token_82|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128091, + "content": "<|reserved_special_token_83|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128092, + "content": "<|reserved_special_token_84|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128093, + "content": "<|reserved_special_token_85|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128094, + "content": "<|reserved_special_token_86|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128095, + "content": "<|reserved_special_token_87|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128096, + "content": "<|reserved_special_token_88|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128097, + "content": "<|reserved_special_token_89|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128098, + "content": "<|reserved_special_token_90|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128099, + "content": "<|reserved_special_token_91|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128100, + "content": "<|reserved_special_token_92|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128101, + "content": "<|reserved_special_token_93|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128102, + "content": "<|reserved_special_token_94|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128103, + "content": "<|reserved_special_token_95|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128104, + "content": "<|reserved_special_token_96|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128105, + "content": "<|reserved_special_token_97|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128106, + "content": "<|reserved_special_token_98|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128107, + "content": "<|reserved_special_token_99|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128108, + "content": "<|reserved_special_token_100|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128109, + "content": "<|reserved_special_token_101|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128110, + "content": "<|reserved_special_token_102|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128111, + "content": "<|reserved_special_token_103|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128112, + "content": "<|reserved_special_token_104|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128113, + "content": "<|reserved_special_token_105|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128114, + "content": "<|reserved_special_token_106|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128115, + "content": "<|reserved_special_token_107|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128116, + "content": "<|reserved_special_token_108|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128117, + "content": "<|reserved_special_token_109|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128118, + "content": "<|reserved_special_token_110|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128119, + "content": "<|reserved_special_token_111|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128120, + "content": "<|reserved_special_token_112|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128121, + "content": "<|reserved_special_token_113|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128122, + "content": "<|reserved_special_token_114|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128123, + "content": "<|reserved_special_token_115|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128124, + "content": "<|reserved_special_token_116|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128125, + "content": "<|reserved_special_token_117|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128126, + "content": "<|reserved_special_token_118|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128127, + "content": "<|reserved_special_token_119|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128128, + "content": "<|reserved_special_token_120|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128129, + "content": "<|reserved_special_token_121|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128130, + "content": "<|reserved_special_token_122|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128131, + "content": "<|reserved_special_token_123|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128132, + "content": "<|reserved_special_token_124|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128133, + "content": "<|reserved_special_token_125|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128134, + "content": "<|reserved_special_token_126|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128135, + "content": "<|reserved_special_token_127|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128136, + "content": "<|reserved_special_token_128|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128137, + "content": "<|reserved_special_token_129|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128138, + "content": "<|reserved_special_token_130|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128139, + "content": "<|reserved_special_token_131|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128140, + "content": "<|reserved_special_token_132|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128141, + "content": "<|reserved_special_token_133|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128142, + "content": "<|reserved_special_token_134|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128143, + "content": "<|reserved_special_token_135|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128144, + "content": "<|reserved_special_token_136|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128145, + "content": "<|reserved_special_token_137|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128146, + "content": "<|reserved_special_token_138|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128147, + "content": "<|reserved_special_token_139|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128148, + "content": "<|reserved_special_token_140|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128149, + "content": "<|reserved_special_token_141|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128150, + "content": "<|reserved_special_token_142|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128151, + "content": "<|reserved_special_token_143|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128152, + "content": "<|reserved_special_token_144|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128153, + "content": "<|reserved_special_token_145|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128154, + "content": "<|reserved_special_token_146|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128155, + "content": "<|reserved_special_token_147|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128156, + "content": "<|reserved_special_token_148|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128157, + "content": "<|reserved_special_token_149|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128158, + "content": "<|reserved_special_token_150|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128159, + "content": "<|reserved_special_token_151|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128160, + "content": "<|reserved_special_token_152|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128161, + "content": "<|reserved_special_token_153|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128162, + "content": "<|reserved_special_token_154|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128163, + "content": "<|reserved_special_token_155|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128164, + "content": "<|reserved_special_token_156|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128165, + "content": "<|reserved_special_token_157|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128166, + "content": "<|reserved_special_token_158|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128167, + "content": "<|reserved_special_token_159|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128168, + "content": "<|reserved_special_token_160|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128169, + "content": "<|reserved_special_token_161|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128170, + "content": "<|reserved_special_token_162|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128171, + "content": "<|reserved_special_token_163|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128172, + "content": "<|reserved_special_token_164|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128173, + "content": "<|reserved_special_token_165|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128174, + "content": "<|reserved_special_token_166|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128175, + "content": "<|reserved_special_token_167|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128176, + "content": "<|reserved_special_token_168|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128177, + "content": "<|reserved_special_token_169|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128178, + "content": "<|reserved_special_token_170|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128179, + "content": "<|reserved_special_token_171|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128180, + "content": "<|reserved_special_token_172|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128181, + "content": "<|reserved_special_token_173|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128182, + "content": "<|reserved_special_token_174|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128183, + "content": "<|reserved_special_token_175|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128184, + "content": "<|reserved_special_token_176|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128185, + "content": "<|reserved_special_token_177|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128186, + "content": "<|reserved_special_token_178|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128187, + "content": "<|reserved_special_token_179|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128188, + "content": "<|reserved_special_token_180|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128189, + "content": "<|reserved_special_token_181|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128190, + "content": "<|reserved_special_token_182|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128191, + "content": "<|reserved_special_token_183|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128192, + "content": "<|reserved_special_token_184|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128193, + "content": "<|reserved_special_token_185|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128194, + "content": "<|reserved_special_token_186|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128195, + "content": "<|reserved_special_token_187|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128196, + "content": "<|reserved_special_token_188|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128197, + "content": "<|reserved_special_token_189|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128198, + "content": "<|reserved_special_token_190|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128199, + "content": "<|reserved_special_token_191|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128200, + "content": "<|reserved_special_token_192|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128201, + "content": "<|reserved_special_token_193|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128202, + "content": "<|reserved_special_token_194|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128203, + "content": "<|reserved_special_token_195|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128204, + "content": "<|reserved_special_token_196|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128205, + "content": "<|reserved_special_token_197|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128206, + "content": "<|reserved_special_token_198|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128207, + "content": "<|reserved_special_token_199|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128208, + "content": "<|reserved_special_token_200|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128209, + "content": "<|reserved_special_token_201|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128210, + "content": "<|reserved_special_token_202|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128211, + "content": "<|reserved_special_token_203|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128212, + "content": "<|reserved_special_token_204|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128213, + "content": "<|reserved_special_token_205|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128214, + "content": "<|reserved_special_token_206|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128215, + "content": "<|reserved_special_token_207|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128216, + "content": "<|reserved_special_token_208|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128217, + "content": "<|reserved_special_token_209|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128218, + "content": "<|reserved_special_token_210|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128219, + "content": "<|reserved_special_token_211|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128220, + "content": "<|reserved_special_token_212|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128221, + "content": "<|reserved_special_token_213|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128222, + "content": "<|reserved_special_token_214|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128223, + "content": "<|reserved_special_token_215|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128224, + "content": "<|reserved_special_token_216|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128225, + "content": "<|reserved_special_token_217|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128226, + "content": "<|reserved_special_token_218|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128227, + "content": "<|reserved_special_token_219|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128228, + "content": "<|reserved_special_token_220|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128229, + "content": "<|reserved_special_token_221|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128230, + "content": "<|reserved_special_token_222|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128231, + "content": "<|reserved_special_token_223|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128232, + "content": "<|reserved_special_token_224|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128233, + "content": "<|reserved_special_token_225|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128234, + "content": "<|reserved_special_token_226|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128235, + "content": "<|reserved_special_token_227|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128236, + "content": "<|reserved_special_token_228|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128237, + "content": "<|reserved_special_token_229|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128238, + "content": "<|reserved_special_token_230|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128239, + "content": "<|reserved_special_token_231|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128240, + "content": "<|reserved_special_token_232|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128241, + "content": "<|reserved_special_token_233|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128242, + "content": "<|reserved_special_token_234|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128243, + "content": "<|reserved_special_token_235|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128244, + "content": "<|reserved_special_token_236|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128245, + "content": "<|reserved_special_token_237|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128246, + "content": "<|reserved_special_token_238|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128247, + "content": "<|reserved_special_token_239|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128248, + "content": "<|reserved_special_token_240|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128249, + "content": "<|reserved_special_token_241|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128250, + "content": "<|reserved_special_token_242|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128251, + "content": "<|reserved_special_token_243|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128252, + "content": "<|reserved_special_token_244|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128253, + "content": "<|reserved_special_token_245|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128254, + "content": "<|reserved_special_token_246|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128255, + "content": "<|reserved_special_token_247|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": false + } + ] + }, + "post_processor": { + "type": "Sequence", + "processors": [ + { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": false, + "use_regex": true + }, + { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 1 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "<|begin_of_text|>": { + "id": "<|begin_of_text|>", + "ids": [ + 128000 + ], + "tokens": [ + "<|begin_of_text|>" + ] + } + } + } + ] + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "!": 0, + "\"": 1, + "#": 2, + "$": 3, + "%": 4, + "&": 5, + "'": 6, + "(": 7, + ")": 8, + "*": 9, + "+": 10, + ",": 11, + "-": 12, + ".": 13, + "/": 14, + "0": 15, + "1": 16, + "2": 17, + "3": 18, + "4": 19, + "5": 20, + "6": 21, + "7": 22, + "8": 23, + "9": 24, + ":": 25, + ";": 26, + "<": 27, + "=": 28, + ">": 29, + "?": 30, + "@": 31, + "A": 32, + "B": 33, + "C": 34, + "D": 35, + "E": 36, + "F": 37, + "G": 38, + "H": 39, + "I": 40, + "J": 41, + "K": 42, + "L": 43, + "M": 44, + "N": 45, + "O": 46, + "P": 47, + "Q": 48, + "R": 49, + "S": 50, + "T": 51, + "U": 52, + "V": 53, + "W": 54, + "X": 55, + "Y": 56, + "Z": 57, + "[": 58, + "\\": 59, + "]": 60, + "^": 61, + "_": 62, + "`": 63, + "a": 64, + "b": 65, + "c": 66, + "d": 67, + "e": 68, + "f": 69, + "g": 70, + "h": 71, + "i": 72, + "j": 73, + "k": 74, + "l": 75, + "m": 76, + "n": 77, + "o": 78, + "p": 79, + "q": 80, + "r": 81, + "s": 82, + "t": 83, + "u": 84, + "v": 85, + "w": 86, + "x": 87, + "y": 88, + "z": 89, + "{": 90, + "|": 91, + "}": 92, + "~": 93, + "¡": 94, + "¢": 95, + "£": 96, + "¤": 97, + "¥": 98, + "¦": 99, + "§": 100, + "¨": 101, + "©": 102, + "ª": 103, + "«": 104, + "¬": 105, + "®": 106, + "¯": 107, + "°": 108, + "±": 109, + "²": 110, + "³": 111, + "´": 112, + "µ": 113, + "¶": 114, + "·": 115, + "¸": 116, + "¹": 117, + "º": 118, + "»": 119, + "¼": 120, + "½": 121, + "¾": 122, + "¿": 123, + "À": 124, + "Á": 125, + "Â": 126, + "Ã": 127, + "Ä": 128, + "Å": 129, + "Æ": 130, + "Ç": 131, + "È": 132, + "É": 133, + "Ê": 134, + "Ë": 135, + "Ì": 136, + "Í": 137, + "Î": 138, + "Ï": 139, + "Ð": 140, + "Ñ": 141, + "Ò": 142, + "Ó": 143, + "Ô": 144, + "Õ": 145, + "Ö": 146, + "×": 147, + "Ø": 148, + "Ù": 149, + "Ú": 150, + "Û": 151, + "Ü": 152, + "Ý": 153, + "Þ": 154, + "ß": 155, + "à": 156, + "á": 157, + "â": 158, + "ã": 159, + "ä": 160, + "å": 161, + "æ": 162, + "ç": 163, + "è": 164, + "é": 165, + "ê": 166, + "ë": 167, + "ì": 168, + "í": 169, + "î": 170, + "ï": 171, + "ð": 172, + "ñ": 173, + "ò": 174, + "ó": 175, + "ô": 176, + "õ": 177, + "ö": 178, + "÷": 179, + "ø": 180, + "ù": 181, + "ú": 182, + "û": 183, + "ü": 184, + "ý": 185, + "þ": 186, + "ÿ": 187, + "Ā": 188, + "ā": 189, + "Ă": 190, + "ă": 191, + "Ą": 192, + "ą": 193, + "Ć": 194, + "ć": 195, + "Ĉ": 196, + "ĉ": 197, + "Ċ": 198, + "ċ": 199, + "Č": 200, + "č": 201, + "Ď": 202, + "ď": 203, + "Đ": 204, + "đ": 205, + "Ē": 206, + "ē": 207, + "Ĕ": 208, + "ĕ": 209, + "Ė": 210, + "ė": 211, + "Ę": 212, + "ę": 213, + "Ě": 214, + "ě": 215, + "Ĝ": 216, + "ĝ": 217, + "Ğ": 218, + "ğ": 219, + "Ġ": 220, + "ġ": 221, + "Ģ": 222, + "ģ": 223, + "Ĥ": 224, + "ĥ": 225, + "Ħ": 226, + "ħ": 227, + "Ĩ": 228, + "ĩ": 229, + "Ī": 230, + "ī": 231, + "Ĭ": 232, + "ĭ": 233, + "Į": 234, + "į": 235, + "İ": 236, + "ı": 237, + "IJ": 238, + "ij": 239, + "Ĵ": 240, + "ĵ": 241, + "Ķ": 242, + "ķ": 243, + "ĸ": 244, + "Ĺ": 245, + "ĺ": 246, + "Ļ": 247, + "ļ": 248, + "Ľ": 249, + "ľ": 250, + "Ŀ": 251, + "ŀ": 252, + "Ł": 253, + "ł": 254, + "Ń": 255, + "ĠĠ": 256, + "ĠĠĠĠ": 257, + "in": 258, + "Ġt": 259, + "ĠĠĠĠĠĠĠĠ": 260, + "er": 261, + "ĠĠĠ": 262, + "on": 263, + "Ġa": 264, + "re": 265, + "at": 266, + "st": 267, + "en": 268, + "or": 269, + "Ġth": 270, + "ĊĊ": 271, + "Ġc": 272, + "le": 273, + "Ġs": 274, + "it": 275, + "an": 276, + "ar": 277, + "al": 278, + "Ġthe": 279, + ";Ċ": 280, + "Ġp": 281, + "Ġf": 282, + "ou": 283, + "Ġ=": 284, + "is": 285, + "ĠĠĠĠĠĠĠ": 286, + "ing": 287, + "es": 288, + "Ġw": 289, + "ion": 290, + "ed": 291, + "ic": 292, + "Ġb": 293, + "Ġd": 294, + "et": 295, + "Ġm": 296, + "Ġo": 297, + "ĉĉ": 298, + "ro": 299, + "as": 300, + "el": 301, + "ct": 302, + "nd": 303, + "Ġin": 304, + "Ġh": 305, + "ent": 306, + "id": 307, + "Ġn": 308, + "am": 309, + "ĠĠĠĠĠĠĠĠĠĠĠ": 310, + "Ġto": 311, + "Ġre": 312, + "--": 313, + "Ġ{": 314, + "Ġof": 315, + "om": 316, + ");Ċ": 317, + "im": 318, + "čĊ": 319, + "Ġ(": 320, + "il": 321, + "//": 322, + "Ġand": 323, + "ur": 324, + "se": 325, + "Ġl": 326, + "ex": 327, + "ĠS": 328, + "ad": 329, + "Ġ\"": 330, + "ch": 331, + "ut": 332, + "if": 333, + "**": 334, + "Ġ}": 335, + "em": 336, + "ol": 337, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 338, + "th": 339, + ")Ċ": 340, + "Ġ{Ċ": 341, + "Ġg": 342, + "ig": 343, + "iv": 344, + ",Ċ": 345, + "ce": 346, + "od": 347, + "Ġv": 348, + "ate": 349, + "ĠT": 350, + "ag": 351, + "ay": 352, + "Ġ*": 353, + "ot": 354, + "us": 355, + "ĠC": 356, + "Ġst": 357, + "ĠI": 358, + "un": 359, + "ul": 360, + "ue": 361, + "ĠA": 362, + "ow": 363, + "Ġ'": 364, + "ew": 365, + "Ġ<": 366, + "ation": 367, + "()": 368, + "Ġfor": 369, + "ab": 370, + "ort": 371, + "um": 372, + "ame": 373, + "Ġis": 374, + "pe": 375, + "tr": 376, + "ck": 377, + "âĢ": 378, + "Ġy": 379, + "ist": 380, + "----": 381, + ".ĊĊ": 382, + "he": 383, + "Ġe": 384, + "lo": 385, + "ĠM": 386, + "Ġbe": 387, + "ers": 388, + "Ġon": 389, + "Ġcon": 390, + "ap": 391, + "ub": 392, + "ĠP": 393, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 394, + "ass": 395, + "int": 396, + ">Ċ": 397, + "ly": 398, + "urn": 399, + "Ġ$": 400, + ";ĊĊ": 401, + "av": 402, + "port": 403, + "ir": 404, + "->": 405, + "nt": 406, + "ction": 407, + "end": 408, + "Ġde": 409, + "00": 410, + "ith": 411, + "out": 412, + "turn": 413, + "our": 414, + "ĠĠĠĠĠ": 415, + "lic": 416, + "res": 417, + "pt": 418, + "==": 419, + "Ġthis": 420, + "Ġwh": 421, + "Ġif": 422, + "ĠD": 423, + "ver": 424, + "age": 425, + "ĠB": 426, + "ht": 427, + "ext": 428, + "=\"": 429, + "Ġthat": 430, + "****": 431, + "ĠR": 432, + "Ġit": 433, + "ess": 434, + "ĠF": 435, + "Ġr": 436, + "os": 437, + "and": 438, + "Ġas": 439, + "ect": 440, + "ke": 441, + "rom": 442, + "Ġ//": 443, + "con": 444, + "ĠL": 445, + "(\"": 446, + "qu": 447, + "lass": 448, + "Ġwith": 449, + "iz": 450, + "de": 451, + "ĠN": 452, + "Ġal": 453, + "op": 454, + "up": 455, + "get": 456, + "Ġ}Ċ": 457, + "ile": 458, + "Ġan": 459, + "ata": 460, + "ore": 461, + "ri": 462, + "Ġpro": 463, + ";čĊ": 464, + "ĉĉĉĉ": 465, + "ter": 466, + "ain": 467, + "ĠW": 468, + "ĠE": 469, + "Ġcom": 470, + "Ġreturn": 471, + "art": 472, + "ĠH": 473, + "ack": 474, + "import": 475, + "ublic": 476, + "Ġor": 477, + "est": 478, + "ment": 479, + "ĠG": 480, + "able": 481, + "Ġ-": 482, + "ine": 483, + "ill": 484, + "ind": 485, + "ere": 486, + "::": 487, + "ity": 488, + "Ġ+": 489, + "Ġtr": 490, + "elf": 491, + "ight": 492, + "('": 493, + "orm": 494, + "ult": 495, + "str": 496, + "..": 497, + "\",": 498, + "Ġyou": 499, + "ype": 500, + "pl": 501, + "Ġnew": 502, + "Ġj": 503, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 504, + "Ġfrom": 505, + "Ġex": 506, + "ĠO": 507, + "20": 508, + "ld": 509, + "Ġ[": 510, + "oc": 511, + ":Ċ": 512, + "Ġse": 513, + "Ġle": 514, + "--------": 515, + ".s": 516, + "{Ċ": 517, + "',": 518, + "ant": 519, + "Ġat": 520, + "ase": 521, + ".c": 522, + "Ġch": 523, + "": 524, + "ave": 525, + "ang": 526, + "Ġare": 527, + "Ġint": 528, + "âĢĻ": 529, + "_t": 530, + "ert": 531, + "ial": 532, + "act": 533, + "}Ċ": 534, + "ive": 535, + "ode": 536, + "ost": 537, + "Ġclass": 538, + "Ġnot": 539, + "og": 540, + "ord": 541, + "alue": 542, + "all": 543, + "ff": 544, + "();Ċ": 545, + "ont": 546, + "ime": 547, + "are": 548, + "ĠU": 549, + "Ġpr": 550, + "Ġ:": 551, + "ies": 552, + "ize": 553, + "ure": 554, + "Ġby": 555, + "ire": 556, + "Ġ}ĊĊ": 557, + ".p": 558, + "Ġsh": 559, + "ice": 560, + "ast": 561, + "ption": 562, + "tring": 563, + "ok": 564, + "__": 565, + "cl": 566, + "##": 567, + "Ġhe": 568, + "ard": 569, + ").": 570, + "Ġ@": 571, + "iew": 572, + "ĉĉĉ": 573, + "Ġwas": 574, + "ip": 575, + "this": 576, + "Ġu": 577, + "ĠThe": 578, + "ide": 579, + "ace": 580, + "ib": 581, + "ac": 582, + "rou": 583, + "Ġwe": 584, + "ject": 585, + "Ġpublic": 586, + "ak": 587, + "ve": 588, + "ath": 589, + "oid": 590, + "Ġ=>": 591, + "ust": 592, + "que": 593, + "Ġres": 594, + "))": 595, + "'s": 596, + "Ġk": 597, + "ans": 598, + "yst": 599, + "unction": 600, + "********": 601, + "Ġi": 602, + "Ġus": 603, + "pp": 604, + "10": 605, + "one": 606, + "ail": 607, + "====": 608, + "name": 609, + "Ġstr": 610, + "Ġ/": 611, + "Ġ&": 612, + "ach": 613, + "div": 614, + "ystem": 615, + "ell": 616, + "Ġhave": 617, + "err": 618, + "ould": 619, + "ull": 620, + "pon": 621, + "ĠJ": 622, + "_p": 623, + "Ġ==": 624, + "ign": 625, + "St": 626, + ".Ċ": 627, + "Ġpl": 628, + ");ĊĊ": 629, + "form": 630, + "put": 631, + "ount": 632, + "}ĊĊ": 633, + "dd": 634, + "ite": 635, + "Ġget": 636, + "rr": 637, + "ome": 638, + "ĠâĢ": 639, + "aram": 640, + "cc": 641, + "Ġ*/": 642, + "ER": 643, + "In": 644, + "les": 645, + "_s": 646, + "ong": 647, + "ie": 648, + "Ġcan": 649, + "ĠV": 650, + "erv": 651, + "pr": 652, + "Ġun": 653, + "row": 654, + "ber": 655, + "Ġdo": 656, + "ll": 657, + "Ġel": 658, + "Ġself": 659, + "ated": 660, + "ary": 661, + "Ġ.": 662, + "']": 663, + "ud": 664, + "Ġen": 665, + "ĠTh": 666, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 667, + "te": 668, + "_c": 669, + "uct": 670, + "Ġab": 671, + "ork": 672, + ".get": 673, + "Ġ#": 674, + "aw": 675, + "ress": 676, + "ob": 677, + "Name": 678, + "201": 679, + "app": 680, + "['": 681, + "Ġall": 682, + "ory": 683, + "ition": 684, + "ance": 685, + "ear": 686, + "Ġcont": 687, + "vent": 688, + "ia": 689, + "Ġwill": 690, + "IN": 691, + "ĠĠĠĠĠĠĠĠĠ": 692, + "return": 693, + "Ġ": 694, + "data": 695, + ")ĊĊ": 696, + "Re": 697, + "ple": 698, + "ild": 699, + "ther": 700, + "Ġyour": 701, + "\"Ċ": 702, + "($": 703, + "Ġout": 704, + "),": 705, + "Ġhas": 706, + "String": 707, + "so": 708, + "Ġup": 709, + "ax": 710, + "Ġdef": 711, + "Ġbo": 712, + "ge": 713, + "alse": 714, + "ON": 715, + "per": 716, + "12": 717, + "ich": 718, + "Ġbut": 719, + "ĠĊ": 720, + "Ġ_": 721, + "_m": 722, + "add": 723, + "quest": 724, + "odel": 725, + "self": 726, + "ery": 727, + "ft": 728, + "ens": 729, + "////": 730, + "ake": 731, + ".C": 732, + "Ġgo": 733, + "Ġfunction": 734, + "ĠK": 735, + "ivate": 736, + "Ġim": 737, + "Ġconst": 738, + ".t": 739, + "Ġ*/Ċ": 740, + ");čĊ": 741, + "Ġvoid": 742, + "Ġset": 743, + "ĠSystem": 744, + "cri": 745, + "()Ċ": 746, + "li": 747, + "ĉif": 748, + ".m": 749, + "ally": 750, + "set": 751, + "ep": 752, + "âĢĻs": 753, + "bo": 754, + "def": 755, + "',Ċ": 756, + "Ġme": 757, + "Ġ!": 758, + "atch": 759, + "\">": 760, + "\",Ċ": 761, + "ec": 762, + "ĠIn": 763, + "ph": 764, + "Ġ|": 765, + "_f": 766, + "Ġvar": 767, + "ence": 768, + "Id": 769, + "ree": 770, + "ink": 771, + "lect": 772, + "ug": 773, + "eth": 774, + "Ġelse": 775, + "----------------": 776, + "19": 777, + "cont": 778, + "Ġso": 779, + "atic": 780, + "Ġlo": 781, + "pro": 782, + "ton": 783, + "ss": 784, + "own": 785, + "abel": 786, + "oint": 787, + "ous": 788, + "eld": 789, + "ST": 790, + "The": 791, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 792, + "RE": 793, + "\":": 794, + "olor": 795, + "tp": 796, + "eg": 797, + "key": 798, + "ude": 799, + "ĠSt": 800, + "ound": 801, + "Ġar": 802, + "\");Ċ": 803, + "ener": 804, + "ser": 805, + "11": 806, + "bject": 807, + "essage": 808, + "fer": 809, + "Ġmore": 810, + "ations": 811, + "ents": 812, + "Ġhis": 813, + "Ġthey": 814, + ".S": 815, + "ĠY": 816, + "use": 817, + "ne": 818, + "ish": 819, + "old": 820, + "_d": 821, + "io": 822, + "ield": 823, + "Ġper": 824, + "Cont": 825, + "ings": 826, + "####": 827, + "Ġdata": 828, + "Ġsa": 829, + "ef": 830, + "fo": 831, + "Ġone": 832, + "eng": 833, + "Ġdis": 834, + "AT": 835, + "Ġname": 836, + "Ġtrue": 837, + "val": 838, + "led": 839, + ".f": 840, + "Ġne": 841, + "Ġend": 842, + "32": 843, + ".T": 844, + "16": 845, + "cre": 846, + "ark": 847, + "log": 848, + "Ex": 849, + "error": 850, + "_id": 851, + "urre": 852, + "ange": 853, + "Ġnull": 854, + "rray": 855, + "Ġmy": 856, + "pan": 857, + "ict": 858, + "ator": 859, + "View": 860, + "List": 861, + "ĉreturn": 862, + "âĢĿ": 863, + "Ġpre": 864, + "Ġx": 865, + "clude": 866, + "arg": 867, + "15": 868, + "ov": 869, + ".h": 870, + "Ġ>": 871, + "Ġtheir": 872, + "')": 873, + "irst": 874, + "ick": 875, + "gh": 876, + "LE": 877, + "OR": 878, + "Ġprivate": 879, + "tem": 880, + "čĊčĊ": 881, + "user": 882, + "Ġ)": 883, + "com": 884, + ".A": 885, + "\";Ċ": 886, + "Ġid": 887, + "read": 888, + "Ġwho": 889, + "_b": 890, + "\">Ċ": 891, + "Ġtime": 892, + "Ġman": 893, + "ry": 894, + "========": 895, + "roup": 896, + "rop": 897, + "public": 898, + "vel": 899, + "umber": 900, + "ble": 901, + "Ġwhich": 902, + "****************": 903, + "Ġany": 904, + "Ġfalse": 905, + "we": 906, + "Ġvalue": 907, + "Ġli": 908, + "\")": 909, + "nder": 910, + "gr": 911, + "Ġno": 912, + "param": 913, + "25": 914, + "fig": 915, + ".com": 916, + "Ġapp": 917, + "_l": 918, + "ions": 919, + ".D": 920, + "ĠCh": 921, + "Ġabout": 922, + "Ġadd": 923, + "Ġsu": 924, + "Ġstring": 925, + "ID": 926, + "Ġover": 927, + "string": 928, + ".l": 929, + "ource": 930, + "000": 931, + "_C": 932, + "]Ċ": 933, + "Ġqu": 934, + "ĠString": 935, + "ca": 936, + "SE": 937, + "Ġro": 938, + "sh": 939, + "ual": 940, + "Type": 941, + "son": 942, + "new": 943, + "ern": 944, + "Ġag": 945, + "AR": 946, + "];Ċ": 947, + "].": 948, + "Ġ?": 949, + "ical": 950, + "Ġdes": 951, + "uth": 952, + "ix": 953, + "ays": 954, + "Ġtype": 955, + "'t": 956, + "ault": 957, + "Ġinter": 958, + "var": 959, + ".b": 960, + "Ġpart": 961, + ".d": 962, + "urrent": 963, + "IT": 964, + "EN": 965, + "30": 966, + "enc": 967, + "(f": 968, + "ra": 969, + "value": 970, + "cho": 971, + "18": 972, + "utton": 973, + "ose": 974, + "14": 975, + "Ġ!=": 976, + "ater": 977, + "é": 978, + "reate": 979, + "oll": 980, + "pos": 981, + "yle": 982, + "ng": 983, + "AL": 984, + "using": 985, + "ames": 986, + "Ġ{čĊ": 987, + "ates": 988, + "ely": 989, + "Ġwork": 990, + "Ġem": 991, + "inal": 992, + "Ġsp": 993, + "Ġwhen": 994, + ".set": 995, + "ĠĠĠĠĠĠ": 996, + "):Ċ": 997, + "to": 998, + "quire": 999, + "indow": 1000, + "lement": 1001, + "pect": 1002, + "ash": 1003, + "[i": 1004, + "Ġuse": 1005, + ".F": 1006, + "pec": 1007, + "Ġad": 1008, + "ove": 1009, + "ception": 1010, + "ength": 1011, + "include": 1012, + "ader": 1013, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1014, + "atus": 1015, + "Th": 1016, + "itle": 1017, + "rit": 1018, + "void": 1019, + "().": 1020, + "(Ċ": 1021, + "Ġoff": 1022, + "Ġother": 1023, + "Ġ&&": 1024, + "';Ċ": 1025, + "ms": 1026, + "Ġbeen": 1027, + "Ġte": 1028, + "ml": 1029, + "co": 1030, + "nc": 1031, + "13": 1032, + "ervice": 1033, + "Ġ%": 1034, + "**Ċ": 1035, + "ann": 1036, + "ade": 1037, + "ĊĊĊĊ": 1038, + "lock": 1039, + "const": 1040, + "100": 1041, + "ponse": 1042, + "Ġsup": 1043, + "++": 1044, + "date": 1045, + "Ġacc": 1046, + "Ġhad": 1047, + "Ġbu": 1048, + "200": 1049, + "ĠRe": 1050, + "Ġwere": 1051, + "Ġfile": 1052, + "Ġwould": 1053, + "ĠâĢľ": 1054, + "ven": 1055, + "iss": 1056, + "Ġour": 1057, + "class": 1058, + "raw": 1059, + "Ġyear": 1060, + "Data": 1061, + "Ġval": 1062, + "Ġsome": 1063, + "fter": 1064, + "ys": 1065, + "Ġ///": 1066, + "round": 1067, + "view": 1068, + "Ġpe": 1069, + "Ġthere": 1070, + "Ġsaid": 1071, + "du": 1072, + "of": 1073, + "line": 1074, + "/*": 1075, + "duct": 1076, + "Ġher": 1077, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1078, + "Res": 1079, + "Ġco": 1080, + "Ġcomm": 1081, + "ise": 1082, + "min": 1083, + "ĠĠĠĠĊ": 1084, + "#include": 1085, + "ethod": 1086, + ".P": 1087, + "ute": 1088, + "Ġass": 1089, + "Int": 1090, + "ask": 1091, + "loc": 1092, + "Ġlike": 1093, + "ody": 1094, + "Ġlet": 1095, + "load": 1096, + "Ġam": 1097, + "rol": 1098, + "Ġgr": 1099, + "yp": 1100, + "Ġalso": 1101, + "ĠIt": 1102, + "url": 1103, + "ific": 1104, + "ors": 1105, + "_P": 1106, + "_n": 1107, + "igh": 1108, + "Ġthan": 1109, + "Com": 1110, + "AN": 1111, + "UL": 1112, + "ating": 1113, + "17": 1114, + "ĠThis": 1115, + "ref": 1116, + "_S": 1117, + "Ġstatic": 1118, + "roll": 1119, + "Ġjust": 1120, + "Ġresult": 1121, + "ian": 1122, + "idth": 1123, + "Ġthem": 1124, + "));Ċ": 1125, + "der": 1126, + "reak": 1127, + "Con": 1128, + "://": 1129, + "ule": 1130, + "...": 1131, + "arch": 1132, + "ement": 1133, + "Ġ<<": 1134, + "50": 1135, + "ush": 1136, + "ense": 1137, + "arr": 1138, + "Ġinto": 1139, + "cess": 1140, + "amp": 1141, + "ied": 1142, + "ument": 1143, + "Ġ\\": 1144, + "],": 1145, + "wo": 1146, + "als": 1147, + "Ġwhat": 1148, + "anc": 1149, + "Value": 1150, + "='": 1151, + "olum": 1152, + "Ġpos": 1153, + "ages": 1154, + "ayer": 1155, + "Ġsc": 1156, + "ues": 1157, + "\")Ċ": 1158, + "_T": 1159, + "Ġlist": 1160, + "(s": 1161, + "Ġcase": 1162, + "Ch": 1163, + "ĉĉĉĉĉ": 1164, + "////////": 1165, + "ponent": 1166, + "Ġz": 1167, + "Ġkn": 1168, + "let": 1169, + "DE": 1170, + "red": 1171, + "Ġfe": 1172, + "Ġ},Ċ": 1173, + "Ġ,": 1174, + "(t": 1175, + "Ġfirst": 1176, + "');Ċ": 1177, + "word": 1178, + "Ġimport": 1179, + "Ġact": 1180, + "Ġchar": 1181, + "CT": 1182, + "ĠTr": 1183, + "ople": 1184, + "={": 1185, + "ĉf": 1186, + "24": 1187, + "ient": 1188, + "cent": 1189, + ".j": 1190, + "lection": 1191, + "))Ċ": 1192, + "Ġonly": 1193, + "Ġprint": 1194, + "mer": 1195, + ".W": 1196, + "ock": 1197, + "Ġ--": 1198, + "Text": 1199, + "Ġop": 1200, + "ank": 1201, + "Ġits": 1202, + "Ġback": 1203, + "[\"": 1204, + "Ġneed": 1205, + "Ġcl": 1206, + "Ġsub": 1207, + "Ġla": 1208, + "((": 1209, + ".\"": 1210, + "Object": 1211, + "Ġstart": 1212, + "file": 1213, + "(self": 1214, + "ner": 1215, + "ey": 1216, + "Ġuser": 1217, + "Ġent": 1218, + "ĠCom": 1219, + "its": 1220, + "ĠCon": 1221, + "ouble": 1222, + "ower": 1223, + "item": 1224, + "very": 1225, + "ĠWe": 1226, + "64": 1227, + "lick": 1228, + "ĠQ": 1229, + "php": 1230, + "ttp": 1231, + "':": 1232, + "ics": 1233, + "Ġunder": 1234, + "Ġ*Ċ": 1235, + ".L": 1236, + ");": 1237, + "ices": 1238, + "Ġreg": 1239, + ")čĊ": 1240, + "ĉpublic": 1241, + "SS": 1242, + "Ġthen": 1243, + "reat": 1244, + "ious": 1245, + ".G": 1246, + "ek": 1247, + "irect": 1248, + "heck": 1249, + "cript": 1250, + "ning": 1251, + "ĠUn": 1252, + "Ġmay": 1253, + "ĠWh": 1254, + "Bo": 1255, + "Item": 1256, + "struct": 1257, + ".st": 1258, + "ream": 1259, + "ible": 1260, + "loat": 1261, + "Ġorg": 1262, + "und": 1263, + "sum": 1264, + "_in": 1265, + "../": 1266, + "_M": 1267, + "Ġhow": 1268, + "rite": 1269, + "'Ċ": 1270, + "To": 1271, + "40": 1272, + "ww": 1273, + "Ġpeople": 1274, + "index": 1275, + ".n": 1276, + "http": 1277, + "(m": 1278, + "ector": 1279, + "Ġind": 1280, + "Ġjav": 1281, + "],Ċ": 1282, + "ĠHe": 1283, + "_st": 1284, + "ful": 1285, + "ole": 1286, + "){Ċ": 1287, + "Ġshould": 1288, + "opy": 1289, + "elp": 1290, + "ier": 1291, + "_name": 1292, + "erson": 1293, + "ION": 1294, + "ote": 1295, + "Ġtest": 1296, + "Ġbet": 1297, + "rror": 1298, + "ular": 1299, + "ãĢ": 1300, + "ĠÐ": 1301, + "bs": 1302, + "ting": 1303, + "Ġmake": 1304, + "Tr": 1305, + "Ġafter": 1306, + "arget": 1307, + "RO": 1308, + "olumn": 1309, + "rc": 1310, + "_re": 1311, + "define": 1312, + "22": 1313, + "Ġright": 1314, + "right": 1315, + "day": 1316, + "Ġlong": 1317, + "[]": 1318, + "(p": 1319, + "td": 1320, + "cond": 1321, + "ĠPro": 1322, + "Ġrem": 1323, + "ptions": 1324, + "vid": 1325, + ".g": 1326, + "Ġext": 1327, + "Ġ__": 1328, + "')Ċ": 1329, + "pace": 1330, + "mp": 1331, + "Ġmin": 1332, + "stance": 1333, + "air": 1334, + "action": 1335, + "wh": 1336, + "type": 1337, + "util": 1338, + "ait": 1339, + "": 1340, + "IC": 1341, + "text": 1342, + "Ġph": 1343, + "Ġfl": 1344, + ".M": 1345, + "ccess": 1346, + "br": 1347, + "fore": 1348, + "ersion": 1349, + "),Ċ": 1350, + ".re": 1351, + "ateg": 1352, + "Ġloc": 1353, + "ins": 1354, + "-s": 1355, + "trib": 1356, + "ĠInt": 1357, + "Ġarray": 1358, + ",\"": 1359, + "Pro": 1360, + "(c": 1361, + "ession": 1362, + ">ĊĊ": 1363, + "Ġshe": 1364, + "\"]": 1365, + "aph": 1366, + "Ġexp": 1367, + "erty": 1368, + "ĠSe": 1369, + "Ġpar": 1370, + "unc": 1371, + "ET": 1372, + "Ġread": 1373, + "print": 1374, + "Ġrel": 1375, + "Ġform": 1376, + "Ġdr": 1377, + "Exception": 1378, + "input": 1379, + "Ġtrans": 1380, + "########": 1381, + "order": 1382, + "By": 1383, + "Ġaw": 1384, + "ities": 1385, + "uff": 1386, + "play": 1387, + ".add": 1388, + "ĠâĢĵ": 1389, + "Ġwant": 1390, + "Ġcomp": 1391, + "ments": 1392, + "Ġ||": 1393, + "az": 1394, + "be": 1395, + "Ġnumber": 1396, + "Ġrequire": 1397, + "ĠEx": 1398, + "60": 1399, + "Ġcol": 1400, + "Ġkey": 1401, + "ember": 1402, + "Ġtwo": 1403, + "Ġsize": 1404, + "Ġwhere": 1405, + "UT": 1406, + "result": 1407, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1408, + "ough": 1409, + "orld": 1410, + "ood": 1411, + "uch": 1412, + "ative": 1413, + "ger": 1414, + "arent": 1415, + "Ġ/*": 1416, + "Ġarg": 1417, + "Ġwhile": 1418, + "23": 1419, + "(this": 1420, + "Ġrec": 1421, + "Ġdif": 1422, + "State": 1423, + "Ġspec": 1424, + "ride": 1425, + "_F": 1426, + "Ġlook": 1427, + "AM": 1428, + "ility": 1429, + "eter": 1430, + "âĢĻt": 1431, + "ĊĊĊ": 1432, + "ayout": 1433, + "--------------------------------": 1434, + "ager": 1435, + "Ġcould": 1436, + "Ġbr": 1437, + "ends": 1438, + "ures": 1439, + "Ġknow": 1440, + "ets": 1441, + "ĠIf": 1442, + "ĠSh": 1443, + ".w": 1444, + "back": 1445, + "Ġser": 1446, + "Ġ+=": 1447, + "Ġfr": 1448, + "());Ċ": 1449, + "Ġhand": 1450, + "Ind": 1451, + "ULL": 1452, + "Im": 1453, + "();ĊĊ": 1454, + "Ġmost": 1455, + "Ġtry": 1456, + "Ġnow": 1457, + "rough": 1458, + ">čĊ": 1459, + "ackage": 1460, + "Ġhim": 1461, + "._": 1462, + "ify": 1463, + "Ġbreak": 1464, + "Ġ);Ċ": 1465, + "ren": 1466, + "#define": 1467, + "itt": 1468, + "Ġap": 1469, + "ĉc": 1470, + "(n": 1471, + "ĠYou": 1472, + ":ĊĊ": 1473, + "-m": 1474, + "Ġevery": 1475, + "ustom": 1476, + "lient": 1477, + "ocument": 1478, + "cription": 1479, + "Error": 1480, + "-b": 1481, + "о": 1482, + "][": 1483, + "99": 1484, + "trans": 1485, + "Ġpoint": 1486, + "Ġstd": 1487, + "Ġfil": 1488, + "Time": 1489, + "80": 1490, + "Ġmod": 1491, + "Ġ->": 1492, + "Ġerror": 1493, + "ah": 1494, + "Ġtext": 1495, + "roller": 1496, + "lose": 1497, + "ql": 1498, + "Ġpol": 1499, + ">": 1500, + "Ġshow": 1501, + "User": 1502, + "ased": 1503, + "Ġ{ĊĊ": 1504, + "Ġfind": 1505, + "а": 1506, + "ED": 1507, + "span": 1508, + "enu": 1509, + "Ġcurrent": 1510, + "Ġused": 1511, + "cept": 1512, + "clud": 1513, + "Ġplay": 1514, + "Ġlog": 1515, + "ution": 1516, + "fl": 1517, + "Ġsee": 1518, + "indows": 1519, + "Ġhelp": 1520, + "Ġthese": 1521, + "Ġpass": 1522, + "Ġdown": 1523, + "Ġeven": 1524, + "ason": 1525, + "uild": 1526, + "from": 1527, + "(d": 1528, + "Ġbl": 1529, + "label": 1530, + "else": 1531, + "е": 1532, + "Ġ(!": 1533, + "ized": 1534, + "(),": 1535, + "Ġob": 1536, + "Ġitem": 1537, + "ump": 1538, + "UR": 1539, + "orn": 1540, + "Ġdon": 1541, + "Se": 1542, + "man": 1543, + "27": 1544, + "ample": 1545, + "tn": 1546, + "================": 1547, + "He": 1548, + "gram": 1549, + "Ġdid": 1550, + "wn": 1551, + "_h": 1552, + "iver": 1553, + "Ġsm": 1554, + "Ġthrough": 1555, + "ĠAn": 1556, + "che": 1557, + "Ġinv": 1558, + "ouse": 1559, + "Ġes": 1560, + "ĠNew": 1561, + "export": 1562, + "mary": 1563, + "uto": 1564, + "ler": 1565, + "Ġlast": 1566, + "Ġevent": 1567, + "try": 1568, + "ï¼": 1569, + "ily": 1570, + "igned": 1571, + "ines": 1572, + "ollow": 1573, + "icense": 1574, + "sole": 1575, + "lear": 1576, + "(int": 1577, + "Ġagain": 1578, + "Ġhigh": 1579, + "html": 1580, + "Index": 1581, + "uthor": 1582, + "Ġ/**Ċ": 1583, + "Ġline": 1584, + "Event": 1585, + "_D": 1586, + "Ġdoes": 1587, + "itial": 1588, + "Ġcr": 1589, + "ars": 1590, + "28": 1591, + "Ġtem": 1592, + "cause": 1593, + "face": 1594, + "Ġ`": 1595, + "_A": 1596, + "Button": 1597, + "ature": 1598, + "ected": 1599, + "ES": 1600, + "ister": 1601, + "ĉĊ": 1602, + "Ġbefore": 1603, + "ale": 1604, + "other": 1605, + "Ġbecause": 1606, + "roid": 1607, + "Ġed": 1608, + "ik": 1609, + "reg": 1610, + "ĠDe": 1611, + "Ġdist": 1612, + "},Ċ": 1613, + "Ġstate": 1614, + "Ġcons": 1615, + "rint": 1616, + "att": 1617, + "Ġhere": 1618, + "ined": 1619, + "Ġfinal": 1620, + "Ġ\"\"": 1621, + "Key": 1622, + "LO": 1623, + "Ġdel": 1624, + "pty": 1625, + "thing": 1626, + "26": 1627, + "ĠAnd": 1628, + "Ġrun": 1629, + "ĠX": 1630, + "ym": 1631, + ".app": 1632, + "Ġvery": 1633, + "ces": 1634, + "_N": 1635, + "ared": 1636, + "ward": 1637, + "list": 1638, + "ited": 1639, + "olog": 1640, + "itch": 1641, + "Box": 1642, + "ife": 1643, + "33": 1644, + "Ġac": 1645, + "Ġmodel": 1646, + "Ġmon": 1647, + "Ġway": 1648, + "lete": 1649, + "Ġcall": 1650, + "Ġatt": 1651, + "Ġcal": 1652, + "vert": 1653, + "Ġdec": 1654, + "lease": 1655, + "oun": 1656, + "Ġ});Ċ": 1657, + "fr": 1658, + "formation": 1659, + "etail": 1660, + "Ġnum": 1661, + "aj": 1662, + "query": 1663, + "Ġwell": 1664, + "Ġobject": 1665, + "ĠAs": 1666, + "Ġyears": 1667, + "Color": 1668, + "IS": 1669, + "Ġdefault": 1670, + "Wh": 1671, + "Ġins": 1672, + "aint": 1673, + "Ġjava": 1674, + "Ġsim": 1675, + "ĠAr": 1676, + "mon": 1677, + "til": 1678, + "();čĊ": 1679, + "):": 1680, + "Set": 1681, + "29": 1682, + "atter": 1683, + "Ġview": 1684, + "Ġpres": 1685, + "array": 1686, + "We": 1687, + "At": 1688, + "Ġbel": 1689, + "Ġmany": 1690, + "21": 1691, + "Man": 1692, + "ender": 1693, + "Ġbeing": 1694, + "Ġgood": 1695, + "ĉĉĉĉĉĉ": 1696, + "ational": 1697, + "ware": 1698, + ".log": 1699, + "{čĊ": 1700, + "Ġusing": 1701, + "_B": 1702, + "Ġ:=": 1703, + "_w": 1704, + "ists": 1705, + "lish": 1706, + "Ġstud": 1707, + "ĠAl": 1708, + "Ġgu": 1709, + "config": 1710, + "uring": 1711, + "time": 1712, + "oken": 1713, + "amespace": 1714, + "Ġrequest": 1715, + "Ġchild": 1716, + "ĠÃ": 1717, + "lob": 1718, + "Ġparam": 1719, + "Ġ}čĊ": 1720, + "01": 1721, + "Ġecho": 1722, + "function": 1723, + "********************************": 1724, + "ps": 1725, + "Element": 1726, + "alk": 1727, + "lication": 1728, + "by": 1729, + "Size": 1730, + "rawing": 1731, + "Ġperson": 1732, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1733, + "\\n": 1734, + "object": 1735, + "ince": 1736, + "En": 1737, + "File": 1738, + "uf": 1739, + "ffect": 1740, + "AC": 1741, + "Ġstyle": 1742, + "summary": 1743, + "Ġque": 1744, + "_r": 1745, + "Ġ($": 1746, + "Model": 1747, + "ident": 1748, + "Ġmethod": 1749, + "IL": 1750, + "ott": 1751, + "less": 1752, + "ING": 1753, + "Ġ()": 1754, + "Ġexpect": 1755, + "ync": 1756, + "package": 1757, + "35": 1758, + "urs": 1759, + "Ġprot": 1760, + "./": 1761, + "pre": 1762, + "Ġ)Ċ": 1763, + "ma": 1764, + "Ġsur": 1765, + "Ġfound": 1766, + "Info": 1767, + "par": 1768, + "imes": 1769, + ".e": 1770, + "ains": 1771, + "Ġpost": 1772, + "-d": 1773, + "45": 1774, + "olean": 1775, + "Ġsl": 1776, + "PE": 1777, + "Ġsuch": 1778, + "select": 1779, + "ainer": 1780, + "Ġthink": 1781, + "Ġdiffer": 1782, + ".r": 1783, + "/**Ċ": 1784, + "FF": 1785, + "ool": 1786, + "plate": 1787, + "qual": 1788, + "ĠFor": 1789, + "Ġmuch": 1790, + "uc": 1791, + "(new": 1792, + "odule": 1793, + "Ġsom": 1794, + "Ġhttp": 1795, + "ĠList": 1796, + "Ġcount": 1797, + "Ġinst": 1798, + "char": 1799, + "mit": 1800, + ".id": 1801, + "aking": 1802, + "Ġgener": 1803, + "px": 1804, + "vice": 1805, + "37": 1806, + "_data": 1807, + "ĠNULL": 1808, + "}čĊ": 1809, + "idd": 1810, + "ãĢĤ": 1811, + "Ġmed": 1812, + "org": 1813, + "ider": 1814, + "ache": 1815, + "work": 1816, + "Ġcheck": 1817, + "ween": 1818, + "Ġ((": 1819, + "the": 1820, + "ants": 1821, + "><": 1822, + ".B": 1823, + "-c": 1824, + "Ġopen": 1825, + "Ġest": 1826, + "ĠĠĠĠĠĠĠĠĊ": 1827, + "Ġnext": 1828, + "IM": 1829, + "ÑĤ": 1830, + "OT": 1831, + "ó": 1832, + "Ġfollow": 1833, + "content": 1834, + "ĠĠĠĠĠĠĠĠĠĠĠĠ": 1835, + "Ġinclud": 1836, + "HE": 1837, + "ĠRes": 1838, + "Ġhref": 1839, + "и": 1840, + "Ġcar": 1841, + "ypes": 1842, + "image": 1843, + "Un": 1844, + "Ġbool": 1845, + "AD": 1846, + "Ġgame": 1847, + ".Form": 1848, + "rows": 1849, + "*/": 1850, + "velop": 1851, + ".Drawing": 1852, + "Ġpath": 1853, + "ision": 1854, + "Ġeach": 1855, + "ĠPl": 1856, + "_type": 1857, + "Path": 1858, + "nection": 1859, + "Ġav": 1860, + "').": 1861, + "Ġsupport": 1862, + "ENT": 1863, + "rem": 1864, + "\").": 1865, + "Ġown": 1866, + "Ġcor": 1867, + "count": 1868, + "miss": 1869, + "ually": 1870, + "Ġmem": 1871, + "std": 1872, + "ience": 1873, + "search": 1874, + "\"ĊĊ": 1875, + "Form": 1876, + "Ġsex": 1877, + "ename": 1878, + "Ġsign": 1879, + "Ġet": 1880, + "ĠĠĠĠĠĠĠĠĠĠ": 1881, + "','": 1882, + "ĠApp": 1883, + "Ġthose": 1884, + "off": 1885, + "Ġerr": 1886, + "Ġsystem": 1887, + "Ġbest": 1888, + "code": 1889, + "Ġsame": 1890, + "Ġdi": 1891, + "uss": 1892, + "Ġcreate": 1893, + "ather": 1894, + "Array": 1895, + ".in": 1896, + "fe": 1897, + "Service": 1898, + "UN": 1899, + "ats": 1900, + "ĠZ": 1901, + "alth": 1902, + "Ġmade": 1903, + "true": 1904, + "AB": 1905, + "Ġmark": 1906, + "rid": 1907, + "ified": 1908, + ",čĊ": 1909, + "yn": 1910, + "press": 1911, + "Ġgroup": 1912, + "Ġfin": 1913, + "ĠLicense": 1914, + "Field": 1915, + "eger": 1916, + "Ġworld": 1917, + "iness": 1918, + "ty": 1919, + "Ġprocess": 1920, + "(b": 1921, + "Ġcre": 1922, + "arn": 1923, + "ives": 1924, + "Ġmain": 1925, + "ideo": 1926, + "36": 1927, + "_g": 1928, + "AG": 1929, + "valid": 1930, + "img": 1931, + "PI": 1932, + "Ġcolor": 1933, + "Ġreport": 1934, + "Ġtake": 1935, + "rib": 1936, + "OM": 1937, + "Ġday": 1938, + "Request": 1939, + "Ġsk": 1940, + "bers": 1941, + "ĉs": 1942, + ".Add": 1943, + "oot": 1944, + "Image": 1945, + "Ġcomple": 1946, + "ollection": 1947, + "Ġtop": 1948, + "Ġfree": 1949, + "AS": 1950, + "De": 1951, + "ĠOn": 1952, + "IG": 1953, + "90": 1954, + "eta": 1955, + "Date": 1956, + "Ġaction": 1957, + "34": 1958, + "Over": 1959, + "itor": 1960, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1961, + "not": 1962, + "Ġindex": 1963, + "her": 1964, + "icon": 1965, + "On": 1966, + ";čĊčĊ": 1967, + "ivity": 1968, + "mand": 1969, + ".Windows": 1970, + "OL": 1971, + "Ġreal": 1972, + "Ġmax": 1973, + "land": 1974, + "....": 1975, + "raph": 1976, + "Ġbuild": 1977, + "leg": 1978, + "assword": 1979, + "?ĊĊ": 1980, + "â̦": 1981, + "ook": 1982, + "uck": 1983, + "Ġmessage": 1984, + "test": 1985, + "ivers": 1986, + "38": 1987, + "Ġinput": 1988, + "Ġart": 1989, + "Ġbetween": 1990, + "Get": 1991, + "enter": 1992, + "ground": 1993, + "ene": 1994, + "á": 1995, + ".length": 1996, + "Node": 1997, + "(i": 1998, + "Class": 1999, + "for": 2000, + "ĠâĢĶ": 2001, + "ten": 2002, + "oin": 2003, + "Ġke": 2004, + "ui": 2005, + "ĠIN": 2006, + "Ġtable": 2007, + "sub": 2008, + "ĠLe": 2009, + "Ġhead": 2010, + "Ġmust": 2011, + "////////////////": 2012, + ".util": 2013, + "Context": 2014, + "Ġorder": 2015, + "Ġmov": 2016, + "over": 2017, + "Ġcontin": 2018, + "Ġsay": 2019, + "static": 2020, + ".Text": 2021, + "ĠclassName": 2022, + "pany": 2023, + "Ġter": 2024, + "head": 2025, + "rg": 2026, + "Ġproduct": 2027, + "This": 2028, + ".âĢĿ": 2029, + "ĠBut": 2030, + "70": 2031, + "loy": 2032, + "Ġdouble": 2033, + "sg": 2034, + "Ġplace": 2035, + ".x": 2036, + "message": 2037, + "Ġinformation": 2038, + "private": 2039, + "Ġoper": 2040, + "ced": 2041, + "db": 2042, + "\">": 2043, + "Param": 2044, + "icle": 2045, + "Ġweek": 2046, + "Ġprop": 2047, + "table": 2048, + "idget": 2049, + "place": 2050, + "Prop": 2051, + "ĠAll": 2052, + "els": 2053, + "box": 2054, + ".ĊĊĊĊ": 2055, + ".R": 2056, + "ĠTo": 2057, + "iter": 2058, + "Sh": 2059, + "uration": 2060, + "older": 2061, + "_list": 2062, + "come": 2063, + "Ġsw": 2064, + "ization": 2065, + "ĉfor": 2066, + "bl": 2067, + "Ġprogram": 2068, + "(e": 2069, + "ape": 2070, + "check": 2071, + ".Forms": 2072, + "Ġund": 2073, + "ategory": 2074, + "75": 2075, + "ags": 2076, + "Ġresponse": 2077, + "US": 2078, + "request": 2079, + "Ġstruct": 2080, + "escription": 2081, + "Ġcode": 2082, + "_H": 2083, + "uffer": 2084, + "Ġwithout": 2085, + "lobal": 2086, + "Manager": 2087, + "ilter": 2088, + "PO": 2089, + "ĉthis": 2090, + "option": 2091, + "Ġsol": 2092, + "Ġ===": 2093, + "akes": 2094, + "Controller": 2095, + "44": 2096, + "Message": 2097, + "Ġref": 2098, + "ever": 2099, + "ĠSo": 2100, + "aining": 2101, + ".append": 2102, + "Ġstill": 2103, + "Ġprovid": 2104, + "Ġassert": 2105, + "med": 2106, + "Ġcap": 2107, + "usiness": 2108, + "Ġrep": 2109, + "tings": 2110, + "ved": 2111, + ".N": 2112, + "api": 2113, + "OD": 2114, + "Ġfield": 2115, + "iven": 2116, + "oto": 2117, + "âĢľ": 2118, + "col": 2119, + "(x": 2120, + "ght": 2121, + "Result": 2122, + "Code": 2123, + ".is": 2124, + "link": 2125, + "Ġcour": 2126, + "An": 2127, + "Ġteam": 2128, + "ĉint": 2129, + "ift": 2130, + "55": 2131, + "Ġsecond": 2132, + "Ġgoing": 2133, + "Ġrange": 2134, + "_E": 2135, + "ness": 2136, + "39": 2137, + "Ġfam": 2138, + "Ġnil": 2139, + "ĠCont": 2140, + "ailable": 2141, + "utes": 2142, + "atab": 2143, + "Ġfact": 2144, + "Ġvis": 2145, + "(&": 2146, + "ĠAN": 2147, + "31": 2148, + "Al": 2149, + "title": 2150, + "Ġandroid": 2151, + "CE": 2152, + "\\\"": 2153, + "irt": 2154, + "Ġwrit": 2155, + "н": 2156, + "ĉm": 2157, + "ftware": 2158, + "ond": 2159, + "Ġret": 2160, + "osition": 2161, + "Ġhome": 2162, + "Ġleft": 2163, + "args": 2164, + "meric": 2165, + "48": 2166, + "Ġdirect": 2167, + "oci": 2168, + "Pl": 2169, + "As": 2170, + "ret": 2171, + "ado": 2172, + "Of": 2173, + "chn": 2174, + "ĠGet": 2175, + "ee": 2176, + "ross": 2177, + "();": 2178, + "____": 2179, + ".ph": 2180, + "It": 2181, + "oute": 2182, + "Ġexper": 2183, + "chool": 2184, + "www": 2185, + "},": 2186, + "Ġallow": 2187, + "ĠÂ": 2188, + "())": 2189, + "size": 2190, + "ism": 2191, + "ai": 2192, + "tract": 2193, + "ane": 2194, + "...ĊĊ": 2195, + "context": 2196, + "Ġbeg": 2197, + "CH": 2198, + "Ġpage": 2199, + "hip": 2200, + "no": 2201, + "core": 2202, + "sp": 2203, + "Ġdifferent": 2204, + "iable": 2205, + "ĠMe": 2206, + "_IN": 2207, + "button": 2208, + "ĠIs": 2209, + "ervices": 2210, + "Ġca": 2211, + "Ġaround": 2212, + "App": 2213, + "ration": 2214, + "Ġrece": 2215, + "Ġreally": 2216, + "Ġimage": 2217, + "Ġtarget": 2218, + "Ġdep": 2219, + "opyright": 2220, + "tra": 2221, + "ingle": 2222, + "ital": 2223, + "Layout": 2224, + "Ġboth": 2225, + "Override": 2226, + "arm": 2227, + "=>": 2228, + "aterial": 2229, + "iled": 2230, + "Ġput": 2231, + "Qu": 2232, + "ÑĢ": 2233, + "ung": 2234, + "map": 2235, + "ĉĉĉĉĉĉĉĉ": 2236, + "Ġlevel": 2237, + "Component": 2238, + "book": 2239, + "creen": 2240, + "_RE": 2241, + "Ġconfig": 2242, + "ãģ": 2243, + "Or": 2244, + ".data": 2245, + "Ġdocument": 2246, + "\",\"": 2247, + "tribute": 2248, + "ux": 2249, + "Log": 2250, + "ference": 2251, + "post": 2252, + "_e": 2253, + "Ġlocal": 2254, + "andom": 2255, + "assert": 2256, + "Val": 2257, + "lected": 2258, + "ina": 2259, + "atabase": 2260, + "Add": 2261, + "Ġcontent": 2262, + ".print": 2263, + "signed": 2264, + "ric": 2265, + ".\"ĊĊ": 2266, + "Ġfa": 2267, + "!ĊĊ": 2268, + "-f": 2269, + "ived": 2270, + "Ġquest": 2271, + ".ex": 2272, + "Ġfloat": 2273, + "Ġdevelop": 2274, + "оÐ": 2275, + "Map": 2276, + "ading": 2277, + "Ġposs": 2278, + "UE": 2279, + "namespace": 2280, + "_O": 2281, + "ĉb": 2282, + ".Get": 2283, + ">(": 2284, + "json": 2285, + "etails": 2286, + "66": 2287, + "Ġtoo": 2288, + "Ġextends": 2289, + "ĠNone": 2290, + "Ġfore": 2291, + "(String": 2292, + "format": 2293, + "Ġgreat": 2294, + "inter": 2295, + "cale": 2296, + "Ñģ": 2297, + "ron": 2298, + "iving": 2299, + "Ent": 2300, + "ency": 2301, + "xt": 2302, + "oy": 2303, + "05": 2304, + "Ġmonth": 2305, + "Ġhapp": 2306, + "Ġsuper": 2307, + "bar": 2308, + "default": 2309, + "_de": 2310, + "ords": 2311, + "ln": 2312, + "({Ċ": 2313, + "ĠInd": 2314, + "ases": 2315, + "Ġtitle": 2316, + "Ġcontext": 2317, + "08": 2318, + "oh": 2319, + "-p": 2320, + "Em": 2321, + "Ġmet": 2322, + "Test": 2323, + "Ġlife": 2324, + "_v": 2325, + "ĠUS": 2326, + "UI": 2327, + "ocation": 2328, + "md": 2329, + "Ġ[Ċ": 2330, + "Ġ]": 2331, + "sw": 2332, + "Ġincre": 2333, + "script": 2334, + "ential": 2335, + "ways": 2336, + ".de": 2337, + "Ġsrc": 2338, + "Ġcatch": 2339, + "ĠAmeric": 2340, + "//Ċ": 2341, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 2342, + "Ġpay": 2343, + "plit": 2344, + "âĢĶ": 2345, + "Ġcoun": 2346, + "obj": 2347, + ".php": 2348, + "Ġchange": 2349, + "ething": 2350, + "'re": 2351, + "aster": 2352, + "los": 2353, + "lation": 2354, + "ĠĠĊ": 2355, + "Le": 2356, + "ä": 2357, + "({": 2358, + "ready": 2359, + "ĠNo": 2360, + "Ġposition": 2361, + "Ġold": 2362, + "Ġbook": 2363, + "abled": 2364, + "bug": 2365, + "202": 2366, + "Hand": 2367, + "};ĊĊ": 2368, + "isplay": 2369, + "aving": 2370, + "04": 2371, + "Ġgover": 2372, + "Ġversion": 2373, + "System": 2374, + "nect": 2375, + "response": 2376, + "Style": 2377, + "Up": 2378, + "angu": 2379, + "Ġthree": 2380, + "init": 2381, + "ero": 2382, + "Ġlaw": 2383, + "endif": 2384, + "Ġbase": 2385, + "email": 2386, + "(l": 2387, + "_V": 2388, + "Ġconf": 2389, + "ATE": 2390, + "Ġduring": 2391, + "tes": 2392, + "Ġconsole": 2393, + "ĠPr": 2394, + "Ġspe": 2395, + "ves": 2396, + "65": 2397, + "path": 2398, + "ialog": 2399, + "dition": 2400, + "_to": 2401, + "ards": 2402, + "Ġagainst": 2403, + "etwork": 2404, + "ĠPh": 2405, + "_L": 2406, + "cur": 2407, + "imit": 2408, + "With": 2409, + "Ġpower": 2410, + "ium": 2411, + "';ĊĊ": 2412, + "Ġwom": 2413, + "left": 2414, + "ources": 2415, + "atri": 2416, + "ĠIm": 2417, + "ĠMan": 2418, + "orth": 2419, + "${": 2420, + "88": 2421, + "quals": 2422, + "ese": 2423, + "_size": 2424, + "Ġiss": 2425, + "otal": 2426, + "-g": 2427, + "ique": 2428, + "rame": 2429, + "Ġwidth": 2430, + "erg": 2431, + ")(": 2432, + "ittle": 2433, + "TR": 2434, + "ĠThey": 2435, + "ences": 2436, + "02": 2437, + "rl": 2438, + "ons": 2439, + "Ġlabel": 2440, + ".y": 2441, + "-t": 2442, + "update": 2443, + "anel": 2444, + "sc": 2445, + ".to": 2446, + "Ġproject": 2447, + "ü": 2448, + "Ġelement": 2449, + "Ġsuccess": 2450, + "ĉĉĊ": 2451, + ".sh": 2452, + "ram": 2453, + "ched": 2454, + "())Ċ": 2455, + "Ġ(Ċ": 2456, + "Ġdate": 2457, + "Ġtot": 2458, + "_ST": 2459, + "All": 2460, + "ification": 2461, + "ĉvar": 2462, + "Ġtri": 2463, + "chem": 2464, + "my": 2465, + "Ġbig": 2466, + "ĠAd": 2467, + "ĠAt": 2468, + "ots": 2469, + "num": 2470, + "Act": 2471, + "Ġmap": 2472, + "era": 2473, + "cope": 2474, + ".$": 2475, + ",âĢĿ": 2476, + "Ġpop": 2477, + "Ġfew": 2478, + "Ġlen": 2479, + "uid": 2480, + "eters": 2481, + "ules": 2482, + "ÃŃ": 2483, + "source": 2484, + "https": 2485, + "Ġdem": 2486, + "Ġear": 2487, + "################": 2488, + "Ġmatch": 2489, + "ories": 2490, + "49": 2491, + "aces": 2492, + "ĠCl": 2493, + "Ġnode": 2494, + "78": 2495, + "irc": 2496, + "local": 2497, + "unity": 2498, + "};Ċ": 2499, + "Ġanother": 2500, + "<<": 2501, + "ogle": 2502, + "Ġsit": 2503, + "ework": 2504, + "TE": 2505, + ".I": 2506, + "NS": 2507, + "ology": 2508, + "ought": 2509, + ".Cont": 2510, + ">>": 2511, + "Ġcare": 2512, + "state": 2513, + "ĉprivate": 2514, + "Ġeffect": 2515, + "++)": 2516, + "_file": 2517, + "ending": 2518, + "Line": 2519, + "For": 2520, + "ior": 2521, + "ĠSc": 2522, + "Ġfun": 2523, + ".Size": 2524, + "ĉelse": 2525, + "])": 2526, + "start": 2527, + "vious": 2528, + "Ġ},": 2529, + "ours": 2530, + "Ġleg": 2531, + "Ġservice": 2532, + "Ġsince": 2533, + "iron": 2534, + "Label": 2535, + "Ġnon": 2536, + "Ġlos": 2537, + "iction": 2538, + "Ġfull": 2539, + "acter": 2540, + "board": 2541, + "gress": 2542, + "Ġturn": 2543, + "ither": 2544, + "09": 2545, + ".size": 2546, + "Ġbody": 2547, + "resh": 2548, + "eturn": 2549, + "199": 2550, + "(_": 2551, + "yles": 2552, + "ormal": 2553, + "pi": 2554, + "Ġsomething": 2555, + "!--": 2556, + "uint": 2557, + "Ġprodu": 2558, + "Ġstand": 2559, + "Ġproble": 2560, + "Ġavailable": 2561, + "mt": 2562, + "ĠBl": 2563, + "Ġ...": 2564, + "Ġblock": 2565, + "Input": 2566, + "Ġkeep": 2567, + "Count": 2568, + "open": 2569, + "Ġ['": 2570, + "Ġthrow": 2571, + "uilder": 2572, + "Action": 2573, + "Ġthings": 2574, + "True": 2575, + "Ġurl": 2576, + "ĠBo": 2577, + "printf": 2578, + "Ġred": 2579, + "js": 2580, + ".create": 2581, + "ĠOr": 2582, + "Status": 2583, + "Instance": 2584, + "Ġcontrol": 2585, + "Ġcome": 2586, + "Ġcustom": 2587, + "location": 2588, + "07": 2589, + "model": 2590, + "ĠčĊ": 2591, + "Ġsource": 2592, + "Ġeas": 2593, + ".out": 2594, + "]ĊĊ": 2595, + "oney": 2596, + "Ġawait": 2597, + "Ġpartic": 2598, + "AP": 2599, + "ublish": 2600, + "odes": 2601, + "_pro": 2602, + "ply": 2603, + "riter": 2604, + "Ġprov": 2605, + "Ġmill": 2606, + "HT": 2607, + "])Ċ": 2608, + "Ġchang": 2609, + "Ġask": 2610, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 2611, + "Ġoutput": 2612, + "Ġemail": 2613, + "68": 2614, + ".push": 2615, + "Ġ}čĊčĊ": 2616, + "ination": 2617, + "47": 2618, + "atrix": 2619, + "Table": 2620, + "uccess": 2621, + "]);Ċ": 2622, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 2623, + "Ġdisc": 2624, + "([": 2625, + "Ġbusiness": 2626, + "height": 2627, + ".html": 2628, + "ta": 2629, + "field": 2630, + "Ġrequired": 2631, + "_R": 2632, + "Ġgovern": 2633, + "}čĊčĊ": 2634, + "lex": 2635, + "500": 2636, + ".,": 2637, + "ĠSet": 2638, + "urch": 2639, + "///": 2640, + "ts": 2641, + "af": 2642, + "Ġmight": 2643, + "istory": 2644, + "Str": 2645, + "Ġnever": 2646, + "Response": 2647, + "arse": 2648, + "ada": 2649, + "ĠHow": 2650, + "Ġ*)": 2651, + "Ġ;": 2652, + "Ġhard": 2653, + "Ad": 2654, + "Ġintern": 2655, + "used": 2656, + "(data": 2657, + "mod": 2658, + "annel": 2659, + "Ġnp": 2660, + "ugg": 2661, + "Ġ/>Ċ": 2662, + "Ġcalled": 2663, + "body": 2664, + "Ġcho": 2665, + "(r": 2666, + "_set": 2667, + "ird": 2668, + "Ġ>=": 2669, + "Ġ};Ċ": 2670, + "Ġoptions": 2671, + "ĠGener": 2672, + "Ġheight": 2673, + "Point": 2674, + "You": 2675, + "ety": 2676, + "Click": 2677, + "Ġsmall": 2678, + "Ġide": 2679, + "Ġaccess": 2680, + "anguage": 2681, + "Ġprotected": 2682, + "Ġjob": 2683, + "ĠThere": 2684, + "Def": 2685, + "Ġaddress": 2686, + "Ġuint": 2687, + "Not": 2688, + "oo": 2689, + "aps": 2690, + "
&": 5909,
+ "CON": 5910,
+ "Ġrepl": 5911,
+ "Ġregular": 5912,
+ "Storage": 5913,
+ "ramework": 5914,
+ "Ġgoal": 5915,
+ "Ġtouch": 5916,
+ ".widget": 5917,
+ "Ġbuilt": 5918,
+ "des": 5919,
+ "Part": 5920,
+ "(re": 5921,
+ "Ġworth": 5922,
+ "hib": 5923,
+ "game": 5924,
+ "91": 5925,
+ "192": 5926,
+ "Ġв": 5927,
+ "acion": 5928,
+ "ĠWhite": 5929,
+ "(type": 5930,
+ "(`": 5931,
+ "81": 5932,
+ "Ġnatural": 5933,
+ "Ġinj": 5934,
+ "Ġcalcul": 5935,
+ "ĠApril": 5936,
+ ".List": 5937,
+ "Ġassociated": 5938,
+ "ĉSystem": 5939,
+ "~~": 5940,
+ "=[": 5941,
+ "Ġstorage": 5942,
+ "Ġbytes": 5943,
+ "Ġtravel": 5944,
+ "Ġsou": 5945,
+ "Ġpassed": 5946,
+ "!=": 5947,
+ "ascript": 5948,
+ ".open": 5949,
+ "Ġgrid": 5950,
+ "Ġbus": 5951,
+ "Ġrecogn": 5952,
+ "Ab": 5953,
+ "Ġhon": 5954,
+ "ĠCenter": 5955,
+ "Ġprec": 5956,
+ "build": 5957,
+ "73": 5958,
+ "HTML": 5959,
+ "ĠSan": 5960,
+ "Ġcountries": 5961,
+ "aled": 5962,
+ "token": 5963,
+ "kt": 5964,
+ "Ġqual": 5965,
+ "Last": 5966,
+ "adow": 5967,
+ "Ġmanufact": 5968,
+ "idad": 5969,
+ "jango": 5970,
+ "Next": 5971,
+ "xf": 5972,
+ ".a": 5973,
+ "Ġporno": 5974,
+ "ĠPM": 5975,
+ "erve": 5976,
+ "iting": 5977,
+ "_th": 5978,
+ "ci": 5979,
+ "=None": 5980,
+ "gs": 5981,
+ "Ġlogin": 5982,
+ "atives": 5983,
+ "']);Ċ": 5984,
+ "Äħ": 5985,
+ "Ġill": 5986,
+ "IA": 5987,
+ "children": 5988,
+ "DO": 5989,
+ "Ġlevels": 5990,
+ "Ġ{{": 5991,
+ "Ġlooks": 5992,
+ "Ġ\"#": 5993,
+ "ToString": 5994,
+ "Ġnecessary": 5995,
+ "ĠĠĠĊ": 5996,
+ "cell": 5997,
+ "Entry": 5998,
+ "Ġ'#": 5999,
+ "Ġextrem": 6000,
+ "Selector": 6001,
+ "Ġplaceholder": 6002,
+ "Load": 6003,
+ "Ġreleased": 6004,
+ "ORE": 6005,
+ "Enumer": 6006,
+ "ĠTV": 6007,
+ "SET": 6008,
+ "inq": 6009,
+ "Press": 6010,
+ "ĠDepartment": 6011,
+ "Ġproperties": 6012,
+ "Ġrespond": 6013,
+ "Search": 6014,
+ "ael": 6015,
+ "Ġrequ": 6016,
+ "ĠBook": 6017,
+ "/Ċ": 6018,
+ "(st": 6019,
+ "Ġfinancial": 6020,
+ "icket": 6021,
+ "_input": 6022,
+ "Ġthreat": 6023,
+ "(in": 6024,
+ "Strip": 6025,
+ "ìĿ": 6026,
+ "ção": 6027,
+ "71": 6028,
+ "Ġevidence": 6029,
+ "));": 6030,
+ "ĠBro": 6031,
+ "Ġ[];Ċ": 6032,
+ "Ġou": 6033,
+ "buf": 6034,
+ "Script": 6035,
+ "dat": 6036,
+ "Ġrule": 6037,
+ "#import": 6038,
+ "=\"/": 6039,
+ "Serial": 6040,
+ "Ġstarting": 6041,
+ "[index": 6042,
+ "ae": 6043,
+ "Ġcontrib": 6044,
+ "session": 6045,
+ "_new": 6046,
+ "utable": 6047,
+ "ober": 6048,
+ "Ġ\"./": 6049,
+ "Ġlogger": 6050,
+ "Ġrecently": 6051,
+ "Ġreturned": 6052,
+ "ččĊ": 6053,
+ ")))Ċ": 6054,
+ "itions": 6055,
+ "Ġseek": 6056,
+ "Ġcommunic": 6057,
+ "Ġ\".": 6058,
+ "Ġusername": 6059,
+ "ECT": 6060,
+ "DS": 6061,
+ "Ġotherwise": 6062,
+ "ĠGerman": 6063,
+ ".aw": 6064,
+ "Adapter": 6065,
+ "ixel": 6066,
+ "Ġsystems": 6067,
+ "Ġdrop": 6068,
+ "83": 6069,
+ "Ġstructure": 6070,
+ "Ġ$(\"#": 6071,
+ "encies": 6072,
+ "anning": 6073,
+ "ĠLink": 6074,
+ "ĠResponse": 6075,
+ "Ġstri": 6076,
+ "ż": 6077,
+ "ĠDB": 6078,
+ "æĹ": 6079,
+ "android": 6080,
+ "submit": 6081,
+ "otion": 6082,
+ "92": 6083,
+ "(@": 6084,
+ ".test": 6085,
+ "82": 6086,
+ "ĊĊĊĊĊĊĊĊ": 6087,
+ "];čĊ": 6088,
+ "Ġdirectly": 6089,
+ "Ġ\"%": 6090,
+ "ris": 6091,
+ "elta": 6092,
+ "AIL": 6093,
+ "){čĊ": 6094,
+ "mine": 6095,
+ "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 6096,
+ "(k": 6097,
+ "bon": 6098,
+ "asic": 6099,
+ "pite": 6100,
+ "___": 6101,
+ "Max": 6102,
+ "Ġerrors": 6103,
+ "ĠWhile": 6104,
+ "Ġarguments": 6105,
+ "Ġensure": 6106,
+ "Right": 6107,
+ "-based": 6108,
+ "Web": 6109,
+ "Ġ-=": 6110,
+ "Ġintrodu": 6111,
+ "ĠInst": 6112,
+ "ĠWash": 6113,
+ "ordin": 6114,
+ "join": 6115,
+ "Database": 6116,
+ "Ġgrad": 6117,
+ "Ġusually": 6118,
+ "ITE": 6119,
+ "Props": 6120,
+ "?>Ċ": 6121,
+ "ĠGo": 6122,
+ "@Override": 6123,
+ "REF": 6124,
+ "Ġip": 6125,
+ "ĠAustral": 6126,
+ "Ġist": 6127,
+ "ViewById": 6128,
+ "Ġserious": 6129,
+ "Ġcustomer": 6130,
+ ".prototype": 6131,
+ "odo": 6132,
+ "cor": 6133,
+ "Ġdoor": 6134,
+ "ĠWITHOUT": 6135,
+ "Ġplant": 6136,
+ "Ġbegan": 6137,
+ "Ġdistance": 6138,
+ "()).": 6139,
+ "Ġchance": 6140,
+ "Ġord": 6141,
+ "came": 6142,
+ "pragma": 6143,
+ "Ġprotect": 6144,
+ "ragment": 6145,
+ "ĠNode": 6146,
+ "ening": 6147,
+ "Ñĩ": 6148,
+ "Ġroute": 6149,
+ "ĠSchool": 6150,
+ "hi": 6151,
+ "Ġneighb": 6152,
+ "After": 6153,
+ "licit": 6154,
+ "Ġcontr": 6155,
+ "Ġprimary": 6156,
+ "AA": 6157,
+ ".WriteLine": 6158,
+ "utils": 6159,
+ "Ġbi": 6160,
+ "Red": 6161,
+ ".Linq": 6162,
+ ".object": 6163,
+ "Ġleaders": 6164,
+ "unities": 6165,
+ "Ġgun": 6166,
+ "onth": 6167,
+ "ĠDev": 6168,
+ "FILE": 6169,
+ "Ġcomments": 6170,
+ "_len": 6171,
+ "arrow": 6172,
+ "amount": 6173,
+ "Range": 6174,
+ "sert": 6175,
+ "GridView": 6176,
+ "Ġupdated": 6177,
+ "ĠMo": 6178,
+ "Ġinform": 6179,
+ "ociety": 6180,
+ "ala": 6181,
+ "Access": 6182,
+ "Ġhab": 6183,
+ "Ġcreat": 6184,
+ "_arg": 6185,
+ "ĠJanuary": 6186,
+ "ĠDay": 6187,
+ "\")čĊ": 6188,
+ "uple": 6189,
+ "document": 6190,
+ "gorith": 6191,
+ "menu": 6192,
+ "ĠOver": 6193,
+ "bb": 6194,
+ ".title": 6195,
+ "_out": 6196,
+ "Ġled": 6197,
+ "uri": 6198,
+ "Ġ?>": 6199,
+ "gl": 6200,
+ "Ġbank": 6201,
+ "ayment": 6202,
+ "ĉprintf": 6203,
+ "MD": 6204,
+ "Ġsample": 6205,
+ "Ġhands": 6206,
+ "ĠVersion": 6207,
+ "uario": 6208,
+ "Ġoffers": 6209,
+ "ityEngine": 6210,
+ "Ġshape": 6211,
+ "Ġsleep": 6212,
+ "_point": 6213,
+ "Settings": 6214,
+ "Ġachie": 6215,
+ "Ġsold": 6216,
+ "ota": 6217,
+ ".bind": 6218,
+ "Am": 6219,
+ "Ġsafe": 6220,
+ "Store": 6221,
+ "Ġshared": 6222,
+ "Ġpriv": 6223,
+ "_VAL": 6224,
+ "Ġsens": 6225,
+ "){": 6226,
+ "Ġremember": 6227,
+ "shared": 6228,
+ "element": 6229,
+ "Ġshoot": 6230,
+ "Vert": 6231,
+ "cout": 6232,
+ "Ġenv": 6233,
+ "_label": 6234,
+ "Ġ>Ċ": 6235,
+ "run": 6236,
+ "Ġscene": 6237,
+ "(array": 6238,
+ "device": 6239,
+ "_title": 6240,
+ "agon": 6241,
+ "]čĊ": 6242,
+ "aby": 6243,
+ "Ġbecame": 6244,
+ "boolean": 6245,
+ "Ġpark": 6246,
+ "ĠCode": 6247,
+ "upload": 6248,
+ "riday": 6249,
+ "ĠSeptember": 6250,
+ "Fe": 6251,
+ "Ġsen": 6252,
+ "cing": 6253,
+ "FL": 6254,
+ "Col": 6255,
+ "uts": 6256,
+ "_page": 6257,
+ "inn": 6258,
+ "Ġimplied": 6259,
+ "aling": 6260,
+ "Ġyourself": 6261,
+ ".Count": 6262,
+ "conf": 6263,
+ "Ġaud": 6264,
+ "_init": 6265,
+ ".)": 6266,
+ "Ġwrote": 6267,
+ "003": 6268,
+ "NG": 6269,
+ ".Error": 6270,
+ "ä»": 6271,
+ ".for": 6272,
+ "Ġequal": 6273,
+ "ĠRequest": 6274,
+ "Ġserial": 6275,
+ "Ġallows": 6276,
+ "XX": 6277,
+ "Ġmiddle": 6278,
+ "chor": 6279,
+ "195": 6280,
+ "94": 6281,
+ "ø": 6282,
+ "erval": 6283,
+ ".Column": 6284,
+ "reading": 6285,
+ "Ġescort": 6286,
+ "ĠAugust": 6287,
+ "Ġquickly": 6288,
+ "Ġweap": 6289,
+ "ĠCG": 6290,
+ "ropri": 6291,
+ "ho": 6292,
+ "Ġcop": 6293,
+ "(struct": 6294,
+ "ĠBig": 6295,
+ "Ġvs": 6296,
+ "Ġfrequ": 6297,
+ ".Value": 6298,
+ "Ġactions": 6299,
+ "Ġproper": 6300,
+ "Ġinn": 6301,
+ "Ġobjects": 6302,
+ "Ġmatrix": 6303,
+ "avascript": 6304,
+ "Ġones": 6305,
+ ".group": 6306,
+ "Ġgreen": 6307,
+ "Ġpaint": 6308,
+ "ools": 6309,
+ "ycl": 6310,
+ "encode": 6311,
+ "olt": 6312,
+ "comment": 6313,
+ ".api": 6314,
+ "Dir": 6315,
+ "Ġune": 6316,
+ "izont": 6317,
+ ".position": 6318,
+ "Ġdesigned": 6319,
+ "_val": 6320,
+ "avi": 6321,
+ "iring": 6322,
+ "tab": 6323,
+ "Ġlayer": 6324,
+ "Ġviews": 6325,
+ "Ġreve": 6326,
+ "rael": 6327,
+ "ĠON": 6328,
+ "rics": 6329,
+ "160": 6330,
+ "np": 6331,
+ "Ġcore": 6332,
+ "());čĊ": 6333,
+ "Main": 6334,
+ "Ġexpert": 6335,
+ "ĉĉčĊ": 6336,
+ "_en": 6337,
+ "Ġ/>": 6338,
+ "utter": 6339,
+ "IAL": 6340,
+ "ails": 6341,
+ "ĠKing": 6342,
+ "*/ĊĊ": 6343,
+ "ĠMet": 6344,
+ "_end": 6345,
+ "addr": 6346,
+ "ora": 6347,
+ "Ġir": 6348,
+ "Min": 6349,
+ "Ġsurpr": 6350,
+ "Ġrepe": 6351,
+ "Ġdirectory": 6352,
+ "PUT": 6353,
+ "-S": 6354,
+ "Ġelection": 6355,
+ "haps": 6356,
+ ".pre": 6357,
+ "cm": 6358,
+ "Values": 6359,
+ "Ġ\"Ċ": 6360,
+ "column": 6361,
+ "ivil": 6362,
+ "Login": 6363,
+ "inue": 6364,
+ "93": 6365,
+ "Ġbeautiful": 6366,
+ "Ġsecret": 6367,
+ "(event": 6368,
+ "Ġchat": 6369,
+ "ums": 6370,
+ "Ġorigin": 6371,
+ "Ġeffects": 6372,
+ "Ġmanagement": 6373,
+ "illa": 6374,
+ "tk": 6375,
+ "Ġsetting": 6376,
+ "ĠCour": 6377,
+ "Ġmassage": 6378,
+ "ĉend": 6379,
+ "Ġhappy": 6380,
+ "Ġfinish": 6381,
+ "Ġcamera": 6382,
+ "ĠVer": 6383,
+ "ĠDemocr": 6384,
+ "ĠHer": 6385,
+ "(Q": 6386,
+ "cons": 6387,
+ "ita": 6388,
+ "Ġ'.": 6389,
+ "{}": 6390,
+ "ĉC": 6391,
+ "Ġstuff": 6392,
+ "194": 6393,
+ "Ġ:Ċ": 6394,
+ "ĠAR": 6395,
+ "Task": 6396,
+ "hidden": 6397,
+ "eros": 6398,
+ "IGN": 6399,
+ "atio": 6400,
+ "ĠHealth": 6401,
+ "olute": 6402,
+ "Enter": 6403,
+ "'>": 6404,
+ "ĠTwitter": 6405,
+ "ĠCounty": 6406,
+ "scribe": 6407,
+ "Ġ=>Ċ": 6408,
+ "Ġhy": 6409,
+ "fit": 6410,
+ "Ġmilitary": 6411,
+ "Ġsale": 6412,
+ "required": 6413,
+ "non": 6414,
+ "bootstrap": 6415,
+ "hold": 6416,
+ "rim": 6417,
+ "-old": 6418,
+ "ĠDown": 6419,
+ "Ġmention": 6420,
+ "contact": 6421,
+ "_group": 6422,
+ "oday": 6423,
+ "Ġtown": 6424,
+ "Ġsolution": 6425,
+ "uate": 6426,
+ "elling": 6427,
+ "]->": 6428,
+ "otes": 6429,
+ "ental": 6430,
+ "omen": 6431,
+ "ospital": 6432,
+ "ĠSup": 6433,
+ "_EN": 6434,
+ "Ġslow": 6435,
+ "SESSION": 6436,
+ "Ġblue": 6437,
+ "ago": 6438,
+ "Ġlives": 6439,
+ "Ġ^": 6440,
+ ".un": 6441,
+ "inst": 6442,
+ "enge": 6443,
+ "Ġcustomers": 6444,
+ "Ġcast": 6445,
+ "udget": 6446,
+ "ï¼ģ": 6447,
+ "icens": 6448,
+ "Ġdetermin": 6449,
+ "Selected": 6450,
+ "_pl": 6451,
+ "ueue": 6452,
+ "Ġdark": 6453,
+ "//ĊĊ": 6454,
+ "si": 6455,
+ "thern": 6456,
+ "ĠJapan": 6457,
+ "/w": 6458,
+ "PU": 6459,
+ "ĠEast": 6460,
+ "ovie": 6461,
+ "Ġpackage": 6462,
+ "Ġnor": 6463,
+ "Ġapi": 6464,
+ "bot": 6465,
+ "\"];Ċ": 6466,
+ "_post": 6467,
+ "ulate": 6468,
+ "Ġclub": 6469,
+ "'));Ċ": 6470,
+ "Ġloop": 6471,
+ "PIO": 6472,
+ "ione": 6473,
+ "shot": 6474,
+ "Initial": 6475,
+ "Ġplayed": 6476,
+ "register": 6477,
+ "rought": 6478,
+ "_max": 6479,
+ "acement": 6480,
+ "match": 6481,
+ "raphics": 6482,
+ "AST": 6483,
+ "Ġexisting": 6484,
+ "Ġcomplex": 6485,
+ "DA": 6486,
+ ".Ch": 6487,
+ ".common": 6488,
+ "mo": 6489,
+ "Ġ'../../": 6490,
+ "ito": 6491,
+ "Ġanalysis": 6492,
+ "Ġdeliver": 6493,
+ "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĊ": 6494,
+ "idx": 6495,
+ "Ãł": 6496,
+ "ongo": 6497,
+ "ĠEnglish": 6498,
+ "Ċ": 10197,
+ "_default": 10198,
+ "ĠDatabase": 10199,
+ "rep": 10200,
+ "ESS": 10201,
+ "nergy": 10202,
+ ".Find": 10203,
+ "_mask": 10204,
+ "Ġrise": 10205,
+ "Ġkernel": 10206,
+ "::$": 10207,
+ ".Q": 10208,
+ "Ġoffering": 10209,
+ "decl": 10210,
+ "ĠCS": 10211,
+ "Ġlisted": 10212,
+ "Ġmostly": 10213,
+ "enger": 10214,
+ "Ġblocks": 10215,
+ "olo": 10216,
+ "Ġgoverning": 10217,
+ "\\F": 10218,
+ "Ġconcent": 10219,
+ ".getText": 10220,
+ "Ġmb": 10221,
+ "Ġoccurred": 10222,
+ "Ġchanging": 10223,
+ "Scene": 10224,
+ "_CODE": 10225,
+ "Beh": 10226,
+ "\"The": 10227,
+ "Ġtile": 10228,
+ "ĠAssociation": 10229,
+ "ĉP": 10230,
+ "alty": 10231,
+ "_ad": 10232,
+ "odies": 10233,
+ "iated": 10234,
+ "Ġprepared": 10235,
+ "possible": 10236,
+ "Ġmort": 10237,
+ "TEST": 10238,
+ "142": 10239,
+ "Ġignore": 10240,
+ "Ġcalc": 10241,
+ "Ġrs": 10242,
+ "ĠassertEquals": 10243,
+ "Ġsz": 10244,
+ "ĠTHIS": 10245,
+ ".\"Ċ": 10246,
+ "Ġcanvas": 10247,
+ "java": 10248,
+ "Ġdut": 10249,
+ "VALID": 10250,
+ ".sql": 10251,
+ ".input": 10252,
+ "Ġaux": 10253,
+ "Sup": 10254,
+ "Ġartist": 10255,
+ "Vec": 10256,
+ "_TIME": 10257,
+ ".stringify": 10258,
+ "etween": 10259,
+ "ĠCategory": 10260,
+ "Ġ[-": 10261,
+ "ĠDevExpress": 10262,
+ "ĠJul": 10263,
+ "Ġring": 10264,
+ ".ed": 10265,
+ "YY": 10266,
+ "Let": 10267,
+ "TextField": 10268,
+ "Ġflat": 10269,
+ "_print": 10270,
+ "ĠOTHER": 10271,
+ "adian": 10272,
+ "Ġchecked": 10273,
+ "ele": 10274,
+ "Align": 10275,
+ "standing": 10276,
+ "Ġ[],": 10277,
+ "Ġlab": 10278,
+ "ucky": 10279,
+ "ĠChristmas": 10280,
+ "(image": 10281,
+ ".module": 10282,
+ "Ġlots": 10283,
+ "Ġslightly": 10284,
+ "(final": 10285,
+ "erge": 10286,
+ "è¿": 10287,
+ "147": 10288,
+ "ĠPolice": 10289,
+ "143": 10290,
+ "ĠRight": 10291,
+ "Ġaward": 10292,
+ "ĠOS": 10293,
+ "Ġ{}ĊĊ": 10294,
+ "Ġptr": 10295,
+ "oves": 10296,
+ "icated": 10297,
+ "ем": 10298,
+ "Ġmanage": 10299,
+ "oliday": 10300,
+ "Amount": 10301,
+ "oolStrip": 10302,
+ "tbody": 10303,
+ "Nav": 10304,
+ "wrap": 10305,
+ "BB": 10306,
+ "Ġwatching": 10307,
+ "arios": 10308,
+ "Ġoptional": 10309,
+ "_K": 10310,
+ "ĠLicensed": 10311,
+ ".Map": 10312,
+ "Timer": 10313,
+ "ĠAP": 10314,
+ "ĠRev": 10315,
+ "(o": 10316,
+ ",c": 10317,
+ "umin": 10318,
+ "etailed": 10319,
+ "ĠHy": 10320,
+ "Ġblank": 10321,
+ "agger": 10322,
+ "ĠSelf": 10323,
+ "()[": 10324,
+ ".make": 10325,
+ "earn": 10326,
+ "channel": 10327,
+ ";Ċ": 10342,
+ "World": 10343,
+ "Ġpython": 10344,
+ "Ġlif": 10345,
+ "Ġtrav": 10346,
+ "Ġconven": 10347,
+ "company": 10348,
+ "ĠClub": 10349,
+ "138": 10350,
+ "Ver": 10351,
+ "Btn": 10352,
+ "Ġzone": 10353,
+ "products": 10354,
+ "ĠEduc": 10355,
+ "Ġverify": 10356,
+ "ĠMil": 10357,
+ "ono": 10358,
+ "]);ĊĊ": 10359,
+ "ENCE": 10360,
+ "Ġpacket": 10361,
+ "Ġcer": 10362,
+ "Ġenumer": 10363,
+ "Ġpars": 10364,
+ "formed": 10365,
+ "Ġoccup": 10366,
+ "tre": 10367,
+ "Ġexercise": 10368,
+ "Day": 10369,
+ "_sum": 10370,
+ "Ġasking": 10371,
+ "aption": 10372,
+ "Ġorders": 10373,
+ "Ġspending": 10374,
+ "ĠERR": 10375,
+ ".Dis": 10376,
+ "ĠUtil": 10377,
+ "âĢľI": 10378,
+ "\\'": 10379,
+ "?)": 10380,
+ "/>Ċ": 10381,
+ "Ġemot": 10382,
+ "Ġinfluence": 10383,
+ "ĠAfrica": 10384,
+ "atters": 10385,
+ "Ùħ": 10386,
+ ".session": 10387,
+ "Ġchief": 10388,
+ "ĉĉĉĉĉĉĉĉĉĉĉ": 10389,
+ "Ġtom": 10390,
+ "cluded": 10391,
+ "serial": 10392,
+ "_handler": 10393,
+ ".Type": 10394,
+ "aped": 10395,
+ "Ġpolicies": 10396,
+ "-ex": 10397,
+ "-tr": 10398,
+ "blank": 10399,
+ "merce": 10400,
+ "Ġcoverage": 10401,
+ "Ġrc": 10402,
+ "_matrix": 10403,
+ "_box": 10404,
+ "Ġcharges": 10405,
+ "ĠBoston": 10406,
+ "Pe": 10407,
+ "Ġcircum": 10408,
+ "Ġfilled": 10409,
+ "148": 10410,
+ "Ġnorth": 10411,
+ "ictureBox": 10412,
+ "ĉres": 10413,
+ "è®": 10414,
+ "Ġtermin": 10415,
+ "Ġ[â̦": 10416,
+ "IRECT": 10417,
+ "Ġber": 10418,
+ "Ġ\"../../": 10419,
+ "retch": 10420,
+ ".code": 10421,
+ "_col": 10422,
+ "ĠGovernment": 10423,
+ "Ġargv": 10424,
+ "ĠLord": 10425,
+ "asi": 10426,
+ "Exec": 10427,
+ "ĉlet": 10428,
+ "vertis": 10429,
+ "Ġdiscussion": 10430,
+ "enance": 10431,
+ "outube": 10432,
+ "typeof": 10433,
+ "Ġserved": 10434,
+ "ĠPut": 10435,
+ "ĉx": 10436,
+ "Ġsweet": 10437,
+ "Before": 10438,
+ "ategy": 10439,
+ ".of": 10440,
+ "ĠMaterial": 10441,
+ "Sort": 10442,
+ "ONT": 10443,
+ "igital": 10444,
+ "Why": 10445,
+ "Ġsust": 10446,
+ "Ġç": 10447,
+ "abet": 10448,
+ "Ġsegment": 10449,
+ "Ġ[],Ċ": 10450,
+ "ĠMuslim": 10451,
+ "ĠfindViewById": 10452,
+ "cut": 10453,
+ "_TEXT": 10454,
+ "ĠMary": 10455,
+ "Ġloved": 10456,
+ "Ġlie": 10457,
+ "ĠJO": 10458,
+ "Ġisset": 10459,
+ "month": 10460,
+ "Ġprime": 10461,
+ "ti": 10462,
+ "ĠCarol": 10463,
+ "Use": 10464,
+ "146": 10465,
+ "ĠPop": 10466,
+ "ĠSave": 10467,
+ "Interval": 10468,
+ "execute": 10469,
+ "dy": 10470,
+ "ĠIran": 10471,
+ "_cont": 10472,
+ "ĉT": 10473,
+ "Ġphase": 10474,
+ "checkbox": 10475,
+ "week": 10476,
+ "Ġhide": 10477,
+ "Ġtil": 10478,
+ "Ġju": 10479,
+ "Custom": 10480,
+ "burg": 10481,
+ "/M": 10482,
+ "TON": 10483,
+ "Ġquant": 10484,
+ "Ġrub": 10485,
+ "ixels": 10486,
+ "Ġinstalled": 10487,
+ "Ġdump": 10488,
+ "Ġproperly": 10489,
+ "(List": 10490,
+ "Ġdecide": 10491,
+ "apply": 10492,
+ "Has": 10493,
+ "Ġkeeping": 10494,
+ "Ġcitizens": 10495,
+ "Ġjoint": 10496,
+ "pool": 10497,
+ "Socket": 10498,
+ "_op": 10499,
+ "Ġweapon": 10500,
+ "gnore": 10501,
+ "ĠExec": 10502,
+ "otten": 10503,
+ "ĠMS": 10504,
+ "Ġ(-": 10505,
+ "ĠReview": 10506,
+ "Ġexamples": 10507,
+ "Ġtight": 10508,
+ "!(": 10509,
+ "DP": 10510,
+ "ĠMessageBox": 10511,
+ "Ġphotograph": 10512,
+ "164": 10513,
+ "URI": 10514,
+ "ét": 10515,
+ "low": 10516,
+ "ĠGrand": 10517,
+ ".persistence": 10518,
+ "Ġmaintain": 10519,
+ "Ġnums": 10520,
+ "Ġzip": 10521,
+ "ials": 10522,
+ "ĠGets": 10523,
+ "peg": 10524,
+ "ĠBuffer": 10525,
+ "~~~~": 10526,
+ "rastructure": 10527,
+ "ĠPL": 10528,
+ "uen": 10529,
+ "obby": 10530,
+ "sizeof": 10531,
+ "Ġpic": 10532,
+ "Ġseed": 10533,
+ "Ġexperienced": 10534,
+ "Ġodd": 10535,
+ "Ġkick": 10536,
+ "Ġprocedure": 10537,
+ "avigator": 10538,
+ "-on": 10539,
+ ",j": 10540,
+ "ĠAlthough": 10541,
+ "ĠuserId": 10542,
+ "accept": 10543,
+ "Blue": 10544,
+ "IColor": 10545,
+ "layer": 10546,
+ "available": 10547,
+ "Ġends": 10548,
+ ".table": 10549,
+ "Ġdataset": 10550,
+ "bus": 10551,
+ "Ġexplain": 10552,
+ "(pro": 10553,
+ "ĠCommittee": 10554,
+ "Ġnoted": 10555,
+ "]:Ċ": 10556,
+ "Dim": 10557,
+ "stdio": 10558,
+ "154": 10559,
+ ".\",Ċ": 10560,
+ "_source": 10561,
+ "181": 10562,
+ "ĠWeek": 10563,
+ "ĠEdge": 10564,
+ "Ġoperating": 10565,
+ "Ġeste": 10566,
+ "ipl": 10567,
+ "330": 10568,
+ "agination": 10569,
+ "Ġproceed": 10570,
+ "Ġanimation": 10571,
+ ".Models": 10572,
+ "ĠWatch": 10573,
+ "iat": 10574,
+ "Ġoppon": 10575,
+ "/A": 10576,
+ "Report": 10577,
+ "Ġsounds": 10578,
+ "_buf": 10579,
+ "IELD": 10580,
+ "Ġbund": 10581,
+ "ĉget": 10582,
+ ".pr": 10583,
+ "(tmp": 10584,
+ "Ġkid": 10585,
+ ">ĊĊĊ": 10586,
+ "Ġyang": 10587,
+ "NotFound": 10588,
+ "ÑĨ": 10589,
+ "math": 10590,
+ "@gmail": 10591,
+ "ĠLIMIT": 10592,
+ "redients": 10593,
+ "Ġvent": 10594,
+ "avigate": 10595,
+ "Look": 10596,
+ "Ġreligious": 10597,
+ "Ġrand": 10598,
+ "rio": 10599,
+ "(GL": 10600,
+ "_ip": 10601,
+ "uan": 10602,
+ "iciency": 10603,
+ "ĠChange": 10604,
+ ">čĊčĊ": 10605,
+ "ĠEntity": 10606,
+ "Ġrencontre": 10607,
+ "ĠRet": 10608,
+ "plan": 10609,
+ "én": 10610,
+ "BOOL": 10611,
+ "uries": 10612,
+ "train": 10613,
+ "Definition": 10614,
+ "============": 10615,
+ "zz": 10616,
+ "450": 10617,
+ "Animation": 10618,
+ "ĠOK": 10619,
+ "_menu": 10620,
+ ".bl": 10621,
+ "_score": 10622,
+ "Ġacad": 10623,
+ "(System": 10624,
+ "Ġrefresh": 10625,
+ "'=>$": 10626,
+ ".Graphics": 10627,
+ "amento": 10628,
+ "pid": 10629,
+ "tc": 10630,
+ "Ġtips": 10631,
+ "Ġhomes": 10632,
+ "Ġfuel": 10633,
+ "âĸ": 10634,
+ "_helper": 10635,
+ "ĠĠčĊ": 10636,
+ "ĠRoom": 10637,
+ ".Close": 10638,
+ "_attr": 10639,
+ "ĠMount": 10640,
+ "ĠEv": 10641,
+ "arser": 10642,
+ "_top": 10643,
+ "eah": 10644,
+ "ĠDelete": 10645,
+ "ãĢį": 10646,
+ "uke": 10647,
+ "Ġusage": 10648,
+ "aria": 10649,
+ "_dev": 10650,
+ "Ġtexture": 10651,
+ "Ġconversation": 10652,
+ "eper": 10653,
+ "Bean": 10654,
+ "done": 10655,
+ "nonatomic": 10656,
+ "ĠSecond": 10657,
+ "Ġshooting": 10658,
+ "_pre": 10659,
+ "Components": 10660,
+ "Ġ]ĊĊ": 10661,
+ "__,": 10662,
+ "stitution": 10663,
+ ".Char": 10664,
+ ">();ĊĊ": 10665,
+ "Ġpresented": 10666,
+ "Ġwa": 10667,
+ "oker": 10668,
+ "-ĊĊ": 10669,
+ "iner": 10670,
+ "Ġbecoming": 10671,
+ "Ġincident": 10672,
+ "Att": 10673,
+ "162": 10674,
+ "Ġrevealed": 10675,
+ "forc": 10676,
+ "Ġboot": 10677,
+ ".page": 10678,
+ "Enumerator": 10679,
+ "165": 10680,
+ "_->": 10681,
+ "Photo": 10682,
+ "Ġspring": 10683,
+ ".\",": 10684,
+ "ĠDictionary": 10685,
+ "BJECT": 10686,
+ "Ġlocations": 10687,
+ "Ġsamples": 10688,
+ "InputStream": 10689,
+ "ĠBrown": 10690,
+ "Ġstats": 10691,
+ "quality": 10692,
+ "Ñħ": 10693,
+ "-dis": 10694,
+ "Ġhelping": 10695,
+ "Ġped": 10696,
+ "224": 10697,
+ "(se": 10698,
+ "ĠWho": 10699,
+ "alian": 10700,
+ "internal": 10701,
+ "Ġft": 10702,
+ ">().": 10703,
+ "->{": 10704,
+ "Ġmine": 10705,
+ "Ġsector": 10706,
+ "Ġgro": 10707,
+ "Ġopportunities": 10708,
+ "Ġü": 10709,
+ "Ġmp": 10710,
+ "Ġalleged": 10711,
+ "Ġdoubt": 10712,
+ "Mouse": 10713,
+ "About": 10714,
+ "_part": 10715,
+ "Ġchair": 10716,
+ "Ġstopped": 10717,
+ "161": 10718,
+ "loop": 10719,
+ "entities": 10720,
+ "Ġapps": 10721,
+ "ansion": 10722,
+ "Ġmental": 10723,
+ "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 10724,
+ "FR": 10725,
+ "Ġdefend": 10726,
+ "care": 10727,
+ "Ġideal": 10728,
+ "/api": 10729,
+ "urface": 10730,
+ "011": 10731,
+ "Ġele": 10732,
+ "ulator": 10733,
+ "ĠRights": 10734,
+ "anguages": 10735,
+ "Ġfunds": 10736,
+ "Ġadapt": 10737,
+ "Attributes": 10738,
+ "Ġdeploy": 10739,
+ "opts": 10740,
+ "Ġvalidation": 10741,
+ "Ġconcerns": 10742,
+ "uce": 10743,
+ ".num": 10744,
+ "ulture": 10745,
+ "ila": 10746,
+ "Ġcup": 10747,
+ "Ġpure": 10748,
+ ".Fore": 10749,
+ "183": 10750,
+ "ĠHashMap": 10751,
+ ".valueOf": 10752,
+ "asm": 10753,
+ "MO": 10754,
+ "Ġcs": 10755,
+ "Ġstores": 10756,
+ "Ġ************************************************************************": 10757,
+ "Ġcommunication": 10758,
+ "mem": 10759,
+ ".EventHandler": 10760,
+ ".Status": 10761,
+ "_right": 10762,
+ ".setOn": 10763,
+ "Sheet": 10764,
+ "Ġidentify": 10765,
+ "enerated": 10766,
+ "ordered": 10767,
+ "Ġ\"[": 10768,
+ "Ġswe": 10769,
+ "Condition": 10770,
+ "ĠAccording": 10771,
+ "Ġprepare": 10772,
+ "Ġrob": 10773,
+ "Pool": 10774,
+ "Ġsport": 10775,
+ "rv": 10776,
+ "ĠRouter": 10777,
+ "Ġalternative": 10778,
+ "([]": 10779,
+ "ĠChicago": 10780,
+ "ipher": 10781,
+ "ische": 10782,
+ "ĠDirector": 10783,
+ "kl": 10784,
+ "ĠWil": 10785,
+ "keys": 10786,
+ "Ġmysql": 10787,
+ "Ġwelcome": 10788,
+ "king": 10789,
+ "ĠManager": 10790,
+ "Ġcaught": 10791,
+ ")}Ċ": 10792,
+ "Score": 10793,
+ "_PR": 10794,
+ "Ġsurvey": 10795,
+ "hab": 10796,
+ "Headers": 10797,
+ "ADER": 10798,
+ "Ġdecor": 10799,
+ "Ġturns": 10800,
+ "Ġradius": 10801,
+ "errupt": 10802,
+ "Cor": 10803,
+ "Ġmel": 10804,
+ "Ġintr": 10805,
+ "(q": 10806,
+ "ĠAC": 10807,
+ "amos": 10808,
+ "MAX": 10809,
+ "ĠGrid": 10810,
+ "ĠJesus": 10811,
+ "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 10812,
+ ".DE": 10813,
+ "Ġts": 10814,
+ "Ġlinked": 10815,
+ "free": 10816,
+ "ĠQt": 10817,
+ "Ġ/**čĊ": 10818,
+ "Ġfaster": 10819,
+ "ctr": 10820,
+ "_J": 10821,
+ "DT": 10822,
+ ".Check": 10823,
+ "Ġcombination": 10824,
+ "Ġintended": 10825,
+ "-the": 10826,
+ "-type": 10827,
+ "182": 10828,
+ "ectors": 10829,
+ "ami": 10830,
+ "uting": 10831,
+ "Ġuma": 10832,
+ "XML": 10833,
+ "UCT": 10834,
+ "Ap": 10835,
+ "ĠRandom": 10836,
+ "Ġran": 10837,
+ ".sort": 10838,
+ "Ġsorted": 10839,
+ ".Un": 10840,
+ "401": 10841,
+ "_PER": 10842,
+ "itory": 10843,
+ "Ġpriority": 10844,
+ "ĠGal": 10845,
+ "ĠOld": 10846,
+ "hot": 10847,
+ "ĠDisplay": 10848,
+ "(sub": 10849,
+ "_TH": 10850,
+ "_Y": 10851,
+ "ĠCare": 10852,
+ "loading": 10853,
+ "Kind": 10854,
+ "_handle": 10855,
+ ",,": 10856,
+ "rase": 10857,
+ "_replace": 10858,
+ ".addEventListener": 10859,
+ "ĠRT": 10860,
+ "172": 10861,
+ "Ġentered": 10862,
+ "gers": 10863,
+ "Ġich": 10864,
+ "(start": 10865,
+ "205": 10866,
+ "/app": 10867,
+ "Ġbrother": 10868,
+ "Memory": 10869,
+ "Outlet": 10870,
+ "Ġutf": 10871,
+ "prec": 10872,
+ "Ġnavigation": 10873,
+ "ORK": 10874,
+ "Ġdst": 10875,
+ "Detail": 10876,
+ "Ġaudience": 10877,
+ "Ġdur": 10878,
+ "Ġcluster": 10879,
+ "unched": 10880,
+ "Ġ],": 10881,
+ "Ġcomfortable": 10882,
+ ".values": 10883,
+ "ĠTotal": 10884,
+ "Ġsnap": 10885,
+ "Ġstandards": 10886,
+ "Ġperformed": 10887,
+ "hand": 10888,
+ "(\"@": 10889,
+ "åŃ": 10890,
+ "Ġphil": 10891,
+ "ibr": 10892,
+ "trim": 10893,
+ "Ġforget": 10894,
+ "157": 10895,
+ "Ġdoctor": 10896,
+ ".TextBox": 10897,
+ "377": 10898,
+ "icons": 10899,
+ ",s": 10900,
+ "ĠOp": 10901,
+ "Sm": 10902,
+ "Stop": 10903,
+ "ĉList": 10904,
+ "ĉu": 10905,
+ "Comment": 10906,
+ "_VERSION": 10907,
+ ".Xtra": 10908,
+ "Person": 10909,
+ "rb": 10910,
+ "LOB": 10911,
+ "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĊ": 10912,
+ "ĠCentral": 10913,
+ "270": 10914,
+ "ICK": 10915,
+ "raq": 10916,
+ "Ġputting": 10917,
+ "Ġmd": 10918,
+ "ĠLove": 10919,
+ "Program": 10920,
+ "Border": 10921,
+ "oor": 10922,
+ "Ġallowing": 10923,
+ "after": 10924,
+ "Ġentries": 10925,
+ "ĠMaybe": 10926,
+ "]).": 10927,
+ "ĠShort": 10928,
+ ")\\": 10929,
+ ".now": 10930,
+ "friend": 10931,
+ "Ġprefer": 10932,
+ "ĠGPIO": 10933,
+ "osis": 10934,
+ "ĠGameObject": 10935,
+ "Ġskip": 10936,
+ "Ġcompetition": 10937,
+ "_match": 10938,
+ "lications": 10939,
+ "_CONT": 10940,
+ ".groupBox": 10941,
+ "Ġals": 10942,
+ "666": 10943,
+ "\"We": 10944,
+ "_eq": 10945,
+ "lan": 10946,
+ "_search": 10947,
+ "ĠMusic": 10948,
+ "asis": 10949,
+ "Ġbind": 10950,
+ "ĠIsland": 10951,
+ "rum": 10952,
+ "(E": 10953,
+ "Ġseat": 10954,
+ "Video": 10955,
+ "Ġack": 10956,
+ "reek": 10957,
+ "={()": 10958,
+ "Ġrating": 10959,
+ "Ġrestaurant": 10960,
+ "456": 10961,
+ "DEX": 10962,
+ "(buf": 10963,
+ "pping": 10964,
+ "uality": 10965,
+ "Ġleague": 10966,
+ "176": 10967,
+ "Ġfocused": 10968,
+ "apon": 10969,
+ "$data": 10970,
+ "CLUD": 10971,
+ "CLUDING": 10972,
+ "Ġabsolute": 10973,
+ "(query": 10974,
+ "Ġtells": 10975,
+ "Ang": 10976,
+ "Ġcommunities": 10977,
+ "Ġhonest": 10978,
+ "oking": 10979,
+ "Ġapart": 10980,
+ "arity": 10981,
+ "/$": 10982,
+ "_module": 10983,
+ "ĠEnc": 10984,
+ ".an": 10985,
+ ".Config": 10986,
+ "Cre": 10987,
+ "Ġshock": 10988,
+ "ĠArab": 10989,
+ "IENT": 10990,
+ "/re": 10991,
+ "Ġretrie": 10992,
+ "ycler": 10993,
+ "isa": 10994,
+ "ĠOrgan": 10995,
+ ".graph": 10996,
+ "Ġí": 10997,
+ "ĠBAS": 10998,
+ "Enum": 10999,
+ "Ġpossibly": 11000,
+ "ÑĢаÐ": 11001,
+ "ĠJapanese": 11002,
+ "Ġcraft": 11003,
+ "ĠPlace": 11004,
+ "Ġtalent": 11005,
+ "Ġfunding": 11006,
+ "Ġconfirmed": 11007,
+ "Ġcycle": 11008,
+ "/x": 11009,
+ "GE": 11010,
+ "Ġhearing": 11011,
+ "Ġplants": 11012,
+ "Ġmouth": 11013,
+ "pages": 11014,
+ "oria": 11015,
+ "ĠRemove": 11016,
+ "_total": 11017,
+ "Ġod": 11018,
+ "ollapse": 11019,
+ "door": 11020,
+ "Ġbought": 11021,
+ "Ġaddr": 11022,
+ "ARCH": 11023,
+ "_dim": 11024,
+ "dden": 11025,
+ "Ġdecades": 11026,
+ "REQUEST": 11027,
+ "Ġversions": 11028,
+ "fire": 11029,
+ "006": 11030,
+ "Ġmoves": 11031,
+ "fb": 11032,
+ "Ġcoffee": 11033,
+ ".connect": 11034,
+ "ĠRow": 11035,
+ "Ġschema": 11036,
+ "Scope": 11037,
+ "-Type": 11038,
+ "Ġfighting": 11039,
+ "Ġretail": 11040,
+ "Ġmodified": 11041,
+ "TF": 11042,
+ "Files": 11043,
+ "nie": 11044,
+ "_command": 11045,
+ "stone": 11046,
+ "ĠÑĤ": 11047,
+ "_thread": 11048,
+ "Ġbond": 11049,
+ "ĠDevelopment": 11050,
+ "Ġpt": 11051,
+ "FORM": 11052,
+ "plet": 11053,
+ "Ġidentified": 11054,
+ "cpp": 11055,
+ "206": 11056,
+ "225": 11057,
+ "Ġcoding": 11058,
+ "oked": 11059,
+ "ĠMaster": 11060,
+ "IDTH": 11061,
+ "Ġresidents": 11062,
+ "redit": 11063,
+ "ĠPhoto": 11064,
+ "=-": 11065,
+ "unte": 11066,
+ "ateur": 11067,
+ "159": 11068,
+ "_STATE": 11069,
+ "ĠSing": 11070,
+ "Ġsheet": 11071,
+ ".val": 11072,
+ "orse": 11073,
+ "Ġhers": 11074,
+ "Ġdetermined": 11075,
+ "Common": 11076,
+ "Ġwed": 11077,
+ "_queue": 11078,
+ "PH": 11079,
+ "ĠAtl": 11080,
+ "cred": 11081,
+ "/LICENSE": 11082,
+ "Ġmes": 11083,
+ "Ġadvanced": 11084,
+ ".java": 11085,
+ ".Sh": 11086,
+ "Go": 11087,
+ "kill": 11088,
+ "fp": 11089,
+ "_settings": 11090,
+ "Ġpal": 11091,
+ "Ġtruck": 11092,
+ "Ġcombined": 11093,
+ "Ġ\"${": 11094,
+ "ĠCorpor": 11095,
+ "Ġjoined": 11096,
+ "ĠJose": 11097,
+ "ĠCup": 11098,
+ "uns": 11099,
+ "estival": 11100,
+ "levision": 11101,
+ "Ġbroken": 11102,
+ "Ġmarriage": 11103,
+ "ĠWestern": 11104,
+ "Ġrepresents": 11105,
+ "ĠTitle": 11106,
+ "Ġss": 11107,
+ ".Ass": 11108,
+ "ongoose": 11109,
+ "iento": 11110,
+ "<>();Ċ": 11111,
+ "Ġabsolutely": 11112,
+ "Ġsmooth": 11113,
+ "TERN": 11114,
+ "ĠUnless": 11115,
+ "Word": 11116,
+ "Ġmerge": 11117,
+ "igan": 11118,
+ "ĠVol": 11119,
+ "Ġnn": 11120,
+ ".getId": 11121,
+ "Ġз": 11122,
+ "171": 11123,
+ "Ġsexy": 11124,
+ "Ġseeking": 11125,
+ "Single": 11126,
+ ".this": 11127,
+ "179": 11128,
+ "Ġkom": 11129,
+ "bound": 11130,
+ ";\"": 11131,
+ "ĠfontSize": 11132,
+ "_df": 11133,
+ "Ġinjury": 11134,
+ "(H": 11135,
+ "Ġissued": 11136,
+ "_END": 11137,
+ ":self": 11138,
+ "020": 11139,
+ "Ġpatch": 11140,
+ "Ġleaves": 11141,
+ "Ġadopt": 11142,
+ "FileName": 11143,
+ "ãĢIJ": 11144,
+ "Ġexecutive": 11145,
+ "ĠByte": 11146,
+ "]))Ċ": 11147,
+ "Ġnu": 11148,
+ "outing": 11149,
+ "cluding": 11150,
+ "-R": 11151,
+ ".options": 11152,
+ "Ġsubstant": 11153,
+ "avax": 11154,
+ "ĠBUT": 11155,
+ "Ġtechnical": 11156,
+ "Ġtwice": 11157,
+ "Ġmás": 11158,
+ "Ġunivers": 11159,
+ "yr": 11160,
+ "Ġdrag": 11161,
+ "ĠDC": 11162,
+ "Ġsed": 11163,
+ "Ġbot": 11164,
+ "ĠPal": 11165,
+ "ĠHall": 11166,
+ "forcement": 11167,
+ "Ġauch": 11168,
+ ".mod": 11169,
+ "notation": 11170,
+ "_files": 11171,
+ ".line": 11172,
+ "_flag": 11173,
+ "[name": 11174,
+ "Ġresolution": 11175,
+ "Ġbott": 11176,
+ "(\"[": 11177,
+ "ende": 11178,
+ "(arr": 11179,
+ "Free": 11180,
+ "(@\"": 11181,
+ "ĠDistrict": 11182,
+ "PEC": 11183,
+ ":-": 11184,
+ "Picker": 11185,
+ "ĠJo": 11186,
+ "ĠĠĠĠĠĊ": 11187,
+ "ĠRiver": 11188,
+ "_rows": 11189,
+ "Ġhelpful": 11190,
+ "Ġmassive": 11191,
+ "---Ċ": 11192,
+ "Ġmeasures": 11193,
+ "007": 11194,
+ "ĠRuntime": 11195,
+ "Ġworry": 11196,
+ "ĠSpec": 11197,
+ "ĉD": 11198,
+ "ãĢij": 11199,
+ "Ġ){Ċ": 11200,
+ "Ġworse": 11201,
+ "(filename": 11202,
+ "Ġlay": 11203,
+ "Ġmagic": 11204,
+ "ĠTheir": 11205,
+ "oul": 11206,
+ "stroy": 11207,
+ "ĠWhere": 11208,
+ "280": 11209,
+ "Ġsudden": 11210,
+ "Ġdefe": 11211,
+ "Ġbinding": 11212,
+ "Ġflight": 11213,
+ "ĠOnInit": 11214,
+ "ĠWomen": 11215,
+ "ĠPolicy": 11216,
+ "Ġdrugs": 11217,
+ "ishing": 11218,
+ "('../": 11219,
+ "ĠMel": 11220,
+ "peat": 11221,
+ "tor": 11222,
+ "Ġproposed": 11223,
+ "Ġstated": 11224,
+ "_RES": 11225,
+ "Ġeast": 11226,
+ "212": 11227,
+ "ĠCONDITION": 11228,
+ "_desc": 11229,
+ "Ġwinning": 11230,
+ "folio": 11231,
+ "Mapper": 11232,
+ "ĠPan": 11233,
+ "ĠAnge": 11234,
+ ".servlet": 11235,
+ "Ġcopies": 11236,
+ "LM": 11237,
+ "Ġvm": 11238,
+ "åį": 11239,
+ "Ġdictionary": 11240,
+ "Seg": 11241,
+ "177": 11242,
+ "elines": 11243,
+ "ĠSend": 11244,
+ "Ġiron": 11245,
+ "ĠFort": 11246,
+ "166": 11247,
+ ".domain": 11248,
+ "Ġdebate": 11249,
+ "NotNull": 11250,
+ "eq": 11251,
+ "acher": 11252,
+ "lf": 11253,
+ "ĉfmt": 11254,
+ "Ġlawy": 11255,
+ "178": 11256,
+ "ÄŁ": 11257,
+ "ĠMen": 11258,
+ "Ġtrim": 11259,
+ "(NULL": 11260,
+ "Ġ!!": 11261,
+ "Ġpad": 11262,
+ "Ġfollows": 11263,
+ "\"][\"": 11264,
+ "requ": 11265,
+ "ĠEp": 11266,
+ ".github": 11267,
+ "(img": 11268,
+ "eto": 11269,
+ "('\\": 11270,
+ "Services": 11271,
+ "umbnail": 11272,
+ "_main": 11273,
+ "pleted": 11274,
+ "fortunately": 11275,
+ "Ġwindows": 11276,
+ "Ġplane": 11277,
+ "ĠConnection": 11278,
+ ".local": 11279,
+ "uard": 11280,
+ "}\\": 11281,
+ "==\"": 11282,
+ "andon": 11283,
+ "ĠRoy": 11284,
+ "west": 11285,
+ "158": 11286,
+ "iginal": 11287,
+ "emies": 11288,
+ "itz": 11289,
+ "'):Ċ": 11290,
+ "ĠPeter": 11291,
+ "Ġtough": 11292,
+ "Ġreduced": 11293,
+ "Ġcalculate": 11294,
+ "Ġrapid": 11295,
+ "customer": 11296,
+ "Ġefficient": 11297,
+ "Ġmedium": 11298,
+ "Ġfell": 11299,
+ ".ref": 11300,
+ "ĠCas": 11301,
+ "Ġfeedback": 11302,
+ "Speed": 11303,
+ "(output": 11304,
+ "aje": 11305,
+ "Ġcategories": 11306,
+ "Ġfee": 11307,
+ "};": 11308,
+ "Ġdeleted": 11309,
+ "reh": 11310,
+ "Ġproof": 11311,
+ "Desc": 11312,
+ "Build": 11313,
+ "Ġsides": 11314,
+ ".ArrayList": 11315,
+ "-%": 11316,
+ "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 11317,
+ "ر": 11318,
+ ".match": 11319,
+ "ли": 11320,
+ "Ġfeels": 11321,
+ "Ġachieve": 11322,
+ "Ġclim": 11323,
+ "_ON": 11324,
+ "ĠCD": 11325,
+ "Ġteacher": 11326,
+ "_current": 11327,
+ "bn": 11328,
+ "_PL": 11329,
+ "isting": 11330,
+ "Enable": 11331,
+ "GEN": 11332,
+ "Ġtv": 11333,
+ "Ġsock": 11334,
+ "Ġplays": 11335,
+ "Ġdiscount": 11336,
+ "ĠKE": 11337,
+ "ĠDebug": 11338,
+ "Fore": 11339,
+ "ĠIraq": 11340,
+ "Ġappearance": 11341,
+ "Mon": 11342,
+ "Ġstyled": 11343,
+ "ĠHuman": 11344,
+ "iot": 11345,
+ "ĠHistory": 11346,
+ "Ġsac": 11347,
+ "ĠCollection": 11348,
+ "Ġrecommended": 11349,
+ ".Selected": 11350,
+ "Ġorganizations": 11351,
+ "Ġdiscovered": 11352,
+ "cohol": 11353,
+ "adas": 11354,
+ "ĠThomas": 11355,
+ "May": 11356,
+ "Ġconserv": 11357,
+ "Ġdomin": 11358,
+ "ĠFollow": 11359,
+ "ĠSection": 11360,
+ "ĠThanks": 11361,
+ "Username": 11362,
+ "Ġrecipe": 11363,
+ "Ġwonderful": 11364,
+ ".sleep": 11365,
+ "_if": 11366,
+ "ĉĊĉĊ": 11367,
+ "orno": 11368,
+ "Ġru": 11369,
+ "_target": 11370,
+ ".\"\"": 11371,
+ "à¦": 11372,
+ "EventArgs": 11373,
+ "Ġinputs": 11374,
+ "Ġfif": 11375,
+ "Ġvision": 11376,
+ "cy": 11377,
+ "ĠSeries": 11378,
+ ")(((": 11379,
+ "Ġtrading": 11380,
+ "Ġmarker": 11381,
+ "Begin": 11382,
+ "Ġtypically": 11383,
+ "Ġcauses": 11384,
+ "dropdown": 11385,
+ "_DEBUG": 11386,
+ "260": 11387,
+ "Ġdetect": 11388,
+ "country": 11389,
+ "!\");Ċ": 11390,
+ "ĉR": 11391,
+ "appy": 11392,
+ "Ġcref": 11393,
+ "('<": 11394,
+ "\"=>": 11395,
+ "ĠLE": 11396,
+ "reader": 11397,
+ "Ġadministr": 11398,
+ "õ": 11399,
+ "ucket": 11400,
+ "Ġfashion": 11401,
+ ".char": 11402,
+ "izar": 11403,
+ "Ġdisable": 11404,
+ "Ġsuc": 11405,
+ "ĠLive": 11406,
+ "issue": 11407,
+ "Ġmetadata": 11408,
+ "flags": 11409,
+ "ĠðŁ": 11410,
+ "Ġcommitted": 11411,
+ "Ġva": 11412,
+ "Ġrough": 11413,
+ "Ġ'''Ċ": 11414,
+ "Ġhighlight": 11415,
+ "_vars": 11416,
+ "VO": 11417,
+ "Ġencoding": 11418,
+ "-Z": 11419,
+ "_sign": 11420,
+ "$(\"#": 11421,
+ "Ġrain": 11422,
+ "reatest": 11423,
+ "ĠEND": 11424,
+ "Selection": 11425,
+ "Ġcandidates": 11426,
+ "Ġsav": 11427,
+ ".Empty": 11428,
+ "Ġdecisions": 11429,
+ "Ġcollabor": 11430,
+ "ridge": 11431,
+ "feed": 11432,
+ "ression": 11433,
+ "Ġpersons": 11434,
+ "VM": 11435,
+ "008": 11436,
+ "ega": 11437,
+ "_BIT": 11438,
+ "According": 11439,
+ "acked": 11440,
+ "Ġdollars": 11441,
+ "_loss": 11442,
+ "ĠCost": 11443,
+ "}\"Ċ": 11444,
+ "Notification": 11445,
+ "Ġprostit": 11446,
+ "Ġauthority": 11447,
+ ".rec": 11448,
+ "Ġspokes": 11449,
+ "ĠToday": 11450,
+ "istant": 11451,
+ "ĠHead": 11452,
+ "âĢĿ.": 11453,
+ "ertainment": 11454,
+ "cean": 11455,
+ "culate": 11456,
+ "Ġven": 11457,
+ "However": 11458,
+ "_arr": 11459,
+ "Ġtokens": 11460,
+ "Graph": 11461,
+ "ĠJud": 11462,
+ "ĠVirgin": 11463,
+ "ĠSerial": 11464,
+ "unning": 11465,
+ "Mutable": 11466,
+ "agers": 11467,
+ ".csv": 11468,
+ "Ġdeveloping": 11469,
+ "Ġinstructions": 11470,
+ "Ġpromise": 11471,
+ "Ġrequested": 11472,
+ "_encode": 11473,
+ "/\"": 11474,
+ "ĠIcon": 11475,
+ "uilt": 11476,
+ "-day": 11477,
+ "Ġintelligence": 11478,
+ ".IS": 11479,
+ "ĠObservable": 11480,
+ "ĠHard": 11481,
+ "Bool": 11482,
+ "211": 11483,
+ "idential": 11484,
+ ".Anchor": 11485,
+ "Ġselling": 11486,
+ "CI": 11487,
+ "AGES": 11488,
+ "tle": 11489,
+ "bur": 11490,
+ "UFFER": 11491,
+ "RY": 11492,
+ "Ġbigger": 11493,
+ "Ġrat": 11494,
+ "Ġfamous": 11495,
+ "Ġtypename": 11496,
+ "Ġexplained": 11497,
+ "}}Ċ": 11498,
+ "Ġnuclear": 11499,
+ "-N": 11500,
+ "Ġcrisis": 11501,
+ "ĠEnter": 11502,
+ "Ġanswers": 11503,
+ "/${": 11504,
+ "/pl": 11505,
+ "Ġsequ": 11506,
+ "_next": 11507,
+ "mask": 11508,
+ "Ġstanding": 11509,
+ "Ġplenty": 11510,
+ "ĠCross": 11511,
+ "ĉret": 11512,
+ "dro": 11513,
+ "ĠCast": 11514,
+ "167": 11515,
+ "=true": 11516,
+ "ĠChris": 11517,
+ "icio": 11518,
+ "ĠMike": 11519,
+ "Decimal": 11520,
+ "addComponent": 11521,
+ "Len": 11522,
+ "Ġcock": 11523,
+ "Ġ#{": 11524,
+ "URN": 11525,
+ "": 11657,
+ "Ġ*=": 11658,
+ "ĠPS": 11659,
+ "Ġdangerous": 11660,
+ "[p": 11661,
+ "OME": 11662,
+ "Other": 11663,
+ "ĠStringBuilder": 11664,
+ "Points": 11665,
+ "heading": 11666,
+ "Ġcurrency": 11667,
+ "Ġpercentage": 11668,
+ "_API": 11669,
+ "Ġclassic": 11670,
+ "thead": 11671,
+ "ĠMO": 11672,
+ "FE": 11673,
+ "Idx": 11674,
+ "await": 11675,
+ "Ġè": 11676,
+ "Ġaccident": 11677,
+ "Ġvariant": 11678,
+ "Ġmyst": 11679,
+ "ĠLand": 11680,
+ "ĠBre": 11681,
+ "Ġharm": 11682,
+ "ĠAcc": 11683,
+ "Ġcharged": 11684,
+ "iones": 11685,
+ "Visibility": 11686,
+ "arry": 11687,
+ "ĠLanguage": 11688,
+ "Ġwalking": 11689,
+ "\".ĊĊ": 11690,
+ "ifer": 11691,
+ "Ġleadership": 11692,
+ ".From": 11693,
+ "ynam": 11694,
+ "Ġtimestamp": 11695,
+ "ipt": 11696,
+ "ĠHas": 11697,
+ "REFER": 11698,
+ "ĠIts": 11699,
+ "Ġlistener": 11700,
+ "UTE": 11701,
+ "213": 11702,
+ "_description": 11703,
+ "Ġexperiences": 11704,
+ "Ġcreates": 11705,
+ "RS": 11706,
+ "cart": 11707,
+ "black": 11708,
+ "Ġchoices": 11709,
+ "war": 11710,
+ "750": 11711,
+ "Ġ'''": 11712,
+ "Ġordered": 11713,
+ "Ġevening": 11714,
+ "Ġpil": 11715,
+ "Ġtun": 11716,
+ "ĠBad": 11717,
+ "(app": 11718,
+ "random": 11719,
+ "Ġexplicit": 11720,
+ "Ġarrived": 11721,
+ "Ġfly": 11722,
+ "Ġeconom": 11723,
+ "-mail": 11724,
+ "Ġlists": 11725,
+ "Ġarchitect": 11726,
+ "234": 11727,
+ "ĠPay": 11728,
+ "Ġds": 11729,
+ "ĠSol": 11730,
+ "Ġvehicles": 11731,
+ "Hz": 11732,
+ "-com": 11733,
+ "Ġking": 11734,
+ "_equal": 11735,
+ "ĠHelp": 11736,
+ "Ġabuse": 11737,
+ "480": 11738,
+ "169": 11739,
+ "--;Ċ": 11740,
+ "Ġextr": 11741,
+ "Ġchemical": 11742,
+ "ä¿": 11743,
+ "Ġorient": 11744,
+ "Ġbreath": 11745,
+ "ĠSpace": 11746,
+ "(element": 11747,
+ "wait": 11748,
+ "DED": 11749,
+ "igma": 11750,
+ "Ġentr": 11751,
+ "Ġsob": 11752,
+ "-name": 11753,
+ "Ġaffected": 11754,
+ "ika": 11755,
+ "Ġcoal": 11756,
+ "_work": 11757,
+ "Ġhundreds": 11758,
+ "Ġpolitics": 11759,
+ "subject": 11760,
+ "Ġconsumer": 11761,
+ "ANGE": 11762,
+ "Ġrepeated": 11763,
+ "Send": 11764,
+ "Ġ#[": 11765,
+ "Ġprotocol": 11766,
+ "Ġleads": 11767,
+ "useum": 11768,
+ "Every": 11769,
+ "808": 11770,
+ "174": 11771,
+ "Import": 11772,
+ "(count": 11773,
+ "Ġchallenges": 11774,
+ "Ġnovel": 11775,
+ "Ġdepart": 11776,
+ "bits": 11777,
+ ".Current": 11778,
+ "Ġ`${": 11779,
+ "oting": 11780,
+ "(\\": 11781,
+ "Ġcreative": 11782,
+ "Ġbuff": 11783,
+ "Ġintroduced": 11784,
+ "usic": 11785,
+ "modules": 11786,
+ "Are": 11787,
+ "-doc": 11788,
+ "language": 11789,
+ "_cache": 11790,
+ "Ġtod": 11791,
+ "?>": 11792,
+ "omething": 11793,
+ "Ġhun": 11794,
+ "åº": 11795,
+ "aters": 11796,
+ "Intent": 11797,
+ "Ġimplemented": 11798,
+ "ĠCase": 11799,
+ "Children": 11800,
+ "Ġnotification": 11801,
+ "Renderer": 11802,
+ "Wrapper": 11803,
+ "Objects": 11804,
+ "tl": 11805,
+ ".Contains": 11806,
+ "Plugin": 11807,
+ ".row": 11808,
+ "Ġforg": 11809,
+ "Ġpermit": 11810,
+ "Ġtargets": 11811,
+ "ĠIF": 11812,
+ "Ġtip": 11813,
+ "sex": 11814,
+ "Ġsupports": 11815,
+ "Ġfold": 11816,
+ "photo": 11817,
+ "},čĊ": 11818,
+ "Ġgoogle": 11819,
+ "$('#": 11820,
+ "Ġsharing": 11821,
+ "Ġgoods": 11822,
+ "vs": 11823,
+ "ĠDan": 11824,
+ "Rate": 11825,
+ "ĠMartin": 11826,
+ "Ġmanner": 11827,
+ "lie": 11828,
+ ".The": 11829,
+ "Internal": 11830,
+ "ĠCONTR": 11831,
+ "Mock": 11832,
+ "RIGHT": 11833,
+ "Ġ'{": 11834,
+ "Ġcontrols": 11835,
+ "Mat": 11836,
+ "Ġmand": 11837,
+ "Ġextended": 11838,
+ "Ok": 11839,
+ "Ġembed": 11840,
+ "Ġplanet": 11841,
+ "ĠNon": 11842,
+ "-ch": 11843,
+ ")\",": 11844,
+ "epar": 11845,
+ "Ġbelieved": 11846,
+ "ĠEnvironment": 11847,
+ "ĠFriend": 11848,
+ "-res": 11849,
+ "Ġhandling": 11850,
+ "nic": 11851,
+ "-level": 11852,
+ "scri": 11853,
+ "Xml": 11854,
+ "BE": 11855,
+ "ungen": 11856,
+ "Ġalter": 11857,
+ "[idx": 11858,
+ "Pop": 11859,
+ "cam": 11860,
+ "Ġ(((": 11861,
+ "Ġshipping": 11862,
+ "Ġbattery": 11863,
+ "iddleware": 11864,
+ "MC": 11865,
+ "Ġimpl": 11866,
+ "otation": 11867,
+ "ĠLab": 11868,
+ "