{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 16.0,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.16,
      "grad_norm": 1.0079981088638306,
      "learning_rate": 4e-05,
      "loss": 2.3624,
      "step": 1
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.0123796463012695,
      "learning_rate": 8e-05,
      "loss": 2.4117,
      "step": 2
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.0385504961013794,
      "learning_rate": 0.00012,
      "loss": 2.4351,
      "step": 3
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.7601240277290344,
      "learning_rate": 0.00016,
      "loss": 1.9867,
      "step": 4
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.9805667400360107,
      "learning_rate": 0.0002,
      "loss": 2.0655,
      "step": 5
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.6322834491729736,
      "learning_rate": 0.00019789473684210526,
      "loss": 1.8377,
      "step": 6
    },
    {
      "epoch": 1.12,
      "grad_norm": 1.1208696365356445,
      "learning_rate": 0.00019578947368421054,
      "loss": 1.5558,
      "step": 7
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.3962080478668213,
      "learning_rate": 0.0001936842105263158,
      "loss": 1.4199,
      "step": 8
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.4532853364944458,
      "learning_rate": 0.00019157894736842104,
      "loss": 1.2996,
      "step": 9
    },
    {
      "epoch": 1.6,
      "grad_norm": 2.3988616466522217,
      "learning_rate": 0.00018947368421052632,
      "loss": 1.2371,
      "step": 10
    },
    {
      "epoch": 1.76,
      "grad_norm": 1.3582508563995361,
      "learning_rate": 0.0001873684210526316,
      "loss": 1.1839,
      "step": 11
    },
    {
      "epoch": 1.92,
      "grad_norm": 1.2997236251831055,
      "learning_rate": 0.00018526315789473685,
      "loss": 0.983,
      "step": 12
    },
    {
      "epoch": 2.08,
      "grad_norm": 1.1868802309036255,
      "learning_rate": 0.0001831578947368421,
      "loss": 0.7447,
      "step": 13
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.0286939144134521,
      "learning_rate": 0.00018105263157894739,
      "loss": 0.8524,
      "step": 14
    },
    {
      "epoch": 2.4,
      "grad_norm": 1.00070321559906,
      "learning_rate": 0.00017894736842105264,
      "loss": 0.8649,
      "step": 15
    },
    {
      "epoch": 2.56,
      "grad_norm": 1.4189987182617188,
      "learning_rate": 0.0001768421052631579,
      "loss": 0.8116,
      "step": 16
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 1.2303727865219116,
      "learning_rate": 0.00017473684210526317,
      "loss": 0.8071,
      "step": 17
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.9925879240036011,
      "learning_rate": 0.00017263157894736842,
      "loss": 0.7081,
      "step": 18
    },
    {
      "epoch": 3.04,
      "grad_norm": 1.0683646202087402,
      "learning_rate": 0.0001705263157894737,
      "loss": 0.5269,
      "step": 19
    },
    {
      "epoch": 3.2,
      "grad_norm": 1.0474812984466553,
      "learning_rate": 0.00016842105263157895,
      "loss": 0.6947,
      "step": 20
    },
    {
      "epoch": 3.36,
      "grad_norm": 1.0291672945022583,
      "learning_rate": 0.00016631578947368423,
      "loss": 0.5014,
      "step": 21
    },
    {
      "epoch": 3.52,
      "grad_norm": 1.1327933073043823,
      "learning_rate": 0.00016421052631578948,
      "loss": 0.481,
      "step": 22
    },
    {
      "epoch": 3.68,
      "grad_norm": 1.4890342950820923,
      "learning_rate": 0.00016210526315789473,
      "loss": 0.5253,
      "step": 23
    },
    {
      "epoch": 3.84,
      "grad_norm": 1.532833456993103,
      "learning_rate": 0.00016,
      "loss": 0.4937,
      "step": 24
    },
    {
      "epoch": 4.0,
      "grad_norm": 1.7453362941741943,
      "learning_rate": 0.00015789473684210527,
      "loss": 0.5188,
      "step": 25
    },
    {
      "epoch": 4.16,
      "grad_norm": 1.2242546081542969,
      "learning_rate": 0.00015578947368421052,
      "loss": 0.1893,
      "step": 26
    },
    {
      "epoch": 4.32,
      "grad_norm": 1.7437238693237305,
      "learning_rate": 0.0001536842105263158,
      "loss": 0.433,
      "step": 27
    },
    {
      "epoch": 4.48,
      "grad_norm": 1.4618209600448608,
      "learning_rate": 0.00015157894736842108,
      "loss": 0.3996,
      "step": 28
    },
    {
      "epoch": 4.64,
      "grad_norm": 1.3685592412948608,
      "learning_rate": 0.00014947368421052633,
      "loss": 0.2189,
      "step": 29
    },
    {
      "epoch": 4.8,
      "grad_norm": 1.741402268409729,
      "learning_rate": 0.00014736842105263158,
      "loss": 0.2934,
      "step": 30
    },
    {
      "epoch": 4.96,
      "grad_norm": 1.5545222759246826,
      "learning_rate": 0.00014526315789473686,
      "loss": 0.2099,
      "step": 31
    },
    {
      "epoch": 5.12,
      "grad_norm": 1.2092806100845337,
      "learning_rate": 0.0001431578947368421,
      "loss": 0.1916,
      "step": 32
    },
    {
      "epoch": 5.28,
      "grad_norm": 1.7175395488739014,
      "learning_rate": 0.00014105263157894736,
      "loss": 0.2527,
      "step": 33
    },
    {
      "epoch": 5.44,
      "grad_norm": 1.368059754371643,
      "learning_rate": 0.00013894736842105264,
      "loss": 0.114,
      "step": 34
    },
    {
      "epoch": 5.6,
      "grad_norm": 1.6632587909698486,
      "learning_rate": 0.0001368421052631579,
      "loss": 0.1549,
      "step": 35
    },
    {
      "epoch": 5.76,
      "grad_norm": 1.6607255935668945,
      "learning_rate": 0.00013473684210526317,
      "loss": 0.1171,
      "step": 36
    },
    {
      "epoch": 5.92,
      "grad_norm": 2.4954917430877686,
      "learning_rate": 0.00013263157894736842,
      "loss": 0.1614,
      "step": 37
    },
    {
      "epoch": 6.08,
      "grad_norm": 1.7216722965240479,
      "learning_rate": 0.0001305263157894737,
      "loss": 0.1459,
      "step": 38
    },
    {
      "epoch": 6.24,
      "grad_norm": 0.9449135065078735,
      "learning_rate": 0.00012842105263157895,
      "loss": 0.1001,
      "step": 39
    },
    {
      "epoch": 6.4,
      "grad_norm": 1.4137742519378662,
      "learning_rate": 0.0001263157894736842,
      "loss": 0.0859,
      "step": 40
    },
    {
      "epoch": 6.5600000000000005,
      "grad_norm": 1.8110110759735107,
      "learning_rate": 0.00012421052631578949,
      "loss": 0.1404,
      "step": 41
    },
    {
      "epoch": 6.72,
      "grad_norm": 1.1322952508926392,
      "learning_rate": 0.00012210526315789474,
      "loss": 0.0687,
      "step": 42
    },
    {
      "epoch": 6.88,
      "grad_norm": 2.2961461544036865,
      "learning_rate": 0.00012,
      "loss": 0.1203,
      "step": 43
    },
    {
      "epoch": 7.04,
      "grad_norm": 1.5652666091918945,
      "learning_rate": 0.00011789473684210525,
      "loss": 0.1299,
      "step": 44
    },
    {
      "epoch": 7.2,
      "grad_norm": 0.7390972375869751,
      "learning_rate": 0.00011578947368421053,
      "loss": 0.0595,
      "step": 45
    },
    {
      "epoch": 7.36,
      "grad_norm": 1.0376925468444824,
      "learning_rate": 0.0001136842105263158,
      "loss": 0.0578,
      "step": 46
    },
    {
      "epoch": 7.52,
      "grad_norm": 0.9976247549057007,
      "learning_rate": 0.00011157894736842105,
      "loss": 0.0695,
      "step": 47
    },
    {
      "epoch": 7.68,
      "grad_norm": 1.0853309631347656,
      "learning_rate": 0.00010947368421052633,
      "loss": 0.0985,
      "step": 48
    },
    {
      "epoch": 7.84,
      "grad_norm": 1.3621833324432373,
      "learning_rate": 0.00010736842105263158,
      "loss": 0.1269,
      "step": 49
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.8868013024330139,
      "learning_rate": 0.00010526315789473685,
      "loss": 0.0641,
      "step": 50
    },
    {
      "epoch": 8.16,
      "grad_norm": 0.6473409533500671,
      "learning_rate": 0.00010315789473684211,
      "loss": 0.0474,
      "step": 51
    },
    {
      "epoch": 8.32,
      "grad_norm": 1.6032112836837769,
      "learning_rate": 0.00010105263157894738,
      "loss": 0.0597,
      "step": 52
    },
    {
      "epoch": 8.48,
      "grad_norm": 1.120687484741211,
      "learning_rate": 9.894736842105263e-05,
      "loss": 0.0582,
      "step": 53
    },
    {
      "epoch": 8.64,
      "grad_norm": 0.7064136862754822,
      "learning_rate": 9.68421052631579e-05,
      "loss": 0.0557,
      "step": 54
    },
    {
      "epoch": 8.8,
      "grad_norm": 0.5838208794593811,
      "learning_rate": 9.473684210526316e-05,
      "loss": 0.0436,
      "step": 55
    },
    {
      "epoch": 8.96,
      "grad_norm": 1.2315547466278076,
      "learning_rate": 9.263157894736843e-05,
      "loss": 0.063,
      "step": 56
    },
    {
      "epoch": 9.12,
      "grad_norm": 0.3518936336040497,
      "learning_rate": 9.052631578947369e-05,
      "loss": 0.0311,
      "step": 57
    },
    {
      "epoch": 9.28,
      "grad_norm": 0.6926944851875305,
      "learning_rate": 8.842105263157894e-05,
      "loss": 0.039,
      "step": 58
    },
    {
      "epoch": 9.44,
      "grad_norm": 0.26300671696662903,
      "learning_rate": 8.631578947368421e-05,
      "loss": 0.0252,
      "step": 59
    },
    {
      "epoch": 9.6,
      "grad_norm": 0.7903566360473633,
      "learning_rate": 8.421052631578948e-05,
      "loss": 0.0415,
      "step": 60
    },
    {
      "epoch": 9.76,
      "grad_norm": 0.5427919626235962,
      "learning_rate": 8.210526315789474e-05,
      "loss": 0.0453,
      "step": 61
    },
    {
      "epoch": 9.92,
      "grad_norm": 0.5827217698097229,
      "learning_rate": 8e-05,
      "loss": 0.0368,
      "step": 62
    },
    {
      "epoch": 10.08,
      "grad_norm": 1.45575749874115,
      "learning_rate": 7.789473684210526e-05,
      "loss": 0.0736,
      "step": 63
    },
    {
      "epoch": 10.24,
      "grad_norm": 0.32767948508262634,
      "learning_rate": 7.578947368421054e-05,
      "loss": 0.0316,
      "step": 64
    },
    {
      "epoch": 10.4,
      "grad_norm": 0.30059218406677246,
      "learning_rate": 7.368421052631579e-05,
      "loss": 0.0277,
      "step": 65
    },
    {
      "epoch": 10.56,
      "grad_norm": 0.4859299659729004,
      "learning_rate": 7.157894736842105e-05,
      "loss": 0.0313,
      "step": 66
    },
    {
      "epoch": 10.72,
      "grad_norm": 0.4874284267425537,
      "learning_rate": 6.947368421052632e-05,
      "loss": 0.0322,
      "step": 67
    },
    {
      "epoch": 10.88,
      "grad_norm": 0.41711848974227905,
      "learning_rate": 6.736842105263159e-05,
      "loss": 0.0389,
      "step": 68
    },
    {
      "epoch": 11.04,
      "grad_norm": 0.8408872485160828,
      "learning_rate": 6.526315789473685e-05,
      "loss": 0.0312,
      "step": 69
    },
    {
      "epoch": 11.2,
      "grad_norm": 0.32355204224586487,
      "learning_rate": 6.31578947368421e-05,
      "loss": 0.0328,
      "step": 70
    },
    {
      "epoch": 11.36,
      "grad_norm": 0.42406928539276123,
      "learning_rate": 6.105263157894737e-05,
      "loss": 0.0277,
      "step": 71
    },
    {
      "epoch": 11.52,
      "grad_norm": 0.7678600549697876,
      "learning_rate": 5.894736842105263e-05,
      "loss": 0.0329,
      "step": 72
    },
    {
      "epoch": 11.68,
      "grad_norm": 0.29065871238708496,
      "learning_rate": 5.68421052631579e-05,
      "loss": 0.0297,
      "step": 73
    },
    {
      "epoch": 11.84,
      "grad_norm": 0.5853772163391113,
      "learning_rate": 5.4736842105263165e-05,
      "loss": 0.0393,
      "step": 74
    },
    {
      "epoch": 12.0,
      "grad_norm": 0.7088480591773987,
      "learning_rate": 5.2631578947368424e-05,
      "loss": 0.0344,
      "step": 75
    },
    {
      "epoch": 12.16,
      "grad_norm": 0.19609542191028595,
      "learning_rate": 5.052631578947369e-05,
      "loss": 0.0232,
      "step": 76
    },
    {
      "epoch": 12.32,
      "grad_norm": 0.31028512120246887,
      "learning_rate": 4.842105263157895e-05,
      "loss": 0.0273,
      "step": 77
    },
    {
      "epoch": 12.48,
      "grad_norm": 0.4248906672000885,
      "learning_rate": 4.6315789473684214e-05,
      "loss": 0.0315,
      "step": 78
    },
    {
      "epoch": 12.64,
      "grad_norm": 0.4214076101779938,
      "learning_rate": 4.421052631578947e-05,
      "loss": 0.0309,
      "step": 79
    },
    {
      "epoch": 12.8,
      "grad_norm": 0.4250756502151489,
      "learning_rate": 4.210526315789474e-05,
      "loss": 0.0285,
      "step": 80
    },
    {
      "epoch": 12.96,
      "grad_norm": 0.2500416934490204,
      "learning_rate": 4e-05,
      "loss": 0.0256,
      "step": 81
    },
    {
      "epoch": 13.12,
      "grad_norm": 0.2516506314277649,
      "learning_rate": 3.789473684210527e-05,
      "loss": 0.0244,
      "step": 82
    },
    {
      "epoch": 13.28,
      "grad_norm": 0.217052161693573,
      "learning_rate": 3.578947368421053e-05,
      "loss": 0.0241,
      "step": 83
    },
    {
      "epoch": 13.44,
      "grad_norm": 0.4375220835208893,
      "learning_rate": 3.368421052631579e-05,
      "loss": 0.0308,
      "step": 84
    },
    {
      "epoch": 13.6,
      "grad_norm": 0.23626229166984558,
      "learning_rate": 3.157894736842105e-05,
      "loss": 0.029,
      "step": 85
    },
    {
      "epoch": 13.76,
      "grad_norm": 0.3816908001899719,
      "learning_rate": 2.9473684210526314e-05,
      "loss": 0.0251,
      "step": 86
    },
    {
      "epoch": 13.92,
      "grad_norm": 0.17371943593025208,
      "learning_rate": 2.7368421052631583e-05,
      "loss": 0.0203,
      "step": 87
    },
    {
      "epoch": 14.08,
      "grad_norm": 0.21958455443382263,
      "learning_rate": 2.5263157894736845e-05,
      "loss": 0.0265,
      "step": 88
    },
    {
      "epoch": 14.24,
      "grad_norm": 0.2628728151321411,
      "learning_rate": 2.3157894736842107e-05,
      "loss": 0.0242,
      "step": 89
    },
    {
      "epoch": 14.4,
      "grad_norm": 0.2763591408729553,
      "learning_rate": 2.105263157894737e-05,
      "loss": 0.0299,
      "step": 90
    },
    {
      "epoch": 14.56,
      "grad_norm": 0.2944229245185852,
      "learning_rate": 1.8947368421052634e-05,
      "loss": 0.0244,
      "step": 91
    },
    {
      "epoch": 14.72,
      "grad_norm": 0.28353527188301086,
      "learning_rate": 1.6842105263157896e-05,
      "loss": 0.0241,
      "step": 92
    },
    {
      "epoch": 14.88,
      "grad_norm": 0.2161315530538559,
      "learning_rate": 1.4736842105263157e-05,
      "loss": 0.024,
      "step": 93
    },
    {
      "epoch": 15.04,
      "grad_norm": 0.2228800654411316,
      "learning_rate": 1.2631578947368422e-05,
      "loss": 0.0263,
      "step": 94
    },
    {
      "epoch": 15.2,
      "grad_norm": 0.17299261689186096,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 0.0227,
      "step": 95
    },
    {
      "epoch": 15.36,
      "grad_norm": 0.21846872568130493,
      "learning_rate": 8.421052631578948e-06,
      "loss": 0.0223,
      "step": 96
    },
    {
      "epoch": 15.52,
      "grad_norm": 0.23234839737415314,
      "learning_rate": 6.315789473684211e-06,
      "loss": 0.0269,
      "step": 97
    },
    {
      "epoch": 15.68,
      "grad_norm": 0.217283234000206,
      "learning_rate": 4.210526315789474e-06,
      "loss": 0.0259,
      "step": 98
    },
    {
      "epoch": 15.84,
      "grad_norm": 0.2666471600532532,
      "learning_rate": 2.105263157894737e-06,
      "loss": 0.027,
      "step": 99
    },
    {
      "epoch": 16.0,
      "grad_norm": 0.2889624536037445,
      "learning_rate": 0.0,
      "loss": 0.0248,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 17,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2992005070258176.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}