{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 0, "global_step": 452, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004424778761061947, "grad_norm": 0.057373046875, "learning_rate": 0.00039911504424778763, "loss": 1.3739, "step": 1 }, { "epoch": 0.008849557522123894, "grad_norm": 0.1201171875, "learning_rate": 0.00039823008849557525, "loss": 1.4091, "step": 2 }, { "epoch": 0.01327433628318584, "grad_norm": 0.0751953125, "learning_rate": 0.00039734513274336286, "loss": 1.2628, "step": 3 }, { "epoch": 0.017699115044247787, "grad_norm": 0.064453125, "learning_rate": 0.0003964601769911505, "loss": 1.1101, "step": 4 }, { "epoch": 0.022123893805309734, "grad_norm": 0.06396484375, "learning_rate": 0.0003955752212389381, "loss": 1.344, "step": 5 }, { "epoch": 0.02654867256637168, "grad_norm": 0.0634765625, "learning_rate": 0.00039469026548672565, "loss": 1.1884, "step": 6 }, { "epoch": 0.030973451327433628, "grad_norm": 0.0625, "learning_rate": 0.0003938053097345133, "loss": 1.1329, "step": 7 }, { "epoch": 0.035398230088495575, "grad_norm": 0.052490234375, "learning_rate": 0.0003929203539823009, "loss": 1.138, "step": 8 }, { "epoch": 0.03982300884955752, "grad_norm": 0.0625, "learning_rate": 0.00039203539823008855, "loss": 1.0113, "step": 9 }, { "epoch": 0.04424778761061947, "grad_norm": 0.041748046875, "learning_rate": 0.0003911504424778761, "loss": 1.087, "step": 10 }, { "epoch": 0.048672566371681415, "grad_norm": 0.046630859375, "learning_rate": 0.0003902654867256637, "loss": 1.1459, "step": 11 }, { "epoch": 0.05309734513274336, "grad_norm": 0.03662109375, "learning_rate": 0.00038938053097345134, "loss": 1.1421, "step": 12 }, { "epoch": 0.05752212389380531, "grad_norm": 0.035888671875, "learning_rate": 0.00038849557522123895, "loss": 1.175, "step": 13 }, { "epoch": 0.061946902654867256, "grad_norm": 0.037109375, "learning_rate": 0.00038761061946902657, "loss": 1.2099, "step": 14 }, { "epoch": 0.06637168141592921, "grad_norm": 0.038818359375, "learning_rate": 0.0003867256637168142, "loss": 1.1295, "step": 15 }, { "epoch": 0.07079646017699115, "grad_norm": 0.0419921875, "learning_rate": 0.00038584070796460174, "loss": 1.0737, "step": 16 }, { "epoch": 0.0752212389380531, "grad_norm": 0.037109375, "learning_rate": 0.0003849557522123894, "loss": 1.1563, "step": 17 }, { "epoch": 0.07964601769911504, "grad_norm": 0.039306640625, "learning_rate": 0.000384070796460177, "loss": 1.1061, "step": 18 }, { "epoch": 0.084070796460177, "grad_norm": 0.050048828125, "learning_rate": 0.00038318584070796464, "loss": 1.1052, "step": 19 }, { "epoch": 0.08849557522123894, "grad_norm": 0.036865234375, "learning_rate": 0.00038230088495575226, "loss": 1.0009, "step": 20 }, { "epoch": 0.09292035398230089, "grad_norm": 0.041015625, "learning_rate": 0.0003814159292035398, "loss": 0.9805, "step": 21 }, { "epoch": 0.09734513274336283, "grad_norm": 0.03173828125, "learning_rate": 0.0003805309734513275, "loss": 1.1098, "step": 22 }, { "epoch": 0.10176991150442478, "grad_norm": 0.0322265625, "learning_rate": 0.00037964601769911505, "loss": 1.0691, "step": 23 }, { "epoch": 0.10619469026548672, "grad_norm": 0.05029296875, "learning_rate": 0.00037876106194690266, "loss": 1.2944, "step": 24 }, { "epoch": 0.11061946902654868, "grad_norm": 0.0419921875, "learning_rate": 0.0003778761061946903, "loss": 1.0819, "step": 25 }, { "epoch": 0.11504424778761062, "grad_norm": 0.0341796875, "learning_rate": 0.0003769911504424779, "loss": 1.215, "step": 26 }, { "epoch": 0.11946902654867257, "grad_norm": 0.041015625, "learning_rate": 0.0003761061946902655, "loss": 1.0624, "step": 27 }, { "epoch": 0.12389380530973451, "grad_norm": 0.03271484375, "learning_rate": 0.0003752212389380531, "loss": 1.0258, "step": 28 }, { "epoch": 0.12831858407079647, "grad_norm": 0.038330078125, "learning_rate": 0.00037433628318584073, "loss": 1.0544, "step": 29 }, { "epoch": 0.13274336283185842, "grad_norm": 0.035400390625, "learning_rate": 0.00037345132743362835, "loss": 1.0203, "step": 30 }, { "epoch": 0.13716814159292035, "grad_norm": 0.05810546875, "learning_rate": 0.0003725663716814159, "loss": 1.1584, "step": 31 }, { "epoch": 0.1415929203539823, "grad_norm": 0.0341796875, "learning_rate": 0.0003716814159292036, "loss": 0.9215, "step": 32 }, { "epoch": 0.14601769911504425, "grad_norm": 0.03857421875, "learning_rate": 0.0003707964601769912, "loss": 1.1255, "step": 33 }, { "epoch": 0.1504424778761062, "grad_norm": 0.053466796875, "learning_rate": 0.00036991150442477875, "loss": 1.3504, "step": 34 }, { "epoch": 0.15486725663716813, "grad_norm": 0.0419921875, "learning_rate": 0.0003690265486725664, "loss": 1.0819, "step": 35 }, { "epoch": 0.1592920353982301, "grad_norm": 0.041259765625, "learning_rate": 0.000368141592920354, "loss": 1.2328, "step": 36 }, { "epoch": 0.16371681415929204, "grad_norm": 0.04345703125, "learning_rate": 0.00036725663716814165, "loss": 1.1783, "step": 37 }, { "epoch": 0.168141592920354, "grad_norm": 0.044189453125, "learning_rate": 0.0003663716814159292, "loss": 1.105, "step": 38 }, { "epoch": 0.17256637168141592, "grad_norm": 0.05224609375, "learning_rate": 0.0003654867256637168, "loss": 1.1757, "step": 39 }, { "epoch": 0.17699115044247787, "grad_norm": 0.042236328125, "learning_rate": 0.00036460176991150444, "loss": 1.1601, "step": 40 }, { "epoch": 0.18141592920353983, "grad_norm": 0.04296875, "learning_rate": 0.00036371681415929205, "loss": 0.9869, "step": 41 }, { "epoch": 0.18584070796460178, "grad_norm": 0.05419921875, "learning_rate": 0.00036283185840707967, "loss": 1.0769, "step": 42 }, { "epoch": 0.1902654867256637, "grad_norm": 0.0361328125, "learning_rate": 0.0003619469026548673, "loss": 1.015, "step": 43 }, { "epoch": 0.19469026548672566, "grad_norm": 0.03564453125, "learning_rate": 0.00036106194690265484, "loss": 0.9435, "step": 44 }, { "epoch": 0.19911504424778761, "grad_norm": 0.058837890625, "learning_rate": 0.0003601769911504425, "loss": 1.1832, "step": 45 }, { "epoch": 0.20353982300884957, "grad_norm": 0.052490234375, "learning_rate": 0.00035929203539823007, "loss": 1.1826, "step": 46 }, { "epoch": 0.2079646017699115, "grad_norm": 0.0625, "learning_rate": 0.00035840707964601774, "loss": 1.02, "step": 47 }, { "epoch": 0.21238938053097345, "grad_norm": 0.047607421875, "learning_rate": 0.0003575221238938053, "loss": 1.0803, "step": 48 }, { "epoch": 0.2168141592920354, "grad_norm": 0.041015625, "learning_rate": 0.0003566371681415929, "loss": 1.021, "step": 49 }, { "epoch": 0.22123893805309736, "grad_norm": 0.041015625, "learning_rate": 0.0003557522123893806, "loss": 1.0058, "step": 50 }, { "epoch": 0.22566371681415928, "grad_norm": 0.040771484375, "learning_rate": 0.00035486725663716814, "loss": 1.0489, "step": 51 }, { "epoch": 0.23008849557522124, "grad_norm": 0.040771484375, "learning_rate": 0.0003539823008849558, "loss": 0.986, "step": 52 }, { "epoch": 0.2345132743362832, "grad_norm": 0.039794921875, "learning_rate": 0.00035309734513274337, "loss": 1.0928, "step": 53 }, { "epoch": 0.23893805309734514, "grad_norm": 0.0419921875, "learning_rate": 0.000352212389380531, "loss": 1.0037, "step": 54 }, { "epoch": 0.24336283185840707, "grad_norm": 0.035888671875, "learning_rate": 0.0003513274336283186, "loss": 1.0165, "step": 55 }, { "epoch": 0.24778761061946902, "grad_norm": 0.046630859375, "learning_rate": 0.0003504424778761062, "loss": 0.9856, "step": 56 }, { "epoch": 0.252212389380531, "grad_norm": 0.0390625, "learning_rate": 0.00034955752212389383, "loss": 1.0988, "step": 57 }, { "epoch": 0.25663716814159293, "grad_norm": 0.035400390625, "learning_rate": 0.00034867256637168145, "loss": 0.9983, "step": 58 }, { "epoch": 0.2610619469026549, "grad_norm": 0.0390625, "learning_rate": 0.000347787610619469, "loss": 1.0727, "step": 59 }, { "epoch": 0.26548672566371684, "grad_norm": 0.0380859375, "learning_rate": 0.0003469026548672567, "loss": 0.9617, "step": 60 }, { "epoch": 0.26991150442477874, "grad_norm": 0.04638671875, "learning_rate": 0.00034601769911504423, "loss": 1.1435, "step": 61 }, { "epoch": 0.2743362831858407, "grad_norm": 0.0419921875, "learning_rate": 0.0003451327433628319, "loss": 1.0895, "step": 62 }, { "epoch": 0.27876106194690264, "grad_norm": 0.038330078125, "learning_rate": 0.00034424778761061946, "loss": 1.0823, "step": 63 }, { "epoch": 0.2831858407079646, "grad_norm": 0.042236328125, "learning_rate": 0.0003433628318584071, "loss": 1.1119, "step": 64 }, { "epoch": 0.28761061946902655, "grad_norm": 0.0576171875, "learning_rate": 0.00034247787610619475, "loss": 1.2428, "step": 65 }, { "epoch": 0.2920353982300885, "grad_norm": 0.04541015625, "learning_rate": 0.0003415929203539823, "loss": 0.9943, "step": 66 }, { "epoch": 0.29646017699115046, "grad_norm": 0.0439453125, "learning_rate": 0.0003407079646017699, "loss": 1.3215, "step": 67 }, { "epoch": 0.3008849557522124, "grad_norm": 0.03515625, "learning_rate": 0.00033982300884955754, "loss": 0.9997, "step": 68 }, { "epoch": 0.3053097345132743, "grad_norm": 0.039794921875, "learning_rate": 0.00033893805309734515, "loss": 0.9796, "step": 69 }, { "epoch": 0.30973451327433627, "grad_norm": 0.044677734375, "learning_rate": 0.00033805309734513277, "loss": 1.1079, "step": 70 }, { "epoch": 0.3141592920353982, "grad_norm": 0.041259765625, "learning_rate": 0.0003371681415929204, "loss": 1.0242, "step": 71 }, { "epoch": 0.3185840707964602, "grad_norm": 0.04638671875, "learning_rate": 0.000336283185840708, "loss": 1.0227, "step": 72 }, { "epoch": 0.3230088495575221, "grad_norm": 0.042236328125, "learning_rate": 0.0003353982300884956, "loss": 0.9375, "step": 73 }, { "epoch": 0.3274336283185841, "grad_norm": 0.03759765625, "learning_rate": 0.00033451327433628317, "loss": 1.0104, "step": 74 }, { "epoch": 0.33185840707964603, "grad_norm": 0.041748046875, "learning_rate": 0.00033362831858407084, "loss": 1.1685, "step": 75 }, { "epoch": 0.336283185840708, "grad_norm": 0.051513671875, "learning_rate": 0.0003327433628318584, "loss": 1.2954, "step": 76 }, { "epoch": 0.3407079646017699, "grad_norm": 0.0517578125, "learning_rate": 0.000331858407079646, "loss": 0.9816, "step": 77 }, { "epoch": 0.34513274336283184, "grad_norm": 0.04248046875, "learning_rate": 0.00033097345132743363, "loss": 1.0791, "step": 78 }, { "epoch": 0.3495575221238938, "grad_norm": 0.043701171875, "learning_rate": 0.00033008849557522124, "loss": 1.0989, "step": 79 }, { "epoch": 0.35398230088495575, "grad_norm": 0.05029296875, "learning_rate": 0.00032920353982300886, "loss": 1.1164, "step": 80 }, { "epoch": 0.3584070796460177, "grad_norm": 0.0400390625, "learning_rate": 0.00032831858407079647, "loss": 1.2053, "step": 81 }, { "epoch": 0.36283185840707965, "grad_norm": 0.041748046875, "learning_rate": 0.0003274336283185841, "loss": 1.0322, "step": 82 }, { "epoch": 0.3672566371681416, "grad_norm": 0.064453125, "learning_rate": 0.0003265486725663717, "loss": 0.9184, "step": 83 }, { "epoch": 0.37168141592920356, "grad_norm": 0.037353515625, "learning_rate": 0.0003256637168141593, "loss": 1.0874, "step": 84 }, { "epoch": 0.37610619469026546, "grad_norm": 0.04638671875, "learning_rate": 0.00032477876106194693, "loss": 1.0051, "step": 85 }, { "epoch": 0.3805309734513274, "grad_norm": 0.052001953125, "learning_rate": 0.00032389380530973454, "loss": 1.1232, "step": 86 }, { "epoch": 0.38495575221238937, "grad_norm": 0.036865234375, "learning_rate": 0.0003230088495575221, "loss": 0.9745, "step": 87 }, { "epoch": 0.3893805309734513, "grad_norm": 0.037353515625, "learning_rate": 0.0003221238938053098, "loss": 0.9092, "step": 88 }, { "epoch": 0.3938053097345133, "grad_norm": 0.04931640625, "learning_rate": 0.00032123893805309733, "loss": 1.0712, "step": 89 }, { "epoch": 0.39823008849557523, "grad_norm": 0.043701171875, "learning_rate": 0.000320353982300885, "loss": 1.0908, "step": 90 }, { "epoch": 0.4026548672566372, "grad_norm": 0.04150390625, "learning_rate": 0.00031946902654867256, "loss": 1.0897, "step": 91 }, { "epoch": 0.40707964601769914, "grad_norm": 0.03857421875, "learning_rate": 0.0003185840707964602, "loss": 0.8939, "step": 92 }, { "epoch": 0.41150442477876104, "grad_norm": 0.044677734375, "learning_rate": 0.0003176991150442478, "loss": 1.0992, "step": 93 }, { "epoch": 0.415929203539823, "grad_norm": 0.038818359375, "learning_rate": 0.0003168141592920354, "loss": 0.937, "step": 94 }, { "epoch": 0.42035398230088494, "grad_norm": 0.0634765625, "learning_rate": 0.000315929203539823, "loss": 1.1744, "step": 95 }, { "epoch": 0.4247787610619469, "grad_norm": 0.042236328125, "learning_rate": 0.00031504424778761064, "loss": 1.0227, "step": 96 }, { "epoch": 0.42920353982300885, "grad_norm": 0.041259765625, "learning_rate": 0.00031415929203539825, "loss": 1.112, "step": 97 }, { "epoch": 0.4336283185840708, "grad_norm": 0.047119140625, "learning_rate": 0.00031327433628318586, "loss": 0.9122, "step": 98 }, { "epoch": 0.43805309734513276, "grad_norm": 0.04931640625, "learning_rate": 0.0003123893805309735, "loss": 1.0073, "step": 99 }, { "epoch": 0.4424778761061947, "grad_norm": 0.040283203125, "learning_rate": 0.0003115044247787611, "loss": 1.0326, "step": 100 }, { "epoch": 0.4469026548672566, "grad_norm": 0.046142578125, "learning_rate": 0.0003106194690265487, "loss": 1.0014, "step": 101 }, { "epoch": 0.45132743362831856, "grad_norm": 0.041015625, "learning_rate": 0.00030973451327433627, "loss": 1.1081, "step": 102 }, { "epoch": 0.4557522123893805, "grad_norm": 0.041015625, "learning_rate": 0.00030884955752212394, "loss": 1.1268, "step": 103 }, { "epoch": 0.46017699115044247, "grad_norm": 0.05078125, "learning_rate": 0.0003079646017699115, "loss": 1.0382, "step": 104 }, { "epoch": 0.4646017699115044, "grad_norm": 0.0576171875, "learning_rate": 0.00030707964601769917, "loss": 0.9887, "step": 105 }, { "epoch": 0.4690265486725664, "grad_norm": 0.0390625, "learning_rate": 0.0003061946902654867, "loss": 1.0143, "step": 106 }, { "epoch": 0.47345132743362833, "grad_norm": 0.06982421875, "learning_rate": 0.00030530973451327434, "loss": 1.0332, "step": 107 }, { "epoch": 0.4778761061946903, "grad_norm": 0.044921875, "learning_rate": 0.00030442477876106196, "loss": 0.9422, "step": 108 }, { "epoch": 0.4823008849557522, "grad_norm": 0.06298828125, "learning_rate": 0.00030353982300884957, "loss": 1.0376, "step": 109 }, { "epoch": 0.48672566371681414, "grad_norm": 0.04833984375, "learning_rate": 0.0003026548672566372, "loss": 1.1175, "step": 110 }, { "epoch": 0.4911504424778761, "grad_norm": 0.044189453125, "learning_rate": 0.0003017699115044248, "loss": 0.9571, "step": 111 }, { "epoch": 0.49557522123893805, "grad_norm": 0.0478515625, "learning_rate": 0.00030088495575221236, "loss": 1.0857, "step": 112 }, { "epoch": 0.5, "grad_norm": 0.059814453125, "learning_rate": 0.00030000000000000003, "loss": 0.9346, "step": 113 }, { "epoch": 0.504424778761062, "grad_norm": 0.054443359375, "learning_rate": 0.00029911504424778764, "loss": 1.0317, "step": 114 }, { "epoch": 0.5088495575221239, "grad_norm": 0.0625, "learning_rate": 0.00029823008849557526, "loss": 1.0535, "step": 115 }, { "epoch": 0.5132743362831859, "grad_norm": 0.04150390625, "learning_rate": 0.00029734513274336287, "loss": 1.0437, "step": 116 }, { "epoch": 0.5176991150442478, "grad_norm": 0.046142578125, "learning_rate": 0.00029646017699115043, "loss": 1.0253, "step": 117 }, { "epoch": 0.5221238938053098, "grad_norm": 0.07421875, "learning_rate": 0.0002955752212389381, "loss": 1.022, "step": 118 }, { "epoch": 0.5265486725663717, "grad_norm": 0.058837890625, "learning_rate": 0.00029469026548672566, "loss": 1.2344, "step": 119 }, { "epoch": 0.5309734513274337, "grad_norm": 0.0576171875, "learning_rate": 0.0002938053097345133, "loss": 0.9828, "step": 120 }, { "epoch": 0.5353982300884956, "grad_norm": 0.05078125, "learning_rate": 0.0002929203539823009, "loss": 0.9207, "step": 121 }, { "epoch": 0.5398230088495575, "grad_norm": 0.050537109375, "learning_rate": 0.0002920353982300885, "loss": 0.9794, "step": 122 }, { "epoch": 0.5442477876106194, "grad_norm": 0.05908203125, "learning_rate": 0.0002911504424778761, "loss": 1.0962, "step": 123 }, { "epoch": 0.5486725663716814, "grad_norm": 0.041748046875, "learning_rate": 0.00029026548672566373, "loss": 1.1614, "step": 124 }, { "epoch": 0.5530973451327433, "grad_norm": 0.038330078125, "learning_rate": 0.00028938053097345135, "loss": 0.9082, "step": 125 }, { "epoch": 0.5575221238938053, "grad_norm": 0.037353515625, "learning_rate": 0.00028849557522123896, "loss": 0.9406, "step": 126 }, { "epoch": 0.5619469026548672, "grad_norm": 0.039306640625, "learning_rate": 0.0002876106194690265, "loss": 1.1105, "step": 127 }, { "epoch": 0.5663716814159292, "grad_norm": 0.051025390625, "learning_rate": 0.0002867256637168142, "loss": 0.9679, "step": 128 }, { "epoch": 0.5707964601769911, "grad_norm": 0.037109375, "learning_rate": 0.00028584070796460175, "loss": 0.9529, "step": 129 }, { "epoch": 0.5752212389380531, "grad_norm": 0.056396484375, "learning_rate": 0.00028495575221238937, "loss": 1.0341, "step": 130 }, { "epoch": 0.5796460176991151, "grad_norm": 0.039306640625, "learning_rate": 0.00028407079646017704, "loss": 0.9493, "step": 131 }, { "epoch": 0.584070796460177, "grad_norm": 0.06591796875, "learning_rate": 0.0002831858407079646, "loss": 1.262, "step": 132 }, { "epoch": 0.588495575221239, "grad_norm": 0.038330078125, "learning_rate": 0.00028230088495575226, "loss": 0.9412, "step": 133 }, { "epoch": 0.5929203539823009, "grad_norm": 0.046875, "learning_rate": 0.0002814159292035398, "loss": 1.0563, "step": 134 }, { "epoch": 0.5973451327433629, "grad_norm": 0.05712890625, "learning_rate": 0.00028053097345132744, "loss": 1.0201, "step": 135 }, { "epoch": 0.6017699115044248, "grad_norm": 0.04052734375, "learning_rate": 0.00027964601769911505, "loss": 1.0401, "step": 136 }, { "epoch": 0.6061946902654868, "grad_norm": 0.05078125, "learning_rate": 0.00027876106194690267, "loss": 1.0241, "step": 137 }, { "epoch": 0.6106194690265486, "grad_norm": 0.05810546875, "learning_rate": 0.0002778761061946903, "loss": 1.1263, "step": 138 }, { "epoch": 0.6150442477876106, "grad_norm": 0.048095703125, "learning_rate": 0.0002769911504424779, "loss": 1.0869, "step": 139 }, { "epoch": 0.6194690265486725, "grad_norm": 0.447265625, "learning_rate": 0.0002761061946902655, "loss": 0.9944, "step": 140 }, { "epoch": 0.6238938053097345, "grad_norm": 0.038818359375, "learning_rate": 0.0002752212389380531, "loss": 0.9675, "step": 141 }, { "epoch": 0.6283185840707964, "grad_norm": 0.068359375, "learning_rate": 0.0002743362831858407, "loss": 1.0227, "step": 142 }, { "epoch": 0.6327433628318584, "grad_norm": 0.072265625, "learning_rate": 0.00027345132743362836, "loss": 1.0381, "step": 143 }, { "epoch": 0.6371681415929203, "grad_norm": 0.055908203125, "learning_rate": 0.0002725663716814159, "loss": 0.9385, "step": 144 }, { "epoch": 0.6415929203539823, "grad_norm": 0.04248046875, "learning_rate": 0.00027168141592920353, "loss": 1.001, "step": 145 }, { "epoch": 0.6460176991150443, "grad_norm": 0.06005859375, "learning_rate": 0.0002707964601769912, "loss": 1.04, "step": 146 }, { "epoch": 0.6504424778761062, "grad_norm": 0.049072265625, "learning_rate": 0.00026991150442477876, "loss": 0.9735, "step": 147 }, { "epoch": 0.6548672566371682, "grad_norm": 0.045654296875, "learning_rate": 0.00026902654867256643, "loss": 1.0873, "step": 148 }, { "epoch": 0.6592920353982301, "grad_norm": 0.04638671875, "learning_rate": 0.000268141592920354, "loss": 1.1032, "step": 149 }, { "epoch": 0.6637168141592921, "grad_norm": 0.051513671875, "learning_rate": 0.0002672566371681416, "loss": 1.0414, "step": 150 }, { "epoch": 0.668141592920354, "grad_norm": 0.0419921875, "learning_rate": 0.0002663716814159292, "loss": 0.892, "step": 151 }, { "epoch": 0.672566371681416, "grad_norm": 0.040771484375, "learning_rate": 0.00026548672566371683, "loss": 0.9048, "step": 152 }, { "epoch": 0.6769911504424779, "grad_norm": 0.06494140625, "learning_rate": 0.00026460176991150445, "loss": 1.0745, "step": 153 }, { "epoch": 0.6814159292035398, "grad_norm": 0.059814453125, "learning_rate": 0.00026371681415929206, "loss": 1.2796, "step": 154 }, { "epoch": 0.6858407079646017, "grad_norm": 0.050048828125, "learning_rate": 0.0002628318584070796, "loss": 0.9484, "step": 155 }, { "epoch": 0.6902654867256637, "grad_norm": 0.0458984375, "learning_rate": 0.0002619469026548673, "loss": 1.0571, "step": 156 }, { "epoch": 0.6946902654867256, "grad_norm": 0.0439453125, "learning_rate": 0.00026106194690265485, "loss": 1.1435, "step": 157 }, { "epoch": 0.6991150442477876, "grad_norm": 0.0458984375, "learning_rate": 0.0002601769911504425, "loss": 1.0, "step": 158 }, { "epoch": 0.7035398230088495, "grad_norm": 0.039794921875, "learning_rate": 0.0002592920353982301, "loss": 1.0044, "step": 159 }, { "epoch": 0.7079646017699115, "grad_norm": 0.049072265625, "learning_rate": 0.0002584070796460177, "loss": 1.001, "step": 160 }, { "epoch": 0.7123893805309734, "grad_norm": 0.04541015625, "learning_rate": 0.0002575221238938053, "loss": 1.0643, "step": 161 }, { "epoch": 0.7168141592920354, "grad_norm": 0.046630859375, "learning_rate": 0.0002566371681415929, "loss": 1.2461, "step": 162 }, { "epoch": 0.7212389380530974, "grad_norm": 0.0458984375, "learning_rate": 0.00025575221238938054, "loss": 1.297, "step": 163 }, { "epoch": 0.7256637168141593, "grad_norm": 0.349609375, "learning_rate": 0.00025486725663716815, "loss": 0.9718, "step": 164 }, { "epoch": 0.7300884955752213, "grad_norm": 0.039794921875, "learning_rate": 0.00025398230088495577, "loss": 0.9553, "step": 165 }, { "epoch": 0.7345132743362832, "grad_norm": 0.041748046875, "learning_rate": 0.0002530973451327434, "loss": 1.074, "step": 166 }, { "epoch": 0.7389380530973452, "grad_norm": 0.0615234375, "learning_rate": 0.000252212389380531, "loss": 1.0015, "step": 167 }, { "epoch": 0.7433628318584071, "grad_norm": 0.043212890625, "learning_rate": 0.0002513274336283186, "loss": 1.021, "step": 168 }, { "epoch": 0.7477876106194691, "grad_norm": 0.0556640625, "learning_rate": 0.0002504424778761062, "loss": 1.063, "step": 169 }, { "epoch": 0.7522123893805309, "grad_norm": 0.03759765625, "learning_rate": 0.0002495575221238938, "loss": 0.9415, "step": 170 }, { "epoch": 0.7566371681415929, "grad_norm": 0.0673828125, "learning_rate": 0.00024867256637168145, "loss": 1.0556, "step": 171 }, { "epoch": 0.7610619469026548, "grad_norm": 0.06298828125, "learning_rate": 0.000247787610619469, "loss": 1.1345, "step": 172 }, { "epoch": 0.7654867256637168, "grad_norm": 0.044189453125, "learning_rate": 0.00024690265486725663, "loss": 0.9686, "step": 173 }, { "epoch": 0.7699115044247787, "grad_norm": 0.18359375, "learning_rate": 0.00024601769911504424, "loss": 0.8729, "step": 174 }, { "epoch": 0.7743362831858407, "grad_norm": 0.04736328125, "learning_rate": 0.00024513274336283186, "loss": 1.0424, "step": 175 }, { "epoch": 0.7787610619469026, "grad_norm": 0.05322265625, "learning_rate": 0.00024424778761061947, "loss": 1.0317, "step": 176 }, { "epoch": 0.7831858407079646, "grad_norm": 0.043212890625, "learning_rate": 0.0002433628318584071, "loss": 1.1979, "step": 177 }, { "epoch": 0.7876106194690266, "grad_norm": 0.0615234375, "learning_rate": 0.00024247787610619473, "loss": 1.0134, "step": 178 }, { "epoch": 0.7920353982300885, "grad_norm": 0.0615234375, "learning_rate": 0.00024159292035398232, "loss": 1.1044, "step": 179 }, { "epoch": 0.7964601769911505, "grad_norm": 0.04443359375, "learning_rate": 0.00024070796460176993, "loss": 1.0293, "step": 180 }, { "epoch": 0.8008849557522124, "grad_norm": 0.04248046875, "learning_rate": 0.00023982300884955752, "loss": 0.9629, "step": 181 }, { "epoch": 0.8053097345132744, "grad_norm": 0.03857421875, "learning_rate": 0.00023893805309734516, "loss": 0.9511, "step": 182 }, { "epoch": 0.8097345132743363, "grad_norm": 0.046142578125, "learning_rate": 0.00023805309734513275, "loss": 1.0096, "step": 183 }, { "epoch": 0.8141592920353983, "grad_norm": 0.0498046875, "learning_rate": 0.0002371681415929204, "loss": 0.8986, "step": 184 }, { "epoch": 0.8185840707964602, "grad_norm": 0.050048828125, "learning_rate": 0.00023628318584070798, "loss": 0.9618, "step": 185 }, { "epoch": 0.8230088495575221, "grad_norm": 0.07177734375, "learning_rate": 0.0002353982300884956, "loss": 1.0183, "step": 186 }, { "epoch": 0.827433628318584, "grad_norm": 0.06982421875, "learning_rate": 0.00023451327433628318, "loss": 0.9824, "step": 187 }, { "epoch": 0.831858407079646, "grad_norm": 0.0439453125, "learning_rate": 0.00023362831858407082, "loss": 0.9304, "step": 188 }, { "epoch": 0.8362831858407079, "grad_norm": 0.04736328125, "learning_rate": 0.0002327433628318584, "loss": 0.9942, "step": 189 }, { "epoch": 0.8407079646017699, "grad_norm": 0.05029296875, "learning_rate": 0.00023185840707964602, "loss": 1.1299, "step": 190 }, { "epoch": 0.8451327433628318, "grad_norm": 0.046875, "learning_rate": 0.0002309734513274336, "loss": 1.0395, "step": 191 }, { "epoch": 0.8495575221238938, "grad_norm": 0.04296875, "learning_rate": 0.00023008849557522125, "loss": 0.9442, "step": 192 }, { "epoch": 0.8539823008849557, "grad_norm": 0.05078125, "learning_rate": 0.00022920353982300884, "loss": 1.0056, "step": 193 }, { "epoch": 0.8584070796460177, "grad_norm": 0.050537109375, "learning_rate": 0.00022831858407079648, "loss": 0.9217, "step": 194 }, { "epoch": 0.8628318584070797, "grad_norm": 0.040771484375, "learning_rate": 0.0002274336283185841, "loss": 0.9522, "step": 195 }, { "epoch": 0.8672566371681416, "grad_norm": 0.042236328125, "learning_rate": 0.00022654867256637168, "loss": 0.9525, "step": 196 }, { "epoch": 0.8716814159292036, "grad_norm": 0.048095703125, "learning_rate": 0.00022566371681415932, "loss": 1.0493, "step": 197 }, { "epoch": 0.8761061946902655, "grad_norm": 0.047607421875, "learning_rate": 0.0002247787610619469, "loss": 1.1643, "step": 198 }, { "epoch": 0.8805309734513275, "grad_norm": 0.041748046875, "learning_rate": 0.00022389380530973453, "loss": 0.8968, "step": 199 }, { "epoch": 0.8849557522123894, "grad_norm": 0.046875, "learning_rate": 0.0002230088495575221, "loss": 0.8145, "step": 200 }, { "epoch": 0.8893805309734514, "grad_norm": 0.0693359375, "learning_rate": 0.00022212389380530975, "loss": 1.1892, "step": 201 }, { "epoch": 0.8938053097345132, "grad_norm": 0.0673828125, "learning_rate": 0.00022123893805309734, "loss": 0.9646, "step": 202 }, { "epoch": 0.8982300884955752, "grad_norm": 0.046630859375, "learning_rate": 0.00022035398230088498, "loss": 1.0692, "step": 203 }, { "epoch": 0.9026548672566371, "grad_norm": 0.06396484375, "learning_rate": 0.00021946902654867257, "loss": 0.9034, "step": 204 }, { "epoch": 0.9070796460176991, "grad_norm": 0.04150390625, "learning_rate": 0.00021858407079646019, "loss": 1.1094, "step": 205 }, { "epoch": 0.911504424778761, "grad_norm": 0.064453125, "learning_rate": 0.00021769911504424777, "loss": 1.1966, "step": 206 }, { "epoch": 0.915929203539823, "grad_norm": 0.049560546875, "learning_rate": 0.00021681415929203541, "loss": 1.1902, "step": 207 }, { "epoch": 0.9203539823008849, "grad_norm": 0.06884765625, "learning_rate": 0.000215929203539823, "loss": 1.1077, "step": 208 }, { "epoch": 0.9247787610619469, "grad_norm": 0.042236328125, "learning_rate": 0.00021504424778761064, "loss": 0.9293, "step": 209 }, { "epoch": 0.9292035398230089, "grad_norm": 0.040283203125, "learning_rate": 0.00021415929203539826, "loss": 1.0238, "step": 210 }, { "epoch": 0.9336283185840708, "grad_norm": 0.046142578125, "learning_rate": 0.00021327433628318585, "loss": 0.9889, "step": 211 }, { "epoch": 0.9380530973451328, "grad_norm": 0.048583984375, "learning_rate": 0.0002123893805309735, "loss": 1.0614, "step": 212 }, { "epoch": 0.9424778761061947, "grad_norm": 0.048095703125, "learning_rate": 0.00021150442477876107, "loss": 1.0836, "step": 213 }, { "epoch": 0.9469026548672567, "grad_norm": 0.047607421875, "learning_rate": 0.0002106194690265487, "loss": 1.0815, "step": 214 }, { "epoch": 0.9513274336283186, "grad_norm": 0.039794921875, "learning_rate": 0.00020973451327433628, "loss": 1.0021, "step": 215 }, { "epoch": 0.9557522123893806, "grad_norm": 0.049072265625, "learning_rate": 0.00020884955752212392, "loss": 1.0002, "step": 216 }, { "epoch": 0.9601769911504425, "grad_norm": 0.04541015625, "learning_rate": 0.0002079646017699115, "loss": 1.2081, "step": 217 }, { "epoch": 0.9646017699115044, "grad_norm": 0.0439453125, "learning_rate": 0.00020707964601769915, "loss": 1.0711, "step": 218 }, { "epoch": 0.9690265486725663, "grad_norm": 0.049072265625, "learning_rate": 0.00020619469026548673, "loss": 1.0342, "step": 219 }, { "epoch": 0.9734513274336283, "grad_norm": 0.0556640625, "learning_rate": 0.00020530973451327435, "loss": 1.0103, "step": 220 }, { "epoch": 0.9778761061946902, "grad_norm": 0.04931640625, "learning_rate": 0.00020442477876106194, "loss": 0.9692, "step": 221 }, { "epoch": 0.9823008849557522, "grad_norm": 0.04296875, "learning_rate": 0.00020353982300884958, "loss": 0.9639, "step": 222 }, { "epoch": 0.9867256637168141, "grad_norm": 0.040771484375, "learning_rate": 0.00020265486725663717, "loss": 0.9039, "step": 223 }, { "epoch": 0.9911504424778761, "grad_norm": 0.049560546875, "learning_rate": 0.00020176991150442478, "loss": 0.9265, "step": 224 }, { "epoch": 0.995575221238938, "grad_norm": 0.04248046875, "learning_rate": 0.00020088495575221237, "loss": 0.8961, "step": 225 }, { "epoch": 1.0, "grad_norm": 0.0625, "learning_rate": 0.0002, "loss": 1.0299, "step": 226 }, { "epoch": 1.0044247787610618, "grad_norm": 0.052978515625, "learning_rate": 0.00019911504424778762, "loss": 0.8533, "step": 227 }, { "epoch": 1.008849557522124, "grad_norm": 0.042236328125, "learning_rate": 0.00019823008849557524, "loss": 0.937, "step": 228 }, { "epoch": 1.0132743362831858, "grad_norm": 0.05029296875, "learning_rate": 0.00019734513274336283, "loss": 0.8202, "step": 229 }, { "epoch": 1.0176991150442478, "grad_norm": 0.0517578125, "learning_rate": 0.00019646017699115044, "loss": 0.8976, "step": 230 }, { "epoch": 1.0221238938053097, "grad_norm": 0.048828125, "learning_rate": 0.00019557522123893806, "loss": 0.8791, "step": 231 }, { "epoch": 1.0265486725663717, "grad_norm": 0.050537109375, "learning_rate": 0.00019469026548672567, "loss": 1.0753, "step": 232 }, { "epoch": 1.0309734513274336, "grad_norm": 0.05615234375, "learning_rate": 0.00019380530973451328, "loss": 1.0464, "step": 233 }, { "epoch": 1.0353982300884956, "grad_norm": 0.059326171875, "learning_rate": 0.00019292035398230087, "loss": 0.8115, "step": 234 }, { "epoch": 1.0398230088495575, "grad_norm": 0.058349609375, "learning_rate": 0.0001920353982300885, "loss": 0.9851, "step": 235 }, { "epoch": 1.0442477876106195, "grad_norm": 0.068359375, "learning_rate": 0.00019115044247787613, "loss": 0.8867, "step": 236 }, { "epoch": 1.0486725663716814, "grad_norm": 0.059814453125, "learning_rate": 0.00019026548672566374, "loss": 0.7882, "step": 237 }, { "epoch": 1.0530973451327434, "grad_norm": 0.06494140625, "learning_rate": 0.00018938053097345133, "loss": 1.0028, "step": 238 }, { "epoch": 1.0575221238938053, "grad_norm": 0.06103515625, "learning_rate": 0.00018849557522123894, "loss": 0.9446, "step": 239 }, { "epoch": 1.0619469026548674, "grad_norm": 0.059814453125, "learning_rate": 0.00018761061946902656, "loss": 1.0249, "step": 240 }, { "epoch": 1.0663716814159292, "grad_norm": 0.053955078125, "learning_rate": 0.00018672566371681417, "loss": 0.9277, "step": 241 }, { "epoch": 1.0707964601769913, "grad_norm": 0.0751953125, "learning_rate": 0.0001858407079646018, "loss": 0.8228, "step": 242 }, { "epoch": 1.075221238938053, "grad_norm": 0.058837890625, "learning_rate": 0.00018495575221238938, "loss": 0.8757, "step": 243 }, { "epoch": 1.079646017699115, "grad_norm": 0.059326171875, "learning_rate": 0.000184070796460177, "loss": 0.7868, "step": 244 }, { "epoch": 1.084070796460177, "grad_norm": 0.07275390625, "learning_rate": 0.0001831858407079646, "loss": 0.878, "step": 245 }, { "epoch": 1.0884955752212389, "grad_norm": 0.05908203125, "learning_rate": 0.00018230088495575222, "loss": 0.8944, "step": 246 }, { "epoch": 1.092920353982301, "grad_norm": 0.059326171875, "learning_rate": 0.00018141592920353983, "loss": 0.8831, "step": 247 }, { "epoch": 1.0973451327433628, "grad_norm": 0.060302734375, "learning_rate": 0.00018053097345132742, "loss": 0.9312, "step": 248 }, { "epoch": 1.1017699115044248, "grad_norm": 0.053955078125, "learning_rate": 0.00017964601769911504, "loss": 0.7488, "step": 249 }, { "epoch": 1.1061946902654867, "grad_norm": 0.06298828125, "learning_rate": 0.00017876106194690265, "loss": 0.9677, "step": 250 }, { "epoch": 1.1106194690265487, "grad_norm": 0.06298828125, "learning_rate": 0.0001778761061946903, "loss": 0.8391, "step": 251 }, { "epoch": 1.1150442477876106, "grad_norm": 0.061279296875, "learning_rate": 0.0001769911504424779, "loss": 0.9225, "step": 252 }, { "epoch": 1.1194690265486726, "grad_norm": 0.080078125, "learning_rate": 0.0001761061946902655, "loss": 0.7969, "step": 253 }, { "epoch": 1.1238938053097345, "grad_norm": 0.06494140625, "learning_rate": 0.0001752212389380531, "loss": 0.8957, "step": 254 }, { "epoch": 1.1283185840707965, "grad_norm": 0.062255859375, "learning_rate": 0.00017433628318584072, "loss": 0.9192, "step": 255 }, { "epoch": 1.1327433628318584, "grad_norm": 0.1005859375, "learning_rate": 0.00017345132743362834, "loss": 0.8669, "step": 256 }, { "epoch": 1.1371681415929205, "grad_norm": 0.0810546875, "learning_rate": 0.00017256637168141595, "loss": 0.9332, "step": 257 }, { "epoch": 1.1415929203539823, "grad_norm": 0.06689453125, "learning_rate": 0.00017168141592920354, "loss": 0.8392, "step": 258 }, { "epoch": 1.1460176991150441, "grad_norm": 0.06494140625, "learning_rate": 0.00017079646017699115, "loss": 1.1159, "step": 259 }, { "epoch": 1.1504424778761062, "grad_norm": 0.0625, "learning_rate": 0.00016991150442477877, "loss": 0.9649, "step": 260 }, { "epoch": 1.154867256637168, "grad_norm": 0.059326171875, "learning_rate": 0.00016902654867256638, "loss": 0.9653, "step": 261 }, { "epoch": 1.1592920353982301, "grad_norm": 0.05322265625, "learning_rate": 0.000168141592920354, "loss": 0.8342, "step": 262 }, { "epoch": 1.163716814159292, "grad_norm": 0.109375, "learning_rate": 0.00016725663716814158, "loss": 0.7385, "step": 263 }, { "epoch": 1.168141592920354, "grad_norm": 0.076171875, "learning_rate": 0.0001663716814159292, "loss": 0.7605, "step": 264 }, { "epoch": 1.1725663716814159, "grad_norm": 0.057373046875, "learning_rate": 0.00016548672566371681, "loss": 0.8457, "step": 265 }, { "epoch": 1.176991150442478, "grad_norm": 0.08447265625, "learning_rate": 0.00016460176991150443, "loss": 0.872, "step": 266 }, { "epoch": 1.1814159292035398, "grad_norm": 0.07470703125, "learning_rate": 0.00016371681415929204, "loss": 1.0322, "step": 267 }, { "epoch": 1.1858407079646018, "grad_norm": 0.06640625, "learning_rate": 0.00016283185840707966, "loss": 1.0532, "step": 268 }, { "epoch": 1.1902654867256637, "grad_norm": 0.059814453125, "learning_rate": 0.00016194690265486727, "loss": 0.9205, "step": 269 }, { "epoch": 1.1946902654867257, "grad_norm": 0.060546875, "learning_rate": 0.0001610619469026549, "loss": 0.8789, "step": 270 }, { "epoch": 1.1991150442477876, "grad_norm": 0.0654296875, "learning_rate": 0.0001601769911504425, "loss": 1.0501, "step": 271 }, { "epoch": 1.2035398230088497, "grad_norm": 0.0634765625, "learning_rate": 0.0001592920353982301, "loss": 0.8666, "step": 272 }, { "epoch": 1.2079646017699115, "grad_norm": 0.0595703125, "learning_rate": 0.0001584070796460177, "loss": 0.8761, "step": 273 }, { "epoch": 1.2123893805309733, "grad_norm": 0.057373046875, "learning_rate": 0.00015752212389380532, "loss": 0.8827, "step": 274 }, { "epoch": 1.2168141592920354, "grad_norm": 0.07373046875, "learning_rate": 0.00015663716814159293, "loss": 0.8162, "step": 275 }, { "epoch": 1.2212389380530975, "grad_norm": 0.06494140625, "learning_rate": 0.00015575221238938055, "loss": 0.7613, "step": 276 }, { "epoch": 1.2256637168141593, "grad_norm": 0.06494140625, "learning_rate": 0.00015486725663716813, "loss": 0.825, "step": 277 }, { "epoch": 1.2300884955752212, "grad_norm": 0.061767578125, "learning_rate": 0.00015398230088495575, "loss": 0.9633, "step": 278 }, { "epoch": 1.2345132743362832, "grad_norm": 0.0595703125, "learning_rate": 0.00015309734513274336, "loss": 0.9036, "step": 279 }, { "epoch": 1.238938053097345, "grad_norm": 0.076171875, "learning_rate": 0.00015221238938053098, "loss": 0.9527, "step": 280 }, { "epoch": 1.2433628318584071, "grad_norm": 0.06005859375, "learning_rate": 0.0001513274336283186, "loss": 0.9089, "step": 281 }, { "epoch": 1.247787610619469, "grad_norm": 0.056884765625, "learning_rate": 0.00015044247787610618, "loss": 0.8911, "step": 282 }, { "epoch": 1.252212389380531, "grad_norm": 0.0908203125, "learning_rate": 0.00014955752212389382, "loss": 0.7871, "step": 283 }, { "epoch": 1.2566371681415929, "grad_norm": 0.0771484375, "learning_rate": 0.00014867256637168144, "loss": 0.8415, "step": 284 }, { "epoch": 1.261061946902655, "grad_norm": 0.07177734375, "learning_rate": 0.00014778761061946905, "loss": 1.0105, "step": 285 }, { "epoch": 1.2654867256637168, "grad_norm": 0.0986328125, "learning_rate": 0.00014690265486725664, "loss": 0.9677, "step": 286 }, { "epoch": 1.2699115044247788, "grad_norm": 0.0888671875, "learning_rate": 0.00014601769911504425, "loss": 0.837, "step": 287 }, { "epoch": 1.2743362831858407, "grad_norm": 0.126953125, "learning_rate": 0.00014513274336283187, "loss": 0.8605, "step": 288 }, { "epoch": 1.2787610619469025, "grad_norm": 0.06298828125, "learning_rate": 0.00014424778761061948, "loss": 0.8717, "step": 289 }, { "epoch": 1.2831858407079646, "grad_norm": 0.08740234375, "learning_rate": 0.0001433628318584071, "loss": 1.0469, "step": 290 }, { "epoch": 1.2876106194690267, "grad_norm": 0.061767578125, "learning_rate": 0.00014247787610619468, "loss": 0.9339, "step": 291 }, { "epoch": 1.2920353982300885, "grad_norm": 0.072265625, "learning_rate": 0.0001415929203539823, "loss": 0.7235, "step": 292 }, { "epoch": 1.2964601769911503, "grad_norm": 0.087890625, "learning_rate": 0.0001407079646017699, "loss": 0.8648, "step": 293 }, { "epoch": 1.3008849557522124, "grad_norm": 0.062255859375, "learning_rate": 0.00013982300884955753, "loss": 0.8842, "step": 294 }, { "epoch": 1.3053097345132743, "grad_norm": 0.08056640625, "learning_rate": 0.00013893805309734514, "loss": 0.9593, "step": 295 }, { "epoch": 1.3097345132743363, "grad_norm": 0.0771484375, "learning_rate": 0.00013805309734513276, "loss": 0.9122, "step": 296 }, { "epoch": 1.3141592920353982, "grad_norm": 0.06396484375, "learning_rate": 0.00013716814159292034, "loss": 1.0082, "step": 297 }, { "epoch": 1.3185840707964602, "grad_norm": 0.06298828125, "learning_rate": 0.00013628318584070796, "loss": 0.884, "step": 298 }, { "epoch": 1.323008849557522, "grad_norm": 0.08349609375, "learning_rate": 0.0001353982300884956, "loss": 0.8348, "step": 299 }, { "epoch": 1.3274336283185841, "grad_norm": 0.0732421875, "learning_rate": 0.00013451327433628321, "loss": 0.747, "step": 300 }, { "epoch": 1.331858407079646, "grad_norm": 0.06396484375, "learning_rate": 0.0001336283185840708, "loss": 0.8841, "step": 301 }, { "epoch": 1.336283185840708, "grad_norm": 0.06005859375, "learning_rate": 0.00013274336283185842, "loss": 0.8985, "step": 302 }, { "epoch": 1.3407079646017699, "grad_norm": 0.068359375, "learning_rate": 0.00013185840707964603, "loss": 0.9008, "step": 303 }, { "epoch": 1.3451327433628317, "grad_norm": 0.076171875, "learning_rate": 0.00013097345132743365, "loss": 0.8909, "step": 304 }, { "epoch": 1.3495575221238938, "grad_norm": 0.09521484375, "learning_rate": 0.00013008849557522126, "loss": 0.8108, "step": 305 }, { "epoch": 1.3539823008849559, "grad_norm": 0.08154296875, "learning_rate": 0.00012920353982300885, "loss": 0.8546, "step": 306 }, { "epoch": 1.3584070796460177, "grad_norm": 0.0771484375, "learning_rate": 0.00012831858407079646, "loss": 1.0212, "step": 307 }, { "epoch": 1.3628318584070795, "grad_norm": 0.06201171875, "learning_rate": 0.00012743362831858408, "loss": 0.974, "step": 308 }, { "epoch": 1.3672566371681416, "grad_norm": 0.095703125, "learning_rate": 0.0001265486725663717, "loss": 0.7493, "step": 309 }, { "epoch": 1.3716814159292037, "grad_norm": 0.09765625, "learning_rate": 0.0001256637168141593, "loss": 1.0118, "step": 310 }, { "epoch": 1.3761061946902655, "grad_norm": 0.08740234375, "learning_rate": 0.0001247787610619469, "loss": 0.8243, "step": 311 }, { "epoch": 1.3805309734513274, "grad_norm": 0.06884765625, "learning_rate": 0.0001238938053097345, "loss": 0.9024, "step": 312 }, { "epoch": 1.3849557522123894, "grad_norm": 0.08740234375, "learning_rate": 0.00012300884955752212, "loss": 0.9018, "step": 313 }, { "epoch": 1.3893805309734513, "grad_norm": 0.09814453125, "learning_rate": 0.00012212389380530974, "loss": 1.1168, "step": 314 }, { "epoch": 1.3938053097345133, "grad_norm": 0.07861328125, "learning_rate": 0.00012123893805309736, "loss": 0.9847, "step": 315 }, { "epoch": 1.3982300884955752, "grad_norm": 0.07080078125, "learning_rate": 0.00012035398230088497, "loss": 0.9884, "step": 316 }, { "epoch": 1.4026548672566372, "grad_norm": 0.07568359375, "learning_rate": 0.00011946902654867258, "loss": 0.9483, "step": 317 }, { "epoch": 1.407079646017699, "grad_norm": 0.06787109375, "learning_rate": 0.0001185840707964602, "loss": 0.8768, "step": 318 }, { "epoch": 1.411504424778761, "grad_norm": 0.0751953125, "learning_rate": 0.0001176991150442478, "loss": 0.9072, "step": 319 }, { "epoch": 1.415929203539823, "grad_norm": 0.0810546875, "learning_rate": 0.00011681415929203541, "loss": 0.8627, "step": 320 }, { "epoch": 1.420353982300885, "grad_norm": 0.07275390625, "learning_rate": 0.00011592920353982301, "loss": 0.9518, "step": 321 }, { "epoch": 1.424778761061947, "grad_norm": 0.0830078125, "learning_rate": 0.00011504424778761063, "loss": 0.8705, "step": 322 }, { "epoch": 1.4292035398230087, "grad_norm": 0.061767578125, "learning_rate": 0.00011415929203539824, "loss": 0.8535, "step": 323 }, { "epoch": 1.4336283185840708, "grad_norm": 0.06396484375, "learning_rate": 0.00011327433628318584, "loss": 0.8835, "step": 324 }, { "epoch": 1.4380530973451329, "grad_norm": 0.09033203125, "learning_rate": 0.00011238938053097346, "loss": 1.1187, "step": 325 }, { "epoch": 1.4424778761061947, "grad_norm": 0.08935546875, "learning_rate": 0.00011150442477876106, "loss": 0.6991, "step": 326 }, { "epoch": 1.4469026548672566, "grad_norm": 0.10546875, "learning_rate": 0.00011061946902654867, "loss": 0.8172, "step": 327 }, { "epoch": 1.4513274336283186, "grad_norm": 0.1015625, "learning_rate": 0.00010973451327433629, "loss": 0.8526, "step": 328 }, { "epoch": 1.4557522123893805, "grad_norm": 0.06640625, "learning_rate": 0.00010884955752212389, "loss": 0.8048, "step": 329 }, { "epoch": 1.4601769911504425, "grad_norm": 0.0693359375, "learning_rate": 0.0001079646017699115, "loss": 0.9438, "step": 330 }, { "epoch": 1.4646017699115044, "grad_norm": 0.08837890625, "learning_rate": 0.00010707964601769913, "loss": 0.9667, "step": 331 }, { "epoch": 1.4690265486725664, "grad_norm": 0.0810546875, "learning_rate": 0.00010619469026548674, "loss": 1.0007, "step": 332 }, { "epoch": 1.4734513274336283, "grad_norm": 0.07470703125, "learning_rate": 0.00010530973451327434, "loss": 0.971, "step": 333 }, { "epoch": 1.4778761061946903, "grad_norm": 0.09033203125, "learning_rate": 0.00010442477876106196, "loss": 0.8334, "step": 334 }, { "epoch": 1.4823008849557522, "grad_norm": 0.06640625, "learning_rate": 0.00010353982300884957, "loss": 0.7885, "step": 335 }, { "epoch": 1.4867256637168142, "grad_norm": 0.0947265625, "learning_rate": 0.00010265486725663717, "loss": 0.825, "step": 336 }, { "epoch": 1.491150442477876, "grad_norm": 0.08154296875, "learning_rate": 0.00010176991150442479, "loss": 0.9044, "step": 337 }, { "epoch": 1.495575221238938, "grad_norm": 0.07763671875, "learning_rate": 0.00010088495575221239, "loss": 0.7607, "step": 338 }, { "epoch": 1.5, "grad_norm": 0.0693359375, "learning_rate": 0.0001, "loss": 0.966, "step": 339 }, { "epoch": 1.504424778761062, "grad_norm": 0.1005859375, "learning_rate": 9.911504424778762e-05, "loss": 0.7745, "step": 340 }, { "epoch": 1.508849557522124, "grad_norm": 0.058837890625, "learning_rate": 9.823008849557522e-05, "loss": 0.8849, "step": 341 }, { "epoch": 1.5132743362831858, "grad_norm": 0.0703125, "learning_rate": 9.734513274336283e-05, "loss": 0.9905, "step": 342 }, { "epoch": 1.5176991150442478, "grad_norm": 0.1025390625, "learning_rate": 9.646017699115044e-05, "loss": 0.8459, "step": 343 }, { "epoch": 1.5221238938053099, "grad_norm": 0.07275390625, "learning_rate": 9.557522123893806e-05, "loss": 0.8842, "step": 344 }, { "epoch": 1.5265486725663717, "grad_norm": 0.083984375, "learning_rate": 9.469026548672566e-05, "loss": 1.0654, "step": 345 }, { "epoch": 1.5309734513274336, "grad_norm": 0.0615234375, "learning_rate": 9.380530973451328e-05, "loss": 0.8734, "step": 346 }, { "epoch": 1.5353982300884956, "grad_norm": 0.0791015625, "learning_rate": 9.29203539823009e-05, "loss": 0.9752, "step": 347 }, { "epoch": 1.5398230088495575, "grad_norm": 0.0751953125, "learning_rate": 9.20353982300885e-05, "loss": 0.7664, "step": 348 }, { "epoch": 1.5442477876106193, "grad_norm": 0.0888671875, "learning_rate": 9.115044247787611e-05, "loss": 0.8328, "step": 349 }, { "epoch": 1.5486725663716814, "grad_norm": 0.0712890625, "learning_rate": 9.026548672566371e-05, "loss": 0.8581, "step": 350 }, { "epoch": 1.5530973451327434, "grad_norm": 0.0888671875, "learning_rate": 8.938053097345133e-05, "loss": 0.7521, "step": 351 }, { "epoch": 1.5575221238938053, "grad_norm": 0.0810546875, "learning_rate": 8.849557522123895e-05, "loss": 1.1778, "step": 352 }, { "epoch": 1.5619469026548671, "grad_norm": 0.08447265625, "learning_rate": 8.761061946902655e-05, "loss": 0.8007, "step": 353 }, { "epoch": 1.5663716814159292, "grad_norm": 0.08544921875, "learning_rate": 8.672566371681417e-05, "loss": 1.1795, "step": 354 }, { "epoch": 1.5707964601769913, "grad_norm": 0.08642578125, "learning_rate": 8.584070796460177e-05, "loss": 0.9632, "step": 355 }, { "epoch": 1.575221238938053, "grad_norm": 0.11572265625, "learning_rate": 8.495575221238938e-05, "loss": 0.7671, "step": 356 }, { "epoch": 1.579646017699115, "grad_norm": 0.1396484375, "learning_rate": 8.4070796460177e-05, "loss": 0.692, "step": 357 }, { "epoch": 1.584070796460177, "grad_norm": 0.10791015625, "learning_rate": 8.31858407079646e-05, "loss": 0.6548, "step": 358 }, { "epoch": 1.588495575221239, "grad_norm": 0.080078125, "learning_rate": 8.230088495575221e-05, "loss": 0.805, "step": 359 }, { "epoch": 1.592920353982301, "grad_norm": 0.06005859375, "learning_rate": 8.141592920353983e-05, "loss": 0.7988, "step": 360 }, { "epoch": 1.5973451327433628, "grad_norm": 0.07861328125, "learning_rate": 8.053097345132744e-05, "loss": 0.9695, "step": 361 }, { "epoch": 1.6017699115044248, "grad_norm": 0.07421875, "learning_rate": 7.964601769911504e-05, "loss": 1.0397, "step": 362 }, { "epoch": 1.606194690265487, "grad_norm": 0.0830078125, "learning_rate": 7.876106194690266e-05, "loss": 0.9098, "step": 363 }, { "epoch": 1.6106194690265485, "grad_norm": 0.07861328125, "learning_rate": 7.787610619469027e-05, "loss": 0.9249, "step": 364 }, { "epoch": 1.6150442477876106, "grad_norm": 0.0615234375, "learning_rate": 7.699115044247787e-05, "loss": 0.7443, "step": 365 }, { "epoch": 1.6194690265486726, "grad_norm": 0.08935546875, "learning_rate": 7.610619469026549e-05, "loss": 0.8042, "step": 366 }, { "epoch": 1.6238938053097345, "grad_norm": 0.0810546875, "learning_rate": 7.522123893805309e-05, "loss": 0.8271, "step": 367 }, { "epoch": 1.6283185840707963, "grad_norm": 0.06884765625, "learning_rate": 7.433628318584072e-05, "loss": 0.9711, "step": 368 }, { "epoch": 1.6327433628318584, "grad_norm": 0.06689453125, "learning_rate": 7.345132743362832e-05, "loss": 0.8821, "step": 369 }, { "epoch": 1.6371681415929205, "grad_norm": 0.0556640625, "learning_rate": 7.256637168141593e-05, "loss": 0.7417, "step": 370 }, { "epoch": 1.6415929203539823, "grad_norm": 0.06591796875, "learning_rate": 7.168141592920355e-05, "loss": 0.9247, "step": 371 }, { "epoch": 1.6460176991150441, "grad_norm": 0.06396484375, "learning_rate": 7.079646017699115e-05, "loss": 0.9101, "step": 372 }, { "epoch": 1.6504424778761062, "grad_norm": 0.091796875, "learning_rate": 6.991150442477876e-05, "loss": 1.0123, "step": 373 }, { "epoch": 1.6548672566371683, "grad_norm": 0.103515625, "learning_rate": 6.902654867256638e-05, "loss": 0.7791, "step": 374 }, { "epoch": 1.6592920353982301, "grad_norm": 0.07275390625, "learning_rate": 6.814159292035398e-05, "loss": 1.0589, "step": 375 }, { "epoch": 1.663716814159292, "grad_norm": 0.058349609375, "learning_rate": 6.725663716814161e-05, "loss": 0.8401, "step": 376 }, { "epoch": 1.668141592920354, "grad_norm": 0.059814453125, "learning_rate": 6.637168141592921e-05, "loss": 0.8201, "step": 377 }, { "epoch": 1.672566371681416, "grad_norm": 0.0927734375, "learning_rate": 6.548672566371682e-05, "loss": 0.913, "step": 378 }, { "epoch": 1.676991150442478, "grad_norm": 0.060302734375, "learning_rate": 6.460176991150442e-05, "loss": 0.8276, "step": 379 }, { "epoch": 1.6814159292035398, "grad_norm": 0.08349609375, "learning_rate": 6.371681415929204e-05, "loss": 0.7729, "step": 380 }, { "epoch": 1.6858407079646018, "grad_norm": 0.0703125, "learning_rate": 6.283185840707965e-05, "loss": 1.0113, "step": 381 }, { "epoch": 1.6902654867256637, "grad_norm": 0.0634765625, "learning_rate": 6.194690265486725e-05, "loss": 0.8446, "step": 382 }, { "epoch": 1.6946902654867255, "grad_norm": 0.0673828125, "learning_rate": 6.106194690265487e-05, "loss": 0.8878, "step": 383 }, { "epoch": 1.6991150442477876, "grad_norm": 0.1103515625, "learning_rate": 6.017699115044248e-05, "loss": 0.6718, "step": 384 }, { "epoch": 1.7035398230088497, "grad_norm": 0.060302734375, "learning_rate": 5.92920353982301e-05, "loss": 0.8153, "step": 385 }, { "epoch": 1.7079646017699115, "grad_norm": 0.0712890625, "learning_rate": 5.8407079646017705e-05, "loss": 0.9931, "step": 386 }, { "epoch": 1.7123893805309733, "grad_norm": 0.0556640625, "learning_rate": 5.752212389380531e-05, "loss": 0.7466, "step": 387 }, { "epoch": 1.7168141592920354, "grad_norm": 0.09033203125, "learning_rate": 5.663716814159292e-05, "loss": 0.9364, "step": 388 }, { "epoch": 1.7212389380530975, "grad_norm": 0.068359375, "learning_rate": 5.575221238938053e-05, "loss": 0.8851, "step": 389 }, { "epoch": 1.7256637168141593, "grad_norm": 0.061279296875, "learning_rate": 5.486725663716814e-05, "loss": 0.8714, "step": 390 }, { "epoch": 1.7300884955752212, "grad_norm": 0.06982421875, "learning_rate": 5.398230088495575e-05, "loss": 0.8885, "step": 391 }, { "epoch": 1.7345132743362832, "grad_norm": 0.06298828125, "learning_rate": 5.309734513274337e-05, "loss": 0.8724, "step": 392 }, { "epoch": 1.7389380530973453, "grad_norm": 0.08056640625, "learning_rate": 5.221238938053098e-05, "loss": 1.1328, "step": 393 }, { "epoch": 1.7433628318584071, "grad_norm": 0.099609375, "learning_rate": 5.132743362831859e-05, "loss": 0.7735, "step": 394 }, { "epoch": 1.747787610619469, "grad_norm": 0.06982421875, "learning_rate": 5.0442477876106195e-05, "loss": 0.9325, "step": 395 }, { "epoch": 1.752212389380531, "grad_norm": 0.07080078125, "learning_rate": 4.955752212389381e-05, "loss": 0.9273, "step": 396 }, { "epoch": 1.7566371681415929, "grad_norm": 0.10009765625, "learning_rate": 4.867256637168142e-05, "loss": 0.7756, "step": 397 }, { "epoch": 1.7610619469026547, "grad_norm": 0.0908203125, "learning_rate": 4.778761061946903e-05, "loss": 1.0591, "step": 398 }, { "epoch": 1.7654867256637168, "grad_norm": 0.09423828125, "learning_rate": 4.690265486725664e-05, "loss": 0.7867, "step": 399 }, { "epoch": 1.7699115044247788, "grad_norm": 0.0888671875, "learning_rate": 4.601769911504425e-05, "loss": 0.8369, "step": 400 }, { "epoch": 1.7743362831858407, "grad_norm": 0.06396484375, "learning_rate": 4.5132743362831855e-05, "loss": 0.9999, "step": 401 }, { "epoch": 1.7787610619469025, "grad_norm": 0.061767578125, "learning_rate": 4.4247787610619477e-05, "loss": 0.8612, "step": 402 }, { "epoch": 1.7831858407079646, "grad_norm": 0.09716796875, "learning_rate": 4.3362831858407084e-05, "loss": 0.8529, "step": 403 }, { "epoch": 1.7876106194690267, "grad_norm": 0.07763671875, "learning_rate": 4.247787610619469e-05, "loss": 0.8809, "step": 404 }, { "epoch": 1.7920353982300885, "grad_norm": 0.07177734375, "learning_rate": 4.15929203539823e-05, "loss": 0.9739, "step": 405 }, { "epoch": 1.7964601769911503, "grad_norm": 0.07568359375, "learning_rate": 4.0707964601769914e-05, "loss": 0.9416, "step": 406 }, { "epoch": 1.8008849557522124, "grad_norm": 0.061767578125, "learning_rate": 3.982300884955752e-05, "loss": 0.8359, "step": 407 }, { "epoch": 1.8053097345132745, "grad_norm": 0.0712890625, "learning_rate": 3.893805309734514e-05, "loss": 0.9323, "step": 408 }, { "epoch": 1.8097345132743363, "grad_norm": 0.0810546875, "learning_rate": 3.8053097345132744e-05, "loss": 0.8084, "step": 409 }, { "epoch": 1.8141592920353982, "grad_norm": 0.06298828125, "learning_rate": 3.716814159292036e-05, "loss": 0.9237, "step": 410 }, { "epoch": 1.8185840707964602, "grad_norm": 0.08447265625, "learning_rate": 3.628318584070797e-05, "loss": 1.0047, "step": 411 }, { "epoch": 1.823008849557522, "grad_norm": 0.0654296875, "learning_rate": 3.5398230088495574e-05, "loss": 0.9763, "step": 412 }, { "epoch": 1.827433628318584, "grad_norm": 0.06201171875, "learning_rate": 3.451327433628319e-05, "loss": 0.7498, "step": 413 }, { "epoch": 1.831858407079646, "grad_norm": 0.087890625, "learning_rate": 3.3628318584070804e-05, "loss": 0.8973, "step": 414 }, { "epoch": 1.836283185840708, "grad_norm": 0.0966796875, "learning_rate": 3.274336283185841e-05, "loss": 0.9526, "step": 415 }, { "epoch": 1.8407079646017699, "grad_norm": 0.061767578125, "learning_rate": 3.185840707964602e-05, "loss": 0.9184, "step": 416 }, { "epoch": 1.8451327433628317, "grad_norm": 0.0673828125, "learning_rate": 3.097345132743363e-05, "loss": 0.9124, "step": 417 }, { "epoch": 1.8495575221238938, "grad_norm": 0.056884765625, "learning_rate": 3.008849557522124e-05, "loss": 0.8303, "step": 418 }, { "epoch": 1.8539823008849559, "grad_norm": 0.0703125, "learning_rate": 2.9203539823008852e-05, "loss": 0.9533, "step": 419 }, { "epoch": 1.8584070796460177, "grad_norm": 0.064453125, "learning_rate": 2.831858407079646e-05, "loss": 0.8822, "step": 420 }, { "epoch": 1.8628318584070795, "grad_norm": 0.072265625, "learning_rate": 2.743362831858407e-05, "loss": 0.911, "step": 421 }, { "epoch": 1.8672566371681416, "grad_norm": 0.060546875, "learning_rate": 2.6548672566371686e-05, "loss": 0.8209, "step": 422 }, { "epoch": 1.8716814159292037, "grad_norm": 0.072265625, "learning_rate": 2.5663716814159294e-05, "loss": 0.8294, "step": 423 }, { "epoch": 1.8761061946902655, "grad_norm": 0.061279296875, "learning_rate": 2.4778761061946905e-05, "loss": 0.7602, "step": 424 }, { "epoch": 1.8805309734513274, "grad_norm": 0.0810546875, "learning_rate": 2.3893805309734516e-05, "loss": 0.8862, "step": 425 }, { "epoch": 1.8849557522123894, "grad_norm": 0.06494140625, "learning_rate": 2.3008849557522124e-05, "loss": 0.8715, "step": 426 }, { "epoch": 1.8893805309734515, "grad_norm": 0.06982421875, "learning_rate": 2.2123893805309738e-05, "loss": 0.9235, "step": 427 }, { "epoch": 1.893805309734513, "grad_norm": 0.07958984375, "learning_rate": 2.1238938053097346e-05, "loss": 0.8975, "step": 428 }, { "epoch": 1.8982300884955752, "grad_norm": 0.08935546875, "learning_rate": 2.0353982300884957e-05, "loss": 1.0014, "step": 429 }, { "epoch": 1.9026548672566372, "grad_norm": 0.05712890625, "learning_rate": 1.946902654867257e-05, "loss": 0.8397, "step": 430 }, { "epoch": 1.907079646017699, "grad_norm": 0.0859375, "learning_rate": 1.858407079646018e-05, "loss": 1.0832, "step": 431 }, { "epoch": 1.911504424778761, "grad_norm": 0.09375, "learning_rate": 1.7699115044247787e-05, "loss": 0.7726, "step": 432 }, { "epoch": 1.915929203539823, "grad_norm": 0.06884765625, "learning_rate": 1.6814159292035402e-05, "loss": 0.936, "step": 433 }, { "epoch": 1.920353982300885, "grad_norm": 0.062255859375, "learning_rate": 1.592920353982301e-05, "loss": 1.0048, "step": 434 }, { "epoch": 1.924778761061947, "grad_norm": 0.08349609375, "learning_rate": 1.504424778761062e-05, "loss": 0.864, "step": 435 }, { "epoch": 1.9292035398230087, "grad_norm": 0.0869140625, "learning_rate": 1.415929203539823e-05, "loss": 0.9952, "step": 436 }, { "epoch": 1.9336283185840708, "grad_norm": 0.06982421875, "learning_rate": 1.3274336283185843e-05, "loss": 0.8628, "step": 437 }, { "epoch": 1.9380530973451329, "grad_norm": 0.060546875, "learning_rate": 1.2389380530973452e-05, "loss": 0.8487, "step": 438 }, { "epoch": 1.9424778761061947, "grad_norm": 0.0634765625, "learning_rate": 1.1504424778761062e-05, "loss": 0.8495, "step": 439 }, { "epoch": 1.9469026548672566, "grad_norm": 0.06689453125, "learning_rate": 1.0619469026548673e-05, "loss": 0.8815, "step": 440 }, { "epoch": 1.9513274336283186, "grad_norm": 0.0634765625, "learning_rate": 9.734513274336284e-06, "loss": 0.8667, "step": 441 }, { "epoch": 1.9557522123893807, "grad_norm": 0.0869140625, "learning_rate": 8.849557522123894e-06, "loss": 0.7515, "step": 442 }, { "epoch": 1.9601769911504425, "grad_norm": 0.07275390625, "learning_rate": 7.964601769911505e-06, "loss": 0.8048, "step": 443 }, { "epoch": 1.9646017699115044, "grad_norm": 0.0625, "learning_rate": 7.079646017699115e-06, "loss": 0.9373, "step": 444 }, { "epoch": 1.9690265486725664, "grad_norm": 0.0859375, "learning_rate": 6.194690265486726e-06, "loss": 0.7985, "step": 445 }, { "epoch": 1.9734513274336283, "grad_norm": 0.1083984375, "learning_rate": 5.3097345132743365e-06, "loss": 0.9149, "step": 446 }, { "epoch": 1.9778761061946901, "grad_norm": 0.05615234375, "learning_rate": 4.424778761061947e-06, "loss": 0.8296, "step": 447 }, { "epoch": 1.9823008849557522, "grad_norm": 0.061767578125, "learning_rate": 3.5398230088495575e-06, "loss": 0.8539, "step": 448 }, { "epoch": 1.9867256637168142, "grad_norm": 0.0625, "learning_rate": 2.6548672566371683e-06, "loss": 0.8847, "step": 449 }, { "epoch": 1.991150442477876, "grad_norm": 0.09033203125, "learning_rate": 1.7699115044247788e-06, "loss": 0.8814, "step": 450 }, { "epoch": 1.995575221238938, "grad_norm": 0.0625, "learning_rate": 8.849557522123894e-07, "loss": 0.8299, "step": 451 }, { "epoch": 2.0, "grad_norm": 0.12451171875, "learning_rate": 0.0, "loss": 0.8232, "step": 452 } ], "logging_steps": 1.0, "max_steps": 452, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4086515032577802e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }