{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 16.0,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.16,
      "grad_norm": 1.0079981088638306,
      "learning_rate": 4e-05,
      "loss": 2.3624,
      "step": 1
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.0123796463012695,
      "learning_rate": 8e-05,
      "loss": 2.4117,
      "step": 2
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.0385504961013794,
      "learning_rate": 0.00012,
      "loss": 2.4351,
      "step": 3
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.7601240277290344,
      "learning_rate": 0.00016,
      "loss": 1.9867,
      "step": 4
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.9805667400360107,
      "learning_rate": 0.0002,
      "loss": 2.0655,
      "step": 5
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.6322834491729736,
      "learning_rate": 0.00019789473684210526,
      "loss": 1.8377,
      "step": 6
    },
    {
      "epoch": 1.12,
      "grad_norm": 1.1208696365356445,
      "learning_rate": 0.00019578947368421054,
      "loss": 1.5558,
      "step": 7
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.3962080478668213,
      "learning_rate": 0.0001936842105263158,
      "loss": 1.4199,
      "step": 8
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.4532853364944458,
      "learning_rate": 0.00019157894736842104,
      "loss": 1.2996,
      "step": 9
    },
    {
      "epoch": 1.6,
      "grad_norm": 2.3988616466522217,
      "learning_rate": 0.00018947368421052632,
      "loss": 1.2371,
      "step": 10
    },
    {
      "epoch": 1.76,
      "grad_norm": 1.3582508563995361,
      "learning_rate": 0.0001873684210526316,
      "loss": 1.1839,
      "step": 11
    },
    {
      "epoch": 1.92,
      "grad_norm": 1.2997236251831055,
      "learning_rate": 0.00018526315789473685,
      "loss": 0.983,
      "step": 12
    },
    {
      "epoch": 2.08,
      "grad_norm": 1.1868802309036255,
      "learning_rate": 0.0001831578947368421,
      "loss": 0.7447,
      "step": 13
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.0286939144134521,
      "learning_rate": 0.00018105263157894739,
      "loss": 0.8524,
      "step": 14
    },
    {
      "epoch": 2.4,
      "grad_norm": 1.00070321559906,
      "learning_rate": 0.00017894736842105264,
      "loss": 0.8649,
      "step": 15
    },
    {
      "epoch": 2.56,
      "grad_norm": 1.4189987182617188,
      "learning_rate": 0.0001768421052631579,
      "loss": 0.8116,
      "step": 16
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 1.2303727865219116,
      "learning_rate": 0.00017473684210526317,
      "loss": 0.8071,
      "step": 17
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.9925879240036011,
      "learning_rate": 0.00017263157894736842,
      "loss": 0.7081,
      "step": 18
    },
    {
      "epoch": 3.04,
      "grad_norm": 1.0683646202087402,
      "learning_rate": 0.0001705263157894737,
      "loss": 0.5269,
      "step": 19
    },
    {
      "epoch": 3.2,
      "grad_norm": 1.0474812984466553,
      "learning_rate": 0.00016842105263157895,
      "loss": 0.6947,
      "step": 20
    },
    {
      "epoch": 3.36,
      "grad_norm": 1.0291672945022583,
      "learning_rate": 0.00016631578947368423,
      "loss": 0.5014,
      "step": 21
    },
    {
      "epoch": 3.52,
      "grad_norm": 1.1327933073043823,
      "learning_rate": 0.00016421052631578948,
      "loss": 0.481,
      "step": 22
    },
    {
      "epoch": 3.68,
      "grad_norm": 1.4890342950820923,
      "learning_rate": 0.00016210526315789473,
      "loss": 0.5253,
      "step": 23
    },
    {
      "epoch": 3.84,
      "grad_norm": 1.532833456993103,
      "learning_rate": 0.00016,
      "loss": 0.4937,
      "step": 24
    },
    {
      "epoch": 4.0,
      "grad_norm": 1.7453362941741943,
      "learning_rate": 0.00015789473684210527,
      "loss": 0.5188,
      "step": 25
    },
    {
      "epoch": 4.16,
      "grad_norm": 1.2242546081542969,
      "learning_rate": 0.00015578947368421052,
      "loss": 0.1893,
      "step": 26
    },
    {
      "epoch": 4.32,
      "grad_norm": 1.7437238693237305,
      "learning_rate": 0.0001536842105263158,
      "loss": 0.433,
      "step": 27
    },
    {
      "epoch": 4.48,
      "grad_norm": 1.4618209600448608,
      "learning_rate": 0.00015157894736842108,
      "loss": 0.3996,
      "step": 28
    },
    {
      "epoch": 4.64,
      "grad_norm": 1.3685592412948608,
      "learning_rate": 0.00014947368421052633,
      "loss": 0.2189,
      "step": 29
    },
    {
      "epoch": 4.8,
      "grad_norm": 1.741402268409729,
      "learning_rate": 0.00014736842105263158,
      "loss": 0.2934,
      "step": 30
    },
    {
      "epoch": 4.96,
      "grad_norm": 1.5545222759246826,
      "learning_rate": 0.00014526315789473686,
      "loss": 0.2099,
      "step": 31
    },
    {
      "epoch": 5.12,
      "grad_norm": 1.2092806100845337,
      "learning_rate": 0.0001431578947368421,
      "loss": 0.1916,
      "step": 32
    },
    {
      "epoch": 5.28,
      "grad_norm": 1.7175395488739014,
      "learning_rate": 0.00014105263157894736,
      "loss": 0.2527,
      "step": 33
    },
    {
      "epoch": 5.44,
      "grad_norm": 1.368059754371643,
      "learning_rate": 0.00013894736842105264,
      "loss": 0.114,
      "step": 34
    },
    {
      "epoch": 5.6,
      "grad_norm": 1.6632587909698486,
      "learning_rate": 0.0001368421052631579,
      "loss": 0.1549,
      "step": 35
    },
    {
      "epoch": 5.76,
      "grad_norm": 1.6607255935668945,
      "learning_rate": 0.00013473684210526317,
      "loss": 0.1171,
      "step": 36
    },
    {
      "epoch": 5.92,
      "grad_norm": 2.4954917430877686,
      "learning_rate": 0.00013263157894736842,
      "loss": 0.1614,
      "step": 37
    },
    {
      "epoch": 6.08,
      "grad_norm": 1.7216722965240479,
      "learning_rate": 0.0001305263157894737,
      "loss": 0.1459,
      "step": 38
    },
    {
      "epoch": 6.24,
      "grad_norm": 0.9449135065078735,
      "learning_rate": 0.00012842105263157895,
      "loss": 0.1001,
      "step": 39
    },
    {
      "epoch": 6.4,
      "grad_norm": 1.4137742519378662,
      "learning_rate": 0.0001263157894736842,
      "loss": 0.0859,
      "step": 40
    },
    {
      "epoch": 6.5600000000000005,
      "grad_norm": 1.8110110759735107,
      "learning_rate": 0.00012421052631578949,
      "loss": 0.1404,
      "step": 41
    },
    {
      "epoch": 6.72,
      "grad_norm": 1.1322952508926392,
      "learning_rate": 0.00012210526315789474,
      "loss": 0.0687,
      "step": 42
    },
    {
      "epoch": 6.88,
      "grad_norm": 2.2961461544036865,
      "learning_rate": 0.00012,
      "loss": 0.1203,
      "step": 43
    },
    {
      "epoch": 7.04,
      "grad_norm": 1.5652666091918945,
      "learning_rate": 0.00011789473684210525,
      "loss": 0.1299,
      "step": 44
    },
    {
      "epoch": 7.2,
      "grad_norm": 0.7390972375869751,
      "learning_rate": 0.00011578947368421053,
      "loss": 0.0595,
      "step": 45
    },
    {
      "epoch": 7.36,
      "grad_norm": 1.0376925468444824,
      "learning_rate": 0.0001136842105263158,
      "loss": 0.0578,
      "step": 46
    },
    {
      "epoch": 7.52,
      "grad_norm": 0.9976247549057007,
      "learning_rate": 0.00011157894736842105,
      "loss": 0.0695,
      "step": 47
    },
    {
      "epoch": 7.68,
      "grad_norm": 1.0853309631347656,
      "learning_rate": 0.00010947368421052633,
      "loss": 0.0985,
      "step": 48
    },
    {
      "epoch": 7.84,
      "grad_norm": 1.3621833324432373,
      "learning_rate": 0.00010736842105263158,
      "loss": 0.1269,
      "step": 49
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.8868013024330139,
      "learning_rate": 0.00010526315789473685,
      "loss": 0.0641,
      "step": 50
    },
    {
      "epoch": 8.16,
      "grad_norm": 0.6473409533500671,
      "learning_rate": 0.00010315789473684211,
      "loss": 0.0474,
      "step": 51
    },
    {
      "epoch": 8.32,
      "grad_norm": 1.6032112836837769,
      "learning_rate": 0.00010105263157894738,
      "loss": 0.0597,
      "step": 52
    },
    {
      "epoch": 8.48,
      "grad_norm": 1.120687484741211,
      "learning_rate": 9.894736842105263e-05,
      "loss": 0.0582,
      "step": 53
    },
    {
      "epoch": 8.64,
      "grad_norm": 0.7064136862754822,
      "learning_rate": 9.68421052631579e-05,
      "loss": 0.0557,
      "step": 54
    },
    {
      "epoch": 8.8,
      "grad_norm": 0.5838208794593811,
      "learning_rate": 9.473684210526316e-05,
      "loss": 0.0436,
      "step": 55
    },
    {
      "epoch": 8.96,
      "grad_norm": 1.2315547466278076,
      "learning_rate": 9.263157894736843e-05,
      "loss": 0.063,
      "step": 56
    },
    {
      "epoch": 9.12,
      "grad_norm": 0.3518936336040497,
      "learning_rate": 9.052631578947369e-05,
      "loss": 0.0311,
      "step": 57
    },
    {
      "epoch": 9.28,
      "grad_norm": 0.6926944851875305,
      "learning_rate": 8.842105263157894e-05,
      "loss": 0.039,
      "step": 58
    },
    {
      "epoch": 9.44,
      "grad_norm": 0.26300671696662903,
      "learning_rate": 8.631578947368421e-05,
      "loss": 0.0252,
      "step": 59
    },
    {
      "epoch": 9.6,
      "grad_norm": 0.7903566360473633,
      "learning_rate": 8.421052631578948e-05,
      "loss": 0.0415,
      "step": 60
    },
    {
      "epoch": 9.76,
      "grad_norm": 0.5427919626235962,
      "learning_rate": 8.210526315789474e-05,
      "loss": 0.0453,
      "step": 61
    },
    {
      "epoch": 9.92,
      "grad_norm": 0.5827217698097229,
      "learning_rate": 8e-05,
      "loss": 0.0368,
      "step": 62
    },
    {
      "epoch": 10.08,
      "grad_norm": 1.45575749874115,
      "learning_rate": 7.789473684210526e-05,
      "loss": 0.0736,
      "step": 63
    },
    {
      "epoch": 10.24,
      "grad_norm": 0.32767948508262634,
      "learning_rate": 7.578947368421054e-05,
      "loss": 0.0316,
      "step": 64
    },
    {
      "epoch": 10.4,
      "grad_norm": 0.30059218406677246,
      "learning_rate": 7.368421052631579e-05,
      "loss": 0.0277,
      "step": 65
    },
    {
      "epoch": 10.56,
      "grad_norm": 0.4859299659729004,
      "learning_rate": 7.157894736842105e-05,
      "loss": 0.0313,
      "step": 66
    },
    {
      "epoch": 10.72,
      "grad_norm": 0.4874284267425537,
      "learning_rate": 6.947368421052632e-05,
      "loss": 0.0322,
      "step": 67
    },
    {
      "epoch": 10.88,
      "grad_norm": 0.41711848974227905,
      "learning_rate": 6.736842105263159e-05,
      "loss": 0.0389,
      "step": 68
    },
    {
      "epoch": 11.04,
      "grad_norm": 0.8408872485160828,
      "learning_rate": 6.526315789473685e-05,
      "loss": 0.0312,
      "step": 69
    },
    {
      "epoch": 11.2,
      "grad_norm": 0.32355204224586487,
      "learning_rate": 6.31578947368421e-05,
      "loss": 0.0328,
      "step": 70
    },
    {
      "epoch": 11.36,
      "grad_norm": 0.42406928539276123,
      "learning_rate": 6.105263157894737e-05,
      "loss": 0.0277,
      "step": 71
    },
    {
      "epoch": 11.52,
      "grad_norm": 0.7678600549697876,
      "learning_rate": 5.894736842105263e-05,
      "loss": 0.0329,
      "step": 72
    },
    {
      "epoch": 11.68,
      "grad_norm": 0.29065871238708496,
      "learning_rate": 5.68421052631579e-05,
      "loss": 0.0297,
      "step": 73
    },
    {
      "epoch": 11.84,
      "grad_norm": 0.5853772163391113,
      "learning_rate": 5.4736842105263165e-05,
      "loss": 0.0393,
      "step": 74
    },
    {
      "epoch": 12.0,
      "grad_norm": 0.7088480591773987,
      "learning_rate": 5.2631578947368424e-05,
      "loss": 0.0344,
      "step": 75
    },
    {
      "epoch": 12.16,
      "grad_norm": 0.19609542191028595,
      "learning_rate": 5.052631578947369e-05,
      "loss": 0.0232,
      "step": 76
    },
    {
      "epoch": 12.32,
      "grad_norm": 0.31028512120246887,
      "learning_rate": 4.842105263157895e-05,
      "loss": 0.0273,
      "step": 77
    },
    {
      "epoch": 12.48,
      "grad_norm": 0.4248906672000885,
      "learning_rate": 4.6315789473684214e-05,
      "loss": 0.0315,
      "step": 78
    },
    {
      "epoch": 12.64,
      "grad_norm": 0.4214076101779938,
      "learning_rate": 4.421052631578947e-05,
      "loss": 0.0309,
      "step": 79
    },
    {
      "epoch": 12.8,
      "grad_norm": 0.4250756502151489,
      "learning_rate": 4.210526315789474e-05,
      "loss": 0.0285,
      "step": 80
    },
    {
      "epoch": 12.96,
      "grad_norm": 0.2500416934490204,
      "learning_rate": 4e-05,
      "loss": 0.0256,
      "step": 81
    },
    {
      "epoch": 13.12,
      "grad_norm": 0.2516506314277649,
      "learning_rate": 3.789473684210527e-05,
      "loss": 0.0244,
      "step": 82
    },
    {
      "epoch": 13.28,
      "grad_norm": 0.217052161693573,
      "learning_rate": 3.578947368421053e-05,
      "loss": 0.0241,
      "step": 83
    },
    {
      "epoch": 13.44,
      "grad_norm": 0.4375220835208893,
      "learning_rate": 3.368421052631579e-05,
      "loss": 0.0308,
      "step": 84
    },
    {
      "epoch": 13.6,
      "grad_norm": 0.23626229166984558,
      "learning_rate": 3.157894736842105e-05,
      "loss": 0.029,
      "step": 85
    },
    {
      "epoch": 13.76,
      "grad_norm": 0.3816908001899719,
      "learning_rate": 2.9473684210526314e-05,
      "loss": 0.0251,
      "step": 86
    },
    {
      "epoch": 13.92,
      "grad_norm": 0.17371943593025208,
      "learning_rate": 2.7368421052631583e-05,
      "loss": 0.0203,
      "step": 87
    },
    {
      "epoch": 14.08,
      "grad_norm": 0.21958455443382263,
      "learning_rate": 2.5263157894736845e-05,
      "loss": 0.0265,
      "step": 88
    },
    {
      "epoch": 14.24,
      "grad_norm": 0.2628728151321411,
      "learning_rate": 2.3157894736842107e-05,
      "loss": 0.0242,
      "step": 89
    },
    {
      "epoch": 14.4,
      "grad_norm": 0.2763591408729553,
      "learning_rate": 2.105263157894737e-05,
      "loss": 0.0299,
      "step": 90
    },
    {
      "epoch": 14.56,
      "grad_norm": 0.2944229245185852,
      "learning_rate": 1.8947368421052634e-05,
      "loss": 0.0244,
      "step": 91
    },
    {
      "epoch": 14.72,
      "grad_norm": 0.28353527188301086,
      "learning_rate": 1.6842105263157896e-05,
      "loss": 0.0241,
      "step": 92
    },
    {
      "epoch": 14.88,
      "grad_norm": 0.2161315530538559,
      "learning_rate": 1.4736842105263157e-05,
      "loss": 0.024,
      "step": 93
    },
    {
      "epoch": 15.04,
      "grad_norm": 0.2228800654411316,
      "learning_rate": 1.2631578947368422e-05,
      "loss": 0.0263,
      "step": 94
    },
    {
      "epoch": 15.2,
      "grad_norm": 0.17299261689186096,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 0.0227,
      "step": 95
    },
    {
      "epoch": 15.36,
      "grad_norm": 0.21846872568130493,
      "learning_rate": 8.421052631578948e-06,
      "loss": 0.0223,
      "step": 96
    },
    {
      "epoch": 15.52,
      "grad_norm": 0.23234839737415314,
      "learning_rate": 6.315789473684211e-06,
      "loss": 0.0269,
      "step": 97
    },
    {
      "epoch": 15.68,
      "grad_norm": 0.217283234000206,
      "learning_rate": 4.210526315789474e-06,
      "loss": 0.0259,
      "step": 98
    },
    {
      "epoch": 15.84,
      "grad_norm": 0.2666471600532532,
      "learning_rate": 2.105263157894737e-06,
      "loss": 0.027,
      "step": 99
    },
    {
      "epoch": 16.0,
      "grad_norm": 0.2889624536037445,
      "learning_rate": 0.0,
      "loss": 0.0248,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 17,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2992005070258176.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}