{
  "best_metric": NaN,
  "best_model_checkpoint": "miner_id_24/checkpoint-50",
  "epoch": 0.11883541295306001,
  "eval_steps": 50,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0011883541295306002,
      "grad_norm": NaN,
      "learning_rate": 5.000000000000001e-07,
      "loss": 781.0919,
      "step": 1
    },
    {
      "epoch": 0.0011883541295306002,
      "eval_loss": NaN,
      "eval_runtime": 87.1867,
      "eval_samples_per_second": 65.021,
      "eval_steps_per_second": 8.132,
      "step": 1
    },
    {
      "epoch": 0.0023767082590612004,
      "grad_norm": NaN,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.0,
      "step": 2
    },
    {
      "epoch": 0.0035650623885918,
      "grad_norm": NaN,
      "learning_rate": 1.5e-06,
      "loss": 0.0,
      "step": 3
    },
    {
      "epoch": 0.004753416518122401,
      "grad_norm": NaN,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.0,
      "step": 4
    },
    {
      "epoch": 0.0059417706476530005,
      "grad_norm": NaN,
      "learning_rate": 2.5e-06,
      "loss": 0.0,
      "step": 5
    },
    {
      "epoch": 0.0071301247771836,
      "grad_norm": NaN,
      "learning_rate": 3e-06,
      "loss": 0.0,
      "step": 6
    },
    {
      "epoch": 0.008318478906714201,
      "grad_norm": NaN,
      "learning_rate": 3.5e-06,
      "loss": 0.0,
      "step": 7
    },
    {
      "epoch": 0.009506833036244802,
      "grad_norm": NaN,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.0,
      "step": 8
    },
    {
      "epoch": 0.0106951871657754,
      "grad_norm": NaN,
      "learning_rate": 4.5e-06,
      "loss": 0.0,
      "step": 9
    },
    {
      "epoch": 0.011883541295306001,
      "grad_norm": NaN,
      "learning_rate": 5e-06,
      "loss": 0.0,
      "step": 10
    },
    {
      "epoch": 0.013071895424836602,
      "grad_norm": NaN,
      "learning_rate": 4.99847706754774e-06,
      "loss": 0.0,
      "step": 11
    },
    {
      "epoch": 0.0142602495543672,
      "grad_norm": NaN,
      "learning_rate": 4.993910125649561e-06,
      "loss": 0.0,
      "step": 12
    },
    {
      "epoch": 0.015448603683897801,
      "grad_norm": NaN,
      "learning_rate": 4.986304738420684e-06,
      "loss": 0.0,
      "step": 13
    },
    {
      "epoch": 0.016636957813428402,
      "grad_norm": NaN,
      "learning_rate": 4.975670171853926e-06,
      "loss": 0.0,
      "step": 14
    },
    {
      "epoch": 0.017825311942959002,
      "grad_norm": NaN,
      "learning_rate": 4.962019382530521e-06,
      "loss": 0.0,
      "step": 15
    },
    {
      "epoch": 0.019013666072489603,
      "grad_norm": NaN,
      "learning_rate": 4.9453690018345144e-06,
      "loss": 0.0,
      "step": 16
    },
    {
      "epoch": 0.020202020202020204,
      "grad_norm": NaN,
      "learning_rate": 4.925739315689991e-06,
      "loss": 0.0,
      "step": 17
    },
    {
      "epoch": 0.0213903743315508,
      "grad_norm": NaN,
      "learning_rate": 4.903154239845798e-06,
      "loss": 0.0,
      "step": 18
    },
    {
      "epoch": 0.0225787284610814,
      "grad_norm": NaN,
      "learning_rate": 4.8776412907378845e-06,
      "loss": 0.0,
      "step": 19
    },
    {
      "epoch": 0.023767082590612002,
      "grad_norm": NaN,
      "learning_rate": 4.849231551964771e-06,
      "loss": 0.0,
      "step": 20
    },
    {
      "epoch": 0.024955436720142603,
      "grad_norm": NaN,
      "learning_rate": 4.817959636416969e-06,
      "loss": 0.0,
      "step": 21
    },
    {
      "epoch": 0.026143790849673203,
      "grad_norm": NaN,
      "learning_rate": 4.783863644106502e-06,
      "loss": 0.0,
      "step": 22
    },
    {
      "epoch": 0.027332144979203804,
      "grad_norm": NaN,
      "learning_rate": 4.746985115747918e-06,
      "loss": 0.0,
      "step": 23
    },
    {
      "epoch": 0.0285204991087344,
      "grad_norm": NaN,
      "learning_rate": 4.707368982147318e-06,
      "loss": 0.0,
      "step": 24
    },
    {
      "epoch": 0.029708853238265002,
      "grad_norm": NaN,
      "learning_rate": 4.665063509461098e-06,
      "loss": 0.0,
      "step": 25
    },
    {
      "epoch": 0.030897207367795602,
      "grad_norm": NaN,
      "learning_rate": 4.620120240391065e-06,
      "loss": 0.0,
      "step": 26
    },
    {
      "epoch": 0.03208556149732621,
      "grad_norm": NaN,
      "learning_rate": 4.572593931387604e-06,
      "loss": 0.0,
      "step": 27
    },
    {
      "epoch": 0.033273915626856804,
      "grad_norm": NaN,
      "learning_rate": 4.522542485937369e-06,
      "loss": 0.0,
      "step": 28
    },
    {
      "epoch": 0.0344622697563874,
      "grad_norm": NaN,
      "learning_rate": 4.470026884016805e-06,
      "loss": 0.0,
      "step": 29
    },
    {
      "epoch": 0.035650623885918005,
      "grad_norm": NaN,
      "learning_rate": 4.415111107797445e-06,
      "loss": 0.0,
      "step": 30
    },
    {
      "epoch": 0.0368389780154486,
      "grad_norm": NaN,
      "learning_rate": 4.357862063693486e-06,
      "loss": 0.0,
      "step": 31
    },
    {
      "epoch": 0.038027332144979206,
      "grad_norm": NaN,
      "learning_rate": 4.2983495008466285e-06,
      "loss": 0.0,
      "step": 32
    },
    {
      "epoch": 0.0392156862745098,
      "grad_norm": NaN,
      "learning_rate": 4.236645926147493e-06,
      "loss": 0.0,
      "step": 33
    },
    {
      "epoch": 0.04040404040404041,
      "grad_norm": NaN,
      "learning_rate": 4.172826515897146e-06,
      "loss": 0.0,
      "step": 34
    },
    {
      "epoch": 0.041592394533571005,
      "grad_norm": NaN,
      "learning_rate": 4.106969024216348e-06,
      "loss": 0.0,
      "step": 35
    },
    {
      "epoch": 0.0427807486631016,
      "grad_norm": NaN,
      "learning_rate": 4.039153688314146e-06,
      "loss": 0.0,
      "step": 36
    },
    {
      "epoch": 0.043969102792632206,
      "grad_norm": NaN,
      "learning_rate": 3.969463130731183e-06,
      "loss": 0.0,
      "step": 37
    },
    {
      "epoch": 0.0451574569221628,
      "grad_norm": NaN,
      "learning_rate": 3.897982258676867e-06,
      "loss": 0.0,
      "step": 38
    },
    {
      "epoch": 0.04634581105169341,
      "grad_norm": NaN,
      "learning_rate": 3.824798160583012e-06,
      "loss": 0.0,
      "step": 39
    },
    {
      "epoch": 0.047534165181224004,
      "grad_norm": NaN,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.0,
      "step": 40
    },
    {
      "epoch": 0.0487225193107546,
      "grad_norm": NaN,
      "learning_rate": 3.6736789069647273e-06,
      "loss": 0.0,
      "step": 41
    },
    {
      "epoch": 0.049910873440285206,
      "grad_norm": NaN,
      "learning_rate": 3.595927866972694e-06,
      "loss": 0.0,
      "step": 42
    },
    {
      "epoch": 0.0510992275698158,
      "grad_norm": NaN,
      "learning_rate": 3.516841607689501e-06,
      "loss": 0.0,
      "step": 43
    },
    {
      "epoch": 0.05228758169934641,
      "grad_norm": NaN,
      "learning_rate": 3.436516483539781e-06,
      "loss": 0.0,
      "step": 44
    },
    {
      "epoch": 0.053475935828877004,
      "grad_norm": NaN,
      "learning_rate": 3.3550503583141726e-06,
      "loss": 0.0,
      "step": 45
    },
    {
      "epoch": 0.05466428995840761,
      "grad_norm": NaN,
      "learning_rate": 3.272542485937369e-06,
      "loss": 0.0,
      "step": 46
    },
    {
      "epoch": 0.055852644087938205,
      "grad_norm": NaN,
      "learning_rate": 3.189093389542498e-06,
      "loss": 0.0,
      "step": 47
    },
    {
      "epoch": 0.0570409982174688,
      "grad_norm": NaN,
      "learning_rate": 3.1048047389991693e-06,
      "loss": 0.0,
      "step": 48
    },
    {
      "epoch": 0.058229352346999406,
      "grad_norm": NaN,
      "learning_rate": 3.019779227044398e-06,
      "loss": 0.0,
      "step": 49
    },
    {
      "epoch": 0.059417706476530004,
      "grad_norm": NaN,
      "learning_rate": 2.9341204441673267e-06,
      "loss": 0.0,
      "step": 50
    },
    {
      "epoch": 0.059417706476530004,
      "eval_loss": NaN,
      "eval_runtime": 86.4908,
      "eval_samples_per_second": 65.545,
      "eval_steps_per_second": 8.197,
      "step": 50
    },
    {
      "epoch": 0.06060606060606061,
      "grad_norm": NaN,
      "learning_rate": 2.847932752400164e-06,
      "loss": 0.0,
      "step": 51
    },
    {
      "epoch": 0.061794414735591205,
      "grad_norm": NaN,
      "learning_rate": 2.761321158169134e-06,
      "loss": 0.0,
      "step": 52
    },
    {
      "epoch": 0.0629827688651218,
      "grad_norm": NaN,
      "learning_rate": 2.6743911843603134e-06,
      "loss": 0.0,
      "step": 53
    },
    {
      "epoch": 0.06417112299465241,
      "grad_norm": NaN,
      "learning_rate": 2.587248741756253e-06,
      "loss": 0.0,
      "step": 54
    },
    {
      "epoch": 0.06535947712418301,
      "grad_norm": NaN,
      "learning_rate": 2.5e-06,
      "loss": 0.0,
      "step": 55
    },
    {
      "epoch": 0.06654783125371361,
      "grad_norm": NaN,
      "learning_rate": 2.4127512582437486e-06,
      "loss": 0.0,
      "step": 56
    },
    {
      "epoch": 0.0677361853832442,
      "grad_norm": NaN,
      "learning_rate": 2.325608815639687e-06,
      "loss": 0.0,
      "step": 57
    },
    {
      "epoch": 0.0689245395127748,
      "grad_norm": NaN,
      "learning_rate": 2.238678841830867e-06,
      "loss": 0.0,
      "step": 58
    },
    {
      "epoch": 0.07011289364230541,
      "grad_norm": NaN,
      "learning_rate": 2.1520672475998374e-06,
      "loss": 0.0,
      "step": 59
    },
    {
      "epoch": 0.07130124777183601,
      "grad_norm": NaN,
      "learning_rate": 2.0658795558326745e-06,
      "loss": 0.0,
      "step": 60
    },
    {
      "epoch": 0.07248960190136661,
      "grad_norm": NaN,
      "learning_rate": 1.9802207729556023e-06,
      "loss": 0.0,
      "step": 61
    },
    {
      "epoch": 0.0736779560308972,
      "grad_norm": NaN,
      "learning_rate": 1.895195261000831e-06,
      "loss": 0.0,
      "step": 62
    },
    {
      "epoch": 0.0748663101604278,
      "grad_norm": NaN,
      "learning_rate": 1.8109066104575023e-06,
      "loss": 0.0,
      "step": 63
    },
    {
      "epoch": 0.07605466428995841,
      "grad_norm": NaN,
      "learning_rate": 1.7274575140626318e-06,
      "loss": 0.0,
      "step": 64
    },
    {
      "epoch": 0.07724301841948901,
      "grad_norm": NaN,
      "learning_rate": 1.6449496416858285e-06,
      "loss": 0.0,
      "step": 65
    },
    {
      "epoch": 0.0784313725490196,
      "grad_norm": NaN,
      "learning_rate": 1.56348351646022e-06,
      "loss": 0.0,
      "step": 66
    },
    {
      "epoch": 0.0796197266785502,
      "grad_norm": NaN,
      "learning_rate": 1.4831583923105e-06,
      "loss": 0.0,
      "step": 67
    },
    {
      "epoch": 0.08080808080808081,
      "grad_norm": NaN,
      "learning_rate": 1.4040721330273063e-06,
      "loss": 0.0,
      "step": 68
    },
    {
      "epoch": 0.08199643493761141,
      "grad_norm": NaN,
      "learning_rate": 1.3263210930352737e-06,
      "loss": 0.0,
      "step": 69
    },
    {
      "epoch": 0.08318478906714201,
      "grad_norm": NaN,
      "learning_rate": 1.2500000000000007e-06,
      "loss": 0.0,
      "step": 70
    },
    {
      "epoch": 0.0843731431966726,
      "grad_norm": NaN,
      "learning_rate": 1.1752018394169882e-06,
      "loss": 0.0,
      "step": 71
    },
    {
      "epoch": 0.0855614973262032,
      "grad_norm": NaN,
      "learning_rate": 1.1020177413231334e-06,
      "loss": 0.0,
      "step": 72
    },
    {
      "epoch": 0.08674985145573381,
      "grad_norm": NaN,
      "learning_rate": 1.0305368692688175e-06,
      "loss": 0.0,
      "step": 73
    },
    {
      "epoch": 0.08793820558526441,
      "grad_norm": NaN,
      "learning_rate": 9.608463116858544e-07,
      "loss": 0.0,
      "step": 74
    },
    {
      "epoch": 0.08912655971479501,
      "grad_norm": NaN,
      "learning_rate": 8.930309757836517e-07,
      "loss": 0.0,
      "step": 75
    },
    {
      "epoch": 0.0903149138443256,
      "grad_norm": NaN,
      "learning_rate": 8.271734841028553e-07,
      "loss": 0.0,
      "step": 76
    },
    {
      "epoch": 0.0915032679738562,
      "grad_norm": NaN,
      "learning_rate": 7.633540738525066e-07,
      "loss": 0.0,
      "step": 77
    },
    {
      "epoch": 0.09269162210338681,
      "grad_norm": NaN,
      "learning_rate": 7.016504991533727e-07,
      "loss": 0.0,
      "step": 78
    },
    {
      "epoch": 0.09387997623291741,
      "grad_norm": NaN,
      "learning_rate": 6.421379363065142e-07,
      "loss": 0.0,
      "step": 79
    },
    {
      "epoch": 0.09506833036244801,
      "grad_norm": NaN,
      "learning_rate": 5.848888922025553e-07,
      "loss": 0.0,
      "step": 80
    },
    {
      "epoch": 0.0962566844919786,
      "grad_norm": NaN,
      "learning_rate": 5.299731159831953e-07,
      "loss": 0.0,
      "step": 81
    },
    {
      "epoch": 0.0974450386215092,
      "grad_norm": NaN,
      "learning_rate": 4.774575140626317e-07,
      "loss": 0.0,
      "step": 82
    },
    {
      "epoch": 0.09863339275103981,
      "grad_norm": NaN,
      "learning_rate": 4.27406068612396e-07,
      "loss": 0.0,
      "step": 83
    },
    {
      "epoch": 0.09982174688057041,
      "grad_norm": NaN,
      "learning_rate": 3.798797596089351e-07,
      "loss": 0.0,
      "step": 84
    },
    {
      "epoch": 0.10101010101010101,
      "grad_norm": NaN,
      "learning_rate": 3.3493649053890325e-07,
      "loss": 0.0,
      "step": 85
    },
    {
      "epoch": 0.1021984551396316,
      "grad_norm": NaN,
      "learning_rate": 2.9263101785268253e-07,
      "loss": 0.0,
      "step": 86
    },
    {
      "epoch": 0.10338680926916222,
      "grad_norm": NaN,
      "learning_rate": 2.53014884252083e-07,
      "loss": 0.0,
      "step": 87
    },
    {
      "epoch": 0.10457516339869281,
      "grad_norm": NaN,
      "learning_rate": 2.1613635589349756e-07,
      "loss": 0.0,
      "step": 88
    },
    {
      "epoch": 0.10576351752822341,
      "grad_norm": NaN,
      "learning_rate": 1.8204036358303173e-07,
      "loss": 0.0,
      "step": 89
    },
    {
      "epoch": 0.10695187165775401,
      "grad_norm": NaN,
      "learning_rate": 1.507684480352292e-07,
      "loss": 0.0,
      "step": 90
    },
    {
      "epoch": 0.1081402257872846,
      "grad_norm": NaN,
      "learning_rate": 1.223587092621162e-07,
      "loss": 0.0,
      "step": 91
    },
    {
      "epoch": 0.10932857991681522,
      "grad_norm": NaN,
      "learning_rate": 9.684576015420277e-08,
      "loss": 0.0,
      "step": 92
    },
    {
      "epoch": 0.11051693404634581,
      "grad_norm": NaN,
      "learning_rate": 7.426068431000883e-08,
      "loss": 0.0,
      "step": 93
    },
    {
      "epoch": 0.11170528817587641,
      "grad_norm": NaN,
      "learning_rate": 5.463099816548578e-08,
      "loss": 0.0,
      "step": 94
    },
    {
      "epoch": 0.11289364230540701,
      "grad_norm": NaN,
      "learning_rate": 3.798061746947995e-08,
      "loss": 0.0,
      "step": 95
    },
    {
      "epoch": 0.1140819964349376,
      "grad_norm": NaN,
      "learning_rate": 2.4329828146074096e-08,
      "loss": 0.0,
      "step": 96
    },
    {
      "epoch": 0.11527035056446822,
      "grad_norm": NaN,
      "learning_rate": 1.3695261579316776e-08,
      "loss": 0.0,
      "step": 97
    },
    {
      "epoch": 0.11645870469399881,
      "grad_norm": NaN,
      "learning_rate": 6.089874350439507e-09,
      "loss": 0.0,
      "step": 98
    },
    {
      "epoch": 0.11764705882352941,
      "grad_norm": NaN,
      "learning_rate": 1.5229324522605949e-09,
      "loss": 0.0,
      "step": 99
    },
    {
      "epoch": 0.11883541295306001,
      "grad_norm": NaN,
      "learning_rate": 0.0,
      "loss": 0.0,
      "step": 100
    },
    {
      "epoch": 0.11883541295306001,
      "eval_loss": NaN,
      "eval_runtime": 86.2387,
      "eval_samples_per_second": 65.736,
      "eval_steps_per_second": 8.221,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 1
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.894631517814784e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}