| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 100, | |
| "global_step": 2230, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0004484304932735426, | |
| "grad_norm": 4.696539476451585, | |
| "learning_rate": 1.3452914798206278e-08, | |
| "loss": 0.9912, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.004484304932735426, | |
| "grad_norm": 5.089904667658368, | |
| "learning_rate": 1.345291479820628e-07, | |
| "loss": 1.0341, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.008968609865470852, | |
| "grad_norm": 5.546828630388097, | |
| "learning_rate": 2.690582959641256e-07, | |
| "loss": 1.0502, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.013452914798206279, | |
| "grad_norm": 4.113849381101499, | |
| "learning_rate": 4.0358744394618834e-07, | |
| "loss": 1.0386, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.017937219730941704, | |
| "grad_norm": 3.6548963622814887, | |
| "learning_rate": 5.381165919282512e-07, | |
| "loss": 1.0282, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.02242152466367713, | |
| "grad_norm": 2.157564670206396, | |
| "learning_rate": 6.72645739910314e-07, | |
| "loss": 0.9574, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.026905829596412557, | |
| "grad_norm": 2.0184475272019555, | |
| "learning_rate": 8.071748878923767e-07, | |
| "loss": 0.9263, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.03139013452914798, | |
| "grad_norm": 1.7894937443172652, | |
| "learning_rate": 9.417040358744395e-07, | |
| "loss": 0.9253, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.03587443946188341, | |
| "grad_norm": 1.6533764414432808, | |
| "learning_rate": 1.0762331838565023e-06, | |
| "loss": 0.9106, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.04035874439461883, | |
| "grad_norm": 1.9561381307359194, | |
| "learning_rate": 1.2107623318385651e-06, | |
| "loss": 0.8713, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.04484304932735426, | |
| "grad_norm": 1.5478472557018526, | |
| "learning_rate": 1.345291479820628e-06, | |
| "loss": 0.8741, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04484304932735426, | |
| "eval_loss": 0.8599640727043152, | |
| "eval_runtime": 430.7233, | |
| "eval_samples_per_second": 116.263, | |
| "eval_steps_per_second": 1.818, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04932735426008968, | |
| "grad_norm": 1.5759592930264636, | |
| "learning_rate": 1.4798206278026905e-06, | |
| "loss": 0.8381, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.053811659192825115, | |
| "grad_norm": 1.5446577353242628, | |
| "learning_rate": 1.6143497757847533e-06, | |
| "loss": 0.8151, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.05829596412556054, | |
| "grad_norm": 1.6899841974229757, | |
| "learning_rate": 1.7488789237668162e-06, | |
| "loss": 0.8309, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.06278026905829596, | |
| "grad_norm": 1.6274283098945213, | |
| "learning_rate": 1.883408071748879e-06, | |
| "loss": 0.8509, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.06726457399103139, | |
| "grad_norm": 1.7690619100525546, | |
| "learning_rate": 2.0179372197309418e-06, | |
| "loss": 0.8057, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.07174887892376682, | |
| "grad_norm": 1.866473004768342, | |
| "learning_rate": 2.1524663677130046e-06, | |
| "loss": 0.8236, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.07623318385650224, | |
| "grad_norm": 1.5528009019380091, | |
| "learning_rate": 2.2869955156950674e-06, | |
| "loss": 0.7936, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.08071748878923767, | |
| "grad_norm": 1.8924349879943885, | |
| "learning_rate": 2.4215246636771302e-06, | |
| "loss": 0.8054, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.08520179372197309, | |
| "grad_norm": 1.5998254884542162, | |
| "learning_rate": 2.556053811659193e-06, | |
| "loss": 0.7971, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.08968609865470852, | |
| "grad_norm": 1.553085624058612, | |
| "learning_rate": 2.690582959641256e-06, | |
| "loss": 0.8038, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08968609865470852, | |
| "eval_loss": 0.8094644546508789, | |
| "eval_runtime": 412.1717, | |
| "eval_samples_per_second": 121.495, | |
| "eval_steps_per_second": 1.9, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.09417040358744394, | |
| "grad_norm": 1.6621378080442881, | |
| "learning_rate": 2.8251121076233187e-06, | |
| "loss": 0.7815, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.09865470852017937, | |
| "grad_norm": 1.5875832641605891, | |
| "learning_rate": 2.959641255605381e-06, | |
| "loss": 0.8088, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.1031390134529148, | |
| "grad_norm": 1.6006597094640902, | |
| "learning_rate": 2.99990995533251e-06, | |
| "loss": 0.8141, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.10762331838565023, | |
| "grad_norm": 1.7932554350094232, | |
| "learning_rate": 2.9994689462512194e-06, | |
| "loss": 0.7834, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.11210762331838565, | |
| "grad_norm": 1.6444723214299724, | |
| "learning_rate": 2.998660541859271e-06, | |
| "loss": 0.7797, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.11659192825112108, | |
| "grad_norm": 1.790145213655978, | |
| "learning_rate": 2.9974849402294452e-06, | |
| "loss": 0.8046, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.1210762331838565, | |
| "grad_norm": 1.8694283184605, | |
| "learning_rate": 2.9959424294040703e-06, | |
| "loss": 0.7802, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.12556053811659193, | |
| "grad_norm": 1.6030839509233756, | |
| "learning_rate": 2.9940333873244464e-06, | |
| "loss": 0.8032, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.13004484304932734, | |
| "grad_norm": 1.664910362160235, | |
| "learning_rate": 2.991758281738245e-06, | |
| "loss": 0.7802, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.13452914798206278, | |
| "grad_norm": 1.6726792291262853, | |
| "learning_rate": 2.989117670084902e-06, | |
| "loss": 0.7937, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.13452914798206278, | |
| "eval_loss": 0.7789004445075989, | |
| "eval_runtime": 410.6605, | |
| "eval_samples_per_second": 121.943, | |
| "eval_steps_per_second": 1.907, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.13901345291479822, | |
| "grad_norm": 1.4685211047526556, | |
| "learning_rate": 2.986112199359036e-06, | |
| "loss": 0.7486, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.14349775784753363, | |
| "grad_norm": 2.0076694355781575, | |
| "learning_rate": 2.9827426059519237e-06, | |
| "loss": 0.808, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.14798206278026907, | |
| "grad_norm": 1.557780179088859, | |
| "learning_rate": 2.9790097154710697e-06, | |
| "loss": 0.7849, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.15246636771300448, | |
| "grad_norm": 1.3610248283116362, | |
| "learning_rate": 2.9749144425379216e-06, | |
| "loss": 0.7696, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.15695067264573992, | |
| "grad_norm": 1.5050628258310632, | |
| "learning_rate": 2.9704577905637718e-06, | |
| "loss": 0.7497, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.16143497757847533, | |
| "grad_norm": 1.4313536098763806, | |
| "learning_rate": 2.9656408515039017e-06, | |
| "loss": 0.7544, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.16591928251121077, | |
| "grad_norm": 1.6003065628553548, | |
| "learning_rate": 2.9604648055900368e-06, | |
| "loss": 0.7648, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.17040358744394618, | |
| "grad_norm": 1.633334409956319, | |
| "learning_rate": 2.9549309210411697e-06, | |
| "loss": 0.7471, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.17488789237668162, | |
| "grad_norm": 1.5700271693529286, | |
| "learning_rate": 2.949040553752826e-06, | |
| "loss": 0.8009, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.17937219730941703, | |
| "grad_norm": 1.4854276734758955, | |
| "learning_rate": 2.9427951469648425e-06, | |
| "loss": 0.7712, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.17937219730941703, | |
| "eval_loss": 0.7643527388572693, | |
| "eval_runtime": 413.4678, | |
| "eval_samples_per_second": 121.115, | |
| "eval_steps_per_second": 1.894, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.18385650224215247, | |
| "grad_norm": 1.4160940764229815, | |
| "learning_rate": 2.936196230907755e-06, | |
| "loss": 0.7532, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.18834080717488788, | |
| "grad_norm": 1.4265290618310995, | |
| "learning_rate": 2.929245422427861e-06, | |
| "loss": 0.7703, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.19282511210762332, | |
| "grad_norm": 1.6899882763333507, | |
| "learning_rate": 2.9219444245910674e-06, | |
| "loss": 0.7919, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.19730941704035873, | |
| "grad_norm": 1.4186337044303068, | |
| "learning_rate": 2.9142950262656098e-06, | |
| "loss": 0.7477, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.20179372197309417, | |
| "grad_norm": 1.4178331376670448, | |
| "learning_rate": 2.9062991016837496e-06, | |
| "loss": 0.7734, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2062780269058296, | |
| "grad_norm": 1.4503162574851487, | |
| "learning_rate": 2.897958609982556e-06, | |
| "loss": 0.7447, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.21076233183856502, | |
| "grad_norm": 1.558520612711291, | |
| "learning_rate": 2.8892755947238818e-06, | |
| "loss": 0.741, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.21524663677130046, | |
| "grad_norm": 1.4382572158325275, | |
| "learning_rate": 2.8802521833936595e-06, | |
| "loss": 0.7563, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.21973094170403587, | |
| "grad_norm": 1.5964216489171685, | |
| "learning_rate": 2.870890586880629e-06, | |
| "loss": 0.7554, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.2242152466367713, | |
| "grad_norm": 1.496069010720812, | |
| "learning_rate": 2.8611930989346322e-06, | |
| "loss": 0.7393, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2242152466367713, | |
| "eval_loss": 0.7564548254013062, | |
| "eval_runtime": 408.8965, | |
| "eval_samples_per_second": 122.469, | |
| "eval_steps_per_second": 1.915, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.22869955156950672, | |
| "grad_norm": 1.4866290735466012, | |
| "learning_rate": 2.851162095604607e-06, | |
| "loss": 0.7499, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.23318385650224216, | |
| "grad_norm": 1.3341919240907245, | |
| "learning_rate": 2.8408000346564136e-06, | |
| "loss": 0.7524, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.23766816143497757, | |
| "grad_norm": 1.6374942242171213, | |
| "learning_rate": 2.8301094549706405e-06, | |
| "loss": 0.7386, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.242152466367713, | |
| "grad_norm": 1.6225803035616944, | |
| "learning_rate": 2.8190929759205366e-06, | |
| "loss": 0.7616, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.24663677130044842, | |
| "grad_norm": 1.4683777464043755, | |
| "learning_rate": 2.807753296730219e-06, | |
| "loss": 0.7564, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.25112107623318386, | |
| "grad_norm": 1.350460716883926, | |
| "learning_rate": 2.7960931958133183e-06, | |
| "loss": 0.7424, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.2556053811659193, | |
| "grad_norm": 1.522474854464212, | |
| "learning_rate": 2.7841155300922202e-06, | |
| "loss": 0.7331, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.2600896860986547, | |
| "grad_norm": 1.448720887976205, | |
| "learning_rate": 2.7718232342980693e-06, | |
| "loss": 0.7657, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.2645739910313901, | |
| "grad_norm": 1.6744619426337854, | |
| "learning_rate": 2.759219320251714e-06, | |
| "loss": 0.7363, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.26905829596412556, | |
| "grad_norm": 1.3585539591402243, | |
| "learning_rate": 2.7463068761257554e-06, | |
| "loss": 0.7458, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.26905829596412556, | |
| "eval_loss": 0.7505608797073364, | |
| "eval_runtime": 408.9234, | |
| "eval_samples_per_second": 122.461, | |
| "eval_steps_per_second": 1.915, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.273542600896861, | |
| "grad_norm": 1.580932873164111, | |
| "learning_rate": 2.7330890656878943e-06, | |
| "loss": 0.7565, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.27802690582959644, | |
| "grad_norm": 1.5329888412189265, | |
| "learning_rate": 2.7195691275257547e-06, | |
| "loss": 0.7457, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.2825112107623318, | |
| "grad_norm": 1.6754413400622026, | |
| "learning_rate": 2.7057503742533753e-06, | |
| "loss": 0.7392, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.28699551569506726, | |
| "grad_norm": 1.6247897070260917, | |
| "learning_rate": 2.691636191699562e-06, | |
| "loss": 0.758, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.2914798206278027, | |
| "grad_norm": 1.42356323236888, | |
| "learning_rate": 2.6772300380783013e-06, | |
| "loss": 0.7626, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.29596412556053814, | |
| "grad_norm": 1.4955853270730488, | |
| "learning_rate": 2.662535443141443e-06, | |
| "loss": 0.7355, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.3004484304932735, | |
| "grad_norm": 1.4879073313151545, | |
| "learning_rate": 2.647556007313847e-06, | |
| "loss": 0.7545, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.30493273542600896, | |
| "grad_norm": 1.4153755477305148, | |
| "learning_rate": 2.6322954008112213e-06, | |
| "loss": 0.7378, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.3094170403587444, | |
| "grad_norm": 1.4019993036978922, | |
| "learning_rate": 2.616757362740855e-06, | |
| "loss": 0.7387, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.31390134529147984, | |
| "grad_norm": 1.5335241758091316, | |
| "learning_rate": 2.600945700185474e-06, | |
| "loss": 0.7694, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.31390134529147984, | |
| "eval_loss": 0.7457958459854126, | |
| "eval_runtime": 408.7761, | |
| "eval_samples_per_second": 122.505, | |
| "eval_steps_per_second": 1.915, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.3183856502242152, | |
| "grad_norm": 1.47263429505246, | |
| "learning_rate": 2.5848642872704417e-06, | |
| "loss": 0.7246, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.32286995515695066, | |
| "grad_norm": 1.5062835613914285, | |
| "learning_rate": 2.5685170642145337e-06, | |
| "loss": 0.7338, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.3273542600896861, | |
| "grad_norm": 1.6182138547104117, | |
| "learning_rate": 2.5519080363645134e-06, | |
| "loss": 0.73, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.33183856502242154, | |
| "grad_norm": 1.3515300425343295, | |
| "learning_rate": 2.53504127321376e-06, | |
| "loss": 0.7299, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.336322869955157, | |
| "grad_norm": 1.5798782493243635, | |
| "learning_rate": 2.517920907405168e-06, | |
| "loss": 0.7293, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.34080717488789236, | |
| "grad_norm": 1.4549259580353344, | |
| "learning_rate": 2.5005511337185824e-06, | |
| "loss": 0.7621, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.3452914798206278, | |
| "grad_norm": 1.456599605633329, | |
| "learning_rate": 2.4829362080430077e-06, | |
| "loss": 0.7438, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.34977578475336324, | |
| "grad_norm": 1.4128813340833153, | |
| "learning_rate": 2.4650804463338406e-06, | |
| "loss": 0.7413, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.3542600896860987, | |
| "grad_norm": 1.5613737124434628, | |
| "learning_rate": 2.4469882235553887e-06, | |
| "loss": 0.7477, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.35874439461883406, | |
| "grad_norm": 1.6383373422678345, | |
| "learning_rate": 2.4286639726089293e-06, | |
| "loss": 0.713, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.35874439461883406, | |
| "eval_loss": 0.7421520352363586, | |
| "eval_runtime": 408.0589, | |
| "eval_samples_per_second": 122.72, | |
| "eval_steps_per_second": 1.919, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.3632286995515695, | |
| "grad_norm": 1.3492102003393152, | |
| "learning_rate": 2.4101121832465754e-06, | |
| "loss": 0.7185, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.36771300448430494, | |
| "grad_norm": 1.4117655797526263, | |
| "learning_rate": 2.3913374009712084e-06, | |
| "loss": 0.7379, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.3721973094170404, | |
| "grad_norm": 1.5281693242796246, | |
| "learning_rate": 2.3723442259227547e-06, | |
| "loss": 0.7406, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.37668161434977576, | |
| "grad_norm": 1.6990323130848894, | |
| "learning_rate": 2.3531373117510695e-06, | |
| "loss": 0.7388, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.3811659192825112, | |
| "grad_norm": 1.476162200960684, | |
| "learning_rate": 2.33372136447572e-06, | |
| "loss": 0.7434, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.38565022421524664, | |
| "grad_norm": 1.3930484173784414, | |
| "learning_rate": 2.3141011413329244e-06, | |
| "loss": 0.7372, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.3901345291479821, | |
| "grad_norm": 1.4071716332679987, | |
| "learning_rate": 2.2942814496099532e-06, | |
| "loss": 0.7531, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.39461883408071746, | |
| "grad_norm": 1.5479232446038012, | |
| "learning_rate": 2.274267145467259e-06, | |
| "loss": 0.7216, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.3991031390134529, | |
| "grad_norm": 1.4255077423798548, | |
| "learning_rate": 2.254063132748637e-06, | |
| "loss": 0.7343, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.40358744394618834, | |
| "grad_norm": 1.57276996130409, | |
| "learning_rate": 2.2336743617797006e-06, | |
| "loss": 0.7347, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.40358744394618834, | |
| "eval_loss": 0.7386789321899414, | |
| "eval_runtime": 408.1839, | |
| "eval_samples_per_second": 122.682, | |
| "eval_steps_per_second": 1.918, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.4080717488789238, | |
| "grad_norm": 1.4568107529063017, | |
| "learning_rate": 2.213105828154964e-06, | |
| "loss": 0.7266, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.4125560538116592, | |
| "grad_norm": 1.374198091231606, | |
| "learning_rate": 2.192362571513841e-06, | |
| "loss": 0.7465, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.4170403587443946, | |
| "grad_norm": 1.3925457206301284, | |
| "learning_rate": 2.171449674305846e-06, | |
| "loss": 0.7427, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.42152466367713004, | |
| "grad_norm": 1.4443502855856463, | |
| "learning_rate": 2.1503722605453083e-06, | |
| "loss": 0.7428, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.4260089686098655, | |
| "grad_norm": 1.5268146365443709, | |
| "learning_rate": 2.1291354945559004e-06, | |
| "loss": 0.7163, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.4304932735426009, | |
| "grad_norm": 1.5000325455240473, | |
| "learning_rate": 2.1077445797052945e-06, | |
| "loss": 0.7472, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.4349775784753363, | |
| "grad_norm": 1.4869091852092478, | |
| "learning_rate": 2.086204757130243e-06, | |
| "loss": 0.7427, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.43946188340807174, | |
| "grad_norm": 1.4430282256544564, | |
| "learning_rate": 2.0645213044524194e-06, | |
| "loss": 0.7174, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.4439461883408072, | |
| "grad_norm": 1.4822025498870304, | |
| "learning_rate": 2.0426995344853043e-06, | |
| "loss": 0.7538, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.4484304932735426, | |
| "grad_norm": 1.5186234240452396, | |
| "learning_rate": 2.0207447939324598e-06, | |
| "loss": 0.7243, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.4484304932735426, | |
| "eval_loss": 0.7356163859367371, | |
| "eval_runtime": 407.0139, | |
| "eval_samples_per_second": 123.035, | |
| "eval_steps_per_second": 1.924, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.452914798206278, | |
| "grad_norm": 1.5742685454152958, | |
| "learning_rate": 1.998662462077496e-06, | |
| "loss": 0.7475, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.45739910313901344, | |
| "grad_norm": 1.3834168469611057, | |
| "learning_rate": 1.976457949466054e-06, | |
| "loss": 0.7568, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.4618834080717489, | |
| "grad_norm": 1.4947961999330186, | |
| "learning_rate": 1.954136696580132e-06, | |
| "loss": 0.7464, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.4663677130044843, | |
| "grad_norm": 1.4284253764088304, | |
| "learning_rate": 1.9317041725050747e-06, | |
| "loss": 0.7456, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.47085201793721976, | |
| "grad_norm": 1.4247354157320633, | |
| "learning_rate": 1.909165873589554e-06, | |
| "loss": 0.7008, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.47533632286995514, | |
| "grad_norm": 1.4525308368306575, | |
| "learning_rate": 1.886527322098871e-06, | |
| "loss": 0.7121, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.4798206278026906, | |
| "grad_norm": 1.43738036112722, | |
| "learning_rate": 1.8637940648619065e-06, | |
| "loss": 0.7308, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.484304932735426, | |
| "grad_norm": 1.402086349899742, | |
| "learning_rate": 1.8409716719120561e-06, | |
| "loss": 0.7164, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.48878923766816146, | |
| "grad_norm": 1.5227358428935063, | |
| "learning_rate": 1.8180657351224739e-06, | |
| "loss": 0.732, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.49327354260089684, | |
| "grad_norm": 1.5813743714389112, | |
| "learning_rate": 1.7950818668359733e-06, | |
| "loss": 0.7161, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.49327354260089684, | |
| "eval_loss": 0.7330535054206848, | |
| "eval_runtime": 408.4081, | |
| "eval_samples_per_second": 122.615, | |
| "eval_steps_per_second": 1.917, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.4977578475336323, | |
| "grad_norm": 1.4881819590713468, | |
| "learning_rate": 1.772025698489903e-06, | |
| "loss": 0.7144, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.5022421524663677, | |
| "grad_norm": 1.4750319990458514, | |
| "learning_rate": 1.7489028792363549e-06, | |
| "loss": 0.7365, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.5067264573991032, | |
| "grad_norm": 1.4443590686278198, | |
| "learning_rate": 1.7257190745580209e-06, | |
| "loss": 0.7487, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.5112107623318386, | |
| "grad_norm": 1.4695293763109774, | |
| "learning_rate": 1.7024799648800555e-06, | |
| "loss": 0.7233, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.515695067264574, | |
| "grad_norm": 1.4328944860273993, | |
| "learning_rate": 1.679191244178278e-06, | |
| "loss": 0.7322, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.5201793721973094, | |
| "grad_norm": 1.4157130638413895, | |
| "learning_rate": 1.6558586185840473e-06, | |
| "loss": 0.728, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.5246636771300448, | |
| "grad_norm": 1.4117533616122613, | |
| "learning_rate": 1.6324878049861656e-06, | |
| "loss": 0.7331, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.5291479820627802, | |
| "grad_norm": 1.4255877674393056, | |
| "learning_rate": 1.609084529630145e-06, | |
| "loss": 0.7491, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.5336322869955157, | |
| "grad_norm": 1.4486300200418207, | |
| "learning_rate": 1.5856545267151759e-06, | |
| "loss": 0.7261, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.5381165919282511, | |
| "grad_norm": 1.4628618883782867, | |
| "learning_rate": 1.5622035369891561e-06, | |
| "loss": 0.7247, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.5381165919282511, | |
| "eval_loss": 0.7308038473129272, | |
| "eval_runtime": 406.6873, | |
| "eval_samples_per_second": 123.134, | |
| "eval_steps_per_second": 1.925, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.5426008968609866, | |
| "grad_norm": 1.4112256357672157, | |
| "learning_rate": 1.5387373063421062e-06, | |
| "loss": 0.7307, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.547085201793722, | |
| "grad_norm": 1.3994109954542429, | |
| "learning_rate": 1.515261584398333e-06, | |
| "loss": 0.7062, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.5515695067264574, | |
| "grad_norm": 1.5279436893984248, | |
| "learning_rate": 1.491782123107669e-06, | |
| "loss": 0.7314, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.5560538116591929, | |
| "grad_norm": 1.4092281762272858, | |
| "learning_rate": 1.4683046753361521e-06, | |
| "loss": 0.7044, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.5605381165919282, | |
| "grad_norm": 1.4363381867810665, | |
| "learning_rate": 1.4448349934564736e-06, | |
| "loss": 0.7287, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.5650224215246636, | |
| "grad_norm": 1.4913351223697051, | |
| "learning_rate": 1.421378827938549e-06, | |
| "loss": 0.7254, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.5695067264573991, | |
| "grad_norm": 1.5096384680619075, | |
| "learning_rate": 1.3979419259405563e-06, | |
| "loss": 0.7389, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.5739910313901345, | |
| "grad_norm": 1.3495144573299676, | |
| "learning_rate": 1.3745300299007856e-06, | |
| "loss": 0.7247, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.57847533632287, | |
| "grad_norm": 1.3641879848291365, | |
| "learning_rate": 1.3511488761306412e-06, | |
| "loss": 0.7312, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.5829596412556054, | |
| "grad_norm": 1.3879105033157129, | |
| "learning_rate": 1.3278041934091524e-06, | |
| "loss": 0.7477, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.5829596412556054, | |
| "eval_loss": 0.7287724018096924, | |
| "eval_runtime": 406.882, | |
| "eval_samples_per_second": 123.075, | |
| "eval_steps_per_second": 1.924, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.5874439461883408, | |
| "grad_norm": 1.3916697284582622, | |
| "learning_rate": 1.3045017015793217e-06, | |
| "loss": 0.7246, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.5919282511210763, | |
| "grad_norm": 1.4328511876779917, | |
| "learning_rate": 1.2812471101466687e-06, | |
| "loss": 0.7303, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.5964125560538116, | |
| "grad_norm": 1.4411092846252307, | |
| "learning_rate": 1.2580461168803038e-06, | |
| "loss": 0.7318, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.600896860986547, | |
| "grad_norm": 1.4703965551927338, | |
| "learning_rate": 1.2349044064168782e-06, | |
| "loss": 0.7375, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.6053811659192825, | |
| "grad_norm": 1.4319057117061509, | |
| "learning_rate": 1.21182764886775e-06, | |
| "loss": 0.7302, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.6098654708520179, | |
| "grad_norm": 1.5017976848926429, | |
| "learning_rate": 1.188821498429714e-06, | |
| "loss": 0.7262, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.6143497757847534, | |
| "grad_norm": 1.4553869576056546, | |
| "learning_rate": 1.165891591999626e-06, | |
| "loss": 0.7447, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.6188340807174888, | |
| "grad_norm": 1.4128744043127173, | |
| "learning_rate": 1.1430435477932646e-06, | |
| "loss": 0.7423, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.6233183856502242, | |
| "grad_norm": 1.3797159286061107, | |
| "learning_rate": 1.1202829639687785e-06, | |
| "loss": 0.744, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.6278026905829597, | |
| "grad_norm": 1.487304571595245, | |
| "learning_rate": 1.0976154172550408e-06, | |
| "loss": 0.7429, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.6278026905829597, | |
| "eval_loss": 0.7272571921348572, | |
| "eval_runtime": 406.7541, | |
| "eval_samples_per_second": 123.114, | |
| "eval_steps_per_second": 1.925, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.6322869955156951, | |
| "grad_norm": 1.544512062570189, | |
| "learning_rate": 1.0750464615852523e-06, | |
| "loss": 0.7251, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.6367713004484304, | |
| "grad_norm": 1.422563130817404, | |
| "learning_rate": 1.0525816267361398e-06, | |
| "loss": 0.712, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.6412556053811659, | |
| "grad_norm": 1.4937681764382644, | |
| "learning_rate": 1.0302264169730613e-06, | |
| "loss": 0.7203, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.6457399103139013, | |
| "grad_norm": 1.50738757049434, | |
| "learning_rate": 1.0079863097013722e-06, | |
| "loss": 0.7121, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.6502242152466368, | |
| "grad_norm": 1.286396172710849, | |
| "learning_rate": 9.85866754124367e-07, | |
| "loss": 0.7193, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.6547085201793722, | |
| "grad_norm": 1.4997539342741677, | |
| "learning_rate": 9.638731699081281e-07, | |
| "loss": 0.7288, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.6591928251121076, | |
| "grad_norm": 1.37434247409356, | |
| "learning_rate": 9.42010945853623e-07, | |
| "loss": 0.7597, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.6636771300448431, | |
| "grad_norm": 1.3869436283100607, | |
| "learning_rate": 9.202854385763502e-07, | |
| "loss": 0.7184, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.6681614349775785, | |
| "grad_norm": 1.3970067087387381, | |
| "learning_rate": 8.987019711938812e-07, | |
| "loss": 0.7326, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.672645739910314, | |
| "grad_norm": 1.553183464191494, | |
| "learning_rate": 8.772658320216047e-07, | |
| "loss": 0.7317, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.672645739910314, | |
| "eval_loss": 0.7256098389625549, | |
| "eval_runtime": 406.6132, | |
| "eval_samples_per_second": 123.156, | |
| "eval_steps_per_second": 1.926, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6771300448430493, | |
| "grad_norm": 1.3357768297094936, | |
| "learning_rate": 8.55982273277002e-07, | |
| "loss": 0.7347, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.6816143497757847, | |
| "grad_norm": 1.3249788097985131, | |
| "learning_rate": 8.348565097927605e-07, | |
| "loss": 0.7496, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.6860986547085202, | |
| "grad_norm": 1.4578138220875878, | |
| "learning_rate": 8.13893717739056e-07, | |
| "loss": 0.7308, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.6905829596412556, | |
| "grad_norm": 1.3268077719441809, | |
| "learning_rate": 7.930990333553013e-07, | |
| "loss": 0.7094, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.695067264573991, | |
| "grad_norm": 1.47562182506043, | |
| "learning_rate": 7.72477551691678e-07, | |
| "loss": 0.697, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.6995515695067265, | |
| "grad_norm": 1.4850843190566259, | |
| "learning_rate": 7.520343253607677e-07, | |
| "loss": 0.7301, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.7040358744394619, | |
| "grad_norm": 1.5097763618083517, | |
| "learning_rate": 7.317743632995731e-07, | |
| "loss": 0.7217, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.7085201793721974, | |
| "grad_norm": 1.3914348509226637, | |
| "learning_rate": 7.117026295422425e-07, | |
| "loss": 0.6957, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.7130044843049327, | |
| "grad_norm": 1.5175208261545492, | |
| "learning_rate": 6.918240420038007e-07, | |
| "loss": 0.7317, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.7174887892376681, | |
| "grad_norm": 1.4947559578839034, | |
| "learning_rate": 6.721434712751745e-07, | |
| "loss": 0.7226, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.7174887892376681, | |
| "eval_loss": 0.7243176102638245, | |
| "eval_runtime": 406.7899, | |
| "eval_samples_per_second": 123.103, | |
| "eval_steps_per_second": 1.925, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.7219730941704036, | |
| "grad_norm": 1.5192098207309965, | |
| "learning_rate": 6.526657394298154e-07, | |
| "loss": 0.705, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.726457399103139, | |
| "grad_norm": 1.3665027387136646, | |
| "learning_rate": 6.333956188422088e-07, | |
| "loss": 0.706, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.7309417040358744, | |
| "grad_norm": 1.4974912840899435, | |
| "learning_rate": 6.143378310185643e-07, | |
| "loss": 0.6983, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.7354260089686099, | |
| "grad_norm": 1.5477574584643699, | |
| "learning_rate": 5.954970454399638e-07, | |
| "loss": 0.7252, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.7399103139013453, | |
| "grad_norm": 1.525090065151942, | |
| "learning_rate": 5.768778784182616e-07, | |
| "loss": 0.7087, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.7443946188340808, | |
| "grad_norm": 1.4837554579437873, | |
| "learning_rate": 5.584848919650069e-07, | |
| "loss": 0.7075, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.7488789237668162, | |
| "grad_norm": 1.3538329119260115, | |
| "learning_rate": 5.403225926736772e-07, | |
| "loss": 0.7057, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.7533632286995515, | |
| "grad_norm": 1.359895087573495, | |
| "learning_rate": 5.223954306154843e-07, | |
| "loss": 0.7306, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.757847533632287, | |
| "grad_norm": 1.4168148218595764, | |
| "learning_rate": 5.047077982490311e-07, | |
| "loss": 0.7424, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.7623318385650224, | |
| "grad_norm": 1.4815842671642683, | |
| "learning_rate": 4.872640293440861e-07, | |
| "loss": 0.695, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.7623318385650224, | |
| "eval_loss": 0.7233718633651733, | |
| "eval_runtime": 406.8015, | |
| "eval_samples_per_second": 123.099, | |
| "eval_steps_per_second": 1.925, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.7668161434977578, | |
| "grad_norm": 1.5501655544071418, | |
| "learning_rate": 4.7006839791973673e-07, | |
| "loss": 0.7327, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.7713004484304933, | |
| "grad_norm": 1.3834984705411, | |
| "learning_rate": 4.53125117197179e-07, | |
| "loss": 0.7245, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.7757847533632287, | |
| "grad_norm": 1.4041748328697374, | |
| "learning_rate": 4.364383385674112e-07, | |
| "loss": 0.7054, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.7802690582959642, | |
| "grad_norm": 1.443104622604103, | |
| "learning_rate": 4.2001215057407026e-07, | |
| "loss": 0.7037, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.7847533632286996, | |
| "grad_norm": 1.5632699202433824, | |
| "learning_rate": 4.038505779116687e-07, | |
| "loss": 0.705, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.7892376681614349, | |
| "grad_norm": 1.349615732583278, | |
| "learning_rate": 3.879575804394782e-07, | |
| "loss": 0.7071, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.7937219730941704, | |
| "grad_norm": 1.3657530768128234, | |
| "learning_rate": 3.7233705221129646e-07, | |
| "loss": 0.7273, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.7982062780269058, | |
| "grad_norm": 1.5107387856649341, | |
| "learning_rate": 3.569928205213354e-07, | |
| "loss": 0.6975, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.8026905829596412, | |
| "grad_norm": 1.4525568524987686, | |
| "learning_rate": 3.419286449664741e-07, | |
| "loss": 0.7095, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.8071748878923767, | |
| "grad_norm": 1.4847854049722584, | |
| "learning_rate": 3.2714821652508854e-07, | |
| "loss": 0.7167, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.8071748878923767, | |
| "eval_loss": 0.7225807309150696, | |
| "eval_runtime": 406.5326, | |
| "eval_samples_per_second": 123.181, | |
| "eval_steps_per_second": 1.926, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.8116591928251121, | |
| "grad_norm": 1.2447161837361285, | |
| "learning_rate": 3.126551566527036e-07, | |
| "loss": 0.7156, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.8161434977578476, | |
| "grad_norm": 1.4139333132454484, | |
| "learning_rate": 2.9845301639467284e-07, | |
| "loss": 0.7537, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.820627802690583, | |
| "grad_norm": 1.3663031642715642, | |
| "learning_rate": 2.8454527551611205e-07, | |
| "loss": 0.7238, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.8251121076233184, | |
| "grad_norm": 1.389263976301968, | |
| "learning_rate": 2.7093534164929904e-07, | |
| "loss": 0.738, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.8295964125560538, | |
| "grad_norm": 1.5068808968575202, | |
| "learning_rate": 2.576265494587458e-07, | |
| "loss": 0.7067, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.8340807174887892, | |
| "grad_norm": 1.4226178531466935, | |
| "learning_rate": 2.446221598241472e-07, | |
| "loss": 0.7143, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.8385650224215246, | |
| "grad_norm": 1.6881847148932905, | |
| "learning_rate": 2.319253590414132e-07, | |
| "loss": 0.7376, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.8430493273542601, | |
| "grad_norm": 1.4353283330892004, | |
| "learning_rate": 2.1953925804197056e-07, | |
| "loss": 0.7095, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.8475336322869955, | |
| "grad_norm": 1.4639605071750654, | |
| "learning_rate": 2.0746689163053113e-07, | |
| "loss": 0.7102, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.852017937219731, | |
| "grad_norm": 1.458703799588621, | |
| "learning_rate": 1.9571121774151545e-07, | |
| "loss": 0.686, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.852017937219731, | |
| "eval_loss": 0.7220604419708252, | |
| "eval_runtime": 406.5609, | |
| "eval_samples_per_second": 123.172, | |
| "eval_steps_per_second": 1.926, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.8565022421524664, | |
| "grad_norm": 1.470148783910905, | |
| "learning_rate": 1.8427511671430757e-07, | |
| "loss": 0.72, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.8609865470852018, | |
| "grad_norm": 1.3891242748262451, | |
| "learning_rate": 1.7316139058752194e-07, | |
| "loss": 0.7318, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.8654708520179372, | |
| "grad_norm": 1.2245069775705093, | |
| "learning_rate": 1.6237276241245867e-07, | |
| "loss": 0.7155, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.8699551569506726, | |
| "grad_norm": 1.360510189488915, | |
| "learning_rate": 1.519118755859084e-07, | |
| "loss": 0.7255, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.874439461883408, | |
| "grad_norm": 1.495119615923585, | |
| "learning_rate": 1.4178129320247486e-07, | |
| "loss": 0.7484, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.8789237668161435, | |
| "grad_norm": 1.3674856635367474, | |
| "learning_rate": 1.31983497426575e-07, | |
| "loss": 0.7366, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.8834080717488789, | |
| "grad_norm": 1.4494730150421093, | |
| "learning_rate": 1.2252088888426431e-07, | |
| "loss": 0.742, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.8878923766816144, | |
| "grad_norm": 1.4368197978682802, | |
| "learning_rate": 1.1339578607504536e-07, | |
| "loss": 0.7269, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.8923766816143498, | |
| "grad_norm": 1.4017197990051706, | |
| "learning_rate": 1.0461042480379402e-07, | |
| "loss": 0.7234, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.8968609865470852, | |
| "grad_norm": 1.426560347266084, | |
| "learning_rate": 9.616695763295007e-08, | |
| "loss": 0.7214, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8968609865470852, | |
| "eval_loss": 0.721759557723999, | |
| "eval_runtime": 406.5838, | |
| "eval_samples_per_second": 123.165, | |
| "eval_steps_per_second": 1.926, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.9013452914798207, | |
| "grad_norm": 1.489947255967281, | |
| "learning_rate": 8.806745335510297e-08, | |
| "loss": 0.7341, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.905829596412556, | |
| "grad_norm": 1.4312716003053576, | |
| "learning_rate": 8.031389648610266e-08, | |
| "loss": 0.7264, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.9103139013452914, | |
| "grad_norm": 1.4764400641380824, | |
| "learning_rate": 7.290818677881966e-08, | |
| "loss": 0.7301, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.9147982062780269, | |
| "grad_norm": 1.4381108917682341, | |
| "learning_rate": 6.585213875767305e-08, | |
| "loss": 0.6997, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.9192825112107623, | |
| "grad_norm": 1.459723127188453, | |
| "learning_rate": 5.914748127404102e-08, | |
| "loss": 0.7168, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.9237668161434978, | |
| "grad_norm": 1.5776619173541433, | |
| "learning_rate": 5.2795857082663655e-08, | |
| "loss": 0.72, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.9282511210762332, | |
| "grad_norm": 1.438610611700907, | |
| "learning_rate": 4.6798822439140185e-08, | |
| "loss": 0.7035, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.9327354260089686, | |
| "grad_norm": 1.4350411032390504, | |
| "learning_rate": 4.115784671861916e-08, | |
| "loss": 0.735, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.9372197309417041, | |
| "grad_norm": 1.4822578142933729, | |
| "learning_rate": 3.587431205577713e-08, | |
| "loss": 0.7178, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.9417040358744395, | |
| "grad_norm": 1.5001233187138816, | |
| "learning_rate": 3.0949513006172325e-08, | |
| "loss": 0.7358, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.9417040358744395, | |
| "eval_loss": 0.7216091752052307, | |
| "eval_runtime": 406.6258, | |
| "eval_samples_per_second": 123.153, | |
| "eval_steps_per_second": 1.926, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.9461883408071748, | |
| "grad_norm": 1.4457564058059627, | |
| "learning_rate": 2.6384656229056946e-08, | |
| "loss": 0.7285, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.9506726457399103, | |
| "grad_norm": 1.6789172768348999, | |
| "learning_rate": 2.218086019172394e-08, | |
| "loss": 0.7027, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.9551569506726457, | |
| "grad_norm": 1.4039832008414181, | |
| "learning_rate": 1.8339154895464894e-08, | |
| "loss": 0.7285, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.9596412556053812, | |
| "grad_norm": 1.7674026844330886, | |
| "learning_rate": 1.4860481623201417e-08, | |
| "loss": 0.713, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.9641255605381166, | |
| "grad_norm": 1.531580121339593, | |
| "learning_rate": 1.1745692708855282e-08, | |
| "loss": 0.7328, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.968609865470852, | |
| "grad_norm": 1.455884868550825, | |
| "learning_rate": 8.99555132851232e-09, | |
| "loss": 0.7196, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.9730941704035875, | |
| "grad_norm": 1.3157536936429735, | |
| "learning_rate": 6.610731313430318e-09, | |
| "loss": 0.7277, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.9775784753363229, | |
| "grad_norm": 1.5586404477319191, | |
| "learning_rate": 4.5918169849406e-09, | |
| "loss": 0.7265, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.9820627802690582, | |
| "grad_norm": 1.3596393082767964, | |
| "learning_rate": 2.939303011277872e-09, | |
| "loss": 0.719, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.9865470852017937, | |
| "grad_norm": 1.3866642718972106, | |
| "learning_rate": 1.6535942863788456e-09, | |
| "loss": 0.7259, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.9865470852017937, | |
| "eval_loss": 0.7215752005577087, | |
| "eval_runtime": 408.9437, | |
| "eval_samples_per_second": 122.455, | |
| "eval_steps_per_second": 1.915, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.9910313901345291, | |
| "grad_norm": 1.6643780128489514, | |
| "learning_rate": 7.350058306764273e-10, | |
| "loss": 0.7044, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.9955156950672646, | |
| "grad_norm": 1.428221428067804, | |
| "learning_rate": 1.8376271391412624e-10, | |
| "loss": 0.7109, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.3882910125414851, | |
| "learning_rate": 0.0, | |
| "loss": 0.7123, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 2230, | |
| "total_flos": 250303561007104.0, | |
| "train_loss": 0.7492096503219262, | |
| "train_runtime": 18007.2993, | |
| "train_samples_per_second": 15.851, | |
| "train_steps_per_second": 0.124 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2230, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 250303561007104.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |