{
  "best_metric": 1.6396043300628662,
  "best_model_checkpoint": "content/lora_8bit_onco_raw/oncology/checkpoint-1200",
  "epoch": 2.9964100518548067,
  "eval_steps": 200,
  "global_step": 1878,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 0.7856405973434448,
      "learning_rate": 7.499999999999999e-06,
      "loss": 1.9839,
      "step": 5
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.7171689867973328,
      "learning_rate": 1.4999999999999999e-05,
      "loss": 1.8072,
      "step": 10
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.7502748966217041,
      "learning_rate": 2.2499999999999998e-05,
      "loss": 1.916,
      "step": 15
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.6175706386566162,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 1.8292,
      "step": 20
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.4654443562030792,
      "learning_rate": 3.75e-05,
      "loss": 1.8564,
      "step": 25
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.5197769999504089,
      "learning_rate": 4.4999999999999996e-05,
      "loss": 1.9732,
      "step": 30
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.6166006922721863,
      "learning_rate": 5.2499999999999995e-05,
      "loss": 1.8221,
      "step": 35
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.5110687613487244,
      "learning_rate": 5.9999999999999995e-05,
      "loss": 1.5955,
      "step": 40
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.6438449025154114,
      "learning_rate": 6.75e-05,
      "loss": 1.7571,
      "step": 45
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.4936439096927643,
      "learning_rate": 7.5e-05,
      "loss": 1.6779,
      "step": 50
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.554935872554779,
      "learning_rate": 8.25e-05,
      "loss": 1.74,
      "step": 55
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.49913492798805237,
      "learning_rate": 8.999999999999999e-05,
      "loss": 1.8844,
      "step": 60
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.5313754081726074,
      "learning_rate": 9.75e-05,
      "loss": 1.6513,
      "step": 65
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.5471957921981812,
      "learning_rate": 0.00010499999999999999,
      "loss": 1.7246,
      "step": 70
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.6044861674308777,
      "learning_rate": 0.0001125,
      "loss": 1.8523,
      "step": 75
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.5198370814323425,
      "learning_rate": 0.00011999999999999999,
      "loss": 1.7499,
      "step": 80
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.5604796409606934,
      "learning_rate": 0.00012749999999999998,
      "loss": 1.7664,
      "step": 85
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.5335713624954224,
      "learning_rate": 0.000135,
      "loss": 1.666,
      "step": 90
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.6011459827423096,
      "learning_rate": 0.0001425,
      "loss": 1.8222,
      "step": 95
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.6249136924743652,
      "learning_rate": 0.00015,
      "loss": 1.7571,
      "step": 100
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.5300886034965515,
      "learning_rate": 0.00015749999999999998,
      "loss": 1.7534,
      "step": 105
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.5625594854354858,
      "learning_rate": 0.000165,
      "loss": 1.6312,
      "step": 110
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.48619601130485535,
      "learning_rate": 0.00017249999999999996,
      "loss": 1.7695,
      "step": 115
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.5377641320228577,
      "learning_rate": 0.00017999999999999998,
      "loss": 1.7723,
      "step": 120
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.5358831882476807,
      "learning_rate": 0.00018749999999999998,
      "loss": 1.7541,
      "step": 125
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.5402532815933228,
      "learning_rate": 0.000195,
      "loss": 1.6796,
      "step": 130
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.5598291754722595,
      "learning_rate": 0.0002025,
      "loss": 1.6744,
      "step": 135
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.5184920430183411,
      "learning_rate": 0.00020999999999999998,
      "loss": 1.7768,
      "step": 140
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.5164936780929565,
      "learning_rate": 0.00021749999999999997,
      "loss": 1.6877,
      "step": 145
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.4958161413669586,
      "learning_rate": 0.000225,
      "loss": 1.8754,
      "step": 150
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.564961314201355,
      "learning_rate": 0.00023249999999999999,
      "loss": 1.6324,
      "step": 155
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.8469408750534058,
      "learning_rate": 0.00023999999999999998,
      "loss": 1.7146,
      "step": 160
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.6041777729988098,
      "learning_rate": 0.00024749999999999994,
      "loss": 1.6416,
      "step": 165
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.5473640561103821,
      "learning_rate": 0.00025499999999999996,
      "loss": 1.6726,
      "step": 170
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.6132262349128723,
      "learning_rate": 0.0002625,
      "loss": 1.7383,
      "step": 175
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.49731534719467163,
      "learning_rate": 0.00027,
      "loss": 1.6531,
      "step": 180
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.5386232733726501,
      "learning_rate": 0.00027749999999999997,
      "loss": 1.7335,
      "step": 185
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.5605046153068542,
      "learning_rate": 0.000285,
      "loss": 1.7373,
      "step": 190
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.7236664295196533,
      "learning_rate": 0.00029249999999999995,
      "loss": 1.5766,
      "step": 195
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.5735570788383484,
      "learning_rate": 0.0003,
      "loss": 1.7882,
      "step": 200
    },
    {
      "epoch": 0.32,
      "eval_loss": 1.7317848205566406,
      "eval_runtime": 53.4548,
      "eval_samples_per_second": 9.054,
      "eval_steps_per_second": 2.264,
      "step": 200
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.48291996121406555,
      "learning_rate": 0.0002999934277521501,
      "loss": 1.6244,
      "step": 205
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.5112442970275879,
      "learning_rate": 0.00029997371158452626,
      "loss": 1.6932,
      "step": 210
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.5557854771614075,
      "learning_rate": 0.0002999408532248557,
      "loss": 1.6655,
      "step": 215
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.477541446685791,
      "learning_rate": 0.00029989485555251554,
      "loss": 1.6773,
      "step": 220
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.45816484093666077,
      "learning_rate": 0.00029983572259828045,
      "loss": 1.5355,
      "step": 225
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.5112146735191345,
      "learning_rate": 0.00029976345954396954,
      "loss": 1.6489,
      "step": 230
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.5796986222267151,
      "learning_rate": 0.0002996780727219923,
      "loss": 1.6708,
      "step": 235
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.4717162251472473,
      "learning_rate": 0.0002995795696147933,
      "loss": 1.7605,
      "step": 240
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.5081738233566284,
      "learning_rate": 0.00029946795885419714,
      "loss": 1.7257,
      "step": 245
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.6322786808013916,
      "learning_rate": 0.0002993432502206515,
      "loss": 1.7894,
      "step": 250
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.626715898513794,
      "learning_rate": 0.00029920545464237033,
      "loss": 1.6251,
      "step": 255
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.5590308308601379,
      "learning_rate": 0.0002990545841943763,
      "loss": 1.602,
      "step": 260
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.5137120485305786,
      "learning_rate": 0.00029889065209744235,
      "loss": 1.664,
      "step": 265
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.5325127840042114,
      "learning_rate": 0.0002987136727169335,
      "loss": 1.5928,
      "step": 270
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.5906115174293518,
      "learning_rate": 0.0002985236615615478,
      "loss": 1.5565,
      "step": 275
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.5913872718811035,
      "learning_rate": 0.00029832063528195733,
      "loss": 1.701,
      "step": 280
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.5970895886421204,
      "learning_rate": 0.0002981046116693491,
      "loss": 1.5652,
      "step": 285
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.5957712531089783,
      "learning_rate": 0.00029787560965386614,
      "loss": 1.6611,
      "step": 290
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.5931127071380615,
      "learning_rate": 0.00029763364930294854,
      "loss": 1.6456,
      "step": 295
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.5497797131538391,
      "learning_rate": 0.00029737875181957486,
      "loss": 1.5782,
      "step": 300
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.5486745238304138,
      "learning_rate": 0.00029711093954040425,
      "loss": 1.7476,
      "step": 305
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.5858426690101624,
      "learning_rate": 0.0002968302359338191,
      "loss": 1.6871,
      "step": 310
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.5358626842498779,
      "learning_rate": 0.0002965366655978683,
      "loss": 1.8075,
      "step": 315
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.6227905750274658,
      "learning_rate": 0.00029623025425811215,
      "loss": 1.6059,
      "step": 320
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.5188565850257874,
      "learning_rate": 0.0002959110287653674,
      "loss": 1.7475,
      "step": 325
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.5319951772689819,
      "learning_rate": 0.00029557901709335497,
      "loss": 1.6401,
      "step": 330
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.5894659757614136,
      "learning_rate": 0.00029523424833624806,
      "loss": 1.7475,
      "step": 335
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.5921294093132019,
      "learning_rate": 0.00029487675270612303,
      "loss": 1.7581,
      "step": 340
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.528262734413147,
      "learning_rate": 0.0002945065615303116,
      "loss": 1.8347,
      "step": 345
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.6218762397766113,
      "learning_rate": 0.000294123707248656,
      "loss": 1.789,
      "step": 350
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.6058654189109802,
      "learning_rate": 0.000293728223410666,
      "loss": 1.646,
      "step": 355
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.5830885767936707,
      "learning_rate": 0.00029332014467257884,
      "loss": 1.7596,
      "step": 360
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.5829673409461975,
      "learning_rate": 0.0002928995067943227,
      "loss": 1.5936,
      "step": 365
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.569988489151001,
      "learning_rate": 0.00029246634663638286,
      "loss": 1.8244,
      "step": 370
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.5579255819320679,
      "learning_rate": 0.0002920207021565714,
      "loss": 1.7428,
      "step": 375
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.5439159274101257,
      "learning_rate": 0.00029156261240670145,
      "loss": 1.6165,
      "step": 380
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.6219534873962402,
      "learning_rate": 0.0002910921175291646,
      "loss": 1.727,
      "step": 385
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.5160556435585022,
      "learning_rate": 0.00029060925875341344,
      "loss": 1.5996,
      "step": 390
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.6084945797920227,
      "learning_rate": 0.0002901140783923487,
      "loss": 1.6613,
      "step": 395
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.552727222442627,
      "learning_rate": 0.00028960661983861124,
      "loss": 1.604,
      "step": 400
    },
    {
      "epoch": 0.64,
      "eval_loss": 1.7020632028579712,
      "eval_runtime": 53.5236,
      "eval_samples_per_second": 9.043,
      "eval_steps_per_second": 2.261,
      "step": 400
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.6136419177055359,
      "learning_rate": 0.00028908692756077957,
      "loss": 1.6674,
      "step": 405
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.6204317212104797,
      "learning_rate": 0.00028855504709947305,
      "loss": 1.7643,
      "step": 410
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.5804699659347534,
      "learning_rate": 0.0002880110250633613,
      "loss": 1.5933,
      "step": 415
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.5574301481246948,
      "learning_rate": 0.0002874549091250797,
      "loss": 1.6314,
      "step": 420
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.666695773601532,
      "learning_rate": 0.00028688674801705203,
      "loss": 1.62,
      "step": 425
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.5769309997558594,
      "learning_rate": 0.0002863065915272199,
      "loss": 1.7478,
      "step": 430
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.5807488560676575,
      "learning_rate": 0.0002857144904946799,
      "loss": 1.7793,
      "step": 435
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.60151606798172,
      "learning_rate": 0.0002851104968052284,
      "loss": 1.6921,
      "step": 440
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.5778372883796692,
      "learning_rate": 0.00028449466338681525,
      "loss": 1.6779,
      "step": 445
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.6416473388671875,
      "learning_rate": 0.00028386704420490513,
      "loss": 1.6848,
      "step": 450
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.681058943271637,
      "learning_rate": 0.0002832276942577491,
      "loss": 1.7468,
      "step": 455
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.6174286007881165,
      "learning_rate": 0.0002825766695715646,
      "loss": 1.6839,
      "step": 460
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.742919385433197,
      "learning_rate": 0.0002819140271956262,
      "loss": 1.7846,
      "step": 465
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.744145929813385,
      "learning_rate": 0.00028123982519726634,
      "loss": 1.6862,
      "step": 470
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.590994656085968,
      "learning_rate": 0.0002805541226567869,
      "loss": 1.6675,
      "step": 475
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.6487185955047607,
      "learning_rate": 0.0002798569796622817,
      "loss": 1.5819,
      "step": 480
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.6046218276023865,
      "learning_rate": 0.00027914845730437135,
      "loss": 1.6375,
      "step": 485
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.6337038278579712,
      "learning_rate": 0.00027842861767084996,
      "loss": 1.8259,
      "step": 490
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.6214140057563782,
      "learning_rate": 0.00027769752384124375,
      "loss": 1.7048,
      "step": 495
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.646763026714325,
      "learning_rate": 0.00027695523988128417,
      "loss": 1.7394,
      "step": 500
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.5836499929428101,
      "learning_rate": 0.0002762018308372934,
      "loss": 1.6096,
      "step": 505
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.557185709476471,
      "learning_rate": 0.0002754373627304842,
      "loss": 1.6317,
      "step": 510
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.5877421498298645,
      "learning_rate": 0.00027466190255117506,
      "loss": 1.6814,
      "step": 515
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.6311076283454895,
      "learning_rate": 0.00027387551825291907,
      "loss": 1.7358,
      "step": 520
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.57215815782547,
      "learning_rate": 0.0002730782787465499,
      "loss": 1.5929,
      "step": 525
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.6365725994110107,
      "learning_rate": 0.0002722702538941425,
      "loss": 1.6938,
      "step": 530
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.6177056431770325,
      "learning_rate": 0.0002714515145028916,
      "loss": 1.6506,
      "step": 535
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.6062068343162537,
      "learning_rate": 0.0002706221323189066,
      "loss": 1.7848,
      "step": 540
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.6754643321037292,
      "learning_rate": 0.0002697821800209244,
      "loss": 1.6818,
      "step": 545
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.5303813219070435,
      "learning_rate": 0.00026893173121394094,
      "loss": 1.6557,
      "step": 550
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.560896635055542,
      "learning_rate": 0.00026807086042276074,
      "loss": 1.5825,
      "step": 555
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.5587555766105652,
      "learning_rate": 0.00026719964308546647,
      "loss": 1.8095,
      "step": 560
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.5372602343559265,
      "learning_rate": 0.00026631815554680854,
      "loss": 1.5673,
      "step": 565
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.7331911325454712,
      "learning_rate": 0.0002654264750515146,
      "loss": 1.6092,
      "step": 570
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.5914915800094604,
      "learning_rate": 0.00026452467973752097,
      "loss": 1.6226,
      "step": 575
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.6332432627677917,
      "learning_rate": 0.0002636128486291251,
      "loss": 1.687,
      "step": 580
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.5786017179489136,
      "learning_rate": 0.00026269106163006093,
      "loss": 1.7443,
      "step": 585
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.6086643934249878,
      "learning_rate": 0.0002617593995164967,
      "loss": 1.6546,
      "step": 590
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.6353456377983093,
      "learning_rate": 0.00026081794392995676,
      "loss": 1.6635,
      "step": 595
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.6383431553840637,
      "learning_rate": 0.00025986677737016713,
      "loss": 1.706,
      "step": 600
    },
    {
      "epoch": 0.96,
      "eval_loss": 1.6813957691192627,
      "eval_runtime": 53.307,
      "eval_samples_per_second": 9.079,
      "eval_steps_per_second": 2.27,
      "step": 600
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.6295060515403748,
      "learning_rate": 0.0002589059831878262,
      "loss": 1.6353,
      "step": 605
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.6981686353683472,
      "learning_rate": 0.00025793564557730065,
      "loss": 1.6768,
      "step": 610
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.6336885094642639,
      "learning_rate": 0.0002569558495692474,
      "loss": 1.6594,
      "step": 615
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.6425840258598328,
      "learning_rate": 0.00025596668102316266,
      "loss": 1.7709,
      "step": 620
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.5762469172477722,
      "learning_rate": 0.0002549682266198577,
      "loss": 1.599,
      "step": 625
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.6281951069831848,
      "learning_rate": 0.00025396057385386326,
      "loss": 1.6798,
      "step": 630
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.6240226030349731,
      "learning_rate": 0.0002529438110257623,
      "loss": 1.491,
      "step": 635
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.7243877053260803,
      "learning_rate": 0.00025191802723445233,
      "loss": 1.6763,
      "step": 640
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.7280773520469666,
      "learning_rate": 0.0002508833123693376,
      "loss": 1.5093,
      "step": 645
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.6628164649009705,
      "learning_rate": 0.0002498397571024521,
      "loss": 1.5736,
      "step": 650
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.8163209557533264,
      "learning_rate": 0.00024878745288051384,
      "loss": 1.6704,
      "step": 655
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.7377064228057861,
      "learning_rate": 0.0002477264919169117,
      "loss": 1.6146,
      "step": 660
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.7334110140800476,
      "learning_rate": 0.0002466569671836246,
      "loss": 1.5641,
      "step": 665
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.708491325378418,
      "learning_rate": 0.000245578972403074,
      "loss": 1.6447,
      "step": 670
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.688526451587677,
      "learning_rate": 0.0002444926020399118,
      "loss": 1.4617,
      "step": 675
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.6798132658004761,
      "learning_rate": 0.00024339795129274178,
      "loss": 1.4603,
      "step": 680
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.7954903244972229,
      "learning_rate": 0.0002422951160857775,
      "loss": 1.6878,
      "step": 685
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.7106612920761108,
      "learning_rate": 0.00024118419306043653,
      "loss": 1.5761,
      "step": 690
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.6339300870895386,
      "learning_rate": 0.00024006527956687171,
      "loss": 1.6388,
      "step": 695
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.6403482556343079,
      "learning_rate": 0.00023893847365544043,
      "loss": 1.6328,
      "step": 700
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.6963227391242981,
      "learning_rate": 0.0002378038740681124,
      "loss": 1.5307,
      "step": 705
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.7481818795204163,
      "learning_rate": 0.00023666158022981696,
      "loss": 1.544,
      "step": 710
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.6810987591743469,
      "learning_rate": 0.00023551169223973037,
      "loss": 1.4045,
      "step": 715
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.7195698022842407,
      "learning_rate": 0.00023435431086250424,
      "loss": 1.5408,
      "step": 720
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.8156709671020508,
      "learning_rate": 0.0002331895375194355,
      "loss": 1.5569,
      "step": 725
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.6353544592857361,
      "learning_rate": 0.00023201747427957882,
      "loss": 1.5852,
      "step": 730
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.7116926312446594,
      "learning_rate": 0.0002308382238508023,
      "loss": 1.5359,
      "step": 735
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.7053561806678772,
      "learning_rate": 0.0002296518895707871,
      "loss": 1.6377,
      "step": 740
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.8180266618728638,
      "learning_rate": 0.0002284585753979723,
      "loss": 1.5982,
      "step": 745
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.6952159404754639,
      "learning_rate": 0.00022725838590244472,
      "loss": 1.5485,
      "step": 750
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.8135376572608948,
      "learning_rate": 0.00022605142625677525,
      "loss": 1.6058,
      "step": 755
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.6240015625953674,
      "learning_rate": 0.00022483780222680314,
      "loss": 1.6065,
      "step": 760
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.8067346215248108,
      "learning_rate": 0.00022361762016236738,
      "loss": 1.5782,
      "step": 765
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.7767935991287231,
      "learning_rate": 0.0002223909869879872,
      "loss": 1.4984,
      "step": 770
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.7751308083534241,
      "learning_rate": 0.00022115801019349256,
      "loss": 1.6065,
      "step": 775
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.6872211694717407,
      "learning_rate": 0.00021991879782460468,
      "loss": 1.6571,
      "step": 780
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.6883332133293152,
      "learning_rate": 0.00021867345847346783,
      "loss": 1.587,
      "step": 785
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.6802603006362915,
      "learning_rate": 0.00021742210126913364,
      "loss": 1.527,
      "step": 790
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.8581397533416748,
      "learning_rate": 0.0002161648358679981,
      "loss": 1.5975,
      "step": 795
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.779639482498169,
      "learning_rate": 0.00021490177244419225,
      "loss": 1.5361,
      "step": 800
    },
    {
      "epoch": 1.28,
      "eval_loss": 1.6755969524383545,
      "eval_runtime": 54.1533,
      "eval_samples_per_second": 8.938,
      "eval_steps_per_second": 2.234,
      "step": 800
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.7144075036048889,
      "learning_rate": 0.00021363302167992773,
      "loss": 1.6061,
      "step": 805
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.7203413844108582,
      "learning_rate": 0.00021235869475579763,
      "loss": 1.5182,
      "step": 810
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.7165353298187256,
      "learning_rate": 0.00021107890334103372,
      "loss": 1.4595,
      "step": 815
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.7226712703704834,
      "learning_rate": 0.0002097937595837211,
      "loss": 1.5236,
      "step": 820
    },
    {
      "epoch": 1.32,
      "grad_norm": 0.6462448239326477,
      "learning_rate": 0.00020850337610097027,
      "loss": 1.6672,
      "step": 825
    },
    {
      "epoch": 1.32,
      "grad_norm": 0.8719556331634521,
      "learning_rate": 0.00020720786596904893,
      "loss": 1.5598,
      "step": 830
    },
    {
      "epoch": 1.33,
      "grad_norm": 0.8658766150474548,
      "learning_rate": 0.0002059073427134727,
      "loss": 1.6314,
      "step": 835
    },
    {
      "epoch": 1.34,
      "grad_norm": 0.6820198893547058,
      "learning_rate": 0.00020460192029905726,
      "loss": 1.5311,
      "step": 840
    },
    {
      "epoch": 1.35,
      "grad_norm": 0.6817758083343506,
      "learning_rate": 0.00020329171311993135,
      "loss": 1.4233,
      "step": 845
    },
    {
      "epoch": 1.36,
      "grad_norm": 0.7133145928382874,
      "learning_rate": 0.00020197683598951248,
      "loss": 1.5895,
      "step": 850
    },
    {
      "epoch": 1.36,
      "grad_norm": 0.684045135974884,
      "learning_rate": 0.00020065740413044588,
      "loss": 1.7072,
      "step": 855
    },
    {
      "epoch": 1.37,
      "grad_norm": 0.8078965544700623,
      "learning_rate": 0.0001993335331645076,
      "loss": 1.6305,
      "step": 860
    },
    {
      "epoch": 1.38,
      "grad_norm": 0.7130747437477112,
      "learning_rate": 0.00019800533910247243,
      "loss": 1.5375,
      "step": 865
    },
    {
      "epoch": 1.39,
      "grad_norm": 0.8379193544387817,
      "learning_rate": 0.00019667293833394793,
      "loss": 1.5368,
      "step": 870
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.8084167242050171,
      "learning_rate": 0.00019533644761717518,
      "loss": 1.729,
      "step": 875
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.8069108724594116,
      "learning_rate": 0.00019399598406879748,
      "loss": 1.5978,
      "step": 880
    },
    {
      "epoch": 1.41,
      "grad_norm": 0.8795610666275024,
      "learning_rate": 0.0001926516651535971,
      "loss": 1.7128,
      "step": 885
    },
    {
      "epoch": 1.42,
      "grad_norm": 0.7351583242416382,
      "learning_rate": 0.00019130360867420203,
      "loss": 1.6878,
      "step": 890
    },
    {
      "epoch": 1.43,
      "grad_norm": 0.747438907623291,
      "learning_rate": 0.00018995193276076294,
      "loss": 1.4505,
      "step": 895
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.7769878506660461,
      "learning_rate": 0.00018859675586060134,
      "loss": 1.4913,
      "step": 900
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.7609659433364868,
      "learning_rate": 0.0001872381967278302,
      "loss": 1.6376,
      "step": 905
    },
    {
      "epoch": 1.45,
      "grad_norm": 0.7220240831375122,
      "learning_rate": 0.00018587637441294734,
      "loss": 1.5796,
      "step": 910
    },
    {
      "epoch": 1.46,
      "grad_norm": 0.7993665337562561,
      "learning_rate": 0.00018451140825240312,
      "loss": 1.5595,
      "step": 915
    },
    {
      "epoch": 1.47,
      "grad_norm": 0.7627410292625427,
      "learning_rate": 0.00018314341785814317,
      "loss": 1.618,
      "step": 920
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.7644938826560974,
      "learning_rate": 0.00018177252310712646,
      "loss": 1.5943,
      "step": 925
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.6797645688056946,
      "learning_rate": 0.00018039884413082065,
      "loss": 1.5168,
      "step": 930
    },
    {
      "epoch": 1.49,
      "grad_norm": 0.7855983376502991,
      "learning_rate": 0.00017902250130467524,
      "loss": 1.3884,
      "step": 935
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.7340678572654724,
      "learning_rate": 0.00017764361523757246,
      "loss": 1.6262,
      "step": 940
    },
    {
      "epoch": 1.51,
      "grad_norm": 0.9311641454696655,
      "learning_rate": 0.00017626230676125887,
      "loss": 1.5546,
      "step": 945
    },
    {
      "epoch": 1.52,
      "grad_norm": 0.7892611622810364,
      "learning_rate": 0.0001748786969197567,
      "loss": 1.541,
      "step": 950
    },
    {
      "epoch": 1.52,
      "grad_norm": 0.737107515335083,
      "learning_rate": 0.00017349290695875668,
      "loss": 1.543,
      "step": 955
    },
    {
      "epoch": 1.53,
      "grad_norm": 0.7268142104148865,
      "learning_rate": 0.00017210505831499336,
      "loss": 1.3838,
      "step": 960
    },
    {
      "epoch": 1.54,
      "grad_norm": 0.7059959769248962,
      "learning_rate": 0.00017071527260560356,
      "loss": 1.4105,
      "step": 965
    },
    {
      "epoch": 1.55,
      "grad_norm": 0.7432154417037964,
      "learning_rate": 0.00016932367161746922,
      "loss": 1.5525,
      "step": 970
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.7598636746406555,
      "learning_rate": 0.00016793037729654503,
      "loss": 1.4841,
      "step": 975
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.7130389213562012,
      "learning_rate": 0.0001665355117371723,
      "loss": 1.6633,
      "step": 980
    },
    {
      "epoch": 1.57,
      "grad_norm": 0.7120307087898254,
      "learning_rate": 0.00016513919717138015,
      "loss": 1.661,
      "step": 985
    },
    {
      "epoch": 1.58,
      "grad_norm": 0.7838018536567688,
      "learning_rate": 0.00016374155595817383,
      "loss": 1.4154,
      "step": 990
    },
    {
      "epoch": 1.59,
      "grad_norm": 0.8129784464836121,
      "learning_rate": 0.00016234271057281288,
      "loss": 1.4759,
      "step": 995
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.7084518671035767,
      "learning_rate": 0.0001609427835960783,
      "loss": 1.5625,
      "step": 1000
    },
    {
      "epoch": 1.6,
      "eval_loss": 1.656510591506958,
      "eval_runtime": 53.8113,
      "eval_samples_per_second": 8.994,
      "eval_steps_per_second": 2.249,
      "step": 1000
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.7194417715072632,
      "learning_rate": 0.00015954189770353086,
      "loss": 1.6973,
      "step": 1005
    },
    {
      "epoch": 1.61,
      "grad_norm": 0.8406971096992493,
      "learning_rate": 0.00015814017565476122,
      "loss": 1.5086,
      "step": 1010
    },
    {
      "epoch": 1.62,
      "grad_norm": 0.7156040072441101,
      "learning_rate": 0.0001567377402826323,
      "loss": 1.6073,
      "step": 1015
    },
    {
      "epoch": 1.63,
      "grad_norm": 0.7198348045349121,
      "learning_rate": 0.00015533471448251555,
      "loss": 1.5575,
      "step": 1020
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.7150219678878784,
      "learning_rate": 0.0001539312212015217,
      "loss": 1.5192,
      "step": 1025
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.6892033815383911,
      "learning_rate": 0.0001525273834277266,
      "loss": 1.4947,
      "step": 1030
    },
    {
      "epoch": 1.65,
      "grad_norm": 0.7327867746353149,
      "learning_rate": 0.00015112332417939418,
      "loss": 1.5601,
      "step": 1035
    },
    {
      "epoch": 1.66,
      "grad_norm": 0.6658421754837036,
      "learning_rate": 0.00014971916649419615,
      "loss": 1.6199,
      "step": 1040
    },
    {
      "epoch": 1.67,
      "grad_norm": 0.8288849592208862,
      "learning_rate": 0.00014831503341843018,
      "loss": 1.6261,
      "step": 1045
    },
    {
      "epoch": 1.68,
      "grad_norm": 0.7870001196861267,
      "learning_rate": 0.00014691104799623744,
      "loss": 1.5233,
      "step": 1050
    },
    {
      "epoch": 1.68,
      "grad_norm": 0.7046535015106201,
      "learning_rate": 0.00014550733325882024,
      "loss": 1.6226,
      "step": 1055
    },
    {
      "epoch": 1.69,
      "grad_norm": 0.7216707468032837,
      "learning_rate": 0.0001441040122136608,
      "loss": 1.4483,
      "step": 1060
    },
    {
      "epoch": 1.7,
      "grad_norm": 0.7812987565994263,
      "learning_rate": 0.00014270120783374204,
      "loss": 1.512,
      "step": 1065
    },
    {
      "epoch": 1.71,
      "grad_norm": 0.7140246629714966,
      "learning_rate": 0.0001412990430467715,
      "loss": 1.5703,
      "step": 1070
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.8250936269760132,
      "learning_rate": 0.00013989764072440945,
      "loss": 1.5059,
      "step": 1075
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.7511314749717712,
      "learning_rate": 0.000138497123671501,
      "loss": 1.6571,
      "step": 1080
    },
    {
      "epoch": 1.73,
      "grad_norm": 0.7608028054237366,
      "learning_rate": 0.00013709761461531552,
      "loss": 1.4981,
      "step": 1085
    },
    {
      "epoch": 1.74,
      "grad_norm": 0.8892323970794678,
      "learning_rate": 0.00013569923619479138,
      "loss": 1.5202,
      "step": 1090
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.7229086756706238,
      "learning_rate": 0.00013430211094978955,
      "loss": 1.5342,
      "step": 1095
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.814064621925354,
      "learning_rate": 0.00013290636131035513,
      "loss": 1.4798,
      "step": 1100
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.5819219350814819,
      "learning_rate": 0.0001315121095859891,
      "loss": 1.3957,
      "step": 1105
    },
    {
      "epoch": 1.77,
      "grad_norm": 0.7352663278579712,
      "learning_rate": 0.00013011947795492994,
      "loss": 1.5094,
      "step": 1110
    },
    {
      "epoch": 1.78,
      "grad_norm": 0.7600275874137878,
      "learning_rate": 0.0001287285884534478,
      "loss": 1.4571,
      "step": 1115
    },
    {
      "epoch": 1.79,
      "grad_norm": 0.795772910118103,
      "learning_rate": 0.00012733956296514954,
      "loss": 1.5675,
      "step": 1120
    },
    {
      "epoch": 1.79,
      "grad_norm": 0.8060831427574158,
      "learning_rate": 0.0001259525232102991,
      "loss": 1.4759,
      "step": 1125
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.8213143348693848,
      "learning_rate": 0.0001245675907351503,
      "loss": 1.6123,
      "step": 1130
    },
    {
      "epoch": 1.81,
      "grad_norm": 0.7344096302986145,
      "learning_rate": 0.00012318488690129633,
      "loss": 1.5137,
      "step": 1135
    },
    {
      "epoch": 1.82,
      "grad_norm": 0.6787902116775513,
      "learning_rate": 0.00012180453287503444,
      "loss": 1.6397,
      "step": 1140
    },
    {
      "epoch": 1.83,
      "grad_norm": 0.7361781001091003,
      "learning_rate": 0.00012042664961674843,
      "loss": 1.453,
      "step": 1145
    },
    {
      "epoch": 1.83,
      "grad_norm": 0.7698022127151489,
      "learning_rate": 0.00011905135787030865,
      "loss": 1.4961,
      "step": 1150
    },
    {
      "epoch": 1.84,
      "grad_norm": 0.7049584984779358,
      "learning_rate": 0.00011767877815249155,
      "loss": 1.5562,
      "step": 1155
    },
    {
      "epoch": 1.85,
      "grad_norm": 0.7255071401596069,
      "learning_rate": 0.00011630903074241826,
      "loss": 1.6307,
      "step": 1160
    },
    {
      "epoch": 1.86,
      "grad_norm": 0.7467817664146423,
      "learning_rate": 0.0001149422356710152,
      "loss": 1.4051,
      "step": 1165
    },
    {
      "epoch": 1.87,
      "grad_norm": 0.6346266269683838,
      "learning_rate": 0.00011357851271049528,
      "loss": 1.6404,
      "step": 1170
    },
    {
      "epoch": 1.87,
      "grad_norm": 0.6892123818397522,
      "learning_rate": 0.00011221798136386238,
      "loss": 1.462,
      "step": 1175
    },
    {
      "epoch": 1.88,
      "grad_norm": 0.7803078889846802,
      "learning_rate": 0.00011086076085443949,
      "loss": 1.6246,
      "step": 1180
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.6938590407371521,
      "learning_rate": 0.00010950697011542082,
      "loss": 1.3757,
      "step": 1185
    },
    {
      "epoch": 1.9,
      "grad_norm": 0.6958265900611877,
      "learning_rate": 0.00010815672777945013,
      "loss": 1.3996,
      "step": 1190
    },
    {
      "epoch": 1.91,
      "grad_norm": 0.9197216033935547,
      "learning_rate": 0.00010681015216822454,
      "loss": 1.5832,
      "step": 1195
    },
    {
      "epoch": 1.91,
      "grad_norm": 0.6959114670753479,
      "learning_rate": 0.00010546736128212632,
      "loss": 1.606,
      "step": 1200
    },
    {
      "epoch": 1.91,
      "eval_loss": 1.6396043300628662,
      "eval_runtime": 53.5893,
      "eval_samples_per_second": 9.032,
      "eval_steps_per_second": 2.258,
      "step": 1200
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.8191283345222473,
      "learning_rate": 0.00010412847278988228,
      "loss": 1.5161,
      "step": 1205
    },
    {
      "epoch": 1.93,
      "grad_norm": 0.9132874011993408,
      "learning_rate": 0.00010279360401825263,
      "loss": 1.4927,
      "step": 1210
    },
    {
      "epoch": 1.94,
      "grad_norm": 0.7479792833328247,
      "learning_rate": 0.00010146287194174953,
      "loss": 1.5343,
      "step": 1215
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.7714102864265442,
      "learning_rate": 0.00010013639317238674,
      "loss": 1.6156,
      "step": 1220
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.8507571220397949,
      "learning_rate": 9.88142839494607e-05,
      "loss": 1.6551,
      "step": 1225
    },
    {
      "epoch": 1.96,
      "grad_norm": 0.8553720116615295,
      "learning_rate": 9.749666012936487e-05,
      "loss": 1.6274,
      "step": 1230
    },
    {
      "epoch": 1.97,
      "grad_norm": 0.6895230412483215,
      "learning_rate": 9.618363717543669e-05,
      "loss": 1.5211,
      "step": 1235
    },
    {
      "epoch": 1.98,
      "grad_norm": 0.6788089871406555,
      "learning_rate": 9.487533014784007e-05,
      "loss": 1.4057,
      "step": 1240
    },
    {
      "epoch": 1.99,
      "grad_norm": 0.851324737071991,
      "learning_rate": 9.357185369348225e-05,
      "loss": 1.3783,
      "step": 1245
    },
    {
      "epoch": 1.99,
      "grad_norm": 0.7894309163093567,
      "learning_rate": 9.227332203596764e-05,
      "loss": 1.5338,
      "step": 1250
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.6584810018539429,
      "learning_rate": 9.097984896558807e-05,
      "loss": 1.4221,
      "step": 1255
    },
    {
      "epoch": 2.01,
      "grad_norm": 0.6491633653640747,
      "learning_rate": 8.96915478293517e-05,
      "loss": 1.4814,
      "step": 1260
    },
    {
      "epoch": 2.02,
      "grad_norm": 0.8882700800895691,
      "learning_rate": 8.840853152105004e-05,
      "loss": 1.4217,
      "step": 1265
    },
    {
      "epoch": 2.03,
      "grad_norm": 0.8957266807556152,
      "learning_rate": 8.713091247136558e-05,
      "loss": 1.4734,
      "step": 1270
    },
    {
      "epoch": 2.03,
      "grad_norm": 0.8561681509017944,
      "learning_rate": 8.585880263801893e-05,
      "loss": 1.4981,
      "step": 1275
    },
    {
      "epoch": 2.04,
      "grad_norm": 0.8122314810752869,
      "learning_rate": 8.459231349595824e-05,
      "loss": 1.3924,
      "step": 1280
    },
    {
      "epoch": 2.05,
      "grad_norm": 0.7481365203857422,
      "learning_rate": 8.333155602759088e-05,
      "loss": 1.5153,
      "step": 1285
    },
    {
      "epoch": 2.06,
      "grad_norm": 0.9371137619018555,
      "learning_rate": 8.207664071305751e-05,
      "loss": 1.4867,
      "step": 1290
    },
    {
      "epoch": 2.07,
      "grad_norm": 0.8563275337219238,
      "learning_rate": 8.082767752055129e-05,
      "loss": 1.4395,
      "step": 1295
    },
    {
      "epoch": 2.07,
      "grad_norm": 0.7887305617332458,
      "learning_rate": 7.958477589668092e-05,
      "loss": 1.4446,
      "step": 1300
    },
    {
      "epoch": 2.08,
      "grad_norm": 0.852190375328064,
      "learning_rate": 7.834804475688008e-05,
      "loss": 1.3596,
      "step": 1305
    },
    {
      "epoch": 2.09,
      "grad_norm": 0.7616653442382812,
      "learning_rate": 7.711759247586301e-05,
      "loss": 1.452,
      "step": 1310
    },
    {
      "epoch": 2.1,
      "grad_norm": 0.6950060725212097,
      "learning_rate": 7.589352687812797e-05,
      "loss": 1.3658,
      "step": 1315
    },
    {
      "epoch": 2.11,
      "grad_norm": 1.0462900400161743,
      "learning_rate": 7.467595522850805e-05,
      "loss": 1.4803,
      "step": 1320
    },
    {
      "epoch": 2.11,
      "grad_norm": 0.9355428218841553,
      "learning_rate": 7.346498422277214e-05,
      "loss": 1.5474,
      "step": 1325
    },
    {
      "epoch": 2.12,
      "grad_norm": 0.8697147369384766,
      "learning_rate": 7.22607199782747e-05,
      "loss": 1.4883,
      "step": 1330
    },
    {
      "epoch": 2.13,
      "grad_norm": 0.9624801874160767,
      "learning_rate": 7.106326802465692e-05,
      "loss": 1.3623,
      "step": 1335
    },
    {
      "epoch": 2.14,
      "grad_norm": 0.884113073348999,
      "learning_rate": 6.987273329459915e-05,
      "loss": 1.4287,
      "step": 1340
    },
    {
      "epoch": 2.15,
      "grad_norm": 0.8870747685432434,
      "learning_rate": 6.868922011462574e-05,
      "loss": 1.4426,
      "step": 1345
    },
    {
      "epoch": 2.15,
      "grad_norm": 0.9415817260742188,
      "learning_rate": 6.751283219596263e-05,
      "loss": 1.4421,
      "step": 1350
    },
    {
      "epoch": 2.16,
      "grad_norm": 0.8890645503997803,
      "learning_rate": 6.634367262544956e-05,
      "loss": 1.4436,
      "step": 1355
    },
    {
      "epoch": 2.17,
      "grad_norm": 0.8797799944877625,
      "learning_rate": 6.518184385650595e-05,
      "loss": 1.3414,
      "step": 1360
    },
    {
      "epoch": 2.18,
      "grad_norm": 0.8096060156822205,
      "learning_rate": 6.402744770015371e-05,
      "loss": 1.3585,
      "step": 1365
    },
    {
      "epoch": 2.19,
      "grad_norm": 0.9265967607498169,
      "learning_rate": 6.28805853160948e-05,
      "loss": 1.4575,
      "step": 1370
    },
    {
      "epoch": 2.19,
      "grad_norm": 0.7921759486198425,
      "learning_rate": 6.174135720384715e-05,
      "loss": 1.5079,
      "step": 1375
    },
    {
      "epoch": 2.2,
      "grad_norm": 0.9000433087348938,
      "learning_rate": 6.0609863193937554e-05,
      "loss": 1.4363,
      "step": 1380
    },
    {
      "epoch": 2.21,
      "grad_norm": 0.9442440271377563,
      "learning_rate": 5.9486202439153636e-05,
      "loss": 1.3952,
      "step": 1385
    },
    {
      "epoch": 2.22,
      "grad_norm": 0.9569089412689209,
      "learning_rate": 5.837047340585508e-05,
      "loss": 1.3183,
      "step": 1390
    },
    {
      "epoch": 2.23,
      "grad_norm": 0.7584913969039917,
      "learning_rate": 5.72627738653452e-05,
      "loss": 1.3542,
      "step": 1395
    },
    {
      "epoch": 2.23,
      "grad_norm": 0.9052013754844666,
      "learning_rate": 5.616320088530287e-05,
      "loss": 1.4462,
      "step": 1400
    },
    {
      "epoch": 2.23,
      "eval_loss": 1.6514110565185547,
      "eval_runtime": 53.4421,
      "eval_samples_per_second": 9.057,
      "eval_steps_per_second": 2.264,
      "step": 1400
    },
    {
      "epoch": 2.24,
      "grad_norm": 0.9608349800109863,
      "learning_rate": 5.50718508212769e-05,
      "loss": 1.4392,
      "step": 1405
    },
    {
      "epoch": 2.25,
      "grad_norm": 1.0080007314682007,
      "learning_rate": 5.398881930824208e-05,
      "loss": 1.4975,
      "step": 1410
    },
    {
      "epoch": 2.26,
      "grad_norm": 0.9581565260887146,
      "learning_rate": 5.2914201252218776e-05,
      "loss": 1.3116,
      "step": 1415
    },
    {
      "epoch": 2.27,
      "grad_norm": 0.9258535504341125,
      "learning_rate": 5.184809082195656e-05,
      "loss": 1.5025,
      "step": 1420
    },
    {
      "epoch": 2.27,
      "grad_norm": 0.815497100353241,
      "learning_rate": 5.079058144068175e-05,
      "loss": 1.4159,
      "step": 1425
    },
    {
      "epoch": 2.28,
      "grad_norm": 0.867760181427002,
      "learning_rate": 4.974176577791123e-05,
      "loss": 1.4266,
      "step": 1430
    },
    {
      "epoch": 2.29,
      "grad_norm": 0.8949117660522461,
      "learning_rate": 4.870173574133139e-05,
      "loss": 1.4693,
      "step": 1435
    },
    {
      "epoch": 2.3,
      "grad_norm": 0.9198044538497925,
      "learning_rate": 4.767058246874453e-05,
      "loss": 1.4362,
      "step": 1440
    },
    {
      "epoch": 2.31,
      "grad_norm": 0.928459882736206,
      "learning_rate": 4.664839632008233e-05,
      "loss": 1.4778,
      "step": 1445
    },
    {
      "epoch": 2.31,
      "grad_norm": 0.789795458316803,
      "learning_rate": 4.563526686948776e-05,
      "loss": 1.3265,
      "step": 1450
    },
    {
      "epoch": 2.32,
      "grad_norm": 0.9109902381896973,
      "learning_rate": 4.4631282897465504e-05,
      "loss": 1.4303,
      "step": 1455
    },
    {
      "epoch": 2.33,
      "grad_norm": 0.9469901919364929,
      "learning_rate": 4.363653238310223e-05,
      "loss": 1.3995,
      "step": 1460
    },
    {
      "epoch": 2.34,
      "grad_norm": 0.8550880551338196,
      "learning_rate": 4.2651102496356985e-05,
      "loss": 1.3542,
      "step": 1465
    },
    {
      "epoch": 2.35,
      "grad_norm": 0.8234253525733948,
      "learning_rate": 4.1675079590422553e-05,
      "loss": 1.3771,
      "step": 1470
    },
    {
      "epoch": 2.35,
      "grad_norm": 0.9473711848258972,
      "learning_rate": 4.070854919415811e-05,
      "loss": 1.3968,
      "step": 1475
    },
    {
      "epoch": 2.36,
      "grad_norm": 1.0804728269577026,
      "learning_rate": 3.9751596004594696e-05,
      "loss": 1.4282,
      "step": 1480
    },
    {
      "epoch": 2.37,
      "grad_norm": 0.9122664332389832,
      "learning_rate": 3.88043038795128e-05,
      "loss": 1.4415,
      "step": 1485
    },
    {
      "epoch": 2.38,
      "grad_norm": 0.851219892501831,
      "learning_rate": 3.786675583009419e-05,
      "loss": 1.3758,
      "step": 1490
    },
    {
      "epoch": 2.39,
      "grad_norm": 0.8568178415298462,
      "learning_rate": 3.6939034013647516e-05,
      "loss": 1.3924,
      "step": 1495
    },
    {
      "epoch": 2.39,
      "grad_norm": 0.8509604334831238,
      "learning_rate": 3.6021219726409064e-05,
      "loss": 1.3347,
      "step": 1500
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.9844736456871033,
      "learning_rate": 3.51133933964184e-05,
      "loss": 1.3425,
      "step": 1505
    },
    {
      "epoch": 2.41,
      "grad_norm": 0.8552778959274292,
      "learning_rate": 3.421563457647091e-05,
      "loss": 1.4085,
      "step": 1510
    },
    {
      "epoch": 2.42,
      "grad_norm": 0.8389209508895874,
      "learning_rate": 3.3328021937146175e-05,
      "loss": 1.4795,
      "step": 1515
    },
    {
      "epoch": 2.43,
      "grad_norm": 0.7627727389335632,
      "learning_rate": 3.2450633259914324e-05,
      "loss": 1.3852,
      "step": 1520
    },
    {
      "epoch": 2.43,
      "grad_norm": 0.7493878602981567,
      "learning_rate": 3.158354543032004e-05,
      "loss": 1.4683,
      "step": 1525
    },
    {
      "epoch": 2.44,
      "grad_norm": 0.8921322226524353,
      "learning_rate": 3.0726834431244916e-05,
      "loss": 1.3623,
      "step": 1530
    },
    {
      "epoch": 2.45,
      "grad_norm": 0.934528648853302,
      "learning_rate": 2.9880575336249347e-05,
      "loss": 1.4292,
      "step": 1535
    },
    {
      "epoch": 2.46,
      "grad_norm": 0.9410387873649597,
      "learning_rate": 2.904484230299341e-05,
      "loss": 1.3741,
      "step": 1540
    },
    {
      "epoch": 2.47,
      "grad_norm": 0.8113580942153931,
      "learning_rate": 2.821970856673898e-05,
      "loss": 1.3496,
      "step": 1545
    },
    {
      "epoch": 2.47,
      "grad_norm": 0.822819709777832,
      "learning_rate": 2.7405246433931603e-05,
      "loss": 1.387,
      "step": 1550
    },
    {
      "epoch": 2.48,
      "grad_norm": 0.7705480456352234,
      "learning_rate": 2.6601527275864743e-05,
      "loss": 1.4588,
      "step": 1555
    },
    {
      "epoch": 2.49,
      "grad_norm": 0.8231070637702942,
      "learning_rate": 2.5808621522425134e-05,
      "loss": 1.5636,
      "step": 1560
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.8312299251556396,
      "learning_rate": 2.5026598655921182e-05,
      "loss": 1.4001,
      "step": 1565
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.7779568433761597,
      "learning_rate": 2.4255527204994268e-05,
      "loss": 1.4055,
      "step": 1570
    },
    {
      "epoch": 2.51,
      "grad_norm": 0.9976507425308228,
      "learning_rate": 2.3495474738613595e-05,
      "loss": 1.4549,
      "step": 1575
    },
    {
      "epoch": 2.52,
      "grad_norm": 0.9732272624969482,
      "learning_rate": 2.274650786015491e-05,
      "loss": 1.3406,
      "step": 1580
    },
    {
      "epoch": 2.53,
      "grad_norm": 0.8873571753501892,
      "learning_rate": 2.200869220156439e-05,
      "loss": 1.3103,
      "step": 1585
    },
    {
      "epoch": 2.54,
      "grad_norm": 0.940963625907898,
      "learning_rate": 2.1282092417606998e-05,
      "loss": 1.3055,
      "step": 1590
    },
    {
      "epoch": 2.54,
      "grad_norm": 0.9407901167869568,
      "learning_rate": 2.0566772180201002e-05,
      "loss": 1.3727,
      "step": 1595
    },
    {
      "epoch": 2.55,
      "grad_norm": 0.7993718981742859,
      "learning_rate": 1.986279417283823e-05,
      "loss": 1.3236,
      "step": 1600
    },
    {
      "epoch": 2.55,
      "eval_loss": 1.6478137969970703,
      "eval_runtime": 53.9373,
      "eval_samples_per_second": 8.973,
      "eval_steps_per_second": 2.243,
      "step": 1600
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.9243110418319702,
      "learning_rate": 1.9170220085091354e-05,
      "loss": 1.3978,
      "step": 1605
    },
    {
      "epoch": 2.57,
      "grad_norm": 0.9336215257644653,
      "learning_rate": 1.8489110607207796e-05,
      "loss": 1.3513,
      "step": 1610
    },
    {
      "epoch": 2.58,
      "grad_norm": 0.9775585532188416,
      "learning_rate": 1.7819525424791637e-05,
      "loss": 1.3751,
      "step": 1615
    },
    {
      "epoch": 2.58,
      "grad_norm": 0.8599871397018433,
      "learning_rate": 1.7161523213573115e-05,
      "loss": 1.4563,
      "step": 1620
    },
    {
      "epoch": 2.59,
      "grad_norm": 0.8650587797164917,
      "learning_rate": 1.6515161634267216e-05,
      "loss": 1.3313,
      "step": 1625
    },
    {
      "epoch": 2.6,
      "grad_norm": 0.8923302888870239,
      "learning_rate": 1.5880497327520546e-05,
      "loss": 1.4745,
      "step": 1630
    },
    {
      "epoch": 2.61,
      "grad_norm": 0.9115777611732483,
      "learning_rate": 1.5257585908948172e-05,
      "loss": 1.5308,
      "step": 1635
    },
    {
      "epoch": 2.62,
      "grad_norm": 0.8948424458503723,
      "learning_rate": 1.464648196425981e-05,
      "loss": 1.4087,
      "step": 1640
    },
    {
      "epoch": 2.62,
      "grad_norm": 0.9226679801940918,
      "learning_rate": 1.4047239044476594e-05,
      "loss": 1.3918,
      "step": 1645
    },
    {
      "epoch": 2.63,
      "grad_norm": 0.8747596144676208,
      "learning_rate": 1.345990966123846e-05,
      "loss": 1.3892,
      "step": 1650
    },
    {
      "epoch": 2.64,
      "grad_norm": 0.8658227324485779,
      "learning_rate": 1.288454528220238e-05,
      "loss": 1.4378,
      "step": 1655
    },
    {
      "epoch": 2.65,
      "grad_norm": 0.8440380096435547,
      "learning_rate": 1.2321196326532428e-05,
      "loss": 1.3919,
      "step": 1660
    },
    {
      "epoch": 2.66,
      "grad_norm": 0.8718552589416504,
      "learning_rate": 1.176991216048141e-05,
      "loss": 1.3446,
      "step": 1665
    },
    {
      "epoch": 2.66,
      "grad_norm": 1.080561876296997,
      "learning_rate": 1.123074109306501e-05,
      "loss": 1.5042,
      "step": 1670
    },
    {
      "epoch": 2.67,
      "grad_norm": 0.9557725191116333,
      "learning_rate": 1.070373037182839e-05,
      "loss": 1.4899,
      "step": 1675
    },
    {
      "epoch": 2.68,
      "grad_norm": 0.8951625823974609,
      "learning_rate": 1.0188926178705976e-05,
      "loss": 1.3954,
      "step": 1680
    },
    {
      "epoch": 2.69,
      "grad_norm": 0.8350385427474976,
      "learning_rate": 9.686373625974398e-06,
      "loss": 1.3416,
      "step": 1685
    },
    {
      "epoch": 2.7,
      "grad_norm": 0.7895828485488892,
      "learning_rate": 9.196116752299504e-06,
      "loss": 1.4138,
      "step": 1690
    },
    {
      "epoch": 2.7,
      "grad_norm": 0.8298912644386292,
      "learning_rate": 8.718198518877063e-06,
      "loss": 1.4092,
      "step": 1695
    },
    {
      "epoch": 2.71,
      "grad_norm": 0.7747901082038879,
      "learning_rate": 8.252660805668172e-06,
      "loss": 1.4483,
      "step": 1700
    },
    {
      "epoch": 2.72,
      "grad_norm": 0.9418133497238159,
      "learning_rate": 7.799544407729247e-06,
      "loss": 1.2875,
      "step": 1705
    },
    {
      "epoch": 2.73,
      "grad_norm": 0.8848152756690979,
      "learning_rate": 7.3588890316373e-06,
      "loss": 1.3768,
      "step": 1710
    },
    {
      "epoch": 2.74,
      "grad_norm": 0.8357783555984497,
      "learning_rate": 6.930733292010299e-06,
      "loss": 1.2686,
      "step": 1715
    },
    {
      "epoch": 2.74,
      "grad_norm": 0.8660513758659363,
      "learning_rate": 6.5151147081234314e-06,
      "loss": 1.5152,
      "step": 1720
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.9291744232177734,
      "learning_rate": 6.112069700621247e-06,
      "loss": 1.3963,
      "step": 1725
    },
    {
      "epoch": 2.76,
      "grad_norm": 0.9582973718643188,
      "learning_rate": 5.721633588326263e-06,
      "loss": 1.3672,
      "step": 1730
    },
    {
      "epoch": 2.77,
      "grad_norm": 0.8532952070236206,
      "learning_rate": 5.343840585143755e-06,
      "loss": 1.4313,
      "step": 1735
    },
    {
      "epoch": 2.78,
      "grad_norm": 0.8306805491447449,
      "learning_rate": 4.9787237970638046e-06,
      "loss": 1.348,
      "step": 1740
    },
    {
      "epoch": 2.78,
      "grad_norm": 0.9041390419006348,
      "learning_rate": 4.626315219260074e-06,
      "loss": 1.323,
      "step": 1745
    },
    {
      "epoch": 2.79,
      "grad_norm": 0.9139644503593445,
      "learning_rate": 4.2866457332861095e-06,
      "loss": 1.4489,
      "step": 1750
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.7889164090156555,
      "learning_rate": 3.959745104369227e-06,
      "loss": 1.2446,
      "step": 1755
    },
    {
      "epoch": 2.81,
      "grad_norm": 0.8802595734596252,
      "learning_rate": 3.645641978802183e-06,
      "loss": 1.3255,
      "step": 1760
    },
    {
      "epoch": 2.82,
      "grad_norm": 0.8217524290084839,
      "learning_rate": 3.3443638814328765e-06,
      "loss": 1.3215,
      "step": 1765
    },
    {
      "epoch": 2.82,
      "grad_norm": 1.0614471435546875,
      "learning_rate": 3.0559372132523354e-06,
      "loss": 1.5207,
      "step": 1770
    },
    {
      "epoch": 2.83,
      "grad_norm": 0.8429582715034485,
      "learning_rate": 2.7803872490812287e-06,
      "loss": 1.3383,
      "step": 1775
    },
    {
      "epoch": 2.84,
      "grad_norm": 0.8786703944206238,
      "learning_rate": 2.5177381353549907e-06,
      "loss": 1.3797,
      "step": 1780
    },
    {
      "epoch": 2.85,
      "grad_norm": 0.9570217132568359,
      "learning_rate": 2.268012888007947e-06,
      "loss": 1.3989,
      "step": 1785
    },
    {
      "epoch": 2.86,
      "grad_norm": 0.8841099739074707,
      "learning_rate": 2.0312333904563605e-06,
      "loss": 1.3689,
      "step": 1790
    },
    {
      "epoch": 2.86,
      "grad_norm": 0.9913931488990784,
      "learning_rate": 1.8074203916808472e-06,
      "loss": 1.4612,
      "step": 1795
    },
    {
      "epoch": 2.87,
      "grad_norm": 0.84140545129776,
      "learning_rate": 1.5965935044080003e-06,
      "loss": 1.4728,
      "step": 1800
    },
    {
      "epoch": 2.87,
      "eval_loss": 1.645330786705017,
      "eval_runtime": 53.8569,
      "eval_samples_per_second": 8.987,
      "eval_steps_per_second": 2.247,
      "step": 1800
    },
    {
      "epoch": 2.88,
      "grad_norm": 1.0088138580322266,
      "learning_rate": 1.3987712033919796e-06,
      "loss": 1.4539,
      "step": 1805
    },
    {
      "epoch": 2.89,
      "grad_norm": 0.853320837020874,
      "learning_rate": 1.2139708237953073e-06,
      "loss": 1.3314,
      "step": 1810
    },
    {
      "epoch": 2.9,
      "grad_norm": 0.9394313097000122,
      "learning_rate": 1.0422085596699658e-06,
      "loss": 1.4403,
      "step": 1815
    },
    {
      "epoch": 2.9,
      "grad_norm": 0.8725711107254028,
      "learning_rate": 8.834994625382341e-07,
      "loss": 1.4662,
      "step": 1820
    },
    {
      "epoch": 2.91,
      "grad_norm": 0.9195972084999084,
      "learning_rate": 7.378574400737414e-07,
      "loss": 1.3785,
      "step": 1825
    },
    {
      "epoch": 2.92,
      "grad_norm": 0.9402215480804443,
      "learning_rate": 6.052952548827428e-07,
      "loss": 1.4565,
      "step": 1830
    },
    {
      "epoch": 2.93,
      "grad_norm": 0.8402209281921387,
      "learning_rate": 4.858245233857305e-07,
      "loss": 1.371,
      "step": 1835
    },
    {
      "epoch": 2.94,
      "grad_norm": 0.7974345088005066,
      "learning_rate": 3.7945571479948147e-07,
      "loss": 1.3159,
      "step": 1840
    },
    {
      "epoch": 2.94,
      "grad_norm": 0.8706817626953125,
      "learning_rate": 2.8619815021964085e-07,
      "loss": 1.521,
      "step": 1845
    },
    {
      "epoch": 2.95,
      "grad_norm": 0.9706933498382568,
      "learning_rate": 2.0606000180390935e-07,
      "loss": 1.3561,
      "step": 1850
    },
    {
      "epoch": 2.96,
      "grad_norm": 0.9271072149276733,
      "learning_rate": 1.3904829205594882e-07,
      "loss": 1.4329,
      "step": 1855
    },
    {
      "epoch": 2.97,
      "grad_norm": 0.8780162930488586,
      "learning_rate": 8.516889320995813e-08,
      "loss": 1.3876,
      "step": 1860
    },
    {
      "epoch": 2.98,
      "grad_norm": 0.8870643377304077,
      "learning_rate": 4.442652671611813e-08,
      "loss": 1.4097,
      "step": 1865
    },
    {
      "epoch": 2.98,
      "grad_norm": 0.9191935062408447,
      "learning_rate": 1.6824762826822546e-08,
      "loss": 1.5358,
      "step": 1870
    },
    {
      "epoch": 2.99,
      "grad_norm": 0.856283962726593,
      "learning_rate": 2.3660202838615695e-09,
      "loss": 1.407,
      "step": 1875
    },
    {
      "epoch": 3.0,
      "step": 1878,
      "total_flos": 2.5822124463489024e+17,
      "train_loss": 1.5555958059776072,
      "train_runtime": 10085.6682,
      "train_samples_per_second": 2.982,
      "train_steps_per_second": 0.186
    }
  ],
  "logging_steps": 5,
  "max_steps": 1878,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 400,
  "total_flos": 2.5822124463489024e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}