{ "best_metric": 1.613356113433838, "best_model_checkpoint": "output/checkpoint-6000", "epoch": 2.4038461538461537, "eval_steps": 2000, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030048076923076924, "grad_norm": 2.3787589073181152, "learning_rate": 2e-05, "loss": 3.6173, "step": 100 }, { "epoch": 0.06009615384615385, "grad_norm": 1.996222734451294, "learning_rate": 4e-05, "loss": 1.7498, "step": 200 }, { "epoch": 0.09014423076923077, "grad_norm": 1.5797070264816284, "learning_rate": 6e-05, "loss": 1.7224, "step": 300 }, { "epoch": 0.1201923076923077, "grad_norm": 1.4705523252487183, "learning_rate": 8e-05, "loss": 1.6887, "step": 400 }, { "epoch": 0.1502403846153846, "grad_norm": 1.807113766670227, "learning_rate": 0.0001, "loss": 1.6883, "step": 500 }, { "epoch": 0.18028846153846154, "grad_norm": 1.211004376411438, "learning_rate": 9.99725705593595e-05, "loss": 1.6908, "step": 600 }, { "epoch": 0.21033653846153846, "grad_norm": 1.194908857345581, "learning_rate": 9.989031233240653e-05, "loss": 1.6865, "step": 700 }, { "epoch": 0.2403846153846154, "grad_norm": 1.125349998474121, "learning_rate": 9.975331557102723e-05, "loss": 1.6796, "step": 800 }, { "epoch": 0.2704326923076923, "grad_norm": 1.2960246801376343, "learning_rate": 9.9561730585003e-05, "loss": 1.6697, "step": 900 }, { "epoch": 0.3004807692307692, "grad_norm": 1.1631627082824707, "learning_rate": 9.931576757709384e-05, "loss": 1.6675, "step": 1000 }, { "epoch": 0.33052884615384615, "grad_norm": 1.0823206901550293, "learning_rate": 9.901569641240883e-05, "loss": 1.6532, "step": 1100 }, { "epoch": 0.3605769230769231, "grad_norm": 1.1251226663589478, "learning_rate": 9.866184632231592e-05, "loss": 1.6625, "step": 1200 }, { "epoch": 0.390625, "grad_norm": 1.27994704246521, "learning_rate": 9.825460554321679e-05, "loss": 1.6463, "step": 1300 }, { "epoch": 0.4206730769230769, "grad_norm": 1.3841296434402466, "learning_rate": 9.779442089058252e-05, "loss": 1.6462, "step": 1400 }, { "epoch": 0.45072115384615385, "grad_norm": 1.1315901279449463, "learning_rate": 9.728179726871762e-05, "loss": 1.6452, "step": 1500 }, { "epoch": 0.4807692307692308, "grad_norm": 1.2269665002822876, "learning_rate": 9.671729711679036e-05, "loss": 1.6449, "step": 1600 }, { "epoch": 0.5108173076923077, "grad_norm": 1.135764479637146, "learning_rate": 9.610153979173711e-05, "loss": 1.6367, "step": 1700 }, { "epoch": 0.5408653846153846, "grad_norm": 1.2129323482513428, "learning_rate": 9.543520088871773e-05, "loss": 1.6271, "step": 1800 }, { "epoch": 0.5709134615384616, "grad_norm": 1.2092684507369995, "learning_rate": 9.471901149986767e-05, "loss": 1.6353, "step": 1900 }, { "epoch": 0.6009615384615384, "grad_norm": 1.2054263353347778, "learning_rate": 9.39537574121601e-05, "loss": 1.6386, "step": 2000 }, { "epoch": 0.6009615384615384, "eval_loss": 1.627854585647583, "eval_runtime": 242.6417, "eval_samples_per_second": 73.141, "eval_steps_per_second": 9.145, "step": 2000 }, { "epoch": 0.6310096153846154, "grad_norm": 1.4155864715576172, "learning_rate": 9.314027824525798e-05, "loss": 1.6322, "step": 2100 }, { "epoch": 0.6610576923076923, "grad_norm": 1.2875721454620361, "learning_rate": 9.22794665303021e-05, "loss": 1.6205, "step": 2200 }, { "epoch": 0.6911057692307693, "grad_norm": 1.2036750316619873, "learning_rate": 9.137226673064603e-05, "loss": 1.6201, "step": 2300 }, { "epoch": 0.7211538461538461, "grad_norm": 1.3741754293441772, "learning_rate": 9.04196742056119e-05, "loss": 1.6197, "step": 2400 }, { "epoch": 0.7512019230769231, "grad_norm": 1.3001148700714111, "learning_rate": 8.942273411840452e-05, "loss": 1.6285, "step": 2500 }, { "epoch": 0.78125, "grad_norm": 1.3223652839660645, "learning_rate": 8.838254028938162e-05, "loss": 1.6323, "step": 2600 }, { "epoch": 0.8112980769230769, "grad_norm": 1.399418592453003, "learning_rate": 8.730023399593876e-05, "loss": 1.6184, "step": 2700 }, { "epoch": 0.8413461538461539, "grad_norm": 1.2166377305984497, "learning_rate": 8.617700272032516e-05, "loss": 1.6165, "step": 2800 }, { "epoch": 0.8713942307692307, "grad_norm": 1.208473563194275, "learning_rate": 8.501407884676479e-05, "loss": 1.616, "step": 2900 }, { "epoch": 0.9014423076923077, "grad_norm": 1.2277066707611084, "learning_rate": 8.381273830931207e-05, "loss": 1.6122, "step": 3000 }, { "epoch": 0.9314903846153846, "grad_norm": 1.204302191734314, "learning_rate": 8.257429919192542e-05, "loss": 1.6186, "step": 3100 }, { "epoch": 0.9615384615384616, "grad_norm": 1.2529523372650146, "learning_rate": 8.130012028229512e-05, "loss": 1.6164, "step": 3200 }, { "epoch": 0.9915865384615384, "grad_norm": 1.513980507850647, "learning_rate": 7.999159958101186e-05, "loss": 1.5971, "step": 3300 }, { "epoch": 1.0216346153846154, "grad_norm": 1.420433521270752, "learning_rate": 7.865017276771173e-05, "loss": 1.4976, "step": 3400 }, { "epoch": 1.0516826923076923, "grad_norm": 1.4616764783859253, "learning_rate": 7.727731162588074e-05, "loss": 1.4486, "step": 3500 }, { "epoch": 1.0817307692307692, "grad_norm": 1.4156601428985596, "learning_rate": 7.587452242804676e-05, "loss": 1.4467, "step": 3600 }, { "epoch": 1.1117788461538463, "grad_norm": 1.3352571725845337, "learning_rate": 7.444334428313112e-05, "loss": 1.4516, "step": 3700 }, { "epoch": 1.1418269230769231, "grad_norm": 1.4259686470031738, "learning_rate": 7.298534744777267e-05, "loss": 1.4466, "step": 3800 }, { "epoch": 1.171875, "grad_norm": 1.4755374193191528, "learning_rate": 7.150213160347743e-05, "loss": 1.446, "step": 3900 }, { "epoch": 1.2019230769230769, "grad_norm": 1.4399892091751099, "learning_rate": 6.999532410148371e-05, "loss": 1.4331, "step": 4000 }, { "epoch": 1.2019230769230769, "eval_loss": 1.6244958639144897, "eval_runtime": 248.0586, "eval_samples_per_second": 71.544, "eval_steps_per_second": 8.945, "step": 4000 }, { "epoch": 1.2319711538461537, "grad_norm": 1.629622220993042, "learning_rate": 6.846657817726882e-05, "loss": 1.4356, "step": 4100 }, { "epoch": 1.2620192307692308, "grad_norm": 1.5240803956985474, "learning_rate": 6.691757113665606e-05, "loss": 1.4403, "step": 4200 }, { "epoch": 1.2920673076923077, "grad_norm": 1.947218894958496, "learning_rate": 6.535000251551231e-05, "loss": 1.452, "step": 4300 }, { "epoch": 1.3221153846153846, "grad_norm": 1.6493359804153442, "learning_rate": 6.376559221505535e-05, "loss": 1.4435, "step": 4400 }, { "epoch": 1.3521634615384617, "grad_norm": 1.6366957426071167, "learning_rate": 6.216607861481659e-05, "loss": 1.4385, "step": 4500 }, { "epoch": 1.3822115384615383, "grad_norm": 1.679699182510376, "learning_rate": 6.055321666533013e-05, "loss": 1.4509, "step": 4600 }, { "epoch": 1.4122596153846154, "grad_norm": 1.5405994653701782, "learning_rate": 5.8928775962640146e-05, "loss": 1.4375, "step": 4700 }, { "epoch": 1.4423076923076923, "grad_norm": 1.5734689235687256, "learning_rate": 5.7294538806739775e-05, "loss": 1.4315, "step": 4800 }, { "epoch": 1.4723557692307692, "grad_norm": 1.6284011602401733, "learning_rate": 5.565229824607143e-05, "loss": 1.4457, "step": 4900 }, { "epoch": 1.5024038461538463, "grad_norm": 1.5765283107757568, "learning_rate": 5.400385611023416e-05, "loss": 1.4374, "step": 5000 }, { "epoch": 1.5324519230769231, "grad_norm": 1.7722498178482056, "learning_rate": 5.235102103305654e-05, "loss": 1.4513, "step": 5100 }, { "epoch": 1.5625, "grad_norm": 1.6080434322357178, "learning_rate": 5.0695606468204095e-05, "loss": 1.4322, "step": 5200 }, { "epoch": 1.5925480769230769, "grad_norm": 1.7113045454025269, "learning_rate": 4.90394286994985e-05, "loss": 1.4372, "step": 5300 }, { "epoch": 1.6225961538461537, "grad_norm": 1.595045566558838, "learning_rate": 4.738430484813162e-05, "loss": 1.4391, "step": 5400 }, { "epoch": 1.6526442307692308, "grad_norm": 1.5261282920837402, "learning_rate": 4.5732050878960816e-05, "loss": 1.4375, "step": 5500 }, { "epoch": 1.6826923076923077, "grad_norm": 1.6319518089294434, "learning_rate": 4.40844796080729e-05, "loss": 1.4269, "step": 5600 }, { "epoch": 1.7127403846153846, "grad_norm": 1.6978216171264648, "learning_rate": 4.244339871380291e-05, "loss": 1.4261, "step": 5700 }, { "epoch": 1.7427884615384617, "grad_norm": 1.748810887336731, "learning_rate": 4.0810608753389864e-05, "loss": 1.4349, "step": 5800 }, { "epoch": 1.7728365384615383, "grad_norm": 1.4340012073516846, "learning_rate": 3.9187901187445675e-05, "loss": 1.4349, "step": 5900 }, { "epoch": 1.8028846153846154, "grad_norm": 1.7568167448043823, "learning_rate": 3.757705641440461e-05, "loss": 1.4318, "step": 6000 }, { "epoch": 1.8028846153846154, "eval_loss": 1.613356113433838, "eval_runtime": 246.395, "eval_samples_per_second": 72.027, "eval_steps_per_second": 9.006, "step": 6000 }, { "epoch": 1.8329326923076923, "grad_norm": 1.7349011898040771, "learning_rate": 3.5979841817110014e-05, "loss": 1.4335, "step": 6100 }, { "epoch": 1.8629807692307692, "grad_norm": 1.9418445825576782, "learning_rate": 3.439800982368133e-05, "loss": 1.4282, "step": 6200 }, { "epoch": 1.8930288461538463, "grad_norm": 1.6687694787979126, "learning_rate": 3.283329598478926e-05, "loss": 1.4309, "step": 6300 }, { "epoch": 1.9230769230769231, "grad_norm": 1.694411039352417, "learning_rate": 3.128741706944832e-05, "loss": 1.4178, "step": 6400 }, { "epoch": 1.953125, "grad_norm": 1.5258992910385132, "learning_rate": 2.976206918141635e-05, "loss": 1.4322, "step": 6500 }, { "epoch": 1.9831730769230769, "grad_norm": 1.8099026679992676, "learning_rate": 2.8258925898267385e-05, "loss": 1.416, "step": 6600 }, { "epoch": 2.0132211538461537, "grad_norm": 2.1050777435302734, "learning_rate": 2.6779636435179777e-05, "loss": 1.3215, "step": 6700 }, { "epoch": 2.043269230769231, "grad_norm": 2.225262403488159, "learning_rate": 2.5325823835454278e-05, "loss": 1.1716, "step": 6800 }, { "epoch": 2.0733173076923075, "grad_norm": 2.9141433238983154, "learning_rate": 2.3899083189747123e-05, "loss": 1.1695, "step": 6900 }, { "epoch": 2.1033653846153846, "grad_norm": 2.4766488075256348, "learning_rate": 2.250097988597234e-05, "loss": 1.1692, "step": 7000 }, { "epoch": 2.1334134615384617, "grad_norm": 2.1980690956115723, "learning_rate": 2.1133047891793174e-05, "loss": 1.1755, "step": 7100 }, { "epoch": 2.1634615384615383, "grad_norm": 2.393937587738037, "learning_rate": 1.979678807158698e-05, "loss": 1.1536, "step": 7200 }, { "epoch": 2.1935096153846154, "grad_norm": 2.4240365028381348, "learning_rate": 1.8493666539730515e-05, "loss": 1.169, "step": 7300 }, { "epoch": 2.2235576923076925, "grad_norm": 2.4492228031158447, "learning_rate": 1.7225113052011964e-05, "loss": 1.1532, "step": 7400 }, { "epoch": 2.253605769230769, "grad_norm": 2.1071839332580566, "learning_rate": 1.5992519436935022e-05, "loss": 1.1595, "step": 7500 }, { "epoch": 2.2836538461538463, "grad_norm": 2.5761914253234863, "learning_rate": 1.4797238068635566e-05, "loss": 1.1628, "step": 7600 }, { "epoch": 2.313701923076923, "grad_norm": 2.590533971786499, "learning_rate": 1.3640580383087232e-05, "loss": 1.1634, "step": 7700 }, { "epoch": 2.34375, "grad_norm": 2.3521673679351807, "learning_rate": 1.252381543922313e-05, "loss": 1.1545, "step": 7800 }, { "epoch": 2.373798076923077, "grad_norm": 2.7083418369293213, "learning_rate": 1.1448168526552727e-05, "loss": 1.1542, "step": 7900 }, { "epoch": 2.4038461538461537, "grad_norm": 2.427485227584839, "learning_rate": 1.0414819820801663e-05, "loss": 1.1633, "step": 8000 }, { "epoch": 2.4038461538461537, "eval_loss": 1.730972409248352, "eval_runtime": 246.4517, "eval_samples_per_second": 72.01, "eval_steps_per_second": 9.004, "step": 8000 } ], "logging_steps": 100, "max_steps": 9984, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.4413221989809357e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }