{
  "best_metric": 1.613356113433838,
  "best_model_checkpoint": "output/checkpoint-6000",
  "epoch": 2.4038461538461537,
  "eval_steps": 2000,
  "global_step": 8000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.030048076923076924,
      "grad_norm": 2.3787589073181152,
      "learning_rate": 2e-05,
      "loss": 3.6173,
      "step": 100
    },
    {
      "epoch": 0.06009615384615385,
      "grad_norm": 1.996222734451294,
      "learning_rate": 4e-05,
      "loss": 1.7498,
      "step": 200
    },
    {
      "epoch": 0.09014423076923077,
      "grad_norm": 1.5797070264816284,
      "learning_rate": 6e-05,
      "loss": 1.7224,
      "step": 300
    },
    {
      "epoch": 0.1201923076923077,
      "grad_norm": 1.4705523252487183,
      "learning_rate": 8e-05,
      "loss": 1.6887,
      "step": 400
    },
    {
      "epoch": 0.1502403846153846,
      "grad_norm": 1.807113766670227,
      "learning_rate": 0.0001,
      "loss": 1.6883,
      "step": 500
    },
    {
      "epoch": 0.18028846153846154,
      "grad_norm": 1.211004376411438,
      "learning_rate": 9.99725705593595e-05,
      "loss": 1.6908,
      "step": 600
    },
    {
      "epoch": 0.21033653846153846,
      "grad_norm": 1.194908857345581,
      "learning_rate": 9.989031233240653e-05,
      "loss": 1.6865,
      "step": 700
    },
    {
      "epoch": 0.2403846153846154,
      "grad_norm": 1.125349998474121,
      "learning_rate": 9.975331557102723e-05,
      "loss": 1.6796,
      "step": 800
    },
    {
      "epoch": 0.2704326923076923,
      "grad_norm": 1.2960246801376343,
      "learning_rate": 9.9561730585003e-05,
      "loss": 1.6697,
      "step": 900
    },
    {
      "epoch": 0.3004807692307692,
      "grad_norm": 1.1631627082824707,
      "learning_rate": 9.931576757709384e-05,
      "loss": 1.6675,
      "step": 1000
    },
    {
      "epoch": 0.33052884615384615,
      "grad_norm": 1.0823206901550293,
      "learning_rate": 9.901569641240883e-05,
      "loss": 1.6532,
      "step": 1100
    },
    {
      "epoch": 0.3605769230769231,
      "grad_norm": 1.1251226663589478,
      "learning_rate": 9.866184632231592e-05,
      "loss": 1.6625,
      "step": 1200
    },
    {
      "epoch": 0.390625,
      "grad_norm": 1.27994704246521,
      "learning_rate": 9.825460554321679e-05,
      "loss": 1.6463,
      "step": 1300
    },
    {
      "epoch": 0.4206730769230769,
      "grad_norm": 1.3841296434402466,
      "learning_rate": 9.779442089058252e-05,
      "loss": 1.6462,
      "step": 1400
    },
    {
      "epoch": 0.45072115384615385,
      "grad_norm": 1.1315901279449463,
      "learning_rate": 9.728179726871762e-05,
      "loss": 1.6452,
      "step": 1500
    },
    {
      "epoch": 0.4807692307692308,
      "grad_norm": 1.2269665002822876,
      "learning_rate": 9.671729711679036e-05,
      "loss": 1.6449,
      "step": 1600
    },
    {
      "epoch": 0.5108173076923077,
      "grad_norm": 1.135764479637146,
      "learning_rate": 9.610153979173711e-05,
      "loss": 1.6367,
      "step": 1700
    },
    {
      "epoch": 0.5408653846153846,
      "grad_norm": 1.2129323482513428,
      "learning_rate": 9.543520088871773e-05,
      "loss": 1.6271,
      "step": 1800
    },
    {
      "epoch": 0.5709134615384616,
      "grad_norm": 1.2092684507369995,
      "learning_rate": 9.471901149986767e-05,
      "loss": 1.6353,
      "step": 1900
    },
    {
      "epoch": 0.6009615384615384,
      "grad_norm": 1.2054263353347778,
      "learning_rate": 9.39537574121601e-05,
      "loss": 1.6386,
      "step": 2000
    },
    {
      "epoch": 0.6009615384615384,
      "eval_loss": 1.627854585647583,
      "eval_runtime": 242.6417,
      "eval_samples_per_second": 73.141,
      "eval_steps_per_second": 9.145,
      "step": 2000
    },
    {
      "epoch": 0.6310096153846154,
      "grad_norm": 1.4155864715576172,
      "learning_rate": 9.314027824525798e-05,
      "loss": 1.6322,
      "step": 2100
    },
    {
      "epoch": 0.6610576923076923,
      "grad_norm": 1.2875721454620361,
      "learning_rate": 9.22794665303021e-05,
      "loss": 1.6205,
      "step": 2200
    },
    {
      "epoch": 0.6911057692307693,
      "grad_norm": 1.2036750316619873,
      "learning_rate": 9.137226673064603e-05,
      "loss": 1.6201,
      "step": 2300
    },
    {
      "epoch": 0.7211538461538461,
      "grad_norm": 1.3741754293441772,
      "learning_rate": 9.04196742056119e-05,
      "loss": 1.6197,
      "step": 2400
    },
    {
      "epoch": 0.7512019230769231,
      "grad_norm": 1.3001148700714111,
      "learning_rate": 8.942273411840452e-05,
      "loss": 1.6285,
      "step": 2500
    },
    {
      "epoch": 0.78125,
      "grad_norm": 1.3223652839660645,
      "learning_rate": 8.838254028938162e-05,
      "loss": 1.6323,
      "step": 2600
    },
    {
      "epoch": 0.8112980769230769,
      "grad_norm": 1.399418592453003,
      "learning_rate": 8.730023399593876e-05,
      "loss": 1.6184,
      "step": 2700
    },
    {
      "epoch": 0.8413461538461539,
      "grad_norm": 1.2166377305984497,
      "learning_rate": 8.617700272032516e-05,
      "loss": 1.6165,
      "step": 2800
    },
    {
      "epoch": 0.8713942307692307,
      "grad_norm": 1.208473563194275,
      "learning_rate": 8.501407884676479e-05,
      "loss": 1.616,
      "step": 2900
    },
    {
      "epoch": 0.9014423076923077,
      "grad_norm": 1.2277066707611084,
      "learning_rate": 8.381273830931207e-05,
      "loss": 1.6122,
      "step": 3000
    },
    {
      "epoch": 0.9314903846153846,
      "grad_norm": 1.204302191734314,
      "learning_rate": 8.257429919192542e-05,
      "loss": 1.6186,
      "step": 3100
    },
    {
      "epoch": 0.9615384615384616,
      "grad_norm": 1.2529523372650146,
      "learning_rate": 8.130012028229512e-05,
      "loss": 1.6164,
      "step": 3200
    },
    {
      "epoch": 0.9915865384615384,
      "grad_norm": 1.513980507850647,
      "learning_rate": 7.999159958101186e-05,
      "loss": 1.5971,
      "step": 3300
    },
    {
      "epoch": 1.0216346153846154,
      "grad_norm": 1.420433521270752,
      "learning_rate": 7.865017276771173e-05,
      "loss": 1.4976,
      "step": 3400
    },
    {
      "epoch": 1.0516826923076923,
      "grad_norm": 1.4616764783859253,
      "learning_rate": 7.727731162588074e-05,
      "loss": 1.4486,
      "step": 3500
    },
    {
      "epoch": 1.0817307692307692,
      "grad_norm": 1.4156601428985596,
      "learning_rate": 7.587452242804676e-05,
      "loss": 1.4467,
      "step": 3600
    },
    {
      "epoch": 1.1117788461538463,
      "grad_norm": 1.3352571725845337,
      "learning_rate": 7.444334428313112e-05,
      "loss": 1.4516,
      "step": 3700
    },
    {
      "epoch": 1.1418269230769231,
      "grad_norm": 1.4259686470031738,
      "learning_rate": 7.298534744777267e-05,
      "loss": 1.4466,
      "step": 3800
    },
    {
      "epoch": 1.171875,
      "grad_norm": 1.4755374193191528,
      "learning_rate": 7.150213160347743e-05,
      "loss": 1.446,
      "step": 3900
    },
    {
      "epoch": 1.2019230769230769,
      "grad_norm": 1.4399892091751099,
      "learning_rate": 6.999532410148371e-05,
      "loss": 1.4331,
      "step": 4000
    },
    {
      "epoch": 1.2019230769230769,
      "eval_loss": 1.6244958639144897,
      "eval_runtime": 248.0586,
      "eval_samples_per_second": 71.544,
      "eval_steps_per_second": 8.945,
      "step": 4000
    },
    {
      "epoch": 1.2319711538461537,
      "grad_norm": 1.629622220993042,
      "learning_rate": 6.846657817726882e-05,
      "loss": 1.4356,
      "step": 4100
    },
    {
      "epoch": 1.2620192307692308,
      "grad_norm": 1.5240803956985474,
      "learning_rate": 6.691757113665606e-05,
      "loss": 1.4403,
      "step": 4200
    },
    {
      "epoch": 1.2920673076923077,
      "grad_norm": 1.947218894958496,
      "learning_rate": 6.535000251551231e-05,
      "loss": 1.452,
      "step": 4300
    },
    {
      "epoch": 1.3221153846153846,
      "grad_norm": 1.6493359804153442,
      "learning_rate": 6.376559221505535e-05,
      "loss": 1.4435,
      "step": 4400
    },
    {
      "epoch": 1.3521634615384617,
      "grad_norm": 1.6366957426071167,
      "learning_rate": 6.216607861481659e-05,
      "loss": 1.4385,
      "step": 4500
    },
    {
      "epoch": 1.3822115384615383,
      "grad_norm": 1.679699182510376,
      "learning_rate": 6.055321666533013e-05,
      "loss": 1.4509,
      "step": 4600
    },
    {
      "epoch": 1.4122596153846154,
      "grad_norm": 1.5405994653701782,
      "learning_rate": 5.8928775962640146e-05,
      "loss": 1.4375,
      "step": 4700
    },
    {
      "epoch": 1.4423076923076923,
      "grad_norm": 1.5734689235687256,
      "learning_rate": 5.7294538806739775e-05,
      "loss": 1.4315,
      "step": 4800
    },
    {
      "epoch": 1.4723557692307692,
      "grad_norm": 1.6284011602401733,
      "learning_rate": 5.565229824607143e-05,
      "loss": 1.4457,
      "step": 4900
    },
    {
      "epoch": 1.5024038461538463,
      "grad_norm": 1.5765283107757568,
      "learning_rate": 5.400385611023416e-05,
      "loss": 1.4374,
      "step": 5000
    },
    {
      "epoch": 1.5324519230769231,
      "grad_norm": 1.7722498178482056,
      "learning_rate": 5.235102103305654e-05,
      "loss": 1.4513,
      "step": 5100
    },
    {
      "epoch": 1.5625,
      "grad_norm": 1.6080434322357178,
      "learning_rate": 5.0695606468204095e-05,
      "loss": 1.4322,
      "step": 5200
    },
    {
      "epoch": 1.5925480769230769,
      "grad_norm": 1.7113045454025269,
      "learning_rate": 4.90394286994985e-05,
      "loss": 1.4372,
      "step": 5300
    },
    {
      "epoch": 1.6225961538461537,
      "grad_norm": 1.595045566558838,
      "learning_rate": 4.738430484813162e-05,
      "loss": 1.4391,
      "step": 5400
    },
    {
      "epoch": 1.6526442307692308,
      "grad_norm": 1.5261282920837402,
      "learning_rate": 4.5732050878960816e-05,
      "loss": 1.4375,
      "step": 5500
    },
    {
      "epoch": 1.6826923076923077,
      "grad_norm": 1.6319518089294434,
      "learning_rate": 4.40844796080729e-05,
      "loss": 1.4269,
      "step": 5600
    },
    {
      "epoch": 1.7127403846153846,
      "grad_norm": 1.6978216171264648,
      "learning_rate": 4.244339871380291e-05,
      "loss": 1.4261,
      "step": 5700
    },
    {
      "epoch": 1.7427884615384617,
      "grad_norm": 1.748810887336731,
      "learning_rate": 4.0810608753389864e-05,
      "loss": 1.4349,
      "step": 5800
    },
    {
      "epoch": 1.7728365384615383,
      "grad_norm": 1.4340012073516846,
      "learning_rate": 3.9187901187445675e-05,
      "loss": 1.4349,
      "step": 5900
    },
    {
      "epoch": 1.8028846153846154,
      "grad_norm": 1.7568167448043823,
      "learning_rate": 3.757705641440461e-05,
      "loss": 1.4318,
      "step": 6000
    },
    {
      "epoch": 1.8028846153846154,
      "eval_loss": 1.613356113433838,
      "eval_runtime": 246.395,
      "eval_samples_per_second": 72.027,
      "eval_steps_per_second": 9.006,
      "step": 6000
    },
    {
      "epoch": 1.8329326923076923,
      "grad_norm": 1.7349011898040771,
      "learning_rate": 3.5979841817110014e-05,
      "loss": 1.4335,
      "step": 6100
    },
    {
      "epoch": 1.8629807692307692,
      "grad_norm": 1.9418445825576782,
      "learning_rate": 3.439800982368133e-05,
      "loss": 1.4282,
      "step": 6200
    },
    {
      "epoch": 1.8930288461538463,
      "grad_norm": 1.6687694787979126,
      "learning_rate": 3.283329598478926e-05,
      "loss": 1.4309,
      "step": 6300
    },
    {
      "epoch": 1.9230769230769231,
      "grad_norm": 1.694411039352417,
      "learning_rate": 3.128741706944832e-05,
      "loss": 1.4178,
      "step": 6400
    },
    {
      "epoch": 1.953125,
      "grad_norm": 1.5258992910385132,
      "learning_rate": 2.976206918141635e-05,
      "loss": 1.4322,
      "step": 6500
    },
    {
      "epoch": 1.9831730769230769,
      "grad_norm": 1.8099026679992676,
      "learning_rate": 2.8258925898267385e-05,
      "loss": 1.416,
      "step": 6600
    },
    {
      "epoch": 2.0132211538461537,
      "grad_norm": 2.1050777435302734,
      "learning_rate": 2.6779636435179777e-05,
      "loss": 1.3215,
      "step": 6700
    },
    {
      "epoch": 2.043269230769231,
      "grad_norm": 2.225262403488159,
      "learning_rate": 2.5325823835454278e-05,
      "loss": 1.1716,
      "step": 6800
    },
    {
      "epoch": 2.0733173076923075,
      "grad_norm": 2.9141433238983154,
      "learning_rate": 2.3899083189747123e-05,
      "loss": 1.1695,
      "step": 6900
    },
    {
      "epoch": 2.1033653846153846,
      "grad_norm": 2.4766488075256348,
      "learning_rate": 2.250097988597234e-05,
      "loss": 1.1692,
      "step": 7000
    },
    {
      "epoch": 2.1334134615384617,
      "grad_norm": 2.1980690956115723,
      "learning_rate": 2.1133047891793174e-05,
      "loss": 1.1755,
      "step": 7100
    },
    {
      "epoch": 2.1634615384615383,
      "grad_norm": 2.393937587738037,
      "learning_rate": 1.979678807158698e-05,
      "loss": 1.1536,
      "step": 7200
    },
    {
      "epoch": 2.1935096153846154,
      "grad_norm": 2.4240365028381348,
      "learning_rate": 1.8493666539730515e-05,
      "loss": 1.169,
      "step": 7300
    },
    {
      "epoch": 2.2235576923076925,
      "grad_norm": 2.4492228031158447,
      "learning_rate": 1.7225113052011964e-05,
      "loss": 1.1532,
      "step": 7400
    },
    {
      "epoch": 2.253605769230769,
      "grad_norm": 2.1071839332580566,
      "learning_rate": 1.5992519436935022e-05,
      "loss": 1.1595,
      "step": 7500
    },
    {
      "epoch": 2.2836538461538463,
      "grad_norm": 2.5761914253234863,
      "learning_rate": 1.4797238068635566e-05,
      "loss": 1.1628,
      "step": 7600
    },
    {
      "epoch": 2.313701923076923,
      "grad_norm": 2.590533971786499,
      "learning_rate": 1.3640580383087232e-05,
      "loss": 1.1634,
      "step": 7700
    },
    {
      "epoch": 2.34375,
      "grad_norm": 2.3521673679351807,
      "learning_rate": 1.252381543922313e-05,
      "loss": 1.1545,
      "step": 7800
    },
    {
      "epoch": 2.373798076923077,
      "grad_norm": 2.7083418369293213,
      "learning_rate": 1.1448168526552727e-05,
      "loss": 1.1542,
      "step": 7900
    },
    {
      "epoch": 2.4038461538461537,
      "grad_norm": 2.427485227584839,
      "learning_rate": 1.0414819820801663e-05,
      "loss": 1.1633,
      "step": 8000
    },
    {
      "epoch": 2.4038461538461537,
      "eval_loss": 1.730972409248352,
      "eval_runtime": 246.4517,
      "eval_samples_per_second": 72.01,
      "eval_steps_per_second": 9.004,
      "step": 8000
    }
  ],
  "logging_steps": 100,
  "max_steps": 9984,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.4413221989809357e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}