{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3201024327784891, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.2925003931857646, "epoch": 0.012804097311139564, "grad_norm": 0.11822043359279633, "learning_rate": 0.0002, "loss": 1.5374, "mean_token_accuracy": 0.7501773500349372, "num_tokens": 163840.0, "step": 10 }, { "entropy": 0.24354165233671665, "epoch": 0.02560819462227913, "grad_norm": 0.06385882943868637, "learning_rate": 0.0002, "loss": 0.2747, "mean_token_accuracy": 0.940368153899908, "num_tokens": 327680.0, "step": 20 }, { "entropy": 0.21027038600295783, "epoch": 0.03841229193341869, "grad_norm": 0.06773430854082108, "learning_rate": 0.0002, "loss": 0.2517, "mean_token_accuracy": 0.9453033253550529, "num_tokens": 491520.0, "step": 30 }, { "entropy": 0.2769933709874749, "epoch": 0.05121638924455826, "grad_norm": 0.055818039923906326, "learning_rate": 0.0002, "loss": 0.2571, "mean_token_accuracy": 0.9446122765541076, "num_tokens": 655360.0, "step": 40 }, { "entropy": 0.2427499696612358, "epoch": 0.06402048655569782, "grad_norm": 0.058828528970479965, "learning_rate": 0.0002, "loss": 0.2485, "mean_token_accuracy": 0.9458353690803051, "num_tokens": 819200.0, "step": 50 }, { "entropy": 0.21829927181825043, "epoch": 0.07682458386683738, "grad_norm": 0.05017698556184769, "learning_rate": 0.0002, "loss": 0.2266, "mean_token_accuracy": 0.9501712337136269, "num_tokens": 983040.0, "step": 60 }, { "entropy": 0.23391168629750608, "epoch": 0.08962868117797695, "grad_norm": 0.04940837249159813, "learning_rate": 0.0002, "loss": 0.2378, "mean_token_accuracy": 0.9473275415599346, "num_tokens": 1146880.0, "step": 70 }, { "entropy": 0.22548514110967516, "epoch": 0.10243277848911651, "grad_norm": 0.05233762413263321, "learning_rate": 0.0002, "loss": 0.2281, "mean_token_accuracy": 0.9489420257508755, "num_tokens": 1310720.0, "step": 80 }, { "entropy": 0.2398780019953847, "epoch": 0.11523687580025609, "grad_norm": 0.048961199820041656, "learning_rate": 0.0002, "loss": 0.2435, "mean_token_accuracy": 0.9454256378114223, "num_tokens": 1474560.0, "step": 90 }, { "entropy": 0.21921717822551728, "epoch": 0.12804097311139565, "grad_norm": 0.05406157672405243, "learning_rate": 0.0002, "loss": 0.2233, "mean_token_accuracy": 0.9502079240977764, "num_tokens": 1638400.0, "step": 100 }, { "entropy": 1.4439915746450425, "epoch": 0.07042253521126761, "grad_norm": 0.5860416889190674, "learning_rate": 0.0002, "loss": 1.5262, "mean_token_accuracy": 0.6517463460564613, "num_tokens": 11002.0, "step": 110 }, { "entropy": 1.4737072110176086, "epoch": 0.07682458386683738, "grad_norm": 0.46929118037223816, "learning_rate": 0.0002, "loss": 1.5329, "mean_token_accuracy": 0.6466514617204666, "num_tokens": 22341.0, "step": 120 }, { "entropy": 1.5261517196893692, "epoch": 0.08322663252240717, "grad_norm": 0.42917096614837646, "learning_rate": 0.0002, "loss": 1.5325, "mean_token_accuracy": 0.6432413414120675, "num_tokens": 34121.0, "step": 130 }, { "entropy": 1.475469598174095, "epoch": 0.08962868117797695, "grad_norm": 0.38441428542137146, "learning_rate": 0.0002, "loss": 1.5251, "mean_token_accuracy": 0.6469125777482987, "num_tokens": 45804.0, "step": 140 }, { "entropy": 1.4375366628170014, "epoch": 0.09603072983354674, "grad_norm": 0.4060431122779846, "learning_rate": 0.0002, "loss": 1.4656, "mean_token_accuracy": 0.65479796230793, "num_tokens": 57477.0, "step": 150 }, { "entropy": 1.3917139500379563, "epoch": 0.10243277848911651, "grad_norm": 0.42650681734085083, "learning_rate": 0.0002, "loss": 1.3855, "mean_token_accuracy": 0.6694141447544097, "num_tokens": 69610.0, "step": 160 }, { "entropy": 1.4115019723773004, "epoch": 0.1088348271446863, "grad_norm": 0.43073078989982605, "learning_rate": 0.0002, "loss": 1.418, "mean_token_accuracy": 0.6599947348237037, "num_tokens": 81802.0, "step": 170 }, { "entropy": 1.4750748038291932, "epoch": 0.11523687580025609, "grad_norm": 0.4140271246433258, "learning_rate": 0.0002, "loss": 1.4688, "mean_token_accuracy": 0.657877217233181, "num_tokens": 94864.0, "step": 180 }, { "entropy": 1.4294287458062171, "epoch": 0.12163892445582586, "grad_norm": 0.4295845627784729, "learning_rate": 0.0002, "loss": 1.4499, "mean_token_accuracy": 0.664360024034977, "num_tokens": 106706.0, "step": 190 }, { "entropy": 1.369442880153656, "epoch": 0.12804097311139565, "grad_norm": 0.39103543758392334, "learning_rate": 0.0002, "loss": 1.3704, "mean_token_accuracy": 0.6727640315890312, "num_tokens": 118294.0, "step": 200 }, { "entropy": 1.4287993058562278, "epoch": 0.13444302176696543, "grad_norm": 0.43736791610717773, "learning_rate": 0.0002, "loss": 1.4618, "mean_token_accuracy": 0.6607629641890526, "num_tokens": 11547.0, "step": 210 }, { "entropy": 1.4580362021923066, "epoch": 0.14084507042253522, "grad_norm": 0.4345095157623291, "learning_rate": 0.0002, "loss": 1.4718, "mean_token_accuracy": 0.6601809754967689, "num_tokens": 23048.0, "step": 220 }, { "entropy": 1.4598326086997986, "epoch": 0.147247119078105, "grad_norm": 0.43506523966789246, "learning_rate": 0.0002, "loss": 1.4721, "mean_token_accuracy": 0.6584218233823776, "num_tokens": 35544.0, "step": 230 }, { "entropy": 1.4226029217243195, "epoch": 0.15364916773367476, "grad_norm": 0.3562639355659485, "learning_rate": 0.0002, "loss": 1.4292, "mean_token_accuracy": 0.6552029058337212, "num_tokens": 47703.0, "step": 240 }, { "entropy": 1.4316969871520997, "epoch": 0.16005121638924455, "grad_norm": 0.3855348527431488, "learning_rate": 0.0002, "loss": 1.4474, "mean_token_accuracy": 0.6541792467236519, "num_tokens": 59619.0, "step": 250 }, { "entropy": 1.4340788170695304, "epoch": 0.16645326504481434, "grad_norm": 0.3725138008594513, "learning_rate": 0.0002, "loss": 1.411, "mean_token_accuracy": 0.6598658114671707, "num_tokens": 70976.0, "step": 260 }, { "entropy": 1.433164033293724, "epoch": 0.17285531370038412, "grad_norm": 0.40799325704574585, "learning_rate": 0.0002, "loss": 1.4326, "mean_token_accuracy": 0.6610979020595551, "num_tokens": 82978.0, "step": 270 }, { "entropy": 1.388225580751896, "epoch": 0.1792573623559539, "grad_norm": 0.46172577142715454, "learning_rate": 0.0002, "loss": 1.3839, "mean_token_accuracy": 0.6689180210232735, "num_tokens": 94835.0, "step": 280 }, { "entropy": 1.3931595474481582, "epoch": 0.1856594110115237, "grad_norm": 0.3368935286998749, "learning_rate": 0.0002, "loss": 1.3842, "mean_token_accuracy": 0.6617935121059417, "num_tokens": 107095.0, "step": 290 }, { "entropy": 1.3946780189871788, "epoch": 0.19206145966709348, "grad_norm": 0.35388654470443726, "learning_rate": 0.0002, "loss": 1.3875, "mean_token_accuracy": 0.6697609886527062, "num_tokens": 119793.0, "step": 300 }, { "entropy": 1.404037845134735, "epoch": 0.19846350832266324, "grad_norm": 0.3220811188220978, "learning_rate": 0.0002, "loss": 1.4132, "mean_token_accuracy": 0.6690483942627907, "num_tokens": 132305.0, "step": 310 }, { "entropy": 1.4252075865864753, "epoch": 0.20486555697823303, "grad_norm": 0.36200419068336487, "learning_rate": 0.0002, "loss": 1.3847, "mean_token_accuracy": 0.6713049352169037, "num_tokens": 144152.0, "step": 320 }, { "entropy": 1.3943257242441178, "epoch": 0.2112676056338028, "grad_norm": 0.3905465304851532, "learning_rate": 0.0002, "loss": 1.4046, "mean_token_accuracy": 0.6624784469604492, "num_tokens": 155760.0, "step": 330 }, { "entropy": 1.3818968921899795, "epoch": 0.2176696542893726, "grad_norm": 0.39985793828964233, "learning_rate": 0.0002, "loss": 1.3703, "mean_token_accuracy": 0.6665896505117417, "num_tokens": 167621.0, "step": 340 }, { "entropy": 1.4194732576608657, "epoch": 0.22407170294494239, "grad_norm": 0.3372517228126526, "learning_rate": 0.0002, "loss": 1.4082, "mean_token_accuracy": 0.6618808403611183, "num_tokens": 180287.0, "step": 350 }, { "entropy": 1.3521318838000298, "epoch": 0.23047375160051217, "grad_norm": 0.3215847313404083, "learning_rate": 0.0002, "loss": 1.3652, "mean_token_accuracy": 0.6767628908157348, "num_tokens": 192710.0, "step": 360 }, { "entropy": 1.3673772186040878, "epoch": 0.23687580025608196, "grad_norm": 0.2965973913669586, "learning_rate": 0.0002, "loss": 1.3505, "mean_token_accuracy": 0.6749694958329201, "num_tokens": 205349.0, "step": 370 }, { "entropy": 1.416155432164669, "epoch": 0.24327784891165172, "grad_norm": 0.3764086961746216, "learning_rate": 0.0002, "loss": 1.3843, "mean_token_accuracy": 0.6687785759568214, "num_tokens": 217150.0, "step": 380 }, { "entropy": 1.3185370564460754, "epoch": 0.2496798975672215, "grad_norm": 0.3607741892337799, "learning_rate": 0.0002, "loss": 1.3166, "mean_token_accuracy": 0.6866846427321434, "num_tokens": 229644.0, "step": 390 }, { "entropy": 1.439865067601204, "epoch": 0.2560819462227913, "grad_norm": 0.3800489604473114, "learning_rate": 0.0002, "loss": 1.429, "mean_token_accuracy": 0.6569151118397712, "num_tokens": 241123.0, "step": 400 }, { "entropy": 1.359485986828804, "epoch": 0.26248399487836105, "grad_norm": 0.3416250944137573, "learning_rate": 0.0002, "loss": 1.3547, "mean_token_accuracy": 0.6739593103528023, "num_tokens": 253195.0, "step": 410 }, { "entropy": 1.4108700543642043, "epoch": 0.26888604353393086, "grad_norm": 0.4621182382106781, "learning_rate": 0.0002, "loss": 1.3914, "mean_token_accuracy": 0.6685805276036263, "num_tokens": 264898.0, "step": 420 }, { "entropy": 1.3867693424224854, "epoch": 0.2752880921895006, "grad_norm": 0.3572266697883606, "learning_rate": 0.0002, "loss": 1.3377, "mean_token_accuracy": 0.6788131222128868, "num_tokens": 277460.0, "step": 430 }, { "entropy": 1.4053112298250199, "epoch": 0.28169014084507044, "grad_norm": 0.39097145199775696, "learning_rate": 0.0002, "loss": 1.3981, "mean_token_accuracy": 0.6630080677568913, "num_tokens": 288981.0, "step": 440 }, { "entropy": 1.411610472202301, "epoch": 0.2880921895006402, "grad_norm": 0.40481698513031006, "learning_rate": 0.0002, "loss": 1.3902, "mean_token_accuracy": 0.6739853829145431, "num_tokens": 300735.0, "step": 450 }, { "entropy": 1.3377737805247307, "epoch": 0.29449423815621, "grad_norm": 0.3631831109523773, "learning_rate": 0.0002, "loss": 1.3227, "mean_token_accuracy": 0.6780247658491134, "num_tokens": 312615.0, "step": 460 }, { "entropy": 1.3312758520245551, "epoch": 0.30089628681177977, "grad_norm": 0.3471841514110565, "learning_rate": 0.0002, "loss": 1.3201, "mean_token_accuracy": 0.6811099037528038, "num_tokens": 324478.0, "step": 470 }, { "entropy": 1.3637498423457146, "epoch": 0.3072983354673495, "grad_norm": 0.3833235502243042, "learning_rate": 0.0002, "loss": 1.3484, "mean_token_accuracy": 0.6715721279382706, "num_tokens": 336495.0, "step": 480 }, { "entropy": 1.4225990623235703, "epoch": 0.31370038412291934, "grad_norm": 0.39976227283477783, "learning_rate": 0.0002, "loss": 1.4174, "mean_token_accuracy": 0.6578968778252602, "num_tokens": 348100.0, "step": 490 }, { "entropy": 1.311194358766079, "epoch": 0.3201024327784891, "grad_norm": 0.3740488588809967, "learning_rate": 0.0002, "loss": 1.3232, "mean_token_accuracy": 0.675893497467041, "num_tokens": 360106.0, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.604568667220992e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }