| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.3201024327784891, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.2925003931857646, | |
| "epoch": 0.012804097311139564, | |
| "grad_norm": 0.11822043359279633, | |
| "learning_rate": 0.0002, | |
| "loss": 1.5374, | |
| "mean_token_accuracy": 0.7501773500349372, | |
| "num_tokens": 163840.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.24354165233671665, | |
| "epoch": 0.02560819462227913, | |
| "grad_norm": 0.06385882943868637, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2747, | |
| "mean_token_accuracy": 0.940368153899908, | |
| "num_tokens": 327680.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.21027038600295783, | |
| "epoch": 0.03841229193341869, | |
| "grad_norm": 0.06773430854082108, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2517, | |
| "mean_token_accuracy": 0.9453033253550529, | |
| "num_tokens": 491520.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.2769933709874749, | |
| "epoch": 0.05121638924455826, | |
| "grad_norm": 0.055818039923906326, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2571, | |
| "mean_token_accuracy": 0.9446122765541076, | |
| "num_tokens": 655360.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.2427499696612358, | |
| "epoch": 0.06402048655569782, | |
| "grad_norm": 0.058828528970479965, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2485, | |
| "mean_token_accuracy": 0.9458353690803051, | |
| "num_tokens": 819200.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.21829927181825043, | |
| "epoch": 0.07682458386683738, | |
| "grad_norm": 0.05017698556184769, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2266, | |
| "mean_token_accuracy": 0.9501712337136269, | |
| "num_tokens": 983040.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.23391168629750608, | |
| "epoch": 0.08962868117797695, | |
| "grad_norm": 0.04940837249159813, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2378, | |
| "mean_token_accuracy": 0.9473275415599346, | |
| "num_tokens": 1146880.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.22548514110967516, | |
| "epoch": 0.10243277848911651, | |
| "grad_norm": 0.05233762413263321, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2281, | |
| "mean_token_accuracy": 0.9489420257508755, | |
| "num_tokens": 1310720.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.2398780019953847, | |
| "epoch": 0.11523687580025609, | |
| "grad_norm": 0.048961199820041656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2435, | |
| "mean_token_accuracy": 0.9454256378114223, | |
| "num_tokens": 1474560.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.21921717822551728, | |
| "epoch": 0.12804097311139565, | |
| "grad_norm": 0.05406157672405243, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2233, | |
| "mean_token_accuracy": 0.9502079240977764, | |
| "num_tokens": 1638400.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.4439915746450425, | |
| "epoch": 0.07042253521126761, | |
| "grad_norm": 0.5860416889190674, | |
| "learning_rate": 0.0002, | |
| "loss": 1.5262, | |
| "mean_token_accuracy": 0.6517463460564613, | |
| "num_tokens": 11002.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.4737072110176086, | |
| "epoch": 0.07682458386683738, | |
| "grad_norm": 0.46929118037223816, | |
| "learning_rate": 0.0002, | |
| "loss": 1.5329, | |
| "mean_token_accuracy": 0.6466514617204666, | |
| "num_tokens": 22341.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.5261517196893692, | |
| "epoch": 0.08322663252240717, | |
| "grad_norm": 0.42917096614837646, | |
| "learning_rate": 0.0002, | |
| "loss": 1.5325, | |
| "mean_token_accuracy": 0.6432413414120675, | |
| "num_tokens": 34121.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.475469598174095, | |
| "epoch": 0.08962868117797695, | |
| "grad_norm": 0.38441428542137146, | |
| "learning_rate": 0.0002, | |
| "loss": 1.5251, | |
| "mean_token_accuracy": 0.6469125777482987, | |
| "num_tokens": 45804.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.4375366628170014, | |
| "epoch": 0.09603072983354674, | |
| "grad_norm": 0.4060431122779846, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4656, | |
| "mean_token_accuracy": 0.65479796230793, | |
| "num_tokens": 57477.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.3917139500379563, | |
| "epoch": 0.10243277848911651, | |
| "grad_norm": 0.42650681734085083, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3855, | |
| "mean_token_accuracy": 0.6694141447544097, | |
| "num_tokens": 69610.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.4115019723773004, | |
| "epoch": 0.1088348271446863, | |
| "grad_norm": 0.43073078989982605, | |
| "learning_rate": 0.0002, | |
| "loss": 1.418, | |
| "mean_token_accuracy": 0.6599947348237037, | |
| "num_tokens": 81802.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.4750748038291932, | |
| "epoch": 0.11523687580025609, | |
| "grad_norm": 0.4140271246433258, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4688, | |
| "mean_token_accuracy": 0.657877217233181, | |
| "num_tokens": 94864.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.4294287458062171, | |
| "epoch": 0.12163892445582586, | |
| "grad_norm": 0.4295845627784729, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4499, | |
| "mean_token_accuracy": 0.664360024034977, | |
| "num_tokens": 106706.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.369442880153656, | |
| "epoch": 0.12804097311139565, | |
| "grad_norm": 0.39103543758392334, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3704, | |
| "mean_token_accuracy": 0.6727640315890312, | |
| "num_tokens": 118294.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.4287993058562278, | |
| "epoch": 0.13444302176696543, | |
| "grad_norm": 0.43736791610717773, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4618, | |
| "mean_token_accuracy": 0.6607629641890526, | |
| "num_tokens": 11547.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.4580362021923066, | |
| "epoch": 0.14084507042253522, | |
| "grad_norm": 0.4345095157623291, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4718, | |
| "mean_token_accuracy": 0.6601809754967689, | |
| "num_tokens": 23048.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.4598326086997986, | |
| "epoch": 0.147247119078105, | |
| "grad_norm": 0.43506523966789246, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4721, | |
| "mean_token_accuracy": 0.6584218233823776, | |
| "num_tokens": 35544.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.4226029217243195, | |
| "epoch": 0.15364916773367476, | |
| "grad_norm": 0.3562639355659485, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4292, | |
| "mean_token_accuracy": 0.6552029058337212, | |
| "num_tokens": 47703.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.4316969871520997, | |
| "epoch": 0.16005121638924455, | |
| "grad_norm": 0.3855348527431488, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4474, | |
| "mean_token_accuracy": 0.6541792467236519, | |
| "num_tokens": 59619.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.4340788170695304, | |
| "epoch": 0.16645326504481434, | |
| "grad_norm": 0.3725138008594513, | |
| "learning_rate": 0.0002, | |
| "loss": 1.411, | |
| "mean_token_accuracy": 0.6598658114671707, | |
| "num_tokens": 70976.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.433164033293724, | |
| "epoch": 0.17285531370038412, | |
| "grad_norm": 0.40799325704574585, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4326, | |
| "mean_token_accuracy": 0.6610979020595551, | |
| "num_tokens": 82978.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.388225580751896, | |
| "epoch": 0.1792573623559539, | |
| "grad_norm": 0.46172577142715454, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3839, | |
| "mean_token_accuracy": 0.6689180210232735, | |
| "num_tokens": 94835.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.3931595474481582, | |
| "epoch": 0.1856594110115237, | |
| "grad_norm": 0.3368935286998749, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3842, | |
| "mean_token_accuracy": 0.6617935121059417, | |
| "num_tokens": 107095.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.3946780189871788, | |
| "epoch": 0.19206145966709348, | |
| "grad_norm": 0.35388654470443726, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3875, | |
| "mean_token_accuracy": 0.6697609886527062, | |
| "num_tokens": 119793.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.404037845134735, | |
| "epoch": 0.19846350832266324, | |
| "grad_norm": 0.3220811188220978, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4132, | |
| "mean_token_accuracy": 0.6690483942627907, | |
| "num_tokens": 132305.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.4252075865864753, | |
| "epoch": 0.20486555697823303, | |
| "grad_norm": 0.36200419068336487, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3847, | |
| "mean_token_accuracy": 0.6713049352169037, | |
| "num_tokens": 144152.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.3943257242441178, | |
| "epoch": 0.2112676056338028, | |
| "grad_norm": 0.3905465304851532, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4046, | |
| "mean_token_accuracy": 0.6624784469604492, | |
| "num_tokens": 155760.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.3818968921899795, | |
| "epoch": 0.2176696542893726, | |
| "grad_norm": 0.39985793828964233, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3703, | |
| "mean_token_accuracy": 0.6665896505117417, | |
| "num_tokens": 167621.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.4194732576608657, | |
| "epoch": 0.22407170294494239, | |
| "grad_norm": 0.3372517228126526, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4082, | |
| "mean_token_accuracy": 0.6618808403611183, | |
| "num_tokens": 180287.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.3521318838000298, | |
| "epoch": 0.23047375160051217, | |
| "grad_norm": 0.3215847313404083, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3652, | |
| "mean_token_accuracy": 0.6767628908157348, | |
| "num_tokens": 192710.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.3673772186040878, | |
| "epoch": 0.23687580025608196, | |
| "grad_norm": 0.2965973913669586, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3505, | |
| "mean_token_accuracy": 0.6749694958329201, | |
| "num_tokens": 205349.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.416155432164669, | |
| "epoch": 0.24327784891165172, | |
| "grad_norm": 0.3764086961746216, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3843, | |
| "mean_token_accuracy": 0.6687785759568214, | |
| "num_tokens": 217150.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.3185370564460754, | |
| "epoch": 0.2496798975672215, | |
| "grad_norm": 0.3607741892337799, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3166, | |
| "mean_token_accuracy": 0.6866846427321434, | |
| "num_tokens": 229644.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.439865067601204, | |
| "epoch": 0.2560819462227913, | |
| "grad_norm": 0.3800489604473114, | |
| "learning_rate": 0.0002, | |
| "loss": 1.429, | |
| "mean_token_accuracy": 0.6569151118397712, | |
| "num_tokens": 241123.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.359485986828804, | |
| "epoch": 0.26248399487836105, | |
| "grad_norm": 0.3416250944137573, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3547, | |
| "mean_token_accuracy": 0.6739593103528023, | |
| "num_tokens": 253195.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.4108700543642043, | |
| "epoch": 0.26888604353393086, | |
| "grad_norm": 0.4621182382106781, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3914, | |
| "mean_token_accuracy": 0.6685805276036263, | |
| "num_tokens": 264898.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.3867693424224854, | |
| "epoch": 0.2752880921895006, | |
| "grad_norm": 0.3572266697883606, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3377, | |
| "mean_token_accuracy": 0.6788131222128868, | |
| "num_tokens": 277460.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.4053112298250199, | |
| "epoch": 0.28169014084507044, | |
| "grad_norm": 0.39097145199775696, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3981, | |
| "mean_token_accuracy": 0.6630080677568913, | |
| "num_tokens": 288981.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.411610472202301, | |
| "epoch": 0.2880921895006402, | |
| "grad_norm": 0.40481698513031006, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3902, | |
| "mean_token_accuracy": 0.6739853829145431, | |
| "num_tokens": 300735.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.3377737805247307, | |
| "epoch": 0.29449423815621, | |
| "grad_norm": 0.3631831109523773, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3227, | |
| "mean_token_accuracy": 0.6780247658491134, | |
| "num_tokens": 312615.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.3312758520245551, | |
| "epoch": 0.30089628681177977, | |
| "grad_norm": 0.3471841514110565, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3201, | |
| "mean_token_accuracy": 0.6811099037528038, | |
| "num_tokens": 324478.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.3637498423457146, | |
| "epoch": 0.3072983354673495, | |
| "grad_norm": 0.3833235502243042, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3484, | |
| "mean_token_accuracy": 0.6715721279382706, | |
| "num_tokens": 336495.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.4225990623235703, | |
| "epoch": 0.31370038412291934, | |
| "grad_norm": 0.39976227283477783, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4174, | |
| "mean_token_accuracy": 0.6578968778252602, | |
| "num_tokens": 348100.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.311194358766079, | |
| "epoch": 0.3201024327784891, | |
| "grad_norm": 0.3740488588809967, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3232, | |
| "mean_token_accuracy": 0.675893497467041, | |
| "num_tokens": 360106.0, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.604568667220992e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |