{ "best_metric": 0.8172281558013501, "best_model_checkpoint": "outputs/t5-mini/weak_tiny_poe/qqp_87/checkpoint-56855", "epoch": 5.0, "eval_steps": 500, "global_step": 56855, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 1.4921129941940308, "learning_rate": 4.956028493536189e-05, "loss": 0.731, "step": 500 }, { "epoch": 0.09, "grad_norm": 2.265256643295288, "learning_rate": 4.9120569870723775e-05, "loss": 0.6589, "step": 1000 }, { "epoch": 0.13, "grad_norm": 1.7359956502914429, "learning_rate": 4.868085480608566e-05, "loss": 0.6362, "step": 1500 }, { "epoch": 0.18, "grad_norm": 1.5481916666030884, "learning_rate": 4.824113974144755e-05, "loss": 0.6255, "step": 2000 }, { "epoch": 0.22, "grad_norm": 1.8537436723709106, "learning_rate": 4.780142467680943e-05, "loss": 0.6048, "step": 2500 }, { "epoch": 0.26, "grad_norm": 1.7439799308776855, "learning_rate": 4.736170961217132e-05, "loss": 0.5956, "step": 3000 }, { "epoch": 0.31, "grad_norm": 2.284893274307251, "learning_rate": 4.6921994547533196e-05, "loss": 0.6007, "step": 3500 }, { "epoch": 0.35, "grad_norm": 1.466986894607544, "learning_rate": 4.6482279482895086e-05, "loss": 0.6004, "step": 4000 }, { "epoch": 0.4, "grad_norm": 2.0524117946624756, "learning_rate": 4.6042564418256975e-05, "loss": 0.5837, "step": 4500 }, { "epoch": 0.44, "grad_norm": 3.0001771450042725, "learning_rate": 4.560284935361886e-05, "loss": 0.5894, "step": 5000 }, { "epoch": 0.48, "grad_norm": 3.2638933658599854, "learning_rate": 4.516313428898074e-05, "loss": 0.5752, "step": 5500 }, { "epoch": 0.53, "grad_norm": 2.2692410945892334, "learning_rate": 4.472341922434263e-05, "loss": 0.5761, "step": 6000 }, { "epoch": 0.57, "grad_norm": 3.9219467639923096, "learning_rate": 4.428370415970451e-05, "loss": 0.5757, "step": 6500 }, { "epoch": 0.62, "grad_norm": 1.599557876586914, "learning_rate": 4.38439890950664e-05, "loss": 0.5667, "step": 7000 }, { "epoch": 0.66, "grad_norm": 2.1363203525543213, "learning_rate": 4.3404274030428286e-05, "loss": 0.5578, "step": 7500 }, { "epoch": 0.7, "grad_norm": 2.675701141357422, "learning_rate": 4.296455896579017e-05, "loss": 0.5579, "step": 8000 }, { "epoch": 0.75, "grad_norm": 1.613674283027649, "learning_rate": 4.252484390115206e-05, "loss": 0.5535, "step": 8500 }, { "epoch": 0.79, "grad_norm": 3.672745704650879, "learning_rate": 4.208512883651394e-05, "loss": 0.5465, "step": 9000 }, { "epoch": 0.84, "grad_norm": 2.1016011238098145, "learning_rate": 4.1645413771875824e-05, "loss": 0.5457, "step": 9500 }, { "epoch": 0.88, "grad_norm": 2.0293595790863037, "learning_rate": 4.1205698707237714e-05, "loss": 0.5425, "step": 10000 }, { "epoch": 0.92, "grad_norm": 1.5119960308074951, "learning_rate": 4.0765983642599596e-05, "loss": 0.5389, "step": 10500 }, { "epoch": 0.97, "grad_norm": 2.8837521076202393, "learning_rate": 4.0326268577961486e-05, "loss": 0.5411, "step": 11000 }, { "epoch": 1.0, "eval_combined_score": 0.7816986623933021, "eval_f1": 0.7816986623933021, "eval_loss": 0.37052249908447266, "eval_runtime": 38.1713, "eval_samples_per_second": 1059.173, "eval_steps_per_second": 4.139, "step": 11371 }, { "epoch": 1.01, "grad_norm": 2.248117685317993, "learning_rate": 3.988655351332337e-05, "loss": 0.5388, "step": 11500 }, { "epoch": 1.06, "grad_norm": 2.1271088123321533, "learning_rate": 3.944683844868525e-05, "loss": 0.5356, "step": 12000 }, { "epoch": 1.1, "grad_norm": 2.2512476444244385, "learning_rate": 3.900712338404714e-05, "loss": 0.532, "step": 12500 }, { "epoch": 1.14, "grad_norm": 1.627472996711731, "learning_rate": 3.8567408319409024e-05, "loss": 0.5307, "step": 13000 }, { "epoch": 1.19, "grad_norm": 2.581695556640625, "learning_rate": 3.812769325477091e-05, "loss": 0.5291, "step": 13500 }, { "epoch": 1.23, "grad_norm": 1.6895989179611206, "learning_rate": 3.76879781901328e-05, "loss": 0.5168, "step": 14000 }, { "epoch": 1.28, "grad_norm": 1.5524585247039795, "learning_rate": 3.724826312549468e-05, "loss": 0.5243, "step": 14500 }, { "epoch": 1.32, "grad_norm": 1.877912163734436, "learning_rate": 3.680854806085657e-05, "loss": 0.5249, "step": 15000 }, { "epoch": 1.36, "grad_norm": 1.8292911052703857, "learning_rate": 3.636883299621845e-05, "loss": 0.5125, "step": 15500 }, { "epoch": 1.41, "grad_norm": 1.8245925903320312, "learning_rate": 3.5929117931580335e-05, "loss": 0.52, "step": 16000 }, { "epoch": 1.45, "grad_norm": 4.095343112945557, "learning_rate": 3.5489402866942224e-05, "loss": 0.5165, "step": 16500 }, { "epoch": 1.5, "grad_norm": 1.9062578678131104, "learning_rate": 3.504968780230411e-05, "loss": 0.5177, "step": 17000 }, { "epoch": 1.54, "grad_norm": 2.735477924346924, "learning_rate": 3.4609972737666e-05, "loss": 0.5117, "step": 17500 }, { "epoch": 1.58, "grad_norm": 1.8982046842575073, "learning_rate": 3.417025767302788e-05, "loss": 0.5061, "step": 18000 }, { "epoch": 1.63, "grad_norm": 2.2440438270568848, "learning_rate": 3.373054260838976e-05, "loss": 0.5126, "step": 18500 }, { "epoch": 1.67, "grad_norm": 3.681962728500366, "learning_rate": 3.329082754375165e-05, "loss": 0.5066, "step": 19000 }, { "epoch": 1.71, "grad_norm": 2.9211692810058594, "learning_rate": 3.2851112479113535e-05, "loss": 0.5197, "step": 19500 }, { "epoch": 1.76, "grad_norm": 1.4573030471801758, "learning_rate": 3.241139741447542e-05, "loss": 0.507, "step": 20000 }, { "epoch": 1.8, "grad_norm": 1.5176600217819214, "learning_rate": 3.197168234983731e-05, "loss": 0.5023, "step": 20500 }, { "epoch": 1.85, "grad_norm": 2.4117181301116943, "learning_rate": 3.153196728519919e-05, "loss": 0.4966, "step": 21000 }, { "epoch": 1.89, "grad_norm": 1.1353968381881714, "learning_rate": 3.109225222056108e-05, "loss": 0.5071, "step": 21500 }, { "epoch": 1.93, "grad_norm": 1.829007863998413, "learning_rate": 3.065253715592296e-05, "loss": 0.5053, "step": 22000 }, { "epoch": 1.98, "grad_norm": 2.5963010787963867, "learning_rate": 3.021282209128485e-05, "loss": 0.5084, "step": 22500 }, { "epoch": 2.0, "eval_combined_score": 0.7990887806112763, "eval_f1": 0.7990887806112763, "eval_loss": 0.35578909516334534, "eval_runtime": 38.0326, "eval_samples_per_second": 1063.035, "eval_steps_per_second": 4.154, "step": 22742 }, { "epoch": 2.02, "grad_norm": 1.8584606647491455, "learning_rate": 2.9773107026646735e-05, "loss": 0.5022, "step": 23000 }, { "epoch": 2.07, "grad_norm": 2.9419100284576416, "learning_rate": 2.933339196200862e-05, "loss": 0.4968, "step": 23500 }, { "epoch": 2.11, "grad_norm": 2.221026659011841, "learning_rate": 2.8893676897370504e-05, "loss": 0.4884, "step": 24000 }, { "epoch": 2.15, "grad_norm": 2.75677752494812, "learning_rate": 2.845396183273239e-05, "loss": 0.4963, "step": 24500 }, { "epoch": 2.2, "grad_norm": 1.5008267164230347, "learning_rate": 2.8014246768094277e-05, "loss": 0.4963, "step": 25000 }, { "epoch": 2.24, "grad_norm": 1.5264431238174438, "learning_rate": 2.7574531703456163e-05, "loss": 0.4981, "step": 25500 }, { "epoch": 2.29, "grad_norm": 2.450463056564331, "learning_rate": 2.7134816638818046e-05, "loss": 0.4961, "step": 26000 }, { "epoch": 2.33, "grad_norm": 2.3188283443450928, "learning_rate": 2.6695101574179932e-05, "loss": 0.4917, "step": 26500 }, { "epoch": 2.37, "grad_norm": 2.173543691635132, "learning_rate": 2.625538650954182e-05, "loss": 0.4923, "step": 27000 }, { "epoch": 2.42, "grad_norm": 0.7373517155647278, "learning_rate": 2.5815671444903705e-05, "loss": 0.489, "step": 27500 }, { "epoch": 2.46, "grad_norm": 1.6519461870193481, "learning_rate": 2.5375956380265588e-05, "loss": 0.494, "step": 28000 }, { "epoch": 2.51, "grad_norm": 1.8422359228134155, "learning_rate": 2.4936241315627474e-05, "loss": 0.5016, "step": 28500 }, { "epoch": 2.55, "grad_norm": 2.1333723068237305, "learning_rate": 2.449652625098936e-05, "loss": 0.4868, "step": 29000 }, { "epoch": 2.59, "grad_norm": 1.8119902610778809, "learning_rate": 2.4056811186351246e-05, "loss": 0.4885, "step": 29500 }, { "epoch": 2.64, "grad_norm": 1.8639763593673706, "learning_rate": 2.3617096121713133e-05, "loss": 0.4911, "step": 30000 }, { "epoch": 2.68, "grad_norm": 1.5926486253738403, "learning_rate": 2.317738105707502e-05, "loss": 0.4889, "step": 30500 }, { "epoch": 2.73, "grad_norm": 3.753464460372925, "learning_rate": 2.27376659924369e-05, "loss": 0.4905, "step": 31000 }, { "epoch": 2.77, "grad_norm": 1.9747668504714966, "learning_rate": 2.2297950927798788e-05, "loss": 0.4871, "step": 31500 }, { "epoch": 2.81, "grad_norm": 1.5802444219589233, "learning_rate": 2.1858235863160674e-05, "loss": 0.4846, "step": 32000 }, { "epoch": 2.86, "grad_norm": 1.8520300388336182, "learning_rate": 2.141852079852256e-05, "loss": 0.4839, "step": 32500 }, { "epoch": 2.9, "grad_norm": 2.0941038131713867, "learning_rate": 2.0978805733884443e-05, "loss": 0.4769, "step": 33000 }, { "epoch": 2.95, "grad_norm": 1.694678783416748, "learning_rate": 2.053909066924633e-05, "loss": 0.4798, "step": 33500 }, { "epoch": 2.99, "grad_norm": 2.0499818325042725, "learning_rate": 2.0099375604608216e-05, "loss": 0.4841, "step": 34000 }, { "epoch": 3.0, "eval_combined_score": 0.8122094824914782, "eval_f1": 0.8122094824914782, "eval_loss": 0.34033524990081787, "eval_runtime": 37.9138, "eval_samples_per_second": 1066.367, "eval_steps_per_second": 4.167, "step": 34113 }, { "epoch": 3.03, "grad_norm": 3.1498029232025146, "learning_rate": 1.9659660539970102e-05, "loss": 0.4738, "step": 34500 }, { "epoch": 3.08, "grad_norm": 1.5172255039215088, "learning_rate": 1.9219945475331985e-05, "loss": 0.4803, "step": 35000 }, { "epoch": 3.12, "grad_norm": 2.0573079586029053, "learning_rate": 1.878023041069387e-05, "loss": 0.4728, "step": 35500 }, { "epoch": 3.17, "grad_norm": 2.761320114135742, "learning_rate": 1.8340515346055757e-05, "loss": 0.4768, "step": 36000 }, { "epoch": 3.21, "grad_norm": 2.29789137840271, "learning_rate": 1.7900800281417643e-05, "loss": 0.4786, "step": 36500 }, { "epoch": 3.25, "grad_norm": 2.8867733478546143, "learning_rate": 1.7461085216779526e-05, "loss": 0.472, "step": 37000 }, { "epoch": 3.3, "grad_norm": 2.888866901397705, "learning_rate": 1.7021370152141413e-05, "loss": 0.4701, "step": 37500 }, { "epoch": 3.34, "grad_norm": 2.6121418476104736, "learning_rate": 1.65816550875033e-05, "loss": 0.4702, "step": 38000 }, { "epoch": 3.39, "grad_norm": 1.7327868938446045, "learning_rate": 1.6141940022865185e-05, "loss": 0.4751, "step": 38500 }, { "epoch": 3.43, "grad_norm": 2.7664458751678467, "learning_rate": 1.5702224958227068e-05, "loss": 0.4772, "step": 39000 }, { "epoch": 3.47, "grad_norm": 1.0849014520645142, "learning_rate": 1.5262509893588954e-05, "loss": 0.4738, "step": 39500 }, { "epoch": 3.52, "grad_norm": 3.0802319049835205, "learning_rate": 1.482279482895084e-05, "loss": 0.476, "step": 40000 }, { "epoch": 3.56, "grad_norm": 1.9298323392868042, "learning_rate": 1.4383079764312727e-05, "loss": 0.4681, "step": 40500 }, { "epoch": 3.61, "grad_norm": 2.0291521549224854, "learning_rate": 1.3943364699674611e-05, "loss": 0.4769, "step": 41000 }, { "epoch": 3.65, "grad_norm": 2.866079330444336, "learning_rate": 1.3503649635036497e-05, "loss": 0.4771, "step": 41500 }, { "epoch": 3.69, "grad_norm": 1.5067503452301025, "learning_rate": 1.3063934570398382e-05, "loss": 0.4754, "step": 42000 }, { "epoch": 3.74, "grad_norm": 2.293853521347046, "learning_rate": 1.2624219505760268e-05, "loss": 0.4791, "step": 42500 }, { "epoch": 3.78, "grad_norm": 2.38676381111145, "learning_rate": 1.2184504441122153e-05, "loss": 0.4674, "step": 43000 }, { "epoch": 3.83, "grad_norm": 2.829270601272583, "learning_rate": 1.1744789376484039e-05, "loss": 0.4715, "step": 43500 }, { "epoch": 3.87, "grad_norm": 2.3581595420837402, "learning_rate": 1.1305074311845925e-05, "loss": 0.4717, "step": 44000 }, { "epoch": 3.91, "grad_norm": 0.803579568862915, "learning_rate": 1.086535924720781e-05, "loss": 0.4638, "step": 44500 }, { "epoch": 3.96, "grad_norm": 3.1875007152557373, "learning_rate": 1.0425644182569696e-05, "loss": 0.4762, "step": 45000 }, { "epoch": 4.0, "eval_combined_score": 0.8123888182973317, "eval_f1": 0.8123888182973317, "eval_loss": 0.33584973216056824, "eval_runtime": 37.8907, "eval_samples_per_second": 1067.017, "eval_steps_per_second": 4.17, "step": 45484 }, { "epoch": 4.0, "grad_norm": 2.368762969970703, "learning_rate": 9.98592911793158e-06, "loss": 0.4707, "step": 45500 }, { "epoch": 4.05, "grad_norm": 2.4431540966033936, "learning_rate": 9.546214053293467e-06, "loss": 0.4654, "step": 46000 }, { "epoch": 4.09, "grad_norm": 1.8363009691238403, "learning_rate": 9.106498988655351e-06, "loss": 0.4607, "step": 46500 }, { "epoch": 4.13, "grad_norm": 1.9336574077606201, "learning_rate": 8.666783924017237e-06, "loss": 0.4661, "step": 47000 }, { "epoch": 4.18, "grad_norm": 1.5891854763031006, "learning_rate": 8.227068859379122e-06, "loss": 0.4637, "step": 47500 }, { "epoch": 4.22, "grad_norm": 1.3558140993118286, "learning_rate": 7.787353794741008e-06, "loss": 0.4599, "step": 48000 }, { "epoch": 4.27, "grad_norm": 1.899108648300171, "learning_rate": 7.347638730102893e-06, "loss": 0.4713, "step": 48500 }, { "epoch": 4.31, "grad_norm": 1.9433221817016602, "learning_rate": 6.907923665464779e-06, "loss": 0.4655, "step": 49000 }, { "epoch": 4.35, "grad_norm": 3.1865389347076416, "learning_rate": 6.468208600826664e-06, "loss": 0.4598, "step": 49500 }, { "epoch": 4.4, "grad_norm": 2.2305116653442383, "learning_rate": 6.02849353618855e-06, "loss": 0.4659, "step": 50000 }, { "epoch": 4.44, "grad_norm": 2.6832566261291504, "learning_rate": 5.588778471550435e-06, "loss": 0.4597, "step": 50500 }, { "epoch": 4.49, "grad_norm": 2.4481277465820312, "learning_rate": 5.149063406912321e-06, "loss": 0.4721, "step": 51000 }, { "epoch": 4.53, "grad_norm": 2.5793635845184326, "learning_rate": 4.709348342274207e-06, "loss": 0.4661, "step": 51500 }, { "epoch": 4.57, "grad_norm": 2.133437395095825, "learning_rate": 4.269633277636092e-06, "loss": 0.4589, "step": 52000 }, { "epoch": 4.62, "grad_norm": 4.3351006507873535, "learning_rate": 3.8299182129979776e-06, "loss": 0.4697, "step": 52500 }, { "epoch": 4.66, "grad_norm": 3.0949718952178955, "learning_rate": 3.3902031483598625e-06, "loss": 0.4659, "step": 53000 }, { "epoch": 4.7, "grad_norm": 2.1113531589508057, "learning_rate": 2.9504880837217483e-06, "loss": 0.4675, "step": 53500 }, { "epoch": 4.75, "grad_norm": 2.830069065093994, "learning_rate": 2.5107730190836337e-06, "loss": 0.4736, "step": 54000 }, { "epoch": 4.79, "grad_norm": 1.8624334335327148, "learning_rate": 2.0710579544455195e-06, "loss": 0.4596, "step": 54500 }, { "epoch": 4.84, "grad_norm": 3.123922824859619, "learning_rate": 1.6313428898074047e-06, "loss": 0.4601, "step": 55000 }, { "epoch": 4.88, "grad_norm": 2.3546392917633057, "learning_rate": 1.1916278251692903e-06, "loss": 0.4605, "step": 55500 }, { "epoch": 4.92, "grad_norm": 2.721216917037964, "learning_rate": 7.519127605311758e-07, "loss": 0.4652, "step": 56000 }, { "epoch": 4.97, "grad_norm": 2.4257423877716064, "learning_rate": 3.121976958930613e-07, "loss": 0.4614, "step": 56500 }, { "epoch": 5.0, "eval_combined_score": 0.8172281558013501, "eval_f1": 0.8172281558013501, "eval_loss": 0.3320091664791107, "eval_runtime": 37.3626, "eval_samples_per_second": 1082.098, "eval_steps_per_second": 4.229, "step": 56855 }, { "epoch": 5.0, "step": 56855, "total_flos": 0.0, "train_loss": 0.5058023918567175, "train_runtime": 6368.9989, "train_samples_per_second": 285.638, "train_steps_per_second": 8.927 } ], "logging_steps": 500, "max_steps": 56855, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }