{ "best_metric": 0.8088124957369893, "best_model_checkpoint": "outputs/t5-mini/weak_tiny_poe/qqp_42/checkpoint-56855", "epoch": 5.0, "eval_steps": 500, "global_step": 56855, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 2.30487060546875, "learning_rate": 4.956028493536189e-05, "loss": 0.7659, "step": 500 }, { "epoch": 0.09, "grad_norm": 1.7330808639526367, "learning_rate": 4.9120569870723775e-05, "loss": 0.6875, "step": 1000 }, { "epoch": 0.13, "grad_norm": 1.5595574378967285, "learning_rate": 4.868085480608566e-05, "loss": 0.6526, "step": 1500 }, { "epoch": 0.18, "grad_norm": 1.4334590435028076, "learning_rate": 4.824113974144755e-05, "loss": 0.6392, "step": 2000 }, { "epoch": 0.22, "grad_norm": 1.1763668060302734, "learning_rate": 4.780142467680943e-05, "loss": 0.6377, "step": 2500 }, { "epoch": 0.26, "grad_norm": 1.2207213640213013, "learning_rate": 4.736170961217132e-05, "loss": 0.6186, "step": 3000 }, { "epoch": 0.31, "grad_norm": 1.3439034223556519, "learning_rate": 4.6921994547533196e-05, "loss": 0.6166, "step": 3500 }, { "epoch": 0.35, "grad_norm": 1.9837682247161865, "learning_rate": 4.6482279482895086e-05, "loss": 0.6171, "step": 4000 }, { "epoch": 0.4, "grad_norm": 2.070117473602295, "learning_rate": 4.6042564418256975e-05, "loss": 0.602, "step": 4500 }, { "epoch": 0.44, "grad_norm": 2.691117525100708, "learning_rate": 4.560284935361886e-05, "loss": 0.5938, "step": 5000 }, { "epoch": 0.48, "grad_norm": 1.9327023029327393, "learning_rate": 4.516313428898074e-05, "loss": 0.5996, "step": 5500 }, { "epoch": 0.53, "grad_norm": 2.1416549682617188, "learning_rate": 4.472341922434263e-05, "loss": 0.5902, "step": 6000 }, { "epoch": 0.57, "grad_norm": 2.225383996963501, "learning_rate": 4.428370415970451e-05, "loss": 0.584, "step": 6500 }, { "epoch": 0.62, "grad_norm": 1.4455324411392212, "learning_rate": 4.38439890950664e-05, "loss": 0.5719, "step": 7000 }, { "epoch": 0.66, "grad_norm": 2.7837624549865723, "learning_rate": 4.3404274030428286e-05, "loss": 0.5762, "step": 7500 }, { "epoch": 0.7, "grad_norm": 2.026057004928589, "learning_rate": 4.296455896579017e-05, "loss": 0.5707, "step": 8000 }, { "epoch": 0.75, "grad_norm": 1.983589768409729, "learning_rate": 4.252484390115206e-05, "loss": 0.5651, "step": 8500 }, { "epoch": 0.79, "grad_norm": 1.826207160949707, "learning_rate": 4.208512883651394e-05, "loss": 0.5593, "step": 9000 }, { "epoch": 0.84, "grad_norm": 2.2966904640197754, "learning_rate": 4.1645413771875824e-05, "loss": 0.5658, "step": 9500 }, { "epoch": 0.88, "grad_norm": 2.696525812149048, "learning_rate": 4.1205698707237714e-05, "loss": 0.5664, "step": 10000 }, { "epoch": 0.92, "grad_norm": 1.6921826601028442, "learning_rate": 4.0765983642599596e-05, "loss": 0.562, "step": 10500 }, { "epoch": 0.97, "grad_norm": 1.9230735301971436, "learning_rate": 4.0326268577961486e-05, "loss": 0.562, "step": 11000 }, { "epoch": 1.0, "eval_combined_score": 0.7611372754229897, "eval_f1": 0.7611372754229897, "eval_loss": 0.37701112031936646, "eval_runtime": 36.8648, "eval_samples_per_second": 1096.71, "eval_steps_per_second": 4.286, "step": 11371 }, { "epoch": 1.01, "grad_norm": 2.1575307846069336, "learning_rate": 3.988655351332337e-05, "loss": 0.5502, "step": 11500 }, { "epoch": 1.06, "grad_norm": 2.0553629398345947, "learning_rate": 3.944683844868525e-05, "loss": 0.5486, "step": 12000 }, { "epoch": 1.1, "grad_norm": 2.253108024597168, "learning_rate": 3.900712338404714e-05, "loss": 0.5457, "step": 12500 }, { "epoch": 1.14, "grad_norm": 2.2528011798858643, "learning_rate": 3.8567408319409024e-05, "loss": 0.5418, "step": 13000 }, { "epoch": 1.19, "grad_norm": 1.872086524963379, "learning_rate": 3.812769325477091e-05, "loss": 0.5405, "step": 13500 }, { "epoch": 1.23, "grad_norm": 1.549558401107788, "learning_rate": 3.76879781901328e-05, "loss": 0.5432, "step": 14000 }, { "epoch": 1.28, "grad_norm": 2.1427700519561768, "learning_rate": 3.724826312549468e-05, "loss": 0.5363, "step": 14500 }, { "epoch": 1.32, "grad_norm": 2.0889768600463867, "learning_rate": 3.680854806085657e-05, "loss": 0.5401, "step": 15000 }, { "epoch": 1.36, "grad_norm": 1.9004706144332886, "learning_rate": 3.636883299621845e-05, "loss": 0.538, "step": 15500 }, { "epoch": 1.41, "grad_norm": 2.568284273147583, "learning_rate": 3.5929117931580335e-05, "loss": 0.5364, "step": 16000 }, { "epoch": 1.45, "grad_norm": 1.1210036277770996, "learning_rate": 3.5489402866942224e-05, "loss": 0.5278, "step": 16500 }, { "epoch": 1.5, "grad_norm": 2.536067247390747, "learning_rate": 3.504968780230411e-05, "loss": 0.5338, "step": 17000 }, { "epoch": 1.54, "grad_norm": 2.0306429862976074, "learning_rate": 3.4609972737666e-05, "loss": 0.5238, "step": 17500 }, { "epoch": 1.58, "grad_norm": 2.097214460372925, "learning_rate": 3.417025767302788e-05, "loss": 0.5273, "step": 18000 }, { "epoch": 1.63, "grad_norm": 2.1282901763916016, "learning_rate": 3.373054260838976e-05, "loss": 0.5249, "step": 18500 }, { "epoch": 1.67, "grad_norm": 2.8680260181427, "learning_rate": 3.329082754375165e-05, "loss": 0.533, "step": 19000 }, { "epoch": 1.71, "grad_norm": 2.5516855716705322, "learning_rate": 3.2851112479113535e-05, "loss": 0.5254, "step": 19500 }, { "epoch": 1.76, "grad_norm": 2.8096489906311035, "learning_rate": 3.241139741447542e-05, "loss": 0.5282, "step": 20000 }, { "epoch": 1.8, "grad_norm": 3.4034230709075928, "learning_rate": 3.197168234983731e-05, "loss": 0.5219, "step": 20500 }, { "epoch": 1.85, "grad_norm": 2.213874340057373, "learning_rate": 3.153196728519919e-05, "loss": 0.5277, "step": 21000 }, { "epoch": 1.89, "grad_norm": 2.4699976444244385, "learning_rate": 3.109225222056108e-05, "loss": 0.5261, "step": 21500 }, { "epoch": 1.93, "grad_norm": 1.352188229560852, "learning_rate": 3.065253715592296e-05, "loss": 0.5267, "step": 22000 }, { "epoch": 1.98, "grad_norm": 2.8092727661132812, "learning_rate": 3.021282209128485e-05, "loss": 0.5282, "step": 22500 }, { "epoch": 2.0, "eval_combined_score": 0.7765263781861292, "eval_f1": 0.7765263781861292, "eval_loss": 0.3535827398300171, "eval_runtime": 37.1503, "eval_samples_per_second": 1088.283, "eval_steps_per_second": 4.253, "step": 22742 }, { "epoch": 2.02, "grad_norm": 2.087219476699829, "learning_rate": 2.9773107026646735e-05, "loss": 0.5222, "step": 23000 }, { "epoch": 2.07, "grad_norm": 3.6042699813842773, "learning_rate": 2.933339196200862e-05, "loss": 0.5149, "step": 23500 }, { "epoch": 2.11, "grad_norm": 2.8945512771606445, "learning_rate": 2.8893676897370504e-05, "loss": 0.5143, "step": 24000 }, { "epoch": 2.15, "grad_norm": 3.639920234680176, "learning_rate": 2.845396183273239e-05, "loss": 0.5142, "step": 24500 }, { "epoch": 2.2, "grad_norm": 1.5696040391921997, "learning_rate": 2.8014246768094277e-05, "loss": 0.5109, "step": 25000 }, { "epoch": 2.24, "grad_norm": 2.502941608428955, "learning_rate": 2.7574531703456163e-05, "loss": 0.5041, "step": 25500 }, { "epoch": 2.29, "grad_norm": 1.7532044649124146, "learning_rate": 2.7134816638818046e-05, "loss": 0.5099, "step": 26000 }, { "epoch": 2.33, "grad_norm": 2.583704948425293, "learning_rate": 2.6695101574179932e-05, "loss": 0.5109, "step": 26500 }, { "epoch": 2.37, "grad_norm": 1.9148293733596802, "learning_rate": 2.625538650954182e-05, "loss": 0.5088, "step": 27000 }, { "epoch": 2.42, "grad_norm": 1.2226241827011108, "learning_rate": 2.5815671444903705e-05, "loss": 0.5046, "step": 27500 }, { "epoch": 2.46, "grad_norm": 2.276338815689087, "learning_rate": 2.5375956380265588e-05, "loss": 0.5077, "step": 28000 }, { "epoch": 2.51, "grad_norm": 1.4178540706634521, "learning_rate": 2.4936241315627474e-05, "loss": 0.4967, "step": 28500 }, { "epoch": 2.55, "grad_norm": 2.231714963912964, "learning_rate": 2.449652625098936e-05, "loss": 0.5047, "step": 29000 }, { "epoch": 2.59, "grad_norm": 1.2348312139511108, "learning_rate": 2.4056811186351246e-05, "loss": 0.5072, "step": 29500 }, { "epoch": 2.64, "grad_norm": 1.837050199508667, "learning_rate": 2.3617096121713133e-05, "loss": 0.5045, "step": 30000 }, { "epoch": 2.68, "grad_norm": 2.297529697418213, "learning_rate": 2.317738105707502e-05, "loss": 0.5102, "step": 30500 }, { "epoch": 2.73, "grad_norm": 1.2097573280334473, "learning_rate": 2.27376659924369e-05, "loss": 0.5046, "step": 31000 }, { "epoch": 2.77, "grad_norm": 2.4351987838745117, "learning_rate": 2.2297950927798788e-05, "loss": 0.5057, "step": 31500 }, { "epoch": 2.81, "grad_norm": 2.6174986362457275, "learning_rate": 2.1858235863160674e-05, "loss": 0.5051, "step": 32000 }, { "epoch": 2.86, "grad_norm": 2.1429812908172607, "learning_rate": 2.141852079852256e-05, "loss": 0.496, "step": 32500 }, { "epoch": 2.9, "grad_norm": 2.406395673751831, "learning_rate": 2.0978805733884443e-05, "loss": 0.492, "step": 33000 }, { "epoch": 2.95, "grad_norm": 2.600222587585449, "learning_rate": 2.053909066924633e-05, "loss": 0.5063, "step": 33500 }, { "epoch": 2.99, "grad_norm": 1.4866433143615723, "learning_rate": 2.0099375604608216e-05, "loss": 0.4977, "step": 34000 }, { "epoch": 3.0, "eval_combined_score": 0.7951059796656902, "eval_f1": 0.7951059796656902, "eval_loss": 0.3372982144355774, "eval_runtime": 36.9034, "eval_samples_per_second": 1095.563, "eval_steps_per_second": 4.281, "step": 34113 }, { "epoch": 3.03, "grad_norm": 1.615599513053894, "learning_rate": 1.9659660539970102e-05, "loss": 0.4991, "step": 34500 }, { "epoch": 3.08, "grad_norm": 2.992169141769409, "learning_rate": 1.9219945475331985e-05, "loss": 0.4927, "step": 35000 }, { "epoch": 3.12, "grad_norm": 2.639941930770874, "learning_rate": 1.878023041069387e-05, "loss": 0.4933, "step": 35500 }, { "epoch": 3.17, "grad_norm": 2.457653760910034, "learning_rate": 1.8340515346055757e-05, "loss": 0.4955, "step": 36000 }, { "epoch": 3.21, "grad_norm": 2.2147014141082764, "learning_rate": 1.7900800281417643e-05, "loss": 0.4958, "step": 36500 }, { "epoch": 3.25, "grad_norm": 3.189746618270874, "learning_rate": 1.7461085216779526e-05, "loss": 0.4915, "step": 37000 }, { "epoch": 3.3, "grad_norm": 1.946791172027588, "learning_rate": 1.7021370152141413e-05, "loss": 0.492, "step": 37500 }, { "epoch": 3.34, "grad_norm": 1.1339478492736816, "learning_rate": 1.65816550875033e-05, "loss": 0.4889, "step": 38000 }, { "epoch": 3.39, "grad_norm": 2.4532470703125, "learning_rate": 1.6141940022865185e-05, "loss": 0.4851, "step": 38500 }, { "epoch": 3.43, "grad_norm": 1.2711316347122192, "learning_rate": 1.5702224958227068e-05, "loss": 0.5004, "step": 39000 }, { "epoch": 3.47, "grad_norm": 1.6113109588623047, "learning_rate": 1.5262509893588954e-05, "loss": 0.4829, "step": 39500 }, { "epoch": 3.52, "grad_norm": 2.32248592376709, "learning_rate": 1.482279482895084e-05, "loss": 0.4886, "step": 40000 }, { "epoch": 3.56, "grad_norm": 2.664520025253296, "learning_rate": 1.4383079764312727e-05, "loss": 0.4901, "step": 40500 }, { "epoch": 3.61, "grad_norm": 3.6276190280914307, "learning_rate": 1.3943364699674611e-05, "loss": 0.4906, "step": 41000 }, { "epoch": 3.65, "grad_norm": 2.5009617805480957, "learning_rate": 1.3503649635036497e-05, "loss": 0.4888, "step": 41500 }, { "epoch": 3.69, "grad_norm": 2.4080982208251953, "learning_rate": 1.3063934570398382e-05, "loss": 0.4841, "step": 42000 }, { "epoch": 3.74, "grad_norm": 2.2229392528533936, "learning_rate": 1.2624219505760268e-05, "loss": 0.4934, "step": 42500 }, { "epoch": 3.78, "grad_norm": 1.7180906534194946, "learning_rate": 1.2184504441122153e-05, "loss": 0.4864, "step": 43000 }, { "epoch": 3.83, "grad_norm": 2.0107908248901367, "learning_rate": 1.1744789376484039e-05, "loss": 0.4888, "step": 43500 }, { "epoch": 3.87, "grad_norm": 2.2142484188079834, "learning_rate": 1.1305074311845925e-05, "loss": 0.4854, "step": 44000 }, { "epoch": 3.91, "grad_norm": 2.1998207569122314, "learning_rate": 1.086535924720781e-05, "loss": 0.4947, "step": 44500 }, { "epoch": 3.96, "grad_norm": 1.7702863216400146, "learning_rate": 1.0425644182569696e-05, "loss": 0.4871, "step": 45000 }, { "epoch": 4.0, "eval_combined_score": 0.8021821467041941, "eval_f1": 0.8021821467041941, "eval_loss": 0.32757022976875305, "eval_runtime": 37.0242, "eval_samples_per_second": 1091.988, "eval_steps_per_second": 4.267, "step": 45484 }, { "epoch": 4.0, "grad_norm": 5.43105936050415, "learning_rate": 9.98592911793158e-06, "loss": 0.486, "step": 45500 }, { "epoch": 4.05, "grad_norm": 2.509622812271118, "learning_rate": 9.546214053293467e-06, "loss": 0.4838, "step": 46000 }, { "epoch": 4.09, "grad_norm": 1.9040151834487915, "learning_rate": 9.106498988655351e-06, "loss": 0.4848, "step": 46500 }, { "epoch": 4.13, "grad_norm": 1.892439365386963, "learning_rate": 8.666783924017237e-06, "loss": 0.4808, "step": 47000 }, { "epoch": 4.18, "grad_norm": 1.889847755432129, "learning_rate": 8.227068859379122e-06, "loss": 0.4803, "step": 47500 }, { "epoch": 4.22, "grad_norm": 2.515573740005493, "learning_rate": 7.787353794741008e-06, "loss": 0.4749, "step": 48000 }, { "epoch": 4.27, "grad_norm": 2.6856706142425537, "learning_rate": 7.347638730102893e-06, "loss": 0.4854, "step": 48500 }, { "epoch": 4.31, "grad_norm": 2.979976177215576, "learning_rate": 6.907923665464779e-06, "loss": 0.4858, "step": 49000 }, { "epoch": 4.35, "grad_norm": 2.1129562854766846, "learning_rate": 6.468208600826664e-06, "loss": 0.4863, "step": 49500 }, { "epoch": 4.4, "grad_norm": 2.569387435913086, "learning_rate": 6.02849353618855e-06, "loss": 0.4777, "step": 50000 }, { "epoch": 4.44, "grad_norm": 2.261967420578003, "learning_rate": 5.588778471550435e-06, "loss": 0.4726, "step": 50500 }, { "epoch": 4.49, "grad_norm": 2.604426145553589, "learning_rate": 5.149063406912321e-06, "loss": 0.4806, "step": 51000 }, { "epoch": 4.53, "grad_norm": 1.9708985090255737, "learning_rate": 4.709348342274207e-06, "loss": 0.4871, "step": 51500 }, { "epoch": 4.57, "grad_norm": 3.4044077396392822, "learning_rate": 4.269633277636092e-06, "loss": 0.4777, "step": 52000 }, { "epoch": 4.62, "grad_norm": 3.580048084259033, "learning_rate": 3.8299182129979776e-06, "loss": 0.4911, "step": 52500 }, { "epoch": 4.66, "grad_norm": 2.0711894035339355, "learning_rate": 3.3902031483598625e-06, "loss": 0.4789, "step": 53000 }, { "epoch": 4.7, "grad_norm": 3.7427501678466797, "learning_rate": 2.9504880837217483e-06, "loss": 0.48, "step": 53500 }, { "epoch": 4.75, "grad_norm": 1.780761957168579, "learning_rate": 2.5107730190836337e-06, "loss": 0.4821, "step": 54000 }, { "epoch": 4.79, "grad_norm": 2.185481309890747, "learning_rate": 2.0710579544455195e-06, "loss": 0.4874, "step": 54500 }, { "epoch": 4.84, "grad_norm": 2.1227777004241943, "learning_rate": 1.6313428898074047e-06, "loss": 0.4812, "step": 55000 }, { "epoch": 4.88, "grad_norm": 1.4683480262756348, "learning_rate": 1.1916278251692903e-06, "loss": 0.4785, "step": 55500 }, { "epoch": 4.92, "grad_norm": 1.6182867288589478, "learning_rate": 7.519127605311758e-07, "loss": 0.4834, "step": 56000 }, { "epoch": 4.97, "grad_norm": 2.2771718502044678, "learning_rate": 3.121976958930613e-07, "loss": 0.478, "step": 56500 }, { "epoch": 5.0, "eval_combined_score": 0.8088124957369893, "eval_f1": 0.8088124957369893, "eval_loss": 0.31970837712287903, "eval_runtime": 36.9254, "eval_samples_per_second": 1094.91, "eval_steps_per_second": 4.279, "step": 56855 }, { "epoch": 5.0, "step": 56855, "total_flos": 0.0, "train_loss": 0.5228507439386308, "train_runtime": 6326.7637, "train_samples_per_second": 287.545, "train_steps_per_second": 8.986 } ], "logging_steps": 500, "max_steps": 56855, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }