{ "best_metric": 0.9683195592286501, "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-wsdmhar\\checkpoint-4187", "epoch": 100.0, "eval_steps": 500, "global_step": 5300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18867924528301888, "grad_norm": 9.041044235229492, "learning_rate": 9.433962264150943e-07, "loss": 1.4928, "step": 10 }, { "epoch": 0.37735849056603776, "grad_norm": 7.276942253112793, "learning_rate": 1.8867924528301887e-06, "loss": 1.4202, "step": 20 }, { "epoch": 0.5660377358490566, "grad_norm": 8.077420234680176, "learning_rate": 2.830188679245283e-06, "loss": 1.3564, "step": 30 }, { "epoch": 0.7547169811320755, "grad_norm": 7.560600280761719, "learning_rate": 3.7735849056603773e-06, "loss": 1.1939, "step": 40 }, { "epoch": 0.9433962264150944, "grad_norm": 5.84022855758667, "learning_rate": 4.716981132075472e-06, "loss": 1.0624, "step": 50 }, { "epoch": 1.0, "eval_accuracy": 0.6091597796143251, "eval_loss": 0.8878794312477112, "eval_runtime": 12.739, "eval_samples_per_second": 227.961, "eval_steps_per_second": 7.143, "step": 53 }, { "epoch": 1.1320754716981132, "grad_norm": 9.758731842041016, "learning_rate": 5.660377358490566e-06, "loss": 0.9299, "step": 60 }, { "epoch": 1.320754716981132, "grad_norm": 7.677206993103027, "learning_rate": 6.60377358490566e-06, "loss": 0.8724, "step": 70 }, { "epoch": 1.509433962264151, "grad_norm": 8.157505989074707, "learning_rate": 7.547169811320755e-06, "loss": 0.7971, "step": 80 }, { "epoch": 1.6981132075471699, "grad_norm": 9.804584503173828, "learning_rate": 8.49056603773585e-06, "loss": 0.7717, "step": 90 }, { "epoch": 1.8867924528301887, "grad_norm": 8.317395210266113, "learning_rate": 9.433962264150944e-06, "loss": 0.6893, "step": 100 }, { "epoch": 2.0, "eval_accuracy": 0.7090220385674931, "eval_loss": 0.6601111888885498, "eval_runtime": 13.0303, "eval_samples_per_second": 222.865, "eval_steps_per_second": 6.984, "step": 106 }, { "epoch": 2.0754716981132075, "grad_norm": 5.714374542236328, "learning_rate": 1.0377358490566038e-05, "loss": 0.6923, "step": 110 }, { "epoch": 2.2641509433962264, "grad_norm": 8.416643142700195, "learning_rate": 1.1320754716981132e-05, "loss": 0.6475, "step": 120 }, { "epoch": 2.452830188679245, "grad_norm": 8.871334075927734, "learning_rate": 1.2264150943396227e-05, "loss": 0.6156, "step": 130 }, { "epoch": 2.641509433962264, "grad_norm": 10.934247970581055, "learning_rate": 1.320754716981132e-05, "loss": 0.597, "step": 140 }, { "epoch": 2.830188679245283, "grad_norm": 12.311450004577637, "learning_rate": 1.4150943396226415e-05, "loss": 0.6152, "step": 150 }, { "epoch": 3.0, "eval_accuracy": 0.7854683195592287, "eval_loss": 0.5114469528198242, "eval_runtime": 13.0985, "eval_samples_per_second": 221.706, "eval_steps_per_second": 6.947, "step": 159 }, { "epoch": 3.018867924528302, "grad_norm": 8.85444164276123, "learning_rate": 1.509433962264151e-05, "loss": 0.5785, "step": 160 }, { "epoch": 3.207547169811321, "grad_norm": 7.401244640350342, "learning_rate": 1.6037735849056604e-05, "loss": 0.5343, "step": 170 }, { "epoch": 3.3962264150943398, "grad_norm": 6.694042205810547, "learning_rate": 1.69811320754717e-05, "loss": 0.5322, "step": 180 }, { "epoch": 3.5849056603773586, "grad_norm": 7.124511241912842, "learning_rate": 1.7924528301886792e-05, "loss": 0.5533, "step": 190 }, { "epoch": 3.7735849056603774, "grad_norm": 11.249773025512695, "learning_rate": 1.8867924528301888e-05, "loss": 0.4865, "step": 200 }, { "epoch": 3.9622641509433962, "grad_norm": 19.239667892456055, "learning_rate": 1.9811320754716984e-05, "loss": 0.5456, "step": 210 }, { "epoch": 4.0, "eval_accuracy": 0.8422865013774105, "eval_loss": 0.3818637728691101, "eval_runtime": 13.2694, "eval_samples_per_second": 218.849, "eval_steps_per_second": 6.858, "step": 212 }, { "epoch": 4.150943396226415, "grad_norm": 11.330594062805176, "learning_rate": 2.0754716981132076e-05, "loss": 0.5114, "step": 220 }, { "epoch": 4.339622641509434, "grad_norm": 11.731520652770996, "learning_rate": 2.1698113207547172e-05, "loss": 0.4923, "step": 230 }, { "epoch": 4.528301886792453, "grad_norm": 6.535953521728516, "learning_rate": 2.2641509433962265e-05, "loss": 0.4251, "step": 240 }, { "epoch": 4.716981132075472, "grad_norm": 5.095239639282227, "learning_rate": 2.358490566037736e-05, "loss": 0.4388, "step": 250 }, { "epoch": 4.90566037735849, "grad_norm": 8.532243728637695, "learning_rate": 2.4528301886792453e-05, "loss": 0.4673, "step": 260 }, { "epoch": 5.0, "eval_accuracy": 0.871900826446281, "eval_loss": 0.32669946551322937, "eval_runtime": 13.3969, "eval_samples_per_second": 216.767, "eval_steps_per_second": 6.793, "step": 265 }, { "epoch": 5.09433962264151, "grad_norm": 7.914596080780029, "learning_rate": 2.547169811320755e-05, "loss": 0.4331, "step": 270 }, { "epoch": 5.283018867924528, "grad_norm": 7.141151428222656, "learning_rate": 2.641509433962264e-05, "loss": 0.4553, "step": 280 }, { "epoch": 5.471698113207547, "grad_norm": 9.838526725769043, "learning_rate": 2.7358490566037738e-05, "loss": 0.3864, "step": 290 }, { "epoch": 5.660377358490566, "grad_norm": 6.91007137298584, "learning_rate": 2.830188679245283e-05, "loss": 0.4015, "step": 300 }, { "epoch": 5.849056603773585, "grad_norm": 6.716864585876465, "learning_rate": 2.9245283018867926e-05, "loss": 0.4166, "step": 310 }, { "epoch": 6.0, "eval_accuracy": 0.9039256198347108, "eval_loss": 0.2804345488548279, "eval_runtime": 13.5503, "eval_samples_per_second": 214.312, "eval_steps_per_second": 6.716, "step": 318 }, { "epoch": 6.037735849056604, "grad_norm": 19.735078811645508, "learning_rate": 3.018867924528302e-05, "loss": 0.4004, "step": 320 }, { "epoch": 6.226415094339623, "grad_norm": 14.484492301940918, "learning_rate": 3.113207547169811e-05, "loss": 0.4114, "step": 330 }, { "epoch": 6.415094339622642, "grad_norm": 8.806564331054688, "learning_rate": 3.207547169811321e-05, "loss": 0.398, "step": 340 }, { "epoch": 6.60377358490566, "grad_norm": 11.519726753234863, "learning_rate": 3.30188679245283e-05, "loss": 0.413, "step": 350 }, { "epoch": 6.7924528301886795, "grad_norm": 4.552971363067627, "learning_rate": 3.39622641509434e-05, "loss": 0.3775, "step": 360 }, { "epoch": 6.981132075471698, "grad_norm": 6.50352668762207, "learning_rate": 3.490566037735849e-05, "loss": 0.3757, "step": 370 }, { "epoch": 7.0, "eval_accuracy": 0.8994490358126722, "eval_loss": 0.2881180942058563, "eval_runtime": 13.6033, "eval_samples_per_second": 213.478, "eval_steps_per_second": 6.69, "step": 371 }, { "epoch": 7.169811320754717, "grad_norm": 5.162815093994141, "learning_rate": 3.5849056603773584e-05, "loss": 0.3744, "step": 380 }, { "epoch": 7.3584905660377355, "grad_norm": 4.919179916381836, "learning_rate": 3.679245283018868e-05, "loss": 0.3084, "step": 390 }, { "epoch": 7.547169811320755, "grad_norm": 8.160808563232422, "learning_rate": 3.7735849056603776e-05, "loss": 0.3388, "step": 400 }, { "epoch": 7.735849056603773, "grad_norm": 7.450716972351074, "learning_rate": 3.867924528301887e-05, "loss": 0.3578, "step": 410 }, { "epoch": 7.9245283018867925, "grad_norm": 7.976863861083984, "learning_rate": 3.962264150943397e-05, "loss": 0.3798, "step": 420 }, { "epoch": 8.0, "eval_accuracy": 0.903236914600551, "eval_loss": 0.26347047090530396, "eval_runtime": 13.5658, "eval_samples_per_second": 214.068, "eval_steps_per_second": 6.708, "step": 424 }, { "epoch": 8.11320754716981, "grad_norm": 6.626104354858398, "learning_rate": 4.0566037735849064e-05, "loss": 0.3583, "step": 430 }, { "epoch": 8.30188679245283, "grad_norm": 6.100863933563232, "learning_rate": 4.150943396226415e-05, "loss": 0.3201, "step": 440 }, { "epoch": 8.49056603773585, "grad_norm": 5.2761969566345215, "learning_rate": 4.245283018867925e-05, "loss": 0.3452, "step": 450 }, { "epoch": 8.679245283018869, "grad_norm": 4.848881244659424, "learning_rate": 4.3396226415094345e-05, "loss": 0.3602, "step": 460 }, { "epoch": 8.867924528301886, "grad_norm": 10.011419296264648, "learning_rate": 4.433962264150944e-05, "loss": 0.3303, "step": 470 }, { "epoch": 9.0, "eval_accuracy": 0.9073691460055097, "eval_loss": 0.27034133672714233, "eval_runtime": 13.7374, "eval_samples_per_second": 211.393, "eval_steps_per_second": 6.624, "step": 477 }, { "epoch": 9.056603773584905, "grad_norm": 8.56981372833252, "learning_rate": 4.528301886792453e-05, "loss": 0.31, "step": 480 }, { "epoch": 9.245283018867925, "grad_norm": 7.121793270111084, "learning_rate": 4.6226415094339625e-05, "loss": 0.3098, "step": 490 }, { "epoch": 9.433962264150944, "grad_norm": 4.458034515380859, "learning_rate": 4.716981132075472e-05, "loss": 0.3495, "step": 500 }, { "epoch": 9.622641509433961, "grad_norm": 6.7864484786987305, "learning_rate": 4.811320754716982e-05, "loss": 0.3383, "step": 510 }, { "epoch": 9.81132075471698, "grad_norm": 6.478758811950684, "learning_rate": 4.9056603773584906e-05, "loss": 0.316, "step": 520 }, { "epoch": 10.0, "grad_norm": 6.849891185760498, "learning_rate": 5e-05, "loss": 0.3346, "step": 530 }, { "epoch": 10.0, "eval_accuracy": 0.9004820936639119, "eval_loss": 0.25653818249702454, "eval_runtime": 13.7209, "eval_samples_per_second": 211.649, "eval_steps_per_second": 6.632, "step": 530 }, { "epoch": 10.18867924528302, "grad_norm": 5.824264049530029, "learning_rate": 4.989517819706499e-05, "loss": 0.3262, "step": 540 }, { "epoch": 10.377358490566039, "grad_norm": 9.913765907287598, "learning_rate": 4.979035639412998e-05, "loss": 0.2917, "step": 550 }, { "epoch": 10.566037735849056, "grad_norm": 8.706583976745605, "learning_rate": 4.968553459119497e-05, "loss": 0.325, "step": 560 }, { "epoch": 10.754716981132075, "grad_norm": 4.388184070587158, "learning_rate": 4.958071278825996e-05, "loss": 0.2934, "step": 570 }, { "epoch": 10.943396226415095, "grad_norm": 7.9404988288879395, "learning_rate": 4.947589098532495e-05, "loss": 0.2971, "step": 580 }, { "epoch": 11.0, "eval_accuracy": 0.931129476584022, "eval_loss": 0.21819160878658295, "eval_runtime": 13.8344, "eval_samples_per_second": 209.911, "eval_steps_per_second": 6.578, "step": 583 }, { "epoch": 11.132075471698114, "grad_norm": 4.344425678253174, "learning_rate": 4.937106918238994e-05, "loss": 0.25, "step": 590 }, { "epoch": 11.320754716981131, "grad_norm": 7.321628570556641, "learning_rate": 4.9266247379454926e-05, "loss": 0.3084, "step": 600 }, { "epoch": 11.50943396226415, "grad_norm": 12.875089645385742, "learning_rate": 4.916142557651992e-05, "loss": 0.3069, "step": 610 }, { "epoch": 11.69811320754717, "grad_norm": 6.05894660949707, "learning_rate": 4.9056603773584906e-05, "loss": 0.2905, "step": 620 }, { "epoch": 11.88679245283019, "grad_norm": 10.023378372192383, "learning_rate": 4.8951781970649894e-05, "loss": 0.2992, "step": 630 }, { "epoch": 12.0, "eval_accuracy": 0.9256198347107438, "eval_loss": 0.22397693991661072, "eval_runtime": 13.8485, "eval_samples_per_second": 209.698, "eval_steps_per_second": 6.571, "step": 636 }, { "epoch": 12.075471698113208, "grad_norm": 6.025568962097168, "learning_rate": 4.884696016771489e-05, "loss": 0.2936, "step": 640 }, { "epoch": 12.264150943396226, "grad_norm": 5.965641021728516, "learning_rate": 4.8742138364779875e-05, "loss": 0.266, "step": 650 }, { "epoch": 12.452830188679245, "grad_norm": 4.047191143035889, "learning_rate": 4.863731656184486e-05, "loss": 0.2696, "step": 660 }, { "epoch": 12.641509433962264, "grad_norm": 4.262639999389648, "learning_rate": 4.8532494758909855e-05, "loss": 0.2642, "step": 670 }, { "epoch": 12.830188679245284, "grad_norm": 4.320269584655762, "learning_rate": 4.842767295597484e-05, "loss": 0.2637, "step": 680 }, { "epoch": 13.0, "eval_accuracy": 0.9238980716253443, "eval_loss": 0.2130652368068695, "eval_runtime": 13.8031, "eval_samples_per_second": 210.388, "eval_steps_per_second": 6.593, "step": 689 }, { "epoch": 13.018867924528301, "grad_norm": 6.210031032562256, "learning_rate": 4.8322851153039836e-05, "loss": 0.2755, "step": 690 }, { "epoch": 13.20754716981132, "grad_norm": 5.736047744750977, "learning_rate": 4.8218029350104823e-05, "loss": 0.2373, "step": 700 }, { "epoch": 13.39622641509434, "grad_norm": 5.341845989227295, "learning_rate": 4.811320754716982e-05, "loss": 0.2462, "step": 710 }, { "epoch": 13.584905660377359, "grad_norm": 4.025936126708984, "learning_rate": 4.8008385744234804e-05, "loss": 0.2491, "step": 720 }, { "epoch": 13.773584905660378, "grad_norm": 4.850243091583252, "learning_rate": 4.79035639412998e-05, "loss": 0.2437, "step": 730 }, { "epoch": 13.962264150943396, "grad_norm": 4.280818939208984, "learning_rate": 4.7798742138364785e-05, "loss": 0.2653, "step": 740 }, { "epoch": 14.0, "eval_accuracy": 0.9397382920110193, "eval_loss": 0.18005970120429993, "eval_runtime": 13.9135, "eval_samples_per_second": 208.718, "eval_steps_per_second": 6.54, "step": 742 }, { "epoch": 14.150943396226415, "grad_norm": 4.664927959442139, "learning_rate": 4.769392033542977e-05, "loss": 0.2444, "step": 750 }, { "epoch": 14.339622641509434, "grad_norm": 4.106526851654053, "learning_rate": 4.7589098532494766e-05, "loss": 0.2468, "step": 760 }, { "epoch": 14.528301886792454, "grad_norm": 5.82990837097168, "learning_rate": 4.7484276729559753e-05, "loss": 0.2534, "step": 770 }, { "epoch": 14.716981132075471, "grad_norm": 5.992727279663086, "learning_rate": 4.737945492662474e-05, "loss": 0.2247, "step": 780 }, { "epoch": 14.90566037735849, "grad_norm": 5.288634300231934, "learning_rate": 4.7274633123689734e-05, "loss": 0.2472, "step": 790 }, { "epoch": 15.0, "eval_accuracy": 0.9376721763085399, "eval_loss": 0.18071921169757843, "eval_runtime": 13.77, "eval_samples_per_second": 210.893, "eval_steps_per_second": 6.609, "step": 795 }, { "epoch": 15.09433962264151, "grad_norm": 5.116296768188477, "learning_rate": 4.716981132075472e-05, "loss": 0.2706, "step": 800 }, { "epoch": 15.283018867924529, "grad_norm": 4.876150131225586, "learning_rate": 4.706498951781971e-05, "loss": 0.2174, "step": 810 }, { "epoch": 15.471698113207546, "grad_norm": 4.544086456298828, "learning_rate": 4.69601677148847e-05, "loss": 0.1944, "step": 820 }, { "epoch": 15.660377358490566, "grad_norm": 7.460907459259033, "learning_rate": 4.685534591194969e-05, "loss": 0.2432, "step": 830 }, { "epoch": 15.849056603773585, "grad_norm": 5.503137111663818, "learning_rate": 4.6750524109014677e-05, "loss": 0.2263, "step": 840 }, { "epoch": 16.0, "eval_accuracy": 0.946625344352617, "eval_loss": 0.16115374863147736, "eval_runtime": 13.8807, "eval_samples_per_second": 209.211, "eval_steps_per_second": 6.556, "step": 848 }, { "epoch": 16.037735849056602, "grad_norm": 3.875622272491455, "learning_rate": 4.664570230607967e-05, "loss": 0.2365, "step": 850 }, { "epoch": 16.22641509433962, "grad_norm": 4.338544845581055, "learning_rate": 4.654088050314466e-05, "loss": 0.1973, "step": 860 }, { "epoch": 16.41509433962264, "grad_norm": 6.0123724937438965, "learning_rate": 4.6436058700209645e-05, "loss": 0.2341, "step": 870 }, { "epoch": 16.60377358490566, "grad_norm": 3.352548837661743, "learning_rate": 4.633123689727464e-05, "loss": 0.195, "step": 880 }, { "epoch": 16.79245283018868, "grad_norm": 3.6743788719177246, "learning_rate": 4.6226415094339625e-05, "loss": 0.2149, "step": 890 }, { "epoch": 16.9811320754717, "grad_norm": 3.634098768234253, "learning_rate": 4.612159329140461e-05, "loss": 0.1786, "step": 900 }, { "epoch": 17.0, "eval_accuracy": 0.9418044077134986, "eval_loss": 0.17354166507720947, "eval_runtime": 13.8691, "eval_samples_per_second": 209.387, "eval_steps_per_second": 6.561, "step": 901 }, { "epoch": 17.169811320754718, "grad_norm": 4.430752277374268, "learning_rate": 4.6016771488469606e-05, "loss": 0.205, "step": 910 }, { "epoch": 17.358490566037737, "grad_norm": 3.114032030105591, "learning_rate": 4.5911949685534594e-05, "loss": 0.2002, "step": 920 }, { "epoch": 17.547169811320753, "grad_norm": 5.479780197143555, "learning_rate": 4.580712788259958e-05, "loss": 0.2224, "step": 930 }, { "epoch": 17.735849056603772, "grad_norm": 5.793611526489258, "learning_rate": 4.570230607966457e-05, "loss": 0.1961, "step": 940 }, { "epoch": 17.92452830188679, "grad_norm": 3.505262613296509, "learning_rate": 4.559748427672956e-05, "loss": 0.2103, "step": 950 }, { "epoch": 18.0, "eval_accuracy": 0.9462809917355371, "eval_loss": 0.1786232590675354, "eval_runtime": 14.016, "eval_samples_per_second": 207.192, "eval_steps_per_second": 6.493, "step": 954 }, { "epoch": 18.11320754716981, "grad_norm": 3.520742177963257, "learning_rate": 4.549266247379455e-05, "loss": 0.2158, "step": 960 }, { "epoch": 18.30188679245283, "grad_norm": 5.575334072113037, "learning_rate": 4.5387840670859536e-05, "loss": 0.2004, "step": 970 }, { "epoch": 18.49056603773585, "grad_norm": 4.227255344390869, "learning_rate": 4.528301886792453e-05, "loss": 0.173, "step": 980 }, { "epoch": 18.67924528301887, "grad_norm": 4.518486022949219, "learning_rate": 4.517819706498952e-05, "loss": 0.1898, "step": 990 }, { "epoch": 18.867924528301888, "grad_norm": 3.9365193843841553, "learning_rate": 4.5073375262054504e-05, "loss": 0.1725, "step": 1000 }, { "epoch": 19.0, "eval_accuracy": 0.9473140495867769, "eval_loss": 0.16307222843170166, "eval_runtime": 14.0184, "eval_samples_per_second": 207.157, "eval_steps_per_second": 6.491, "step": 1007 }, { "epoch": 19.056603773584907, "grad_norm": 4.218902587890625, "learning_rate": 4.49685534591195e-05, "loss": 0.1953, "step": 1010 }, { "epoch": 19.245283018867923, "grad_norm": 5.483488082885742, "learning_rate": 4.4863731656184485e-05, "loss": 0.1648, "step": 1020 }, { "epoch": 19.433962264150942, "grad_norm": 5.378849029541016, "learning_rate": 4.475890985324948e-05, "loss": 0.1807, "step": 1030 }, { "epoch": 19.62264150943396, "grad_norm": 5.601091384887695, "learning_rate": 4.4654088050314466e-05, "loss": 0.2101, "step": 1040 }, { "epoch": 19.81132075471698, "grad_norm": 2.9287290573120117, "learning_rate": 4.454926624737946e-05, "loss": 0.1628, "step": 1050 }, { "epoch": 20.0, "grad_norm": 6.378795146942139, "learning_rate": 4.4444444444444447e-05, "loss": 0.1787, "step": 1060 }, { "epoch": 20.0, "eval_accuracy": 0.953168044077135, "eval_loss": 0.14394132792949677, "eval_runtime": 13.9963, "eval_samples_per_second": 207.483, "eval_steps_per_second": 6.502, "step": 1060 }, { "epoch": 20.18867924528302, "grad_norm": 5.156091213226318, "learning_rate": 4.433962264150944e-05, "loss": 0.1518, "step": 1070 }, { "epoch": 20.37735849056604, "grad_norm": 5.145476818084717, "learning_rate": 4.423480083857443e-05, "loss": 0.1779, "step": 1080 }, { "epoch": 20.566037735849058, "grad_norm": 4.119344711303711, "learning_rate": 4.4129979035639415e-05, "loss": 0.2038, "step": 1090 }, { "epoch": 20.754716981132077, "grad_norm": 4.134652137756348, "learning_rate": 4.402515723270441e-05, "loss": 0.1713, "step": 1100 }, { "epoch": 20.943396226415093, "grad_norm": 5.130320072174072, "learning_rate": 4.3920335429769396e-05, "loss": 0.1924, "step": 1110 }, { "epoch": 21.0, "eval_accuracy": 0.9504132231404959, "eval_loss": 0.13879624009132385, "eval_runtime": 13.8513, "eval_samples_per_second": 209.655, "eval_steps_per_second": 6.57, "step": 1113 }, { "epoch": 21.132075471698112, "grad_norm": 5.733437538146973, "learning_rate": 4.381551362683438e-05, "loss": 0.164, "step": 1120 }, { "epoch": 21.32075471698113, "grad_norm": 3.7996554374694824, "learning_rate": 4.3710691823899376e-05, "loss": 0.1479, "step": 1130 }, { "epoch": 21.50943396226415, "grad_norm": 4.9020891189575195, "learning_rate": 4.3605870020964364e-05, "loss": 0.1657, "step": 1140 }, { "epoch": 21.69811320754717, "grad_norm": 4.601658344268799, "learning_rate": 4.350104821802935e-05, "loss": 0.1644, "step": 1150 }, { "epoch": 21.88679245283019, "grad_norm": 5.638570785522461, "learning_rate": 4.3396226415094345e-05, "loss": 0.1662, "step": 1160 }, { "epoch": 22.0, "eval_accuracy": 0.9507575757575758, "eval_loss": 0.1469692885875702, "eval_runtime": 13.9412, "eval_samples_per_second": 208.304, "eval_steps_per_second": 6.527, "step": 1166 }, { "epoch": 22.07547169811321, "grad_norm": 7.325222015380859, "learning_rate": 4.329140461215933e-05, "loss": 0.1926, "step": 1170 }, { "epoch": 22.264150943396228, "grad_norm": 5.337314128875732, "learning_rate": 4.318658280922432e-05, "loss": 0.1908, "step": 1180 }, { "epoch": 22.452830188679247, "grad_norm": 3.660973072052002, "learning_rate": 4.308176100628931e-05, "loss": 0.1663, "step": 1190 }, { "epoch": 22.641509433962263, "grad_norm": 7.075264930725098, "learning_rate": 4.29769392033543e-05, "loss": 0.1778, "step": 1200 }, { "epoch": 22.830188679245282, "grad_norm": 2.9561445713043213, "learning_rate": 4.287211740041929e-05, "loss": 0.1724, "step": 1210 }, { "epoch": 23.0, "eval_accuracy": 0.949724517906336, "eval_loss": 0.15376034379005432, "eval_runtime": 13.9076, "eval_samples_per_second": 208.807, "eval_steps_per_second": 6.543, "step": 1219 }, { "epoch": 23.0188679245283, "grad_norm": 6.19976806640625, "learning_rate": 4.276729559748428e-05, "loss": 0.1867, "step": 1220 }, { "epoch": 23.20754716981132, "grad_norm": 5.15887451171875, "learning_rate": 4.266247379454927e-05, "loss": 0.1809, "step": 1230 }, { "epoch": 23.39622641509434, "grad_norm": 5.114108562469482, "learning_rate": 4.2557651991614255e-05, "loss": 0.1636, "step": 1240 }, { "epoch": 23.58490566037736, "grad_norm": 5.130180358886719, "learning_rate": 4.245283018867925e-05, "loss": 0.1835, "step": 1250 }, { "epoch": 23.77358490566038, "grad_norm": 5.375781536102295, "learning_rate": 4.2348008385744236e-05, "loss": 0.1764, "step": 1260 }, { "epoch": 23.962264150943398, "grad_norm": 3.6640334129333496, "learning_rate": 4.224318658280922e-05, "loss": 0.1633, "step": 1270 }, { "epoch": 24.0, "eval_accuracy": 0.9383608815426997, "eval_loss": 0.17309589684009552, "eval_runtime": 22.6597, "eval_samples_per_second": 128.157, "eval_steps_per_second": 4.016, "step": 1272 }, { "epoch": 24.150943396226417, "grad_norm": 4.2943196296691895, "learning_rate": 4.213836477987422e-05, "loss": 0.1458, "step": 1280 }, { "epoch": 24.339622641509433, "grad_norm": 4.514562129974365, "learning_rate": 4.2033542976939204e-05, "loss": 0.159, "step": 1290 }, { "epoch": 24.528301886792452, "grad_norm": 7.0057454109191895, "learning_rate": 4.192872117400419e-05, "loss": 0.1507, "step": 1300 }, { "epoch": 24.71698113207547, "grad_norm": 5.325743198394775, "learning_rate": 4.1823899371069185e-05, "loss": 0.201, "step": 1310 }, { "epoch": 24.90566037735849, "grad_norm": 4.204189300537109, "learning_rate": 4.171907756813417e-05, "loss": 0.174, "step": 1320 }, { "epoch": 25.0, "eval_accuracy": 0.9538567493112947, "eval_loss": 0.15548603236675262, "eval_runtime": 20.4244, "eval_samples_per_second": 142.183, "eval_steps_per_second": 4.455, "step": 1325 }, { "epoch": 25.09433962264151, "grad_norm": 5.918984413146973, "learning_rate": 4.161425576519916e-05, "loss": 0.1604, "step": 1330 }, { "epoch": 25.28301886792453, "grad_norm": 3.3942575454711914, "learning_rate": 4.150943396226415e-05, "loss": 0.1297, "step": 1340 }, { "epoch": 25.471698113207548, "grad_norm": 2.6150972843170166, "learning_rate": 4.140461215932914e-05, "loss": 0.1407, "step": 1350 }, { "epoch": 25.660377358490567, "grad_norm": 4.893429756164551, "learning_rate": 4.129979035639413e-05, "loss": 0.1637, "step": 1360 }, { "epoch": 25.849056603773583, "grad_norm": 5.928979396820068, "learning_rate": 4.119496855345912e-05, "loss": 0.1657, "step": 1370 }, { "epoch": 26.0, "eval_accuracy": 0.9493801652892562, "eval_loss": 0.15420162677764893, "eval_runtime": 20.5055, "eval_samples_per_second": 141.62, "eval_steps_per_second": 4.438, "step": 1378 }, { "epoch": 26.037735849056602, "grad_norm": 3.1796255111694336, "learning_rate": 4.109014675052411e-05, "loss": 0.1548, "step": 1380 }, { "epoch": 26.22641509433962, "grad_norm": 3.422128438949585, "learning_rate": 4.09853249475891e-05, "loss": 0.1379, "step": 1390 }, { "epoch": 26.41509433962264, "grad_norm": 4.308591365814209, "learning_rate": 4.088050314465409e-05, "loss": 0.1588, "step": 1400 }, { "epoch": 26.60377358490566, "grad_norm": 5.928549766540527, "learning_rate": 4.077568134171908e-05, "loss": 0.1793, "step": 1410 }, { "epoch": 26.79245283018868, "grad_norm": 4.973033905029297, "learning_rate": 4.067085953878407e-05, "loss": 0.114, "step": 1420 }, { "epoch": 26.9811320754717, "grad_norm": 3.7284395694732666, "learning_rate": 4.0566037735849064e-05, "loss": 0.1513, "step": 1430 }, { "epoch": 27.0, "eval_accuracy": 0.9507575757575758, "eval_loss": 0.15260200202465057, "eval_runtime": 20.5592, "eval_samples_per_second": 141.251, "eval_steps_per_second": 4.426, "step": 1431 }, { "epoch": 27.169811320754718, "grad_norm": 4.743962287902832, "learning_rate": 4.046121593291405e-05, "loss": 0.1396, "step": 1440 }, { "epoch": 27.358490566037737, "grad_norm": 5.549553871154785, "learning_rate": 4.035639412997904e-05, "loss": 0.1556, "step": 1450 }, { "epoch": 27.547169811320753, "grad_norm": 4.103055953979492, "learning_rate": 4.025157232704403e-05, "loss": 0.1448, "step": 1460 }, { "epoch": 27.735849056603772, "grad_norm": 3.5048537254333496, "learning_rate": 4.014675052410902e-05, "loss": 0.1536, "step": 1470 }, { "epoch": 27.92452830188679, "grad_norm": 3.7681641578674316, "learning_rate": 4.0041928721174006e-05, "loss": 0.126, "step": 1480 }, { "epoch": 28.0, "eval_accuracy": 0.9511019283746557, "eval_loss": 0.15600712597370148, "eval_runtime": 20.7331, "eval_samples_per_second": 140.066, "eval_steps_per_second": 4.389, "step": 1484 }, { "epoch": 28.11320754716981, "grad_norm": 3.522587299346924, "learning_rate": 3.9937106918239e-05, "loss": 0.1438, "step": 1490 }, { "epoch": 28.30188679245283, "grad_norm": 3.7148783206939697, "learning_rate": 3.983228511530399e-05, "loss": 0.154, "step": 1500 }, { "epoch": 28.49056603773585, "grad_norm": 7.454770565032959, "learning_rate": 3.9727463312368974e-05, "loss": 0.1783, "step": 1510 }, { "epoch": 28.67924528301887, "grad_norm": 3.996530532836914, "learning_rate": 3.962264150943397e-05, "loss": 0.1263, "step": 1520 }, { "epoch": 28.867924528301888, "grad_norm": 5.041396141052246, "learning_rate": 3.9517819706498955e-05, "loss": 0.1508, "step": 1530 }, { "epoch": 29.0, "eval_accuracy": 0.9480027548209367, "eval_loss": 0.16071830689907074, "eval_runtime": 21.0223, "eval_samples_per_second": 138.139, "eval_steps_per_second": 4.329, "step": 1537 }, { "epoch": 29.056603773584907, "grad_norm": 3.525146007537842, "learning_rate": 3.941299790356394e-05, "loss": 0.1421, "step": 1540 }, { "epoch": 29.245283018867923, "grad_norm": 5.157039642333984, "learning_rate": 3.9308176100628936e-05, "loss": 0.1276, "step": 1550 }, { "epoch": 29.433962264150942, "grad_norm": 6.565103054046631, "learning_rate": 3.920335429769392e-05, "loss": 0.1662, "step": 1560 }, { "epoch": 29.62264150943396, "grad_norm": 3.140432357788086, "learning_rate": 3.909853249475891e-05, "loss": 0.1446, "step": 1570 }, { "epoch": 29.81132075471698, "grad_norm": 3.157646656036377, "learning_rate": 3.8993710691823904e-05, "loss": 0.1227, "step": 1580 }, { "epoch": 30.0, "grad_norm": 3.9616310596466064, "learning_rate": 3.888888888888889e-05, "loss": 0.1368, "step": 1590 }, { "epoch": 30.0, "eval_accuracy": 0.9435261707988981, "eval_loss": 0.17288929224014282, "eval_runtime": 20.8012, "eval_samples_per_second": 139.608, "eval_steps_per_second": 4.375, "step": 1590 }, { "epoch": 30.18867924528302, "grad_norm": 2.845510482788086, "learning_rate": 3.878406708595388e-05, "loss": 0.1336, "step": 1600 }, { "epoch": 30.37735849056604, "grad_norm": 3.7001242637634277, "learning_rate": 3.867924528301887e-05, "loss": 0.1202, "step": 1610 }, { "epoch": 30.566037735849058, "grad_norm": 2.4213449954986572, "learning_rate": 3.857442348008386e-05, "loss": 0.1529, "step": 1620 }, { "epoch": 30.754716981132077, "grad_norm": 3.6384825706481934, "learning_rate": 3.8469601677148846e-05, "loss": 0.1128, "step": 1630 }, { "epoch": 30.943396226415093, "grad_norm": 9.870887756347656, "learning_rate": 3.836477987421384e-05, "loss": 0.1166, "step": 1640 }, { "epoch": 31.0, "eval_accuracy": 0.953168044077135, "eval_loss": 0.155534565448761, "eval_runtime": 20.9232, "eval_samples_per_second": 138.793, "eval_steps_per_second": 4.349, "step": 1643 }, { "epoch": 31.132075471698112, "grad_norm": 7.167893886566162, "learning_rate": 3.825995807127883e-05, "loss": 0.1234, "step": 1650 }, { "epoch": 31.32075471698113, "grad_norm": 5.442743301391602, "learning_rate": 3.8155136268343814e-05, "loss": 0.1565, "step": 1660 }, { "epoch": 31.50943396226415, "grad_norm": 4.499869346618652, "learning_rate": 3.805031446540881e-05, "loss": 0.1191, "step": 1670 }, { "epoch": 31.69811320754717, "grad_norm": 4.722647666931152, "learning_rate": 3.7945492662473795e-05, "loss": 0.1239, "step": 1680 }, { "epoch": 31.88679245283019, "grad_norm": 3.9234910011291504, "learning_rate": 3.784067085953878e-05, "loss": 0.1076, "step": 1690 }, { "epoch": 32.0, "eval_accuracy": 0.9579889807162535, "eval_loss": 0.14003022015094757, "eval_runtime": 21.2564, "eval_samples_per_second": 136.618, "eval_steps_per_second": 4.281, "step": 1696 }, { "epoch": 32.075471698113205, "grad_norm": 5.3656721115112305, "learning_rate": 3.7735849056603776e-05, "loss": 0.1086, "step": 1700 }, { "epoch": 32.264150943396224, "grad_norm": 2.3153514862060547, "learning_rate": 3.763102725366876e-05, "loss": 0.1109, "step": 1710 }, { "epoch": 32.45283018867924, "grad_norm": 6.487193584442139, "learning_rate": 3.752620545073376e-05, "loss": 0.1296, "step": 1720 }, { "epoch": 32.64150943396226, "grad_norm": 4.362462997436523, "learning_rate": 3.7421383647798744e-05, "loss": 0.1078, "step": 1730 }, { "epoch": 32.83018867924528, "grad_norm": 4.543455123901367, "learning_rate": 3.731656184486374e-05, "loss": 0.1189, "step": 1740 }, { "epoch": 33.0, "eval_accuracy": 0.9590220385674931, "eval_loss": 0.14192205667495728, "eval_runtime": 21.093, "eval_samples_per_second": 137.676, "eval_steps_per_second": 4.314, "step": 1749 }, { "epoch": 33.0188679245283, "grad_norm": 3.2465994358062744, "learning_rate": 3.7211740041928725e-05, "loss": 0.1553, "step": 1750 }, { "epoch": 33.20754716981132, "grad_norm": 3.4110634326934814, "learning_rate": 3.710691823899371e-05, "loss": 0.1123, "step": 1760 }, { "epoch": 33.39622641509434, "grad_norm": 6.8291802406311035, "learning_rate": 3.7002096436058706e-05, "loss": 0.1288, "step": 1770 }, { "epoch": 33.58490566037736, "grad_norm": 5.650381565093994, "learning_rate": 3.689727463312369e-05, "loss": 0.1498, "step": 1780 }, { "epoch": 33.77358490566038, "grad_norm": 4.716341018676758, "learning_rate": 3.679245283018868e-05, "loss": 0.137, "step": 1790 }, { "epoch": 33.9622641509434, "grad_norm": 4.079151153564453, "learning_rate": 3.6687631027253674e-05, "loss": 0.1512, "step": 1800 }, { "epoch": 34.0, "eval_accuracy": 0.9579889807162535, "eval_loss": 0.13637615740299225, "eval_runtime": 21.2019, "eval_samples_per_second": 136.969, "eval_steps_per_second": 4.292, "step": 1802 }, { "epoch": 34.15094339622642, "grad_norm": 3.646527051925659, "learning_rate": 3.658280922431866e-05, "loss": 0.1018, "step": 1810 }, { "epoch": 34.339622641509436, "grad_norm": 6.170238971710205, "learning_rate": 3.647798742138365e-05, "loss": 0.1085, "step": 1820 }, { "epoch": 34.528301886792455, "grad_norm": 3.559018611907959, "learning_rate": 3.637316561844864e-05, "loss": 0.115, "step": 1830 }, { "epoch": 34.716981132075475, "grad_norm": 5.245954990386963, "learning_rate": 3.626834381551363e-05, "loss": 0.1183, "step": 1840 }, { "epoch": 34.905660377358494, "grad_norm": 4.156854629516602, "learning_rate": 3.6163522012578616e-05, "loss": 0.1323, "step": 1850 }, { "epoch": 35.0, "eval_accuracy": 0.9538567493112947, "eval_loss": 0.14969290792942047, "eval_runtime": 21.2793, "eval_samples_per_second": 136.471, "eval_steps_per_second": 4.276, "step": 1855 }, { "epoch": 35.094339622641506, "grad_norm": 4.309362888336182, "learning_rate": 3.605870020964361e-05, "loss": 0.1268, "step": 1860 }, { "epoch": 35.283018867924525, "grad_norm": 3.494779348373413, "learning_rate": 3.59538784067086e-05, "loss": 0.1339, "step": 1870 }, { "epoch": 35.471698113207545, "grad_norm": 4.567333221435547, "learning_rate": 3.5849056603773584e-05, "loss": 0.1003, "step": 1880 }, { "epoch": 35.660377358490564, "grad_norm": 5.073373317718506, "learning_rate": 3.574423480083858e-05, "loss": 0.1413, "step": 1890 }, { "epoch": 35.84905660377358, "grad_norm": 3.0305075645446777, "learning_rate": 3.5639412997903565e-05, "loss": 0.1031, "step": 1900 }, { "epoch": 36.0, "eval_accuracy": 0.9579889807162535, "eval_loss": 0.14369449019432068, "eval_runtime": 21.3446, "eval_samples_per_second": 136.053, "eval_steps_per_second": 4.263, "step": 1908 }, { "epoch": 36.0377358490566, "grad_norm": 2.8232624530792236, "learning_rate": 3.553459119496855e-05, "loss": 0.1213, "step": 1910 }, { "epoch": 36.22641509433962, "grad_norm": 3.2020962238311768, "learning_rate": 3.5429769392033546e-05, "loss": 0.0914, "step": 1920 }, { "epoch": 36.41509433962264, "grad_norm": 4.236616134643555, "learning_rate": 3.532494758909853e-05, "loss": 0.1012, "step": 1930 }, { "epoch": 36.60377358490566, "grad_norm": 4.817173480987549, "learning_rate": 3.522012578616352e-05, "loss": 0.1082, "step": 1940 }, { "epoch": 36.79245283018868, "grad_norm": 3.9018845558166504, "learning_rate": 3.5115303983228514e-05, "loss": 0.1074, "step": 1950 }, { "epoch": 36.9811320754717, "grad_norm": 5.009905815124512, "learning_rate": 3.50104821802935e-05, "loss": 0.1215, "step": 1960 }, { "epoch": 37.0, "eval_accuracy": 0.9559228650137741, "eval_loss": 0.14596055448055267, "eval_runtime": 21.2599, "eval_samples_per_second": 136.595, "eval_steps_per_second": 4.28, "step": 1961 }, { "epoch": 37.16981132075472, "grad_norm": 2.7128424644470215, "learning_rate": 3.490566037735849e-05, "loss": 0.1129, "step": 1970 }, { "epoch": 37.35849056603774, "grad_norm": 4.401316165924072, "learning_rate": 3.480083857442348e-05, "loss": 0.1205, "step": 1980 }, { "epoch": 37.54716981132076, "grad_norm": 5.2666778564453125, "learning_rate": 3.469601677148847e-05, "loss": 0.127, "step": 1990 }, { "epoch": 37.735849056603776, "grad_norm": 2.7217955589294434, "learning_rate": 3.4591194968553456e-05, "loss": 0.1068, "step": 2000 }, { "epoch": 37.924528301886795, "grad_norm": 3.7162227630615234, "learning_rate": 3.448637316561845e-05, "loss": 0.1069, "step": 2010 }, { "epoch": 38.0, "eval_accuracy": 0.9600550964187328, "eval_loss": 0.13623014092445374, "eval_runtime": 21.3213, "eval_samples_per_second": 136.202, "eval_steps_per_second": 4.268, "step": 2014 }, { "epoch": 38.113207547169814, "grad_norm": 4.025697708129883, "learning_rate": 3.438155136268344e-05, "loss": 0.1095, "step": 2020 }, { "epoch": 38.301886792452834, "grad_norm": 9.188610076904297, "learning_rate": 3.4276729559748424e-05, "loss": 0.1233, "step": 2030 }, { "epoch": 38.490566037735846, "grad_norm": 4.473904609680176, "learning_rate": 3.417190775681342e-05, "loss": 0.1043, "step": 2040 }, { "epoch": 38.679245283018865, "grad_norm": 3.2655704021453857, "learning_rate": 3.4067085953878405e-05, "loss": 0.0906, "step": 2050 }, { "epoch": 38.867924528301884, "grad_norm": 5.039525985717773, "learning_rate": 3.39622641509434e-05, "loss": 0.129, "step": 2060 }, { "epoch": 39.0, "eval_accuracy": 0.9590220385674931, "eval_loss": 0.1490471512079239, "eval_runtime": 21.334, "eval_samples_per_second": 136.12, "eval_steps_per_second": 4.265, "step": 2067 }, { "epoch": 39.056603773584904, "grad_norm": 3.194096326828003, "learning_rate": 3.3857442348008386e-05, "loss": 0.0893, "step": 2070 }, { "epoch": 39.24528301886792, "grad_norm": 1.9600547552108765, "learning_rate": 3.375262054507338e-05, "loss": 0.103, "step": 2080 }, { "epoch": 39.43396226415094, "grad_norm": 3.6588046550750732, "learning_rate": 3.364779874213837e-05, "loss": 0.1004, "step": 2090 }, { "epoch": 39.62264150943396, "grad_norm": 5.536744594573975, "learning_rate": 3.354297693920336e-05, "loss": 0.1163, "step": 2100 }, { "epoch": 39.81132075471698, "grad_norm": 2.2924985885620117, "learning_rate": 3.343815513626835e-05, "loss": 0.0862, "step": 2110 }, { "epoch": 40.0, "grad_norm": 4.559634685516357, "learning_rate": 3.3333333333333335e-05, "loss": 0.1202, "step": 2120 }, { "epoch": 40.0, "eval_accuracy": 0.9545454545454546, "eval_loss": 0.16163212060928345, "eval_runtime": 21.3342, "eval_samples_per_second": 136.119, "eval_steps_per_second": 4.265, "step": 2120 }, { "epoch": 40.18867924528302, "grad_norm": 3.6690125465393066, "learning_rate": 3.322851153039833e-05, "loss": 0.111, "step": 2130 }, { "epoch": 40.37735849056604, "grad_norm": 4.344234943389893, "learning_rate": 3.3123689727463316e-05, "loss": 0.1072, "step": 2140 }, { "epoch": 40.56603773584906, "grad_norm": 3.5178353786468506, "learning_rate": 3.30188679245283e-05, "loss": 0.1036, "step": 2150 }, { "epoch": 40.75471698113208, "grad_norm": 4.753892421722412, "learning_rate": 3.29140461215933e-05, "loss": 0.1173, "step": 2160 }, { "epoch": 40.943396226415096, "grad_norm": 3.2936408519744873, "learning_rate": 3.2809224318658284e-05, "loss": 0.1011, "step": 2170 }, { "epoch": 41.0, "eval_accuracy": 0.9569559228650137, "eval_loss": 0.15179601311683655, "eval_runtime": 21.3331, "eval_samples_per_second": 136.127, "eval_steps_per_second": 4.266, "step": 2173 }, { "epoch": 41.132075471698116, "grad_norm": 2.5210537910461426, "learning_rate": 3.270440251572327e-05, "loss": 0.0855, "step": 2180 }, { "epoch": 41.320754716981135, "grad_norm": 4.329328536987305, "learning_rate": 3.2599580712788265e-05, "loss": 0.1184, "step": 2190 }, { "epoch": 41.509433962264154, "grad_norm": 7.0313801765441895, "learning_rate": 3.249475890985325e-05, "loss": 0.1234, "step": 2200 }, { "epoch": 41.698113207547166, "grad_norm": 5.2681884765625, "learning_rate": 3.238993710691824e-05, "loss": 0.1292, "step": 2210 }, { "epoch": 41.886792452830186, "grad_norm": 3.91233229637146, "learning_rate": 3.228511530398323e-05, "loss": 0.1092, "step": 2220 }, { "epoch": 42.0, "eval_accuracy": 0.9617768595041323, "eval_loss": 0.13080263137817383, "eval_runtime": 21.5397, "eval_samples_per_second": 134.821, "eval_steps_per_second": 4.225, "step": 2226 }, { "epoch": 42.075471698113205, "grad_norm": 3.9293324947357178, "learning_rate": 3.218029350104822e-05, "loss": 0.0893, "step": 2230 }, { "epoch": 42.264150943396224, "grad_norm": 3.51786732673645, "learning_rate": 3.207547169811321e-05, "loss": 0.09, "step": 2240 }, { "epoch": 42.45283018867924, "grad_norm": 3.4958715438842773, "learning_rate": 3.19706498951782e-05, "loss": 0.1196, "step": 2250 }, { "epoch": 42.64150943396226, "grad_norm": 3.3699843883514404, "learning_rate": 3.186582809224319e-05, "loss": 0.1042, "step": 2260 }, { "epoch": 42.83018867924528, "grad_norm": 3.706667423248291, "learning_rate": 3.1761006289308175e-05, "loss": 0.1163, "step": 2270 }, { "epoch": 43.0, "eval_accuracy": 0.9590220385674931, "eval_loss": 0.14582620561122894, "eval_runtime": 21.7777, "eval_samples_per_second": 133.347, "eval_steps_per_second": 4.179, "step": 2279 }, { "epoch": 43.0188679245283, "grad_norm": 3.969252347946167, "learning_rate": 3.165618448637317e-05, "loss": 0.0884, "step": 2280 }, { "epoch": 43.20754716981132, "grad_norm": 5.795734882354736, "learning_rate": 3.1551362683438156e-05, "loss": 0.1031, "step": 2290 }, { "epoch": 43.39622641509434, "grad_norm": 2.936450481414795, "learning_rate": 3.144654088050314e-05, "loss": 0.1086, "step": 2300 }, { "epoch": 43.58490566037736, "grad_norm": 2.310685634613037, "learning_rate": 3.134171907756814e-05, "loss": 0.1055, "step": 2310 }, { "epoch": 43.77358490566038, "grad_norm": 5.497471809387207, "learning_rate": 3.1236897274633124e-05, "loss": 0.0898, "step": 2320 }, { "epoch": 43.9622641509434, "grad_norm": 3.115891456604004, "learning_rate": 3.113207547169811e-05, "loss": 0.1074, "step": 2330 }, { "epoch": 44.0, "eval_accuracy": 0.9548898071625345, "eval_loss": 0.14139670133590698, "eval_runtime": 21.5889, "eval_samples_per_second": 134.513, "eval_steps_per_second": 4.215, "step": 2332 }, { "epoch": 44.15094339622642, "grad_norm": 3.430103063583374, "learning_rate": 3.1027253668763105e-05, "loss": 0.0852, "step": 2340 }, { "epoch": 44.339622641509436, "grad_norm": 3.0305774211883545, "learning_rate": 3.092243186582809e-05, "loss": 0.0818, "step": 2350 }, { "epoch": 44.528301886792455, "grad_norm": 2.8462817668914795, "learning_rate": 3.081761006289308e-05, "loss": 0.0893, "step": 2360 }, { "epoch": 44.716981132075475, "grad_norm": 2.0366451740264893, "learning_rate": 3.071278825995807e-05, "loss": 0.088, "step": 2370 }, { "epoch": 44.905660377358494, "grad_norm": 2.5457682609558105, "learning_rate": 3.060796645702306e-05, "loss": 0.0814, "step": 2380 }, { "epoch": 45.0, "eval_accuracy": 0.9579889807162535, "eval_loss": 0.15091215074062347, "eval_runtime": 21.5296, "eval_samples_per_second": 134.884, "eval_steps_per_second": 4.227, "step": 2385 }, { "epoch": 45.094339622641506, "grad_norm": 2.5898990631103516, "learning_rate": 3.050314465408805e-05, "loss": 0.1118, "step": 2390 }, { "epoch": 45.283018867924525, "grad_norm": 4.282632350921631, "learning_rate": 3.0398322851153044e-05, "loss": 0.0861, "step": 2400 }, { "epoch": 45.471698113207545, "grad_norm": 3.0017223358154297, "learning_rate": 3.029350104821803e-05, "loss": 0.0882, "step": 2410 }, { "epoch": 45.660377358490564, "grad_norm": 5.122268199920654, "learning_rate": 3.018867924528302e-05, "loss": 0.0847, "step": 2420 }, { "epoch": 45.84905660377358, "grad_norm": 3.8469204902648926, "learning_rate": 3.0083857442348012e-05, "loss": 0.0985, "step": 2430 }, { "epoch": 46.0, "eval_accuracy": 0.9628099173553719, "eval_loss": 0.12866026163101196, "eval_runtime": 21.7086, "eval_samples_per_second": 133.772, "eval_steps_per_second": 4.192, "step": 2438 }, { "epoch": 46.0377358490566, "grad_norm": 3.621717691421509, "learning_rate": 2.9979035639413e-05, "loss": 0.0899, "step": 2440 }, { "epoch": 46.22641509433962, "grad_norm": 5.262154579162598, "learning_rate": 2.9874213836477987e-05, "loss": 0.1056, "step": 2450 }, { "epoch": 46.41509433962264, "grad_norm": 3.9734673500061035, "learning_rate": 2.976939203354298e-05, "loss": 0.0903, "step": 2460 }, { "epoch": 46.60377358490566, "grad_norm": 5.937262535095215, "learning_rate": 2.9664570230607968e-05, "loss": 0.0728, "step": 2470 }, { "epoch": 46.79245283018868, "grad_norm": 5.732816219329834, "learning_rate": 2.9559748427672958e-05, "loss": 0.1082, "step": 2480 }, { "epoch": 46.9811320754717, "grad_norm": 4.877685070037842, "learning_rate": 2.945492662473795e-05, "loss": 0.0863, "step": 2490 }, { "epoch": 47.0, "eval_accuracy": 0.962465564738292, "eval_loss": 0.12769892811775208, "eval_runtime": 21.6316, "eval_samples_per_second": 134.248, "eval_steps_per_second": 4.207, "step": 2491 }, { "epoch": 47.16981132075472, "grad_norm": 4.740653991699219, "learning_rate": 2.935010482180294e-05, "loss": 0.0924, "step": 2500 }, { "epoch": 47.35849056603774, "grad_norm": 3.5254249572753906, "learning_rate": 2.9245283018867926e-05, "loss": 0.0829, "step": 2510 }, { "epoch": 47.54716981132076, "grad_norm": 2.8752875328063965, "learning_rate": 2.9140461215932913e-05, "loss": 0.0917, "step": 2520 }, { "epoch": 47.735849056603776, "grad_norm": 3.466445207595825, "learning_rate": 2.9035639412997907e-05, "loss": 0.0683, "step": 2530 }, { "epoch": 47.924528301886795, "grad_norm": 4.896220684051514, "learning_rate": 2.8930817610062894e-05, "loss": 0.0932, "step": 2540 }, { "epoch": 48.0, "eval_accuracy": 0.9559228650137741, "eval_loss": 0.14526014029979706, "eval_runtime": 21.6957, "eval_samples_per_second": 133.851, "eval_steps_per_second": 4.194, "step": 2544 }, { "epoch": 48.113207547169814, "grad_norm": 2.655581474304199, "learning_rate": 2.882599580712788e-05, "loss": 0.0709, "step": 2550 }, { "epoch": 48.301886792452834, "grad_norm": 3.7453079223632812, "learning_rate": 2.8721174004192875e-05, "loss": 0.0856, "step": 2560 }, { "epoch": 48.490566037735846, "grad_norm": 4.565659999847412, "learning_rate": 2.8616352201257862e-05, "loss": 0.0737, "step": 2570 }, { "epoch": 48.679245283018865, "grad_norm": 2.7404532432556152, "learning_rate": 2.851153039832285e-05, "loss": 0.0843, "step": 2580 }, { "epoch": 48.867924528301884, "grad_norm": 2.5788304805755615, "learning_rate": 2.8406708595387843e-05, "loss": 0.0863, "step": 2590 }, { "epoch": 49.0, "eval_accuracy": 0.9566115702479339, "eval_loss": 0.15200072526931763, "eval_runtime": 21.6506, "eval_samples_per_second": 134.13, "eval_steps_per_second": 4.203, "step": 2597 }, { "epoch": 49.056603773584904, "grad_norm": 2.0969185829162598, "learning_rate": 2.830188679245283e-05, "loss": 0.0779, "step": 2600 }, { "epoch": 49.24528301886792, "grad_norm": 4.123626232147217, "learning_rate": 2.8197064989517817e-05, "loss": 0.1003, "step": 2610 }, { "epoch": 49.43396226415094, "grad_norm": 3.9485299587249756, "learning_rate": 2.809224318658281e-05, "loss": 0.0862, "step": 2620 }, { "epoch": 49.62264150943396, "grad_norm": 3.080941915512085, "learning_rate": 2.7987421383647798e-05, "loss": 0.072, "step": 2630 }, { "epoch": 49.81132075471698, "grad_norm": 3.656919002532959, "learning_rate": 2.788259958071279e-05, "loss": 0.0945, "step": 2640 }, { "epoch": 50.0, "grad_norm": 2.570844888687134, "learning_rate": 2.777777777777778e-05, "loss": 0.0887, "step": 2650 }, { "epoch": 50.0, "eval_accuracy": 0.9655647382920111, "eval_loss": 0.12789078056812286, "eval_runtime": 21.6593, "eval_samples_per_second": 134.076, "eval_steps_per_second": 4.201, "step": 2650 }, { "epoch": 50.18867924528302, "grad_norm": 2.957178831100464, "learning_rate": 2.767295597484277e-05, "loss": 0.0693, "step": 2660 }, { "epoch": 50.37735849056604, "grad_norm": 4.276056289672852, "learning_rate": 2.7568134171907757e-05, "loss": 0.0892, "step": 2670 }, { "epoch": 50.56603773584906, "grad_norm": 4.443928241729736, "learning_rate": 2.746331236897275e-05, "loss": 0.081, "step": 2680 }, { "epoch": 50.75471698113208, "grad_norm": 2.7647054195404053, "learning_rate": 2.7358490566037738e-05, "loss": 0.0606, "step": 2690 }, { "epoch": 50.943396226415096, "grad_norm": 5.863194942474365, "learning_rate": 2.7253668763102725e-05, "loss": 0.0744, "step": 2700 }, { "epoch": 51.0, "eval_accuracy": 0.9566115702479339, "eval_loss": 0.15517625212669373, "eval_runtime": 21.6847, "eval_samples_per_second": 133.919, "eval_steps_per_second": 4.196, "step": 2703 }, { "epoch": 51.132075471698116, "grad_norm": 3.68581485748291, "learning_rate": 2.714884696016772e-05, "loss": 0.0757, "step": 2710 }, { "epoch": 51.320754716981135, "grad_norm": 6.279058933258057, "learning_rate": 2.7044025157232706e-05, "loss": 0.0839, "step": 2720 }, { "epoch": 51.509433962264154, "grad_norm": 3.6847403049468994, "learning_rate": 2.6939203354297693e-05, "loss": 0.0754, "step": 2730 }, { "epoch": 51.698113207547166, "grad_norm": 3.514678716659546, "learning_rate": 2.6834381551362687e-05, "loss": 0.0717, "step": 2740 }, { "epoch": 51.886792452830186, "grad_norm": 5.7339768409729, "learning_rate": 2.6729559748427674e-05, "loss": 0.0928, "step": 2750 }, { "epoch": 52.0, "eval_accuracy": 0.9621212121212122, "eval_loss": 0.14646016061306, "eval_runtime": 21.7451, "eval_samples_per_second": 133.547, "eval_steps_per_second": 4.185, "step": 2756 }, { "epoch": 52.075471698113205, "grad_norm": 2.6166908740997314, "learning_rate": 2.662473794549266e-05, "loss": 0.0622, "step": 2760 }, { "epoch": 52.264150943396224, "grad_norm": 4.80858850479126, "learning_rate": 2.6519916142557655e-05, "loss": 0.0869, "step": 2770 }, { "epoch": 52.45283018867924, "grad_norm": 4.928915500640869, "learning_rate": 2.641509433962264e-05, "loss": 0.0796, "step": 2780 }, { "epoch": 52.64150943396226, "grad_norm": 4.719991207122803, "learning_rate": 2.631027253668763e-05, "loss": 0.0789, "step": 2790 }, { "epoch": 52.83018867924528, "grad_norm": 3.452692747116089, "learning_rate": 2.6205450733752623e-05, "loss": 0.0776, "step": 2800 }, { "epoch": 53.0, "eval_accuracy": 0.9583333333333334, "eval_loss": 0.15753231942653656, "eval_runtime": 21.7056, "eval_samples_per_second": 133.79, "eval_steps_per_second": 4.192, "step": 2809 }, { "epoch": 53.0188679245283, "grad_norm": 3.827479362487793, "learning_rate": 2.610062893081761e-05, "loss": 0.0614, "step": 2810 }, { "epoch": 53.20754716981132, "grad_norm": 3.628530979156494, "learning_rate": 2.59958071278826e-05, "loss": 0.0836, "step": 2820 }, { "epoch": 53.39622641509434, "grad_norm": 5.494470119476318, "learning_rate": 2.589098532494759e-05, "loss": 0.07, "step": 2830 }, { "epoch": 53.58490566037736, "grad_norm": 2.920034646987915, "learning_rate": 2.578616352201258e-05, "loss": 0.0861, "step": 2840 }, { "epoch": 53.77358490566038, "grad_norm": 2.367824077606201, "learning_rate": 2.5681341719077568e-05, "loss": 0.0824, "step": 2850 }, { "epoch": 53.9622641509434, "grad_norm": 4.855032444000244, "learning_rate": 2.5576519916142562e-05, "loss": 0.088, "step": 2860 }, { "epoch": 54.0, "eval_accuracy": 0.956267217630854, "eval_loss": 0.16139821708202362, "eval_runtime": 21.715, "eval_samples_per_second": 133.733, "eval_steps_per_second": 4.191, "step": 2862 }, { "epoch": 54.15094339622642, "grad_norm": 4.724045753479004, "learning_rate": 2.547169811320755e-05, "loss": 0.076, "step": 2870 }, { "epoch": 54.339622641509436, "grad_norm": 3.425045967102051, "learning_rate": 2.5366876310272536e-05, "loss": 0.0882, "step": 2880 }, { "epoch": 54.528301886792455, "grad_norm": 2.366727590560913, "learning_rate": 2.526205450733753e-05, "loss": 0.0492, "step": 2890 }, { "epoch": 54.716981132075475, "grad_norm": 3.3081116676330566, "learning_rate": 2.5157232704402517e-05, "loss": 0.0755, "step": 2900 }, { "epoch": 54.905660377358494, "grad_norm": 4.172368049621582, "learning_rate": 2.5052410901467504e-05, "loss": 0.0909, "step": 2910 }, { "epoch": 55.0, "eval_accuracy": 0.9638429752066116, "eval_loss": 0.13122335076332092, "eval_runtime": 22.1096, "eval_samples_per_second": 131.346, "eval_steps_per_second": 4.116, "step": 2915 }, { "epoch": 55.094339622641506, "grad_norm": 4.725098609924316, "learning_rate": 2.4947589098532495e-05, "loss": 0.0612, "step": 2920 }, { "epoch": 55.283018867924525, "grad_norm": 2.416428327560425, "learning_rate": 2.4842767295597485e-05, "loss": 0.067, "step": 2930 }, { "epoch": 55.471698113207545, "grad_norm": 4.181457042694092, "learning_rate": 2.4737945492662476e-05, "loss": 0.0771, "step": 2940 }, { "epoch": 55.660377358490564, "grad_norm": 5.096282005310059, "learning_rate": 2.4633123689727463e-05, "loss": 0.0747, "step": 2950 }, { "epoch": 55.84905660377358, "grad_norm": 5.836996555328369, "learning_rate": 2.4528301886792453e-05, "loss": 0.089, "step": 2960 }, { "epoch": 56.0, "eval_accuracy": 0.9652203856749312, "eval_loss": 0.13570785522460938, "eval_runtime": 21.9675, "eval_samples_per_second": 132.195, "eval_steps_per_second": 4.142, "step": 2968 }, { "epoch": 56.0377358490566, "grad_norm": 3.5151453018188477, "learning_rate": 2.4423480083857444e-05, "loss": 0.0653, "step": 2970 }, { "epoch": 56.22641509433962, "grad_norm": 5.056783676147461, "learning_rate": 2.431865828092243e-05, "loss": 0.0768, "step": 2980 }, { "epoch": 56.41509433962264, "grad_norm": 2.0527374744415283, "learning_rate": 2.421383647798742e-05, "loss": 0.0694, "step": 2990 }, { "epoch": 56.60377358490566, "grad_norm": 3.363852024078369, "learning_rate": 2.4109014675052412e-05, "loss": 0.0763, "step": 3000 }, { "epoch": 56.79245283018868, "grad_norm": 2.2541282176971436, "learning_rate": 2.4004192872117402e-05, "loss": 0.0717, "step": 3010 }, { "epoch": 56.9811320754717, "grad_norm": 1.5099104642868042, "learning_rate": 2.3899371069182393e-05, "loss": 0.0587, "step": 3020 }, { "epoch": 57.0, "eval_accuracy": 0.9614325068870524, "eval_loss": 0.15099208056926727, "eval_runtime": 21.4231, "eval_samples_per_second": 135.554, "eval_steps_per_second": 4.248, "step": 3021 }, { "epoch": 57.16981132075472, "grad_norm": 3.0756702423095703, "learning_rate": 2.3794549266247383e-05, "loss": 0.0931, "step": 3030 }, { "epoch": 57.35849056603774, "grad_norm": 5.203828811645508, "learning_rate": 2.368972746331237e-05, "loss": 0.077, "step": 3040 }, { "epoch": 57.54716981132076, "grad_norm": 3.4697303771972656, "learning_rate": 2.358490566037736e-05, "loss": 0.0773, "step": 3050 }, { "epoch": 57.735849056603776, "grad_norm": 5.211211681365967, "learning_rate": 2.348008385744235e-05, "loss": 0.0829, "step": 3060 }, { "epoch": 57.924528301886795, "grad_norm": 3.6697630882263184, "learning_rate": 2.3375262054507338e-05, "loss": 0.0931, "step": 3070 }, { "epoch": 58.0, "eval_accuracy": 0.9579889807162535, "eval_loss": 0.14657209813594818, "eval_runtime": 22.1099, "eval_samples_per_second": 131.344, "eval_steps_per_second": 4.116, "step": 3074 }, { "epoch": 58.113207547169814, "grad_norm": 5.555974960327148, "learning_rate": 2.327044025157233e-05, "loss": 0.0583, "step": 3080 }, { "epoch": 58.301886792452834, "grad_norm": 4.070455551147461, "learning_rate": 2.316561844863732e-05, "loss": 0.0746, "step": 3090 }, { "epoch": 58.490566037735846, "grad_norm": 2.739751100540161, "learning_rate": 2.3060796645702306e-05, "loss": 0.0698, "step": 3100 }, { "epoch": 58.679245283018865, "grad_norm": 4.419424057006836, "learning_rate": 2.2955974842767297e-05, "loss": 0.0669, "step": 3110 }, { "epoch": 58.867924528301884, "grad_norm": 2.998007297515869, "learning_rate": 2.2851153039832284e-05, "loss": 0.0878, "step": 3120 }, { "epoch": 59.0, "eval_accuracy": 0.9590220385674931, "eval_loss": 0.14993391931056976, "eval_runtime": 21.3446, "eval_samples_per_second": 136.053, "eval_steps_per_second": 4.263, "step": 3127 }, { "epoch": 59.056603773584904, "grad_norm": 3.094158411026001, "learning_rate": 2.2746331236897274e-05, "loss": 0.0679, "step": 3130 }, { "epoch": 59.24528301886792, "grad_norm": 3.320544958114624, "learning_rate": 2.2641509433962265e-05, "loss": 0.066, "step": 3140 }, { "epoch": 59.43396226415094, "grad_norm": 5.472837924957275, "learning_rate": 2.2536687631027252e-05, "loss": 0.071, "step": 3150 }, { "epoch": 59.62264150943396, "grad_norm": 2.2281177043914795, "learning_rate": 2.2431865828092242e-05, "loss": 0.075, "step": 3160 }, { "epoch": 59.81132075471698, "grad_norm": 4.7084503173828125, "learning_rate": 2.2327044025157233e-05, "loss": 0.0763, "step": 3170 }, { "epoch": 60.0, "grad_norm": 4.050378322601318, "learning_rate": 2.2222222222222223e-05, "loss": 0.0725, "step": 3180 }, { "epoch": 60.0, "eval_accuracy": 0.9597107438016529, "eval_loss": 0.15237364172935486, "eval_runtime": 21.4189, "eval_samples_per_second": 135.581, "eval_steps_per_second": 4.249, "step": 3180 }, { "epoch": 60.18867924528302, "grad_norm": 5.035741329193115, "learning_rate": 2.2117400419287214e-05, "loss": 0.0754, "step": 3190 }, { "epoch": 60.37735849056604, "grad_norm": 2.6131255626678467, "learning_rate": 2.2012578616352204e-05, "loss": 0.0727, "step": 3200 }, { "epoch": 60.56603773584906, "grad_norm": 2.404343366622925, "learning_rate": 2.190775681341719e-05, "loss": 0.055, "step": 3210 }, { "epoch": 60.75471698113208, "grad_norm": 3.9234628677368164, "learning_rate": 2.1802935010482182e-05, "loss": 0.0671, "step": 3220 }, { "epoch": 60.943396226415096, "grad_norm": 2.465709686279297, "learning_rate": 2.1698113207547172e-05, "loss": 0.0543, "step": 3230 }, { "epoch": 61.0, "eval_accuracy": 0.9583333333333334, "eval_loss": 0.15427254140377045, "eval_runtime": 21.2569, "eval_samples_per_second": 136.615, "eval_steps_per_second": 4.281, "step": 3233 }, { "epoch": 61.132075471698116, "grad_norm": 4.535091876983643, "learning_rate": 2.159329140461216e-05, "loss": 0.0529, "step": 3240 }, { "epoch": 61.320754716981135, "grad_norm": 2.738173484802246, "learning_rate": 2.148846960167715e-05, "loss": 0.0566, "step": 3250 }, { "epoch": 61.509433962264154, "grad_norm": 3.950739860534668, "learning_rate": 2.138364779874214e-05, "loss": 0.0729, "step": 3260 }, { "epoch": 61.698113207547166, "grad_norm": 2.284546375274658, "learning_rate": 2.1278825995807127e-05, "loss": 0.0555, "step": 3270 }, { "epoch": 61.886792452830186, "grad_norm": 4.242305278778076, "learning_rate": 2.1174004192872118e-05, "loss": 0.0773, "step": 3280 }, { "epoch": 62.0, "eval_accuracy": 0.9634986225895317, "eval_loss": 0.15126247704029083, "eval_runtime": 21.4863, "eval_samples_per_second": 135.156, "eval_steps_per_second": 4.235, "step": 3286 }, { "epoch": 62.075471698113205, "grad_norm": 5.880760669708252, "learning_rate": 2.106918238993711e-05, "loss": 0.0663, "step": 3290 }, { "epoch": 62.264150943396224, "grad_norm": 4.1386237144470215, "learning_rate": 2.0964360587002095e-05, "loss": 0.0643, "step": 3300 }, { "epoch": 62.45283018867924, "grad_norm": 3.798180341720581, "learning_rate": 2.0859538784067086e-05, "loss": 0.0789, "step": 3310 }, { "epoch": 62.64150943396226, "grad_norm": 2.6862704753875732, "learning_rate": 2.0754716981132076e-05, "loss": 0.0838, "step": 3320 }, { "epoch": 62.83018867924528, "grad_norm": 4.914183616638184, "learning_rate": 2.0649895178197063e-05, "loss": 0.0626, "step": 3330 }, { "epoch": 63.0, "eval_accuracy": 0.9600550964187328, "eval_loss": 0.1511116325855255, "eval_runtime": 21.3341, "eval_samples_per_second": 136.12, "eval_steps_per_second": 4.265, "step": 3339 }, { "epoch": 63.0188679245283, "grad_norm": 3.25014591217041, "learning_rate": 2.0545073375262054e-05, "loss": 0.0534, "step": 3340 }, { "epoch": 63.20754716981132, "grad_norm": 3.0906035900115967, "learning_rate": 2.0440251572327044e-05, "loss": 0.0598, "step": 3350 }, { "epoch": 63.39622641509434, "grad_norm": 3.2928597927093506, "learning_rate": 2.0335429769392035e-05, "loss": 0.0704, "step": 3360 }, { "epoch": 63.58490566037736, "grad_norm": 2.6541659832000732, "learning_rate": 2.0230607966457025e-05, "loss": 0.0541, "step": 3370 }, { "epoch": 63.77358490566038, "grad_norm": 4.403809070587158, "learning_rate": 2.0125786163522016e-05, "loss": 0.0563, "step": 3380 }, { "epoch": 63.9622641509434, "grad_norm": 4.669613361358643, "learning_rate": 2.0020964360587003e-05, "loss": 0.0649, "step": 3390 }, { "epoch": 64.0, "eval_accuracy": 0.959366391184573, "eval_loss": 0.1467009335756302, "eval_runtime": 21.2656, "eval_samples_per_second": 136.558, "eval_steps_per_second": 4.279, "step": 3392 }, { "epoch": 64.15094339622641, "grad_norm": 2.8066558837890625, "learning_rate": 1.9916142557651993e-05, "loss": 0.0746, "step": 3400 }, { "epoch": 64.33962264150944, "grad_norm": 5.994083881378174, "learning_rate": 1.9811320754716984e-05, "loss": 0.0704, "step": 3410 }, { "epoch": 64.52830188679245, "grad_norm": 4.9809441566467285, "learning_rate": 1.970649895178197e-05, "loss": 0.0773, "step": 3420 }, { "epoch": 64.71698113207547, "grad_norm": 2.497436285018921, "learning_rate": 1.960167714884696e-05, "loss": 0.0781, "step": 3430 }, { "epoch": 64.90566037735849, "grad_norm": 3.954669952392578, "learning_rate": 1.9496855345911952e-05, "loss": 0.0705, "step": 3440 }, { "epoch": 65.0, "eval_accuracy": 0.9590220385674931, "eval_loss": 0.1443195939064026, "eval_runtime": 21.2667, "eval_samples_per_second": 136.551, "eval_steps_per_second": 4.279, "step": 3445 }, { "epoch": 65.09433962264151, "grad_norm": 3.244377374649048, "learning_rate": 1.939203354297694e-05, "loss": 0.0515, "step": 3450 }, { "epoch": 65.28301886792453, "grad_norm": 2.0447616577148438, "learning_rate": 1.928721174004193e-05, "loss": 0.0602, "step": 3460 }, { "epoch": 65.47169811320755, "grad_norm": 9.920838356018066, "learning_rate": 1.918238993710692e-05, "loss": 0.072, "step": 3470 }, { "epoch": 65.66037735849056, "grad_norm": 2.986557960510254, "learning_rate": 1.9077568134171907e-05, "loss": 0.0571, "step": 3480 }, { "epoch": 65.84905660377359, "grad_norm": 3.244969606399536, "learning_rate": 1.8972746331236897e-05, "loss": 0.0737, "step": 3490 }, { "epoch": 66.0, "eval_accuracy": 0.9607438016528925, "eval_loss": 0.13612627983093262, "eval_runtime": 21.1938, "eval_samples_per_second": 137.021, "eval_steps_per_second": 4.294, "step": 3498 }, { "epoch": 66.0377358490566, "grad_norm": 3.0969536304473877, "learning_rate": 1.8867924528301888e-05, "loss": 0.0581, "step": 3500 }, { "epoch": 66.22641509433963, "grad_norm": 5.167777061462402, "learning_rate": 1.876310272536688e-05, "loss": 0.0744, "step": 3510 }, { "epoch": 66.41509433962264, "grad_norm": 3.00007700920105, "learning_rate": 1.865828092243187e-05, "loss": 0.0543, "step": 3520 }, { "epoch": 66.60377358490567, "grad_norm": 2.6348941326141357, "learning_rate": 1.8553459119496856e-05, "loss": 0.0567, "step": 3530 }, { "epoch": 66.79245283018868, "grad_norm": 5.631946563720703, "learning_rate": 1.8448637316561846e-05, "loss": 0.0779, "step": 3540 }, { "epoch": 66.98113207547169, "grad_norm": 1.5974416732788086, "learning_rate": 1.8343815513626837e-05, "loss": 0.0518, "step": 3550 }, { "epoch": 67.0, "eval_accuracy": 0.959366391184573, "eval_loss": 0.14412498474121094, "eval_runtime": 21.2408, "eval_samples_per_second": 136.718, "eval_steps_per_second": 4.284, "step": 3551 }, { "epoch": 67.16981132075472, "grad_norm": 2.0217528343200684, "learning_rate": 1.8238993710691824e-05, "loss": 0.0645, "step": 3560 }, { "epoch": 67.35849056603773, "grad_norm": 3.986748695373535, "learning_rate": 1.8134171907756814e-05, "loss": 0.0647, "step": 3570 }, { "epoch": 67.54716981132076, "grad_norm": 4.671311855316162, "learning_rate": 1.8029350104821805e-05, "loss": 0.0666, "step": 3580 }, { "epoch": 67.73584905660377, "grad_norm": 2.0372939109802246, "learning_rate": 1.7924528301886792e-05, "loss": 0.0554, "step": 3590 }, { "epoch": 67.9245283018868, "grad_norm": 2.52120041847229, "learning_rate": 1.7819706498951782e-05, "loss": 0.0502, "step": 3600 }, { "epoch": 68.0, "eval_accuracy": 0.9590220385674931, "eval_loss": 0.1534823328256607, "eval_runtime": 21.3789, "eval_samples_per_second": 135.835, "eval_steps_per_second": 4.257, "step": 3604 }, { "epoch": 68.11320754716981, "grad_norm": 1.7292485237121582, "learning_rate": 1.7714884696016773e-05, "loss": 0.0696, "step": 3610 }, { "epoch": 68.30188679245283, "grad_norm": 2.2730910778045654, "learning_rate": 1.761006289308176e-05, "loss": 0.0608, "step": 3620 }, { "epoch": 68.49056603773585, "grad_norm": 3.564232110977173, "learning_rate": 1.750524109014675e-05, "loss": 0.0604, "step": 3630 }, { "epoch": 68.67924528301887, "grad_norm": 2.5112924575805664, "learning_rate": 1.740041928721174e-05, "loss": 0.0415, "step": 3640 }, { "epoch": 68.86792452830188, "grad_norm": 5.021323204040527, "learning_rate": 1.7295597484276728e-05, "loss": 0.0701, "step": 3650 }, { "epoch": 69.0, "eval_accuracy": 0.9662534435261708, "eval_loss": 0.1362384557723999, "eval_runtime": 21.2026, "eval_samples_per_second": 136.964, "eval_steps_per_second": 4.292, "step": 3657 }, { "epoch": 69.05660377358491, "grad_norm": 5.013925075531006, "learning_rate": 1.719077568134172e-05, "loss": 0.0744, "step": 3660 }, { "epoch": 69.24528301886792, "grad_norm": 3.5540971755981445, "learning_rate": 1.708595387840671e-05, "loss": 0.0568, "step": 3670 }, { "epoch": 69.43396226415095, "grad_norm": 3.4611597061157227, "learning_rate": 1.69811320754717e-05, "loss": 0.0513, "step": 3680 }, { "epoch": 69.62264150943396, "grad_norm": 2.5300846099853516, "learning_rate": 1.687631027253669e-05, "loss": 0.0442, "step": 3690 }, { "epoch": 69.81132075471699, "grad_norm": 2.6349620819091797, "learning_rate": 1.677148846960168e-05, "loss": 0.054, "step": 3700 }, { "epoch": 70.0, "grad_norm": 3.122040033340454, "learning_rate": 1.6666666666666667e-05, "loss": 0.0826, "step": 3710 }, { "epoch": 70.0, "eval_accuracy": 0.9610881542699724, "eval_loss": 0.1492019146680832, "eval_runtime": 21.2837, "eval_samples_per_second": 136.442, "eval_steps_per_second": 4.276, "step": 3710 }, { "epoch": 70.18867924528301, "grad_norm": 3.5501227378845215, "learning_rate": 1.6561844863731658e-05, "loss": 0.0627, "step": 3720 }, { "epoch": 70.37735849056604, "grad_norm": 2.6497879028320312, "learning_rate": 1.645702306079665e-05, "loss": 0.0461, "step": 3730 }, { "epoch": 70.56603773584905, "grad_norm": 2.9843809604644775, "learning_rate": 1.6352201257861635e-05, "loss": 0.0484, "step": 3740 }, { "epoch": 70.75471698113208, "grad_norm": 8.867347717285156, "learning_rate": 1.6247379454926626e-05, "loss": 0.0595, "step": 3750 }, { "epoch": 70.94339622641509, "grad_norm": 4.957089900970459, "learning_rate": 1.6142557651991616e-05, "loss": 0.0715, "step": 3760 }, { "epoch": 71.0, "eval_accuracy": 0.962465564738292, "eval_loss": 0.16146376729011536, "eval_runtime": 21.2899, "eval_samples_per_second": 136.403, "eval_steps_per_second": 4.274, "step": 3763 }, { "epoch": 71.13207547169812, "grad_norm": 3.982633590698242, "learning_rate": 1.6037735849056604e-05, "loss": 0.0534, "step": 3770 }, { "epoch": 71.32075471698113, "grad_norm": 4.253650188446045, "learning_rate": 1.5932914046121594e-05, "loss": 0.0677, "step": 3780 }, { "epoch": 71.50943396226415, "grad_norm": 4.608425140380859, "learning_rate": 1.5828092243186584e-05, "loss": 0.0495, "step": 3790 }, { "epoch": 71.69811320754717, "grad_norm": 5.12533712387085, "learning_rate": 1.572327044025157e-05, "loss": 0.0633, "step": 3800 }, { "epoch": 71.88679245283019, "grad_norm": 4.220004558563232, "learning_rate": 1.5618448637316562e-05, "loss": 0.0635, "step": 3810 }, { "epoch": 72.0, "eval_accuracy": 0.9641873278236914, "eval_loss": 0.14879465103149414, "eval_runtime": 21.2409, "eval_samples_per_second": 136.717, "eval_steps_per_second": 4.284, "step": 3816 }, { "epoch": 72.0754716981132, "grad_norm": 1.841488242149353, "learning_rate": 1.5513626834381552e-05, "loss": 0.0608, "step": 3820 }, { "epoch": 72.26415094339623, "grad_norm": 2.0446391105651855, "learning_rate": 1.540880503144654e-05, "loss": 0.0465, "step": 3830 }, { "epoch": 72.45283018867924, "grad_norm": 3.4776628017425537, "learning_rate": 1.530398322851153e-05, "loss": 0.0575, "step": 3840 }, { "epoch": 72.64150943396227, "grad_norm": 2.3915700912475586, "learning_rate": 1.5199161425576522e-05, "loss": 0.0564, "step": 3850 }, { "epoch": 72.83018867924528, "grad_norm": 1.4522980451583862, "learning_rate": 1.509433962264151e-05, "loss": 0.0522, "step": 3860 }, { "epoch": 73.0, "eval_accuracy": 0.9621212121212122, "eval_loss": 0.14563634991645813, "eval_runtime": 21.397, "eval_samples_per_second": 135.72, "eval_steps_per_second": 4.253, "step": 3869 }, { "epoch": 73.01886792452831, "grad_norm": 2.048356056213379, "learning_rate": 1.49895178197065e-05, "loss": 0.0565, "step": 3870 }, { "epoch": 73.20754716981132, "grad_norm": 2.7343058586120605, "learning_rate": 1.488469601677149e-05, "loss": 0.0403, "step": 3880 }, { "epoch": 73.39622641509433, "grad_norm": 1.833511471748352, "learning_rate": 1.4779874213836479e-05, "loss": 0.0502, "step": 3890 }, { "epoch": 73.58490566037736, "grad_norm": 2.4152145385742188, "learning_rate": 1.467505241090147e-05, "loss": 0.0539, "step": 3900 }, { "epoch": 73.77358490566037, "grad_norm": 2.6949825286865234, "learning_rate": 1.4570230607966457e-05, "loss": 0.0499, "step": 3910 }, { "epoch": 73.9622641509434, "grad_norm": 3.1027348041534424, "learning_rate": 1.4465408805031447e-05, "loss": 0.0485, "step": 3920 }, { "epoch": 74.0, "eval_accuracy": 0.9645316804407713, "eval_loss": 0.1386471837759018, "eval_runtime": 21.3052, "eval_samples_per_second": 136.304, "eval_steps_per_second": 4.271, "step": 3922 }, { "epoch": 74.15094339622641, "grad_norm": 1.7940607070922852, "learning_rate": 1.4360587002096438e-05, "loss": 0.0541, "step": 3930 }, { "epoch": 74.33962264150944, "grad_norm": 2.581839084625244, "learning_rate": 1.4255765199161425e-05, "loss": 0.044, "step": 3940 }, { "epoch": 74.52830188679245, "grad_norm": 1.7485500574111938, "learning_rate": 1.4150943396226415e-05, "loss": 0.043, "step": 3950 }, { "epoch": 74.71698113207547, "grad_norm": 2.436922550201416, "learning_rate": 1.4046121593291406e-05, "loss": 0.0601, "step": 3960 }, { "epoch": 74.90566037735849, "grad_norm": 2.460512638092041, "learning_rate": 1.3941299790356394e-05, "loss": 0.0629, "step": 3970 }, { "epoch": 75.0, "eval_accuracy": 0.9631542699724518, "eval_loss": 0.14631205797195435, "eval_runtime": 21.2256, "eval_samples_per_second": 136.816, "eval_steps_per_second": 4.287, "step": 3975 }, { "epoch": 75.09433962264151, "grad_norm": 2.925687551498413, "learning_rate": 1.3836477987421385e-05, "loss": 0.0535, "step": 3980 }, { "epoch": 75.28301886792453, "grad_norm": 1.9862323999404907, "learning_rate": 1.3731656184486375e-05, "loss": 0.0586, "step": 3990 }, { "epoch": 75.47169811320755, "grad_norm": 1.5170574188232422, "learning_rate": 1.3626834381551362e-05, "loss": 0.0471, "step": 4000 }, { "epoch": 75.66037735849056, "grad_norm": 4.97707986831665, "learning_rate": 1.3522012578616353e-05, "loss": 0.0529, "step": 4010 }, { "epoch": 75.84905660377359, "grad_norm": 1.9036014080047607, "learning_rate": 1.3417190775681343e-05, "loss": 0.0568, "step": 4020 }, { "epoch": 76.0, "eval_accuracy": 0.9621212121212122, "eval_loss": 0.14720916748046875, "eval_runtime": 21.3087, "eval_samples_per_second": 136.282, "eval_steps_per_second": 4.271, "step": 4028 }, { "epoch": 76.0377358490566, "grad_norm": 3.9243173599243164, "learning_rate": 1.331236897274633e-05, "loss": 0.0531, "step": 4030 }, { "epoch": 76.22641509433963, "grad_norm": 3.3605711460113525, "learning_rate": 1.320754716981132e-05, "loss": 0.0625, "step": 4040 }, { "epoch": 76.41509433962264, "grad_norm": 3.699404716491699, "learning_rate": 1.3102725366876311e-05, "loss": 0.0522, "step": 4050 }, { "epoch": 76.60377358490567, "grad_norm": 4.595207691192627, "learning_rate": 1.29979035639413e-05, "loss": 0.0611, "step": 4060 }, { "epoch": 76.79245283018868, "grad_norm": 6.970567226409912, "learning_rate": 1.289308176100629e-05, "loss": 0.0641, "step": 4070 }, { "epoch": 76.98113207547169, "grad_norm": 5.223586559295654, "learning_rate": 1.2788259958071281e-05, "loss": 0.0556, "step": 4080 }, { "epoch": 77.0, "eval_accuracy": 0.9659090909090909, "eval_loss": 0.14402107894420624, "eval_runtime": 21.2908, "eval_samples_per_second": 136.397, "eval_steps_per_second": 4.274, "step": 4081 }, { "epoch": 77.16981132075472, "grad_norm": 1.7077960968017578, "learning_rate": 1.2683438155136268e-05, "loss": 0.0477, "step": 4090 }, { "epoch": 77.35849056603773, "grad_norm": 3.9550302028656006, "learning_rate": 1.2578616352201259e-05, "loss": 0.0631, "step": 4100 }, { "epoch": 77.54716981132076, "grad_norm": 3.574674129486084, "learning_rate": 1.2473794549266247e-05, "loss": 0.0357, "step": 4110 }, { "epoch": 77.73584905660377, "grad_norm": 3.119210958480835, "learning_rate": 1.2368972746331238e-05, "loss": 0.0382, "step": 4120 }, { "epoch": 77.9245283018868, "grad_norm": 1.9549647569656372, "learning_rate": 1.2264150943396227e-05, "loss": 0.0547, "step": 4130 }, { "epoch": 78.0, "eval_accuracy": 0.9634986225895317, "eval_loss": 0.14210809767246246, "eval_runtime": 21.3339, "eval_samples_per_second": 136.121, "eval_steps_per_second": 4.266, "step": 4134 }, { "epoch": 78.11320754716981, "grad_norm": 3.379382610321045, "learning_rate": 1.2159329140461215e-05, "loss": 0.0632, "step": 4140 }, { "epoch": 78.30188679245283, "grad_norm": 5.461263656616211, "learning_rate": 1.2054507337526206e-05, "loss": 0.0689, "step": 4150 }, { "epoch": 78.49056603773585, "grad_norm": 4.160185813903809, "learning_rate": 1.1949685534591196e-05, "loss": 0.054, "step": 4160 }, { "epoch": 78.67924528301887, "grad_norm": 2.50945782661438, "learning_rate": 1.1844863731656185e-05, "loss": 0.0302, "step": 4170 }, { "epoch": 78.86792452830188, "grad_norm": 3.265209197998047, "learning_rate": 1.1740041928721176e-05, "loss": 0.0527, "step": 4180 }, { "epoch": 79.0, "eval_accuracy": 0.9683195592286501, "eval_loss": 0.14441226422786713, "eval_runtime": 21.2921, "eval_samples_per_second": 136.389, "eval_steps_per_second": 4.274, "step": 4187 }, { "epoch": 79.05660377358491, "grad_norm": 1.1379858255386353, "learning_rate": 1.1635220125786164e-05, "loss": 0.0419, "step": 4190 }, { "epoch": 79.24528301886792, "grad_norm": 5.1714043617248535, "learning_rate": 1.1530398322851153e-05, "loss": 0.053, "step": 4200 }, { "epoch": 79.43396226415095, "grad_norm": 3.441499710083008, "learning_rate": 1.1425576519916142e-05, "loss": 0.0588, "step": 4210 }, { "epoch": 79.62264150943396, "grad_norm": 1.575990915298462, "learning_rate": 1.1320754716981132e-05, "loss": 0.0333, "step": 4220 }, { "epoch": 79.81132075471699, "grad_norm": 1.0692986249923706, "learning_rate": 1.1215932914046121e-05, "loss": 0.0375, "step": 4230 }, { "epoch": 80.0, "grad_norm": 6.881436824798584, "learning_rate": 1.1111111111111112e-05, "loss": 0.054, "step": 4240 }, { "epoch": 80.0, "eval_accuracy": 0.9628099173553719, "eval_loss": 0.1463625133037567, "eval_runtime": 21.276, "eval_samples_per_second": 136.492, "eval_steps_per_second": 4.277, "step": 4240 }, { "epoch": 80.18867924528301, "grad_norm": 3.086602210998535, "learning_rate": 1.1006289308176102e-05, "loss": 0.0614, "step": 4250 }, { "epoch": 80.37735849056604, "grad_norm": 1.642980933189392, "learning_rate": 1.0901467505241091e-05, "loss": 0.048, "step": 4260 }, { "epoch": 80.56603773584905, "grad_norm": 5.067837715148926, "learning_rate": 1.079664570230608e-05, "loss": 0.0461, "step": 4270 }, { "epoch": 80.75471698113208, "grad_norm": 3.674088478088379, "learning_rate": 1.069182389937107e-05, "loss": 0.0529, "step": 4280 }, { "epoch": 80.94339622641509, "grad_norm": 3.663996934890747, "learning_rate": 1.0587002096436059e-05, "loss": 0.0641, "step": 4290 }, { "epoch": 81.0, "eval_accuracy": 0.9634986225895317, "eval_loss": 0.1491348147392273, "eval_runtime": 21.3519, "eval_samples_per_second": 136.007, "eval_steps_per_second": 4.262, "step": 4293 }, { "epoch": 81.13207547169812, "grad_norm": 3.4041941165924072, "learning_rate": 1.0482180293501048e-05, "loss": 0.0511, "step": 4300 }, { "epoch": 81.32075471698113, "grad_norm": 5.157493591308594, "learning_rate": 1.0377358490566038e-05, "loss": 0.0471, "step": 4310 }, { "epoch": 81.50943396226415, "grad_norm": 5.192855358123779, "learning_rate": 1.0272536687631027e-05, "loss": 0.0491, "step": 4320 }, { "epoch": 81.69811320754717, "grad_norm": 2.236807346343994, "learning_rate": 1.0167714884696017e-05, "loss": 0.0358, "step": 4330 }, { "epoch": 81.88679245283019, "grad_norm": 4.674067497253418, "learning_rate": 1.0062893081761008e-05, "loss": 0.0546, "step": 4340 }, { "epoch": 82.0, "eval_accuracy": 0.9610881542699724, "eval_loss": 0.15290114283561707, "eval_runtime": 21.4172, "eval_samples_per_second": 135.592, "eval_steps_per_second": 4.249, "step": 4346 }, { "epoch": 82.0754716981132, "grad_norm": 2.6720902919769287, "learning_rate": 9.958071278825997e-06, "loss": 0.0493, "step": 4350 }, { "epoch": 82.26415094339623, "grad_norm": 3.191582441329956, "learning_rate": 9.853249475890985e-06, "loss": 0.0479, "step": 4360 }, { "epoch": 82.45283018867924, "grad_norm": 6.1738481521606445, "learning_rate": 9.748427672955976e-06, "loss": 0.0517, "step": 4370 }, { "epoch": 82.64150943396227, "grad_norm": 2.8287763595581055, "learning_rate": 9.643605870020965e-06, "loss": 0.0598, "step": 4380 }, { "epoch": 82.83018867924528, "grad_norm": 2.7232823371887207, "learning_rate": 9.538784067085953e-06, "loss": 0.059, "step": 4390 }, { "epoch": 83.0, "eval_accuracy": 0.9652203856749312, "eval_loss": 0.14617380499839783, "eval_runtime": 21.3846, "eval_samples_per_second": 135.798, "eval_steps_per_second": 4.255, "step": 4399 }, { "epoch": 83.01886792452831, "grad_norm": 3.0003349781036377, "learning_rate": 9.433962264150944e-06, "loss": 0.061, "step": 4400 }, { "epoch": 83.20754716981132, "grad_norm": 4.4709038734436035, "learning_rate": 9.329140461215934e-06, "loss": 0.0468, "step": 4410 }, { "epoch": 83.39622641509433, "grad_norm": 3.809194564819336, "learning_rate": 9.224318658280923e-06, "loss": 0.048, "step": 4420 }, { "epoch": 83.58490566037736, "grad_norm": 4.134964942932129, "learning_rate": 9.119496855345912e-06, "loss": 0.0593, "step": 4430 }, { "epoch": 83.77358490566037, "grad_norm": 6.407557964324951, "learning_rate": 9.014675052410902e-06, "loss": 0.058, "step": 4440 }, { "epoch": 83.9622641509434, "grad_norm": 2.055232048034668, "learning_rate": 8.909853249475891e-06, "loss": 0.0485, "step": 4450 }, { "epoch": 84.0, "eval_accuracy": 0.9631542699724518, "eval_loss": 0.15668189525604248, "eval_runtime": 21.3505, "eval_samples_per_second": 136.016, "eval_steps_per_second": 4.262, "step": 4452 }, { "epoch": 84.15094339622641, "grad_norm": 2.221158266067505, "learning_rate": 8.80503144654088e-06, "loss": 0.0555, "step": 4460 }, { "epoch": 84.33962264150944, "grad_norm": 5.540987968444824, "learning_rate": 8.70020964360587e-06, "loss": 0.0507, "step": 4470 }, { "epoch": 84.52830188679245, "grad_norm": 2.087411642074585, "learning_rate": 8.59538784067086e-06, "loss": 0.0485, "step": 4480 }, { "epoch": 84.71698113207547, "grad_norm": 6.342270374298096, "learning_rate": 8.49056603773585e-06, "loss": 0.0533, "step": 4490 }, { "epoch": 84.90566037735849, "grad_norm": 1.974360466003418, "learning_rate": 8.38574423480084e-06, "loss": 0.0388, "step": 4500 }, { "epoch": 85.0, "eval_accuracy": 0.9621212121212122, "eval_loss": 0.15479591488838196, "eval_runtime": 21.3928, "eval_samples_per_second": 135.747, "eval_steps_per_second": 4.254, "step": 4505 }, { "epoch": 85.09433962264151, "grad_norm": 5.822177410125732, "learning_rate": 8.280922431865829e-06, "loss": 0.0541, "step": 4510 }, { "epoch": 85.28301886792453, "grad_norm": 6.9071149826049805, "learning_rate": 8.176100628930818e-06, "loss": 0.0382, "step": 4520 }, { "epoch": 85.47169811320755, "grad_norm": 3.2203402519226074, "learning_rate": 8.071278825995808e-06, "loss": 0.0481, "step": 4530 }, { "epoch": 85.66037735849056, "grad_norm": 2.526183843612671, "learning_rate": 7.966457023060797e-06, "loss": 0.0346, "step": 4540 }, { "epoch": 85.84905660377359, "grad_norm": 1.8161990642547607, "learning_rate": 7.861635220125786e-06, "loss": 0.0421, "step": 4550 }, { "epoch": 86.0, "eval_accuracy": 0.9621212121212122, "eval_loss": 0.1483514904975891, "eval_runtime": 21.3246, "eval_samples_per_second": 136.181, "eval_steps_per_second": 4.267, "step": 4558 }, { "epoch": 86.0377358490566, "grad_norm": 4.167572021484375, "learning_rate": 7.756813417190776e-06, "loss": 0.0558, "step": 4560 }, { "epoch": 86.22641509433963, "grad_norm": 7.080474853515625, "learning_rate": 7.651991614255765e-06, "loss": 0.0374, "step": 4570 }, { "epoch": 86.41509433962264, "grad_norm": 3.921416759490967, "learning_rate": 7.547169811320755e-06, "loss": 0.0557, "step": 4580 }, { "epoch": 86.60377358490567, "grad_norm": 2.932072639465332, "learning_rate": 7.442348008385745e-06, "loss": 0.0492, "step": 4590 }, { "epoch": 86.79245283018868, "grad_norm": 3.091217041015625, "learning_rate": 7.337526205450735e-06, "loss": 0.0348, "step": 4600 }, { "epoch": 86.98113207547169, "grad_norm": 3.1368813514709473, "learning_rate": 7.2327044025157235e-06, "loss": 0.0375, "step": 4610 }, { "epoch": 87.0, "eval_accuracy": 0.9597107438016529, "eval_loss": 0.1680881232023239, "eval_runtime": 21.2539, "eval_samples_per_second": 136.634, "eval_steps_per_second": 4.282, "step": 4611 }, { "epoch": 87.16981132075472, "grad_norm": 3.3242502212524414, "learning_rate": 7.127882599580712e-06, "loss": 0.0363, "step": 4620 }, { "epoch": 87.35849056603773, "grad_norm": 5.762176990509033, "learning_rate": 7.023060796645703e-06, "loss": 0.0449, "step": 4630 }, { "epoch": 87.54716981132076, "grad_norm": 2.077052593231201, "learning_rate": 6.918238993710692e-06, "loss": 0.0457, "step": 4640 }, { "epoch": 87.73584905660377, "grad_norm": 2.2283737659454346, "learning_rate": 6.813417190775681e-06, "loss": 0.0397, "step": 4650 }, { "epoch": 87.9245283018868, "grad_norm": 2.103003740310669, "learning_rate": 6.708595387840672e-06, "loss": 0.0376, "step": 4660 }, { "epoch": 88.0, "eval_accuracy": 0.9631542699724518, "eval_loss": 0.15125274658203125, "eval_runtime": 21.3299, "eval_samples_per_second": 136.147, "eval_steps_per_second": 4.266, "step": 4664 }, { "epoch": 88.11320754716981, "grad_norm": 9.418115615844727, "learning_rate": 6.60377358490566e-06, "loss": 0.0458, "step": 4670 }, { "epoch": 88.30188679245283, "grad_norm": 2.476033926010132, "learning_rate": 6.49895178197065e-06, "loss": 0.0396, "step": 4680 }, { "epoch": 88.49056603773585, "grad_norm": 2.3755970001220703, "learning_rate": 6.3941299790356405e-06, "loss": 0.0384, "step": 4690 }, { "epoch": 88.67924528301887, "grad_norm": 2.3153653144836426, "learning_rate": 6.289308176100629e-06, "loss": 0.0357, "step": 4700 }, { "epoch": 88.86792452830188, "grad_norm": 5.5892720222473145, "learning_rate": 6.184486373165619e-06, "loss": 0.0514, "step": 4710 }, { "epoch": 89.0, "eval_accuracy": 0.9641873278236914, "eval_loss": 0.148544043302536, "eval_runtime": 21.3145, "eval_samples_per_second": 136.245, "eval_steps_per_second": 4.269, "step": 4717 }, { "epoch": 89.05660377358491, "grad_norm": 3.5253398418426514, "learning_rate": 6.079664570230608e-06, "loss": 0.0286, "step": 4720 }, { "epoch": 89.24528301886792, "grad_norm": 5.891650676727295, "learning_rate": 5.974842767295598e-06, "loss": 0.0607, "step": 4730 }, { "epoch": 89.43396226415095, "grad_norm": 2.1808536052703857, "learning_rate": 5.870020964360588e-06, "loss": 0.0404, "step": 4740 }, { "epoch": 89.62264150943396, "grad_norm": 6.388125896453857, "learning_rate": 5.7651991614255766e-06, "loss": 0.0469, "step": 4750 }, { "epoch": 89.81132075471699, "grad_norm": 2.3668999671936035, "learning_rate": 5.660377358490566e-06, "loss": 0.0421, "step": 4760 }, { "epoch": 90.0, "grad_norm": 3.1038739681243896, "learning_rate": 5.555555555555556e-06, "loss": 0.0598, "step": 4770 }, { "epoch": 90.0, "eval_accuracy": 0.9638429752066116, "eval_loss": 0.15414932370185852, "eval_runtime": 21.2451, "eval_samples_per_second": 136.69, "eval_steps_per_second": 4.283, "step": 4770 }, { "epoch": 90.18867924528301, "grad_norm": 3.297988176345825, "learning_rate": 5.4507337526205454e-06, "loss": 0.0397, "step": 4780 }, { "epoch": 90.37735849056604, "grad_norm": 4.2701897621154785, "learning_rate": 5.345911949685535e-06, "loss": 0.0406, "step": 4790 }, { "epoch": 90.56603773584905, "grad_norm": 3.7925121784210205, "learning_rate": 5.241090146750524e-06, "loss": 0.0458, "step": 4800 }, { "epoch": 90.75471698113208, "grad_norm": 4.8097686767578125, "learning_rate": 5.1362683438155135e-06, "loss": 0.0555, "step": 4810 }, { "epoch": 90.94339622641509, "grad_norm": 3.2213995456695557, "learning_rate": 5.031446540880504e-06, "loss": 0.0431, "step": 4820 }, { "epoch": 91.0, "eval_accuracy": 0.9628099173553719, "eval_loss": 0.14735093712806702, "eval_runtime": 21.3261, "eval_samples_per_second": 136.171, "eval_steps_per_second": 4.267, "step": 4823 }, { "epoch": 91.13207547169812, "grad_norm": 3.0545461177825928, "learning_rate": 4.926624737945493e-06, "loss": 0.0415, "step": 4830 }, { "epoch": 91.32075471698113, "grad_norm": 3.0625436305999756, "learning_rate": 4.821802935010482e-06, "loss": 0.0454, "step": 4840 }, { "epoch": 91.50943396226415, "grad_norm": 3.4635112285614014, "learning_rate": 4.716981132075472e-06, "loss": 0.0276, "step": 4850 }, { "epoch": 91.69811320754717, "grad_norm": 2.14255428314209, "learning_rate": 4.612159329140462e-06, "loss": 0.0341, "step": 4860 }, { "epoch": 91.88679245283019, "grad_norm": 5.496433258056641, "learning_rate": 4.507337526205451e-06, "loss": 0.0432, "step": 4870 }, { "epoch": 92.0, "eval_accuracy": 0.9645316804407713, "eval_loss": 0.14980562031269073, "eval_runtime": 21.3199, "eval_samples_per_second": 136.211, "eval_steps_per_second": 4.268, "step": 4876 }, { "epoch": 92.0754716981132, "grad_norm": 4.054795265197754, "learning_rate": 4.40251572327044e-06, "loss": 0.0529, "step": 4880 }, { "epoch": 92.26415094339623, "grad_norm": 5.012380123138428, "learning_rate": 4.29769392033543e-06, "loss": 0.0492, "step": 4890 }, { "epoch": 92.45283018867924, "grad_norm": 0.6688435673713684, "learning_rate": 4.19287211740042e-06, "loss": 0.052, "step": 4900 }, { "epoch": 92.64150943396227, "grad_norm": 3.9398372173309326, "learning_rate": 4.088050314465409e-06, "loss": 0.0468, "step": 4910 }, { "epoch": 92.83018867924528, "grad_norm": 1.439302682876587, "learning_rate": 3.9832285115303985e-06, "loss": 0.0391, "step": 4920 }, { "epoch": 93.0, "eval_accuracy": 0.9645316804407713, "eval_loss": 0.15064947307109833, "eval_runtime": 21.2878, "eval_samples_per_second": 136.416, "eval_steps_per_second": 4.275, "step": 4929 }, { "epoch": 93.01886792452831, "grad_norm": 2.215027093887329, "learning_rate": 3.878406708595388e-06, "loss": 0.0322, "step": 4930 }, { "epoch": 93.20754716981132, "grad_norm": 4.157674312591553, "learning_rate": 3.7735849056603773e-06, "loss": 0.0481, "step": 4940 }, { "epoch": 93.39622641509433, "grad_norm": 2.998509645462036, "learning_rate": 3.6687631027253674e-06, "loss": 0.0424, "step": 4950 }, { "epoch": 93.58490566037736, "grad_norm": 2.701420783996582, "learning_rate": 3.563941299790356e-06, "loss": 0.0347, "step": 4960 }, { "epoch": 93.77358490566037, "grad_norm": 2.793337821960449, "learning_rate": 3.459119496855346e-06, "loss": 0.0448, "step": 4970 }, { "epoch": 93.9622641509434, "grad_norm": 2.3559460639953613, "learning_rate": 3.354297693920336e-06, "loss": 0.0408, "step": 4980 }, { "epoch": 94.0, "eval_accuracy": 0.9641873278236914, "eval_loss": 0.1462460607290268, "eval_runtime": 21.3706, "eval_samples_per_second": 135.888, "eval_steps_per_second": 4.258, "step": 4982 }, { "epoch": 94.15094339622641, "grad_norm": 1.9885700941085815, "learning_rate": 3.249475890985325e-06, "loss": 0.0373, "step": 4990 }, { "epoch": 94.33962264150944, "grad_norm": 4.4361443519592285, "learning_rate": 3.1446540880503146e-06, "loss": 0.0598, "step": 5000 }, { "epoch": 94.52830188679245, "grad_norm": 3.5327351093292236, "learning_rate": 3.039832285115304e-06, "loss": 0.0388, "step": 5010 }, { "epoch": 94.71698113207547, "grad_norm": 0.833363950252533, "learning_rate": 2.935010482180294e-06, "loss": 0.0333, "step": 5020 }, { "epoch": 94.90566037735849, "grad_norm": 3.4921486377716064, "learning_rate": 2.830188679245283e-06, "loss": 0.0335, "step": 5030 }, { "epoch": 95.0, "eval_accuracy": 0.9652203856749312, "eval_loss": 0.15087169408798218, "eval_runtime": 21.4165, "eval_samples_per_second": 135.597, "eval_steps_per_second": 4.249, "step": 5035 }, { "epoch": 95.09433962264151, "grad_norm": 4.499444484710693, "learning_rate": 2.7253668763102727e-06, "loss": 0.0394, "step": 5040 }, { "epoch": 95.28301886792453, "grad_norm": 4.537922382354736, "learning_rate": 2.620545073375262e-06, "loss": 0.0428, "step": 5050 }, { "epoch": 95.47169811320755, "grad_norm": 1.5033186674118042, "learning_rate": 2.515723270440252e-06, "loss": 0.0363, "step": 5060 }, { "epoch": 95.66037735849056, "grad_norm": 0.8121969103813171, "learning_rate": 2.410901467505241e-06, "loss": 0.0298, "step": 5070 }, { "epoch": 95.84905660377359, "grad_norm": 1.9728448390960693, "learning_rate": 2.306079664570231e-06, "loss": 0.0447, "step": 5080 }, { "epoch": 96.0, "eval_accuracy": 0.9634986225895317, "eval_loss": 0.15081696212291718, "eval_runtime": 21.3522, "eval_samples_per_second": 136.005, "eval_steps_per_second": 4.262, "step": 5088 }, { "epoch": 96.0377358490566, "grad_norm": 2.9549922943115234, "learning_rate": 2.20125786163522e-06, "loss": 0.0513, "step": 5090 }, { "epoch": 96.22641509433963, "grad_norm": 4.062204837799072, "learning_rate": 2.09643605870021e-06, "loss": 0.0449, "step": 5100 }, { "epoch": 96.41509433962264, "grad_norm": 2.071183443069458, "learning_rate": 1.9916142557651992e-06, "loss": 0.04, "step": 5110 }, { "epoch": 96.60377358490567, "grad_norm": 0.9506546258926392, "learning_rate": 1.8867924528301887e-06, "loss": 0.0321, "step": 5120 }, { "epoch": 96.79245283018868, "grad_norm": 2.33597469329834, "learning_rate": 1.781970649895178e-06, "loss": 0.024, "step": 5130 }, { "epoch": 96.98113207547169, "grad_norm": 3.253387451171875, "learning_rate": 1.677148846960168e-06, "loss": 0.0477, "step": 5140 }, { "epoch": 97.0, "eval_accuracy": 0.9634986225895317, "eval_loss": 0.1510278433561325, "eval_runtime": 21.3381, "eval_samples_per_second": 136.095, "eval_steps_per_second": 4.265, "step": 5141 }, { "epoch": 97.16981132075472, "grad_norm": 2.858771324157715, "learning_rate": 1.5723270440251573e-06, "loss": 0.0386, "step": 5150 }, { "epoch": 97.35849056603773, "grad_norm": 6.633811950683594, "learning_rate": 1.467505241090147e-06, "loss": 0.0419, "step": 5160 }, { "epoch": 97.54716981132076, "grad_norm": 3.115361452102661, "learning_rate": 1.3626834381551364e-06, "loss": 0.0291, "step": 5170 }, { "epoch": 97.73584905660377, "grad_norm": 3.6518330574035645, "learning_rate": 1.257861635220126e-06, "loss": 0.0372, "step": 5180 }, { "epoch": 97.9245283018868, "grad_norm": 3.8466641902923584, "learning_rate": 1.1530398322851154e-06, "loss": 0.0504, "step": 5190 }, { "epoch": 98.0, "eval_accuracy": 0.9641873278236914, "eval_loss": 0.15101788938045502, "eval_runtime": 21.5013, "eval_samples_per_second": 135.061, "eval_steps_per_second": 4.232, "step": 5194 }, { "epoch": 98.11320754716981, "grad_norm": 4.916390895843506, "learning_rate": 1.048218029350105e-06, "loss": 0.0474, "step": 5200 }, { "epoch": 98.30188679245283, "grad_norm": 3.228938102722168, "learning_rate": 9.433962264150943e-07, "loss": 0.0401, "step": 5210 }, { "epoch": 98.49056603773585, "grad_norm": 1.5599745512008667, "learning_rate": 8.38574423480084e-07, "loss": 0.0439, "step": 5220 }, { "epoch": 98.67924528301887, "grad_norm": 2.1779186725616455, "learning_rate": 7.337526205450735e-07, "loss": 0.0411, "step": 5230 }, { "epoch": 98.86792452830188, "grad_norm": 3.2559657096862793, "learning_rate": 6.28930817610063e-07, "loss": 0.0406, "step": 5240 }, { "epoch": 99.0, "eval_accuracy": 0.9648760330578512, "eval_loss": 0.14794644713401794, "eval_runtime": 21.5282, "eval_samples_per_second": 134.893, "eval_steps_per_second": 4.227, "step": 5247 }, { "epoch": 99.05660377358491, "grad_norm": 2.303928852081299, "learning_rate": 5.241090146750525e-07, "loss": 0.0461, "step": 5250 }, { "epoch": 99.24528301886792, "grad_norm": 5.052129745483398, "learning_rate": 4.19287211740042e-07, "loss": 0.0398, "step": 5260 }, { "epoch": 99.43396226415095, "grad_norm": 5.838912010192871, "learning_rate": 3.144654088050315e-07, "loss": 0.0436, "step": 5270 }, { "epoch": 99.62264150943396, "grad_norm": 1.3886405229568481, "learning_rate": 2.09643605870021e-07, "loss": 0.0281, "step": 5280 }, { "epoch": 99.81132075471699, "grad_norm": 2.893517017364502, "learning_rate": 1.048218029350105e-07, "loss": 0.0495, "step": 5290 }, { "epoch": 100.0, "grad_norm": 3.087257146835327, "learning_rate": 0.0, "loss": 0.0343, "step": 5300 }, { "epoch": 100.0, "eval_accuracy": 0.9645316804407713, "eval_loss": 0.1479804962873459, "eval_runtime": 21.646, "eval_samples_per_second": 134.159, "eval_steps_per_second": 4.204, "step": 5300 }, { "epoch": 100.0, "step": 5300, "total_flos": 1.6838724615023002e+19, "train_loss": 0.14425156983845638, "train_runtime": 35971.5646, "train_samples_per_second": 18.829, "train_steps_per_second": 0.147 } ], "logging_steps": 10, "max_steps": 5300, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6838724615023002e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }