{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.2333984375, "learning_rate": 1.5151515151515152e-06, "loss": 2.504, "step": 1 }, { "epoch": 0.02, "grad_norm": 0.26171875, "learning_rate": 7.5757575757575764e-06, "loss": 2.4856, "step": 5 }, { "epoch": 0.04, "grad_norm": 0.197265625, "learning_rate": 1.5151515151515153e-05, "loss": 2.4231, "step": 10 }, { "epoch": 0.06, "grad_norm": 0.1708984375, "learning_rate": 2.272727272727273e-05, "loss": 2.4239, "step": 15 }, { "epoch": 0.08, "grad_norm": 0.20703125, "learning_rate": 3.0303030303030306e-05, "loss": 2.4352, "step": 20 }, { "epoch": 0.09, "grad_norm": 0.23046875, "learning_rate": 3.787878787878788e-05, "loss": 2.4108, "step": 25 }, { "epoch": 0.11, "grad_norm": 0.2314453125, "learning_rate": 4.545454545454546e-05, "loss": 2.3658, "step": 30 }, { "epoch": 0.13, "grad_norm": 0.1875, "learning_rate": 5.303030303030303e-05, "loss": 2.3141, "step": 35 }, { "epoch": 0.15, "grad_norm": 0.181640625, "learning_rate": 6.060606060606061e-05, "loss": 2.3152, "step": 40 }, { "epoch": 0.17, "grad_norm": 0.1572265625, "learning_rate": 6.818181818181818e-05, "loss": 2.294, "step": 45 }, { "epoch": 0.19, "grad_norm": 0.1611328125, "learning_rate": 7.575757575757576e-05, "loss": 2.2869, "step": 50 }, { "epoch": 0.21, "grad_norm": 0.158203125, "learning_rate": 8.333333333333334e-05, "loss": 2.2961, "step": 55 }, { "epoch": 0.23, "grad_norm": 0.150390625, "learning_rate": 9.090909090909092e-05, "loss": 2.2337, "step": 60 }, { "epoch": 0.25, "grad_norm": 0.171875, "learning_rate": 9.848484848484849e-05, "loss": 2.2716, "step": 65 }, { "epoch": 0.27, "grad_norm": 0.166015625, "learning_rate": 0.00010606060606060606, "loss": 2.2456, "step": 70 }, { "epoch": 0.28, "grad_norm": 0.162109375, "learning_rate": 0.00011363636363636365, "loss": 2.2372, "step": 75 }, { "epoch": 0.3, "grad_norm": 0.158203125, "learning_rate": 0.00012121212121212122, "loss": 2.231, "step": 80 }, { "epoch": 0.32, "grad_norm": 0.2412109375, "learning_rate": 0.00012878787878787878, "loss": 2.2575, "step": 85 }, { "epoch": 0.34, "grad_norm": 0.1640625, "learning_rate": 0.00013636363636363637, "loss": 2.2734, "step": 90 }, { "epoch": 0.36, "grad_norm": 0.17578125, "learning_rate": 0.00014393939393939396, "loss": 2.2141, "step": 95 }, { "epoch": 0.38, "grad_norm": 0.1767578125, "learning_rate": 0.00015151515151515152, "loss": 2.2526, "step": 100 }, { "epoch": 0.4, "grad_norm": 0.1748046875, "learning_rate": 0.0001590909090909091, "loss": 2.2879, "step": 105 }, { "epoch": 0.42, "grad_norm": 0.1884765625, "learning_rate": 0.0001666666666666667, "loss": 2.1986, "step": 110 }, { "epoch": 0.44, "grad_norm": 0.2021484375, "learning_rate": 0.00017424242424242425, "loss": 2.2265, "step": 115 }, { "epoch": 0.45, "grad_norm": 0.2001953125, "learning_rate": 0.00018181818181818183, "loss": 2.196, "step": 120 }, { "epoch": 0.47, "grad_norm": 0.234375, "learning_rate": 0.00018939393939393942, "loss": 2.1943, "step": 125 }, { "epoch": 0.49, "grad_norm": 0.2314453125, "learning_rate": 0.00019696969696969698, "loss": 2.2055, "step": 130 }, { "epoch": 0.51, "grad_norm": 0.2392578125, "learning_rate": 0.0001999968531423333, "loss": 2.173, "step": 135 }, { "epoch": 0.53, "grad_norm": 0.2431640625, "learning_rate": 0.0001999776230627102, "loss": 2.1762, "step": 140 }, { "epoch": 0.55, "grad_norm": 0.255859375, "learning_rate": 0.0001999409145155235, "loss": 2.1762, "step": 145 }, { "epoch": 0.57, "grad_norm": 0.265625, "learning_rate": 0.0001998867339183008, "loss": 2.157, "step": 150 }, { "epoch": 0.59, "grad_norm": 0.32421875, "learning_rate": 0.0001998150907430998, "loss": 2.1838, "step": 155 }, { "epoch": 0.61, "grad_norm": 0.2734375, "learning_rate": 0.00019972599751485226, "loss": 2.1985, "step": 160 }, { "epoch": 0.62, "grad_norm": 0.31640625, "learning_rate": 0.00019961946980917456, "loss": 2.1016, "step": 165 }, { "epoch": 0.64, "grad_norm": 0.27734375, "learning_rate": 0.0001994955262496446, "loss": 2.1371, "step": 170 }, { "epoch": 0.66, "grad_norm": 0.3125, "learning_rate": 0.00019935418850454588, "loss": 2.1527, "step": 175 }, { "epoch": 0.68, "grad_norm": 0.37109375, "learning_rate": 0.00019919548128307954, "loss": 2.118, "step": 180 }, { "epoch": 0.7, "grad_norm": 0.2890625, "learning_rate": 0.00019901943233104443, "loss": 2.098, "step": 185 }, { "epoch": 0.72, "grad_norm": 0.3515625, "learning_rate": 0.00019882607242598663, "loss": 2.0853, "step": 190 }, { "epoch": 0.74, "grad_norm": 0.30078125, "learning_rate": 0.00019861543537181867, "loss": 2.0832, "step": 195 }, { "epoch": 0.76, "grad_norm": 0.31640625, "learning_rate": 0.00019838755799290994, "loss": 2.1097, "step": 200 }, { "epoch": 0.78, "grad_norm": 0.306640625, "learning_rate": 0.00019814248012764877, "loss": 2.0618, "step": 205 }, { "epoch": 0.8, "grad_norm": 0.294921875, "learning_rate": 0.00019788024462147788, "loss": 2.0607, "step": 210 }, { "epoch": 0.81, "grad_norm": 0.3125, "learning_rate": 0.00019760089731940384, "loss": 2.0829, "step": 215 }, { "epoch": 0.83, "grad_norm": 0.361328125, "learning_rate": 0.00019730448705798239, "loss": 1.9736, "step": 220 }, { "epoch": 0.85, "grad_norm": 0.37109375, "learning_rate": 0.0001969910656567805, "loss": 2.033, "step": 225 }, { "epoch": 0.87, "grad_norm": 0.3671875, "learning_rate": 0.00019666068790931732, "loss": 1.9864, "step": 230 }, { "epoch": 0.89, "grad_norm": 0.37890625, "learning_rate": 0.00019631341157348465, "loss": 2.008, "step": 235 }, { "epoch": 0.91, "grad_norm": 0.361328125, "learning_rate": 0.00019594929736144976, "loss": 2.089, "step": 240 }, { "epoch": 0.93, "grad_norm": 0.34765625, "learning_rate": 0.00019556840892904126, "loss": 1.9851, "step": 245 }, { "epoch": 0.95, "grad_norm": 0.36328125, "learning_rate": 0.0001951708128646208, "loss": 1.9994, "step": 250 }, { "epoch": 0.97, "grad_norm": 0.3515625, "learning_rate": 0.0001947565786774415, "loss": 1.9324, "step": 255 }, { "epoch": 0.98, "grad_norm": 0.34765625, "learning_rate": 0.00019432577878549637, "loss": 2.063, "step": 260 }, { "epoch": 1.0, "eval_loss": 2.1617629528045654, "eval_runtime": 67.9579, "eval_samples_per_second": 3.899, "eval_steps_per_second": 0.5, "step": 264 }, { "epoch": 1.0, "grad_norm": 0.4453125, "learning_rate": 0.00019387848850285772, "loss": 1.8981, "step": 265 }, { "epoch": 1.02, "grad_norm": 0.439453125, "learning_rate": 0.00019341478602651069, "loss": 1.7606, "step": 270 }, { "epoch": 1.04, "grad_norm": 0.47265625, "learning_rate": 0.00019293475242268223, "loss": 1.814, "step": 275 }, { "epoch": 1.06, "grad_norm": 0.4140625, "learning_rate": 0.0001924384716126692, "loss": 1.7713, "step": 280 }, { "epoch": 1.08, "grad_norm": 0.4765625, "learning_rate": 0.00019192603035816656, "loss": 1.8325, "step": 285 }, { "epoch": 1.1, "grad_norm": 0.443359375, "learning_rate": 0.0001913975182460996, "loss": 1.7617, "step": 290 }, { "epoch": 1.12, "grad_norm": 0.4140625, "learning_rate": 0.00019085302767296182, "loss": 1.8414, "step": 295 }, { "epoch": 1.14, "grad_norm": 0.46484375, "learning_rate": 0.00019029265382866214, "loss": 1.7312, "step": 300 }, { "epoch": 1.16, "grad_norm": 0.435546875, "learning_rate": 0.000189716494679883, "loss": 1.8303, "step": 305 }, { "epoch": 1.17, "grad_norm": 0.4765625, "learning_rate": 0.00018912465095295388, "loss": 1.7397, "step": 310 }, { "epoch": 1.19, "grad_norm": 0.54296875, "learning_rate": 0.00018851722611624164, "loss": 1.7772, "step": 315 }, { "epoch": 1.21, "grad_norm": 0.45703125, "learning_rate": 0.00018789432636206197, "loss": 1.7317, "step": 320 }, { "epoch": 1.23, "grad_norm": 0.5078125, "learning_rate": 0.00018725606058811424, "loss": 1.7184, "step": 325 }, { "epoch": 1.25, "grad_norm": 0.451171875, "learning_rate": 0.00018660254037844388, "loss": 1.798, "step": 330 }, { "epoch": 1.27, "grad_norm": 0.61328125, "learning_rate": 0.00018593387998393457, "loss": 1.6324, "step": 335 }, { "epoch": 1.29, "grad_norm": 0.48046875, "learning_rate": 0.00018525019630233463, "loss": 1.6136, "step": 340 }, { "epoch": 1.31, "grad_norm": 0.51171875, "learning_rate": 0.00018455160885782045, "loss": 1.7733, "step": 345 }, { "epoch": 1.33, "grad_norm": 0.6328125, "learning_rate": 0.00018383823978010075, "loss": 1.6591, "step": 350 }, { "epoch": 1.34, "grad_norm": 0.62109375, "learning_rate": 0.00018311021378306563, "loss": 1.6217, "step": 355 }, { "epoch": 1.36, "grad_norm": 0.6015625, "learning_rate": 0.0001823676581429833, "loss": 1.5634, "step": 360 }, { "epoch": 1.38, "grad_norm": 0.5078125, "learning_rate": 0.00018161070267624937, "loss": 1.7062, "step": 365 }, { "epoch": 1.4, "grad_norm": 0.55078125, "learning_rate": 0.0001808394797166919, "loss": 1.6042, "step": 370 }, { "epoch": 1.42, "grad_norm": 0.51171875, "learning_rate": 0.00018005412409243606, "loss": 1.6031, "step": 375 }, { "epoch": 1.44, "grad_norm": 0.56640625, "learning_rate": 0.00017925477310233316, "loss": 1.4434, "step": 380 }, { "epoch": 1.46, "grad_norm": 0.63671875, "learning_rate": 0.00017844156649195759, "loss": 1.4816, "step": 385 }, { "epoch": 1.48, "grad_norm": 0.5703125, "learning_rate": 0.0001776146464291757, "loss": 1.5563, "step": 390 }, { "epoch": 1.5, "grad_norm": 0.640625, "learning_rate": 0.00017677415747929174, "loss": 1.5238, "step": 395 }, { "epoch": 1.52, "grad_norm": 0.6328125, "learning_rate": 0.00017592024657977432, "loss": 1.5693, "step": 400 }, { "epoch": 1.53, "grad_norm": 0.5625, "learning_rate": 0.00017505306301456822, "loss": 1.5205, "step": 405 }, { "epoch": 1.55, "grad_norm": 0.58984375, "learning_rate": 0.00017417275838799596, "loss": 1.537, "step": 410 }, { "epoch": 1.57, "grad_norm": 0.6484375, "learning_rate": 0.0001732794865982539, "loss": 1.4182, "step": 415 }, { "epoch": 1.59, "grad_norm": 0.609375, "learning_rate": 0.00017237340381050703, "loss": 1.4714, "step": 420 }, { "epoch": 1.61, "grad_norm": 0.578125, "learning_rate": 0.00017145466842958764, "loss": 1.4167, "step": 425 }, { "epoch": 1.63, "grad_norm": 0.6796875, "learning_rate": 0.00017052344107230241, "loss": 1.4556, "step": 430 }, { "epoch": 1.65, "grad_norm": 0.64453125, "learning_rate": 0.00016957988453935276, "loss": 1.3048, "step": 435 }, { "epoch": 1.67, "grad_norm": 0.5859375, "learning_rate": 0.0001686241637868734, "loss": 1.4648, "step": 440 }, { "epoch": 1.69, "grad_norm": 0.609375, "learning_rate": 0.00016765644589759396, "loss": 1.3969, "step": 445 }, { "epoch": 1.7, "grad_norm": 0.65625, "learning_rate": 0.00016667690005162916, "loss": 1.5284, "step": 450 }, { "epoch": 1.72, "grad_norm": 0.7265625, "learning_rate": 0.00016568569749690208, "loss": 1.3291, "step": 455 }, { "epoch": 1.74, "grad_norm": 0.6484375, "learning_rate": 0.00016468301151920575, "loss": 1.3605, "step": 460 }, { "epoch": 1.76, "grad_norm": 0.61328125, "learning_rate": 0.00016366901741190882, "loss": 1.4962, "step": 465 }, { "epoch": 1.78, "grad_norm": 0.703125, "learning_rate": 0.00016264389244531014, "loss": 1.3618, "step": 470 }, { "epoch": 1.8, "grad_norm": 0.64453125, "learning_rate": 0.0001616078158356475, "loss": 1.391, "step": 475 }, { "epoch": 1.82, "grad_norm": 0.75, "learning_rate": 0.00016056096871376667, "loss": 1.2776, "step": 480 }, { "epoch": 1.84, "grad_norm": 0.64453125, "learning_rate": 0.00015950353409345517, "loss": 1.4267, "step": 485 }, { "epoch": 1.86, "grad_norm": 0.609375, "learning_rate": 0.0001584356968394471, "loss": 1.2524, "step": 490 }, { "epoch": 1.88, "grad_norm": 0.63671875, "learning_rate": 0.0001573576436351046, "loss": 1.3176, "step": 495 }, { "epoch": 1.89, "grad_norm": 0.7578125, "learning_rate": 0.00015626956294978103, "loss": 1.2975, "step": 500 }, { "epoch": 1.91, "grad_norm": 0.70703125, "learning_rate": 0.0001551716450058719, "loss": 1.2803, "step": 505 }, { "epoch": 1.93, "grad_norm": 0.59375, "learning_rate": 0.00015406408174555976, "loss": 1.3139, "step": 510 }, { "epoch": 1.95, "grad_norm": 0.74609375, "learning_rate": 0.0001529470667972579, "loss": 1.2867, "step": 515 }, { "epoch": 1.97, "grad_norm": 0.70703125, "learning_rate": 0.00015182079544175955, "loss": 1.3683, "step": 520 }, { "epoch": 1.99, "grad_norm": 0.82421875, "learning_rate": 0.0001506854645780983, "loss": 1.2293, "step": 525 }, { "epoch": 2.0, "eval_loss": 1.9121202230453491, "eval_runtime": 67.9476, "eval_samples_per_second": 3.9, "eval_steps_per_second": 0.5, "step": 528 }, { "epoch": 2.01, "grad_norm": 0.796875, "learning_rate": 0.00014954127268912526, "loss": 1.1291, "step": 530 }, { "epoch": 2.03, "grad_norm": 0.765625, "learning_rate": 0.0001483884198068096, "loss": 1.045, "step": 535 }, { "epoch": 2.05, "grad_norm": 0.8125, "learning_rate": 0.0001472271074772683, "loss": 0.9242, "step": 540 }, { "epoch": 2.06, "grad_norm": 0.89453125, "learning_rate": 0.00014605753872553093, "loss": 1.0677, "step": 545 }, { "epoch": 2.08, "grad_norm": 0.81640625, "learning_rate": 0.00014487991802004623, "loss": 0.9137, "step": 550 }, { "epoch": 2.1, "grad_norm": 0.78515625, "learning_rate": 0.00014369445123693596, "loss": 0.9222, "step": 555 }, { "epoch": 2.12, "grad_norm": 0.7734375, "learning_rate": 0.000142501345624003, "loss": 0.89, "step": 560 }, { "epoch": 2.14, "grad_norm": 0.72265625, "learning_rate": 0.00014130080976449948, "loss": 1.0561, "step": 565 }, { "epoch": 2.16, "grad_norm": 0.76953125, "learning_rate": 0.00014009305354066137, "loss": 1.0657, "step": 570 }, { "epoch": 2.18, "grad_norm": 0.671875, "learning_rate": 0.0001388782880970162, "loss": 0.8967, "step": 575 }, { "epoch": 2.2, "grad_norm": 0.8359375, "learning_rate": 0.00013765672580346987, "loss": 0.9202, "step": 580 }, { "epoch": 2.22, "grad_norm": 0.92578125, "learning_rate": 0.00013642858021817943, "loss": 0.8938, "step": 585 }, { "epoch": 2.23, "grad_norm": 0.875, "learning_rate": 0.00013519406605021797, "loss": 0.9264, "step": 590 }, { "epoch": 2.25, "grad_norm": 0.765625, "learning_rate": 0.00013395339912203829, "loss": 0.9314, "step": 595 }, { "epoch": 2.27, "grad_norm": 0.84375, "learning_rate": 0.00013270679633174218, "loss": 0.7482, "step": 600 }, { "epoch": 2.29, "grad_norm": 0.83203125, "learning_rate": 0.00013145447561516138, "loss": 0.8059, "step": 605 }, { "epoch": 2.31, "grad_norm": 0.81640625, "learning_rate": 0.00013019665590775716, "loss": 0.8082, "step": 610 }, { "epoch": 2.33, "grad_norm": 0.98046875, "learning_rate": 0.0001289335571063453, "loss": 0.7883, "step": 615 }, { "epoch": 2.35, "grad_norm": 0.73046875, "learning_rate": 0.0001276654000306527, "loss": 0.7813, "step": 620 }, { "epoch": 2.37, "grad_norm": 0.76953125, "learning_rate": 0.00012639240638471317, "loss": 0.9305, "step": 625 }, { "epoch": 2.39, "grad_norm": 0.59765625, "learning_rate": 0.0001251147987181079, "loss": 0.7705, "step": 630 }, { "epoch": 2.41, "grad_norm": 0.9375, "learning_rate": 0.00012383280038705884, "loss": 0.8381, "step": 635 }, { "epoch": 2.42, "grad_norm": 0.70703125, "learning_rate": 0.00012254663551538046, "loss": 0.9174, "step": 640 }, { "epoch": 2.44, "grad_norm": 0.9453125, "learning_rate": 0.00012125652895529766, "loss": 0.9508, "step": 645 }, { "epoch": 2.46, "grad_norm": 0.78515625, "learning_rate": 0.00011996270624813642, "loss": 0.9384, "step": 650 }, { "epoch": 2.48, "grad_norm": 0.8828125, "learning_rate": 0.00011866539358489345, "loss": 0.7756, "step": 655 }, { "epoch": 2.5, "grad_norm": 0.88671875, "learning_rate": 0.00011736481776669306, "loss": 0.8655, "step": 660 }, { "epoch": 2.52, "grad_norm": 0.73046875, "learning_rate": 0.00011606120616513648, "loss": 0.8007, "step": 665 }, { "epoch": 2.54, "grad_norm": 0.7734375, "learning_rate": 0.00011475478668255222, "loss": 0.9225, "step": 670 }, { "epoch": 2.56, "grad_norm": 0.86328125, "learning_rate": 0.00011344578771215319, "loss": 0.7905, "step": 675 }, { "epoch": 2.58, "grad_norm": 0.82421875, "learning_rate": 0.0001121344380981082, "loss": 0.8412, "step": 680 }, { "epoch": 2.59, "grad_norm": 0.69921875, "learning_rate": 0.00011082096709553442, "loss": 0.8604, "step": 685 }, { "epoch": 2.61, "grad_norm": 0.828125, "learning_rate": 0.00010950560433041826, "loss": 0.8263, "step": 690 }, { "epoch": 2.63, "grad_norm": 0.78515625, "learning_rate": 0.00010818857975947128, "loss": 0.8888, "step": 695 }, { "epoch": 2.65, "grad_norm": 0.7109375, "learning_rate": 0.0001068701236299281, "loss": 0.868, "step": 700 }, { "epoch": 2.67, "grad_norm": 0.765625, "learning_rate": 0.00010555046643929403, "loss": 0.8699, "step": 705 }, { "epoch": 2.69, "grad_norm": 0.73828125, "learning_rate": 0.00010422983889504831, "loss": 0.7818, "step": 710 }, { "epoch": 2.71, "grad_norm": 0.81640625, "learning_rate": 0.00010290847187431113, "loss": 0.8963, "step": 715 }, { "epoch": 2.73, "grad_norm": 0.58203125, "learning_rate": 0.00010158659638348081, "loss": 0.659, "step": 720 }, { "epoch": 2.75, "grad_norm": 0.74609375, "learning_rate": 0.00010026444351784822, "loss": 0.7786, "step": 725 }, { "epoch": 2.77, "grad_norm": 0.87109375, "learning_rate": 9.894224442119607e-05, "loss": 0.7708, "step": 730 }, { "epoch": 2.78, "grad_norm": 0.71875, "learning_rate": 9.762023024538926e-05, "loss": 0.7358, "step": 735 }, { "epoch": 2.8, "grad_norm": 0.77734375, "learning_rate": 9.629863210996419e-05, "loss": 0.859, "step": 740 }, { "epoch": 2.82, "grad_norm": 0.79296875, "learning_rate": 9.49776810617235e-05, "loss": 0.6883, "step": 745 }, { "epoch": 2.84, "grad_norm": 0.82421875, "learning_rate": 9.365760803434355e-05, "loss": 0.6727, "step": 750 }, { "epoch": 2.86, "grad_norm": 0.7734375, "learning_rate": 9.233864380800178e-05, "loss": 0.7685, "step": 755 }, { "epoch": 2.88, "grad_norm": 0.87890625, "learning_rate": 9.102101896903084e-05, "loss": 0.8278, "step": 760 }, { "epoch": 2.9, "grad_norm": 0.83203125, "learning_rate": 8.970496386960656e-05, "loss": 0.7174, "step": 765 }, { "epoch": 2.92, "grad_norm": 0.82421875, "learning_rate": 8.839070858747697e-05, "loss": 0.7189, "step": 770 }, { "epoch": 2.94, "grad_norm": 0.80078125, "learning_rate": 8.707848288573926e-05, "loss": 0.6564, "step": 775 }, { "epoch": 2.95, "grad_norm": 0.79296875, "learning_rate": 8.57685161726715e-05, "loss": 0.6793, "step": 780 }, { "epoch": 2.97, "grad_norm": 0.70703125, "learning_rate": 8.446103746162698e-05, "loss": 0.8086, "step": 785 }, { "epoch": 2.99, "grad_norm": 0.7734375, "learning_rate": 8.315627533099696e-05, "loss": 0.6985, "step": 790 }, { "epoch": 3.0, "eval_loss": 1.6915704011917114, "eval_runtime": 67.8527, "eval_samples_per_second": 3.906, "eval_steps_per_second": 0.501, "step": 792 }, { "epoch": 3.01, "grad_norm": 0.73046875, "learning_rate": 8.185445788424974e-05, "loss": 0.5262, "step": 795 }, { "epoch": 3.03, "grad_norm": 0.703125, "learning_rate": 8.055581271005292e-05, "loss": 0.57, "step": 800 }, { "epoch": 3.05, "grad_norm": 0.84375, "learning_rate": 7.92605668424853e-05, "loss": 0.4825, "step": 805 }, { "epoch": 3.07, "grad_norm": 0.73828125, "learning_rate": 7.796894672134594e-05, "loss": 0.5857, "step": 810 }, { "epoch": 3.09, "grad_norm": 0.76171875, "learning_rate": 7.668117815256729e-05, "loss": 0.4708, "step": 815 }, { "epoch": 3.11, "grad_norm": 0.78515625, "learning_rate": 7.539748626873866e-05, "loss": 0.5465, "step": 820 }, { "epoch": 3.12, "grad_norm": 0.80859375, "learning_rate": 7.411809548974792e-05, "loss": 0.5395, "step": 825 }, { "epoch": 3.14, "grad_norm": 0.6640625, "learning_rate": 7.28432294835474e-05, "loss": 0.5176, "step": 830 }, { "epoch": 3.16, "grad_norm": 0.640625, "learning_rate": 7.157311112705149e-05, "loss": 0.5352, "step": 835 }, { "epoch": 3.18, "grad_norm": 0.796875, "learning_rate": 7.030796246717255e-05, "loss": 0.5097, "step": 840 }, { "epoch": 3.2, "grad_norm": 0.78515625, "learning_rate": 6.904800468200143e-05, "loss": 0.4998, "step": 845 }, { "epoch": 3.22, "grad_norm": 0.73828125, "learning_rate": 6.779345804214088e-05, "loss": 0.491, "step": 850 }, { "epoch": 3.24, "grad_norm": 0.66796875, "learning_rate": 6.654454187219649e-05, "loss": 0.5098, "step": 855 }, { "epoch": 3.26, "grad_norm": 0.609375, "learning_rate": 6.530147451243377e-05, "loss": 0.5362, "step": 860 }, { "epoch": 3.28, "grad_norm": 0.875, "learning_rate": 6.406447328060709e-05, "loss": 0.5549, "step": 865 }, { "epoch": 3.3, "grad_norm": 0.67578125, "learning_rate": 6.283375443396726e-05, "loss": 0.4999, "step": 870 }, { "epoch": 3.31, "grad_norm": 0.64453125, "learning_rate": 6.160953313145463e-05, "loss": 0.471, "step": 875 }, { "epoch": 3.33, "grad_norm": 0.6640625, "learning_rate": 6.039202339608432e-05, "loss": 0.4818, "step": 880 }, { "epoch": 3.35, "grad_norm": 0.75, "learning_rate": 5.918143807752972e-05, "loss": 0.5522, "step": 885 }, { "epoch": 3.37, "grad_norm": 0.7890625, "learning_rate": 5.797798881491138e-05, "loss": 0.4821, "step": 890 }, { "epoch": 3.39, "grad_norm": 0.7578125, "learning_rate": 5.678188599979753e-05, "loss": 0.5222, "step": 895 }, { "epoch": 3.41, "grad_norm": 0.7734375, "learning_rate": 5.559333873942259e-05, "loss": 0.4699, "step": 900 }, { "epoch": 3.43, "grad_norm": 0.69921875, "learning_rate": 5.44125548201301e-05, "loss": 0.4958, "step": 905 }, { "epoch": 3.45, "grad_norm": 0.66015625, "learning_rate": 5.3239740671046864e-05, "loss": 0.6005, "step": 910 }, { "epoch": 3.47, "grad_norm": 0.6953125, "learning_rate": 5.207510132799436e-05, "loss": 0.5334, "step": 915 }, { "epoch": 3.48, "grad_norm": 0.80859375, "learning_rate": 5.091884039764321e-05, "loss": 0.481, "step": 920 }, { "epoch": 3.5, "grad_norm": 0.79296875, "learning_rate": 4.9771160021918305e-05, "loss": 0.5972, "step": 925 }, { "epoch": 3.52, "grad_norm": 0.66796875, "learning_rate": 4.8632260842659393e-05, "loss": 0.5062, "step": 930 }, { "epoch": 3.54, "grad_norm": 0.765625, "learning_rate": 4.7502341966544e-05, "loss": 0.5411, "step": 935 }, { "epoch": 3.56, "grad_norm": 0.765625, "learning_rate": 4.638160093027908e-05, "loss": 0.4579, "step": 940 }, { "epoch": 3.58, "grad_norm": 0.703125, "learning_rate": 4.527023366606679e-05, "loss": 0.5362, "step": 945 }, { "epoch": 3.6, "grad_norm": 0.859375, "learning_rate": 4.416843446735077e-05, "loss": 0.5176, "step": 950 }, { "epoch": 3.62, "grad_norm": 0.83984375, "learning_rate": 4.3076395954849236e-05, "loss": 0.5847, "step": 955 }, { "epoch": 3.64, "grad_norm": 0.6015625, "learning_rate": 4.19943090428802e-05, "loss": 0.4638, "step": 960 }, { "epoch": 3.66, "grad_norm": 0.859375, "learning_rate": 4.092236290598499e-05, "loss": 0.5305, "step": 965 }, { "epoch": 3.67, "grad_norm": 0.69140625, "learning_rate": 3.986074494585619e-05, "loss": 0.4742, "step": 970 }, { "epoch": 3.69, "grad_norm": 0.68359375, "learning_rate": 3.880964075857535e-05, "loss": 0.5312, "step": 975 }, { "epoch": 3.71, "grad_norm": 0.64453125, "learning_rate": 3.776923410216636e-05, "loss": 0.4508, "step": 980 }, { "epoch": 3.73, "grad_norm": 0.69140625, "learning_rate": 3.673970686447005e-05, "loss": 0.5022, "step": 985 }, { "epoch": 3.75, "grad_norm": 0.765625, "learning_rate": 3.5721239031346066e-05, "loss": 0.4655, "step": 990 }, { "epoch": 3.77, "grad_norm": 0.7734375, "learning_rate": 3.4714008655207e-05, "loss": 0.3936, "step": 995 }, { "epoch": 3.79, "grad_norm": 0.82421875, "learning_rate": 3.37181918238904e-05, "loss": 0.508, "step": 1000 }, { "epoch": 3.81, "grad_norm": 0.703125, "learning_rate": 3.273396262987475e-05, "loss": 0.4346, "step": 1005 }, { "epoch": 3.83, "grad_norm": 0.6640625, "learning_rate": 3.1761493139843735e-05, "loss": 0.4455, "step": 1010 }, { "epoch": 3.84, "grad_norm": 0.62109375, "learning_rate": 3.080095336460491e-05, "loss": 0.431, "step": 1015 }, { "epoch": 3.86, "grad_norm": 0.7265625, "learning_rate": 2.9852511229367865e-05, "loss": 0.4371, "step": 1020 }, { "epoch": 3.88, "grad_norm": 0.6875, "learning_rate": 2.891633254438685e-05, "loss": 0.4578, "step": 1025 }, { "epoch": 3.9, "grad_norm": 0.59375, "learning_rate": 2.7992580975973136e-05, "loss": 0.5402, "step": 1030 }, { "epoch": 3.92, "grad_norm": 0.6953125, "learning_rate": 2.70814180178823e-05, "loss": 0.4338, "step": 1035 }, { "epoch": 3.94, "grad_norm": 0.8046875, "learning_rate": 2.618300296308135e-05, "loss": 0.4541, "step": 1040 }, { "epoch": 3.96, "grad_norm": 0.640625, "learning_rate": 2.529749287590042e-05, "loss": 0.4433, "step": 1045 }, { "epoch": 3.98, "grad_norm": 0.640625, "learning_rate": 2.4425042564574184e-05, "loss": 0.4724, "step": 1050 }, { "epoch": 4.0, "grad_norm": 0.8125, "learning_rate": 2.356580455417776e-05, "loss": 0.4922, "step": 1055 }, { "epoch": 4.0, "eval_loss": 1.6053533554077148, "eval_runtime": 67.9802, "eval_samples_per_second": 3.898, "eval_steps_per_second": 0.5, "step": 1056 }, { "epoch": 4.02, "grad_norm": 0.73828125, "learning_rate": 2.2719929059961698e-05, "loss": 0.4008, "step": 1060 }, { "epoch": 4.03, "grad_norm": 0.5859375, "learning_rate": 2.1887563961090663e-05, "loss": 0.3496, "step": 1065 }, { "epoch": 4.05, "grad_norm": 0.62109375, "learning_rate": 2.106885477479078e-05, "loss": 0.486, "step": 1070 }, { "epoch": 4.07, "grad_norm": 0.671875, "learning_rate": 2.0263944630909738e-05, "loss": 0.3697, "step": 1075 }, { "epoch": 4.09, "grad_norm": 0.65625, "learning_rate": 1.947297424689414e-05, "loss": 0.4396, "step": 1080 }, { "epoch": 4.11, "grad_norm": 0.71484375, "learning_rate": 1.8696081903188955e-05, "loss": 0.3916, "step": 1085 }, { "epoch": 4.13, "grad_norm": 0.6640625, "learning_rate": 1.7933403419062688e-05, "loss": 0.4399, "step": 1090 }, { "epoch": 4.15, "grad_norm": 0.6953125, "learning_rate": 1.7185072128862933e-05, "loss": 0.486, "step": 1095 }, { "epoch": 4.17, "grad_norm": 0.6640625, "learning_rate": 1.6451218858706374e-05, "loss": 0.4195, "step": 1100 }, { "epoch": 4.19, "grad_norm": 0.609375, "learning_rate": 1.573197190360729e-05, "loss": 0.3948, "step": 1105 }, { "epoch": 4.2, "grad_norm": 0.70703125, "learning_rate": 1.5027457005048573e-05, "loss": 0.4698, "step": 1110 }, { "epoch": 4.22, "grad_norm": 0.6796875, "learning_rate": 1.433779732899897e-05, "loss": 0.3929, "step": 1115 }, { "epoch": 4.24, "grad_norm": 0.71484375, "learning_rate": 1.3663113444380905e-05, "loss": 0.3774, "step": 1120 }, { "epoch": 4.26, "grad_norm": 0.72265625, "learning_rate": 1.3003523301992104e-05, "loss": 0.495, "step": 1125 }, { "epoch": 4.28, "grad_norm": 0.63671875, "learning_rate": 1.2359142213884933e-05, "loss": 0.4335, "step": 1130 }, { "epoch": 4.3, "grad_norm": 0.640625, "learning_rate": 1.1730082833207202e-05, "loss": 0.358, "step": 1135 }, { "epoch": 4.32, "grad_norm": 0.57421875, "learning_rate": 1.1116455134507664e-05, "loss": 0.357, "step": 1140 }, { "epoch": 4.34, "grad_norm": 0.6875, "learning_rate": 1.0518366394509804e-05, "loss": 0.4767, "step": 1145 }, { "epoch": 4.36, "grad_norm": 0.63671875, "learning_rate": 9.935921173357442e-06, "loss": 0.4063, "step": 1150 }, { "epoch": 4.38, "grad_norm": 0.6484375, "learning_rate": 9.369221296335006e-06, "loss": 0.3668, "step": 1155 }, { "epoch": 4.39, "grad_norm": 0.85546875, "learning_rate": 8.818365836066101e-06, "loss": 0.3531, "step": 1160 }, { "epoch": 4.41, "grad_norm": 0.66015625, "learning_rate": 8.283451095193229e-06, "loss": 0.3754, "step": 1165 }, { "epoch": 4.43, "grad_norm": 0.734375, "learning_rate": 7.764570589541875e-06, "loss": 0.4278, "step": 1170 }, { "epoch": 4.45, "grad_norm": 0.73828125, "learning_rate": 7.261815031771602e-06, "loss": 0.4919, "step": 1175 }, { "epoch": 4.47, "grad_norm": 0.73046875, "learning_rate": 6.7752723155174226e-06, "loss": 0.4837, "step": 1180 }, { "epoch": 4.49, "grad_norm": 0.78125, "learning_rate": 6.3050275000238414e-06, "loss": 0.4196, "step": 1185 }, { "epoch": 4.51, "grad_norm": 0.671875, "learning_rate": 5.851162795274445e-06, "loss": 0.4928, "step": 1190 }, { "epoch": 4.53, "grad_norm": 0.84765625, "learning_rate": 5.413757547619747e-06, "loss": 0.4489, "step": 1195 }, { "epoch": 4.55, "grad_norm": 0.5390625, "learning_rate": 4.992888225905468e-06, "loss": 0.4083, "step": 1200 }, { "epoch": 4.56, "grad_norm": 0.734375, "learning_rate": 4.5886284081039675e-06, "loss": 0.5487, "step": 1205 }, { "epoch": 4.58, "grad_norm": 0.70703125, "learning_rate": 4.20104876845111e-06, "loss": 0.3817, "step": 1210 }, { "epoch": 4.6, "grad_norm": 0.68359375, "learning_rate": 3.830217065090702e-06, "loss": 0.4394, "step": 1215 }, { "epoch": 4.62, "grad_norm": 0.6640625, "learning_rate": 3.476198128228736e-06, "loss": 0.3811, "step": 1220 }, { "epoch": 4.64, "grad_norm": 0.6328125, "learning_rate": 3.139053848799556e-06, "loss": 0.3809, "step": 1225 }, { "epoch": 4.66, "grad_norm": 0.828125, "learning_rate": 2.818843167645835e-06, "loss": 0.3755, "step": 1230 }, { "epoch": 4.68, "grad_norm": 0.6640625, "learning_rate": 2.5156220652143404e-06, "loss": 0.4108, "step": 1235 }, { "epoch": 4.7, "grad_norm": 0.58984375, "learning_rate": 2.2294435517691503e-06, "loss": 0.5031, "step": 1240 }, { "epoch": 4.72, "grad_norm": 0.63671875, "learning_rate": 1.960357658124301e-06, "loss": 0.4176, "step": 1245 }, { "epoch": 4.73, "grad_norm": 0.64453125, "learning_rate": 1.7084114268971275e-06, "loss": 0.3757, "step": 1250 }, { "epoch": 4.75, "grad_norm": 0.60546875, "learning_rate": 1.4736489042840973e-06, "loss": 0.4244, "step": 1255 }, { "epoch": 4.77, "grad_norm": 0.640625, "learning_rate": 1.2561111323605712e-06, "loss": 0.3933, "step": 1260 }, { "epoch": 4.79, "grad_norm": 0.62109375, "learning_rate": 1.055836141905553e-06, "loss": 0.4223, "step": 1265 }, { "epoch": 4.81, "grad_norm": 0.62890625, "learning_rate": 8.728589457530855e-07, "loss": 0.3988, "step": 1270 }, { "epoch": 4.83, "grad_norm": 0.5078125, "learning_rate": 7.072115326711704e-07, "loss": 0.3916, "step": 1275 }, { "epoch": 4.85, "grad_norm": 0.67578125, "learning_rate": 5.589228617693288e-07, "loss": 0.3814, "step": 1280 }, { "epoch": 4.87, "grad_norm": 0.7578125, "learning_rate": 4.2801885743588567e-07, "loss": 0.4261, "step": 1285 }, { "epoch": 4.89, "grad_norm": 0.62890625, "learning_rate": 3.145224048057727e-07, "loss": 0.4018, "step": 1290 }, { "epoch": 4.91, "grad_norm": 0.60546875, "learning_rate": 2.1845334575963938e-07, "loss": 0.4824, "step": 1295 }, { "epoch": 4.92, "grad_norm": 0.734375, "learning_rate": 1.3982847545507271e-07, "loss": 0.4293, "step": 1300 }, { "epoch": 4.94, "grad_norm": 0.7734375, "learning_rate": 7.866153939033449e-08, "loss": 0.3955, "step": 1305 }, { "epoch": 4.96, "grad_norm": 0.80859375, "learning_rate": 3.496323100138366e-08, "loss": 0.3909, "step": 1310 }, { "epoch": 4.98, "grad_norm": 0.7734375, "learning_rate": 8.7411897923384e-09, "loss": 0.4108, "step": 1315 }, { "epoch": 5.0, "grad_norm": 0.66796875, "learning_rate": 0.0, "loss": 0.3396, "step": 1320 }, { "epoch": 5.0, "eval_loss": 1.6054625511169434, "eval_runtime": 67.9944, "eval_samples_per_second": 3.897, "eval_steps_per_second": 0.5, "step": 1320 }, { "epoch": 5.0, "step": 1320, "total_flos": 9.280995169497252e+17, "train_loss": 1.0955926315350966, "train_runtime": 10263.5013, "train_samples_per_second": 1.028, "train_steps_per_second": 0.129 } ], "logging_steps": 5, "max_steps": 1320, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "total_flos": 9.280995169497252e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }