{ "best_metric": null, "best_model_checkpoint": null, "epoch": 49.441786283891545, "eval_steps": 500, "global_step": 31000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 3.6600120067596436, "learning_rate": 0.0019993620414673046, "loss": 2.7717, "step": 10 }, { "epoch": 0.03, "grad_norm": 1.797737717628479, "learning_rate": 0.0019987240829346096, "loss": 2.2538, "step": 20 }, { "epoch": 0.05, "grad_norm": 1.503915548324585, "learning_rate": 0.001998086124401914, "loss": 2.5162, "step": 30 }, { "epoch": 0.06, "grad_norm": 2.3732383251190186, "learning_rate": 0.0019974481658692187, "loss": 2.7087, "step": 40 }, { "epoch": 0.08, "grad_norm": 4.7273030281066895, "learning_rate": 0.0019968102073365233, "loss": 2.3868, "step": 50 }, { "epoch": 0.1, "grad_norm": 4.214552402496338, "learning_rate": 0.001996172248803828, "loss": 2.4794, "step": 60 }, { "epoch": 0.11, "grad_norm": 5.231570243835449, "learning_rate": 0.0019955342902711324, "loss": 2.5851, "step": 70 }, { "epoch": 0.13, "grad_norm": 1.2388306856155396, "learning_rate": 0.001994896331738437, "loss": 2.3979, "step": 80 }, { "epoch": 0.14, "grad_norm": 5.343308448791504, "learning_rate": 0.001994258373205742, "loss": 2.5801, "step": 90 }, { "epoch": 0.16, "grad_norm": 5.010150909423828, "learning_rate": 0.0019936204146730465, "loss": 2.782, "step": 100 }, { "epoch": 0.18, "grad_norm": 4.422647953033447, "learning_rate": 0.001992982456140351, "loss": 2.5693, "step": 110 }, { "epoch": 0.19, "grad_norm": 2.5217037200927734, "learning_rate": 0.0019923444976076557, "loss": 2.6707, "step": 120 }, { "epoch": 0.21, "grad_norm": 4.945374011993408, "learning_rate": 0.0019917065390749602, "loss": 2.7236, "step": 130 }, { "epoch": 0.22, "grad_norm": 1.9850760698318481, "learning_rate": 0.001991068580542265, "loss": 2.5799, "step": 140 }, { "epoch": 0.24, "grad_norm": 5.4692206382751465, "learning_rate": 0.0019904306220095693, "loss": 2.6595, "step": 150 }, { "epoch": 0.26, "grad_norm": 2.3442928791046143, "learning_rate": 0.0019897926634768743, "loss": 2.8178, "step": 160 }, { "epoch": 0.27, "grad_norm": 1.8789024353027344, "learning_rate": 0.001989154704944179, "loss": 2.5928, "step": 170 }, { "epoch": 0.29, "grad_norm": 4.694440841674805, "learning_rate": 0.0019885167464114835, "loss": 2.7751, "step": 180 }, { "epoch": 0.3, "grad_norm": 6.304182529449463, "learning_rate": 0.001987878787878788, "loss": 2.5837, "step": 190 }, { "epoch": 0.32, "grad_norm": 5.747090816497803, "learning_rate": 0.0019872408293460926, "loss": 2.7569, "step": 200 }, { "epoch": 0.33, "grad_norm": 5.950802803039551, "learning_rate": 0.001986602870813397, "loss": 2.5465, "step": 210 }, { "epoch": 0.35, "grad_norm": 3.991403102874756, "learning_rate": 0.0019859649122807017, "loss": 2.3429, "step": 220 }, { "epoch": 0.37, "grad_norm": 15.909507751464844, "learning_rate": 0.0019853269537480063, "loss": 2.6886, "step": 230 }, { "epoch": 0.38, "grad_norm": 3.464792490005493, "learning_rate": 0.0019846889952153113, "loss": 2.6745, "step": 240 }, { "epoch": 0.4, "grad_norm": 2.647952079772949, "learning_rate": 0.001984051036682616, "loss": 2.666, "step": 250 }, { "epoch": 0.41, "grad_norm": 5.060638427734375, "learning_rate": 0.0019834130781499204, "loss": 2.502, "step": 260 }, { "epoch": 0.43, "grad_norm": 2.046036720275879, "learning_rate": 0.001982775119617225, "loss": 2.6536, "step": 270 }, { "epoch": 0.45, "grad_norm": 3.072054147720337, "learning_rate": 0.0019821371610845295, "loss": 2.7964, "step": 280 }, { "epoch": 0.46, "grad_norm": 4.118267059326172, "learning_rate": 0.001981499202551834, "loss": 2.7141, "step": 290 }, { "epoch": 0.48, "grad_norm": 6.113500118255615, "learning_rate": 0.0019808612440191387, "loss": 2.6802, "step": 300 }, { "epoch": 0.49, "grad_norm": 4.462850570678711, "learning_rate": 0.0019802232854864437, "loss": 2.611, "step": 310 }, { "epoch": 0.51, "grad_norm": 4.859583854675293, "learning_rate": 0.001979585326953748, "loss": 2.954, "step": 320 }, { "epoch": 0.53, "grad_norm": 2.9786384105682373, "learning_rate": 0.0019789473684210528, "loss": 2.7084, "step": 330 }, { "epoch": 0.54, "grad_norm": 2.122506618499756, "learning_rate": 0.0019783094098883573, "loss": 2.5917, "step": 340 }, { "epoch": 0.56, "grad_norm": 2.5119547843933105, "learning_rate": 0.001977671451355662, "loss": 2.7874, "step": 350 }, { "epoch": 0.57, "grad_norm": 5.032376289367676, "learning_rate": 0.0019770334928229665, "loss": 2.7288, "step": 360 }, { "epoch": 0.59, "grad_norm": 6.186531066894531, "learning_rate": 0.001976395534290271, "loss": 2.7858, "step": 370 }, { "epoch": 0.61, "grad_norm": 1.5598961114883423, "learning_rate": 0.001975757575757576, "loss": 2.8022, "step": 380 }, { "epoch": 0.62, "grad_norm": 3.7895145416259766, "learning_rate": 0.0019751196172248806, "loss": 2.9117, "step": 390 }, { "epoch": 0.64, "grad_norm": 9.171380043029785, "learning_rate": 0.001974481658692185, "loss": 2.8098, "step": 400 }, { "epoch": 0.65, "grad_norm": 3.090906858444214, "learning_rate": 0.0019738437001594897, "loss": 2.7351, "step": 410 }, { "epoch": 0.67, "grad_norm": 8.468884468078613, "learning_rate": 0.0019732057416267943, "loss": 2.868, "step": 420 }, { "epoch": 0.69, "grad_norm": 3.8941705226898193, "learning_rate": 0.001972567783094099, "loss": 2.9424, "step": 430 }, { "epoch": 0.7, "grad_norm": 2.694938898086548, "learning_rate": 0.0019719298245614034, "loss": 2.7966, "step": 440 }, { "epoch": 0.72, "grad_norm": 3.2257883548736572, "learning_rate": 0.0019712918660287084, "loss": 2.8685, "step": 450 }, { "epoch": 0.73, "grad_norm": 2.8044159412384033, "learning_rate": 0.001970653907496013, "loss": 2.9069, "step": 460 }, { "epoch": 0.75, "grad_norm": 3.731559991836548, "learning_rate": 0.0019700159489633175, "loss": 2.9604, "step": 470 }, { "epoch": 0.77, "grad_norm": 3.8493754863739014, "learning_rate": 0.001969377990430622, "loss": 3.1143, "step": 480 }, { "epoch": 0.78, "grad_norm": 2.380948781967163, "learning_rate": 0.0019687400318979266, "loss": 2.8833, "step": 490 }, { "epoch": 0.8, "grad_norm": 2.755617141723633, "learning_rate": 0.001968102073365231, "loss": 2.8669, "step": 500 }, { "epoch": 0.81, "grad_norm": 3.0759685039520264, "learning_rate": 0.0019674641148325358, "loss": 2.659, "step": 510 }, { "epoch": 0.83, "grad_norm": 2.367964744567871, "learning_rate": 0.0019668261562998408, "loss": 3.0106, "step": 520 }, { "epoch": 0.85, "grad_norm": 4.058963775634766, "learning_rate": 0.0019661881977671453, "loss": 2.8824, "step": 530 }, { "epoch": 0.86, "grad_norm": 1.9852606058120728, "learning_rate": 0.00196555023923445, "loss": 2.8171, "step": 540 }, { "epoch": 0.88, "grad_norm": 3.7825193405151367, "learning_rate": 0.0019649122807017545, "loss": 2.9357, "step": 550 }, { "epoch": 0.89, "grad_norm": 4.315491676330566, "learning_rate": 0.001964274322169059, "loss": 2.8921, "step": 560 }, { "epoch": 0.91, "grad_norm": 2.1023871898651123, "learning_rate": 0.0019636363636363636, "loss": 2.8568, "step": 570 }, { "epoch": 0.93, "grad_norm": 2.552720785140991, "learning_rate": 0.001962998405103668, "loss": 2.9858, "step": 580 }, { "epoch": 0.94, "grad_norm": 9.927117347717285, "learning_rate": 0.001962360446570973, "loss": 2.8938, "step": 590 }, { "epoch": 0.96, "grad_norm": 5.051787853240967, "learning_rate": 0.0019617224880382777, "loss": 2.7605, "step": 600 }, { "epoch": 0.97, "grad_norm": 3.3584322929382324, "learning_rate": 0.0019610845295055823, "loss": 2.7514, "step": 610 }, { "epoch": 0.99, "grad_norm": 1.7937440872192383, "learning_rate": 0.001960446570972887, "loss": 2.9748, "step": 620 }, { "epoch": 1.0, "grad_norm": 5.3826003074646, "learning_rate": 0.0019598086124401914, "loss": 2.7608, "step": 630 }, { "epoch": 1.02, "grad_norm": 4.8360724449157715, "learning_rate": 0.001959170653907496, "loss": 2.2607, "step": 640 }, { "epoch": 1.04, "grad_norm": 2.8146324157714844, "learning_rate": 0.0019585326953748005, "loss": 2.2973, "step": 650 }, { "epoch": 1.05, "grad_norm": 2.535956859588623, "learning_rate": 0.0019578947368421055, "loss": 2.2598, "step": 660 }, { "epoch": 1.07, "grad_norm": 4.664738655090332, "learning_rate": 0.00195725677830941, "loss": 2.3387, "step": 670 }, { "epoch": 1.08, "grad_norm": 3.6893537044525146, "learning_rate": 0.0019566188197767146, "loss": 2.249, "step": 680 }, { "epoch": 1.1, "grad_norm": 4.679712295532227, "learning_rate": 0.001955980861244019, "loss": 2.3068, "step": 690 }, { "epoch": 1.12, "grad_norm": 2.374504327774048, "learning_rate": 0.0019553429027113238, "loss": 2.5318, "step": 700 }, { "epoch": 1.13, "grad_norm": 3.3196609020233154, "learning_rate": 0.0019547049441786283, "loss": 2.4066, "step": 710 }, { "epoch": 1.15, "grad_norm": 5.714052200317383, "learning_rate": 0.001954066985645933, "loss": 2.4376, "step": 720 }, { "epoch": 1.16, "grad_norm": 6.177249431610107, "learning_rate": 0.001953429027113238, "loss": 2.2004, "step": 730 }, { "epoch": 1.18, "grad_norm": 2.936424970626831, "learning_rate": 0.0019527910685805422, "loss": 2.3424, "step": 740 }, { "epoch": 1.2, "grad_norm": 3.634345769882202, "learning_rate": 0.0019521531100478468, "loss": 2.5118, "step": 750 }, { "epoch": 1.21, "grad_norm": 1.8677217960357666, "learning_rate": 0.0019515151515151514, "loss": 2.3253, "step": 760 }, { "epoch": 1.23, "grad_norm": 2.1149327754974365, "learning_rate": 0.0019508771929824564, "loss": 2.3353, "step": 770 }, { "epoch": 1.24, "grad_norm": 4.144554138183594, "learning_rate": 0.001950239234449761, "loss": 2.6115, "step": 780 }, { "epoch": 1.26, "grad_norm": 4.8128814697265625, "learning_rate": 0.0019496012759170655, "loss": 2.5473, "step": 790 }, { "epoch": 1.28, "grad_norm": 2.2847745418548584, "learning_rate": 0.0019489633173843703, "loss": 2.4678, "step": 800 }, { "epoch": 1.29, "grad_norm": 4.829673767089844, "learning_rate": 0.0019483253588516748, "loss": 2.638, "step": 810 }, { "epoch": 1.31, "grad_norm": 2.4837145805358887, "learning_rate": 0.0019476874003189794, "loss": 2.4926, "step": 820 }, { "epoch": 1.32, "grad_norm": 2.7193591594696045, "learning_rate": 0.001947049441786284, "loss": 2.4601, "step": 830 }, { "epoch": 1.34, "grad_norm": 3.0565848350524902, "learning_rate": 0.0019464114832535887, "loss": 2.3951, "step": 840 }, { "epoch": 1.36, "grad_norm": 3.4434733390808105, "learning_rate": 0.0019457735247208933, "loss": 2.4247, "step": 850 }, { "epoch": 1.37, "grad_norm": 2.9921035766601562, "learning_rate": 0.0019451355661881979, "loss": 2.223, "step": 860 }, { "epoch": 1.39, "grad_norm": 4.521476745605469, "learning_rate": 0.0019444976076555026, "loss": 2.371, "step": 870 }, { "epoch": 1.4, "grad_norm": 1.987562656402588, "learning_rate": 0.0019438596491228072, "loss": 2.3996, "step": 880 }, { "epoch": 1.42, "grad_norm": 2.5876095294952393, "learning_rate": 0.0019432216905901118, "loss": 2.6735, "step": 890 }, { "epoch": 1.44, "grad_norm": 3.727102279663086, "learning_rate": 0.0019425837320574163, "loss": 2.3431, "step": 900 }, { "epoch": 1.45, "grad_norm": 2.775712728500366, "learning_rate": 0.001941945773524721, "loss": 2.5217, "step": 910 }, { "epoch": 1.47, "grad_norm": 4.316661357879639, "learning_rate": 0.0019413078149920257, "loss": 2.5654, "step": 920 }, { "epoch": 1.48, "grad_norm": 5.313731670379639, "learning_rate": 0.0019406698564593302, "loss": 2.5676, "step": 930 }, { "epoch": 1.5, "grad_norm": 3.3875491619110107, "learning_rate": 0.0019400318979266348, "loss": 2.701, "step": 940 }, { "epoch": 1.52, "grad_norm": 5.12388801574707, "learning_rate": 0.0019393939393939396, "loss": 2.6469, "step": 950 }, { "epoch": 1.53, "grad_norm": 5.213893890380859, "learning_rate": 0.0019387559808612441, "loss": 2.5527, "step": 960 }, { "epoch": 1.55, "grad_norm": 2.5714313983917236, "learning_rate": 0.0019381180223285487, "loss": 2.5112, "step": 970 }, { "epoch": 1.56, "grad_norm": 3.034376859664917, "learning_rate": 0.0019374800637958535, "loss": 2.7126, "step": 980 }, { "epoch": 1.58, "grad_norm": 4.801724910736084, "learning_rate": 0.001936842105263158, "loss": 2.6247, "step": 990 }, { "epoch": 1.59, "grad_norm": 2.8160829544067383, "learning_rate": 0.0019362041467304626, "loss": 2.5434, "step": 1000 }, { "epoch": 1.61, "grad_norm": 5.186509132385254, "learning_rate": 0.0019355661881977672, "loss": 2.6724, "step": 1010 }, { "epoch": 1.63, "grad_norm": 4.558096408843994, "learning_rate": 0.001934928229665072, "loss": 2.6487, "step": 1020 }, { "epoch": 1.64, "grad_norm": 4.698276519775391, "learning_rate": 0.0019342902711323765, "loss": 2.5392, "step": 1030 }, { "epoch": 1.66, "grad_norm": 3.624025821685791, "learning_rate": 0.001933652312599681, "loss": 2.6413, "step": 1040 }, { "epoch": 1.67, "grad_norm": 2.634162664413452, "learning_rate": 0.0019330143540669858, "loss": 2.7082, "step": 1050 }, { "epoch": 1.69, "grad_norm": 2.483462333679199, "learning_rate": 0.0019323763955342904, "loss": 2.716, "step": 1060 }, { "epoch": 1.71, "grad_norm": 3.256911277770996, "learning_rate": 0.001931738437001595, "loss": 2.7508, "step": 1070 }, { "epoch": 1.72, "grad_norm": 2.233299970626831, "learning_rate": 0.0019311004784688995, "loss": 2.7591, "step": 1080 }, { "epoch": 1.74, "grad_norm": 3.583534002304077, "learning_rate": 0.0019304625199362043, "loss": 2.5285, "step": 1090 }, { "epoch": 1.75, "grad_norm": 2.7138407230377197, "learning_rate": 0.0019298245614035089, "loss": 2.4167, "step": 1100 }, { "epoch": 1.77, "grad_norm": 4.423559665679932, "learning_rate": 0.0019291866028708134, "loss": 2.6601, "step": 1110 }, { "epoch": 1.79, "grad_norm": 4.424483776092529, "learning_rate": 0.0019285486443381182, "loss": 2.6824, "step": 1120 }, { "epoch": 1.8, "grad_norm": 1.5624890327453613, "learning_rate": 0.0019279106858054228, "loss": 2.7301, "step": 1130 }, { "epoch": 1.82, "grad_norm": 3.3539106845855713, "learning_rate": 0.0019272727272727273, "loss": 2.6942, "step": 1140 }, { "epoch": 1.83, "grad_norm": 3.6363375186920166, "learning_rate": 0.001926634768740032, "loss": 2.6089, "step": 1150 }, { "epoch": 1.85, "grad_norm": 4.147191047668457, "learning_rate": 0.0019259968102073367, "loss": 2.7383, "step": 1160 }, { "epoch": 1.87, "grad_norm": 2.932065486907959, "learning_rate": 0.0019253588516746412, "loss": 2.6189, "step": 1170 }, { "epoch": 1.88, "grad_norm": 4.373189449310303, "learning_rate": 0.0019247208931419458, "loss": 2.8137, "step": 1180 }, { "epoch": 1.9, "grad_norm": 3.7591166496276855, "learning_rate": 0.0019240829346092506, "loss": 2.7852, "step": 1190 }, { "epoch": 1.91, "grad_norm": 4.965326309204102, "learning_rate": 0.0019234449760765552, "loss": 2.6627, "step": 1200 }, { "epoch": 1.93, "grad_norm": 3.0201761722564697, "learning_rate": 0.0019228070175438597, "loss": 2.8458, "step": 1210 }, { "epoch": 1.95, "grad_norm": 4.8068695068359375, "learning_rate": 0.0019221690590111643, "loss": 2.6854, "step": 1220 }, { "epoch": 1.96, "grad_norm": 2.76481032371521, "learning_rate": 0.001921531100478469, "loss": 2.721, "step": 1230 }, { "epoch": 1.98, "grad_norm": 4.103845596313477, "learning_rate": 0.0019208931419457736, "loss": 2.7458, "step": 1240 }, { "epoch": 1.99, "grad_norm": 2.848653793334961, "learning_rate": 0.0019202551834130782, "loss": 2.8408, "step": 1250 }, { "epoch": 2.01, "grad_norm": 2.1055376529693604, "learning_rate": 0.0019196172248803827, "loss": 2.2232, "step": 1260 }, { "epoch": 2.03, "grad_norm": 2.4950308799743652, "learning_rate": 0.0019189792663476875, "loss": 1.9928, "step": 1270 }, { "epoch": 2.04, "grad_norm": 4.026719093322754, "learning_rate": 0.001918341307814992, "loss": 1.8887, "step": 1280 }, { "epoch": 2.06, "grad_norm": 2.439951181411743, "learning_rate": 0.0019177033492822966, "loss": 1.7856, "step": 1290 }, { "epoch": 2.07, "grad_norm": 3.8327765464782715, "learning_rate": 0.0019170653907496014, "loss": 1.9734, "step": 1300 }, { "epoch": 2.09, "grad_norm": 4.497558116912842, "learning_rate": 0.001916427432216906, "loss": 1.9576, "step": 1310 }, { "epoch": 2.11, "grad_norm": 4.017326831817627, "learning_rate": 0.0019157894736842106, "loss": 2.0879, "step": 1320 }, { "epoch": 2.12, "grad_norm": 5.959986209869385, "learning_rate": 0.0019151515151515151, "loss": 2.0042, "step": 1330 }, { "epoch": 2.14, "grad_norm": 2.2927639484405518, "learning_rate": 0.00191451355661882, "loss": 1.8132, "step": 1340 }, { "epoch": 2.15, "grad_norm": 6.273167133331299, "learning_rate": 0.0019138755980861245, "loss": 2.0617, "step": 1350 }, { "epoch": 2.17, "grad_norm": 2.9032981395721436, "learning_rate": 0.001913237639553429, "loss": 2.0173, "step": 1360 }, { "epoch": 2.19, "grad_norm": 5.651817798614502, "learning_rate": 0.0019125996810207338, "loss": 2.0464, "step": 1370 }, { "epoch": 2.2, "grad_norm": 4.344000339508057, "learning_rate": 0.0019119617224880384, "loss": 1.9377, "step": 1380 }, { "epoch": 2.22, "grad_norm": 3.4183313846588135, "learning_rate": 0.001911323763955343, "loss": 2.1948, "step": 1390 }, { "epoch": 2.23, "grad_norm": 8.772147178649902, "learning_rate": 0.0019106858054226475, "loss": 2.0348, "step": 1400 }, { "epoch": 2.25, "grad_norm": 2.020637273788452, "learning_rate": 0.0019100478468899523, "loss": 2.0723, "step": 1410 }, { "epoch": 2.26, "grad_norm": 4.4110565185546875, "learning_rate": 0.0019094098883572568, "loss": 1.9483, "step": 1420 }, { "epoch": 2.28, "grad_norm": 4.694215774536133, "learning_rate": 0.0019087719298245614, "loss": 2.0805, "step": 1430 }, { "epoch": 2.3, "grad_norm": 4.042151927947998, "learning_rate": 0.0019081339712918662, "loss": 2.0796, "step": 1440 }, { "epoch": 2.31, "grad_norm": 3.466386318206787, "learning_rate": 0.0019074960127591707, "loss": 2.049, "step": 1450 }, { "epoch": 2.33, "grad_norm": 5.676107406616211, "learning_rate": 0.0019068580542264753, "loss": 2.183, "step": 1460 }, { "epoch": 2.34, "grad_norm": 2.662849187850952, "learning_rate": 0.0019062200956937799, "loss": 2.0795, "step": 1470 }, { "epoch": 2.36, "grad_norm": 2.9790804386138916, "learning_rate": 0.0019055821371610846, "loss": 2.0264, "step": 1480 }, { "epoch": 2.38, "grad_norm": 5.430638790130615, "learning_rate": 0.0019049441786283892, "loss": 2.0712, "step": 1490 }, { "epoch": 2.39, "grad_norm": 3.5230486392974854, "learning_rate": 0.0019043062200956938, "loss": 2.1533, "step": 1500 }, { "epoch": 2.41, "grad_norm": 3.2345664501190186, "learning_rate": 0.0019036682615629985, "loss": 2.325, "step": 1510 }, { "epoch": 2.42, "grad_norm": 2.0495986938476562, "learning_rate": 0.001903030303030303, "loss": 2.225, "step": 1520 }, { "epoch": 2.44, "grad_norm": 4.178987979888916, "learning_rate": 0.0019023923444976077, "loss": 2.3576, "step": 1530 }, { "epoch": 2.46, "grad_norm": 4.338198184967041, "learning_rate": 0.0019017543859649122, "loss": 2.1247, "step": 1540 }, { "epoch": 2.47, "grad_norm": 2.644819736480713, "learning_rate": 0.001901116427432217, "loss": 2.3011, "step": 1550 }, { "epoch": 2.49, "grad_norm": 2.83943772315979, "learning_rate": 0.0019004784688995216, "loss": 2.3449, "step": 1560 }, { "epoch": 2.5, "grad_norm": 5.573853492736816, "learning_rate": 0.0018998405103668261, "loss": 2.1616, "step": 1570 }, { "epoch": 2.52, "grad_norm": 5.958674430847168, "learning_rate": 0.001899202551834131, "loss": 2.2724, "step": 1580 }, { "epoch": 2.54, "grad_norm": 4.136911392211914, "learning_rate": 0.0018985645933014355, "loss": 2.1089, "step": 1590 }, { "epoch": 2.55, "grad_norm": 2.5898241996765137, "learning_rate": 0.00189792663476874, "loss": 2.3466, "step": 1600 }, { "epoch": 2.57, "grad_norm": 3.101346015930176, "learning_rate": 0.0018972886762360446, "loss": 2.2233, "step": 1610 }, { "epoch": 2.58, "grad_norm": 3.3297476768493652, "learning_rate": 0.0018966507177033494, "loss": 2.1519, "step": 1620 }, { "epoch": 2.6, "grad_norm": 5.8525848388671875, "learning_rate": 0.001896012759170654, "loss": 2.4774, "step": 1630 }, { "epoch": 2.62, "grad_norm": 5.049089431762695, "learning_rate": 0.0018953748006379585, "loss": 2.2551, "step": 1640 }, { "epoch": 2.63, "grad_norm": 3.721668004989624, "learning_rate": 0.001894736842105263, "loss": 2.1791, "step": 1650 }, { "epoch": 2.65, "grad_norm": 3.234546661376953, "learning_rate": 0.0018940988835725679, "loss": 2.3578, "step": 1660 }, { "epoch": 2.66, "grad_norm": 3.65110445022583, "learning_rate": 0.0018934609250398724, "loss": 2.4003, "step": 1670 }, { "epoch": 2.68, "grad_norm": 4.681038856506348, "learning_rate": 0.001892822966507177, "loss": 2.3949, "step": 1680 }, { "epoch": 2.7, "grad_norm": 4.4321136474609375, "learning_rate": 0.0018921850079744818, "loss": 2.3568, "step": 1690 }, { "epoch": 2.71, "grad_norm": 3.075857162475586, "learning_rate": 0.0018915470494417863, "loss": 2.3458, "step": 1700 }, { "epoch": 2.73, "grad_norm": 2.5896382331848145, "learning_rate": 0.0018909090909090909, "loss": 2.4374, "step": 1710 }, { "epoch": 2.74, "grad_norm": 4.7238006591796875, "learning_rate": 0.0018902711323763954, "loss": 2.5829, "step": 1720 }, { "epoch": 2.76, "grad_norm": 2.7794413566589355, "learning_rate": 0.0018896331738437002, "loss": 2.2883, "step": 1730 }, { "epoch": 2.78, "grad_norm": 2.269745349884033, "learning_rate": 0.0018889952153110048, "loss": 2.1972, "step": 1740 }, { "epoch": 2.79, "grad_norm": 4.9795918464660645, "learning_rate": 0.0018883572567783093, "loss": 2.3769, "step": 1750 }, { "epoch": 2.81, "grad_norm": 2.1848745346069336, "learning_rate": 0.0018877192982456141, "loss": 2.3916, "step": 1760 }, { "epoch": 2.82, "grad_norm": 3.246695041656494, "learning_rate": 0.0018870813397129187, "loss": 2.3303, "step": 1770 }, { "epoch": 2.84, "grad_norm": 4.342026710510254, "learning_rate": 0.0018864433811802233, "loss": 2.3351, "step": 1780 }, { "epoch": 2.85, "grad_norm": 3.7110838890075684, "learning_rate": 0.0018858054226475278, "loss": 2.2379, "step": 1790 }, { "epoch": 2.87, "grad_norm": 3.2227859497070312, "learning_rate": 0.0018851674641148326, "loss": 2.3734, "step": 1800 }, { "epoch": 2.89, "grad_norm": 3.257556915283203, "learning_rate": 0.0018845295055821372, "loss": 2.533, "step": 1810 }, { "epoch": 2.9, "grad_norm": 3.287235975265503, "learning_rate": 0.0018838915470494417, "loss": 2.4149, "step": 1820 }, { "epoch": 2.92, "grad_norm": 5.332248210906982, "learning_rate": 0.0018832535885167465, "loss": 2.4045, "step": 1830 }, { "epoch": 2.93, "grad_norm": 6.954147815704346, "learning_rate": 0.001882615629984051, "loss": 2.4621, "step": 1840 }, { "epoch": 2.95, "grad_norm": 3.5278656482696533, "learning_rate": 0.0018819776714513556, "loss": 2.3869, "step": 1850 }, { "epoch": 2.97, "grad_norm": 5.101337909698486, "learning_rate": 0.0018813397129186602, "loss": 2.507, "step": 1860 }, { "epoch": 2.98, "grad_norm": 3.60355544090271, "learning_rate": 0.001880701754385965, "loss": 2.409, "step": 1870 }, { "epoch": 3.0, "grad_norm": 4.093048572540283, "learning_rate": 0.0018800637958532695, "loss": 2.2993, "step": 1880 }, { "epoch": 3.01, "grad_norm": 2.8062260150909424, "learning_rate": 0.001879425837320574, "loss": 1.7359, "step": 1890 }, { "epoch": 3.03, "grad_norm": 6.747288227081299, "learning_rate": 0.0018787878787878789, "loss": 1.6095, "step": 1900 }, { "epoch": 3.05, "grad_norm": 4.082904815673828, "learning_rate": 0.0018781499202551834, "loss": 1.69, "step": 1910 }, { "epoch": 3.06, "grad_norm": 1.8365858793258667, "learning_rate": 0.001877511961722488, "loss": 1.5815, "step": 1920 }, { "epoch": 3.08, "grad_norm": 2.940593719482422, "learning_rate": 0.0018768740031897926, "loss": 1.6635, "step": 1930 }, { "epoch": 3.09, "grad_norm": 2.660888433456421, "learning_rate": 0.0018762360446570973, "loss": 1.632, "step": 1940 }, { "epoch": 3.11, "grad_norm": 3.061300039291382, "learning_rate": 0.001875598086124402, "loss": 1.6964, "step": 1950 }, { "epoch": 3.13, "grad_norm": 3.134197235107422, "learning_rate": 0.0018749601275917065, "loss": 1.6702, "step": 1960 }, { "epoch": 3.14, "grad_norm": 5.188543796539307, "learning_rate": 0.001874322169059011, "loss": 1.7615, "step": 1970 }, { "epoch": 3.16, "grad_norm": 3.115239381790161, "learning_rate": 0.0018736842105263158, "loss": 1.8079, "step": 1980 }, { "epoch": 3.17, "grad_norm": 4.397618770599365, "learning_rate": 0.0018730462519936204, "loss": 1.8256, "step": 1990 }, { "epoch": 3.19, "grad_norm": 6.745879650115967, "learning_rate": 0.001872408293460925, "loss": 1.6285, "step": 2000 }, { "epoch": 3.21, "grad_norm": 4.609273433685303, "learning_rate": 0.0018717703349282297, "loss": 1.8885, "step": 2010 }, { "epoch": 3.22, "grad_norm": 2.650247097015381, "learning_rate": 0.0018711323763955343, "loss": 1.6762, "step": 2020 }, { "epoch": 3.24, "grad_norm": 5.857548713684082, "learning_rate": 0.0018704944178628388, "loss": 1.7824, "step": 2030 }, { "epoch": 3.25, "grad_norm": 3.2646751403808594, "learning_rate": 0.0018698564593301434, "loss": 1.8078, "step": 2040 }, { "epoch": 3.27, "grad_norm": 3.6167776584625244, "learning_rate": 0.0018692185007974482, "loss": 1.7395, "step": 2050 }, { "epoch": 3.29, "grad_norm": 3.98301100730896, "learning_rate": 0.0018685805422647527, "loss": 1.811, "step": 2060 }, { "epoch": 3.3, "grad_norm": 5.3117594718933105, "learning_rate": 0.0018679425837320573, "loss": 1.7647, "step": 2070 }, { "epoch": 3.32, "grad_norm": 6.290541172027588, "learning_rate": 0.001867304625199362, "loss": 1.8698, "step": 2080 }, { "epoch": 3.33, "grad_norm": 6.5661091804504395, "learning_rate": 0.0018666666666666666, "loss": 2.0072, "step": 2090 }, { "epoch": 3.35, "grad_norm": 6.150557994842529, "learning_rate": 0.0018660287081339712, "loss": 1.8055, "step": 2100 }, { "epoch": 3.37, "grad_norm": 3.677581310272217, "learning_rate": 0.0018653907496012758, "loss": 1.8711, "step": 2110 }, { "epoch": 3.38, "grad_norm": 2.2296063899993896, "learning_rate": 0.0018647527910685806, "loss": 1.8763, "step": 2120 }, { "epoch": 3.4, "grad_norm": 3.410414695739746, "learning_rate": 0.0018641148325358851, "loss": 1.8947, "step": 2130 }, { "epoch": 3.41, "grad_norm": 3.566406726837158, "learning_rate": 0.0018634768740031897, "loss": 1.9423, "step": 2140 }, { "epoch": 3.43, "grad_norm": 5.5341668128967285, "learning_rate": 0.0018628389154704945, "loss": 2.0945, "step": 2150 }, { "epoch": 3.44, "grad_norm": 4.542388439178467, "learning_rate": 0.001862200956937799, "loss": 1.9816, "step": 2160 }, { "epoch": 3.46, "grad_norm": 3.3940858840942383, "learning_rate": 0.0018615629984051036, "loss": 1.9789, "step": 2170 }, { "epoch": 3.48, "grad_norm": 3.9412808418273926, "learning_rate": 0.0018609250398724081, "loss": 2.1294, "step": 2180 }, { "epoch": 3.49, "grad_norm": 2.695256233215332, "learning_rate": 0.001860287081339713, "loss": 2.0233, "step": 2190 }, { "epoch": 3.51, "grad_norm": 3.1621010303497314, "learning_rate": 0.0018596491228070175, "loss": 1.8246, "step": 2200 }, { "epoch": 3.52, "grad_norm": 5.293850898742676, "learning_rate": 0.001859011164274322, "loss": 2.0004, "step": 2210 }, { "epoch": 3.54, "grad_norm": 3.9184532165527344, "learning_rate": 0.0018583732057416268, "loss": 2.1011, "step": 2220 }, { "epoch": 3.56, "grad_norm": 2.1356756687164307, "learning_rate": 0.0018577352472089314, "loss": 2.1129, "step": 2230 }, { "epoch": 3.57, "grad_norm": 3.8817296028137207, "learning_rate": 0.001857097288676236, "loss": 2.0047, "step": 2240 }, { "epoch": 3.59, "grad_norm": 3.2533388137817383, "learning_rate": 0.0018564593301435405, "loss": 1.9137, "step": 2250 }, { "epoch": 3.6, "grad_norm": 3.3586273193359375, "learning_rate": 0.0018558213716108455, "loss": 2.065, "step": 2260 }, { "epoch": 3.62, "grad_norm": 4.144857406616211, "learning_rate": 0.00185518341307815, "loss": 2.0735, "step": 2270 }, { "epoch": 3.64, "grad_norm": 3.9639623165130615, "learning_rate": 0.0018545454545454546, "loss": 2.118, "step": 2280 }, { "epoch": 3.65, "grad_norm": 3.5141801834106445, "learning_rate": 0.0018539074960127592, "loss": 2.0006, "step": 2290 }, { "epoch": 3.67, "grad_norm": 3.2397677898406982, "learning_rate": 0.001853269537480064, "loss": 2.1261, "step": 2300 }, { "epoch": 3.68, "grad_norm": 5.273965835571289, "learning_rate": 0.0018526315789473685, "loss": 2.0134, "step": 2310 }, { "epoch": 3.7, "grad_norm": 4.7644805908203125, "learning_rate": 0.001851993620414673, "loss": 1.9781, "step": 2320 }, { "epoch": 3.72, "grad_norm": 3.042400598526001, "learning_rate": 0.0018513556618819779, "loss": 2.0429, "step": 2330 }, { "epoch": 3.73, "grad_norm": 4.666615009307861, "learning_rate": 0.0018507177033492824, "loss": 2.052, "step": 2340 }, { "epoch": 3.75, "grad_norm": 2.8000500202178955, "learning_rate": 0.001850079744816587, "loss": 1.8426, "step": 2350 }, { "epoch": 3.76, "grad_norm": 4.616471767425537, "learning_rate": 0.0018494417862838916, "loss": 2.1656, "step": 2360 }, { "epoch": 3.78, "grad_norm": 4.575398921966553, "learning_rate": 0.0018488038277511964, "loss": 2.176, "step": 2370 }, { "epoch": 3.8, "grad_norm": 4.790685176849365, "learning_rate": 0.001848165869218501, "loss": 2.2702, "step": 2380 }, { "epoch": 3.81, "grad_norm": 4.581923007965088, "learning_rate": 0.0018475279106858055, "loss": 2.1888, "step": 2390 }, { "epoch": 3.83, "grad_norm": 4.997535705566406, "learning_rate": 0.0018468899521531103, "loss": 2.0964, "step": 2400 }, { "epoch": 3.84, "grad_norm": 3.024472951889038, "learning_rate": 0.0018462519936204148, "loss": 1.9286, "step": 2410 }, { "epoch": 3.86, "grad_norm": 3.9244346618652344, "learning_rate": 0.0018456140350877194, "loss": 2.0429, "step": 2420 }, { "epoch": 3.88, "grad_norm": 5.021399974822998, "learning_rate": 0.001844976076555024, "loss": 2.0872, "step": 2430 }, { "epoch": 3.89, "grad_norm": 2.4256746768951416, "learning_rate": 0.0018443381180223287, "loss": 2.0974, "step": 2440 }, { "epoch": 3.91, "grad_norm": 1.7723888158798218, "learning_rate": 0.0018437001594896333, "loss": 2.1741, "step": 2450 }, { "epoch": 3.92, "grad_norm": 2.9281272888183594, "learning_rate": 0.0018430622009569379, "loss": 2.0566, "step": 2460 }, { "epoch": 3.94, "grad_norm": 3.0242364406585693, "learning_rate": 0.0018424242424242426, "loss": 2.2248, "step": 2470 }, { "epoch": 3.96, "grad_norm": 3.027165651321411, "learning_rate": 0.0018417862838915472, "loss": 2.0762, "step": 2480 }, { "epoch": 3.97, "grad_norm": 4.249027729034424, "learning_rate": 0.0018411483253588518, "loss": 2.1017, "step": 2490 }, { "epoch": 3.99, "grad_norm": 3.3154234886169434, "learning_rate": 0.0018405103668261563, "loss": 2.0766, "step": 2500 }, { "epoch": 4.0, "grad_norm": 1.4245625734329224, "learning_rate": 0.001839872408293461, "loss": 1.9103, "step": 2510 }, { "epoch": 4.02, "grad_norm": 1.513168454170227, "learning_rate": 0.0018392344497607657, "loss": 1.4412, "step": 2520 }, { "epoch": 4.04, "grad_norm": 4.4338507652282715, "learning_rate": 0.0018385964912280702, "loss": 1.3431, "step": 2530 }, { "epoch": 4.05, "grad_norm": 4.030521869659424, "learning_rate": 0.001837958532695375, "loss": 1.4325, "step": 2540 }, { "epoch": 4.07, "grad_norm": 4.0168137550354, "learning_rate": 0.0018373205741626796, "loss": 1.4083, "step": 2550 }, { "epoch": 4.08, "grad_norm": 5.304862022399902, "learning_rate": 0.0018366826156299841, "loss": 1.5336, "step": 2560 }, { "epoch": 4.1, "grad_norm": 3.5825703144073486, "learning_rate": 0.0018360446570972887, "loss": 1.4663, "step": 2570 }, { "epoch": 4.11, "grad_norm": 3.8972997665405273, "learning_rate": 0.0018354066985645935, "loss": 1.5203, "step": 2580 }, { "epoch": 4.13, "grad_norm": 5.68231725692749, "learning_rate": 0.001834768740031898, "loss": 1.6814, "step": 2590 }, { "epoch": 4.15, "grad_norm": 3.8971197605133057, "learning_rate": 0.0018341307814992026, "loss": 1.3574, "step": 2600 }, { "epoch": 4.16, "grad_norm": 3.819286346435547, "learning_rate": 0.0018334928229665074, "loss": 1.4937, "step": 2610 }, { "epoch": 4.18, "grad_norm": 3.3106937408447266, "learning_rate": 0.001832854864433812, "loss": 1.5814, "step": 2620 }, { "epoch": 4.19, "grad_norm": 5.2803754806518555, "learning_rate": 0.0018322169059011165, "loss": 1.641, "step": 2630 }, { "epoch": 4.21, "grad_norm": 4.728196620941162, "learning_rate": 0.001831578947368421, "loss": 1.5647, "step": 2640 }, { "epoch": 4.23, "grad_norm": 3.2671823501586914, "learning_rate": 0.0018309409888357258, "loss": 1.7673, "step": 2650 }, { "epoch": 4.24, "grad_norm": 2.539050579071045, "learning_rate": 0.0018303030303030304, "loss": 1.5397, "step": 2660 }, { "epoch": 4.26, "grad_norm": 2.7646982669830322, "learning_rate": 0.001829665071770335, "loss": 1.4788, "step": 2670 }, { "epoch": 4.27, "grad_norm": 3.103675603866577, "learning_rate": 0.0018290271132376395, "loss": 1.5428, "step": 2680 }, { "epoch": 4.29, "grad_norm": 5.560327053070068, "learning_rate": 0.0018283891547049443, "loss": 1.5389, "step": 2690 }, { "epoch": 4.31, "grad_norm": 3.355659246444702, "learning_rate": 0.0018277511961722489, "loss": 1.6061, "step": 2700 }, { "epoch": 4.32, "grad_norm": 5.579101085662842, "learning_rate": 0.0018271132376395534, "loss": 1.6554, "step": 2710 }, { "epoch": 4.34, "grad_norm": 4.465839862823486, "learning_rate": 0.0018264752791068582, "loss": 1.7019, "step": 2720 }, { "epoch": 4.35, "grad_norm": 2.9531333446502686, "learning_rate": 0.0018258373205741628, "loss": 1.6395, "step": 2730 }, { "epoch": 4.37, "grad_norm": 3.912163257598877, "learning_rate": 0.0018251993620414673, "loss": 1.7232, "step": 2740 }, { "epoch": 4.39, "grad_norm": 3.955035924911499, "learning_rate": 0.001824561403508772, "loss": 1.6601, "step": 2750 }, { "epoch": 4.4, "grad_norm": 5.796784400939941, "learning_rate": 0.0018239234449760767, "loss": 1.7742, "step": 2760 }, { "epoch": 4.42, "grad_norm": 3.470076322555542, "learning_rate": 0.0018232854864433812, "loss": 1.7226, "step": 2770 }, { "epoch": 4.43, "grad_norm": 4.716192245483398, "learning_rate": 0.0018226475279106858, "loss": 1.6537, "step": 2780 }, { "epoch": 4.45, "grad_norm": 4.586126327514648, "learning_rate": 0.0018220095693779906, "loss": 1.6319, "step": 2790 }, { "epoch": 4.47, "grad_norm": 4.049830913543701, "learning_rate": 0.0018213716108452952, "loss": 1.8295, "step": 2800 }, { "epoch": 4.48, "grad_norm": 2.4487478733062744, "learning_rate": 0.0018207336523125997, "loss": 1.9085, "step": 2810 }, { "epoch": 4.5, "grad_norm": 3.3505730628967285, "learning_rate": 0.0018200956937799043, "loss": 1.7341, "step": 2820 }, { "epoch": 4.51, "grad_norm": 3.655205011367798, "learning_rate": 0.001819457735247209, "loss": 1.7667, "step": 2830 }, { "epoch": 4.53, "grad_norm": 4.730507850646973, "learning_rate": 0.0018188197767145136, "loss": 1.6444, "step": 2840 }, { "epoch": 4.55, "grad_norm": 3.635011911392212, "learning_rate": 0.0018181818181818182, "loss": 1.7706, "step": 2850 }, { "epoch": 4.56, "grad_norm": 2.98230242729187, "learning_rate": 0.001817543859649123, "loss": 1.7443, "step": 2860 }, { "epoch": 4.58, "grad_norm": 2.706557035446167, "learning_rate": 0.0018169059011164275, "loss": 1.738, "step": 2870 }, { "epoch": 4.59, "grad_norm": 5.715457439422607, "learning_rate": 0.001816267942583732, "loss": 1.9244, "step": 2880 }, { "epoch": 4.61, "grad_norm": 4.379674911499023, "learning_rate": 0.0018156299840510366, "loss": 1.8655, "step": 2890 }, { "epoch": 4.63, "grad_norm": 3.1540908813476562, "learning_rate": 0.0018149920255183414, "loss": 1.8485, "step": 2900 }, { "epoch": 4.64, "grad_norm": 4.2252373695373535, "learning_rate": 0.001814354066985646, "loss": 1.8581, "step": 2910 }, { "epoch": 4.66, "grad_norm": 2.00207781791687, "learning_rate": 0.0018137161084529506, "loss": 2.0877, "step": 2920 }, { "epoch": 4.67, "grad_norm": 2.710052013397217, "learning_rate": 0.0018130781499202553, "loss": 1.6488, "step": 2930 }, { "epoch": 4.69, "grad_norm": 5.69435453414917, "learning_rate": 0.00181244019138756, "loss": 1.8438, "step": 2940 }, { "epoch": 4.7, "grad_norm": 3.163170576095581, "learning_rate": 0.0018118022328548645, "loss": 1.8168, "step": 2950 }, { "epoch": 4.72, "grad_norm": 2.5819127559661865, "learning_rate": 0.001811164274322169, "loss": 1.8733, "step": 2960 }, { "epoch": 4.74, "grad_norm": 3.780280351638794, "learning_rate": 0.0018105263157894738, "loss": 1.7007, "step": 2970 }, { "epoch": 4.75, "grad_norm": 4.294229030609131, "learning_rate": 0.0018098883572567784, "loss": 1.838, "step": 2980 }, { "epoch": 4.77, "grad_norm": 4.328463077545166, "learning_rate": 0.001809250398724083, "loss": 1.8721, "step": 2990 }, { "epoch": 4.78, "grad_norm": 3.204005241394043, "learning_rate": 0.0018086124401913875, "loss": 1.8796, "step": 3000 }, { "epoch": 4.8, "grad_norm": 6.089762210845947, "learning_rate": 0.0018079744816586923, "loss": 1.8764, "step": 3010 }, { "epoch": 4.82, "grad_norm": 5.21134090423584, "learning_rate": 0.0018073365231259968, "loss": 1.8615, "step": 3020 }, { "epoch": 4.83, "grad_norm": 5.567359924316406, "learning_rate": 0.0018066985645933014, "loss": 1.9394, "step": 3030 }, { "epoch": 4.85, "grad_norm": 3.8925669193267822, "learning_rate": 0.0018060606060606062, "loss": 1.9556, "step": 3040 }, { "epoch": 4.86, "grad_norm": 3.123612642288208, "learning_rate": 0.0018054226475279107, "loss": 1.8685, "step": 3050 }, { "epoch": 4.88, "grad_norm": 3.970958709716797, "learning_rate": 0.0018047846889952153, "loss": 1.8955, "step": 3060 }, { "epoch": 4.9, "grad_norm": 4.519131660461426, "learning_rate": 0.0018041467304625199, "loss": 1.9885, "step": 3070 }, { "epoch": 4.91, "grad_norm": 3.834430456161499, "learning_rate": 0.0018035087719298246, "loss": 1.9572, "step": 3080 }, { "epoch": 4.93, "grad_norm": 5.614201068878174, "learning_rate": 0.0018028708133971292, "loss": 1.9366, "step": 3090 }, { "epoch": 4.94, "grad_norm": 3.8492119312286377, "learning_rate": 0.0018022328548644338, "loss": 1.8337, "step": 3100 }, { "epoch": 4.96, "grad_norm": 5.122296333312988, "learning_rate": 0.0018015948963317385, "loss": 1.9093, "step": 3110 }, { "epoch": 4.98, "grad_norm": 3.0235679149627686, "learning_rate": 0.001800956937799043, "loss": 1.8341, "step": 3120 }, { "epoch": 4.99, "grad_norm": 3.4031426906585693, "learning_rate": 0.0018003189792663477, "loss": 1.8851, "step": 3130 }, { "epoch": 5.01, "grad_norm": 3.6404995918273926, "learning_rate": 0.0017996810207336522, "loss": 1.4797, "step": 3140 }, { "epoch": 5.02, "grad_norm": 3.4057798385620117, "learning_rate": 0.001799043062200957, "loss": 1.1585, "step": 3150 }, { "epoch": 5.04, "grad_norm": 3.314164161682129, "learning_rate": 0.0017984051036682616, "loss": 1.3921, "step": 3160 }, { "epoch": 5.06, "grad_norm": 4.028993606567383, "learning_rate": 0.0017977671451355661, "loss": 1.2938, "step": 3170 }, { "epoch": 5.07, "grad_norm": 4.128094673156738, "learning_rate": 0.001797129186602871, "loss": 1.2979, "step": 3180 }, { "epoch": 5.09, "grad_norm": 3.079228639602661, "learning_rate": 0.0017964912280701755, "loss": 1.292, "step": 3190 }, { "epoch": 5.1, "grad_norm": 4.176467418670654, "learning_rate": 0.00179585326953748, "loss": 1.528, "step": 3200 }, { "epoch": 5.12, "grad_norm": 3.689857244491577, "learning_rate": 0.0017952153110047846, "loss": 1.2832, "step": 3210 }, { "epoch": 5.14, "grad_norm": 3.580005645751953, "learning_rate": 0.0017945773524720894, "loss": 1.3447, "step": 3220 }, { "epoch": 5.15, "grad_norm": 3.8672592639923096, "learning_rate": 0.001793939393939394, "loss": 1.234, "step": 3230 }, { "epoch": 5.17, "grad_norm": 3.5929276943206787, "learning_rate": 0.0017933014354066985, "loss": 1.2326, "step": 3240 }, { "epoch": 5.18, "grad_norm": 3.1610376834869385, "learning_rate": 0.0017926634768740033, "loss": 1.3224, "step": 3250 }, { "epoch": 5.2, "grad_norm": 3.908184289932251, "learning_rate": 0.0017920255183413079, "loss": 1.4446, "step": 3260 }, { "epoch": 5.22, "grad_norm": 4.269443511962891, "learning_rate": 0.0017913875598086124, "loss": 1.4286, "step": 3270 }, { "epoch": 5.23, "grad_norm": 3.0032732486724854, "learning_rate": 0.001790749601275917, "loss": 1.3768, "step": 3280 }, { "epoch": 5.25, "grad_norm": 3.841958522796631, "learning_rate": 0.0017901116427432218, "loss": 1.3441, "step": 3290 }, { "epoch": 5.26, "grad_norm": 3.0527617931365967, "learning_rate": 0.0017894736842105263, "loss": 1.4091, "step": 3300 }, { "epoch": 5.28, "grad_norm": 3.266508102416992, "learning_rate": 0.0017888357256778309, "loss": 1.3933, "step": 3310 }, { "epoch": 5.3, "grad_norm": 4.250580310821533, "learning_rate": 0.0017881977671451357, "loss": 1.5105, "step": 3320 }, { "epoch": 5.31, "grad_norm": 3.375892162322998, "learning_rate": 0.0017875598086124402, "loss": 1.4668, "step": 3330 }, { "epoch": 5.33, "grad_norm": 4.1522297859191895, "learning_rate": 0.0017869218500797448, "loss": 1.4022, "step": 3340 }, { "epoch": 5.34, "grad_norm": 5.130900859832764, "learning_rate": 0.0017862838915470493, "loss": 1.4457, "step": 3350 }, { "epoch": 5.36, "grad_norm": 3.176265239715576, "learning_rate": 0.0017856459330143541, "loss": 1.5113, "step": 3360 }, { "epoch": 5.37, "grad_norm": 5.0800557136535645, "learning_rate": 0.0017850079744816587, "loss": 1.5258, "step": 3370 }, { "epoch": 5.39, "grad_norm": 4.9642534255981445, "learning_rate": 0.0017843700159489633, "loss": 1.4101, "step": 3380 }, { "epoch": 5.41, "grad_norm": 4.7204270362854, "learning_rate": 0.0017837320574162678, "loss": 1.5812, "step": 3390 }, { "epoch": 5.42, "grad_norm": 3.163360834121704, "learning_rate": 0.0017830940988835726, "loss": 1.5647, "step": 3400 }, { "epoch": 5.44, "grad_norm": 5.122838973999023, "learning_rate": 0.0017824561403508772, "loss": 1.5151, "step": 3410 }, { "epoch": 5.45, "grad_norm": 3.543826103210449, "learning_rate": 0.0017818181818181817, "loss": 1.4134, "step": 3420 }, { "epoch": 5.47, "grad_norm": 3.4644534587860107, "learning_rate": 0.0017811802232854865, "loss": 1.6267, "step": 3430 }, { "epoch": 5.49, "grad_norm": 4.8260040283203125, "learning_rate": 0.001780542264752791, "loss": 1.4154, "step": 3440 }, { "epoch": 5.5, "grad_norm": 2.8876681327819824, "learning_rate": 0.0017799043062200956, "loss": 1.675, "step": 3450 }, { "epoch": 5.52, "grad_norm": 2.8691539764404297, "learning_rate": 0.0017792663476874002, "loss": 1.6627, "step": 3460 }, { "epoch": 5.53, "grad_norm": 4.810047626495361, "learning_rate": 0.001778628389154705, "loss": 1.4778, "step": 3470 }, { "epoch": 5.55, "grad_norm": 6.102086067199707, "learning_rate": 0.0017779904306220095, "loss": 1.597, "step": 3480 }, { "epoch": 5.57, "grad_norm": 2.5562939643859863, "learning_rate": 0.001777352472089314, "loss": 1.6155, "step": 3490 }, { "epoch": 5.58, "grad_norm": 4.305008888244629, "learning_rate": 0.0017767145135566189, "loss": 1.6084, "step": 3500 }, { "epoch": 5.6, "grad_norm": 3.545440673828125, "learning_rate": 0.0017760765550239234, "loss": 1.7705, "step": 3510 }, { "epoch": 5.61, "grad_norm": 3.9225101470947266, "learning_rate": 0.001775438596491228, "loss": 1.7765, "step": 3520 }, { "epoch": 5.63, "grad_norm": 3.6406924724578857, "learning_rate": 0.0017748006379585326, "loss": 1.5835, "step": 3530 }, { "epoch": 5.65, "grad_norm": 3.9222354888916016, "learning_rate": 0.0017741626794258373, "loss": 1.625, "step": 3540 }, { "epoch": 5.66, "grad_norm": 3.7696895599365234, "learning_rate": 0.001773524720893142, "loss": 1.7595, "step": 3550 }, { "epoch": 5.68, "grad_norm": 3.927811622619629, "learning_rate": 0.0017728867623604465, "loss": 1.5485, "step": 3560 }, { "epoch": 5.69, "grad_norm": 5.417560577392578, "learning_rate": 0.0017722488038277512, "loss": 1.7188, "step": 3570 }, { "epoch": 5.71, "grad_norm": 3.9058241844177246, "learning_rate": 0.0017716108452950558, "loss": 1.5654, "step": 3580 }, { "epoch": 5.73, "grad_norm": 3.2079246044158936, "learning_rate": 0.0017709728867623604, "loss": 1.8261, "step": 3590 }, { "epoch": 5.74, "grad_norm": 3.8155903816223145, "learning_rate": 0.001770334928229665, "loss": 1.5338, "step": 3600 }, { "epoch": 5.76, "grad_norm": 4.09771203994751, "learning_rate": 0.0017696969696969697, "loss": 1.7654, "step": 3610 }, { "epoch": 5.77, "grad_norm": 3.3263423442840576, "learning_rate": 0.0017690590111642743, "loss": 1.5809, "step": 3620 }, { "epoch": 5.79, "grad_norm": 4.113112926483154, "learning_rate": 0.0017684210526315788, "loss": 1.7932, "step": 3630 }, { "epoch": 5.81, "grad_norm": 4.0192694664001465, "learning_rate": 0.0017677830940988836, "loss": 1.7115, "step": 3640 }, { "epoch": 5.82, "grad_norm": 3.165609121322632, "learning_rate": 0.0017671451355661882, "loss": 1.4939, "step": 3650 }, { "epoch": 5.84, "grad_norm": 3.859196186065674, "learning_rate": 0.0017665071770334927, "loss": 1.7767, "step": 3660 }, { "epoch": 5.85, "grad_norm": 4.11074686050415, "learning_rate": 0.0017658692185007973, "loss": 1.6754, "step": 3670 }, { "epoch": 5.87, "grad_norm": 2.926147937774658, "learning_rate": 0.001765231259968102, "loss": 1.6904, "step": 3680 }, { "epoch": 5.89, "grad_norm": 4.121160507202148, "learning_rate": 0.0017645933014354066, "loss": 1.575, "step": 3690 }, { "epoch": 5.9, "grad_norm": 4.0827131271362305, "learning_rate": 0.0017639553429027112, "loss": 1.6392, "step": 3700 }, { "epoch": 5.92, "grad_norm": 4.13917875289917, "learning_rate": 0.0017633173843700158, "loss": 1.6323, "step": 3710 }, { "epoch": 5.93, "grad_norm": 3.052493095397949, "learning_rate": 0.0017626794258373206, "loss": 1.8932, "step": 3720 }, { "epoch": 5.95, "grad_norm": 5.432674407958984, "learning_rate": 0.0017620414673046251, "loss": 1.58, "step": 3730 }, { "epoch": 5.96, "grad_norm": 6.6524505615234375, "learning_rate": 0.0017614035087719297, "loss": 1.6361, "step": 3740 }, { "epoch": 5.98, "grad_norm": 7.488154888153076, "learning_rate": 0.0017607655502392347, "loss": 1.823, "step": 3750 }, { "epoch": 6.0, "grad_norm": 3.211604118347168, "learning_rate": 0.0017601275917065392, "loss": 1.8265, "step": 3760 }, { "epoch": 6.01, "grad_norm": 2.5021958351135254, "learning_rate": 0.0017594896331738438, "loss": 1.3958, "step": 3770 }, { "epoch": 6.03, "grad_norm": 3.7297511100769043, "learning_rate": 0.0017588516746411484, "loss": 1.1193, "step": 3780 }, { "epoch": 6.04, "grad_norm": 4.050276279449463, "learning_rate": 0.0017582137161084531, "loss": 1.2042, "step": 3790 }, { "epoch": 6.06, "grad_norm": 4.484896659851074, "learning_rate": 0.0017575757575757577, "loss": 1.2483, "step": 3800 }, { "epoch": 6.08, "grad_norm": 7.920963764190674, "learning_rate": 0.0017569377990430623, "loss": 1.2832, "step": 3810 }, { "epoch": 6.09, "grad_norm": 2.772211790084839, "learning_rate": 0.001756299840510367, "loss": 1.1111, "step": 3820 }, { "epoch": 6.11, "grad_norm": 3.2087087631225586, "learning_rate": 0.0017556618819776716, "loss": 1.108, "step": 3830 }, { "epoch": 6.12, "grad_norm": 3.650775194168091, "learning_rate": 0.0017550239234449762, "loss": 1.2609, "step": 3840 }, { "epoch": 6.14, "grad_norm": 3.6753830909729004, "learning_rate": 0.0017543859649122807, "loss": 1.1581, "step": 3850 }, { "epoch": 6.16, "grad_norm": 3.568274974822998, "learning_rate": 0.0017537480063795855, "loss": 1.2661, "step": 3860 }, { "epoch": 6.17, "grad_norm": 3.6179471015930176, "learning_rate": 0.00175311004784689, "loss": 1.092, "step": 3870 }, { "epoch": 6.19, "grad_norm": 2.885768413543701, "learning_rate": 0.0017524720893141946, "loss": 1.1903, "step": 3880 }, { "epoch": 6.2, "grad_norm": 2.389308214187622, "learning_rate": 0.0017518341307814994, "loss": 1.1977, "step": 3890 }, { "epoch": 6.22, "grad_norm": 4.820352554321289, "learning_rate": 0.001751196172248804, "loss": 1.3104, "step": 3900 }, { "epoch": 6.24, "grad_norm": 2.8304367065429688, "learning_rate": 0.0017505582137161085, "loss": 1.3522, "step": 3910 }, { "epoch": 6.25, "grad_norm": 4.1842732429504395, "learning_rate": 0.001749920255183413, "loss": 1.2901, "step": 3920 }, { "epoch": 6.27, "grad_norm": 4.698485851287842, "learning_rate": 0.0017492822966507179, "loss": 1.2261, "step": 3930 }, { "epoch": 6.28, "grad_norm": 5.434518814086914, "learning_rate": 0.0017486443381180224, "loss": 1.3348, "step": 3940 }, { "epoch": 6.3, "grad_norm": 4.726064682006836, "learning_rate": 0.001748006379585327, "loss": 1.308, "step": 3950 }, { "epoch": 6.32, "grad_norm": 3.2794930934906006, "learning_rate": 0.0017473684210526318, "loss": 1.232, "step": 3960 }, { "epoch": 6.33, "grad_norm": 4.248810768127441, "learning_rate": 0.0017467304625199364, "loss": 1.2118, "step": 3970 }, { "epoch": 6.35, "grad_norm": 5.226914882659912, "learning_rate": 0.001746092503987241, "loss": 1.273, "step": 3980 }, { "epoch": 6.36, "grad_norm": 4.917492866516113, "learning_rate": 0.0017454545454545455, "loss": 1.2566, "step": 3990 }, { "epoch": 6.38, "grad_norm": 6.164140224456787, "learning_rate": 0.0017448165869218503, "loss": 1.4062, "step": 4000 }, { "epoch": 6.4, "grad_norm": 4.235147953033447, "learning_rate": 0.0017441786283891548, "loss": 1.2668, "step": 4010 }, { "epoch": 6.41, "grad_norm": 4.627178192138672, "learning_rate": 0.0017435406698564594, "loss": 1.2448, "step": 4020 }, { "epoch": 6.43, "grad_norm": 5.603235721588135, "learning_rate": 0.0017429027113237642, "loss": 1.4769, "step": 4030 }, { "epoch": 6.44, "grad_norm": 2.3861303329467773, "learning_rate": 0.0017422647527910687, "loss": 1.6294, "step": 4040 }, { "epoch": 6.46, "grad_norm": 3.891209840774536, "learning_rate": 0.0017416267942583733, "loss": 1.3206, "step": 4050 }, { "epoch": 6.48, "grad_norm": 2.741506576538086, "learning_rate": 0.0017409888357256779, "loss": 1.3517, "step": 4060 }, { "epoch": 6.49, "grad_norm": 3.152433156967163, "learning_rate": 0.0017403508771929826, "loss": 1.2853, "step": 4070 }, { "epoch": 6.51, "grad_norm": 3.4589314460754395, "learning_rate": 0.0017397129186602872, "loss": 1.4094, "step": 4080 }, { "epoch": 6.52, "grad_norm": 6.630537033081055, "learning_rate": 0.0017390749601275918, "loss": 1.3614, "step": 4090 }, { "epoch": 6.54, "grad_norm": 3.220771551132202, "learning_rate": 0.0017384370015948963, "loss": 1.3354, "step": 4100 }, { "epoch": 6.56, "grad_norm": 2.8003170490264893, "learning_rate": 0.001737799043062201, "loss": 1.398, "step": 4110 }, { "epoch": 6.57, "grad_norm": 5.145318984985352, "learning_rate": 0.0017371610845295057, "loss": 1.4861, "step": 4120 }, { "epoch": 6.59, "grad_norm": 3.6889803409576416, "learning_rate": 0.0017365231259968102, "loss": 1.3687, "step": 4130 }, { "epoch": 6.6, "grad_norm": 3.3676440715789795, "learning_rate": 0.001735885167464115, "loss": 1.3656, "step": 4140 }, { "epoch": 6.62, "grad_norm": 4.406673431396484, "learning_rate": 0.0017352472089314196, "loss": 1.4002, "step": 4150 }, { "epoch": 6.63, "grad_norm": 4.088317394256592, "learning_rate": 0.0017346092503987241, "loss": 1.4265, "step": 4160 }, { "epoch": 6.65, "grad_norm": 4.677865982055664, "learning_rate": 0.0017339712918660287, "loss": 1.5231, "step": 4170 }, { "epoch": 6.67, "grad_norm": 4.6024322509765625, "learning_rate": 0.0017333333333333335, "loss": 1.5746, "step": 4180 }, { "epoch": 6.68, "grad_norm": 4.752773284912109, "learning_rate": 0.001732695374800638, "loss": 1.3851, "step": 4190 }, { "epoch": 6.7, "grad_norm": 3.870704412460327, "learning_rate": 0.0017320574162679426, "loss": 1.4281, "step": 4200 }, { "epoch": 6.71, "grad_norm": 3.5712807178497314, "learning_rate": 0.0017314194577352474, "loss": 1.6449, "step": 4210 }, { "epoch": 6.73, "grad_norm": 3.4372332096099854, "learning_rate": 0.001730781499202552, "loss": 1.5439, "step": 4220 }, { "epoch": 6.75, "grad_norm": 5.638207912445068, "learning_rate": 0.0017301435406698565, "loss": 1.5426, "step": 4230 }, { "epoch": 6.76, "grad_norm": 5.095453262329102, "learning_rate": 0.001729505582137161, "loss": 1.4252, "step": 4240 }, { "epoch": 6.78, "grad_norm": 2.4728281497955322, "learning_rate": 0.0017288676236044658, "loss": 1.4374, "step": 4250 }, { "epoch": 6.79, "grad_norm": 3.4558870792388916, "learning_rate": 0.0017282296650717704, "loss": 1.4457, "step": 4260 }, { "epoch": 6.81, "grad_norm": 3.5765767097473145, "learning_rate": 0.001727591706539075, "loss": 1.5364, "step": 4270 }, { "epoch": 6.83, "grad_norm": 4.479535102844238, "learning_rate": 0.0017269537480063797, "loss": 1.561, "step": 4280 }, { "epoch": 6.84, "grad_norm": 3.1493709087371826, "learning_rate": 0.0017263157894736843, "loss": 1.4131, "step": 4290 }, { "epoch": 6.86, "grad_norm": 4.1836042404174805, "learning_rate": 0.0017256778309409889, "loss": 1.5031, "step": 4300 }, { "epoch": 6.87, "grad_norm": 3.2860119342803955, "learning_rate": 0.0017250398724082934, "loss": 1.6286, "step": 4310 }, { "epoch": 6.89, "grad_norm": 2.8824214935302734, "learning_rate": 0.0017244019138755982, "loss": 1.5281, "step": 4320 }, { "epoch": 6.91, "grad_norm": 5.243397235870361, "learning_rate": 0.0017237639553429028, "loss": 1.5868, "step": 4330 }, { "epoch": 6.92, "grad_norm": 2.8732147216796875, "learning_rate": 0.0017231259968102073, "loss": 1.5813, "step": 4340 }, { "epoch": 6.94, "grad_norm": 4.3689494132995605, "learning_rate": 0.0017224880382775121, "loss": 1.5706, "step": 4350 }, { "epoch": 6.95, "grad_norm": 4.520773887634277, "learning_rate": 0.0017218500797448167, "loss": 1.5769, "step": 4360 }, { "epoch": 6.97, "grad_norm": 3.988919734954834, "learning_rate": 0.0017212121212121212, "loss": 1.6688, "step": 4370 }, { "epoch": 6.99, "grad_norm": 3.1639842987060547, "learning_rate": 0.0017205741626794258, "loss": 1.5045, "step": 4380 }, { "epoch": 7.0, "grad_norm": 1.8472672700881958, "learning_rate": 0.0017199362041467306, "loss": 1.4904, "step": 4390 }, { "epoch": 7.02, "grad_norm": 3.472080945968628, "learning_rate": 0.0017192982456140352, "loss": 1.0386, "step": 4400 }, { "epoch": 7.03, "grad_norm": 3.553772211074829, "learning_rate": 0.0017186602870813397, "loss": 0.9889, "step": 4410 }, { "epoch": 7.05, "grad_norm": 4.417268753051758, "learning_rate": 0.0017180223285486443, "loss": 0.9838, "step": 4420 }, { "epoch": 7.07, "grad_norm": 5.340514659881592, "learning_rate": 0.001717384370015949, "loss": 1.0057, "step": 4430 }, { "epoch": 7.08, "grad_norm": 3.2239015102386475, "learning_rate": 0.0017167464114832536, "loss": 1.1003, "step": 4440 }, { "epoch": 7.1, "grad_norm": 3.5991039276123047, "learning_rate": 0.0017161084529505582, "loss": 1.0384, "step": 4450 }, { "epoch": 7.11, "grad_norm": 3.8520448207855225, "learning_rate": 0.001715470494417863, "loss": 0.9872, "step": 4460 }, { "epoch": 7.13, "grad_norm": 3.489706516265869, "learning_rate": 0.0017148325358851675, "loss": 1.1452, "step": 4470 }, { "epoch": 7.15, "grad_norm": 2.60661244392395, "learning_rate": 0.001714194577352472, "loss": 1.0007, "step": 4480 }, { "epoch": 7.16, "grad_norm": 5.66582727432251, "learning_rate": 0.0017135566188197766, "loss": 1.1072, "step": 4490 }, { "epoch": 7.18, "grad_norm": 4.794973373413086, "learning_rate": 0.0017129186602870814, "loss": 1.1994, "step": 4500 }, { "epoch": 7.19, "grad_norm": 5.310514450073242, "learning_rate": 0.001712280701754386, "loss": 1.104, "step": 4510 }, { "epoch": 7.21, "grad_norm": 3.0956227779388428, "learning_rate": 0.0017116427432216906, "loss": 1.0067, "step": 4520 }, { "epoch": 7.22, "grad_norm": 4.637990474700928, "learning_rate": 0.0017110047846889953, "loss": 1.0235, "step": 4530 }, { "epoch": 7.24, "grad_norm": 2.7805848121643066, "learning_rate": 0.0017103668261563, "loss": 1.2051, "step": 4540 }, { "epoch": 7.26, "grad_norm": 4.313024520874023, "learning_rate": 0.0017097288676236045, "loss": 1.1176, "step": 4550 }, { "epoch": 7.27, "grad_norm": 3.135601282119751, "learning_rate": 0.001709090909090909, "loss": 1.2627, "step": 4560 }, { "epoch": 7.29, "grad_norm": 3.0150604248046875, "learning_rate": 0.0017084529505582138, "loss": 1.0905, "step": 4570 }, { "epoch": 7.3, "grad_norm": 3.5915615558624268, "learning_rate": 0.0017078149920255184, "loss": 1.0766, "step": 4580 }, { "epoch": 7.32, "grad_norm": 3.981519937515259, "learning_rate": 0.001707177033492823, "loss": 1.2256, "step": 4590 }, { "epoch": 7.34, "grad_norm": 3.5225601196289062, "learning_rate": 0.0017065390749601277, "loss": 1.2533, "step": 4600 }, { "epoch": 7.35, "grad_norm": 4.971328258514404, "learning_rate": 0.0017059011164274323, "loss": 1.2287, "step": 4610 }, { "epoch": 7.37, "grad_norm": 3.6815969944000244, "learning_rate": 0.0017052631578947368, "loss": 1.1224, "step": 4620 }, { "epoch": 7.38, "grad_norm": 2.50472354888916, "learning_rate": 0.0017046251993620414, "loss": 1.3059, "step": 4630 }, { "epoch": 7.4, "grad_norm": 2.376018524169922, "learning_rate": 0.0017039872408293462, "loss": 1.3294, "step": 4640 }, { "epoch": 7.42, "grad_norm": 3.935692548751831, "learning_rate": 0.0017033492822966507, "loss": 1.3172, "step": 4650 }, { "epoch": 7.43, "grad_norm": 2.974992513656616, "learning_rate": 0.0017027113237639553, "loss": 1.2235, "step": 4660 }, { "epoch": 7.45, "grad_norm": 6.238065242767334, "learning_rate": 0.00170207336523126, "loss": 1.2355, "step": 4670 }, { "epoch": 7.46, "grad_norm": 4.15529727935791, "learning_rate": 0.0017014354066985646, "loss": 1.1975, "step": 4680 }, { "epoch": 7.48, "grad_norm": 3.069063663482666, "learning_rate": 0.0017007974481658692, "loss": 1.0709, "step": 4690 }, { "epoch": 7.5, "grad_norm": 3.6260762214660645, "learning_rate": 0.0017001594896331738, "loss": 1.2178, "step": 4700 }, { "epoch": 7.51, "grad_norm": 3.0013301372528076, "learning_rate": 0.0016995215311004785, "loss": 1.1398, "step": 4710 }, { "epoch": 7.53, "grad_norm": 2.0015666484832764, "learning_rate": 0.001698883572567783, "loss": 1.3866, "step": 4720 }, { "epoch": 7.54, "grad_norm": 3.997130870819092, "learning_rate": 0.0016982456140350877, "loss": 1.3066, "step": 4730 }, { "epoch": 7.56, "grad_norm": 3.5671958923339844, "learning_rate": 0.0016976076555023924, "loss": 1.0524, "step": 4740 }, { "epoch": 7.58, "grad_norm": 2.9513649940490723, "learning_rate": 0.001696969696969697, "loss": 1.173, "step": 4750 }, { "epoch": 7.59, "grad_norm": 3.9709384441375732, "learning_rate": 0.0016963317384370016, "loss": 1.4574, "step": 4760 }, { "epoch": 7.61, "grad_norm": 4.372689723968506, "learning_rate": 0.0016956937799043061, "loss": 1.4686, "step": 4770 }, { "epoch": 7.62, "grad_norm": 3.748054265975952, "learning_rate": 0.001695055821371611, "loss": 1.4393, "step": 4780 }, { "epoch": 7.64, "grad_norm": 3.7790236473083496, "learning_rate": 0.0016944178628389155, "loss": 1.2965, "step": 4790 }, { "epoch": 7.66, "grad_norm": 4.572340965270996, "learning_rate": 0.00169377990430622, "loss": 1.5793, "step": 4800 }, { "epoch": 7.67, "grad_norm": 3.838794231414795, "learning_rate": 0.0016931419457735246, "loss": 1.2758, "step": 4810 }, { "epoch": 7.69, "grad_norm": 3.9073917865753174, "learning_rate": 0.0016925039872408294, "loss": 1.2657, "step": 4820 }, { "epoch": 7.7, "grad_norm": 3.6725800037384033, "learning_rate": 0.001691866028708134, "loss": 1.1664, "step": 4830 }, { "epoch": 7.72, "grad_norm": 2.742488384246826, "learning_rate": 0.0016912280701754385, "loss": 1.3705, "step": 4840 }, { "epoch": 7.74, "grad_norm": 5.307029724121094, "learning_rate": 0.0016905901116427433, "loss": 1.3447, "step": 4850 }, { "epoch": 7.75, "grad_norm": 3.2814066410064697, "learning_rate": 0.0016899521531100479, "loss": 1.2705, "step": 4860 }, { "epoch": 7.77, "grad_norm": 4.674114227294922, "learning_rate": 0.0016893141945773524, "loss": 1.3062, "step": 4870 }, { "epoch": 7.78, "grad_norm": 3.455000638961792, "learning_rate": 0.001688676236044657, "loss": 1.3788, "step": 4880 }, { "epoch": 7.8, "grad_norm": 3.7969977855682373, "learning_rate": 0.0016880382775119618, "loss": 1.4107, "step": 4890 }, { "epoch": 7.81, "grad_norm": 4.002437591552734, "learning_rate": 0.0016874003189792663, "loss": 1.3215, "step": 4900 }, { "epoch": 7.83, "grad_norm": 2.509416103363037, "learning_rate": 0.0016867623604465709, "loss": 1.2652, "step": 4910 }, { "epoch": 7.85, "grad_norm": 2.7716715335845947, "learning_rate": 0.0016861244019138757, "loss": 1.4095, "step": 4920 }, { "epoch": 7.86, "grad_norm": 5.537817001342773, "learning_rate": 0.0016854864433811802, "loss": 1.2998, "step": 4930 }, { "epoch": 7.88, "grad_norm": 3.2739720344543457, "learning_rate": 0.0016848484848484848, "loss": 1.3311, "step": 4940 }, { "epoch": 7.89, "grad_norm": 3.1102712154388428, "learning_rate": 0.0016842105263157893, "loss": 1.2811, "step": 4950 }, { "epoch": 7.91, "grad_norm": 4.807369709014893, "learning_rate": 0.0016835725677830941, "loss": 1.4339, "step": 4960 }, { "epoch": 7.93, "grad_norm": 8.400796890258789, "learning_rate": 0.0016829346092503987, "loss": 1.5323, "step": 4970 }, { "epoch": 7.94, "grad_norm": 3.6073365211486816, "learning_rate": 0.0016822966507177033, "loss": 1.5019, "step": 4980 }, { "epoch": 7.96, "grad_norm": 3.30039644241333, "learning_rate": 0.001681658692185008, "loss": 1.4698, "step": 4990 }, { "epoch": 7.97, "grad_norm": 3.7990474700927734, "learning_rate": 0.0016810207336523126, "loss": 1.5408, "step": 5000 }, { "epoch": 7.99, "grad_norm": 4.0094499588012695, "learning_rate": 0.0016803827751196172, "loss": 1.4195, "step": 5010 }, { "epoch": 8.01, "grad_norm": 2.0265750885009766, "learning_rate": 0.0016797448165869217, "loss": 1.181, "step": 5020 }, { "epoch": 8.02, "grad_norm": 2.840583086013794, "learning_rate": 0.0016791068580542265, "loss": 0.8481, "step": 5030 }, { "epoch": 8.04, "grad_norm": 2.409465789794922, "learning_rate": 0.001678468899521531, "loss": 0.9815, "step": 5040 }, { "epoch": 8.05, "grad_norm": 5.790297031402588, "learning_rate": 0.0016778309409888356, "loss": 0.9621, "step": 5050 }, { "epoch": 8.07, "grad_norm": 2.9008841514587402, "learning_rate": 0.0016771929824561404, "loss": 0.9734, "step": 5060 }, { "epoch": 8.09, "grad_norm": 2.495950937271118, "learning_rate": 0.001676555023923445, "loss": 0.9353, "step": 5070 }, { "epoch": 8.1, "grad_norm": 3.870645523071289, "learning_rate": 0.0016759170653907495, "loss": 1.023, "step": 5080 }, { "epoch": 8.12, "grad_norm": 2.352860450744629, "learning_rate": 0.001675279106858054, "loss": 0.9064, "step": 5090 }, { "epoch": 8.13, "grad_norm": 3.9795637130737305, "learning_rate": 0.0016746411483253589, "loss": 1.0378, "step": 5100 }, { "epoch": 8.15, "grad_norm": 3.728628396987915, "learning_rate": 0.0016740031897926634, "loss": 0.8394, "step": 5110 }, { "epoch": 8.17, "grad_norm": 4.232802391052246, "learning_rate": 0.001673365231259968, "loss": 1.0115, "step": 5120 }, { "epoch": 8.18, "grad_norm": 4.09517765045166, "learning_rate": 0.0016727272727272726, "loss": 1.0139, "step": 5130 }, { "epoch": 8.2, "grad_norm": 2.101757287979126, "learning_rate": 0.0016720893141945773, "loss": 1.0698, "step": 5140 }, { "epoch": 8.21, "grad_norm": 4.124992370605469, "learning_rate": 0.001671451355661882, "loss": 1.0214, "step": 5150 }, { "epoch": 8.23, "grad_norm": 4.000357151031494, "learning_rate": 0.0016708133971291865, "loss": 1.0302, "step": 5160 }, { "epoch": 8.25, "grad_norm": 4.556628704071045, "learning_rate": 0.0016701754385964912, "loss": 1.1185, "step": 5170 }, { "epoch": 8.26, "grad_norm": 4.288385391235352, "learning_rate": 0.0016695374800637958, "loss": 1.1112, "step": 5180 }, { "epoch": 8.28, "grad_norm": 3.5157744884490967, "learning_rate": 0.0016688995215311004, "loss": 1.0085, "step": 5190 }, { "epoch": 8.29, "grad_norm": 4.492936611175537, "learning_rate": 0.001668261562998405, "loss": 0.9928, "step": 5200 }, { "epoch": 8.31, "grad_norm": 2.415928363800049, "learning_rate": 0.0016676236044657097, "loss": 1.0586, "step": 5210 }, { "epoch": 8.33, "grad_norm": 4.508685111999512, "learning_rate": 0.0016669856459330143, "loss": 1.1178, "step": 5220 }, { "epoch": 8.34, "grad_norm": 6.090748310089111, "learning_rate": 0.0016663476874003188, "loss": 1.0743, "step": 5230 }, { "epoch": 8.36, "grad_norm": 4.638314723968506, "learning_rate": 0.0016657097288676238, "loss": 1.1763, "step": 5240 }, { "epoch": 8.37, "grad_norm": 3.5104875564575195, "learning_rate": 0.0016650717703349284, "loss": 1.0157, "step": 5250 }, { "epoch": 8.39, "grad_norm": 3.11543869972229, "learning_rate": 0.001664433811802233, "loss": 1.1719, "step": 5260 }, { "epoch": 8.41, "grad_norm": 3.698253631591797, "learning_rate": 0.0016637958532695375, "loss": 1.3834, "step": 5270 }, { "epoch": 8.42, "grad_norm": 4.070870876312256, "learning_rate": 0.0016631578947368423, "loss": 1.2485, "step": 5280 }, { "epoch": 8.44, "grad_norm": 3.328082323074341, "learning_rate": 0.0016625199362041469, "loss": 1.0835, "step": 5290 }, { "epoch": 8.45, "grad_norm": 4.319711208343506, "learning_rate": 0.0016618819776714514, "loss": 1.1718, "step": 5300 }, { "epoch": 8.47, "grad_norm": 3.040421485900879, "learning_rate": 0.0016612440191387562, "loss": 1.1825, "step": 5310 }, { "epoch": 8.48, "grad_norm": 3.8478896617889404, "learning_rate": 0.0016606060606060608, "loss": 1.2003, "step": 5320 }, { "epoch": 8.5, "grad_norm": 3.5098345279693604, "learning_rate": 0.0016599681020733653, "loss": 1.3229, "step": 5330 }, { "epoch": 8.52, "grad_norm": 1.7316700220108032, "learning_rate": 0.00165933014354067, "loss": 1.1779, "step": 5340 }, { "epoch": 8.53, "grad_norm": 2.3097381591796875, "learning_rate": 0.0016586921850079747, "loss": 1.0794, "step": 5350 }, { "epoch": 8.55, "grad_norm": 2.1922430992126465, "learning_rate": 0.0016580542264752792, "loss": 1.2068, "step": 5360 }, { "epoch": 8.56, "grad_norm": 5.0043864250183105, "learning_rate": 0.0016574162679425838, "loss": 1.2837, "step": 5370 }, { "epoch": 8.58, "grad_norm": 4.01829195022583, "learning_rate": 0.0016567783094098886, "loss": 1.0479, "step": 5380 }, { "epoch": 8.6, "grad_norm": 3.4249794483184814, "learning_rate": 0.0016561403508771931, "loss": 1.3172, "step": 5390 }, { "epoch": 8.61, "grad_norm": 3.52347993850708, "learning_rate": 0.0016555023923444977, "loss": 1.2796, "step": 5400 }, { "epoch": 8.63, "grad_norm": 3.3641083240509033, "learning_rate": 0.0016548644338118023, "loss": 1.2373, "step": 5410 }, { "epoch": 8.64, "grad_norm": 4.422176361083984, "learning_rate": 0.001654226475279107, "loss": 1.1601, "step": 5420 }, { "epoch": 8.66, "grad_norm": 4.531619071960449, "learning_rate": 0.0016535885167464116, "loss": 1.1254, "step": 5430 }, { "epoch": 8.68, "grad_norm": 2.5183939933776855, "learning_rate": 0.0016529505582137162, "loss": 1.2257, "step": 5440 }, { "epoch": 8.69, "grad_norm": 2.493967294692993, "learning_rate": 0.001652312599681021, "loss": 1.1369, "step": 5450 }, { "epoch": 8.71, "grad_norm": 4.203963279724121, "learning_rate": 0.0016516746411483255, "loss": 1.0819, "step": 5460 }, { "epoch": 8.72, "grad_norm": 4.204017639160156, "learning_rate": 0.00165103668261563, "loss": 1.2126, "step": 5470 }, { "epoch": 8.74, "grad_norm": 5.039621353149414, "learning_rate": 0.0016503987240829346, "loss": 1.265, "step": 5480 }, { "epoch": 8.76, "grad_norm": 2.5682952404022217, "learning_rate": 0.0016497607655502394, "loss": 1.2403, "step": 5490 }, { "epoch": 8.77, "grad_norm": 2.821531057357788, "learning_rate": 0.001649122807017544, "loss": 1.3182, "step": 5500 }, { "epoch": 8.79, "grad_norm": 3.4752848148345947, "learning_rate": 0.0016484848484848485, "loss": 1.264, "step": 5510 }, { "epoch": 8.8, "grad_norm": 2.5011346340179443, "learning_rate": 0.001647846889952153, "loss": 1.1792, "step": 5520 }, { "epoch": 8.82, "grad_norm": 4.323322772979736, "learning_rate": 0.0016472089314194579, "loss": 1.1745, "step": 5530 }, { "epoch": 8.84, "grad_norm": 4.5369768142700195, "learning_rate": 0.0016465709728867624, "loss": 1.2837, "step": 5540 }, { "epoch": 8.85, "grad_norm": 4.292603492736816, "learning_rate": 0.001645933014354067, "loss": 1.3317, "step": 5550 }, { "epoch": 8.87, "grad_norm": 2.312387228012085, "learning_rate": 0.0016452950558213718, "loss": 1.2081, "step": 5560 }, { "epoch": 8.88, "grad_norm": 3.759363889694214, "learning_rate": 0.0016446570972886764, "loss": 1.2549, "step": 5570 }, { "epoch": 8.9, "grad_norm": 4.08116340637207, "learning_rate": 0.001644019138755981, "loss": 1.2823, "step": 5580 }, { "epoch": 8.92, "grad_norm": 3.29032039642334, "learning_rate": 0.0016433811802232855, "loss": 1.2339, "step": 5590 }, { "epoch": 8.93, "grad_norm": 4.082303524017334, "learning_rate": 0.0016427432216905903, "loss": 1.2537, "step": 5600 }, { "epoch": 8.95, "grad_norm": 3.470620632171631, "learning_rate": 0.0016421052631578948, "loss": 1.2605, "step": 5610 }, { "epoch": 8.96, "grad_norm": 5.008780002593994, "learning_rate": 0.0016414673046251994, "loss": 1.3909, "step": 5620 }, { "epoch": 8.98, "grad_norm": 3.3247218132019043, "learning_rate": 0.0016408293460925042, "loss": 1.2504, "step": 5630 }, { "epoch": 9.0, "grad_norm": 3.653365135192871, "learning_rate": 0.0016401913875598087, "loss": 1.3043, "step": 5640 }, { "epoch": 9.01, "grad_norm": 2.061579942703247, "learning_rate": 0.0016395534290271133, "loss": 1.0127, "step": 5650 }, { "epoch": 9.03, "grad_norm": 3.8204243183135986, "learning_rate": 0.0016389154704944179, "loss": 0.8594, "step": 5660 }, { "epoch": 9.04, "grad_norm": 3.1755354404449463, "learning_rate": 0.0016382775119617226, "loss": 0.8146, "step": 5670 }, { "epoch": 9.06, "grad_norm": 6.688543319702148, "learning_rate": 0.0016376395534290272, "loss": 0.8972, "step": 5680 }, { "epoch": 9.07, "grad_norm": 2.6439781188964844, "learning_rate": 0.0016370015948963318, "loss": 0.8627, "step": 5690 }, { "epoch": 9.09, "grad_norm": 2.962597131729126, "learning_rate": 0.0016363636363636365, "loss": 0.8263, "step": 5700 }, { "epoch": 9.11, "grad_norm": 4.008563995361328, "learning_rate": 0.001635725677830941, "loss": 0.849, "step": 5710 }, { "epoch": 9.12, "grad_norm": 3.4718000888824463, "learning_rate": 0.0016350877192982457, "loss": 0.873, "step": 5720 }, { "epoch": 9.14, "grad_norm": 3.4607927799224854, "learning_rate": 0.0016344497607655502, "loss": 0.9231, "step": 5730 }, { "epoch": 9.15, "grad_norm": 3.6140999794006348, "learning_rate": 0.001633811802232855, "loss": 0.8654, "step": 5740 }, { "epoch": 9.17, "grad_norm": 4.108109474182129, "learning_rate": 0.0016331738437001596, "loss": 1.1229, "step": 5750 }, { "epoch": 9.19, "grad_norm": 3.7185311317443848, "learning_rate": 0.0016325358851674641, "loss": 0.8597, "step": 5760 }, { "epoch": 9.2, "grad_norm": 2.866516351699829, "learning_rate": 0.001631897926634769, "loss": 0.9815, "step": 5770 }, { "epoch": 9.22, "grad_norm": 3.452366590499878, "learning_rate": 0.0016312599681020735, "loss": 0.8871, "step": 5780 }, { "epoch": 9.23, "grad_norm": 3.129293203353882, "learning_rate": 0.001630622009569378, "loss": 0.9316, "step": 5790 }, { "epoch": 9.25, "grad_norm": 3.3350937366485596, "learning_rate": 0.0016299840510366826, "loss": 1.0546, "step": 5800 }, { "epoch": 9.27, "grad_norm": 3.9579129219055176, "learning_rate": 0.0016293460925039874, "loss": 0.8929, "step": 5810 }, { "epoch": 9.28, "grad_norm": 2.4661436080932617, "learning_rate": 0.001628708133971292, "loss": 0.8749, "step": 5820 }, { "epoch": 9.3, "grad_norm": 5.519815444946289, "learning_rate": 0.0016280701754385965, "loss": 1.0323, "step": 5830 }, { "epoch": 9.31, "grad_norm": 4.336925983428955, "learning_rate": 0.001627432216905901, "loss": 0.9915, "step": 5840 }, { "epoch": 9.33, "grad_norm": 3.4402873516082764, "learning_rate": 0.0016267942583732058, "loss": 1.0824, "step": 5850 }, { "epoch": 9.35, "grad_norm": 2.801079034805298, "learning_rate": 0.0016261562998405104, "loss": 1.0676, "step": 5860 }, { "epoch": 9.36, "grad_norm": 4.0579729080200195, "learning_rate": 0.001625518341307815, "loss": 1.1761, "step": 5870 }, { "epoch": 9.38, "grad_norm": 3.309401035308838, "learning_rate": 0.0016248803827751197, "loss": 0.9767, "step": 5880 }, { "epoch": 9.39, "grad_norm": 2.3733794689178467, "learning_rate": 0.0016242424242424243, "loss": 0.874, "step": 5890 }, { "epoch": 9.41, "grad_norm": 2.4089362621307373, "learning_rate": 0.0016236044657097289, "loss": 0.9179, "step": 5900 }, { "epoch": 9.43, "grad_norm": 4.605165004730225, "learning_rate": 0.0016229665071770334, "loss": 1.1248, "step": 5910 }, { "epoch": 9.44, "grad_norm": 5.862342357635498, "learning_rate": 0.0016223285486443382, "loss": 1.1098, "step": 5920 }, { "epoch": 9.46, "grad_norm": 4.282538890838623, "learning_rate": 0.0016216905901116428, "loss": 0.9369, "step": 5930 }, { "epoch": 9.47, "grad_norm": 4.155124187469482, "learning_rate": 0.0016210526315789473, "loss": 1.0002, "step": 5940 }, { "epoch": 9.49, "grad_norm": 2.9133784770965576, "learning_rate": 0.0016204146730462521, "loss": 0.9377, "step": 5950 }, { "epoch": 9.51, "grad_norm": 3.7865607738494873, "learning_rate": 0.0016197767145135567, "loss": 1.1844, "step": 5960 }, { "epoch": 9.52, "grad_norm": 4.1235270500183105, "learning_rate": 0.0016191387559808612, "loss": 1.0376, "step": 5970 }, { "epoch": 9.54, "grad_norm": 2.8695414066314697, "learning_rate": 0.0016185007974481658, "loss": 1.1122, "step": 5980 }, { "epoch": 9.55, "grad_norm": 4.424979209899902, "learning_rate": 0.0016178628389154706, "loss": 1.0555, "step": 5990 }, { "epoch": 9.57, "grad_norm": 4.249617576599121, "learning_rate": 0.0016172248803827752, "loss": 1.0784, "step": 6000 }, { "epoch": 9.59, "grad_norm": 3.7470569610595703, "learning_rate": 0.0016165869218500797, "loss": 1.295, "step": 6010 }, { "epoch": 9.6, "grad_norm": 3.8228983879089355, "learning_rate": 0.0016159489633173845, "loss": 1.005, "step": 6020 }, { "epoch": 9.62, "grad_norm": 3.3642499446868896, "learning_rate": 0.001615311004784689, "loss": 1.002, "step": 6030 }, { "epoch": 9.63, "grad_norm": 3.2379348278045654, "learning_rate": 0.0016146730462519936, "loss": 0.9958, "step": 6040 }, { "epoch": 9.65, "grad_norm": 2.8118715286254883, "learning_rate": 0.0016140350877192982, "loss": 1.2711, "step": 6050 }, { "epoch": 9.67, "grad_norm": 4.146730899810791, "learning_rate": 0.001613397129186603, "loss": 1.2352, "step": 6060 }, { "epoch": 9.68, "grad_norm": 3.6763010025024414, "learning_rate": 0.0016127591706539075, "loss": 1.1935, "step": 6070 }, { "epoch": 9.7, "grad_norm": 2.510589838027954, "learning_rate": 0.001612121212121212, "loss": 1.2449, "step": 6080 }, { "epoch": 9.71, "grad_norm": 3.556995153427124, "learning_rate": 0.0016114832535885169, "loss": 1.3257, "step": 6090 }, { "epoch": 9.73, "grad_norm": 3.670929193496704, "learning_rate": 0.0016108452950558214, "loss": 1.0892, "step": 6100 }, { "epoch": 9.74, "grad_norm": 3.0864908695220947, "learning_rate": 0.001610207336523126, "loss": 1.0478, "step": 6110 }, { "epoch": 9.76, "grad_norm": 2.65902042388916, "learning_rate": 0.0016095693779904306, "loss": 1.1211, "step": 6120 }, { "epoch": 9.78, "grad_norm": 2.6973979473114014, "learning_rate": 0.0016089314194577353, "loss": 1.0727, "step": 6130 }, { "epoch": 9.79, "grad_norm": 4.574107646942139, "learning_rate": 0.00160829346092504, "loss": 1.0997, "step": 6140 }, { "epoch": 9.81, "grad_norm": 3.178717613220215, "learning_rate": 0.0016076555023923445, "loss": 1.0726, "step": 6150 }, { "epoch": 9.82, "grad_norm": 3.5710108280181885, "learning_rate": 0.0016070175438596492, "loss": 1.0341, "step": 6160 }, { "epoch": 9.84, "grad_norm": 3.28791880607605, "learning_rate": 0.0016063795853269538, "loss": 1.2142, "step": 6170 }, { "epoch": 9.86, "grad_norm": 2.811490535736084, "learning_rate": 0.0016057416267942584, "loss": 1.2218, "step": 6180 }, { "epoch": 9.87, "grad_norm": 2.8246653079986572, "learning_rate": 0.001605103668261563, "loss": 1.0756, "step": 6190 }, { "epoch": 9.89, "grad_norm": 4.228902339935303, "learning_rate": 0.0016044657097288677, "loss": 1.133, "step": 6200 }, { "epoch": 9.9, "grad_norm": 3.8225128650665283, "learning_rate": 0.0016038277511961723, "loss": 1.2428, "step": 6210 }, { "epoch": 9.92, "grad_norm": 4.282769680023193, "learning_rate": 0.0016031897926634768, "loss": 1.2566, "step": 6220 }, { "epoch": 9.94, "grad_norm": 4.843967437744141, "learning_rate": 0.0016025518341307814, "loss": 1.1299, "step": 6230 }, { "epoch": 9.95, "grad_norm": 3.592618227005005, "learning_rate": 0.0016019138755980862, "loss": 1.2274, "step": 6240 }, { "epoch": 9.97, "grad_norm": 4.132793426513672, "learning_rate": 0.0016012759170653907, "loss": 1.1314, "step": 6250 }, { "epoch": 9.98, "grad_norm": 4.065629005432129, "learning_rate": 0.0016006379585326953, "loss": 1.2704, "step": 6260 }, { "epoch": 10.0, "grad_norm": 4.359400749206543, "learning_rate": 0.0016, "loss": 1.1157, "step": 6270 }, { "epoch": 10.02, "grad_norm": 3.357485771179199, "learning_rate": 0.0015993620414673046, "loss": 0.8463, "step": 6280 }, { "epoch": 10.03, "grad_norm": 3.480729579925537, "learning_rate": 0.0015987240829346092, "loss": 0.7863, "step": 6290 }, { "epoch": 10.05, "grad_norm": 2.2290802001953125, "learning_rate": 0.0015980861244019138, "loss": 0.7799, "step": 6300 }, { "epoch": 10.06, "grad_norm": 2.3109190464019775, "learning_rate": 0.0015974481658692185, "loss": 0.6763, "step": 6310 }, { "epoch": 10.08, "grad_norm": 4.066445350646973, "learning_rate": 0.001596810207336523, "loss": 0.8017, "step": 6320 }, { "epoch": 10.1, "grad_norm": 3.2394723892211914, "learning_rate": 0.0015961722488038277, "loss": 0.9444, "step": 6330 }, { "epoch": 10.11, "grad_norm": 3.250441551208496, "learning_rate": 0.0015955342902711324, "loss": 0.8472, "step": 6340 }, { "epoch": 10.13, "grad_norm": 2.4321706295013428, "learning_rate": 0.001594896331738437, "loss": 0.784, "step": 6350 }, { "epoch": 10.14, "grad_norm": 2.8150861263275146, "learning_rate": 0.0015942583732057416, "loss": 0.8903, "step": 6360 }, { "epoch": 10.16, "grad_norm": 2.4516990184783936, "learning_rate": 0.0015936204146730461, "loss": 0.7983, "step": 6370 }, { "epoch": 10.18, "grad_norm": 4.499327659606934, "learning_rate": 0.001592982456140351, "loss": 0.8822, "step": 6380 }, { "epoch": 10.19, "grad_norm": 3.4871532917022705, "learning_rate": 0.0015923444976076555, "loss": 0.8933, "step": 6390 }, { "epoch": 10.21, "grad_norm": 2.1855833530426025, "learning_rate": 0.00159170653907496, "loss": 0.7781, "step": 6400 }, { "epoch": 10.22, "grad_norm": 2.45394229888916, "learning_rate": 0.0015910685805422648, "loss": 0.7879, "step": 6410 }, { "epoch": 10.24, "grad_norm": 4.465210914611816, "learning_rate": 0.0015904306220095694, "loss": 0.9825, "step": 6420 }, { "epoch": 10.26, "grad_norm": 4.026763916015625, "learning_rate": 0.001589792663476874, "loss": 0.9108, "step": 6430 }, { "epoch": 10.27, "grad_norm": 3.5490238666534424, "learning_rate": 0.0015891547049441785, "loss": 0.8742, "step": 6440 }, { "epoch": 10.29, "grad_norm": 3.857203960418701, "learning_rate": 0.0015885167464114833, "loss": 0.9243, "step": 6450 }, { "epoch": 10.3, "grad_norm": 5.296510696411133, "learning_rate": 0.0015878787878787879, "loss": 1.0742, "step": 6460 }, { "epoch": 10.32, "grad_norm": 3.6743974685668945, "learning_rate": 0.0015872408293460924, "loss": 1.0541, "step": 6470 }, { "epoch": 10.33, "grad_norm": 3.527785301208496, "learning_rate": 0.0015866028708133972, "loss": 0.958, "step": 6480 }, { "epoch": 10.35, "grad_norm": 2.7961020469665527, "learning_rate": 0.0015859649122807018, "loss": 1.0794, "step": 6490 }, { "epoch": 10.37, "grad_norm": 5.286695957183838, "learning_rate": 0.0015853269537480063, "loss": 1.0691, "step": 6500 }, { "epoch": 10.38, "grad_norm": 2.792459011077881, "learning_rate": 0.0015846889952153109, "loss": 1.005, "step": 6510 }, { "epoch": 10.4, "grad_norm": 4.287434101104736, "learning_rate": 0.0015840510366826157, "loss": 1.0826, "step": 6520 }, { "epoch": 10.41, "grad_norm": 3.291612148284912, "learning_rate": 0.0015834130781499202, "loss": 0.9482, "step": 6530 }, { "epoch": 10.43, "grad_norm": 2.1570498943328857, "learning_rate": 0.0015827751196172248, "loss": 0.8198, "step": 6540 }, { "epoch": 10.45, "grad_norm": 5.364358901977539, "learning_rate": 0.0015821371610845293, "loss": 0.922, "step": 6550 }, { "epoch": 10.46, "grad_norm": 2.493326187133789, "learning_rate": 0.0015814992025518341, "loss": 1.0143, "step": 6560 }, { "epoch": 10.48, "grad_norm": 4.336993217468262, "learning_rate": 0.0015808612440191387, "loss": 0.9091, "step": 6570 }, { "epoch": 10.49, "grad_norm": 3.2809929847717285, "learning_rate": 0.0015802232854864433, "loss": 0.947, "step": 6580 }, { "epoch": 10.51, "grad_norm": 3.941453456878662, "learning_rate": 0.001579585326953748, "loss": 1.0424, "step": 6590 }, { "epoch": 10.53, "grad_norm": 2.2481088638305664, "learning_rate": 0.0015789473684210526, "loss": 0.9591, "step": 6600 }, { "epoch": 10.54, "grad_norm": 2.889963388442993, "learning_rate": 0.0015783094098883572, "loss": 1.0191, "step": 6610 }, { "epoch": 10.56, "grad_norm": 3.2319366931915283, "learning_rate": 0.0015776714513556617, "loss": 1.1833, "step": 6620 }, { "epoch": 10.57, "grad_norm": 2.6110410690307617, "learning_rate": 0.0015770334928229665, "loss": 1.082, "step": 6630 }, { "epoch": 10.59, "grad_norm": 2.9207470417022705, "learning_rate": 0.001576395534290271, "loss": 0.9628, "step": 6640 }, { "epoch": 10.61, "grad_norm": 4.300070285797119, "learning_rate": 0.0015757575757575756, "loss": 0.9607, "step": 6650 }, { "epoch": 10.62, "grad_norm": 3.435377597808838, "learning_rate": 0.0015751196172248804, "loss": 0.9709, "step": 6660 }, { "epoch": 10.64, "grad_norm": 3.129941940307617, "learning_rate": 0.001574481658692185, "loss": 1.0623, "step": 6670 }, { "epoch": 10.65, "grad_norm": 3.273089647293091, "learning_rate": 0.0015738437001594895, "loss": 1.0186, "step": 6680 }, { "epoch": 10.67, "grad_norm": 3.147507667541504, "learning_rate": 0.001573205741626794, "loss": 0.9143, "step": 6690 }, { "epoch": 10.69, "grad_norm": 3.1906449794769287, "learning_rate": 0.0015725677830940989, "loss": 1.0698, "step": 6700 }, { "epoch": 10.7, "grad_norm": 2.52282452583313, "learning_rate": 0.0015719298245614034, "loss": 1.2058, "step": 6710 }, { "epoch": 10.72, "grad_norm": 3.526111602783203, "learning_rate": 0.001571291866028708, "loss": 1.0494, "step": 6720 }, { "epoch": 10.73, "grad_norm": 4.391296863555908, "learning_rate": 0.001570653907496013, "loss": 1.0639, "step": 6730 }, { "epoch": 10.75, "grad_norm": 3.623323678970337, "learning_rate": 0.0015700159489633176, "loss": 1.1105, "step": 6740 }, { "epoch": 10.77, "grad_norm": 3.705646514892578, "learning_rate": 0.0015693779904306221, "loss": 0.964, "step": 6750 }, { "epoch": 10.78, "grad_norm": 2.726846694946289, "learning_rate": 0.0015687400318979267, "loss": 1.1207, "step": 6760 }, { "epoch": 10.8, "grad_norm": 2.3796093463897705, "learning_rate": 0.0015681020733652315, "loss": 1.038, "step": 6770 }, { "epoch": 10.81, "grad_norm": 2.321793794631958, "learning_rate": 0.001567464114832536, "loss": 1.1098, "step": 6780 }, { "epoch": 10.83, "grad_norm": 4.951314926147461, "learning_rate": 0.0015668261562998406, "loss": 1.0766, "step": 6790 }, { "epoch": 10.85, "grad_norm": 2.3192481994628906, "learning_rate": 0.0015661881977671454, "loss": 1.1615, "step": 6800 }, { "epoch": 10.86, "grad_norm": 3.576709508895874, "learning_rate": 0.00156555023923445, "loss": 1.0031, "step": 6810 }, { "epoch": 10.88, "grad_norm": 3.2805440425872803, "learning_rate": 0.0015649122807017545, "loss": 0.9738, "step": 6820 }, { "epoch": 10.89, "grad_norm": 3.367990016937256, "learning_rate": 0.001564274322169059, "loss": 0.9948, "step": 6830 }, { "epoch": 10.91, "grad_norm": 4.358039379119873, "learning_rate": 0.0015636363636363638, "loss": 1.0906, "step": 6840 }, { "epoch": 10.93, "grad_norm": 3.0704123973846436, "learning_rate": 0.0015629984051036684, "loss": 1.1433, "step": 6850 }, { "epoch": 10.94, "grad_norm": 3.6105406284332275, "learning_rate": 0.001562360446570973, "loss": 1.0975, "step": 6860 }, { "epoch": 10.96, "grad_norm": 2.0646121501922607, "learning_rate": 0.0015617224880382775, "loss": 1.1646, "step": 6870 }, { "epoch": 10.97, "grad_norm": 3.911951780319214, "learning_rate": 0.0015610845295055823, "loss": 1.0135, "step": 6880 }, { "epoch": 10.99, "grad_norm": 3.6417315006256104, "learning_rate": 0.0015604465709728869, "loss": 1.1479, "step": 6890 }, { "epoch": 11.0, "grad_norm": 2.3251378536224365, "learning_rate": 0.0015598086124401914, "loss": 1.058, "step": 6900 }, { "epoch": 11.02, "grad_norm": 2.8822855949401855, "learning_rate": 0.0015591706539074962, "loss": 0.7655, "step": 6910 }, { "epoch": 11.04, "grad_norm": 3.3327693939208984, "learning_rate": 0.0015585326953748008, "loss": 0.8605, "step": 6920 }, { "epoch": 11.05, "grad_norm": 2.5779995918273926, "learning_rate": 0.0015578947368421053, "loss": 0.8452, "step": 6930 }, { "epoch": 11.07, "grad_norm": 3.0843000411987305, "learning_rate": 0.00155725677830941, "loss": 0.7461, "step": 6940 }, { "epoch": 11.08, "grad_norm": 3.354552984237671, "learning_rate": 0.0015566188197767147, "loss": 0.8311, "step": 6950 }, { "epoch": 11.1, "grad_norm": 2.7717132568359375, "learning_rate": 0.0015559808612440192, "loss": 0.8472, "step": 6960 }, { "epoch": 11.12, "grad_norm": 2.2292239665985107, "learning_rate": 0.0015553429027113238, "loss": 0.759, "step": 6970 }, { "epoch": 11.13, "grad_norm": 4.184642791748047, "learning_rate": 0.0015547049441786286, "loss": 0.7796, "step": 6980 }, { "epoch": 11.15, "grad_norm": 3.987525463104248, "learning_rate": 0.0015540669856459331, "loss": 0.7885, "step": 6990 }, { "epoch": 11.16, "grad_norm": 2.9014410972595215, "learning_rate": 0.0015534290271132377, "loss": 0.8425, "step": 7000 }, { "epoch": 11.18, "grad_norm": 2.4290761947631836, "learning_rate": 0.0015527910685805423, "loss": 0.7046, "step": 7010 }, { "epoch": 11.2, "grad_norm": 4.5738701820373535, "learning_rate": 0.001552153110047847, "loss": 0.794, "step": 7020 }, { "epoch": 11.21, "grad_norm": 3.999741792678833, "learning_rate": 0.0015515151515151516, "loss": 0.8193, "step": 7030 }, { "epoch": 11.23, "grad_norm": 3.0708727836608887, "learning_rate": 0.0015508771929824562, "loss": 0.8712, "step": 7040 }, { "epoch": 11.24, "grad_norm": 3.396559715270996, "learning_rate": 0.001550239234449761, "loss": 0.8411, "step": 7050 }, { "epoch": 11.26, "grad_norm": 3.517340898513794, "learning_rate": 0.0015496012759170655, "loss": 0.7907, "step": 7060 }, { "epoch": 11.28, "grad_norm": 2.170309066772461, "learning_rate": 0.00154896331738437, "loss": 0.7761, "step": 7070 }, { "epoch": 11.29, "grad_norm": 4.765143871307373, "learning_rate": 0.0015483253588516746, "loss": 1.0072, "step": 7080 }, { "epoch": 11.31, "grad_norm": 2.595566749572754, "learning_rate": 0.0015476874003189794, "loss": 0.9483, "step": 7090 }, { "epoch": 11.32, "grad_norm": 3.8784263134002686, "learning_rate": 0.001547049441786284, "loss": 0.8049, "step": 7100 }, { "epoch": 11.34, "grad_norm": 3.033404588699341, "learning_rate": 0.0015464114832535885, "loss": 0.8574, "step": 7110 }, { "epoch": 11.36, "grad_norm": 3.059054136276245, "learning_rate": 0.0015457735247208933, "loss": 0.8477, "step": 7120 }, { "epoch": 11.37, "grad_norm": 4.744221210479736, "learning_rate": 0.0015451355661881979, "loss": 1.0191, "step": 7130 }, { "epoch": 11.39, "grad_norm": 2.8809046745300293, "learning_rate": 0.0015444976076555024, "loss": 0.8683, "step": 7140 }, { "epoch": 11.4, "grad_norm": 2.913546323776245, "learning_rate": 0.001543859649122807, "loss": 0.9145, "step": 7150 }, { "epoch": 11.42, "grad_norm": 3.83941650390625, "learning_rate": 0.0015432216905901118, "loss": 0.8532, "step": 7160 }, { "epoch": 11.44, "grad_norm": 3.471904754638672, "learning_rate": 0.0015425837320574164, "loss": 0.8605, "step": 7170 }, { "epoch": 11.45, "grad_norm": 3.6713290214538574, "learning_rate": 0.001541945773524721, "loss": 1.013, "step": 7180 }, { "epoch": 11.47, "grad_norm": 3.537461996078491, "learning_rate": 0.0015413078149920257, "loss": 0.9493, "step": 7190 }, { "epoch": 11.48, "grad_norm": 3.101954460144043, "learning_rate": 0.0015406698564593303, "loss": 0.892, "step": 7200 }, { "epoch": 11.5, "grad_norm": 4.835020542144775, "learning_rate": 0.0015400318979266348, "loss": 0.9719, "step": 7210 }, { "epoch": 11.52, "grad_norm": 3.35196852684021, "learning_rate": 0.0015393939393939394, "loss": 0.84, "step": 7220 }, { "epoch": 11.53, "grad_norm": 3.0783281326293945, "learning_rate": 0.0015387559808612442, "loss": 1.0172, "step": 7230 }, { "epoch": 11.55, "grad_norm": 3.5924274921417236, "learning_rate": 0.0015381180223285487, "loss": 0.9569, "step": 7240 }, { "epoch": 11.56, "grad_norm": 4.351842403411865, "learning_rate": 0.0015374800637958533, "loss": 0.9124, "step": 7250 }, { "epoch": 11.58, "grad_norm": 5.1138200759887695, "learning_rate": 0.0015368421052631579, "loss": 1.0265, "step": 7260 }, { "epoch": 11.59, "grad_norm": 4.592616558074951, "learning_rate": 0.0015362041467304626, "loss": 0.8753, "step": 7270 }, { "epoch": 11.61, "grad_norm": 2.198404550552368, "learning_rate": 0.0015355661881977672, "loss": 0.9964, "step": 7280 }, { "epoch": 11.63, "grad_norm": 3.718247175216675, "learning_rate": 0.0015349282296650718, "loss": 0.9982, "step": 7290 }, { "epoch": 11.64, "grad_norm": 2.973299980163574, "learning_rate": 0.0015342902711323765, "loss": 0.9022, "step": 7300 }, { "epoch": 11.66, "grad_norm": 3.1553690433502197, "learning_rate": 0.001533652312599681, "loss": 0.8882, "step": 7310 }, { "epoch": 11.67, "grad_norm": 5.204711437225342, "learning_rate": 0.0015330143540669857, "loss": 0.9439, "step": 7320 }, { "epoch": 11.69, "grad_norm": 2.575793981552124, "learning_rate": 0.0015323763955342902, "loss": 0.9633, "step": 7330 }, { "epoch": 11.71, "grad_norm": 3.682734251022339, "learning_rate": 0.001531738437001595, "loss": 0.8831, "step": 7340 }, { "epoch": 11.72, "grad_norm": 4.238563060760498, "learning_rate": 0.0015311004784688996, "loss": 1.2109, "step": 7350 }, { "epoch": 11.74, "grad_norm": 4.091822147369385, "learning_rate": 0.0015304625199362041, "loss": 0.9663, "step": 7360 }, { "epoch": 11.75, "grad_norm": 4.6950154304504395, "learning_rate": 0.001529824561403509, "loss": 0.9541, "step": 7370 }, { "epoch": 11.77, "grad_norm": 2.6994404792785645, "learning_rate": 0.0015291866028708135, "loss": 0.938, "step": 7380 }, { "epoch": 11.79, "grad_norm": 3.632509708404541, "learning_rate": 0.001528548644338118, "loss": 1.0444, "step": 7390 }, { "epoch": 11.8, "grad_norm": 3.1459712982177734, "learning_rate": 0.0015279106858054226, "loss": 1.0143, "step": 7400 }, { "epoch": 11.82, "grad_norm": 3.5480315685272217, "learning_rate": 0.0015272727272727274, "loss": 1.001, "step": 7410 }, { "epoch": 11.83, "grad_norm": 2.908008575439453, "learning_rate": 0.001526634768740032, "loss": 1.0674, "step": 7420 }, { "epoch": 11.85, "grad_norm": 3.147965431213379, "learning_rate": 0.0015259968102073365, "loss": 1.0779, "step": 7430 }, { "epoch": 11.87, "grad_norm": 3.2961347103118896, "learning_rate": 0.0015253588516746413, "loss": 0.9077, "step": 7440 }, { "epoch": 11.88, "grad_norm": 3.34252667427063, "learning_rate": 0.0015247208931419458, "loss": 1.1163, "step": 7450 }, { "epoch": 11.9, "grad_norm": 3.7476675510406494, "learning_rate": 0.0015240829346092504, "loss": 1.0136, "step": 7460 }, { "epoch": 11.91, "grad_norm": 3.686720609664917, "learning_rate": 0.001523444976076555, "loss": 1.0083, "step": 7470 }, { "epoch": 11.93, "grad_norm": 3.023853302001953, "learning_rate": 0.0015228070175438597, "loss": 1.1149, "step": 7480 }, { "epoch": 11.95, "grad_norm": 2.11389422416687, "learning_rate": 0.0015221690590111643, "loss": 0.9332, "step": 7490 }, { "epoch": 11.96, "grad_norm": 2.868576765060425, "learning_rate": 0.0015215311004784689, "loss": 1.0661, "step": 7500 }, { "epoch": 11.98, "grad_norm": 2.1617486476898193, "learning_rate": 0.0015208931419457737, "loss": 1.0098, "step": 7510 }, { "epoch": 11.99, "grad_norm": 3.540294647216797, "learning_rate": 0.0015202551834130782, "loss": 1.032, "step": 7520 }, { "epoch": 12.01, "grad_norm": 3.1346607208251953, "learning_rate": 0.0015196172248803828, "loss": 0.9046, "step": 7530 }, { "epoch": 12.03, "grad_norm": 2.131230115890503, "learning_rate": 0.0015189792663476873, "loss": 0.6433, "step": 7540 }, { "epoch": 12.04, "grad_norm": 1.7812432050704956, "learning_rate": 0.0015183413078149921, "loss": 0.6029, "step": 7550 }, { "epoch": 12.06, "grad_norm": 3.244680643081665, "learning_rate": 0.0015177033492822967, "loss": 0.7685, "step": 7560 }, { "epoch": 12.07, "grad_norm": 2.641512393951416, "learning_rate": 0.0015170653907496012, "loss": 0.5811, "step": 7570 }, { "epoch": 12.09, "grad_norm": 2.1574976444244385, "learning_rate": 0.0015164274322169058, "loss": 0.7704, "step": 7580 }, { "epoch": 12.11, "grad_norm": 2.7403526306152344, "learning_rate": 0.0015157894736842106, "loss": 0.822, "step": 7590 }, { "epoch": 12.12, "grad_norm": 4.00333309173584, "learning_rate": 0.0015151515151515152, "loss": 0.6791, "step": 7600 }, { "epoch": 12.14, "grad_norm": 3.1871447563171387, "learning_rate": 0.0015145135566188197, "loss": 0.6589, "step": 7610 }, { "epoch": 12.15, "grad_norm": 2.847644567489624, "learning_rate": 0.0015138755980861245, "loss": 0.7128, "step": 7620 }, { "epoch": 12.17, "grad_norm": 2.5338680744171143, "learning_rate": 0.001513237639553429, "loss": 0.9216, "step": 7630 }, { "epoch": 12.19, "grad_norm": 2.299643039703369, "learning_rate": 0.0015125996810207336, "loss": 0.8705, "step": 7640 }, { "epoch": 12.2, "grad_norm": 2.6167166233062744, "learning_rate": 0.0015119617224880382, "loss": 0.7226, "step": 7650 }, { "epoch": 12.22, "grad_norm": 1.9708589315414429, "learning_rate": 0.001511323763955343, "loss": 0.7894, "step": 7660 }, { "epoch": 12.23, "grad_norm": 2.8870623111724854, "learning_rate": 0.0015106858054226475, "loss": 0.7716, "step": 7670 }, { "epoch": 12.25, "grad_norm": 2.571887493133545, "learning_rate": 0.001510047846889952, "loss": 0.6934, "step": 7680 }, { "epoch": 12.26, "grad_norm": 3.059251070022583, "learning_rate": 0.0015094098883572569, "loss": 0.7272, "step": 7690 }, { "epoch": 12.28, "grad_norm": 2.94647216796875, "learning_rate": 0.0015087719298245614, "loss": 0.7966, "step": 7700 }, { "epoch": 12.3, "grad_norm": 2.6510915756225586, "learning_rate": 0.001508133971291866, "loss": 0.735, "step": 7710 }, { "epoch": 12.31, "grad_norm": 2.9655959606170654, "learning_rate": 0.0015074960127591706, "loss": 0.8989, "step": 7720 }, { "epoch": 12.33, "grad_norm": 2.72773814201355, "learning_rate": 0.0015068580542264753, "loss": 0.8239, "step": 7730 }, { "epoch": 12.34, "grad_norm": 2.8079593181610107, "learning_rate": 0.00150622009569378, "loss": 0.7945, "step": 7740 }, { "epoch": 12.36, "grad_norm": 2.3012099266052246, "learning_rate": 0.0015055821371610845, "loss": 0.8224, "step": 7750 }, { "epoch": 12.38, "grad_norm": 3.559399127960205, "learning_rate": 0.0015049441786283892, "loss": 0.7912, "step": 7760 }, { "epoch": 12.39, "grad_norm": 2.993138551712036, "learning_rate": 0.0015043062200956938, "loss": 0.8127, "step": 7770 }, { "epoch": 12.41, "grad_norm": 3.5749433040618896, "learning_rate": 0.0015036682615629984, "loss": 0.8071, "step": 7780 }, { "epoch": 12.42, "grad_norm": 2.879560947418213, "learning_rate": 0.001503030303030303, "loss": 0.8794, "step": 7790 }, { "epoch": 12.44, "grad_norm": 3.648130416870117, "learning_rate": 0.0015023923444976077, "loss": 0.8795, "step": 7800 }, { "epoch": 12.46, "grad_norm": 5.283175468444824, "learning_rate": 0.0015017543859649123, "loss": 1.0504, "step": 7810 }, { "epoch": 12.47, "grad_norm": 3.602062940597534, "learning_rate": 0.0015011164274322168, "loss": 0.8283, "step": 7820 }, { "epoch": 12.49, "grad_norm": 2.755488872528076, "learning_rate": 0.0015004784688995216, "loss": 0.8203, "step": 7830 }, { "epoch": 12.5, "grad_norm": 3.24674654006958, "learning_rate": 0.0014998405103668262, "loss": 0.7843, "step": 7840 }, { "epoch": 12.52, "grad_norm": 2.072895050048828, "learning_rate": 0.0014992025518341307, "loss": 0.7738, "step": 7850 }, { "epoch": 12.54, "grad_norm": 4.0108208656311035, "learning_rate": 0.0014985645933014353, "loss": 0.8728, "step": 7860 }, { "epoch": 12.55, "grad_norm": 2.896224021911621, "learning_rate": 0.00149792663476874, "loss": 0.823, "step": 7870 }, { "epoch": 12.57, "grad_norm": 3.3562960624694824, "learning_rate": 0.0014972886762360446, "loss": 1.0389, "step": 7880 }, { "epoch": 12.58, "grad_norm": 3.14931058883667, "learning_rate": 0.0014966507177033492, "loss": 0.9448, "step": 7890 }, { "epoch": 12.6, "grad_norm": 6.942476272583008, "learning_rate": 0.001496012759170654, "loss": 0.8526, "step": 7900 }, { "epoch": 12.62, "grad_norm": 2.516266107559204, "learning_rate": 0.0014953748006379585, "loss": 0.8342, "step": 7910 }, { "epoch": 12.63, "grad_norm": 2.6325111389160156, "learning_rate": 0.001494736842105263, "loss": 0.9933, "step": 7920 }, { "epoch": 12.65, "grad_norm": 3.630423069000244, "learning_rate": 0.0014940988835725677, "loss": 0.8403, "step": 7930 }, { "epoch": 12.66, "grad_norm": 3.6334409713745117, "learning_rate": 0.0014934609250398724, "loss": 1.0628, "step": 7940 }, { "epoch": 12.68, "grad_norm": 3.110170841217041, "learning_rate": 0.001492822966507177, "loss": 0.8604, "step": 7950 }, { "epoch": 12.7, "grad_norm": 3.0557703971862793, "learning_rate": 0.0014921850079744816, "loss": 0.9121, "step": 7960 }, { "epoch": 12.71, "grad_norm": 3.6271071434020996, "learning_rate": 0.0014915470494417861, "loss": 0.9177, "step": 7970 }, { "epoch": 12.73, "grad_norm": 3.5513288974761963, "learning_rate": 0.001490909090909091, "loss": 0.8542, "step": 7980 }, { "epoch": 12.74, "grad_norm": 4.270805358886719, "learning_rate": 0.0014902711323763955, "loss": 0.9907, "step": 7990 }, { "epoch": 12.76, "grad_norm": 2.8084616661071777, "learning_rate": 0.0014896331738437, "loss": 0.9405, "step": 8000 }, { "epoch": 12.78, "grad_norm": 5.405944347381592, "learning_rate": 0.0014889952153110048, "loss": 0.8483, "step": 8010 }, { "epoch": 12.79, "grad_norm": 3.2791013717651367, "learning_rate": 0.0014883572567783094, "loss": 0.9408, "step": 8020 }, { "epoch": 12.81, "grad_norm": 3.3789143562316895, "learning_rate": 0.001487719298245614, "loss": 1.0307, "step": 8030 }, { "epoch": 12.82, "grad_norm": 3.513697624206543, "learning_rate": 0.0014870813397129185, "loss": 0.8657, "step": 8040 }, { "epoch": 12.84, "grad_norm": 3.4501123428344727, "learning_rate": 0.0014864433811802233, "loss": 0.9202, "step": 8050 }, { "epoch": 12.85, "grad_norm": 3.0335283279418945, "learning_rate": 0.0014858054226475279, "loss": 0.941, "step": 8060 }, { "epoch": 12.87, "grad_norm": 3.0770187377929688, "learning_rate": 0.0014851674641148324, "loss": 0.9562, "step": 8070 }, { "epoch": 12.89, "grad_norm": 2.967750310897827, "learning_rate": 0.0014845295055821372, "loss": 0.9318, "step": 8080 }, { "epoch": 12.9, "grad_norm": 4.517429828643799, "learning_rate": 0.0014838915470494418, "loss": 0.9225, "step": 8090 }, { "epoch": 12.92, "grad_norm": 4.639514923095703, "learning_rate": 0.0014832535885167463, "loss": 0.8997, "step": 8100 }, { "epoch": 12.93, "grad_norm": 4.017191410064697, "learning_rate": 0.0014826156299840509, "loss": 1.0325, "step": 8110 }, { "epoch": 12.95, "grad_norm": 4.688587188720703, "learning_rate": 0.0014819776714513557, "loss": 0.8542, "step": 8120 }, { "epoch": 12.97, "grad_norm": 5.4787821769714355, "learning_rate": 0.0014813397129186602, "loss": 1.2567, "step": 8130 }, { "epoch": 12.98, "grad_norm": 3.8270418643951416, "learning_rate": 0.0014807017543859648, "loss": 0.9071, "step": 8140 }, { "epoch": 13.0, "grad_norm": 5.171020984649658, "learning_rate": 0.0014800637958532696, "loss": 0.9396, "step": 8150 }, { "epoch": 13.01, "grad_norm": 2.2651660442352295, "learning_rate": 0.0014794258373205741, "loss": 0.6466, "step": 8160 }, { "epoch": 13.03, "grad_norm": 1.7244137525558472, "learning_rate": 0.0014787878787878787, "loss": 0.5765, "step": 8170 }, { "epoch": 13.05, "grad_norm": 2.143556833267212, "learning_rate": 0.0014781499202551833, "loss": 0.6964, "step": 8180 }, { "epoch": 13.06, "grad_norm": 3.048412561416626, "learning_rate": 0.001477511961722488, "loss": 0.5957, "step": 8190 }, { "epoch": 13.08, "grad_norm": 3.002617120742798, "learning_rate": 0.0014768740031897926, "loss": 0.6172, "step": 8200 }, { "epoch": 13.09, "grad_norm": 2.4327642917633057, "learning_rate": 0.0014762360446570972, "loss": 0.6952, "step": 8210 }, { "epoch": 13.11, "grad_norm": 3.3259124755859375, "learning_rate": 0.0014755980861244022, "loss": 0.7637, "step": 8220 }, { "epoch": 13.13, "grad_norm": 2.1302742958068848, "learning_rate": 0.0014749601275917067, "loss": 0.8759, "step": 8230 }, { "epoch": 13.14, "grad_norm": 2.8593993186950684, "learning_rate": 0.0014743221690590113, "loss": 0.6421, "step": 8240 }, { "epoch": 13.16, "grad_norm": 3.1945838928222656, "learning_rate": 0.0014736842105263158, "loss": 0.8016, "step": 8250 }, { "epoch": 13.17, "grad_norm": 2.6106722354888916, "learning_rate": 0.0014730462519936206, "loss": 0.9062, "step": 8260 }, { "epoch": 13.19, "grad_norm": 2.938920021057129, "learning_rate": 0.0014724082934609252, "loss": 0.6848, "step": 8270 }, { "epoch": 13.21, "grad_norm": 2.4809677600860596, "learning_rate": 0.0014717703349282297, "loss": 0.7019, "step": 8280 }, { "epoch": 13.22, "grad_norm": 3.0914158821105957, "learning_rate": 0.0014711323763955343, "loss": 0.8232, "step": 8290 }, { "epoch": 13.24, "grad_norm": 3.0564115047454834, "learning_rate": 0.001470494417862839, "loss": 0.7435, "step": 8300 }, { "epoch": 13.25, "grad_norm": 3.3561959266662598, "learning_rate": 0.0014698564593301437, "loss": 0.7295, "step": 8310 }, { "epoch": 13.27, "grad_norm": 1.9883933067321777, "learning_rate": 0.0014692185007974482, "loss": 0.7224, "step": 8320 }, { "epoch": 13.29, "grad_norm": 2.7677059173583984, "learning_rate": 0.001468580542264753, "loss": 0.7052, "step": 8330 }, { "epoch": 13.3, "grad_norm": 2.8097822666168213, "learning_rate": 0.0014679425837320576, "loss": 0.8795, "step": 8340 }, { "epoch": 13.32, "grad_norm": 2.9403786659240723, "learning_rate": 0.0014673046251993621, "loss": 0.8753, "step": 8350 }, { "epoch": 13.33, "grad_norm": 2.103468179702759, "learning_rate": 0.0014666666666666667, "loss": 0.7013, "step": 8360 }, { "epoch": 13.35, "grad_norm": 4.1119489669799805, "learning_rate": 0.0014660287081339715, "loss": 0.7666, "step": 8370 }, { "epoch": 13.37, "grad_norm": 2.627279758453369, "learning_rate": 0.001465390749601276, "loss": 0.912, "step": 8380 }, { "epoch": 13.38, "grad_norm": 3.824855327606201, "learning_rate": 0.0014647527910685806, "loss": 0.8233, "step": 8390 }, { "epoch": 13.4, "grad_norm": 2.9254772663116455, "learning_rate": 0.0014641148325358854, "loss": 0.7541, "step": 8400 }, { "epoch": 13.41, "grad_norm": 3.6978065967559814, "learning_rate": 0.00146347687400319, "loss": 0.7604, "step": 8410 }, { "epoch": 13.43, "grad_norm": 2.875696897506714, "learning_rate": 0.0014628389154704945, "loss": 0.7459, "step": 8420 }, { "epoch": 13.44, "grad_norm": 3.1799988746643066, "learning_rate": 0.001462200956937799, "loss": 0.7081, "step": 8430 }, { "epoch": 13.46, "grad_norm": 1.9684711694717407, "learning_rate": 0.0014615629984051038, "loss": 0.7558, "step": 8440 }, { "epoch": 13.48, "grad_norm": 2.5012054443359375, "learning_rate": 0.0014609250398724084, "loss": 0.8078, "step": 8450 }, { "epoch": 13.49, "grad_norm": 2.650980234146118, "learning_rate": 0.001460287081339713, "loss": 0.7813, "step": 8460 }, { "epoch": 13.51, "grad_norm": 2.5888872146606445, "learning_rate": 0.0014596491228070177, "loss": 0.8004, "step": 8470 }, { "epoch": 13.52, "grad_norm": 4.472870349884033, "learning_rate": 0.0014590111642743223, "loss": 0.868, "step": 8480 }, { "epoch": 13.54, "grad_norm": 2.7261528968811035, "learning_rate": 0.0014583732057416269, "loss": 0.7751, "step": 8490 }, { "epoch": 13.56, "grad_norm": 5.15382194519043, "learning_rate": 0.0014577352472089314, "loss": 0.8288, "step": 8500 }, { "epoch": 13.57, "grad_norm": 3.0572078227996826, "learning_rate": 0.0014570972886762362, "loss": 0.9282, "step": 8510 }, { "epoch": 13.59, "grad_norm": 2.779832363128662, "learning_rate": 0.0014564593301435408, "loss": 0.8, "step": 8520 }, { "epoch": 13.6, "grad_norm": 3.26220965385437, "learning_rate": 0.0014558213716108453, "loss": 0.7932, "step": 8530 }, { "epoch": 13.62, "grad_norm": 5.765030384063721, "learning_rate": 0.0014551834130781501, "loss": 0.7697, "step": 8540 }, { "epoch": 13.64, "grad_norm": 3.393489122390747, "learning_rate": 0.0014545454545454547, "loss": 0.7436, "step": 8550 }, { "epoch": 13.65, "grad_norm": 3.4582221508026123, "learning_rate": 0.0014539074960127592, "loss": 0.8099, "step": 8560 }, { "epoch": 13.67, "grad_norm": 2.931617498397827, "learning_rate": 0.0014532695374800638, "loss": 0.8833, "step": 8570 }, { "epoch": 13.68, "grad_norm": 3.149649143218994, "learning_rate": 0.0014526315789473686, "loss": 0.826, "step": 8580 }, { "epoch": 13.7, "grad_norm": 2.6606719493865967, "learning_rate": 0.0014519936204146731, "loss": 0.9458, "step": 8590 }, { "epoch": 13.72, "grad_norm": 3.6608333587646484, "learning_rate": 0.0014513556618819777, "loss": 0.8182, "step": 8600 }, { "epoch": 13.73, "grad_norm": 4.276224136352539, "learning_rate": 0.0014507177033492825, "loss": 0.8656, "step": 8610 }, { "epoch": 13.75, "grad_norm": 3.306110382080078, "learning_rate": 0.001450079744816587, "loss": 0.8356, "step": 8620 }, { "epoch": 13.76, "grad_norm": 3.0018744468688965, "learning_rate": 0.0014494417862838916, "loss": 0.8602, "step": 8630 }, { "epoch": 13.78, "grad_norm": 3.3632960319519043, "learning_rate": 0.0014488038277511962, "loss": 1.0159, "step": 8640 }, { "epoch": 13.8, "grad_norm": 2.006432056427002, "learning_rate": 0.001448165869218501, "loss": 0.8172, "step": 8650 }, { "epoch": 13.81, "grad_norm": 3.5842230319976807, "learning_rate": 0.0014475279106858055, "loss": 0.8222, "step": 8660 }, { "epoch": 13.83, "grad_norm": 3.855170488357544, "learning_rate": 0.00144688995215311, "loss": 0.8993, "step": 8670 }, { "epoch": 13.84, "grad_norm": 3.3235816955566406, "learning_rate": 0.0014462519936204146, "loss": 0.8374, "step": 8680 }, { "epoch": 13.86, "grad_norm": 3.43414568901062, "learning_rate": 0.0014456140350877194, "loss": 0.9525, "step": 8690 }, { "epoch": 13.88, "grad_norm": 3.4128949642181396, "learning_rate": 0.001444976076555024, "loss": 0.8527, "step": 8700 }, { "epoch": 13.89, "grad_norm": 5.165436744689941, "learning_rate": 0.0014443381180223285, "loss": 0.8698, "step": 8710 }, { "epoch": 13.91, "grad_norm": 3.940591812133789, "learning_rate": 0.0014437001594896333, "loss": 0.9243, "step": 8720 }, { "epoch": 13.92, "grad_norm": 3.3081157207489014, "learning_rate": 0.0014430622009569379, "loss": 0.9268, "step": 8730 }, { "epoch": 13.94, "grad_norm": 3.6998980045318604, "learning_rate": 0.0014424242424242424, "loss": 0.9113, "step": 8740 }, { "epoch": 13.96, "grad_norm": 3.386359214782715, "learning_rate": 0.001441786283891547, "loss": 0.9067, "step": 8750 }, { "epoch": 13.97, "grad_norm": 2.559299945831299, "learning_rate": 0.0014411483253588518, "loss": 0.9951, "step": 8760 }, { "epoch": 13.99, "grad_norm": 2.8027663230895996, "learning_rate": 0.0014405103668261564, "loss": 1.0755, "step": 8770 }, { "epoch": 14.0, "grad_norm": 2.2676618099212646, "learning_rate": 0.001439872408293461, "loss": 0.7299, "step": 8780 }, { "epoch": 14.02, "grad_norm": 3.9087278842926025, "learning_rate": 0.0014392344497607657, "loss": 0.8103, "step": 8790 }, { "epoch": 14.04, "grad_norm": 3.4694509506225586, "learning_rate": 0.0014385964912280703, "loss": 0.6432, "step": 8800 }, { "epoch": 14.05, "grad_norm": 2.3048603534698486, "learning_rate": 0.0014379585326953748, "loss": 0.5185, "step": 8810 }, { "epoch": 14.07, "grad_norm": 3.251046895980835, "learning_rate": 0.0014373205741626794, "loss": 0.6668, "step": 8820 }, { "epoch": 14.08, "grad_norm": 1.8965840339660645, "learning_rate": 0.0014366826156299842, "loss": 0.5914, "step": 8830 }, { "epoch": 14.1, "grad_norm": 4.089531421661377, "learning_rate": 0.0014360446570972887, "loss": 0.6342, "step": 8840 }, { "epoch": 14.11, "grad_norm": 2.4392364025115967, "learning_rate": 0.0014354066985645933, "loss": 0.6801, "step": 8850 }, { "epoch": 14.13, "grad_norm": 2.7692840099334717, "learning_rate": 0.001434768740031898, "loss": 0.7158, "step": 8860 }, { "epoch": 14.15, "grad_norm": 2.2414801120758057, "learning_rate": 0.0014341307814992026, "loss": 0.6251, "step": 8870 }, { "epoch": 14.16, "grad_norm": 2.941929578781128, "learning_rate": 0.0014334928229665072, "loss": 0.7273, "step": 8880 }, { "epoch": 14.18, "grad_norm": 2.245312452316284, "learning_rate": 0.0014328548644338118, "loss": 0.6797, "step": 8890 }, { "epoch": 14.19, "grad_norm": 2.1441662311553955, "learning_rate": 0.0014322169059011165, "loss": 0.6244, "step": 8900 }, { "epoch": 14.21, "grad_norm": 3.0492477416992188, "learning_rate": 0.001431578947368421, "loss": 0.6268, "step": 8910 }, { "epoch": 14.23, "grad_norm": 2.6444950103759766, "learning_rate": 0.0014309409888357257, "loss": 0.6373, "step": 8920 }, { "epoch": 14.24, "grad_norm": 2.9322099685668945, "learning_rate": 0.0014303030303030304, "loss": 0.7353, "step": 8930 }, { "epoch": 14.26, "grad_norm": 2.753868341445923, "learning_rate": 0.001429665071770335, "loss": 0.6592, "step": 8940 }, { "epoch": 14.27, "grad_norm": 3.1307361125946045, "learning_rate": 0.0014290271132376396, "loss": 0.6678, "step": 8950 }, { "epoch": 14.29, "grad_norm": 2.1127524375915527, "learning_rate": 0.0014283891547049441, "loss": 0.6473, "step": 8960 }, { "epoch": 14.31, "grad_norm": 2.359909772872925, "learning_rate": 0.001427751196172249, "loss": 0.8442, "step": 8970 }, { "epoch": 14.32, "grad_norm": 3.395587205886841, "learning_rate": 0.0014271132376395535, "loss": 0.6634, "step": 8980 }, { "epoch": 14.34, "grad_norm": 3.500505208969116, "learning_rate": 0.001426475279106858, "loss": 0.6735, "step": 8990 }, { "epoch": 14.35, "grad_norm": 1.948743224143982, "learning_rate": 0.0014258373205741626, "loss": 0.8773, "step": 9000 }, { "epoch": 14.37, "grad_norm": 4.593191146850586, "learning_rate": 0.0014251993620414674, "loss": 0.7344, "step": 9010 }, { "epoch": 14.39, "grad_norm": 2.9138360023498535, "learning_rate": 0.001424561403508772, "loss": 0.7962, "step": 9020 }, { "epoch": 14.4, "grad_norm": 2.7665469646453857, "learning_rate": 0.0014239234449760765, "loss": 0.7066, "step": 9030 }, { "epoch": 14.42, "grad_norm": 2.5287930965423584, "learning_rate": 0.0014232854864433813, "loss": 0.799, "step": 9040 }, { "epoch": 14.43, "grad_norm": 1.9143520593643188, "learning_rate": 0.0014226475279106858, "loss": 0.877, "step": 9050 }, { "epoch": 14.45, "grad_norm": 3.114867925643921, "learning_rate": 0.0014220095693779904, "loss": 0.7229, "step": 9060 }, { "epoch": 14.47, "grad_norm": 4.132133960723877, "learning_rate": 0.001421371610845295, "loss": 0.7723, "step": 9070 }, { "epoch": 14.48, "grad_norm": 2.8847928047180176, "learning_rate": 0.0014207336523125997, "loss": 0.8349, "step": 9080 }, { "epoch": 14.5, "grad_norm": 4.3192009925842285, "learning_rate": 0.0014200956937799043, "loss": 0.7454, "step": 9090 }, { "epoch": 14.51, "grad_norm": 2.5490753650665283, "learning_rate": 0.0014194577352472089, "loss": 0.8047, "step": 9100 }, { "epoch": 14.53, "grad_norm": 3.995173215866089, "learning_rate": 0.0014188197767145137, "loss": 0.7209, "step": 9110 }, { "epoch": 14.55, "grad_norm": 3.334613084793091, "learning_rate": 0.0014181818181818182, "loss": 0.9342, "step": 9120 }, { "epoch": 14.56, "grad_norm": 2.7369375228881836, "learning_rate": 0.0014175438596491228, "loss": 0.7473, "step": 9130 }, { "epoch": 14.58, "grad_norm": 4.180137634277344, "learning_rate": 0.0014169059011164273, "loss": 0.8705, "step": 9140 }, { "epoch": 14.59, "grad_norm": 3.7026357650756836, "learning_rate": 0.0014162679425837321, "loss": 0.7836, "step": 9150 }, { "epoch": 14.61, "grad_norm": 1.8971599340438843, "learning_rate": 0.0014156299840510367, "loss": 0.7062, "step": 9160 }, { "epoch": 14.63, "grad_norm": 2.8083083629608154, "learning_rate": 0.0014149920255183412, "loss": 0.7808, "step": 9170 }, { "epoch": 14.64, "grad_norm": 2.1013123989105225, "learning_rate": 0.001414354066985646, "loss": 0.919, "step": 9180 }, { "epoch": 14.66, "grad_norm": 2.5876877307891846, "learning_rate": 0.0014137161084529506, "loss": 0.7824, "step": 9190 }, { "epoch": 14.67, "grad_norm": 2.3595352172851562, "learning_rate": 0.0014130781499202552, "loss": 0.8545, "step": 9200 }, { "epoch": 14.69, "grad_norm": 6.161678314208984, "learning_rate": 0.0014124401913875597, "loss": 0.764, "step": 9210 }, { "epoch": 14.7, "grad_norm": 2.7124509811401367, "learning_rate": 0.0014118022328548645, "loss": 0.7967, "step": 9220 }, { "epoch": 14.72, "grad_norm": 3.200411081314087, "learning_rate": 0.001411164274322169, "loss": 0.8589, "step": 9230 }, { "epoch": 14.74, "grad_norm": 1.9819875955581665, "learning_rate": 0.0014105263157894736, "loss": 0.809, "step": 9240 }, { "epoch": 14.75, "grad_norm": 3.223145008087158, "learning_rate": 0.0014098883572567784, "loss": 0.739, "step": 9250 }, { "epoch": 14.77, "grad_norm": 3.0328469276428223, "learning_rate": 0.001409250398724083, "loss": 0.8469, "step": 9260 }, { "epoch": 14.78, "grad_norm": 2.144221305847168, "learning_rate": 0.0014086124401913875, "loss": 0.9141, "step": 9270 }, { "epoch": 14.8, "grad_norm": 2.3607845306396484, "learning_rate": 0.001407974481658692, "loss": 0.8812, "step": 9280 }, { "epoch": 14.82, "grad_norm": 2.356010913848877, "learning_rate": 0.0014073365231259969, "loss": 0.7773, "step": 9290 }, { "epoch": 14.83, "grad_norm": 3.326063394546509, "learning_rate": 0.0014066985645933014, "loss": 0.9775, "step": 9300 }, { "epoch": 14.85, "grad_norm": 3.0373737812042236, "learning_rate": 0.001406060606060606, "loss": 0.8173, "step": 9310 }, { "epoch": 14.86, "grad_norm": 3.7840776443481445, "learning_rate": 0.0014054226475279108, "loss": 0.8301, "step": 9320 }, { "epoch": 14.88, "grad_norm": 3.13913893699646, "learning_rate": 0.0014047846889952153, "loss": 0.7767, "step": 9330 }, { "epoch": 14.9, "grad_norm": 4.028443813323975, "learning_rate": 0.00140414673046252, "loss": 0.7516, "step": 9340 }, { "epoch": 14.91, "grad_norm": 3.6890182495117188, "learning_rate": 0.0014035087719298245, "loss": 0.769, "step": 9350 }, { "epoch": 14.93, "grad_norm": 4.084263801574707, "learning_rate": 0.0014028708133971292, "loss": 0.8712, "step": 9360 }, { "epoch": 14.94, "grad_norm": 2.6253440380096436, "learning_rate": 0.0014022328548644338, "loss": 0.9161, "step": 9370 }, { "epoch": 14.96, "grad_norm": 3.6379435062408447, "learning_rate": 0.0014015948963317384, "loss": 0.9507, "step": 9380 }, { "epoch": 14.98, "grad_norm": 3.1507678031921387, "learning_rate": 0.001400956937799043, "loss": 0.9133, "step": 9390 }, { "epoch": 14.99, "grad_norm": 2.170366048812866, "learning_rate": 0.0014003189792663477, "loss": 0.7949, "step": 9400 }, { "epoch": 15.01, "grad_norm": 1.886562705039978, "learning_rate": 0.0013996810207336523, "loss": 0.7133, "step": 9410 }, { "epoch": 15.02, "grad_norm": 2.3615992069244385, "learning_rate": 0.0013990430622009568, "loss": 0.598, "step": 9420 }, { "epoch": 15.04, "grad_norm": 2.0564517974853516, "learning_rate": 0.0013984051036682616, "loss": 0.6673, "step": 9430 }, { "epoch": 15.06, "grad_norm": 2.599745273590088, "learning_rate": 0.0013977671451355662, "loss": 0.5725, "step": 9440 }, { "epoch": 15.07, "grad_norm": 2.5613441467285156, "learning_rate": 0.0013971291866028707, "loss": 0.5816, "step": 9450 }, { "epoch": 15.09, "grad_norm": 2.8341970443725586, "learning_rate": 0.0013964912280701753, "loss": 0.5968, "step": 9460 }, { "epoch": 15.1, "grad_norm": 3.303835391998291, "learning_rate": 0.00139585326953748, "loss": 0.6412, "step": 9470 }, { "epoch": 15.12, "grad_norm": 3.2321808338165283, "learning_rate": 0.0013952153110047846, "loss": 0.6451, "step": 9480 }, { "epoch": 15.14, "grad_norm": 2.747515916824341, "learning_rate": 0.0013945773524720892, "loss": 0.6491, "step": 9490 }, { "epoch": 15.15, "grad_norm": 2.1695239543914795, "learning_rate": 0.001393939393939394, "loss": 0.6463, "step": 9500 }, { "epoch": 15.17, "grad_norm": 2.5514535903930664, "learning_rate": 0.0013933014354066985, "loss": 0.696, "step": 9510 }, { "epoch": 15.18, "grad_norm": 2.224310874938965, "learning_rate": 0.001392663476874003, "loss": 0.6733, "step": 9520 }, { "epoch": 15.2, "grad_norm": 3.0674171447753906, "learning_rate": 0.0013920255183413077, "loss": 0.655, "step": 9530 }, { "epoch": 15.22, "grad_norm": 1.9924139976501465, "learning_rate": 0.0013913875598086124, "loss": 0.6315, "step": 9540 }, { "epoch": 15.23, "grad_norm": 3.7744829654693604, "learning_rate": 0.001390749601275917, "loss": 0.6991, "step": 9550 }, { "epoch": 15.25, "grad_norm": 3.4672529697418213, "learning_rate": 0.0013901116427432216, "loss": 0.7007, "step": 9560 }, { "epoch": 15.26, "grad_norm": 3.2644975185394287, "learning_rate": 0.0013894736842105264, "loss": 0.6403, "step": 9570 }, { "epoch": 15.28, "grad_norm": 2.8029818534851074, "learning_rate": 0.001388835725677831, "loss": 0.7316, "step": 9580 }, { "epoch": 15.3, "grad_norm": 1.8042049407958984, "learning_rate": 0.0013881977671451355, "loss": 0.6332, "step": 9590 }, { "epoch": 15.31, "grad_norm": 2.2891921997070312, "learning_rate": 0.00138755980861244, "loss": 0.6348, "step": 9600 }, { "epoch": 15.33, "grad_norm": 2.8570570945739746, "learning_rate": 0.0013869218500797448, "loss": 0.6607, "step": 9610 }, { "epoch": 15.34, "grad_norm": 2.2186977863311768, "learning_rate": 0.0013862838915470494, "loss": 0.627, "step": 9620 }, { "epoch": 15.36, "grad_norm": 2.3791275024414062, "learning_rate": 0.001385645933014354, "loss": 0.6721, "step": 9630 }, { "epoch": 15.37, "grad_norm": 2.992490530014038, "learning_rate": 0.0013850079744816587, "loss": 0.6734, "step": 9640 }, { "epoch": 15.39, "grad_norm": 5.538806438446045, "learning_rate": 0.0013843700159489633, "loss": 0.678, "step": 9650 }, { "epoch": 15.41, "grad_norm": 2.7970008850097656, "learning_rate": 0.0013837320574162679, "loss": 0.6576, "step": 9660 }, { "epoch": 15.42, "grad_norm": 5.550447940826416, "learning_rate": 0.0013830940988835724, "loss": 0.7144, "step": 9670 }, { "epoch": 15.44, "grad_norm": 2.3102047443389893, "learning_rate": 0.0013824561403508772, "loss": 0.669, "step": 9680 }, { "epoch": 15.45, "grad_norm": 3.720393419265747, "learning_rate": 0.0013818181818181818, "loss": 0.7442, "step": 9690 }, { "epoch": 15.47, "grad_norm": 2.284290075302124, "learning_rate": 0.0013811802232854863, "loss": 0.7835, "step": 9700 }, { "epoch": 15.49, "grad_norm": 3.2873239517211914, "learning_rate": 0.0013805422647527909, "loss": 0.6662, "step": 9710 }, { "epoch": 15.5, "grad_norm": 2.7117483615875244, "learning_rate": 0.0013799043062200959, "loss": 0.7348, "step": 9720 }, { "epoch": 15.52, "grad_norm": 3.2797791957855225, "learning_rate": 0.0013792663476874004, "loss": 0.8664, "step": 9730 }, { "epoch": 15.53, "grad_norm": 3.7056384086608887, "learning_rate": 0.001378628389154705, "loss": 0.7128, "step": 9740 }, { "epoch": 15.55, "grad_norm": 2.3162360191345215, "learning_rate": 0.0013779904306220098, "loss": 0.6596, "step": 9750 }, { "epoch": 15.57, "grad_norm": 2.1081748008728027, "learning_rate": 0.0013773524720893143, "loss": 0.7492, "step": 9760 }, { "epoch": 15.58, "grad_norm": 3.5717201232910156, "learning_rate": 0.001376714513556619, "loss": 0.8277, "step": 9770 }, { "epoch": 15.6, "grad_norm": 3.751756429672241, "learning_rate": 0.0013760765550239235, "loss": 0.7028, "step": 9780 }, { "epoch": 15.61, "grad_norm": 3.4455363750457764, "learning_rate": 0.0013754385964912283, "loss": 0.7339, "step": 9790 }, { "epoch": 15.63, "grad_norm": 2.6450400352478027, "learning_rate": 0.0013748006379585328, "loss": 0.8658, "step": 9800 }, { "epoch": 15.65, "grad_norm": 2.7757487297058105, "learning_rate": 0.0013741626794258374, "loss": 0.7462, "step": 9810 }, { "epoch": 15.66, "grad_norm": 2.791318416595459, "learning_rate": 0.0013735247208931422, "loss": 0.7724, "step": 9820 }, { "epoch": 15.68, "grad_norm": 2.722747802734375, "learning_rate": 0.0013728867623604467, "loss": 0.8533, "step": 9830 }, { "epoch": 15.69, "grad_norm": 2.778831958770752, "learning_rate": 0.0013722488038277513, "loss": 0.7243, "step": 9840 }, { "epoch": 15.71, "grad_norm": 2.3124783039093018, "learning_rate": 0.0013716108452950558, "loss": 0.87, "step": 9850 }, { "epoch": 15.73, "grad_norm": 2.3077304363250732, "learning_rate": 0.0013709728867623606, "loss": 0.8462, "step": 9860 }, { "epoch": 15.74, "grad_norm": 4.141488552093506, "learning_rate": 0.0013703349282296652, "loss": 0.8513, "step": 9870 }, { "epoch": 15.76, "grad_norm": 2.998544454574585, "learning_rate": 0.0013696969696969697, "loss": 0.7472, "step": 9880 }, { "epoch": 15.77, "grad_norm": 2.3463869094848633, "learning_rate": 0.0013690590111642745, "loss": 0.8363, "step": 9890 }, { "epoch": 15.79, "grad_norm": 2.782196521759033, "learning_rate": 0.001368421052631579, "loss": 0.7076, "step": 9900 }, { "epoch": 15.81, "grad_norm": 3.6227550506591797, "learning_rate": 0.0013677830940988837, "loss": 0.677, "step": 9910 }, { "epoch": 15.82, "grad_norm": 3.1042935848236084, "learning_rate": 0.0013671451355661882, "loss": 0.8747, "step": 9920 }, { "epoch": 15.84, "grad_norm": 2.9278554916381836, "learning_rate": 0.001366507177033493, "loss": 0.8876, "step": 9930 }, { "epoch": 15.85, "grad_norm": 2.6750121116638184, "learning_rate": 0.0013658692185007976, "loss": 0.7374, "step": 9940 }, { "epoch": 15.87, "grad_norm": 2.563796043395996, "learning_rate": 0.0013652312599681021, "loss": 0.6978, "step": 9950 }, { "epoch": 15.89, "grad_norm": 2.839409112930298, "learning_rate": 0.001364593301435407, "loss": 0.7307, "step": 9960 }, { "epoch": 15.9, "grad_norm": 2.651336908340454, "learning_rate": 0.0013639553429027115, "loss": 0.6775, "step": 9970 }, { "epoch": 15.92, "grad_norm": 3.166914701461792, "learning_rate": 0.001363317384370016, "loss": 0.9131, "step": 9980 }, { "epoch": 15.93, "grad_norm": 2.0613489151000977, "learning_rate": 0.0013626794258373206, "loss": 0.8707, "step": 9990 }, { "epoch": 15.95, "grad_norm": 3.3213391304016113, "learning_rate": 0.0013620414673046254, "loss": 0.7631, "step": 10000 }, { "epoch": 15.96, "grad_norm": 3.0203397274017334, "learning_rate": 0.00136140350877193, "loss": 0.8512, "step": 10010 }, { "epoch": 15.98, "grad_norm": 2.070725202560425, "learning_rate": 0.0013607655502392345, "loss": 0.8031, "step": 10020 }, { "epoch": 16.0, "grad_norm": 2.3090660572052, "learning_rate": 0.0013601275917065393, "loss": 0.7327, "step": 10030 }, { "epoch": 16.01, "grad_norm": 2.2141530513763428, "learning_rate": 0.0013594896331738438, "loss": 0.6028, "step": 10040 }, { "epoch": 16.03, "grad_norm": 2.6139416694641113, "learning_rate": 0.0013588516746411484, "loss": 0.5008, "step": 10050 }, { "epoch": 16.04, "grad_norm": 4.001714706420898, "learning_rate": 0.001358213716108453, "loss": 0.5758, "step": 10060 }, { "epoch": 16.06, "grad_norm": 3.021545886993408, "learning_rate": 0.0013575757575757577, "loss": 0.5445, "step": 10070 }, { "epoch": 16.08, "grad_norm": 3.3995559215545654, "learning_rate": 0.0013569377990430623, "loss": 0.6745, "step": 10080 }, { "epoch": 16.09, "grad_norm": 1.990356683731079, "learning_rate": 0.0013562998405103669, "loss": 0.57, "step": 10090 }, { "epoch": 16.11, "grad_norm": 2.0073490142822266, "learning_rate": 0.0013556618819776714, "loss": 0.532, "step": 10100 }, { "epoch": 16.12, "grad_norm": 3.74519681930542, "learning_rate": 0.0013550239234449762, "loss": 0.5446, "step": 10110 }, { "epoch": 16.14, "grad_norm": 3.367241144180298, "learning_rate": 0.0013543859649122808, "loss": 0.6287, "step": 10120 }, { "epoch": 16.16, "grad_norm": 2.2096097469329834, "learning_rate": 0.0013537480063795853, "loss": 0.6978, "step": 10130 }, { "epoch": 16.17, "grad_norm": 1.8970123529434204, "learning_rate": 0.0013531100478468901, "loss": 0.6808, "step": 10140 }, { "epoch": 16.19, "grad_norm": 3.6559560298919678, "learning_rate": 0.0013524720893141947, "loss": 0.6263, "step": 10150 }, { "epoch": 16.2, "grad_norm": 2.3549561500549316, "learning_rate": 0.0013518341307814992, "loss": 0.6148, "step": 10160 }, { "epoch": 16.22, "grad_norm": 1.73717200756073, "learning_rate": 0.0013511961722488038, "loss": 0.5918, "step": 10170 }, { "epoch": 16.24, "grad_norm": 2.160614252090454, "learning_rate": 0.0013505582137161086, "loss": 0.6008, "step": 10180 }, { "epoch": 16.25, "grad_norm": 3.5011887550354004, "learning_rate": 0.0013499202551834131, "loss": 0.7248, "step": 10190 }, { "epoch": 16.27, "grad_norm": 2.6921088695526123, "learning_rate": 0.0013492822966507177, "loss": 0.7165, "step": 10200 }, { "epoch": 16.28, "grad_norm": 1.8108500242233276, "learning_rate": 0.0013486443381180225, "loss": 0.5676, "step": 10210 }, { "epoch": 16.3, "grad_norm": 2.7839293479919434, "learning_rate": 0.001348006379585327, "loss": 0.65, "step": 10220 }, { "epoch": 16.32, "grad_norm": 2.6478052139282227, "learning_rate": 0.0013473684210526316, "loss": 0.7421, "step": 10230 }, { "epoch": 16.33, "grad_norm": 2.5701370239257812, "learning_rate": 0.0013467304625199362, "loss": 0.7106, "step": 10240 }, { "epoch": 16.35, "grad_norm": 2.3916168212890625, "learning_rate": 0.001346092503987241, "loss": 0.5939, "step": 10250 }, { "epoch": 16.36, "grad_norm": 2.6145966053009033, "learning_rate": 0.0013454545454545455, "loss": 0.6589, "step": 10260 }, { "epoch": 16.38, "grad_norm": 2.416173219680786, "learning_rate": 0.00134481658692185, "loss": 0.5956, "step": 10270 }, { "epoch": 16.4, "grad_norm": 3.0522923469543457, "learning_rate": 0.0013441786283891549, "loss": 0.7672, "step": 10280 }, { "epoch": 16.41, "grad_norm": 3.9606542587280273, "learning_rate": 0.0013435406698564594, "loss": 0.6298, "step": 10290 }, { "epoch": 16.43, "grad_norm": 2.6333351135253906, "learning_rate": 0.001342902711323764, "loss": 0.758, "step": 10300 }, { "epoch": 16.44, "grad_norm": 3.0208117961883545, "learning_rate": 0.0013422647527910685, "loss": 0.7356, "step": 10310 }, { "epoch": 16.46, "grad_norm": 2.344989776611328, "learning_rate": 0.0013416267942583733, "loss": 0.6947, "step": 10320 }, { "epoch": 16.48, "grad_norm": 1.7995355129241943, "learning_rate": 0.0013409888357256779, "loss": 0.7519, "step": 10330 }, { "epoch": 16.49, "grad_norm": 1.8002946376800537, "learning_rate": 0.0013403508771929824, "loss": 0.5739, "step": 10340 }, { "epoch": 16.51, "grad_norm": 2.094810962677002, "learning_rate": 0.0013397129186602872, "loss": 0.6613, "step": 10350 }, { "epoch": 16.52, "grad_norm": 3.815561294555664, "learning_rate": 0.0013390749601275918, "loss": 0.6778, "step": 10360 }, { "epoch": 16.54, "grad_norm": 2.7428698539733887, "learning_rate": 0.0013384370015948964, "loss": 0.6428, "step": 10370 }, { "epoch": 16.56, "grad_norm": 2.3527848720550537, "learning_rate": 0.001337799043062201, "loss": 0.703, "step": 10380 }, { "epoch": 16.57, "grad_norm": 2.305804967880249, "learning_rate": 0.0013371610845295057, "loss": 0.6954, "step": 10390 }, { "epoch": 16.59, "grad_norm": 2.0628771781921387, "learning_rate": 0.0013365231259968103, "loss": 0.6499, "step": 10400 }, { "epoch": 16.6, "grad_norm": 3.61171555519104, "learning_rate": 0.0013358851674641148, "loss": 0.767, "step": 10410 }, { "epoch": 16.62, "grad_norm": 1.9354444742202759, "learning_rate": 0.0013352472089314194, "loss": 0.7736, "step": 10420 }, { "epoch": 16.63, "grad_norm": 2.2509772777557373, "learning_rate": 0.0013346092503987242, "loss": 0.6941, "step": 10430 }, { "epoch": 16.65, "grad_norm": 3.0013530254364014, "learning_rate": 0.0013339712918660287, "loss": 0.7728, "step": 10440 }, { "epoch": 16.67, "grad_norm": 2.995089292526245, "learning_rate": 0.0013333333333333333, "loss": 0.6111, "step": 10450 }, { "epoch": 16.68, "grad_norm": 2.9852423667907715, "learning_rate": 0.001332695374800638, "loss": 0.7604, "step": 10460 }, { "epoch": 16.7, "grad_norm": 1.8430482149124146, "learning_rate": 0.0013320574162679426, "loss": 0.6248, "step": 10470 }, { "epoch": 16.71, "grad_norm": 2.271106481552124, "learning_rate": 0.0013314194577352472, "loss": 0.6576, "step": 10480 }, { "epoch": 16.73, "grad_norm": 3.168851852416992, "learning_rate": 0.0013307814992025518, "loss": 0.7078, "step": 10490 }, { "epoch": 16.75, "grad_norm": 3.591390371322632, "learning_rate": 0.0013301435406698565, "loss": 0.7218, "step": 10500 }, { "epoch": 16.76, "grad_norm": 2.9601821899414062, "learning_rate": 0.001329505582137161, "loss": 0.6973, "step": 10510 }, { "epoch": 16.78, "grad_norm": 2.4465489387512207, "learning_rate": 0.0013288676236044657, "loss": 0.6793, "step": 10520 }, { "epoch": 16.79, "grad_norm": 3.1582698822021484, "learning_rate": 0.0013282296650717704, "loss": 0.6457, "step": 10530 }, { "epoch": 16.81, "grad_norm": 2.704655408859253, "learning_rate": 0.001327591706539075, "loss": 0.7386, "step": 10540 }, { "epoch": 16.83, "grad_norm": 2.8489794731140137, "learning_rate": 0.0013269537480063796, "loss": 0.7504, "step": 10550 }, { "epoch": 16.84, "grad_norm": 3.1505606174468994, "learning_rate": 0.0013263157894736841, "loss": 0.7292, "step": 10560 }, { "epoch": 16.86, "grad_norm": 2.3454043865203857, "learning_rate": 0.001325677830940989, "loss": 0.7907, "step": 10570 }, { "epoch": 16.87, "grad_norm": 3.128525972366333, "learning_rate": 0.0013250398724082935, "loss": 0.6484, "step": 10580 }, { "epoch": 16.89, "grad_norm": 3.890327215194702, "learning_rate": 0.001324401913875598, "loss": 0.7387, "step": 10590 }, { "epoch": 16.91, "grad_norm": 3.827643394470215, "learning_rate": 0.0013237639553429028, "loss": 0.8169, "step": 10600 }, { "epoch": 16.92, "grad_norm": 2.757068395614624, "learning_rate": 0.0013231259968102074, "loss": 0.7266, "step": 10610 }, { "epoch": 16.94, "grad_norm": 2.3636882305145264, "learning_rate": 0.001322488038277512, "loss": 0.8095, "step": 10620 }, { "epoch": 16.95, "grad_norm": 2.3341169357299805, "learning_rate": 0.0013218500797448165, "loss": 0.757, "step": 10630 }, { "epoch": 16.97, "grad_norm": 3.235461950302124, "learning_rate": 0.0013212121212121213, "loss": 0.7138, "step": 10640 }, { "epoch": 16.99, "grad_norm": 3.797213315963745, "learning_rate": 0.0013205741626794258, "loss": 0.7479, "step": 10650 }, { "epoch": 17.0, "grad_norm": 2.1070356369018555, "learning_rate": 0.0013199362041467304, "loss": 0.7187, "step": 10660 }, { "epoch": 17.02, "grad_norm": 2.2326266765594482, "learning_rate": 0.0013192982456140352, "loss": 0.4728, "step": 10670 }, { "epoch": 17.03, "grad_norm": 2.8324732780456543, "learning_rate": 0.0013186602870813397, "loss": 0.5915, "step": 10680 }, { "epoch": 17.05, "grad_norm": 2.2015562057495117, "learning_rate": 0.0013180223285486443, "loss": 0.6181, "step": 10690 }, { "epoch": 17.07, "grad_norm": 1.9790899753570557, "learning_rate": 0.0013173843700159489, "loss": 0.5131, "step": 10700 }, { "epoch": 17.08, "grad_norm": 2.4350438117980957, "learning_rate": 0.0013167464114832537, "loss": 0.5058, "step": 10710 }, { "epoch": 17.1, "grad_norm": 2.701519250869751, "learning_rate": 0.0013161084529505582, "loss": 0.5272, "step": 10720 }, { "epoch": 17.11, "grad_norm": 2.316878318786621, "learning_rate": 0.0013154704944178628, "loss": 0.5489, "step": 10730 }, { "epoch": 17.13, "grad_norm": 2.2858500480651855, "learning_rate": 0.0013148325358851676, "loss": 0.6087, "step": 10740 }, { "epoch": 17.15, "grad_norm": 1.5047816038131714, "learning_rate": 0.0013141945773524721, "loss": 0.5726, "step": 10750 }, { "epoch": 17.16, "grad_norm": 2.081256628036499, "learning_rate": 0.0013135566188197767, "loss": 0.5417, "step": 10760 }, { "epoch": 17.18, "grad_norm": 1.8512243032455444, "learning_rate": 0.0013129186602870812, "loss": 0.7781, "step": 10770 }, { "epoch": 17.19, "grad_norm": 2.651259422302246, "learning_rate": 0.001312280701754386, "loss": 0.6324, "step": 10780 }, { "epoch": 17.21, "grad_norm": 1.8741660118103027, "learning_rate": 0.0013116427432216906, "loss": 0.5242, "step": 10790 }, { "epoch": 17.22, "grad_norm": 2.223308801651001, "learning_rate": 0.0013110047846889952, "loss": 0.5837, "step": 10800 }, { "epoch": 17.24, "grad_norm": 2.954585552215576, "learning_rate": 0.0013103668261562997, "loss": 0.7762, "step": 10810 }, { "epoch": 17.26, "grad_norm": 2.075242519378662, "learning_rate": 0.0013097288676236045, "loss": 0.6981, "step": 10820 }, { "epoch": 17.27, "grad_norm": 1.9512617588043213, "learning_rate": 0.001309090909090909, "loss": 0.5878, "step": 10830 }, { "epoch": 17.29, "grad_norm": 2.4567389488220215, "learning_rate": 0.0013084529505582136, "loss": 0.5861, "step": 10840 }, { "epoch": 17.3, "grad_norm": 2.4589033126831055, "learning_rate": 0.0013078149920255184, "loss": 0.5776, "step": 10850 }, { "epoch": 17.32, "grad_norm": 3.0933573246002197, "learning_rate": 0.001307177033492823, "loss": 0.6265, "step": 10860 }, { "epoch": 17.34, "grad_norm": 2.9563870429992676, "learning_rate": 0.0013065390749601275, "loss": 0.688, "step": 10870 }, { "epoch": 17.35, "grad_norm": 2.6502304077148438, "learning_rate": 0.001305901116427432, "loss": 0.6685, "step": 10880 }, { "epoch": 17.37, "grad_norm": 2.815063238143921, "learning_rate": 0.0013052631578947369, "loss": 0.651, "step": 10890 }, { "epoch": 17.38, "grad_norm": 2.2861077785491943, "learning_rate": 0.0013046251993620414, "loss": 0.588, "step": 10900 }, { "epoch": 17.4, "grad_norm": 2.0195345878601074, "learning_rate": 0.001303987240829346, "loss": 0.6129, "step": 10910 }, { "epoch": 17.42, "grad_norm": 2.192063331604004, "learning_rate": 0.0013033492822966508, "loss": 0.6819, "step": 10920 }, { "epoch": 17.43, "grad_norm": 3.0410258769989014, "learning_rate": 0.0013027113237639553, "loss": 0.6265, "step": 10930 }, { "epoch": 17.45, "grad_norm": 1.9278006553649902, "learning_rate": 0.00130207336523126, "loss": 0.5889, "step": 10940 }, { "epoch": 17.46, "grad_norm": 2.2657618522644043, "learning_rate": 0.0013014354066985645, "loss": 0.6477, "step": 10950 }, { "epoch": 17.48, "grad_norm": 3.8989851474761963, "learning_rate": 0.0013007974481658692, "loss": 0.6831, "step": 10960 }, { "epoch": 17.5, "grad_norm": 2.630307197570801, "learning_rate": 0.0013001594896331738, "loss": 0.7138, "step": 10970 }, { "epoch": 17.51, "grad_norm": 2.4029276371002197, "learning_rate": 0.0012995215311004784, "loss": 0.6426, "step": 10980 }, { "epoch": 17.53, "grad_norm": 2.127747058868408, "learning_rate": 0.0012988835725677831, "loss": 0.7466, "step": 10990 }, { "epoch": 17.54, "grad_norm": 1.9066559076309204, "learning_rate": 0.0012982456140350877, "loss": 0.578, "step": 11000 }, { "epoch": 17.56, "grad_norm": 2.585181713104248, "learning_rate": 0.0012976076555023923, "loss": 0.6634, "step": 11010 }, { "epoch": 17.58, "grad_norm": 1.7290911674499512, "learning_rate": 0.0012969696969696968, "loss": 0.6455, "step": 11020 }, { "epoch": 17.59, "grad_norm": 3.596162796020508, "learning_rate": 0.0012963317384370016, "loss": 0.6049, "step": 11030 }, { "epoch": 17.61, "grad_norm": 1.8785613775253296, "learning_rate": 0.0012956937799043062, "loss": 0.6151, "step": 11040 }, { "epoch": 17.62, "grad_norm": 3.3277037143707275, "learning_rate": 0.0012950558213716107, "loss": 0.5931, "step": 11050 }, { "epoch": 17.64, "grad_norm": 2.3854615688323975, "learning_rate": 0.0012944178628389155, "loss": 0.6332, "step": 11060 }, { "epoch": 17.66, "grad_norm": 2.8721373081207275, "learning_rate": 0.00129377990430622, "loss": 0.731, "step": 11070 }, { "epoch": 17.67, "grad_norm": 3.060612440109253, "learning_rate": 0.0012931419457735246, "loss": 0.7137, "step": 11080 }, { "epoch": 17.69, "grad_norm": 2.5586180686950684, "learning_rate": 0.0012925039872408292, "loss": 0.5945, "step": 11090 }, { "epoch": 17.7, "grad_norm": 2.5695533752441406, "learning_rate": 0.001291866028708134, "loss": 0.7658, "step": 11100 }, { "epoch": 17.72, "grad_norm": 4.104732036590576, "learning_rate": 0.0012912280701754385, "loss": 0.7704, "step": 11110 }, { "epoch": 17.74, "grad_norm": 3.1808011531829834, "learning_rate": 0.001290590111642743, "loss": 0.6476, "step": 11120 }, { "epoch": 17.75, "grad_norm": 2.210597038269043, "learning_rate": 0.0012899521531100477, "loss": 0.722, "step": 11130 }, { "epoch": 17.77, "grad_norm": 2.6710522174835205, "learning_rate": 0.0012893141945773524, "loss": 0.6275, "step": 11140 }, { "epoch": 17.78, "grad_norm": 2.2379961013793945, "learning_rate": 0.001288676236044657, "loss": 0.6321, "step": 11150 }, { "epoch": 17.8, "grad_norm": 2.719963312149048, "learning_rate": 0.0012880382775119616, "loss": 0.6912, "step": 11160 }, { "epoch": 17.81, "grad_norm": 2.1712732315063477, "learning_rate": 0.0012874003189792664, "loss": 0.6567, "step": 11170 }, { "epoch": 17.83, "grad_norm": 4.386512279510498, "learning_rate": 0.001286762360446571, "loss": 0.6933, "step": 11180 }, { "epoch": 17.85, "grad_norm": 4.8913373947143555, "learning_rate": 0.0012861244019138755, "loss": 0.6339, "step": 11190 }, { "epoch": 17.86, "grad_norm": 3.154282569885254, "learning_rate": 0.00128548644338118, "loss": 0.6558, "step": 11200 }, { "epoch": 17.88, "grad_norm": 4.05545711517334, "learning_rate": 0.001284848484848485, "loss": 0.7379, "step": 11210 }, { "epoch": 17.89, "grad_norm": 2.9737448692321777, "learning_rate": 0.0012842105263157896, "loss": 0.7563, "step": 11220 }, { "epoch": 17.91, "grad_norm": 4.375244617462158, "learning_rate": 0.0012835725677830942, "loss": 0.7779, "step": 11230 }, { "epoch": 17.93, "grad_norm": 2.775324821472168, "learning_rate": 0.001282934609250399, "loss": 0.6884, "step": 11240 }, { "epoch": 17.94, "grad_norm": 2.1626110076904297, "learning_rate": 0.0012822966507177035, "loss": 0.7123, "step": 11250 }, { "epoch": 17.96, "grad_norm": 3.951596260070801, "learning_rate": 0.001281658692185008, "loss": 0.7645, "step": 11260 }, { "epoch": 17.97, "grad_norm": 2.271362066268921, "learning_rate": 0.0012810207336523126, "loss": 0.8049, "step": 11270 }, { "epoch": 17.99, "grad_norm": 3.3153727054595947, "learning_rate": 0.0012803827751196174, "loss": 0.7484, "step": 11280 }, { "epoch": 18.01, "grad_norm": 2.451831340789795, "learning_rate": 0.001279744816586922, "loss": 0.6549, "step": 11290 }, { "epoch": 18.02, "grad_norm": 2.1044692993164062, "learning_rate": 0.0012791068580542265, "loss": 0.5569, "step": 11300 }, { "epoch": 18.04, "grad_norm": 5.281918525695801, "learning_rate": 0.0012784688995215313, "loss": 0.5877, "step": 11310 }, { "epoch": 18.05, "grad_norm": 2.706597328186035, "learning_rate": 0.0012778309409888359, "loss": 0.541, "step": 11320 }, { "epoch": 18.07, "grad_norm": 2.2525746822357178, "learning_rate": 0.0012771929824561404, "loss": 0.5092, "step": 11330 }, { "epoch": 18.09, "grad_norm": 2.823735475540161, "learning_rate": 0.001276555023923445, "loss": 0.5205, "step": 11340 }, { "epoch": 18.1, "grad_norm": 2.758739948272705, "learning_rate": 0.0012759170653907498, "loss": 0.5012, "step": 11350 }, { "epoch": 18.12, "grad_norm": 3.0362417697906494, "learning_rate": 0.0012752791068580543, "loss": 0.6093, "step": 11360 }, { "epoch": 18.13, "grad_norm": 1.967462182044983, "learning_rate": 0.001274641148325359, "loss": 0.6387, "step": 11370 }, { "epoch": 18.15, "grad_norm": 2.352168083190918, "learning_rate": 0.0012740031897926637, "loss": 0.6707, "step": 11380 }, { "epoch": 18.17, "grad_norm": 1.6705100536346436, "learning_rate": 0.0012733652312599683, "loss": 0.6675, "step": 11390 }, { "epoch": 18.18, "grad_norm": 2.1992321014404297, "learning_rate": 0.0012727272727272728, "loss": 0.7195, "step": 11400 }, { "epoch": 18.2, "grad_norm": 1.7198143005371094, "learning_rate": 0.0012720893141945774, "loss": 0.6287, "step": 11410 }, { "epoch": 18.21, "grad_norm": 2.229097604751587, "learning_rate": 0.0012714513556618822, "loss": 0.5207, "step": 11420 }, { "epoch": 18.23, "grad_norm": 3.074547529220581, "learning_rate": 0.0012708133971291867, "loss": 0.5929, "step": 11430 }, { "epoch": 18.25, "grad_norm": 2.6688926219940186, "learning_rate": 0.0012701754385964913, "loss": 0.5927, "step": 11440 }, { "epoch": 18.26, "grad_norm": 2.6187679767608643, "learning_rate": 0.0012695374800637958, "loss": 0.5733, "step": 11450 }, { "epoch": 18.28, "grad_norm": 2.056699752807617, "learning_rate": 0.0012688995215311006, "loss": 0.6502, "step": 11460 }, { "epoch": 18.29, "grad_norm": 3.4182140827178955, "learning_rate": 0.0012682615629984052, "loss": 0.5932, "step": 11470 }, { "epoch": 18.31, "grad_norm": 2.9311532974243164, "learning_rate": 0.0012676236044657097, "loss": 0.5568, "step": 11480 }, { "epoch": 18.33, "grad_norm": 1.7414332628250122, "learning_rate": 0.0012669856459330145, "loss": 0.5494, "step": 11490 }, { "epoch": 18.34, "grad_norm": 2.6820008754730225, "learning_rate": 0.001266347687400319, "loss": 0.7878, "step": 11500 }, { "epoch": 18.36, "grad_norm": 2.811760663986206, "learning_rate": 0.0012657097288676237, "loss": 0.5324, "step": 11510 }, { "epoch": 18.37, "grad_norm": 3.026895046234131, "learning_rate": 0.0012650717703349282, "loss": 0.7353, "step": 11520 }, { "epoch": 18.39, "grad_norm": 2.6072068214416504, "learning_rate": 0.001264433811802233, "loss": 0.6509, "step": 11530 }, { "epoch": 18.41, "grad_norm": 2.0730879306793213, "learning_rate": 0.0012637958532695376, "loss": 0.5353, "step": 11540 }, { "epoch": 18.42, "grad_norm": 3.863426923751831, "learning_rate": 0.0012631578947368421, "loss": 0.6443, "step": 11550 }, { "epoch": 18.44, "grad_norm": 1.9193871021270752, "learning_rate": 0.001262519936204147, "loss": 0.5827, "step": 11560 }, { "epoch": 18.45, "grad_norm": 3.126490354537964, "learning_rate": 0.0012618819776714515, "loss": 0.6556, "step": 11570 }, { "epoch": 18.47, "grad_norm": 3.189641237258911, "learning_rate": 0.001261244019138756, "loss": 0.693, "step": 11580 }, { "epoch": 18.48, "grad_norm": 3.374671220779419, "learning_rate": 0.0012606060606060606, "loss": 0.6156, "step": 11590 }, { "epoch": 18.5, "grad_norm": 1.7221401929855347, "learning_rate": 0.0012599681020733654, "loss": 0.6865, "step": 11600 }, { "epoch": 18.52, "grad_norm": 1.6494935750961304, "learning_rate": 0.00125933014354067, "loss": 0.6058, "step": 11610 }, { "epoch": 18.53, "grad_norm": 2.8912765979766846, "learning_rate": 0.0012586921850079745, "loss": 0.6022, "step": 11620 }, { "epoch": 18.55, "grad_norm": 2.1293585300445557, "learning_rate": 0.0012580542264752793, "loss": 0.5817, "step": 11630 }, { "epoch": 18.56, "grad_norm": 3.3972530364990234, "learning_rate": 0.0012574162679425838, "loss": 0.6279, "step": 11640 }, { "epoch": 18.58, "grad_norm": 4.464833736419678, "learning_rate": 0.0012567783094098884, "loss": 0.6584, "step": 11650 }, { "epoch": 18.6, "grad_norm": 3.3168396949768066, "learning_rate": 0.001256140350877193, "loss": 0.7492, "step": 11660 }, { "epoch": 18.61, "grad_norm": 1.7018378973007202, "learning_rate": 0.0012555023923444977, "loss": 0.6064, "step": 11670 }, { "epoch": 18.63, "grad_norm": 2.8935000896453857, "learning_rate": 0.0012548644338118023, "loss": 0.6793, "step": 11680 }, { "epoch": 18.64, "grad_norm": 3.3293614387512207, "learning_rate": 0.0012542264752791069, "loss": 0.6478, "step": 11690 }, { "epoch": 18.66, "grad_norm": 2.4878737926483154, "learning_rate": 0.0012535885167464116, "loss": 0.5137, "step": 11700 }, { "epoch": 18.68, "grad_norm": 2.662574529647827, "learning_rate": 0.0012529505582137162, "loss": 0.6051, "step": 11710 }, { "epoch": 18.69, "grad_norm": 2.5218799114227295, "learning_rate": 0.0012523125996810208, "loss": 0.6176, "step": 11720 }, { "epoch": 18.71, "grad_norm": 2.6172173023223877, "learning_rate": 0.0012516746411483253, "loss": 0.627, "step": 11730 }, { "epoch": 18.72, "grad_norm": 2.4706501960754395, "learning_rate": 0.0012510366826156301, "loss": 0.6826, "step": 11740 }, { "epoch": 18.74, "grad_norm": 1.9907801151275635, "learning_rate": 0.0012503987240829347, "loss": 0.6528, "step": 11750 }, { "epoch": 18.76, "grad_norm": 4.803826808929443, "learning_rate": 0.0012497607655502392, "loss": 0.599, "step": 11760 }, { "epoch": 18.77, "grad_norm": 2.5642504692077637, "learning_rate": 0.001249122807017544, "loss": 0.6657, "step": 11770 }, { "epoch": 18.79, "grad_norm": 4.334081649780273, "learning_rate": 0.0012484848484848486, "loss": 0.6764, "step": 11780 }, { "epoch": 18.8, "grad_norm": 2.7521369457244873, "learning_rate": 0.0012478468899521531, "loss": 0.6906, "step": 11790 }, { "epoch": 18.82, "grad_norm": 2.13214373588562, "learning_rate": 0.0012472089314194577, "loss": 0.6823, "step": 11800 }, { "epoch": 18.84, "grad_norm": 3.1697006225585938, "learning_rate": 0.0012465709728867625, "loss": 0.6231, "step": 11810 }, { "epoch": 18.85, "grad_norm": 2.6898703575134277, "learning_rate": 0.001245933014354067, "loss": 0.8077, "step": 11820 }, { "epoch": 18.87, "grad_norm": 3.177943706512451, "learning_rate": 0.0012452950558213716, "loss": 0.634, "step": 11830 }, { "epoch": 18.88, "grad_norm": 2.5923023223876953, "learning_rate": 0.0012446570972886762, "loss": 0.5839, "step": 11840 }, { "epoch": 18.9, "grad_norm": 1.8359884023666382, "learning_rate": 0.001244019138755981, "loss": 0.5992, "step": 11850 }, { "epoch": 18.92, "grad_norm": 2.252401828765869, "learning_rate": 0.0012433811802232855, "loss": 0.5877, "step": 11860 }, { "epoch": 18.93, "grad_norm": 2.945974588394165, "learning_rate": 0.00124274322169059, "loss": 0.8981, "step": 11870 }, { "epoch": 18.95, "grad_norm": 2.5869786739349365, "learning_rate": 0.0012421052631578949, "loss": 0.7204, "step": 11880 }, { "epoch": 18.96, "grad_norm": 2.0073652267456055, "learning_rate": 0.0012414673046251994, "loss": 0.7112, "step": 11890 }, { "epoch": 18.98, "grad_norm": 2.726731777191162, "learning_rate": 0.001240829346092504, "loss": 0.6721, "step": 11900 }, { "epoch": 19.0, "grad_norm": 2.646214246749878, "learning_rate": 0.0012401913875598085, "loss": 0.6353, "step": 11910 }, { "epoch": 19.01, "grad_norm": 1.7647764682769775, "learning_rate": 0.0012395534290271133, "loss": 0.5528, "step": 11920 }, { "epoch": 19.03, "grad_norm": 1.6511797904968262, "learning_rate": 0.0012389154704944179, "loss": 0.5311, "step": 11930 }, { "epoch": 19.04, "grad_norm": 3.120816707611084, "learning_rate": 0.0012382775119617224, "loss": 0.4873, "step": 11940 }, { "epoch": 19.06, "grad_norm": 1.2211092710494995, "learning_rate": 0.0012376395534290272, "loss": 0.5031, "step": 11950 }, { "epoch": 19.07, "grad_norm": 2.295135021209717, "learning_rate": 0.0012370015948963318, "loss": 0.5656, "step": 11960 }, { "epoch": 19.09, "grad_norm": 1.805337905883789, "learning_rate": 0.0012363636363636364, "loss": 0.5322, "step": 11970 }, { "epoch": 19.11, "grad_norm": 1.8517502546310425, "learning_rate": 0.001235725677830941, "loss": 0.5287, "step": 11980 }, { "epoch": 19.12, "grad_norm": 2.464036464691162, "learning_rate": 0.0012350877192982457, "loss": 0.5865, "step": 11990 }, { "epoch": 19.14, "grad_norm": 1.964254379272461, "learning_rate": 0.0012344497607655503, "loss": 0.512, "step": 12000 }, { "epoch": 19.15, "grad_norm": 2.386060953140259, "learning_rate": 0.0012338118022328548, "loss": 0.6416, "step": 12010 }, { "epoch": 19.17, "grad_norm": 2.4723477363586426, "learning_rate": 0.0012331738437001596, "loss": 0.5147, "step": 12020 }, { "epoch": 19.19, "grad_norm": 1.7513999938964844, "learning_rate": 0.0012325358851674642, "loss": 0.5413, "step": 12030 }, { "epoch": 19.2, "grad_norm": 1.3666512966156006, "learning_rate": 0.0012318979266347687, "loss": 0.619, "step": 12040 }, { "epoch": 19.22, "grad_norm": 2.0821938514709473, "learning_rate": 0.0012312599681020733, "loss": 0.5351, "step": 12050 }, { "epoch": 19.23, "grad_norm": 2.403721570968628, "learning_rate": 0.001230622009569378, "loss": 0.5107, "step": 12060 }, { "epoch": 19.25, "grad_norm": 2.3420348167419434, "learning_rate": 0.0012299840510366826, "loss": 0.5017, "step": 12070 }, { "epoch": 19.27, "grad_norm": 1.8931384086608887, "learning_rate": 0.0012293460925039872, "loss": 0.5291, "step": 12080 }, { "epoch": 19.28, "grad_norm": 1.815537691116333, "learning_rate": 0.001228708133971292, "loss": 0.6106, "step": 12090 }, { "epoch": 19.3, "grad_norm": 2.327855348587036, "learning_rate": 0.0012280701754385965, "loss": 0.4726, "step": 12100 }, { "epoch": 19.31, "grad_norm": 2.388517141342163, "learning_rate": 0.001227432216905901, "loss": 0.5529, "step": 12110 }, { "epoch": 19.33, "grad_norm": 3.0234811305999756, "learning_rate": 0.0012267942583732057, "loss": 0.6617, "step": 12120 }, { "epoch": 19.35, "grad_norm": 3.1139323711395264, "learning_rate": 0.0012261562998405104, "loss": 0.5682, "step": 12130 }, { "epoch": 19.36, "grad_norm": 3.9127554893493652, "learning_rate": 0.001225518341307815, "loss": 0.6042, "step": 12140 }, { "epoch": 19.38, "grad_norm": 3.9032232761383057, "learning_rate": 0.0012248803827751196, "loss": 0.6022, "step": 12150 }, { "epoch": 19.39, "grad_norm": 1.7738832235336304, "learning_rate": 0.0012242424242424241, "loss": 0.5483, "step": 12160 }, { "epoch": 19.41, "grad_norm": 3.865807294845581, "learning_rate": 0.001223604465709729, "loss": 0.6323, "step": 12170 }, { "epoch": 19.43, "grad_norm": 2.653740406036377, "learning_rate": 0.0012229665071770335, "loss": 0.6286, "step": 12180 }, { "epoch": 19.44, "grad_norm": 1.727924108505249, "learning_rate": 0.001222328548644338, "loss": 0.5744, "step": 12190 }, { "epoch": 19.46, "grad_norm": 2.1040127277374268, "learning_rate": 0.0012216905901116428, "loss": 0.5941, "step": 12200 }, { "epoch": 19.47, "grad_norm": 2.8161518573760986, "learning_rate": 0.0012210526315789474, "loss": 0.542, "step": 12210 }, { "epoch": 19.49, "grad_norm": 2.4196929931640625, "learning_rate": 0.001220414673046252, "loss": 0.5612, "step": 12220 }, { "epoch": 19.51, "grad_norm": 2.2649526596069336, "learning_rate": 0.0012197767145135565, "loss": 0.4941, "step": 12230 }, { "epoch": 19.52, "grad_norm": 3.1256422996520996, "learning_rate": 0.0012191387559808613, "loss": 0.551, "step": 12240 }, { "epoch": 19.54, "grad_norm": 2.1946921348571777, "learning_rate": 0.0012185007974481658, "loss": 0.5294, "step": 12250 }, { "epoch": 19.55, "grad_norm": 2.897484064102173, "learning_rate": 0.0012178628389154704, "loss": 0.7128, "step": 12260 }, { "epoch": 19.57, "grad_norm": 2.024834156036377, "learning_rate": 0.0012172248803827752, "loss": 0.5942, "step": 12270 }, { "epoch": 19.59, "grad_norm": 1.7685123682022095, "learning_rate": 0.0012165869218500797, "loss": 0.5827, "step": 12280 }, { "epoch": 19.6, "grad_norm": 2.0234525203704834, "learning_rate": 0.0012159489633173843, "loss": 0.6377, "step": 12290 }, { "epoch": 19.62, "grad_norm": 4.358128070831299, "learning_rate": 0.0012153110047846889, "loss": 0.7277, "step": 12300 }, { "epoch": 19.63, "grad_norm": 2.4699137210845947, "learning_rate": 0.0012146730462519937, "loss": 0.5023, "step": 12310 }, { "epoch": 19.65, "grad_norm": 2.4853904247283936, "learning_rate": 0.0012140350877192982, "loss": 0.585, "step": 12320 }, { "epoch": 19.67, "grad_norm": 2.560833215713501, "learning_rate": 0.0012133971291866028, "loss": 0.6899, "step": 12330 }, { "epoch": 19.68, "grad_norm": 3.2827863693237305, "learning_rate": 0.0012127591706539076, "loss": 0.6114, "step": 12340 }, { "epoch": 19.7, "grad_norm": 2.643315553665161, "learning_rate": 0.0012121212121212121, "loss": 0.6714, "step": 12350 }, { "epoch": 19.71, "grad_norm": 2.2757856845855713, "learning_rate": 0.0012114832535885167, "loss": 0.5607, "step": 12360 }, { "epoch": 19.73, "grad_norm": 2.054987668991089, "learning_rate": 0.0012108452950558212, "loss": 0.5481, "step": 12370 }, { "epoch": 19.74, "grad_norm": 2.3429064750671387, "learning_rate": 0.001210207336523126, "loss": 0.6629, "step": 12380 }, { "epoch": 19.76, "grad_norm": 1.6089274883270264, "learning_rate": 0.0012095693779904306, "loss": 0.6237, "step": 12390 }, { "epoch": 19.78, "grad_norm": 4.483922004699707, "learning_rate": 0.0012089314194577352, "loss": 0.6287, "step": 12400 }, { "epoch": 19.79, "grad_norm": 2.133923292160034, "learning_rate": 0.00120829346092504, "loss": 0.6648, "step": 12410 }, { "epoch": 19.81, "grad_norm": 2.3778302669525146, "learning_rate": 0.0012076555023923445, "loss": 0.639, "step": 12420 }, { "epoch": 19.82, "grad_norm": 2.589620351791382, "learning_rate": 0.001207017543859649, "loss": 0.6414, "step": 12430 }, { "epoch": 19.84, "grad_norm": 1.5527355670928955, "learning_rate": 0.0012063795853269536, "loss": 0.6587, "step": 12440 }, { "epoch": 19.86, "grad_norm": 3.8891091346740723, "learning_rate": 0.0012057416267942584, "loss": 0.6672, "step": 12450 }, { "epoch": 19.87, "grad_norm": 2.5779592990875244, "learning_rate": 0.001205103668261563, "loss": 0.6224, "step": 12460 }, { "epoch": 19.89, "grad_norm": 2.218827486038208, "learning_rate": 0.0012044657097288675, "loss": 0.6988, "step": 12470 }, { "epoch": 19.9, "grad_norm": 3.827039957046509, "learning_rate": 0.0012038277511961723, "loss": 0.5554, "step": 12480 }, { "epoch": 19.92, "grad_norm": 3.635878562927246, "learning_rate": 0.0012031897926634769, "loss": 0.6447, "step": 12490 }, { "epoch": 19.94, "grad_norm": 1.9988211393356323, "learning_rate": 0.0012025518341307814, "loss": 0.5721, "step": 12500 }, { "epoch": 19.95, "grad_norm": 4.294229984283447, "learning_rate": 0.001201913875598086, "loss": 0.6425, "step": 12510 }, { "epoch": 19.97, "grad_norm": 2.2810208797454834, "learning_rate": 0.0012012759170653908, "loss": 0.609, "step": 12520 }, { "epoch": 19.98, "grad_norm": 2.6013190746307373, "learning_rate": 0.0012006379585326953, "loss": 0.615, "step": 12530 }, { "epoch": 20.0, "grad_norm": 3.9176077842712402, "learning_rate": 0.0012, "loss": 0.6626, "step": 12540 }, { "epoch": 20.02, "grad_norm": 1.4916435480117798, "learning_rate": 0.0011993620414673045, "loss": 0.4802, "step": 12550 }, { "epoch": 20.03, "grad_norm": 1.8869787454605103, "learning_rate": 0.0011987240829346092, "loss": 0.4899, "step": 12560 }, { "epoch": 20.05, "grad_norm": 1.645322561264038, "learning_rate": 0.0011980861244019138, "loss": 0.4875, "step": 12570 }, { "epoch": 20.06, "grad_norm": 3.0053963661193848, "learning_rate": 0.0011974481658692184, "loss": 0.5344, "step": 12580 }, { "epoch": 20.08, "grad_norm": 1.9125926494598389, "learning_rate": 0.0011968102073365231, "loss": 0.5512, "step": 12590 }, { "epoch": 20.1, "grad_norm": 2.4130938053131104, "learning_rate": 0.0011961722488038277, "loss": 0.6046, "step": 12600 }, { "epoch": 20.11, "grad_norm": 2.648345947265625, "learning_rate": 0.0011955342902711323, "loss": 0.5085, "step": 12610 }, { "epoch": 20.13, "grad_norm": 3.288292646408081, "learning_rate": 0.0011948963317384368, "loss": 0.5101, "step": 12620 }, { "epoch": 20.14, "grad_norm": 2.3620495796203613, "learning_rate": 0.0011942583732057416, "loss": 0.502, "step": 12630 }, { "epoch": 20.16, "grad_norm": 2.2232260704040527, "learning_rate": 0.0011936204146730462, "loss": 0.5246, "step": 12640 }, { "epoch": 20.18, "grad_norm": 3.120986223220825, "learning_rate": 0.0011929824561403507, "loss": 0.621, "step": 12650 }, { "epoch": 20.19, "grad_norm": 1.5366686582565308, "learning_rate": 0.0011923444976076555, "loss": 0.4844, "step": 12660 }, { "epoch": 20.21, "grad_norm": 2.0947461128234863, "learning_rate": 0.00119170653907496, "loss": 0.6025, "step": 12670 }, { "epoch": 20.22, "grad_norm": 1.5178321599960327, "learning_rate": 0.0011910685805422646, "loss": 0.5421, "step": 12680 }, { "epoch": 20.24, "grad_norm": 2.9309802055358887, "learning_rate": 0.0011904306220095692, "loss": 0.4816, "step": 12690 }, { "epoch": 20.26, "grad_norm": 1.6734910011291504, "learning_rate": 0.0011897926634768742, "loss": 0.668, "step": 12700 }, { "epoch": 20.27, "grad_norm": 1.755245327949524, "learning_rate": 0.0011891547049441788, "loss": 0.5436, "step": 12710 }, { "epoch": 20.29, "grad_norm": 2.458543062210083, "learning_rate": 0.0011885167464114833, "loss": 0.5392, "step": 12720 }, { "epoch": 20.3, "grad_norm": 2.2478575706481934, "learning_rate": 0.001187878787878788, "loss": 0.5686, "step": 12730 }, { "epoch": 20.32, "grad_norm": 1.2976596355438232, "learning_rate": 0.0011872408293460927, "loss": 0.4879, "step": 12740 }, { "epoch": 20.33, "grad_norm": 2.250114917755127, "learning_rate": 0.0011866028708133972, "loss": 0.6064, "step": 12750 }, { "epoch": 20.35, "grad_norm": 2.2818796634674072, "learning_rate": 0.0011859649122807018, "loss": 0.5619, "step": 12760 }, { "epoch": 20.37, "grad_norm": 2.3820178508758545, "learning_rate": 0.0011853269537480066, "loss": 0.5126, "step": 12770 }, { "epoch": 20.38, "grad_norm": 3.2392003536224365, "learning_rate": 0.0011846889952153111, "loss": 0.544, "step": 12780 }, { "epoch": 20.4, "grad_norm": 3.075946092605591, "learning_rate": 0.0011840510366826157, "loss": 0.6301, "step": 12790 }, { "epoch": 20.41, "grad_norm": 4.1104230880737305, "learning_rate": 0.0011834130781499205, "loss": 0.6136, "step": 12800 }, { "epoch": 20.43, "grad_norm": 1.715682029724121, "learning_rate": 0.001182775119617225, "loss": 0.5667, "step": 12810 }, { "epoch": 20.45, "grad_norm": 1.5427650213241577, "learning_rate": 0.0011821371610845296, "loss": 0.6001, "step": 12820 }, { "epoch": 20.46, "grad_norm": 2.109271764755249, "learning_rate": 0.0011814992025518342, "loss": 0.5762, "step": 12830 }, { "epoch": 20.48, "grad_norm": 1.6117897033691406, "learning_rate": 0.001180861244019139, "loss": 0.5072, "step": 12840 }, { "epoch": 20.49, "grad_norm": 2.525860071182251, "learning_rate": 0.0011802232854864435, "loss": 0.6302, "step": 12850 }, { "epoch": 20.51, "grad_norm": 1.8826050758361816, "learning_rate": 0.001179585326953748, "loss": 0.5483, "step": 12860 }, { "epoch": 20.53, "grad_norm": 1.7801835536956787, "learning_rate": 0.0011789473684210526, "loss": 0.5239, "step": 12870 }, { "epoch": 20.54, "grad_norm": 3.124882459640503, "learning_rate": 0.0011783094098883574, "loss": 0.4724, "step": 12880 }, { "epoch": 20.56, "grad_norm": 2.8056161403656006, "learning_rate": 0.001177671451355662, "loss": 0.5681, "step": 12890 }, { "epoch": 20.57, "grad_norm": 2.165199041366577, "learning_rate": 0.0011770334928229665, "loss": 0.6012, "step": 12900 }, { "epoch": 20.59, "grad_norm": 2.297102451324463, "learning_rate": 0.0011763955342902713, "loss": 0.517, "step": 12910 }, { "epoch": 20.61, "grad_norm": 2.99562668800354, "learning_rate": 0.0011757575757575759, "loss": 0.5696, "step": 12920 }, { "epoch": 20.62, "grad_norm": 2.0757791996002197, "learning_rate": 0.0011751196172248804, "loss": 0.6102, "step": 12930 }, { "epoch": 20.64, "grad_norm": 2.441718816757202, "learning_rate": 0.001174481658692185, "loss": 0.6066, "step": 12940 }, { "epoch": 20.65, "grad_norm": 1.4816184043884277, "learning_rate": 0.0011738437001594898, "loss": 0.5083, "step": 12950 }, { "epoch": 20.67, "grad_norm": 2.349161386489868, "learning_rate": 0.0011732057416267943, "loss": 0.5705, "step": 12960 }, { "epoch": 20.69, "grad_norm": 2.0626585483551025, "learning_rate": 0.001172567783094099, "loss": 0.5551, "step": 12970 }, { "epoch": 20.7, "grad_norm": 2.3144423961639404, "learning_rate": 0.0011719298245614037, "loss": 0.7276, "step": 12980 }, { "epoch": 20.72, "grad_norm": 2.9268980026245117, "learning_rate": 0.0011712918660287083, "loss": 0.6176, "step": 12990 }, { "epoch": 20.73, "grad_norm": 2.339564323425293, "learning_rate": 0.0011706539074960128, "loss": 0.6206, "step": 13000 }, { "epoch": 20.75, "grad_norm": 2.358088493347168, "learning_rate": 0.0011700159489633174, "loss": 0.5862, "step": 13010 }, { "epoch": 20.77, "grad_norm": 2.127462863922119, "learning_rate": 0.0011693779904306222, "loss": 0.5488, "step": 13020 }, { "epoch": 20.78, "grad_norm": 3.3488762378692627, "learning_rate": 0.0011687400318979267, "loss": 0.6345, "step": 13030 }, { "epoch": 20.8, "grad_norm": 3.2236621379852295, "learning_rate": 0.0011681020733652313, "loss": 0.5897, "step": 13040 }, { "epoch": 20.81, "grad_norm": 3.0065135955810547, "learning_rate": 0.001167464114832536, "loss": 0.6214, "step": 13050 }, { "epoch": 20.83, "grad_norm": 1.797853946685791, "learning_rate": 0.0011668261562998406, "loss": 0.6224, "step": 13060 }, { "epoch": 20.85, "grad_norm": 1.6769222021102905, "learning_rate": 0.0011661881977671452, "loss": 0.533, "step": 13070 }, { "epoch": 20.86, "grad_norm": 2.111424207687378, "learning_rate": 0.0011655502392344497, "loss": 0.5674, "step": 13080 }, { "epoch": 20.88, "grad_norm": 1.9882782697677612, "learning_rate": 0.0011649122807017545, "loss": 0.6407, "step": 13090 }, { "epoch": 20.89, "grad_norm": 2.0077192783355713, "learning_rate": 0.001164274322169059, "loss": 0.4901, "step": 13100 }, { "epoch": 20.91, "grad_norm": 1.3955817222595215, "learning_rate": 0.0011636363636363637, "loss": 0.6638, "step": 13110 }, { "epoch": 20.93, "grad_norm": 2.236403226852417, "learning_rate": 0.0011629984051036684, "loss": 0.6267, "step": 13120 }, { "epoch": 20.94, "grad_norm": 2.0299949645996094, "learning_rate": 0.001162360446570973, "loss": 0.6097, "step": 13130 }, { "epoch": 20.96, "grad_norm": 3.4427030086517334, "learning_rate": 0.0011617224880382776, "loss": 0.6125, "step": 13140 }, { "epoch": 20.97, "grad_norm": 2.427687168121338, "learning_rate": 0.0011610845295055821, "loss": 0.6464, "step": 13150 }, { "epoch": 20.99, "grad_norm": 2.512589454650879, "learning_rate": 0.001160446570972887, "loss": 0.636, "step": 13160 }, { "epoch": 21.0, "grad_norm": 1.6817240715026855, "learning_rate": 0.0011598086124401915, "loss": 0.5498, "step": 13170 }, { "epoch": 21.02, "grad_norm": 2.4378724098205566, "learning_rate": 0.001159170653907496, "loss": 0.4912, "step": 13180 }, { "epoch": 21.04, "grad_norm": 1.7349364757537842, "learning_rate": 0.0011585326953748008, "loss": 0.4135, "step": 13190 }, { "epoch": 21.05, "grad_norm": 2.0071072578430176, "learning_rate": 0.0011578947368421054, "loss": 0.4892, "step": 13200 }, { "epoch": 21.07, "grad_norm": 2.1567165851593018, "learning_rate": 0.00115725677830941, "loss": 0.4291, "step": 13210 }, { "epoch": 21.08, "grad_norm": 1.5533453226089478, "learning_rate": 0.0011566188197767145, "loss": 0.4515, "step": 13220 }, { "epoch": 21.1, "grad_norm": 1.481789231300354, "learning_rate": 0.0011559808612440193, "loss": 0.4933, "step": 13230 }, { "epoch": 21.12, "grad_norm": 1.985859990119934, "learning_rate": 0.0011553429027113238, "loss": 0.6186, "step": 13240 }, { "epoch": 21.13, "grad_norm": 0.9559075236320496, "learning_rate": 0.0011547049441786284, "loss": 0.4367, "step": 13250 }, { "epoch": 21.15, "grad_norm": 3.953303575515747, "learning_rate": 0.001154066985645933, "loss": 0.5434, "step": 13260 }, { "epoch": 21.16, "grad_norm": 1.7408164739608765, "learning_rate": 0.0011534290271132377, "loss": 0.5025, "step": 13270 }, { "epoch": 21.18, "grad_norm": 2.5240061283111572, "learning_rate": 0.0011527910685805423, "loss": 0.5206, "step": 13280 }, { "epoch": 21.2, "grad_norm": 1.7967180013656616, "learning_rate": 0.0011521531100478469, "loss": 0.4679, "step": 13290 }, { "epoch": 21.21, "grad_norm": 1.5482749938964844, "learning_rate": 0.0011515151515151516, "loss": 0.5074, "step": 13300 }, { "epoch": 21.23, "grad_norm": 2.0703771114349365, "learning_rate": 0.0011508771929824562, "loss": 0.5012, "step": 13310 }, { "epoch": 21.24, "grad_norm": 1.4565823078155518, "learning_rate": 0.0011502392344497608, "loss": 0.5855, "step": 13320 }, { "epoch": 21.26, "grad_norm": 2.0159592628479004, "learning_rate": 0.0011496012759170653, "loss": 0.5762, "step": 13330 }, { "epoch": 21.28, "grad_norm": 1.8826504945755005, "learning_rate": 0.0011489633173843701, "loss": 0.5274, "step": 13340 }, { "epoch": 21.29, "grad_norm": 1.7150112390518188, "learning_rate": 0.0011483253588516747, "loss": 0.5659, "step": 13350 }, { "epoch": 21.31, "grad_norm": 2.8087666034698486, "learning_rate": 0.0011476874003189792, "loss": 0.4906, "step": 13360 }, { "epoch": 21.32, "grad_norm": 1.7748334407806396, "learning_rate": 0.001147049441786284, "loss": 0.4622, "step": 13370 }, { "epoch": 21.34, "grad_norm": 2.2863359451293945, "learning_rate": 0.0011464114832535886, "loss": 0.5015, "step": 13380 }, { "epoch": 21.36, "grad_norm": 2.4490015506744385, "learning_rate": 0.0011457735247208931, "loss": 0.5743, "step": 13390 }, { "epoch": 21.37, "grad_norm": 1.4806760549545288, "learning_rate": 0.0011451355661881977, "loss": 0.516, "step": 13400 }, { "epoch": 21.39, "grad_norm": 1.909926176071167, "learning_rate": 0.0011444976076555025, "loss": 0.516, "step": 13410 }, { "epoch": 21.4, "grad_norm": 2.3129677772521973, "learning_rate": 0.001143859649122807, "loss": 0.5169, "step": 13420 }, { "epoch": 21.42, "grad_norm": 2.589088201522827, "learning_rate": 0.0011432216905901116, "loss": 0.5535, "step": 13430 }, { "epoch": 21.44, "grad_norm": 2.4051127433776855, "learning_rate": 0.0011425837320574164, "loss": 0.5236, "step": 13440 }, { "epoch": 21.45, "grad_norm": 2.466587781906128, "learning_rate": 0.001141945773524721, "loss": 0.5304, "step": 13450 }, { "epoch": 21.47, "grad_norm": 1.5987040996551514, "learning_rate": 0.0011413078149920255, "loss": 0.4916, "step": 13460 }, { "epoch": 21.48, "grad_norm": 3.281262159347534, "learning_rate": 0.00114066985645933, "loss": 0.5194, "step": 13470 }, { "epoch": 21.5, "grad_norm": 2.3112425804138184, "learning_rate": 0.0011400318979266349, "loss": 0.5282, "step": 13480 }, { "epoch": 21.52, "grad_norm": 1.8901677131652832, "learning_rate": 0.0011393939393939394, "loss": 0.5697, "step": 13490 }, { "epoch": 21.53, "grad_norm": 2.8748323917388916, "learning_rate": 0.001138755980861244, "loss": 0.5219, "step": 13500 }, { "epoch": 21.55, "grad_norm": 2.162447690963745, "learning_rate": 0.0011381180223285488, "loss": 0.5469, "step": 13510 }, { "epoch": 21.56, "grad_norm": 2.3993029594421387, "learning_rate": 0.0011374800637958533, "loss": 0.5083, "step": 13520 }, { "epoch": 21.58, "grad_norm": 2.262704372406006, "learning_rate": 0.0011368421052631579, "loss": 0.5152, "step": 13530 }, { "epoch": 21.59, "grad_norm": 1.8415032625198364, "learning_rate": 0.0011362041467304624, "loss": 0.6413, "step": 13540 }, { "epoch": 21.61, "grad_norm": 1.7143352031707764, "learning_rate": 0.0011355661881977672, "loss": 0.5897, "step": 13550 }, { "epoch": 21.63, "grad_norm": 1.9199092388153076, "learning_rate": 0.0011349282296650718, "loss": 0.5259, "step": 13560 }, { "epoch": 21.64, "grad_norm": 1.6734964847564697, "learning_rate": 0.0011342902711323764, "loss": 0.5658, "step": 13570 }, { "epoch": 21.66, "grad_norm": 2.817392349243164, "learning_rate": 0.001133652312599681, "loss": 0.6579, "step": 13580 }, { "epoch": 21.67, "grad_norm": 3.3291382789611816, "learning_rate": 0.0011330143540669857, "loss": 0.603, "step": 13590 }, { "epoch": 21.69, "grad_norm": 2.5923471450805664, "learning_rate": 0.0011323763955342903, "loss": 0.659, "step": 13600 }, { "epoch": 21.71, "grad_norm": 2.979832410812378, "learning_rate": 0.0011317384370015948, "loss": 0.6129, "step": 13610 }, { "epoch": 21.72, "grad_norm": 3.666498899459839, "learning_rate": 0.0011311004784688996, "loss": 0.7636, "step": 13620 }, { "epoch": 21.74, "grad_norm": 1.8010962009429932, "learning_rate": 0.0011304625199362042, "loss": 0.5779, "step": 13630 }, { "epoch": 21.75, "grad_norm": 2.430271625518799, "learning_rate": 0.0011298245614035087, "loss": 0.5292, "step": 13640 }, { "epoch": 21.77, "grad_norm": 2.2051026821136475, "learning_rate": 0.0011291866028708133, "loss": 0.5498, "step": 13650 }, { "epoch": 21.79, "grad_norm": 3.7122042179107666, "learning_rate": 0.001128548644338118, "loss": 0.5772, "step": 13660 }, { "epoch": 21.8, "grad_norm": 2.4475326538085938, "learning_rate": 0.0011279106858054226, "loss": 0.5221, "step": 13670 }, { "epoch": 21.82, "grad_norm": 2.862783193588257, "learning_rate": 0.0011272727272727272, "loss": 0.5799, "step": 13680 }, { "epoch": 21.83, "grad_norm": 2.2433278560638428, "learning_rate": 0.001126634768740032, "loss": 0.5334, "step": 13690 }, { "epoch": 21.85, "grad_norm": 2.5554163455963135, "learning_rate": 0.0011259968102073365, "loss": 0.5254, "step": 13700 }, { "epoch": 21.87, "grad_norm": 2.6535990238189697, "learning_rate": 0.001125358851674641, "loss": 0.6575, "step": 13710 }, { "epoch": 21.88, "grad_norm": 2.348066806793213, "learning_rate": 0.0011247208931419457, "loss": 0.5326, "step": 13720 }, { "epoch": 21.9, "grad_norm": 2.7629575729370117, "learning_rate": 0.0011240829346092504, "loss": 0.5429, "step": 13730 }, { "epoch": 21.91, "grad_norm": 1.7561380863189697, "learning_rate": 0.001123444976076555, "loss": 0.612, "step": 13740 }, { "epoch": 21.93, "grad_norm": 1.6795223951339722, "learning_rate": 0.0011228070175438596, "loss": 0.5954, "step": 13750 }, { "epoch": 21.95, "grad_norm": 2.316612958908081, "learning_rate": 0.0011221690590111643, "loss": 0.6165, "step": 13760 }, { "epoch": 21.96, "grad_norm": 2.91849422454834, "learning_rate": 0.001121531100478469, "loss": 0.5407, "step": 13770 }, { "epoch": 21.98, "grad_norm": 1.6966789960861206, "learning_rate": 0.0011208931419457735, "loss": 0.5186, "step": 13780 }, { "epoch": 21.99, "grad_norm": 2.0186002254486084, "learning_rate": 0.001120255183413078, "loss": 0.6633, "step": 13790 }, { "epoch": 22.01, "grad_norm": 1.466770052909851, "learning_rate": 0.0011196172248803828, "loss": 0.4628, "step": 13800 }, { "epoch": 22.03, "grad_norm": 1.2927073240280151, "learning_rate": 0.0011189792663476874, "loss": 0.3833, "step": 13810 }, { "epoch": 22.04, "grad_norm": 1.5075204372406006, "learning_rate": 0.001118341307814992, "loss": 0.4408, "step": 13820 }, { "epoch": 22.06, "grad_norm": 1.5921709537506104, "learning_rate": 0.0011177033492822967, "loss": 0.4546, "step": 13830 }, { "epoch": 22.07, "grad_norm": 1.4669833183288574, "learning_rate": 0.0011170653907496013, "loss": 0.4423, "step": 13840 }, { "epoch": 22.09, "grad_norm": 2.846984624862671, "learning_rate": 0.0011164274322169058, "loss": 0.5419, "step": 13850 }, { "epoch": 22.11, "grad_norm": 1.1254881620407104, "learning_rate": 0.0011157894736842104, "loss": 0.4977, "step": 13860 }, { "epoch": 22.12, "grad_norm": 1.3367946147918701, "learning_rate": 0.0011151515151515152, "loss": 0.4496, "step": 13870 }, { "epoch": 22.14, "grad_norm": 1.633335828781128, "learning_rate": 0.0011145135566188197, "loss": 0.4433, "step": 13880 }, { "epoch": 22.15, "grad_norm": 2.3413655757904053, "learning_rate": 0.0011138755980861243, "loss": 0.5003, "step": 13890 }, { "epoch": 22.17, "grad_norm": 2.336428642272949, "learning_rate": 0.001113237639553429, "loss": 0.532, "step": 13900 }, { "epoch": 22.19, "grad_norm": 1.713782787322998, "learning_rate": 0.0011125996810207337, "loss": 0.4474, "step": 13910 }, { "epoch": 22.2, "grad_norm": 2.55415678024292, "learning_rate": 0.0011119617224880382, "loss": 0.4168, "step": 13920 }, { "epoch": 22.22, "grad_norm": 1.7358187437057495, "learning_rate": 0.0011113237639553428, "loss": 0.5048, "step": 13930 }, { "epoch": 22.23, "grad_norm": 1.6725515127182007, "learning_rate": 0.0011106858054226476, "loss": 0.508, "step": 13940 }, { "epoch": 22.25, "grad_norm": 1.3164896965026855, "learning_rate": 0.0011100478468899521, "loss": 0.463, "step": 13950 }, { "epoch": 22.26, "grad_norm": 1.724993348121643, "learning_rate": 0.0011094098883572567, "loss": 0.5165, "step": 13960 }, { "epoch": 22.28, "grad_norm": 3.1152279376983643, "learning_rate": 0.0011087719298245612, "loss": 0.4615, "step": 13970 }, { "epoch": 22.3, "grad_norm": 2.1127662658691406, "learning_rate": 0.001108133971291866, "loss": 0.4926, "step": 13980 }, { "epoch": 22.31, "grad_norm": 2.012160062789917, "learning_rate": 0.0011074960127591706, "loss": 0.4331, "step": 13990 }, { "epoch": 22.33, "grad_norm": 1.761988639831543, "learning_rate": 0.0011068580542264752, "loss": 0.4927, "step": 14000 }, { "epoch": 22.34, "grad_norm": 1.8735899925231934, "learning_rate": 0.00110622009569378, "loss": 0.5162, "step": 14010 }, { "epoch": 22.36, "grad_norm": 1.9117660522460938, "learning_rate": 0.0011055821371610845, "loss": 0.4583, "step": 14020 }, { "epoch": 22.38, "grad_norm": 1.598494291305542, "learning_rate": 0.001104944178628389, "loss": 0.4943, "step": 14030 }, { "epoch": 22.39, "grad_norm": 1.6611143350601196, "learning_rate": 0.0011043062200956936, "loss": 0.4889, "step": 14040 }, { "epoch": 22.41, "grad_norm": 2.4984424114227295, "learning_rate": 0.0011036682615629984, "loss": 0.4838, "step": 14050 }, { "epoch": 22.42, "grad_norm": 2.082078695297241, "learning_rate": 0.001103030303030303, "loss": 0.6166, "step": 14060 }, { "epoch": 22.44, "grad_norm": 2.6350715160369873, "learning_rate": 0.0011023923444976075, "loss": 0.5082, "step": 14070 }, { "epoch": 22.46, "grad_norm": 1.6463345289230347, "learning_rate": 0.0011017543859649123, "loss": 0.4486, "step": 14080 }, { "epoch": 22.47, "grad_norm": 2.0142619609832764, "learning_rate": 0.0011011164274322169, "loss": 0.5573, "step": 14090 }, { "epoch": 22.49, "grad_norm": 2.3120744228363037, "learning_rate": 0.0011004784688995214, "loss": 0.5478, "step": 14100 }, { "epoch": 22.5, "grad_norm": 1.7484601736068726, "learning_rate": 0.001099840510366826, "loss": 0.5557, "step": 14110 }, { "epoch": 22.52, "grad_norm": 2.2994306087493896, "learning_rate": 0.0010992025518341308, "loss": 0.507, "step": 14120 }, { "epoch": 22.54, "grad_norm": 3.1111643314361572, "learning_rate": 0.0010985645933014353, "loss": 0.783, "step": 14130 }, { "epoch": 22.55, "grad_norm": 2.3941569328308105, "learning_rate": 0.00109792663476874, "loss": 0.6618, "step": 14140 }, { "epoch": 22.57, "grad_norm": 1.893367052078247, "learning_rate": 0.0010972886762360447, "loss": 0.5318, "step": 14150 }, { "epoch": 22.58, "grad_norm": 2.1536896228790283, "learning_rate": 0.0010966507177033492, "loss": 0.5581, "step": 14160 }, { "epoch": 22.6, "grad_norm": 2.7636032104492188, "learning_rate": 0.0010960127591706538, "loss": 0.5706, "step": 14170 }, { "epoch": 22.62, "grad_norm": 2.516028642654419, "learning_rate": 0.0010953748006379584, "loss": 0.5557, "step": 14180 }, { "epoch": 22.63, "grad_norm": 1.5299115180969238, "learning_rate": 0.0010947368421052634, "loss": 0.5118, "step": 14190 }, { "epoch": 22.65, "grad_norm": 2.1962053775787354, "learning_rate": 0.001094098883572568, "loss": 0.5678, "step": 14200 }, { "epoch": 22.66, "grad_norm": 4.639540195465088, "learning_rate": 0.0010934609250398725, "loss": 0.4226, "step": 14210 }, { "epoch": 22.68, "grad_norm": 3.8349008560180664, "learning_rate": 0.0010928229665071773, "loss": 0.5659, "step": 14220 }, { "epoch": 22.7, "grad_norm": 2.3924553394317627, "learning_rate": 0.0010921850079744818, "loss": 0.5681, "step": 14230 }, { "epoch": 22.71, "grad_norm": 3.5269014835357666, "learning_rate": 0.0010915470494417864, "loss": 0.5181, "step": 14240 }, { "epoch": 22.73, "grad_norm": 2.35038685798645, "learning_rate": 0.001090909090909091, "loss": 0.4825, "step": 14250 }, { "epoch": 22.74, "grad_norm": 2.1526710987091064, "learning_rate": 0.0010902711323763957, "loss": 0.5347, "step": 14260 }, { "epoch": 22.76, "grad_norm": 2.3087081909179688, "learning_rate": 0.0010896331738437003, "loss": 0.6681, "step": 14270 }, { "epoch": 22.78, "grad_norm": 1.9781696796417236, "learning_rate": 0.0010889952153110049, "loss": 0.5765, "step": 14280 }, { "epoch": 22.79, "grad_norm": 2.716538429260254, "learning_rate": 0.0010883572567783094, "loss": 0.6403, "step": 14290 }, { "epoch": 22.81, "grad_norm": 2.0449490547180176, "learning_rate": 0.0010877192982456142, "loss": 0.4586, "step": 14300 }, { "epoch": 22.82, "grad_norm": 2.0720322132110596, "learning_rate": 0.0010870813397129188, "loss": 0.597, "step": 14310 }, { "epoch": 22.84, "grad_norm": 1.6174436807632446, "learning_rate": 0.0010864433811802233, "loss": 0.4503, "step": 14320 }, { "epoch": 22.85, "grad_norm": 2.284149169921875, "learning_rate": 0.001085805422647528, "loss": 0.554, "step": 14330 }, { "epoch": 22.87, "grad_norm": 1.6513159275054932, "learning_rate": 0.0010851674641148327, "loss": 0.4929, "step": 14340 }, { "epoch": 22.89, "grad_norm": 3.105323076248169, "learning_rate": 0.0010845295055821372, "loss": 0.601, "step": 14350 }, { "epoch": 22.9, "grad_norm": 1.6782584190368652, "learning_rate": 0.0010838915470494418, "loss": 0.5175, "step": 14360 }, { "epoch": 22.92, "grad_norm": 2.065708875656128, "learning_rate": 0.0010832535885167466, "loss": 0.5765, "step": 14370 }, { "epoch": 22.93, "grad_norm": 3.1577556133270264, "learning_rate": 0.0010826156299840511, "loss": 0.6024, "step": 14380 }, { "epoch": 22.95, "grad_norm": 3.8669426441192627, "learning_rate": 0.0010819776714513557, "loss": 0.5703, "step": 14390 }, { "epoch": 22.97, "grad_norm": 2.084577798843384, "learning_rate": 0.0010813397129186605, "loss": 0.5736, "step": 14400 }, { "epoch": 22.98, "grad_norm": 2.3322348594665527, "learning_rate": 0.001080701754385965, "loss": 0.4955, "step": 14410 }, { "epoch": 23.0, "grad_norm": 2.981834650039673, "learning_rate": 0.0010800637958532696, "loss": 0.5941, "step": 14420 }, { "epoch": 23.01, "grad_norm": 1.649495244026184, "learning_rate": 0.0010794258373205742, "loss": 0.4623, "step": 14430 }, { "epoch": 23.03, "grad_norm": 2.4361202716827393, "learning_rate": 0.001078787878787879, "loss": 0.5178, "step": 14440 }, { "epoch": 23.05, "grad_norm": 1.9195847511291504, "learning_rate": 0.0010781499202551835, "loss": 0.5056, "step": 14450 }, { "epoch": 23.06, "grad_norm": 1.472584843635559, "learning_rate": 0.001077511961722488, "loss": 0.422, "step": 14460 }, { "epoch": 23.08, "grad_norm": 1.9220826625823975, "learning_rate": 0.0010768740031897928, "loss": 0.4461, "step": 14470 }, { "epoch": 23.09, "grad_norm": 2.0163981914520264, "learning_rate": 0.0010762360446570974, "loss": 0.405, "step": 14480 }, { "epoch": 23.11, "grad_norm": 2.0835061073303223, "learning_rate": 0.001075598086124402, "loss": 0.5177, "step": 14490 }, { "epoch": 23.13, "grad_norm": 0.9891412258148193, "learning_rate": 0.0010749601275917065, "loss": 0.5585, "step": 14500 }, { "epoch": 23.14, "grad_norm": 1.3112674951553345, "learning_rate": 0.0010743221690590113, "loss": 0.5358, "step": 14510 }, { "epoch": 23.16, "grad_norm": 1.358392357826233, "learning_rate": 0.0010736842105263159, "loss": 0.3928, "step": 14520 }, { "epoch": 23.17, "grad_norm": 1.7104527950286865, "learning_rate": 0.0010730462519936204, "loss": 0.4469, "step": 14530 }, { "epoch": 23.19, "grad_norm": 2.007497787475586, "learning_rate": 0.0010724082934609252, "loss": 0.4123, "step": 14540 }, { "epoch": 23.21, "grad_norm": 2.2213757038116455, "learning_rate": 0.0010717703349282298, "loss": 0.4139, "step": 14550 }, { "epoch": 23.22, "grad_norm": 1.6128385066986084, "learning_rate": 0.0010711323763955343, "loss": 0.4715, "step": 14560 }, { "epoch": 23.24, "grad_norm": 1.6998387575149536, "learning_rate": 0.001070494417862839, "loss": 0.433, "step": 14570 }, { "epoch": 23.25, "grad_norm": 1.7560913562774658, "learning_rate": 0.0010698564593301437, "loss": 0.5458, "step": 14580 }, { "epoch": 23.27, "grad_norm": 1.0924944877624512, "learning_rate": 0.0010692185007974483, "loss": 0.4552, "step": 14590 }, { "epoch": 23.29, "grad_norm": 1.2721997499465942, "learning_rate": 0.0010685805422647528, "loss": 0.4142, "step": 14600 }, { "epoch": 23.3, "grad_norm": 1.5277657508850098, "learning_rate": 0.0010679425837320576, "loss": 0.4749, "step": 14610 }, { "epoch": 23.32, "grad_norm": 1.4912691116333008, "learning_rate": 0.0010673046251993622, "loss": 0.4994, "step": 14620 }, { "epoch": 23.33, "grad_norm": 2.7884340286254883, "learning_rate": 0.0010666666666666667, "loss": 0.4654, "step": 14630 }, { "epoch": 23.35, "grad_norm": 3.288153886795044, "learning_rate": 0.0010660287081339713, "loss": 0.4726, "step": 14640 }, { "epoch": 23.37, "grad_norm": 1.869439721107483, "learning_rate": 0.001065390749601276, "loss": 0.5029, "step": 14650 }, { "epoch": 23.38, "grad_norm": 1.9574953317642212, "learning_rate": 0.0010647527910685806, "loss": 0.5279, "step": 14660 }, { "epoch": 23.4, "grad_norm": 3.001887321472168, "learning_rate": 0.0010641148325358852, "loss": 0.5283, "step": 14670 }, { "epoch": 23.41, "grad_norm": 3.0924551486968994, "learning_rate": 0.0010634768740031897, "loss": 0.4474, "step": 14680 }, { "epoch": 23.43, "grad_norm": 1.803222894668579, "learning_rate": 0.0010628389154704945, "loss": 0.4873, "step": 14690 }, { "epoch": 23.44, "grad_norm": 2.564887762069702, "learning_rate": 0.001062200956937799, "loss": 0.4802, "step": 14700 }, { "epoch": 23.46, "grad_norm": 2.3837051391601562, "learning_rate": 0.0010615629984051037, "loss": 0.5515, "step": 14710 }, { "epoch": 23.48, "grad_norm": 2.775334358215332, "learning_rate": 0.0010609250398724084, "loss": 0.5124, "step": 14720 }, { "epoch": 23.49, "grad_norm": 2.805455207824707, "learning_rate": 0.001060287081339713, "loss": 0.5103, "step": 14730 }, { "epoch": 23.51, "grad_norm": 4.685495376586914, "learning_rate": 0.0010596491228070176, "loss": 0.5481, "step": 14740 }, { "epoch": 23.52, "grad_norm": 1.6772174835205078, "learning_rate": 0.0010590111642743221, "loss": 0.5023, "step": 14750 }, { "epoch": 23.54, "grad_norm": 3.1417901515960693, "learning_rate": 0.001058373205741627, "loss": 0.4909, "step": 14760 }, { "epoch": 23.56, "grad_norm": 2.6341207027435303, "learning_rate": 0.0010577352472089315, "loss": 0.577, "step": 14770 }, { "epoch": 23.57, "grad_norm": 1.981137990951538, "learning_rate": 0.001057097288676236, "loss": 0.527, "step": 14780 }, { "epoch": 23.59, "grad_norm": 1.8690191507339478, "learning_rate": 0.0010564593301435408, "loss": 0.6524, "step": 14790 }, { "epoch": 23.6, "grad_norm": 2.470585584640503, "learning_rate": 0.0010558213716108454, "loss": 0.5171, "step": 14800 }, { "epoch": 23.62, "grad_norm": 1.5225473642349243, "learning_rate": 0.00105518341307815, "loss": 0.574, "step": 14810 }, { "epoch": 23.64, "grad_norm": 1.244357705116272, "learning_rate": 0.0010545454545454545, "loss": 0.513, "step": 14820 }, { "epoch": 23.65, "grad_norm": 3.7984049320220947, "learning_rate": 0.0010539074960127593, "loss": 0.4659, "step": 14830 }, { "epoch": 23.67, "grad_norm": 2.2695350646972656, "learning_rate": 0.0010532695374800638, "loss": 0.5469, "step": 14840 }, { "epoch": 23.68, "grad_norm": 2.1727049350738525, "learning_rate": 0.0010526315789473684, "loss": 0.4998, "step": 14850 }, { "epoch": 23.7, "grad_norm": 2.2124183177948, "learning_rate": 0.0010519936204146732, "loss": 0.4202, "step": 14860 }, { "epoch": 23.72, "grad_norm": 1.9910480976104736, "learning_rate": 0.0010513556618819777, "loss": 0.4944, "step": 14870 }, { "epoch": 23.73, "grad_norm": 2.623316526412964, "learning_rate": 0.0010507177033492823, "loss": 0.5959, "step": 14880 }, { "epoch": 23.75, "grad_norm": 3.7587718963623047, "learning_rate": 0.0010500797448165869, "loss": 0.5584, "step": 14890 }, { "epoch": 23.76, "grad_norm": 2.0342280864715576, "learning_rate": 0.0010494417862838916, "loss": 0.5426, "step": 14900 }, { "epoch": 23.78, "grad_norm": 2.8675320148468018, "learning_rate": 0.0010488038277511962, "loss": 0.6461, "step": 14910 }, { "epoch": 23.8, "grad_norm": 2.026543617248535, "learning_rate": 0.0010481658692185008, "loss": 0.5184, "step": 14920 }, { "epoch": 23.81, "grad_norm": 2.560939073562622, "learning_rate": 0.0010475279106858055, "loss": 0.5449, "step": 14930 }, { "epoch": 23.83, "grad_norm": 2.085392951965332, "learning_rate": 0.0010468899521531101, "loss": 0.5984, "step": 14940 }, { "epoch": 23.84, "grad_norm": 2.2556986808776855, "learning_rate": 0.0010462519936204147, "loss": 0.4763, "step": 14950 }, { "epoch": 23.86, "grad_norm": 1.4370797872543335, "learning_rate": 0.0010456140350877192, "loss": 0.5311, "step": 14960 }, { "epoch": 23.88, "grad_norm": 1.252243161201477, "learning_rate": 0.001044976076555024, "loss": 0.4934, "step": 14970 }, { "epoch": 23.89, "grad_norm": 2.0001296997070312, "learning_rate": 0.0010443381180223286, "loss": 0.4836, "step": 14980 }, { "epoch": 23.91, "grad_norm": 2.259216070175171, "learning_rate": 0.0010437001594896331, "loss": 0.5749, "step": 14990 }, { "epoch": 23.92, "grad_norm": 1.5871505737304688, "learning_rate": 0.0010430622009569377, "loss": 0.5055, "step": 15000 }, { "epoch": 23.94, "grad_norm": 3.0217132568359375, "learning_rate": 0.0010424242424242425, "loss": 0.4674, "step": 15010 }, { "epoch": 23.96, "grad_norm": 2.425215482711792, "learning_rate": 0.001041786283891547, "loss": 0.4853, "step": 15020 }, { "epoch": 23.97, "grad_norm": 2.7950572967529297, "learning_rate": 0.0010411483253588516, "loss": 0.5244, "step": 15030 }, { "epoch": 23.99, "grad_norm": 1.8970431089401245, "learning_rate": 0.0010405103668261564, "loss": 0.538, "step": 15040 }, { "epoch": 24.0, "grad_norm": 0.7786374688148499, "learning_rate": 0.001039872408293461, "loss": 0.4562, "step": 15050 }, { "epoch": 24.02, "grad_norm": 1.385309100151062, "learning_rate": 0.0010392344497607655, "loss": 0.4353, "step": 15060 }, { "epoch": 24.04, "grad_norm": 3.768200397491455, "learning_rate": 0.00103859649122807, "loss": 0.4483, "step": 15070 }, { "epoch": 24.05, "grad_norm": 3.530329704284668, "learning_rate": 0.0010379585326953749, "loss": 0.4374, "step": 15080 }, { "epoch": 24.07, "grad_norm": 1.2706865072250366, "learning_rate": 0.0010373205741626794, "loss": 0.3817, "step": 15090 }, { "epoch": 24.08, "grad_norm": 0.9349244832992554, "learning_rate": 0.001036682615629984, "loss": 0.5057, "step": 15100 }, { "epoch": 24.1, "grad_norm": 3.0068447589874268, "learning_rate": 0.0010360446570972888, "loss": 0.4718, "step": 15110 }, { "epoch": 24.11, "grad_norm": 2.423353672027588, "learning_rate": 0.0010354066985645933, "loss": 0.4668, "step": 15120 }, { "epoch": 24.13, "grad_norm": 1.5053311586380005, "learning_rate": 0.0010347687400318979, "loss": 0.4525, "step": 15130 }, { "epoch": 24.15, "grad_norm": 1.660056710243225, "learning_rate": 0.0010341307814992024, "loss": 0.4734, "step": 15140 }, { "epoch": 24.16, "grad_norm": 1.5876003503799438, "learning_rate": 0.0010334928229665072, "loss": 0.3713, "step": 15150 }, { "epoch": 24.18, "grad_norm": 1.1910775899887085, "learning_rate": 0.0010328548644338118, "loss": 0.4542, "step": 15160 }, { "epoch": 24.19, "grad_norm": 2.1305978298187256, "learning_rate": 0.0010322169059011164, "loss": 0.4496, "step": 15170 }, { "epoch": 24.21, "grad_norm": 1.9429091215133667, "learning_rate": 0.0010315789473684211, "loss": 0.4965, "step": 15180 }, { "epoch": 24.23, "grad_norm": 1.6149272918701172, "learning_rate": 0.0010309409888357257, "loss": 0.4369, "step": 15190 }, { "epoch": 24.24, "grad_norm": 1.5995999574661255, "learning_rate": 0.0010303030303030303, "loss": 0.4579, "step": 15200 }, { "epoch": 24.26, "grad_norm": 1.6771583557128906, "learning_rate": 0.0010296650717703348, "loss": 0.4289, "step": 15210 }, { "epoch": 24.27, "grad_norm": 2.6575920581817627, "learning_rate": 0.0010290271132376396, "loss": 0.5622, "step": 15220 }, { "epoch": 24.29, "grad_norm": 1.5423036813735962, "learning_rate": 0.0010283891547049442, "loss": 0.4558, "step": 15230 }, { "epoch": 24.31, "grad_norm": 1.731204628944397, "learning_rate": 0.0010277511961722487, "loss": 0.4732, "step": 15240 }, { "epoch": 24.32, "grad_norm": 2.5990333557128906, "learning_rate": 0.0010271132376395535, "loss": 0.5179, "step": 15250 }, { "epoch": 24.34, "grad_norm": 1.5724194049835205, "learning_rate": 0.001026475279106858, "loss": 0.4896, "step": 15260 }, { "epoch": 24.35, "grad_norm": 3.0556674003601074, "learning_rate": 0.0010258373205741626, "loss": 0.522, "step": 15270 }, { "epoch": 24.37, "grad_norm": 2.603013515472412, "learning_rate": 0.0010251993620414672, "loss": 0.4626, "step": 15280 }, { "epoch": 24.39, "grad_norm": 1.3041783571243286, "learning_rate": 0.001024561403508772, "loss": 0.3713, "step": 15290 }, { "epoch": 24.4, "grad_norm": 1.5249779224395752, "learning_rate": 0.0010239234449760765, "loss": 0.4899, "step": 15300 }, { "epoch": 24.42, "grad_norm": 1.814285159111023, "learning_rate": 0.001023285486443381, "loss": 0.534, "step": 15310 }, { "epoch": 24.43, "grad_norm": 2.138099431991577, "learning_rate": 0.0010226475279106859, "loss": 0.4634, "step": 15320 }, { "epoch": 24.45, "grad_norm": 1.3936606645584106, "learning_rate": 0.0010220095693779904, "loss": 0.4094, "step": 15330 }, { "epoch": 24.47, "grad_norm": 1.609049677848816, "learning_rate": 0.001021371610845295, "loss": 0.4397, "step": 15340 }, { "epoch": 24.48, "grad_norm": 1.38874351978302, "learning_rate": 0.0010207336523125996, "loss": 0.5197, "step": 15350 }, { "epoch": 24.5, "grad_norm": 2.1596977710723877, "learning_rate": 0.0010200956937799043, "loss": 0.4791, "step": 15360 }, { "epoch": 24.51, "grad_norm": 1.4566435813903809, "learning_rate": 0.001019457735247209, "loss": 0.4453, "step": 15370 }, { "epoch": 24.53, "grad_norm": 1.784945011138916, "learning_rate": 0.0010188197767145135, "loss": 0.4974, "step": 15380 }, { "epoch": 24.55, "grad_norm": 1.9153186082839966, "learning_rate": 0.001018181818181818, "loss": 0.4708, "step": 15390 }, { "epoch": 24.56, "grad_norm": 1.7097647190093994, "learning_rate": 0.0010175438596491228, "loss": 0.4766, "step": 15400 }, { "epoch": 24.58, "grad_norm": 1.6198031902313232, "learning_rate": 0.0010169059011164274, "loss": 0.4857, "step": 15410 }, { "epoch": 24.59, "grad_norm": 2.2390496730804443, "learning_rate": 0.001016267942583732, "loss": 0.5341, "step": 15420 }, { "epoch": 24.61, "grad_norm": 2.7094318866729736, "learning_rate": 0.0010156299840510367, "loss": 0.5316, "step": 15430 }, { "epoch": 24.63, "grad_norm": 1.7831966876983643, "learning_rate": 0.0010149920255183413, "loss": 0.5638, "step": 15440 }, { "epoch": 24.64, "grad_norm": 1.7682468891143799, "learning_rate": 0.0010143540669856458, "loss": 0.4232, "step": 15450 }, { "epoch": 24.66, "grad_norm": 3.382634401321411, "learning_rate": 0.0010137161084529504, "loss": 0.5211, "step": 15460 }, { "epoch": 24.67, "grad_norm": 1.6117042303085327, "learning_rate": 0.0010130781499202552, "loss": 0.5442, "step": 15470 }, { "epoch": 24.69, "grad_norm": 2.2903084754943848, "learning_rate": 0.0010124401913875597, "loss": 0.5145, "step": 15480 }, { "epoch": 24.7, "grad_norm": 1.3082456588745117, "learning_rate": 0.0010118022328548643, "loss": 0.5143, "step": 15490 }, { "epoch": 24.72, "grad_norm": 1.9928056001663208, "learning_rate": 0.001011164274322169, "loss": 0.5222, "step": 15500 }, { "epoch": 24.74, "grad_norm": 1.8907785415649414, "learning_rate": 0.0010105263157894737, "loss": 0.4507, "step": 15510 }, { "epoch": 24.75, "grad_norm": 1.9465861320495605, "learning_rate": 0.0010098883572567782, "loss": 0.4636, "step": 15520 }, { "epoch": 24.77, "grad_norm": 1.3651535511016846, "learning_rate": 0.0010092503987240828, "loss": 0.4484, "step": 15530 }, { "epoch": 24.78, "grad_norm": 1.8174107074737549, "learning_rate": 0.0010086124401913876, "loss": 0.4479, "step": 15540 }, { "epoch": 24.8, "grad_norm": 1.7005228996276855, "learning_rate": 0.0010079744816586921, "loss": 0.5416, "step": 15550 }, { "epoch": 24.82, "grad_norm": 1.941279649734497, "learning_rate": 0.0010073365231259967, "loss": 0.5404, "step": 15560 }, { "epoch": 24.83, "grad_norm": 2.1660587787628174, "learning_rate": 0.0010066985645933015, "loss": 0.5335, "step": 15570 }, { "epoch": 24.85, "grad_norm": 2.4644267559051514, "learning_rate": 0.001006060606060606, "loss": 0.6635, "step": 15580 }, { "epoch": 24.86, "grad_norm": 1.596439242362976, "learning_rate": 0.0010054226475279106, "loss": 0.4986, "step": 15590 }, { "epoch": 24.88, "grad_norm": 1.4329978227615356, "learning_rate": 0.0010047846889952152, "loss": 0.4721, "step": 15600 }, { "epoch": 24.9, "grad_norm": 1.454533338546753, "learning_rate": 0.00100414673046252, "loss": 0.4796, "step": 15610 }, { "epoch": 24.91, "grad_norm": 2.5839779376983643, "learning_rate": 0.0010035087719298245, "loss": 0.5149, "step": 15620 }, { "epoch": 24.93, "grad_norm": 2.21061110496521, "learning_rate": 0.001002870813397129, "loss": 0.5191, "step": 15630 }, { "epoch": 24.94, "grad_norm": 1.6350433826446533, "learning_rate": 0.0010022328548644338, "loss": 0.4748, "step": 15640 }, { "epoch": 24.96, "grad_norm": 1.638689637184143, "learning_rate": 0.0010015948963317384, "loss": 0.4639, "step": 15650 }, { "epoch": 24.98, "grad_norm": 1.925967812538147, "learning_rate": 0.001000956937799043, "loss": 0.6168, "step": 15660 }, { "epoch": 24.99, "grad_norm": 1.7674167156219482, "learning_rate": 0.0010003189792663475, "loss": 0.5219, "step": 15670 }, { "epoch": 25.01, "grad_norm": 0.9835655689239502, "learning_rate": 0.0009996810207336523, "loss": 0.391, "step": 15680 }, { "epoch": 25.02, "grad_norm": 1.7107539176940918, "learning_rate": 0.000999043062200957, "loss": 0.3556, "step": 15690 }, { "epoch": 25.04, "grad_norm": 2.7259128093719482, "learning_rate": 0.0009984051036682616, "loss": 0.5067, "step": 15700 }, { "epoch": 25.06, "grad_norm": 1.4780336618423462, "learning_rate": 0.0009977671451355662, "loss": 0.4045, "step": 15710 }, { "epoch": 25.07, "grad_norm": 1.699403166770935, "learning_rate": 0.000997129186602871, "loss": 0.4629, "step": 15720 }, { "epoch": 25.09, "grad_norm": 2.0610368251800537, "learning_rate": 0.0009964912280701755, "loss": 0.4457, "step": 15730 }, { "epoch": 25.1, "grad_norm": 1.1959340572357178, "learning_rate": 0.0009958532695374801, "loss": 0.435, "step": 15740 }, { "epoch": 25.12, "grad_norm": 2.4365720748901367, "learning_rate": 0.0009952153110047847, "loss": 0.444, "step": 15750 }, { "epoch": 25.14, "grad_norm": 1.2574375867843628, "learning_rate": 0.0009945773524720895, "loss": 0.4484, "step": 15760 }, { "epoch": 25.15, "grad_norm": 3.0744266510009766, "learning_rate": 0.000993939393939394, "loss": 0.5128, "step": 15770 }, { "epoch": 25.17, "grad_norm": 1.3673443794250488, "learning_rate": 0.0009933014354066986, "loss": 0.528, "step": 15780 }, { "epoch": 25.18, "grad_norm": 1.0166288614273071, "learning_rate": 0.0009926634768740031, "loss": 0.4576, "step": 15790 }, { "epoch": 25.2, "grad_norm": 2.5745012760162354, "learning_rate": 0.000992025518341308, "loss": 0.4262, "step": 15800 }, { "epoch": 25.22, "grad_norm": 1.265143871307373, "learning_rate": 0.0009913875598086125, "loss": 0.4196, "step": 15810 }, { "epoch": 25.23, "grad_norm": 2.3100552558898926, "learning_rate": 0.000990749601275917, "loss": 0.5067, "step": 15820 }, { "epoch": 25.25, "grad_norm": 1.1458524465560913, "learning_rate": 0.0009901116427432218, "loss": 0.4197, "step": 15830 }, { "epoch": 25.26, "grad_norm": 1.4825867414474487, "learning_rate": 0.0009894736842105264, "loss": 0.4208, "step": 15840 }, { "epoch": 25.28, "grad_norm": 2.505919933319092, "learning_rate": 0.000988835725677831, "loss": 0.5082, "step": 15850 }, { "epoch": 25.3, "grad_norm": 1.547998070716858, "learning_rate": 0.0009881977671451355, "loss": 0.4517, "step": 15860 }, { "epoch": 25.31, "grad_norm": 1.6311086416244507, "learning_rate": 0.0009875598086124403, "loss": 0.4928, "step": 15870 }, { "epoch": 25.33, "grad_norm": 1.7544368505477905, "learning_rate": 0.0009869218500797449, "loss": 0.4175, "step": 15880 }, { "epoch": 25.34, "grad_norm": 1.1133722066879272, "learning_rate": 0.0009862838915470494, "loss": 0.4709, "step": 15890 }, { "epoch": 25.36, "grad_norm": 1.8425043821334839, "learning_rate": 0.0009856459330143542, "loss": 0.425, "step": 15900 }, { "epoch": 25.37, "grad_norm": 1.6408649682998657, "learning_rate": 0.0009850079744816588, "loss": 0.4856, "step": 15910 }, { "epoch": 25.39, "grad_norm": 2.6448709964752197, "learning_rate": 0.0009843700159489633, "loss": 0.5066, "step": 15920 }, { "epoch": 25.41, "grad_norm": 3.6012330055236816, "learning_rate": 0.0009837320574162679, "loss": 0.4534, "step": 15930 }, { "epoch": 25.42, "grad_norm": 3.10849666595459, "learning_rate": 0.0009830940988835727, "loss": 0.458, "step": 15940 }, { "epoch": 25.44, "grad_norm": 1.3097262382507324, "learning_rate": 0.0009824561403508772, "loss": 0.4695, "step": 15950 }, { "epoch": 25.45, "grad_norm": 1.4666467905044556, "learning_rate": 0.0009818181818181818, "loss": 0.4955, "step": 15960 }, { "epoch": 25.47, "grad_norm": 2.279972791671753, "learning_rate": 0.0009811802232854866, "loss": 0.4531, "step": 15970 }, { "epoch": 25.49, "grad_norm": 1.8388824462890625, "learning_rate": 0.0009805422647527911, "loss": 0.5209, "step": 15980 }, { "epoch": 25.5, "grad_norm": 1.2906782627105713, "learning_rate": 0.0009799043062200957, "loss": 0.5712, "step": 15990 }, { "epoch": 25.52, "grad_norm": 1.1561537981033325, "learning_rate": 0.0009792663476874003, "loss": 0.4498, "step": 16000 }, { "epoch": 25.53, "grad_norm": 0.9394503831863403, "learning_rate": 0.000978628389154705, "loss": 0.4817, "step": 16010 }, { "epoch": 25.55, "grad_norm": 1.3297114372253418, "learning_rate": 0.0009779904306220096, "loss": 0.4238, "step": 16020 }, { "epoch": 25.57, "grad_norm": 1.832533597946167, "learning_rate": 0.0009773524720893142, "loss": 0.4492, "step": 16030 }, { "epoch": 25.58, "grad_norm": 1.8517677783966064, "learning_rate": 0.000976714513556619, "loss": 0.4209, "step": 16040 }, { "epoch": 25.6, "grad_norm": 1.57618248462677, "learning_rate": 0.0009760765550239234, "loss": 0.4237, "step": 16050 }, { "epoch": 25.61, "grad_norm": 2.6795618534088135, "learning_rate": 0.0009754385964912282, "loss": 0.4727, "step": 16060 }, { "epoch": 25.63, "grad_norm": 1.7043702602386475, "learning_rate": 0.0009748006379585327, "loss": 0.5071, "step": 16070 }, { "epoch": 25.65, "grad_norm": 2.142303228378296, "learning_rate": 0.0009741626794258374, "loss": 0.5109, "step": 16080 }, { "epoch": 25.66, "grad_norm": 1.5886117219924927, "learning_rate": 0.000973524720893142, "loss": 0.4755, "step": 16090 }, { "epoch": 25.68, "grad_norm": 1.767467975616455, "learning_rate": 0.0009728867623604466, "loss": 0.455, "step": 16100 }, { "epoch": 25.69, "grad_norm": 1.6067595481872559, "learning_rate": 0.0009722488038277513, "loss": 0.4701, "step": 16110 }, { "epoch": 25.71, "grad_norm": 1.5716664791107178, "learning_rate": 0.0009716108452950559, "loss": 0.4849, "step": 16120 }, { "epoch": 25.73, "grad_norm": 1.2191548347473145, "learning_rate": 0.0009709728867623605, "loss": 0.4129, "step": 16130 }, { "epoch": 25.74, "grad_norm": 1.7351710796356201, "learning_rate": 0.0009703349282296651, "loss": 0.5078, "step": 16140 }, { "epoch": 25.76, "grad_norm": 1.1435052156448364, "learning_rate": 0.0009696969696969698, "loss": 0.4075, "step": 16150 }, { "epoch": 25.77, "grad_norm": 2.094747304916382, "learning_rate": 0.0009690590111642743, "loss": 0.3737, "step": 16160 }, { "epoch": 25.79, "grad_norm": 2.186330556869507, "learning_rate": 0.000968421052631579, "loss": 0.4566, "step": 16170 }, { "epoch": 25.81, "grad_norm": 2.0006825923919678, "learning_rate": 0.0009677830940988836, "loss": 0.458, "step": 16180 }, { "epoch": 25.82, "grad_norm": 1.7449229955673218, "learning_rate": 0.0009671451355661883, "loss": 0.4487, "step": 16190 }, { "epoch": 25.84, "grad_norm": 1.6336495876312256, "learning_rate": 0.0009665071770334929, "loss": 0.4743, "step": 16200 }, { "epoch": 25.85, "grad_norm": 2.408162832260132, "learning_rate": 0.0009658692185007975, "loss": 0.5951, "step": 16210 }, { "epoch": 25.87, "grad_norm": 1.5623067617416382, "learning_rate": 0.0009652312599681022, "loss": 0.4353, "step": 16220 }, { "epoch": 25.89, "grad_norm": 1.187019944190979, "learning_rate": 0.0009645933014354067, "loss": 0.4622, "step": 16230 }, { "epoch": 25.9, "grad_norm": 1.6125158071517944, "learning_rate": 0.0009639553429027114, "loss": 0.4263, "step": 16240 }, { "epoch": 25.92, "grad_norm": 2.758575677871704, "learning_rate": 0.000963317384370016, "loss": 0.4074, "step": 16250 }, { "epoch": 25.93, "grad_norm": 1.477206826210022, "learning_rate": 0.0009626794258373206, "loss": 0.5476, "step": 16260 }, { "epoch": 25.95, "grad_norm": 2.584649085998535, "learning_rate": 0.0009620414673046253, "loss": 0.4626, "step": 16270 }, { "epoch": 25.96, "grad_norm": 1.4972593784332275, "learning_rate": 0.0009614035087719299, "loss": 0.4884, "step": 16280 }, { "epoch": 25.98, "grad_norm": 1.7186070680618286, "learning_rate": 0.0009607655502392345, "loss": 0.4263, "step": 16290 }, { "epoch": 26.0, "grad_norm": 2.3209738731384277, "learning_rate": 0.0009601275917065391, "loss": 0.5385, "step": 16300 }, { "epoch": 26.01, "grad_norm": 1.8050909042358398, "learning_rate": 0.0009594896331738438, "loss": 0.4111, "step": 16310 }, { "epoch": 26.03, "grad_norm": 1.729257345199585, "learning_rate": 0.0009588516746411483, "loss": 0.3666, "step": 16320 }, { "epoch": 26.04, "grad_norm": 2.2084038257598877, "learning_rate": 0.000958213716108453, "loss": 0.3862, "step": 16330 }, { "epoch": 26.06, "grad_norm": 1.1707019805908203, "learning_rate": 0.0009575757575757576, "loss": 0.4629, "step": 16340 }, { "epoch": 26.08, "grad_norm": 3.4062771797180176, "learning_rate": 0.0009569377990430622, "loss": 0.3774, "step": 16350 }, { "epoch": 26.09, "grad_norm": 1.5490500926971436, "learning_rate": 0.0009562998405103669, "loss": 0.4169, "step": 16360 }, { "epoch": 26.11, "grad_norm": 1.3803966045379639, "learning_rate": 0.0009556618819776715, "loss": 0.4544, "step": 16370 }, { "epoch": 26.12, "grad_norm": 1.512718915939331, "learning_rate": 0.0009550239234449761, "loss": 0.4163, "step": 16380 }, { "epoch": 26.14, "grad_norm": 1.462695837020874, "learning_rate": 0.0009543859649122807, "loss": 0.4556, "step": 16390 }, { "epoch": 26.16, "grad_norm": 1.237164855003357, "learning_rate": 0.0009537480063795854, "loss": 0.3972, "step": 16400 }, { "epoch": 26.17, "grad_norm": 1.3175599575042725, "learning_rate": 0.0009531100478468899, "loss": 0.4673, "step": 16410 }, { "epoch": 26.19, "grad_norm": 1.3138153553009033, "learning_rate": 0.0009524720893141946, "loss": 0.3655, "step": 16420 }, { "epoch": 26.2, "grad_norm": 1.4665474891662598, "learning_rate": 0.0009518341307814993, "loss": 0.4013, "step": 16430 }, { "epoch": 26.22, "grad_norm": 1.0699955224990845, "learning_rate": 0.0009511961722488038, "loss": 0.4483, "step": 16440 }, { "epoch": 26.24, "grad_norm": 1.7993961572647095, "learning_rate": 0.0009505582137161085, "loss": 0.5006, "step": 16450 }, { "epoch": 26.25, "grad_norm": 2.075788736343384, "learning_rate": 0.0009499202551834131, "loss": 0.4183, "step": 16460 }, { "epoch": 26.27, "grad_norm": 1.428053379058838, "learning_rate": 0.0009492822966507177, "loss": 0.4499, "step": 16470 }, { "epoch": 26.28, "grad_norm": 0.9592264294624329, "learning_rate": 0.0009486443381180223, "loss": 0.4191, "step": 16480 }, { "epoch": 26.3, "grad_norm": 1.708006739616394, "learning_rate": 0.000948006379585327, "loss": 0.4259, "step": 16490 }, { "epoch": 26.32, "grad_norm": 2.1585805416107178, "learning_rate": 0.0009473684210526315, "loss": 0.5309, "step": 16500 }, { "epoch": 26.33, "grad_norm": 1.7619798183441162, "learning_rate": 0.0009467304625199362, "loss": 0.4947, "step": 16510 }, { "epoch": 26.35, "grad_norm": 2.408426523208618, "learning_rate": 0.0009460925039872409, "loss": 0.4576, "step": 16520 }, { "epoch": 26.36, "grad_norm": 1.5698516368865967, "learning_rate": 0.0009454545454545454, "loss": 0.4161, "step": 16530 }, { "epoch": 26.38, "grad_norm": 3.033655881881714, "learning_rate": 0.0009448165869218501, "loss": 0.5235, "step": 16540 }, { "epoch": 26.4, "grad_norm": 2.2422995567321777, "learning_rate": 0.0009441786283891547, "loss": 0.5035, "step": 16550 }, { "epoch": 26.41, "grad_norm": 2.3441011905670166, "learning_rate": 0.0009435406698564593, "loss": 0.4001, "step": 16560 }, { "epoch": 26.43, "grad_norm": 1.529283881187439, "learning_rate": 0.0009429027113237639, "loss": 0.3795, "step": 16570 }, { "epoch": 26.44, "grad_norm": 1.2047476768493652, "learning_rate": 0.0009422647527910686, "loss": 0.4504, "step": 16580 }, { "epoch": 26.46, "grad_norm": 2.407144069671631, "learning_rate": 0.0009416267942583733, "loss": 0.3977, "step": 16590 }, { "epoch": 26.48, "grad_norm": 1.3065524101257324, "learning_rate": 0.0009409888357256778, "loss": 0.4534, "step": 16600 }, { "epoch": 26.49, "grad_norm": 2.119401693344116, "learning_rate": 0.0009403508771929825, "loss": 0.513, "step": 16610 }, { "epoch": 26.51, "grad_norm": 1.1828601360321045, "learning_rate": 0.000939712918660287, "loss": 0.4113, "step": 16620 }, { "epoch": 26.52, "grad_norm": 3.7420921325683594, "learning_rate": 0.0009390749601275917, "loss": 0.4948, "step": 16630 }, { "epoch": 26.54, "grad_norm": 1.524720549583435, "learning_rate": 0.0009384370015948963, "loss": 0.4233, "step": 16640 }, { "epoch": 26.56, "grad_norm": 2.136596918106079, "learning_rate": 0.000937799043062201, "loss": 0.472, "step": 16650 }, { "epoch": 26.57, "grad_norm": 2.395744800567627, "learning_rate": 0.0009371610845295055, "loss": 0.4629, "step": 16660 }, { "epoch": 26.59, "grad_norm": 1.2973766326904297, "learning_rate": 0.0009365231259968102, "loss": 0.4332, "step": 16670 }, { "epoch": 26.6, "grad_norm": 2.164285659790039, "learning_rate": 0.0009358851674641149, "loss": 0.4451, "step": 16680 }, { "epoch": 26.62, "grad_norm": 1.284764051437378, "learning_rate": 0.0009352472089314194, "loss": 0.5361, "step": 16690 }, { "epoch": 26.63, "grad_norm": 1.187538743019104, "learning_rate": 0.0009346092503987241, "loss": 0.4414, "step": 16700 }, { "epoch": 26.65, "grad_norm": 2.0321905612945557, "learning_rate": 0.0009339712918660287, "loss": 0.3823, "step": 16710 }, { "epoch": 26.67, "grad_norm": 2.023181676864624, "learning_rate": 0.0009333333333333333, "loss": 0.5097, "step": 16720 }, { "epoch": 26.68, "grad_norm": 1.6229287385940552, "learning_rate": 0.0009326953748006379, "loss": 0.4747, "step": 16730 }, { "epoch": 26.7, "grad_norm": 1.848752498626709, "learning_rate": 0.0009320574162679426, "loss": 0.4416, "step": 16740 }, { "epoch": 26.71, "grad_norm": 1.674248218536377, "learning_rate": 0.0009314194577352472, "loss": 0.5362, "step": 16750 }, { "epoch": 26.73, "grad_norm": 0.7888638973236084, "learning_rate": 0.0009307814992025518, "loss": 0.4482, "step": 16760 }, { "epoch": 26.75, "grad_norm": 1.2110415697097778, "learning_rate": 0.0009301435406698565, "loss": 0.4771, "step": 16770 }, { "epoch": 26.76, "grad_norm": 2.884260654449463, "learning_rate": 0.000929505582137161, "loss": 0.44, "step": 16780 }, { "epoch": 26.78, "grad_norm": 1.4633077383041382, "learning_rate": 0.0009288676236044657, "loss": 0.4565, "step": 16790 }, { "epoch": 26.79, "grad_norm": 1.6688116788864136, "learning_rate": 0.0009282296650717703, "loss": 0.4524, "step": 16800 }, { "epoch": 26.81, "grad_norm": 1.4576424360275269, "learning_rate": 0.000927591706539075, "loss": 0.4253, "step": 16810 }, { "epoch": 26.83, "grad_norm": 1.47834312915802, "learning_rate": 0.0009269537480063796, "loss": 0.4857, "step": 16820 }, { "epoch": 26.84, "grad_norm": 1.2933154106140137, "learning_rate": 0.0009263157894736843, "loss": 0.4483, "step": 16830 }, { "epoch": 26.86, "grad_norm": 2.671135663986206, "learning_rate": 0.0009256778309409889, "loss": 0.4321, "step": 16840 }, { "epoch": 26.87, "grad_norm": 0.900836169719696, "learning_rate": 0.0009250398724082935, "loss": 0.4142, "step": 16850 }, { "epoch": 26.89, "grad_norm": 1.467921257019043, "learning_rate": 0.0009244019138755982, "loss": 0.4381, "step": 16860 }, { "epoch": 26.91, "grad_norm": 1.2465593814849854, "learning_rate": 0.0009237639553429027, "loss": 0.4075, "step": 16870 }, { "epoch": 26.92, "grad_norm": 1.7828130722045898, "learning_rate": 0.0009231259968102074, "loss": 0.4855, "step": 16880 }, { "epoch": 26.94, "grad_norm": 2.368098735809326, "learning_rate": 0.000922488038277512, "loss": 0.5569, "step": 16890 }, { "epoch": 26.95, "grad_norm": 1.8269487619400024, "learning_rate": 0.0009218500797448166, "loss": 0.5397, "step": 16900 }, { "epoch": 26.97, "grad_norm": 1.5869868993759155, "learning_rate": 0.0009212121212121213, "loss": 0.447, "step": 16910 }, { "epoch": 26.99, "grad_norm": 2.1379966735839844, "learning_rate": 0.0009205741626794259, "loss": 0.3943, "step": 16920 }, { "epoch": 27.0, "grad_norm": 0.7937178611755371, "learning_rate": 0.0009199362041467305, "loss": 0.4762, "step": 16930 }, { "epoch": 27.02, "grad_norm": 0.7434051036834717, "learning_rate": 0.0009192982456140351, "loss": 0.3192, "step": 16940 }, { "epoch": 27.03, "grad_norm": 1.9126826524734497, "learning_rate": 0.0009186602870813398, "loss": 0.3383, "step": 16950 }, { "epoch": 27.05, "grad_norm": 0.8884724378585815, "learning_rate": 0.0009180223285486443, "loss": 0.335, "step": 16960 }, { "epoch": 27.07, "grad_norm": 0.7101998329162598, "learning_rate": 0.000917384370015949, "loss": 0.355, "step": 16970 }, { "epoch": 27.08, "grad_norm": 1.6486220359802246, "learning_rate": 0.0009167464114832537, "loss": 0.4237, "step": 16980 }, { "epoch": 27.1, "grad_norm": 1.5720986127853394, "learning_rate": 0.0009161084529505583, "loss": 0.3649, "step": 16990 }, { "epoch": 27.11, "grad_norm": 1.331430196762085, "learning_rate": 0.0009154704944178629, "loss": 0.4673, "step": 17000 }, { "epoch": 27.13, "grad_norm": 1.1665971279144287, "learning_rate": 0.0009148325358851675, "loss": 0.3722, "step": 17010 }, { "epoch": 27.15, "grad_norm": 0.784977376461029, "learning_rate": 0.0009141945773524722, "loss": 0.3737, "step": 17020 }, { "epoch": 27.16, "grad_norm": 1.2977066040039062, "learning_rate": 0.0009135566188197767, "loss": 0.4048, "step": 17030 }, { "epoch": 27.18, "grad_norm": 0.9560506343841553, "learning_rate": 0.0009129186602870814, "loss": 0.4475, "step": 17040 }, { "epoch": 27.19, "grad_norm": 1.2674915790557861, "learning_rate": 0.000912280701754386, "loss": 0.4099, "step": 17050 }, { "epoch": 27.21, "grad_norm": 1.7103983163833618, "learning_rate": 0.0009116427432216906, "loss": 0.3881, "step": 17060 }, { "epoch": 27.22, "grad_norm": 1.464312195777893, "learning_rate": 0.0009110047846889953, "loss": 0.3751, "step": 17070 }, { "epoch": 27.24, "grad_norm": 1.2396901845932007, "learning_rate": 0.0009103668261562999, "loss": 0.3781, "step": 17080 }, { "epoch": 27.26, "grad_norm": 1.704807162284851, "learning_rate": 0.0009097288676236045, "loss": 0.3462, "step": 17090 }, { "epoch": 27.27, "grad_norm": 1.6080540418624878, "learning_rate": 0.0009090909090909091, "loss": 0.4836, "step": 17100 }, { "epoch": 27.29, "grad_norm": 2.0571115016937256, "learning_rate": 0.0009084529505582138, "loss": 0.424, "step": 17110 }, { "epoch": 27.3, "grad_norm": 1.9051276445388794, "learning_rate": 0.0009078149920255183, "loss": 0.4107, "step": 17120 }, { "epoch": 27.32, "grad_norm": 1.0614899396896362, "learning_rate": 0.000907177033492823, "loss": 0.4258, "step": 17130 }, { "epoch": 27.34, "grad_norm": 0.771443247795105, "learning_rate": 0.0009065390749601277, "loss": 0.4893, "step": 17140 }, { "epoch": 27.35, "grad_norm": 1.2591562271118164, "learning_rate": 0.0009059011164274322, "loss": 0.3847, "step": 17150 }, { "epoch": 27.37, "grad_norm": 2.423963785171509, "learning_rate": 0.0009052631578947369, "loss": 0.4034, "step": 17160 }, { "epoch": 27.38, "grad_norm": 1.5385760068893433, "learning_rate": 0.0009046251993620415, "loss": 0.4347, "step": 17170 }, { "epoch": 27.4, "grad_norm": 1.886620044708252, "learning_rate": 0.0009039872408293461, "loss": 0.5058, "step": 17180 }, { "epoch": 27.42, "grad_norm": 1.3259475231170654, "learning_rate": 0.0009033492822966507, "loss": 0.4189, "step": 17190 }, { "epoch": 27.43, "grad_norm": 2.392594814300537, "learning_rate": 0.0009027113237639554, "loss": 0.3627, "step": 17200 }, { "epoch": 27.45, "grad_norm": 2.2240548133850098, "learning_rate": 0.0009020733652312599, "loss": 0.4367, "step": 17210 }, { "epoch": 27.46, "grad_norm": 1.4467096328735352, "learning_rate": 0.0009014354066985646, "loss": 0.4677, "step": 17220 }, { "epoch": 27.48, "grad_norm": 1.5662921667099, "learning_rate": 0.0009007974481658693, "loss": 0.508, "step": 17230 }, { "epoch": 27.5, "grad_norm": 1.6414707899093628, "learning_rate": 0.0009001594896331738, "loss": 0.3679, "step": 17240 }, { "epoch": 27.51, "grad_norm": 1.0565104484558105, "learning_rate": 0.0008995215311004785, "loss": 0.3557, "step": 17250 }, { "epoch": 27.53, "grad_norm": 1.353499174118042, "learning_rate": 0.0008988835725677831, "loss": 0.4911, "step": 17260 }, { "epoch": 27.54, "grad_norm": 3.16988205909729, "learning_rate": 0.0008982456140350877, "loss": 0.5339, "step": 17270 }, { "epoch": 27.56, "grad_norm": 2.6091325283050537, "learning_rate": 0.0008976076555023923, "loss": 0.5084, "step": 17280 }, { "epoch": 27.58, "grad_norm": 1.3425127267837524, "learning_rate": 0.000896969696969697, "loss": 0.4548, "step": 17290 }, { "epoch": 27.59, "grad_norm": 1.9189682006835938, "learning_rate": 0.0008963317384370016, "loss": 0.4727, "step": 17300 }, { "epoch": 27.61, "grad_norm": 3.497046709060669, "learning_rate": 0.0008956937799043062, "loss": 0.5446, "step": 17310 }, { "epoch": 27.62, "grad_norm": 1.4161769151687622, "learning_rate": 0.0008950558213716109, "loss": 0.362, "step": 17320 }, { "epoch": 27.64, "grad_norm": 1.7099406719207764, "learning_rate": 0.0008944178628389154, "loss": 0.3934, "step": 17330 }, { "epoch": 27.66, "grad_norm": 2.129094362258911, "learning_rate": 0.0008937799043062201, "loss": 0.4169, "step": 17340 }, { "epoch": 27.67, "grad_norm": 1.5544568300247192, "learning_rate": 0.0008931419457735247, "loss": 0.4201, "step": 17350 }, { "epoch": 27.69, "grad_norm": 1.1022940874099731, "learning_rate": 0.0008925039872408293, "loss": 0.4958, "step": 17360 }, { "epoch": 27.7, "grad_norm": 1.771380066871643, "learning_rate": 0.0008918660287081339, "loss": 0.3859, "step": 17370 }, { "epoch": 27.72, "grad_norm": 2.647625207901001, "learning_rate": 0.0008912280701754386, "loss": 0.4444, "step": 17380 }, { "epoch": 27.74, "grad_norm": 1.4977500438690186, "learning_rate": 0.0008905901116427433, "loss": 0.4672, "step": 17390 }, { "epoch": 27.75, "grad_norm": 1.1140875816345215, "learning_rate": 0.0008899521531100478, "loss": 0.4828, "step": 17400 }, { "epoch": 27.77, "grad_norm": 2.1960301399230957, "learning_rate": 0.0008893141945773525, "loss": 0.5277, "step": 17410 }, { "epoch": 27.78, "grad_norm": 1.2357120513916016, "learning_rate": 0.000888676236044657, "loss": 0.3831, "step": 17420 }, { "epoch": 27.8, "grad_norm": 2.183209180831909, "learning_rate": 0.0008880382775119617, "loss": 0.4646, "step": 17430 }, { "epoch": 27.81, "grad_norm": 1.4991573095321655, "learning_rate": 0.0008874003189792663, "loss": 0.4445, "step": 17440 }, { "epoch": 27.83, "grad_norm": 2.547933340072632, "learning_rate": 0.000886762360446571, "loss": 0.609, "step": 17450 }, { "epoch": 27.85, "grad_norm": 1.751570224761963, "learning_rate": 0.0008861244019138756, "loss": 0.4723, "step": 17460 }, { "epoch": 27.86, "grad_norm": 1.5204071998596191, "learning_rate": 0.0008854864433811802, "loss": 0.3944, "step": 17470 }, { "epoch": 27.88, "grad_norm": 0.8941110372543335, "learning_rate": 0.0008848484848484849, "loss": 0.4384, "step": 17480 }, { "epoch": 27.89, "grad_norm": 3.1257965564727783, "learning_rate": 0.0008842105263157894, "loss": 0.403, "step": 17490 }, { "epoch": 27.91, "grad_norm": 1.1965994834899902, "learning_rate": 0.0008835725677830941, "loss": 0.4476, "step": 17500 }, { "epoch": 27.93, "grad_norm": 2.3756983280181885, "learning_rate": 0.0008829346092503987, "loss": 0.5574, "step": 17510 }, { "epoch": 27.94, "grad_norm": 2.028165578842163, "learning_rate": 0.0008822966507177033, "loss": 0.5269, "step": 17520 }, { "epoch": 27.96, "grad_norm": 2.061138868331909, "learning_rate": 0.0008816586921850079, "loss": 0.3863, "step": 17530 }, { "epoch": 27.97, "grad_norm": 1.6647778749465942, "learning_rate": 0.0008810207336523126, "loss": 0.4803, "step": 17540 }, { "epoch": 27.99, "grad_norm": 1.8999004364013672, "learning_rate": 0.0008803827751196173, "loss": 0.4465, "step": 17550 }, { "epoch": 28.01, "grad_norm": 1.4546337127685547, "learning_rate": 0.0008797448165869219, "loss": 0.4054, "step": 17560 }, { "epoch": 28.02, "grad_norm": 0.5947902798652649, "learning_rate": 0.0008791068580542266, "loss": 0.3905, "step": 17570 }, { "epoch": 28.04, "grad_norm": 1.231417179107666, "learning_rate": 0.0008784688995215311, "loss": 0.3693, "step": 17580 }, { "epoch": 28.05, "grad_norm": 1.068305253982544, "learning_rate": 0.0008778309409888358, "loss": 0.3525, "step": 17590 }, { "epoch": 28.07, "grad_norm": 1.7087610960006714, "learning_rate": 0.0008771929824561404, "loss": 0.3472, "step": 17600 }, { "epoch": 28.09, "grad_norm": 1.412925362586975, "learning_rate": 0.000876555023923445, "loss": 0.469, "step": 17610 }, { "epoch": 28.1, "grad_norm": 1.3570494651794434, "learning_rate": 0.0008759170653907497, "loss": 0.4222, "step": 17620 }, { "epoch": 28.12, "grad_norm": 0.9123827219009399, "learning_rate": 0.0008752791068580543, "loss": 0.3517, "step": 17630 }, { "epoch": 28.13, "grad_norm": 1.3093185424804688, "learning_rate": 0.0008746411483253589, "loss": 0.4117, "step": 17640 }, { "epoch": 28.15, "grad_norm": 1.1676615476608276, "learning_rate": 0.0008740031897926635, "loss": 0.3839, "step": 17650 }, { "epoch": 28.17, "grad_norm": 0.8572595119476318, "learning_rate": 0.0008733652312599682, "loss": 0.38, "step": 17660 }, { "epoch": 28.18, "grad_norm": 1.9796086549758911, "learning_rate": 0.0008727272727272727, "loss": 0.4742, "step": 17670 }, { "epoch": 28.2, "grad_norm": 1.599166989326477, "learning_rate": 0.0008720893141945774, "loss": 0.4556, "step": 17680 }, { "epoch": 28.21, "grad_norm": 1.9437137842178345, "learning_rate": 0.0008714513556618821, "loss": 0.4952, "step": 17690 }, { "epoch": 28.23, "grad_norm": 1.6551004648208618, "learning_rate": 0.0008708133971291866, "loss": 0.3986, "step": 17700 }, { "epoch": 28.25, "grad_norm": 1.8391096591949463, "learning_rate": 0.0008701754385964913, "loss": 0.4194, "step": 17710 }, { "epoch": 28.26, "grad_norm": 0.9920051097869873, "learning_rate": 0.0008695374800637959, "loss": 0.3206, "step": 17720 }, { "epoch": 28.28, "grad_norm": 1.8732203245162964, "learning_rate": 0.0008688995215311005, "loss": 0.4823, "step": 17730 }, { "epoch": 28.29, "grad_norm": 1.4714813232421875, "learning_rate": 0.0008682615629984051, "loss": 0.4007, "step": 17740 }, { "epoch": 28.31, "grad_norm": 1.8994234800338745, "learning_rate": 0.0008676236044657098, "loss": 0.4203, "step": 17750 }, { "epoch": 28.33, "grad_norm": 1.9376466274261475, "learning_rate": 0.0008669856459330143, "loss": 0.4652, "step": 17760 }, { "epoch": 28.34, "grad_norm": 2.3434700965881348, "learning_rate": 0.000866347687400319, "loss": 0.3574, "step": 17770 }, { "epoch": 28.36, "grad_norm": 1.5705221891403198, "learning_rate": 0.0008657097288676237, "loss": 0.3634, "step": 17780 }, { "epoch": 28.37, "grad_norm": 2.1308560371398926, "learning_rate": 0.0008650717703349283, "loss": 0.4423, "step": 17790 }, { "epoch": 28.39, "grad_norm": 0.966135561466217, "learning_rate": 0.0008644338118022329, "loss": 0.4221, "step": 17800 }, { "epoch": 28.41, "grad_norm": 1.37132728099823, "learning_rate": 0.0008637958532695375, "loss": 0.4348, "step": 17810 }, { "epoch": 28.42, "grad_norm": 1.676096796989441, "learning_rate": 0.0008631578947368422, "loss": 0.4338, "step": 17820 }, { "epoch": 28.44, "grad_norm": 1.1030077934265137, "learning_rate": 0.0008625199362041467, "loss": 0.4399, "step": 17830 }, { "epoch": 28.45, "grad_norm": 0.8978865146636963, "learning_rate": 0.0008618819776714514, "loss": 0.4306, "step": 17840 }, { "epoch": 28.47, "grad_norm": 1.170512080192566, "learning_rate": 0.0008612440191387561, "loss": 0.4347, "step": 17850 }, { "epoch": 28.48, "grad_norm": 1.0260136127471924, "learning_rate": 0.0008606060606060606, "loss": 0.3928, "step": 17860 }, { "epoch": 28.5, "grad_norm": 1.04338800907135, "learning_rate": 0.0008599681020733653, "loss": 0.4193, "step": 17870 }, { "epoch": 28.52, "grad_norm": 0.9068986177444458, "learning_rate": 0.0008593301435406699, "loss": 0.3889, "step": 17880 }, { "epoch": 28.53, "grad_norm": 1.3259004354476929, "learning_rate": 0.0008586921850079745, "loss": 0.3873, "step": 17890 }, { "epoch": 28.55, "grad_norm": 1.3916800022125244, "learning_rate": 0.0008580542264752791, "loss": 0.4631, "step": 17900 }, { "epoch": 28.56, "grad_norm": 2.1619112491607666, "learning_rate": 0.0008574162679425838, "loss": 0.4275, "step": 17910 }, { "epoch": 28.58, "grad_norm": 1.750162959098816, "learning_rate": 0.0008567783094098883, "loss": 0.4469, "step": 17920 }, { "epoch": 28.6, "grad_norm": 1.2156579494476318, "learning_rate": 0.000856140350877193, "loss": 0.3898, "step": 17930 }, { "epoch": 28.61, "grad_norm": 1.1427280902862549, "learning_rate": 0.0008555023923444977, "loss": 0.4424, "step": 17940 }, { "epoch": 28.63, "grad_norm": 1.6410181522369385, "learning_rate": 0.0008548644338118022, "loss": 0.4153, "step": 17950 }, { "epoch": 28.64, "grad_norm": 1.1331639289855957, "learning_rate": 0.0008542264752791069, "loss": 0.4638, "step": 17960 }, { "epoch": 28.66, "grad_norm": 0.9264315366744995, "learning_rate": 0.0008535885167464115, "loss": 0.4634, "step": 17970 }, { "epoch": 28.68, "grad_norm": 1.4615089893341064, "learning_rate": 0.0008529505582137161, "loss": 0.383, "step": 17980 }, { "epoch": 28.69, "grad_norm": 1.291256070137024, "learning_rate": 0.0008523125996810207, "loss": 0.4415, "step": 17990 }, { "epoch": 28.71, "grad_norm": 1.3759894371032715, "learning_rate": 0.0008516746411483254, "loss": 0.4275, "step": 18000 }, { "epoch": 28.72, "grad_norm": 2.605381488800049, "learning_rate": 0.00085103668261563, "loss": 0.4614, "step": 18010 }, { "epoch": 28.74, "grad_norm": 1.3442084789276123, "learning_rate": 0.0008503987240829346, "loss": 0.4276, "step": 18020 }, { "epoch": 28.76, "grad_norm": 1.7800729274749756, "learning_rate": 0.0008497607655502393, "loss": 0.5137, "step": 18030 }, { "epoch": 28.77, "grad_norm": 1.6473747491836548, "learning_rate": 0.0008491228070175438, "loss": 0.4381, "step": 18040 }, { "epoch": 28.79, "grad_norm": 1.2551579475402832, "learning_rate": 0.0008484848484848485, "loss": 0.3784, "step": 18050 }, { "epoch": 28.8, "grad_norm": 1.7706053256988525, "learning_rate": 0.0008478468899521531, "loss": 0.4137, "step": 18060 }, { "epoch": 28.82, "grad_norm": 1.2189148664474487, "learning_rate": 0.0008472089314194577, "loss": 0.3987, "step": 18070 }, { "epoch": 28.84, "grad_norm": 2.0609757900238037, "learning_rate": 0.0008465709728867623, "loss": 0.4321, "step": 18080 }, { "epoch": 28.85, "grad_norm": 3.152968406677246, "learning_rate": 0.000845933014354067, "loss": 0.4866, "step": 18090 }, { "epoch": 28.87, "grad_norm": 1.9931256771087646, "learning_rate": 0.0008452950558213716, "loss": 0.3915, "step": 18100 }, { "epoch": 28.88, "grad_norm": 1.5088871717453003, "learning_rate": 0.0008446570972886762, "loss": 0.4172, "step": 18110 }, { "epoch": 28.9, "grad_norm": 0.9786420464515686, "learning_rate": 0.0008440191387559809, "loss": 0.392, "step": 18120 }, { "epoch": 28.92, "grad_norm": 1.9202160835266113, "learning_rate": 0.0008433811802232854, "loss": 0.3735, "step": 18130 }, { "epoch": 28.93, "grad_norm": 1.7300411462783813, "learning_rate": 0.0008427432216905901, "loss": 0.493, "step": 18140 }, { "epoch": 28.95, "grad_norm": 1.3852993249893188, "learning_rate": 0.0008421052631578947, "loss": 0.447, "step": 18150 }, { "epoch": 28.96, "grad_norm": 1.5973821878433228, "learning_rate": 0.0008414673046251993, "loss": 0.4247, "step": 18160 }, { "epoch": 28.98, "grad_norm": 1.521041989326477, "learning_rate": 0.000840829346092504, "loss": 0.514, "step": 18170 }, { "epoch": 29.0, "grad_norm": 1.2108961343765259, "learning_rate": 0.0008401913875598086, "loss": 0.4258, "step": 18180 }, { "epoch": 29.01, "grad_norm": 1.1732271909713745, "learning_rate": 0.0008395534290271133, "loss": 0.391, "step": 18190 }, { "epoch": 29.03, "grad_norm": 2.4832112789154053, "learning_rate": 0.0008389154704944178, "loss": 0.4139, "step": 18200 }, { "epoch": 29.04, "grad_norm": 1.2037804126739502, "learning_rate": 0.0008382775119617225, "loss": 0.3189, "step": 18210 }, { "epoch": 29.06, "grad_norm": 1.1315257549285889, "learning_rate": 0.000837639553429027, "loss": 0.3455, "step": 18220 }, { "epoch": 29.07, "grad_norm": 0.7903701663017273, "learning_rate": 0.0008370015948963317, "loss": 0.3511, "step": 18230 }, { "epoch": 29.09, "grad_norm": 1.1077697277069092, "learning_rate": 0.0008363636363636363, "loss": 0.4795, "step": 18240 }, { "epoch": 29.11, "grad_norm": 0.752619206905365, "learning_rate": 0.000835725677830941, "loss": 0.3194, "step": 18250 }, { "epoch": 29.12, "grad_norm": 2.1113548278808594, "learning_rate": 0.0008350877192982456, "loss": 0.3937, "step": 18260 }, { "epoch": 29.14, "grad_norm": 1.051826000213623, "learning_rate": 0.0008344497607655502, "loss": 0.3966, "step": 18270 }, { "epoch": 29.15, "grad_norm": 1.329938530921936, "learning_rate": 0.0008338118022328549, "loss": 0.4212, "step": 18280 }, { "epoch": 29.17, "grad_norm": 1.7144334316253662, "learning_rate": 0.0008331738437001594, "loss": 0.4333, "step": 18290 }, { "epoch": 29.19, "grad_norm": 1.252589464187622, "learning_rate": 0.0008325358851674642, "loss": 0.379, "step": 18300 }, { "epoch": 29.2, "grad_norm": 0.7238291501998901, "learning_rate": 0.0008318979266347688, "loss": 0.353, "step": 18310 }, { "epoch": 29.22, "grad_norm": 1.5246005058288574, "learning_rate": 0.0008312599681020734, "loss": 0.361, "step": 18320 }, { "epoch": 29.23, "grad_norm": 1.488550066947937, "learning_rate": 0.0008306220095693781, "loss": 0.4171, "step": 18330 }, { "epoch": 29.25, "grad_norm": 1.30950129032135, "learning_rate": 0.0008299840510366827, "loss": 0.37, "step": 18340 }, { "epoch": 29.27, "grad_norm": 2.5584652423858643, "learning_rate": 0.0008293460925039873, "loss": 0.4683, "step": 18350 }, { "epoch": 29.28, "grad_norm": 1.2544807195663452, "learning_rate": 0.0008287081339712919, "loss": 0.3517, "step": 18360 }, { "epoch": 29.3, "grad_norm": 1.9312729835510254, "learning_rate": 0.0008280701754385966, "loss": 0.3754, "step": 18370 }, { "epoch": 29.31, "grad_norm": 1.9227901697158813, "learning_rate": 0.0008274322169059011, "loss": 0.353, "step": 18380 }, { "epoch": 29.33, "grad_norm": 0.7560509443283081, "learning_rate": 0.0008267942583732058, "loss": 0.4141, "step": 18390 }, { "epoch": 29.35, "grad_norm": 1.5966806411743164, "learning_rate": 0.0008261562998405105, "loss": 0.419, "step": 18400 }, { "epoch": 29.36, "grad_norm": 1.8788731098175049, "learning_rate": 0.000825518341307815, "loss": 0.4134, "step": 18410 }, { "epoch": 29.38, "grad_norm": 0.7582562565803528, "learning_rate": 0.0008248803827751197, "loss": 0.3092, "step": 18420 }, { "epoch": 29.39, "grad_norm": 1.155375599861145, "learning_rate": 0.0008242424242424243, "loss": 0.4677, "step": 18430 }, { "epoch": 29.41, "grad_norm": 0.581142246723175, "learning_rate": 0.0008236044657097289, "loss": 0.4426, "step": 18440 }, { "epoch": 29.43, "grad_norm": 1.6078975200653076, "learning_rate": 0.0008229665071770335, "loss": 0.3892, "step": 18450 }, { "epoch": 29.44, "grad_norm": 1.3083795309066772, "learning_rate": 0.0008223285486443382, "loss": 0.3718, "step": 18460 }, { "epoch": 29.46, "grad_norm": 1.41934072971344, "learning_rate": 0.0008216905901116427, "loss": 0.3701, "step": 18470 }, { "epoch": 29.47, "grad_norm": 1.7969826459884644, "learning_rate": 0.0008210526315789474, "loss": 0.44, "step": 18480 }, { "epoch": 29.49, "grad_norm": 1.134151577949524, "learning_rate": 0.0008204146730462521, "loss": 0.3765, "step": 18490 }, { "epoch": 29.51, "grad_norm": 1.9421136379241943, "learning_rate": 0.0008197767145135566, "loss": 0.3526, "step": 18500 }, { "epoch": 29.52, "grad_norm": 1.0447088479995728, "learning_rate": 0.0008191387559808613, "loss": 0.3929, "step": 18510 }, { "epoch": 29.54, "grad_norm": 2.2842037677764893, "learning_rate": 0.0008185007974481659, "loss": 0.428, "step": 18520 }, { "epoch": 29.55, "grad_norm": 1.4780536890029907, "learning_rate": 0.0008178628389154705, "loss": 0.4503, "step": 18530 }, { "epoch": 29.57, "grad_norm": 1.1551343202590942, "learning_rate": 0.0008172248803827751, "loss": 0.3608, "step": 18540 }, { "epoch": 29.59, "grad_norm": 0.9097251892089844, "learning_rate": 0.0008165869218500798, "loss": 0.3289, "step": 18550 }, { "epoch": 29.6, "grad_norm": 1.372117042541504, "learning_rate": 0.0008159489633173845, "loss": 0.4009, "step": 18560 }, { "epoch": 29.62, "grad_norm": 1.3583626747131348, "learning_rate": 0.000815311004784689, "loss": 0.4491, "step": 18570 }, { "epoch": 29.63, "grad_norm": 1.208733320236206, "learning_rate": 0.0008146730462519937, "loss": 0.4185, "step": 18580 }, { "epoch": 29.65, "grad_norm": 1.2088313102722168, "learning_rate": 0.0008140350877192983, "loss": 0.3733, "step": 18590 }, { "epoch": 29.67, "grad_norm": 1.3854396343231201, "learning_rate": 0.0008133971291866029, "loss": 0.4189, "step": 18600 }, { "epoch": 29.68, "grad_norm": 2.3625354766845703, "learning_rate": 0.0008127591706539075, "loss": 0.4158, "step": 18610 }, { "epoch": 29.7, "grad_norm": 1.1244155168533325, "learning_rate": 0.0008121212121212122, "loss": 0.4763, "step": 18620 }, { "epoch": 29.71, "grad_norm": 1.6580774784088135, "learning_rate": 0.0008114832535885167, "loss": 0.441, "step": 18630 }, { "epoch": 29.73, "grad_norm": 1.9566985368728638, "learning_rate": 0.0008108452950558214, "loss": 0.5087, "step": 18640 }, { "epoch": 29.74, "grad_norm": 1.5847853422164917, "learning_rate": 0.0008102073365231261, "loss": 0.4876, "step": 18650 }, { "epoch": 29.76, "grad_norm": 1.6287412643432617, "learning_rate": 0.0008095693779904306, "loss": 0.3967, "step": 18660 }, { "epoch": 29.78, "grad_norm": 1.0776193141937256, "learning_rate": 0.0008089314194577353, "loss": 0.3406, "step": 18670 }, { "epoch": 29.79, "grad_norm": 1.6697405576705933, "learning_rate": 0.0008082934609250399, "loss": 0.3976, "step": 18680 }, { "epoch": 29.81, "grad_norm": 1.0817621946334839, "learning_rate": 0.0008076555023923445, "loss": 0.4045, "step": 18690 }, { "epoch": 29.82, "grad_norm": 1.333869457244873, "learning_rate": 0.0008070175438596491, "loss": 0.4219, "step": 18700 }, { "epoch": 29.84, "grad_norm": 1.135141134262085, "learning_rate": 0.0008063795853269538, "loss": 0.4283, "step": 18710 }, { "epoch": 29.86, "grad_norm": 1.497247576713562, "learning_rate": 0.0008057416267942584, "loss": 0.4653, "step": 18720 }, { "epoch": 29.87, "grad_norm": 1.3332675695419312, "learning_rate": 0.000805103668261563, "loss": 0.4842, "step": 18730 }, { "epoch": 29.89, "grad_norm": 1.2101079225540161, "learning_rate": 0.0008044657097288677, "loss": 0.4352, "step": 18740 }, { "epoch": 29.9, "grad_norm": 1.161740779876709, "learning_rate": 0.0008038277511961722, "loss": 0.4775, "step": 18750 }, { "epoch": 29.92, "grad_norm": 2.115443229675293, "learning_rate": 0.0008031897926634769, "loss": 0.4047, "step": 18760 }, { "epoch": 29.94, "grad_norm": 1.6414830684661865, "learning_rate": 0.0008025518341307815, "loss": 0.4002, "step": 18770 }, { "epoch": 29.95, "grad_norm": 2.5109338760375977, "learning_rate": 0.0008019138755980861, "loss": 0.456, "step": 18780 }, { "epoch": 29.97, "grad_norm": 1.0796329975128174, "learning_rate": 0.0008012759170653907, "loss": 0.3812, "step": 18790 }, { "epoch": 29.98, "grad_norm": 1.581429362297058, "learning_rate": 0.0008006379585326954, "loss": 0.4038, "step": 18800 }, { "epoch": 30.0, "grad_norm": 1.5293798446655273, "learning_rate": 0.0008, "loss": 0.4527, "step": 18810 }, { "epoch": 30.02, "grad_norm": 0.3642142117023468, "learning_rate": 0.0007993620414673046, "loss": 0.3424, "step": 18820 }, { "epoch": 30.03, "grad_norm": 1.181534767150879, "learning_rate": 0.0007987240829346093, "loss": 0.3467, "step": 18830 }, { "epoch": 30.05, "grad_norm": 1.3809243440628052, "learning_rate": 0.0007980861244019138, "loss": 0.3445, "step": 18840 }, { "epoch": 30.06, "grad_norm": 0.9193634986877441, "learning_rate": 0.0007974481658692185, "loss": 0.418, "step": 18850 }, { "epoch": 30.08, "grad_norm": 1.8701001405715942, "learning_rate": 0.0007968102073365231, "loss": 0.3377, "step": 18860 }, { "epoch": 30.1, "grad_norm": 1.0947514772415161, "learning_rate": 0.0007961722488038277, "loss": 0.3047, "step": 18870 }, { "epoch": 30.11, "grad_norm": 1.5115679502487183, "learning_rate": 0.0007955342902711324, "loss": 0.4687, "step": 18880 }, { "epoch": 30.13, "grad_norm": 1.4967734813690186, "learning_rate": 0.000794896331738437, "loss": 0.3585, "step": 18890 }, { "epoch": 30.14, "grad_norm": 1.0271711349487305, "learning_rate": 0.0007942583732057416, "loss": 0.3857, "step": 18900 }, { "epoch": 30.16, "grad_norm": 0.6969228982925415, "learning_rate": 0.0007936204146730462, "loss": 0.3916, "step": 18910 }, { "epoch": 30.18, "grad_norm": 1.0234570503234863, "learning_rate": 0.0007929824561403509, "loss": 0.2736, "step": 18920 }, { "epoch": 30.19, "grad_norm": 0.3337653577327728, "learning_rate": 0.0007923444976076554, "loss": 0.3358, "step": 18930 }, { "epoch": 30.21, "grad_norm": 0.8475213646888733, "learning_rate": 0.0007917065390749601, "loss": 0.4215, "step": 18940 }, { "epoch": 30.22, "grad_norm": 1.190527319908142, "learning_rate": 0.0007910685805422647, "loss": 0.3912, "step": 18950 }, { "epoch": 30.24, "grad_norm": 2.408203601837158, "learning_rate": 0.0007904306220095693, "loss": 0.4475, "step": 18960 }, { "epoch": 30.26, "grad_norm": 0.8776838183403015, "learning_rate": 0.000789792663476874, "loss": 0.3504, "step": 18970 }, { "epoch": 30.27, "grad_norm": 2.3490686416625977, "learning_rate": 0.0007891547049441786, "loss": 0.3283, "step": 18980 }, { "epoch": 30.29, "grad_norm": 0.9696643948554993, "learning_rate": 0.0007885167464114833, "loss": 0.3629, "step": 18990 }, { "epoch": 30.3, "grad_norm": 1.1829396486282349, "learning_rate": 0.0007878787878787878, "loss": 0.3908, "step": 19000 }, { "epoch": 30.32, "grad_norm": 1.0642168521881104, "learning_rate": 0.0007872408293460925, "loss": 0.359, "step": 19010 }, { "epoch": 30.33, "grad_norm": 1.3545867204666138, "learning_rate": 0.000786602870813397, "loss": 0.3656, "step": 19020 }, { "epoch": 30.35, "grad_norm": 1.1527637243270874, "learning_rate": 0.0007859649122807017, "loss": 0.358, "step": 19030 }, { "epoch": 30.37, "grad_norm": 1.6512736082077026, "learning_rate": 0.0007853269537480065, "loss": 0.3228, "step": 19040 }, { "epoch": 30.38, "grad_norm": 0.9381676912307739, "learning_rate": 0.0007846889952153111, "loss": 0.3604, "step": 19050 }, { "epoch": 30.4, "grad_norm": 0.786491870880127, "learning_rate": 0.0007840510366826157, "loss": 0.4115, "step": 19060 }, { "epoch": 30.41, "grad_norm": 1.0724458694458008, "learning_rate": 0.0007834130781499203, "loss": 0.431, "step": 19070 }, { "epoch": 30.43, "grad_norm": 1.1807036399841309, "learning_rate": 0.000782775119617225, "loss": 0.3628, "step": 19080 }, { "epoch": 30.45, "grad_norm": 2.059079647064209, "learning_rate": 0.0007821371610845295, "loss": 0.4, "step": 19090 }, { "epoch": 30.46, "grad_norm": 2.4132237434387207, "learning_rate": 0.0007814992025518342, "loss": 0.41, "step": 19100 }, { "epoch": 30.48, "grad_norm": 1.3958185911178589, "learning_rate": 0.0007808612440191388, "loss": 0.4278, "step": 19110 }, { "epoch": 30.49, "grad_norm": 1.9119430780410767, "learning_rate": 0.0007802232854864434, "loss": 0.3688, "step": 19120 }, { "epoch": 30.51, "grad_norm": 0.8936794996261597, "learning_rate": 0.0007795853269537481, "loss": 0.3909, "step": 19130 }, { "epoch": 30.53, "grad_norm": 0.8988013863563538, "learning_rate": 0.0007789473684210527, "loss": 0.413, "step": 19140 }, { "epoch": 30.54, "grad_norm": 1.2821409702301025, "learning_rate": 0.0007783094098883573, "loss": 0.3746, "step": 19150 }, { "epoch": 30.56, "grad_norm": 1.8209261894226074, "learning_rate": 0.0007776714513556619, "loss": 0.4328, "step": 19160 }, { "epoch": 30.57, "grad_norm": 0.5443445444107056, "learning_rate": 0.0007770334928229666, "loss": 0.3094, "step": 19170 }, { "epoch": 30.59, "grad_norm": 1.1508780717849731, "learning_rate": 0.0007763955342902711, "loss": 0.3781, "step": 19180 }, { "epoch": 30.61, "grad_norm": 1.742360234260559, "learning_rate": 0.0007757575757575758, "loss": 0.4211, "step": 19190 }, { "epoch": 30.62, "grad_norm": 1.645337462425232, "learning_rate": 0.0007751196172248805, "loss": 0.4973, "step": 19200 }, { "epoch": 30.64, "grad_norm": 3.0447423458099365, "learning_rate": 0.000774481658692185, "loss": 0.4262, "step": 19210 }, { "epoch": 30.65, "grad_norm": 1.7042852640151978, "learning_rate": 0.0007738437001594897, "loss": 0.3602, "step": 19220 }, { "epoch": 30.67, "grad_norm": 1.517238974571228, "learning_rate": 0.0007732057416267943, "loss": 0.4683, "step": 19230 }, { "epoch": 30.69, "grad_norm": 1.3933392763137817, "learning_rate": 0.0007725677830940989, "loss": 0.3464, "step": 19240 }, { "epoch": 30.7, "grad_norm": 1.2021284103393555, "learning_rate": 0.0007719298245614035, "loss": 0.3205, "step": 19250 }, { "epoch": 30.72, "grad_norm": 2.1373493671417236, "learning_rate": 0.0007712918660287082, "loss": 0.3901, "step": 19260 }, { "epoch": 30.73, "grad_norm": 1.1844630241394043, "learning_rate": 0.0007706539074960128, "loss": 0.4874, "step": 19270 }, { "epoch": 30.75, "grad_norm": 1.7633776664733887, "learning_rate": 0.0007700159489633174, "loss": 0.4525, "step": 19280 }, { "epoch": 30.77, "grad_norm": 0.8574751615524292, "learning_rate": 0.0007693779904306221, "loss": 0.4477, "step": 19290 }, { "epoch": 30.78, "grad_norm": 0.9806014895439148, "learning_rate": 0.0007687400318979266, "loss": 0.4231, "step": 19300 }, { "epoch": 30.8, "grad_norm": 1.515453577041626, "learning_rate": 0.0007681020733652313, "loss": 0.4159, "step": 19310 }, { "epoch": 30.81, "grad_norm": 1.195142149925232, "learning_rate": 0.0007674641148325359, "loss": 0.463, "step": 19320 }, { "epoch": 30.83, "grad_norm": 1.2401401996612549, "learning_rate": 0.0007668261562998405, "loss": 0.3719, "step": 19330 }, { "epoch": 30.85, "grad_norm": 1.4845614433288574, "learning_rate": 0.0007661881977671451, "loss": 0.4013, "step": 19340 }, { "epoch": 30.86, "grad_norm": 1.5066015720367432, "learning_rate": 0.0007655502392344498, "loss": 0.3917, "step": 19350 }, { "epoch": 30.88, "grad_norm": 1.3425683975219727, "learning_rate": 0.0007649122807017545, "loss": 0.4719, "step": 19360 }, { "epoch": 30.89, "grad_norm": 1.1638840436935425, "learning_rate": 0.000764274322169059, "loss": 0.3526, "step": 19370 }, { "epoch": 30.91, "grad_norm": 1.2238682508468628, "learning_rate": 0.0007636363636363637, "loss": 0.4084, "step": 19380 }, { "epoch": 30.93, "grad_norm": 2.78072190284729, "learning_rate": 0.0007629984051036683, "loss": 0.4198, "step": 19390 }, { "epoch": 30.94, "grad_norm": 1.495713472366333, "learning_rate": 0.0007623604465709729, "loss": 0.451, "step": 19400 }, { "epoch": 30.96, "grad_norm": 1.5464080572128296, "learning_rate": 0.0007617224880382775, "loss": 0.492, "step": 19410 }, { "epoch": 30.97, "grad_norm": 1.054413080215454, "learning_rate": 0.0007610845295055822, "loss": 0.372, "step": 19420 }, { "epoch": 30.99, "grad_norm": 1.212849736213684, "learning_rate": 0.0007604465709728868, "loss": 0.4701, "step": 19430 }, { "epoch": 31.0, "grad_norm": 1.4419000148773193, "learning_rate": 0.0007598086124401914, "loss": 0.4366, "step": 19440 }, { "epoch": 31.02, "grad_norm": 1.8938413858413696, "learning_rate": 0.0007591706539074961, "loss": 0.3519, "step": 19450 }, { "epoch": 31.04, "grad_norm": 0.6526773571968079, "learning_rate": 0.0007585326953748006, "loss": 0.3049, "step": 19460 }, { "epoch": 31.05, "grad_norm": 1.1578338146209717, "learning_rate": 0.0007578947368421053, "loss": 0.32, "step": 19470 }, { "epoch": 31.07, "grad_norm": 1.577438235282898, "learning_rate": 0.0007572567783094099, "loss": 0.3386, "step": 19480 }, { "epoch": 31.08, "grad_norm": 1.2335830926895142, "learning_rate": 0.0007566188197767145, "loss": 0.3304, "step": 19490 }, { "epoch": 31.1, "grad_norm": 1.2585694789886475, "learning_rate": 0.0007559808612440191, "loss": 0.347, "step": 19500 }, { "epoch": 31.12, "grad_norm": 0.6757459044456482, "learning_rate": 0.0007553429027113238, "loss": 0.3262, "step": 19510 }, { "epoch": 31.13, "grad_norm": 1.0831152200698853, "learning_rate": 0.0007547049441786284, "loss": 0.3829, "step": 19520 }, { "epoch": 31.15, "grad_norm": 0.7511752843856812, "learning_rate": 0.000754066985645933, "loss": 0.3401, "step": 19530 }, { "epoch": 31.16, "grad_norm": 1.0880722999572754, "learning_rate": 0.0007534290271132377, "loss": 0.3095, "step": 19540 }, { "epoch": 31.18, "grad_norm": 0.5444307923316956, "learning_rate": 0.0007527910685805422, "loss": 0.3768, "step": 19550 }, { "epoch": 31.2, "grad_norm": 0.3923839330673218, "learning_rate": 0.0007521531100478469, "loss": 0.2829, "step": 19560 }, { "epoch": 31.21, "grad_norm": 1.0065523386001587, "learning_rate": 0.0007515151515151515, "loss": 0.3505, "step": 19570 }, { "epoch": 31.23, "grad_norm": 1.3828843832015991, "learning_rate": 0.0007508771929824561, "loss": 0.3315, "step": 19580 }, { "epoch": 31.24, "grad_norm": 1.0128448009490967, "learning_rate": 0.0007502392344497608, "loss": 0.3646, "step": 19590 }, { "epoch": 31.26, "grad_norm": 0.8419422507286072, "learning_rate": 0.0007496012759170654, "loss": 0.3779, "step": 19600 }, { "epoch": 31.28, "grad_norm": 0.926729679107666, "learning_rate": 0.00074896331738437, "loss": 0.3611, "step": 19610 }, { "epoch": 31.29, "grad_norm": 0.6536783576011658, "learning_rate": 0.0007483253588516746, "loss": 0.4033, "step": 19620 }, { "epoch": 31.31, "grad_norm": 0.8884857892990112, "learning_rate": 0.0007476874003189793, "loss": 0.3712, "step": 19630 }, { "epoch": 31.32, "grad_norm": 0.7593963146209717, "learning_rate": 0.0007470494417862838, "loss": 0.3676, "step": 19640 }, { "epoch": 31.34, "grad_norm": 0.4570366442203522, "learning_rate": 0.0007464114832535885, "loss": 0.3733, "step": 19650 }, { "epoch": 31.36, "grad_norm": 0.5554494857788086, "learning_rate": 0.0007457735247208931, "loss": 0.3223, "step": 19660 }, { "epoch": 31.37, "grad_norm": 0.5048463344573975, "learning_rate": 0.0007451355661881977, "loss": 0.3381, "step": 19670 }, { "epoch": 31.39, "grad_norm": 0.8749020099639893, "learning_rate": 0.0007444976076555024, "loss": 0.4279, "step": 19680 }, { "epoch": 31.4, "grad_norm": 1.8333324193954468, "learning_rate": 0.000743859649122807, "loss": 0.3923, "step": 19690 }, { "epoch": 31.42, "grad_norm": 0.5864129662513733, "learning_rate": 0.0007432216905901116, "loss": 0.3666, "step": 19700 }, { "epoch": 31.44, "grad_norm": 1.0455960035324097, "learning_rate": 0.0007425837320574162, "loss": 0.2746, "step": 19710 }, { "epoch": 31.45, "grad_norm": 0.851701021194458, "learning_rate": 0.0007419457735247209, "loss": 0.4655, "step": 19720 }, { "epoch": 31.47, "grad_norm": 1.4521914720535278, "learning_rate": 0.0007413078149920254, "loss": 0.3904, "step": 19730 }, { "epoch": 31.48, "grad_norm": 0.7903003096580505, "learning_rate": 0.0007406698564593301, "loss": 0.3511, "step": 19740 }, { "epoch": 31.5, "grad_norm": 0.5965768694877625, "learning_rate": 0.0007400318979266348, "loss": 0.348, "step": 19750 }, { "epoch": 31.52, "grad_norm": 1.1456114053726196, "learning_rate": 0.0007393939393939393, "loss": 0.3759, "step": 19760 }, { "epoch": 31.53, "grad_norm": 1.1218417882919312, "learning_rate": 0.000738755980861244, "loss": 0.3504, "step": 19770 }, { "epoch": 31.55, "grad_norm": 1.06869375705719, "learning_rate": 0.0007381180223285486, "loss": 0.3465, "step": 19780 }, { "epoch": 31.56, "grad_norm": 1.4235601425170898, "learning_rate": 0.0007374800637958534, "loss": 0.4782, "step": 19790 }, { "epoch": 31.58, "grad_norm": 1.7712465524673462, "learning_rate": 0.0007368421052631579, "loss": 0.4207, "step": 19800 }, { "epoch": 31.59, "grad_norm": 2.5546319484710693, "learning_rate": 0.0007362041467304626, "loss": 0.3945, "step": 19810 }, { "epoch": 31.61, "grad_norm": 1.4989862442016602, "learning_rate": 0.0007355661881977672, "loss": 0.3541, "step": 19820 }, { "epoch": 31.63, "grad_norm": 1.1824603080749512, "learning_rate": 0.0007349282296650718, "loss": 0.3834, "step": 19830 }, { "epoch": 31.64, "grad_norm": 1.468544602394104, "learning_rate": 0.0007342902711323765, "loss": 0.4389, "step": 19840 }, { "epoch": 31.66, "grad_norm": 1.5891380310058594, "learning_rate": 0.0007336523125996811, "loss": 0.3104, "step": 19850 }, { "epoch": 31.67, "grad_norm": 1.6829235553741455, "learning_rate": 0.0007330143540669857, "loss": 0.4124, "step": 19860 }, { "epoch": 31.69, "grad_norm": 0.8262020349502563, "learning_rate": 0.0007323763955342903, "loss": 0.4355, "step": 19870 }, { "epoch": 31.71, "grad_norm": 1.0964784622192383, "learning_rate": 0.000731738437001595, "loss": 0.4051, "step": 19880 }, { "epoch": 31.72, "grad_norm": 1.241181492805481, "learning_rate": 0.0007311004784688995, "loss": 0.3317, "step": 19890 }, { "epoch": 31.74, "grad_norm": 0.7173839807510376, "learning_rate": 0.0007304625199362042, "loss": 0.303, "step": 19900 }, { "epoch": 31.75, "grad_norm": 1.3341323137283325, "learning_rate": 0.0007298245614035089, "loss": 0.4809, "step": 19910 }, { "epoch": 31.77, "grad_norm": 0.7707849144935608, "learning_rate": 0.0007291866028708134, "loss": 0.3865, "step": 19920 }, { "epoch": 31.79, "grad_norm": 1.0697121620178223, "learning_rate": 0.0007285486443381181, "loss": 0.3991, "step": 19930 }, { "epoch": 31.8, "grad_norm": 0.8612807989120483, "learning_rate": 0.0007279106858054227, "loss": 0.3753, "step": 19940 }, { "epoch": 31.82, "grad_norm": 1.1503595113754272, "learning_rate": 0.0007272727272727273, "loss": 0.4917, "step": 19950 }, { "epoch": 31.83, "grad_norm": 2.088966131210327, "learning_rate": 0.0007266347687400319, "loss": 0.4367, "step": 19960 }, { "epoch": 31.85, "grad_norm": 1.2572288513183594, "learning_rate": 0.0007259968102073366, "loss": 0.4804, "step": 19970 }, { "epoch": 31.87, "grad_norm": 1.781175136566162, "learning_rate": 0.0007253588516746412, "loss": 0.4508, "step": 19980 }, { "epoch": 31.88, "grad_norm": 1.0523390769958496, "learning_rate": 0.0007247208931419458, "loss": 0.364, "step": 19990 }, { "epoch": 31.9, "grad_norm": 1.5974029302597046, "learning_rate": 0.0007240829346092505, "loss": 0.3279, "step": 20000 }, { "epoch": 31.91, "grad_norm": 1.0655145645141602, "learning_rate": 0.000723444976076555, "loss": 0.3507, "step": 20010 }, { "epoch": 31.93, "grad_norm": 1.4828819036483765, "learning_rate": 0.0007228070175438597, "loss": 0.3621, "step": 20020 }, { "epoch": 31.95, "grad_norm": 0.6305584907531738, "learning_rate": 0.0007221690590111643, "loss": 0.47, "step": 20030 }, { "epoch": 31.96, "grad_norm": 0.873736560344696, "learning_rate": 0.0007215311004784689, "loss": 0.3372, "step": 20040 }, { "epoch": 31.98, "grad_norm": 0.7609186768531799, "learning_rate": 0.0007208931419457735, "loss": 0.4067, "step": 20050 }, { "epoch": 31.99, "grad_norm": 0.638640284538269, "learning_rate": 0.0007202551834130782, "loss": 0.4417, "step": 20060 }, { "epoch": 32.01, "grad_norm": 1.3461627960205078, "learning_rate": 0.0007196172248803828, "loss": 0.3463, "step": 20070 }, { "epoch": 32.03, "grad_norm": 0.9440305233001709, "learning_rate": 0.0007189792663476874, "loss": 0.3378, "step": 20080 }, { "epoch": 32.04, "grad_norm": 1.406764268875122, "learning_rate": 0.0007183413078149921, "loss": 0.3547, "step": 20090 }, { "epoch": 32.06, "grad_norm": 1.0603829622268677, "learning_rate": 0.0007177033492822966, "loss": 0.303, "step": 20100 }, { "epoch": 32.07, "grad_norm": 0.5735631585121155, "learning_rate": 0.0007170653907496013, "loss": 0.321, "step": 20110 }, { "epoch": 32.09, "grad_norm": 0.5425032377243042, "learning_rate": 0.0007164274322169059, "loss": 0.3486, "step": 20120 }, { "epoch": 32.11, "grad_norm": 0.4281626045703888, "learning_rate": 0.0007157894736842105, "loss": 0.4408, "step": 20130 }, { "epoch": 32.12, "grad_norm": 0.6306964755058289, "learning_rate": 0.0007151515151515152, "loss": 0.3664, "step": 20140 }, { "epoch": 32.14, "grad_norm": 1.3869534730911255, "learning_rate": 0.0007145135566188198, "loss": 0.3345, "step": 20150 }, { "epoch": 32.15, "grad_norm": 1.281069040298462, "learning_rate": 0.0007138755980861245, "loss": 0.3758, "step": 20160 }, { "epoch": 32.17, "grad_norm": 0.7174175381660461, "learning_rate": 0.000713237639553429, "loss": 0.4612, "step": 20170 }, { "epoch": 32.19, "grad_norm": 0.7497925758361816, "learning_rate": 0.0007125996810207337, "loss": 0.3671, "step": 20180 }, { "epoch": 32.2, "grad_norm": 0.6942813992500305, "learning_rate": 0.0007119617224880383, "loss": 0.3147, "step": 20190 }, { "epoch": 32.22, "grad_norm": 1.1213644742965698, "learning_rate": 0.0007113237639553429, "loss": 0.3739, "step": 20200 }, { "epoch": 32.23, "grad_norm": 0.7664075493812561, "learning_rate": 0.0007106858054226475, "loss": 0.3105, "step": 20210 }, { "epoch": 32.25, "grad_norm": 0.9661602973937988, "learning_rate": 0.0007100478468899522, "loss": 0.3266, "step": 20220 }, { "epoch": 32.26, "grad_norm": 1.2888504266738892, "learning_rate": 0.0007094098883572568, "loss": 0.3405, "step": 20230 }, { "epoch": 32.28, "grad_norm": 0.829325258731842, "learning_rate": 0.0007087719298245614, "loss": 0.3778, "step": 20240 }, { "epoch": 32.3, "grad_norm": 0.8283563256263733, "learning_rate": 0.0007081339712918661, "loss": 0.389, "step": 20250 }, { "epoch": 32.31, "grad_norm": 2.8405203819274902, "learning_rate": 0.0007074960127591706, "loss": 0.4051, "step": 20260 }, { "epoch": 32.33, "grad_norm": 0.9580861330032349, "learning_rate": 0.0007068580542264753, "loss": 0.3376, "step": 20270 }, { "epoch": 32.34, "grad_norm": 1.7252624034881592, "learning_rate": 0.0007062200956937799, "loss": 0.3626, "step": 20280 }, { "epoch": 32.36, "grad_norm": 0.8010210990905762, "learning_rate": 0.0007055821371610845, "loss": 0.3524, "step": 20290 }, { "epoch": 32.38, "grad_norm": 0.7796013951301575, "learning_rate": 0.0007049441786283892, "loss": 0.3778, "step": 20300 }, { "epoch": 32.39, "grad_norm": 0.8661381602287292, "learning_rate": 0.0007043062200956938, "loss": 0.4093, "step": 20310 }, { "epoch": 32.41, "grad_norm": 0.987169623374939, "learning_rate": 0.0007036682615629984, "loss": 0.2886, "step": 20320 }, { "epoch": 32.42, "grad_norm": 1.090738296508789, "learning_rate": 0.000703030303030303, "loss": 0.3848, "step": 20330 }, { "epoch": 32.44, "grad_norm": 2.1070291996002197, "learning_rate": 0.0007023923444976077, "loss": 0.3214, "step": 20340 }, { "epoch": 32.46, "grad_norm": 1.216748833656311, "learning_rate": 0.0007017543859649122, "loss": 0.3446, "step": 20350 }, { "epoch": 32.47, "grad_norm": 0.8944370150566101, "learning_rate": 0.0007011164274322169, "loss": 0.3595, "step": 20360 }, { "epoch": 32.49, "grad_norm": 0.7445225119590759, "learning_rate": 0.0007004784688995215, "loss": 0.3376, "step": 20370 }, { "epoch": 32.5, "grad_norm": 0.584930419921875, "learning_rate": 0.0006998405103668261, "loss": 0.361, "step": 20380 }, { "epoch": 32.52, "grad_norm": 1.1852445602416992, "learning_rate": 0.0006992025518341308, "loss": 0.423, "step": 20390 }, { "epoch": 32.54, "grad_norm": 0.8096782565116882, "learning_rate": 0.0006985645933014354, "loss": 0.3028, "step": 20400 }, { "epoch": 32.55, "grad_norm": 0.7330004572868347, "learning_rate": 0.00069792663476874, "loss": 0.3499, "step": 20410 }, { "epoch": 32.57, "grad_norm": 0.556844174861908, "learning_rate": 0.0006972886762360446, "loss": 0.4042, "step": 20420 }, { "epoch": 32.58, "grad_norm": 1.2681604623794556, "learning_rate": 0.0006966507177033493, "loss": 0.2892, "step": 20430 }, { "epoch": 32.6, "grad_norm": 0.985937237739563, "learning_rate": 0.0006960127591706538, "loss": 0.367, "step": 20440 }, { "epoch": 32.62, "grad_norm": 0.685664176940918, "learning_rate": 0.0006953748006379585, "loss": 0.4311, "step": 20450 }, { "epoch": 32.63, "grad_norm": 0.580774188041687, "learning_rate": 0.0006947368421052632, "loss": 0.3973, "step": 20460 }, { "epoch": 32.65, "grad_norm": 0.5588364601135254, "learning_rate": 0.0006940988835725677, "loss": 0.3902, "step": 20470 }, { "epoch": 32.66, "grad_norm": 2.530954599380493, "learning_rate": 0.0006934609250398724, "loss": 0.4405, "step": 20480 }, { "epoch": 32.68, "grad_norm": 0.9018158316612244, "learning_rate": 0.000692822966507177, "loss": 0.4219, "step": 20490 }, { "epoch": 32.7, "grad_norm": 1.1370121240615845, "learning_rate": 0.0006921850079744816, "loss": 0.3901, "step": 20500 }, { "epoch": 32.71, "grad_norm": 0.9494215250015259, "learning_rate": 0.0006915470494417862, "loss": 0.3377, "step": 20510 }, { "epoch": 32.73, "grad_norm": 0.7825329899787903, "learning_rate": 0.0006909090909090909, "loss": 0.3479, "step": 20520 }, { "epoch": 32.74, "grad_norm": 1.0042078495025635, "learning_rate": 0.0006902711323763954, "loss": 0.3888, "step": 20530 }, { "epoch": 32.76, "grad_norm": 1.4132115840911865, "learning_rate": 0.0006896331738437002, "loss": 0.3898, "step": 20540 }, { "epoch": 32.78, "grad_norm": 0.9790666103363037, "learning_rate": 0.0006889952153110049, "loss": 0.4135, "step": 20550 }, { "epoch": 32.79, "grad_norm": 1.599612832069397, "learning_rate": 0.0006883572567783095, "loss": 0.4875, "step": 20560 }, { "epoch": 32.81, "grad_norm": 0.934172511100769, "learning_rate": 0.0006877192982456141, "loss": 0.4245, "step": 20570 }, { "epoch": 32.82, "grad_norm": 1.2698485851287842, "learning_rate": 0.0006870813397129187, "loss": 0.3811, "step": 20580 }, { "epoch": 32.84, "grad_norm": 1.3154641389846802, "learning_rate": 0.0006864433811802234, "loss": 0.3682, "step": 20590 }, { "epoch": 32.85, "grad_norm": 0.9714843034744263, "learning_rate": 0.0006858054226475279, "loss": 0.3105, "step": 20600 }, { "epoch": 32.87, "grad_norm": 0.6481144428253174, "learning_rate": 0.0006851674641148326, "loss": 0.3489, "step": 20610 }, { "epoch": 32.89, "grad_norm": 1.0251431465148926, "learning_rate": 0.0006845295055821373, "loss": 0.382, "step": 20620 }, { "epoch": 32.9, "grad_norm": 1.046749234199524, "learning_rate": 0.0006838915470494418, "loss": 0.4344, "step": 20630 }, { "epoch": 32.92, "grad_norm": 1.45224928855896, "learning_rate": 0.0006832535885167465, "loss": 0.3381, "step": 20640 }, { "epoch": 32.93, "grad_norm": 1.0642521381378174, "learning_rate": 0.0006826156299840511, "loss": 0.4037, "step": 20650 }, { "epoch": 32.95, "grad_norm": 0.7638436555862427, "learning_rate": 0.0006819776714513557, "loss": 0.4348, "step": 20660 }, { "epoch": 32.97, "grad_norm": 0.9343836307525635, "learning_rate": 0.0006813397129186603, "loss": 0.4039, "step": 20670 }, { "epoch": 32.98, "grad_norm": 0.6389197707176208, "learning_rate": 0.000680701754385965, "loss": 0.3571, "step": 20680 }, { "epoch": 33.0, "grad_norm": 1.0805469751358032, "learning_rate": 0.0006800637958532696, "loss": 0.3274, "step": 20690 }, { "epoch": 33.01, "grad_norm": 1.199524998664856, "learning_rate": 0.0006794258373205742, "loss": 0.3143, "step": 20700 }, { "epoch": 33.03, "grad_norm": 1.2967311143875122, "learning_rate": 0.0006787878787878789, "loss": 0.3555, "step": 20710 }, { "epoch": 33.05, "grad_norm": 1.0752925872802734, "learning_rate": 0.0006781499202551834, "loss": 0.3797, "step": 20720 }, { "epoch": 33.06, "grad_norm": 0.7720149159431458, "learning_rate": 0.0006775119617224881, "loss": 0.2873, "step": 20730 }, { "epoch": 33.08, "grad_norm": 0.6133707761764526, "learning_rate": 0.0006768740031897927, "loss": 0.3092, "step": 20740 }, { "epoch": 33.09, "grad_norm": 0.8874982595443726, "learning_rate": 0.0006762360446570973, "loss": 0.3187, "step": 20750 }, { "epoch": 33.11, "grad_norm": 1.3732993602752686, "learning_rate": 0.0006755980861244019, "loss": 0.2782, "step": 20760 }, { "epoch": 33.13, "grad_norm": 0.9452306032180786, "learning_rate": 0.0006749601275917066, "loss": 0.2779, "step": 20770 }, { "epoch": 33.14, "grad_norm": 1.7680912017822266, "learning_rate": 0.0006743221690590112, "loss": 0.2884, "step": 20780 }, { "epoch": 33.16, "grad_norm": 1.6482670307159424, "learning_rate": 0.0006736842105263158, "loss": 0.359, "step": 20790 }, { "epoch": 33.17, "grad_norm": 0.7076551914215088, "learning_rate": 0.0006730462519936205, "loss": 0.3134, "step": 20800 }, { "epoch": 33.19, "grad_norm": 0.630064845085144, "learning_rate": 0.000672408293460925, "loss": 0.4372, "step": 20810 }, { "epoch": 33.21, "grad_norm": 1.7952457666397095, "learning_rate": 0.0006717703349282297, "loss": 0.3711, "step": 20820 }, { "epoch": 33.22, "grad_norm": 2.3427815437316895, "learning_rate": 0.0006711323763955343, "loss": 0.3385, "step": 20830 }, { "epoch": 33.24, "grad_norm": 0.6796151995658875, "learning_rate": 0.0006704944178628389, "loss": 0.2952, "step": 20840 }, { "epoch": 33.25, "grad_norm": 1.292067527770996, "learning_rate": 0.0006698564593301436, "loss": 0.3058, "step": 20850 }, { "epoch": 33.27, "grad_norm": 0.8857368230819702, "learning_rate": 0.0006692185007974482, "loss": 0.3184, "step": 20860 }, { "epoch": 33.29, "grad_norm": 0.8489099740982056, "learning_rate": 0.0006685805422647528, "loss": 0.3405, "step": 20870 }, { "epoch": 33.3, "grad_norm": 0.4213086664676666, "learning_rate": 0.0006679425837320574, "loss": 0.2919, "step": 20880 }, { "epoch": 33.32, "grad_norm": 0.3164719343185425, "learning_rate": 0.0006673046251993621, "loss": 0.3406, "step": 20890 }, { "epoch": 33.33, "grad_norm": 1.2104874849319458, "learning_rate": 0.0006666666666666666, "loss": 0.3277, "step": 20900 }, { "epoch": 33.35, "grad_norm": 1.2871508598327637, "learning_rate": 0.0006660287081339713, "loss": 0.4363, "step": 20910 }, { "epoch": 33.37, "grad_norm": 1.1551439762115479, "learning_rate": 0.0006653907496012759, "loss": 0.4146, "step": 20920 }, { "epoch": 33.38, "grad_norm": 0.4967116713523865, "learning_rate": 0.0006647527910685805, "loss": 0.3614, "step": 20930 }, { "epoch": 33.4, "grad_norm": 1.4939340353012085, "learning_rate": 0.0006641148325358852, "loss": 0.2574, "step": 20940 }, { "epoch": 33.41, "grad_norm": 2.036379337310791, "learning_rate": 0.0006634768740031898, "loss": 0.2869, "step": 20950 }, { "epoch": 33.43, "grad_norm": 0.6936982870101929, "learning_rate": 0.0006628389154704945, "loss": 0.3495, "step": 20960 }, { "epoch": 33.44, "grad_norm": 1.4173444509506226, "learning_rate": 0.000662200956937799, "loss": 0.4171, "step": 20970 }, { "epoch": 33.46, "grad_norm": 0.9318954348564148, "learning_rate": 0.0006615629984051037, "loss": 0.368, "step": 20980 }, { "epoch": 33.48, "grad_norm": 0.6373530030250549, "learning_rate": 0.0006609250398724083, "loss": 0.3407, "step": 20990 }, { "epoch": 33.49, "grad_norm": 0.5435881614685059, "learning_rate": 0.0006602870813397129, "loss": 0.3582, "step": 21000 }, { "epoch": 33.51, "grad_norm": 0.5529409050941467, "learning_rate": 0.0006596491228070176, "loss": 0.397, "step": 21010 }, { "epoch": 33.52, "grad_norm": 0.6477614641189575, "learning_rate": 0.0006590111642743222, "loss": 0.4706, "step": 21020 }, { "epoch": 33.54, "grad_norm": 0.6400772929191589, "learning_rate": 0.0006583732057416268, "loss": 0.3936, "step": 21030 }, { "epoch": 33.56, "grad_norm": 0.7568894624710083, "learning_rate": 0.0006577352472089314, "loss": 0.2936, "step": 21040 }, { "epoch": 33.57, "grad_norm": 0.6574100852012634, "learning_rate": 0.0006570972886762361, "loss": 0.3454, "step": 21050 }, { "epoch": 33.59, "grad_norm": 0.6999270915985107, "learning_rate": 0.0006564593301435406, "loss": 0.3597, "step": 21060 }, { "epoch": 33.6, "grad_norm": 1.4358758926391602, "learning_rate": 0.0006558213716108453, "loss": 0.4673, "step": 21070 }, { "epoch": 33.62, "grad_norm": 0.8735805749893188, "learning_rate": 0.0006551834130781499, "loss": 0.3329, "step": 21080 }, { "epoch": 33.64, "grad_norm": 0.42257770895957947, "learning_rate": 0.0006545454545454545, "loss": 0.3934, "step": 21090 }, { "epoch": 33.65, "grad_norm": 0.5228465795516968, "learning_rate": 0.0006539074960127592, "loss": 0.3563, "step": 21100 }, { "epoch": 33.67, "grad_norm": 0.7257753014564514, "learning_rate": 0.0006532695374800638, "loss": 0.3194, "step": 21110 }, { "epoch": 33.68, "grad_norm": 1.101475477218628, "learning_rate": 0.0006526315789473684, "loss": 0.3633, "step": 21120 }, { "epoch": 33.7, "grad_norm": 1.2462613582611084, "learning_rate": 0.000651993620414673, "loss": 0.3845, "step": 21130 }, { "epoch": 33.72, "grad_norm": 0.8615121841430664, "learning_rate": 0.0006513556618819777, "loss": 0.3183, "step": 21140 }, { "epoch": 33.73, "grad_norm": 1.6341915130615234, "learning_rate": 0.0006507177033492822, "loss": 0.3739, "step": 21150 }, { "epoch": 33.75, "grad_norm": 0.7129934430122375, "learning_rate": 0.0006500797448165869, "loss": 0.3184, "step": 21160 }, { "epoch": 33.76, "grad_norm": 1.0505317449569702, "learning_rate": 0.0006494417862838916, "loss": 0.3377, "step": 21170 }, { "epoch": 33.78, "grad_norm": 0.6486239433288574, "learning_rate": 0.0006488038277511961, "loss": 0.4064, "step": 21180 }, { "epoch": 33.8, "grad_norm": 0.805962324142456, "learning_rate": 0.0006481658692185008, "loss": 0.4032, "step": 21190 }, { "epoch": 33.81, "grad_norm": 0.8866637349128723, "learning_rate": 0.0006475279106858054, "loss": 0.4194, "step": 21200 }, { "epoch": 33.83, "grad_norm": 2.0624029636383057, "learning_rate": 0.00064688995215311, "loss": 0.3906, "step": 21210 }, { "epoch": 33.84, "grad_norm": 0.9357002377510071, "learning_rate": 0.0006462519936204146, "loss": 0.4074, "step": 21220 }, { "epoch": 33.86, "grad_norm": 0.7102904915809631, "learning_rate": 0.0006456140350877193, "loss": 0.365, "step": 21230 }, { "epoch": 33.88, "grad_norm": 1.7485020160675049, "learning_rate": 0.0006449760765550238, "loss": 0.3714, "step": 21240 }, { "epoch": 33.89, "grad_norm": 1.0567692518234253, "learning_rate": 0.0006443381180223285, "loss": 0.3404, "step": 21250 }, { "epoch": 33.91, "grad_norm": 1.1951782703399658, "learning_rate": 0.0006437001594896332, "loss": 0.3729, "step": 21260 }, { "epoch": 33.92, "grad_norm": 1.056022047996521, "learning_rate": 0.0006430622009569377, "loss": 0.4126, "step": 21270 }, { "epoch": 33.94, "grad_norm": 0.45082104206085205, "learning_rate": 0.0006424242424242425, "loss": 0.3527, "step": 21280 }, { "epoch": 33.96, "grad_norm": 0.9164630174636841, "learning_rate": 0.0006417862838915471, "loss": 0.3731, "step": 21290 }, { "epoch": 33.97, "grad_norm": 1.3435860872268677, "learning_rate": 0.0006411483253588518, "loss": 0.3889, "step": 21300 }, { "epoch": 33.99, "grad_norm": 1.0820218324661255, "learning_rate": 0.0006405103668261563, "loss": 0.3823, "step": 21310 }, { "epoch": 34.0, "grad_norm": 0.43368542194366455, "learning_rate": 0.000639872408293461, "loss": 0.2854, "step": 21320 }, { "epoch": 34.02, "grad_norm": 1.282607913017273, "learning_rate": 0.0006392344497607657, "loss": 0.3346, "step": 21330 }, { "epoch": 34.04, "grad_norm": 0.48912495374679565, "learning_rate": 0.0006385964912280702, "loss": 0.3253, "step": 21340 }, { "epoch": 34.05, "grad_norm": 0.5396437644958496, "learning_rate": 0.0006379585326953749, "loss": 0.3848, "step": 21350 }, { "epoch": 34.07, "grad_norm": 0.43667685985565186, "learning_rate": 0.0006373205741626795, "loss": 0.389, "step": 21360 }, { "epoch": 34.08, "grad_norm": 0.7458956837654114, "learning_rate": 0.0006366826156299841, "loss": 0.3669, "step": 21370 }, { "epoch": 34.1, "grad_norm": 0.5539690256118774, "learning_rate": 0.0006360446570972887, "loss": 0.3428, "step": 21380 }, { "epoch": 34.11, "grad_norm": 0.5653215646743774, "learning_rate": 0.0006354066985645934, "loss": 0.3716, "step": 21390 }, { "epoch": 34.13, "grad_norm": 0.7448431849479675, "learning_rate": 0.0006347687400318979, "loss": 0.3799, "step": 21400 }, { "epoch": 34.15, "grad_norm": 1.6342990398406982, "learning_rate": 0.0006341307814992026, "loss": 0.281, "step": 21410 }, { "epoch": 34.16, "grad_norm": 1.0939606428146362, "learning_rate": 0.0006334928229665073, "loss": 0.2995, "step": 21420 }, { "epoch": 34.18, "grad_norm": 0.4550718665122986, "learning_rate": 0.0006328548644338118, "loss": 0.2839, "step": 21430 }, { "epoch": 34.19, "grad_norm": 0.7015230655670166, "learning_rate": 0.0006322169059011165, "loss": 0.3624, "step": 21440 }, { "epoch": 34.21, "grad_norm": 0.9311388731002808, "learning_rate": 0.0006315789473684211, "loss": 0.3116, "step": 21450 }, { "epoch": 34.23, "grad_norm": 0.519597053527832, "learning_rate": 0.0006309409888357257, "loss": 0.3464, "step": 21460 }, { "epoch": 34.24, "grad_norm": 0.687154233455658, "learning_rate": 0.0006303030303030303, "loss": 0.2955, "step": 21470 }, { "epoch": 34.26, "grad_norm": 0.6777644753456116, "learning_rate": 0.000629665071770335, "loss": 0.3579, "step": 21480 }, { "epoch": 34.27, "grad_norm": 1.1561830043792725, "learning_rate": 0.0006290271132376396, "loss": 0.324, "step": 21490 }, { "epoch": 34.29, "grad_norm": 0.6058475375175476, "learning_rate": 0.0006283891547049442, "loss": 0.4582, "step": 21500 }, { "epoch": 34.31, "grad_norm": 1.3952281475067139, "learning_rate": 0.0006277511961722489, "loss": 0.2972, "step": 21510 }, { "epoch": 34.32, "grad_norm": 0.9021815061569214, "learning_rate": 0.0006271132376395534, "loss": 0.3736, "step": 21520 }, { "epoch": 34.34, "grad_norm": 0.5777958631515503, "learning_rate": 0.0006264752791068581, "loss": 0.3418, "step": 21530 }, { "epoch": 34.35, "grad_norm": 0.5624024271965027, "learning_rate": 0.0006258373205741627, "loss": 0.3897, "step": 21540 }, { "epoch": 34.37, "grad_norm": 1.0554344654083252, "learning_rate": 0.0006251993620414673, "loss": 0.3636, "step": 21550 }, { "epoch": 34.39, "grad_norm": 0.39624953269958496, "learning_rate": 0.000624561403508772, "loss": 0.3199, "step": 21560 }, { "epoch": 34.4, "grad_norm": 0.8201066255569458, "learning_rate": 0.0006239234449760766, "loss": 0.4237, "step": 21570 }, { "epoch": 34.42, "grad_norm": 0.7447034120559692, "learning_rate": 0.0006232854864433812, "loss": 0.3045, "step": 21580 }, { "epoch": 34.43, "grad_norm": 0.37216076254844666, "learning_rate": 0.0006226475279106858, "loss": 0.287, "step": 21590 }, { "epoch": 34.45, "grad_norm": 1.3851195573806763, "learning_rate": 0.0006220095693779905, "loss": 0.3752, "step": 21600 }, { "epoch": 34.47, "grad_norm": 0.5135475397109985, "learning_rate": 0.000621371610845295, "loss": 0.2575, "step": 21610 }, { "epoch": 34.48, "grad_norm": 1.3252980709075928, "learning_rate": 0.0006207336523125997, "loss": 0.3204, "step": 21620 }, { "epoch": 34.5, "grad_norm": 1.036947250366211, "learning_rate": 0.0006200956937799043, "loss": 0.3715, "step": 21630 }, { "epoch": 34.51, "grad_norm": 0.9725881218910217, "learning_rate": 0.0006194577352472089, "loss": 0.2895, "step": 21640 }, { "epoch": 34.53, "grad_norm": 0.8383840322494507, "learning_rate": 0.0006188197767145136, "loss": 0.2823, "step": 21650 }, { "epoch": 34.55, "grad_norm": 0.5011244416236877, "learning_rate": 0.0006181818181818182, "loss": 0.2946, "step": 21660 }, { "epoch": 34.56, "grad_norm": 0.5851901769638062, "learning_rate": 0.0006175438596491228, "loss": 0.3679, "step": 21670 }, { "epoch": 34.58, "grad_norm": 1.2106326818466187, "learning_rate": 0.0006169059011164274, "loss": 0.3548, "step": 21680 }, { "epoch": 34.59, "grad_norm": 0.7996150255203247, "learning_rate": 0.0006162679425837321, "loss": 0.3279, "step": 21690 }, { "epoch": 34.61, "grad_norm": 0.9852333664894104, "learning_rate": 0.0006156299840510366, "loss": 0.3803, "step": 21700 }, { "epoch": 34.63, "grad_norm": 1.8588385581970215, "learning_rate": 0.0006149920255183413, "loss": 0.3173, "step": 21710 }, { "epoch": 34.64, "grad_norm": 1.403646469116211, "learning_rate": 0.000614354066985646, "loss": 0.383, "step": 21720 }, { "epoch": 34.66, "grad_norm": 0.7591367363929749, "learning_rate": 0.0006137161084529505, "loss": 0.3134, "step": 21730 }, { "epoch": 34.67, "grad_norm": 0.8111428022384644, "learning_rate": 0.0006130781499202552, "loss": 0.4017, "step": 21740 }, { "epoch": 34.69, "grad_norm": 0.7398600578308105, "learning_rate": 0.0006124401913875598, "loss": 0.3378, "step": 21750 }, { "epoch": 34.7, "grad_norm": 2.6811933517456055, "learning_rate": 0.0006118022328548645, "loss": 0.4099, "step": 21760 }, { "epoch": 34.72, "grad_norm": 0.5849010944366455, "learning_rate": 0.000611164274322169, "loss": 0.3977, "step": 21770 }, { "epoch": 34.74, "grad_norm": 1.6892285346984863, "learning_rate": 0.0006105263157894737, "loss": 0.3775, "step": 21780 }, { "epoch": 34.75, "grad_norm": 0.6772777438163757, "learning_rate": 0.0006098883572567783, "loss": 0.3051, "step": 21790 }, { "epoch": 34.77, "grad_norm": 0.7815658450126648, "learning_rate": 0.0006092503987240829, "loss": 0.3252, "step": 21800 }, { "epoch": 34.78, "grad_norm": 0.7828931212425232, "learning_rate": 0.0006086124401913876, "loss": 0.3276, "step": 21810 }, { "epoch": 34.8, "grad_norm": 0.6614720821380615, "learning_rate": 0.0006079744816586922, "loss": 0.3114, "step": 21820 }, { "epoch": 34.82, "grad_norm": 0.6951574087142944, "learning_rate": 0.0006073365231259968, "loss": 0.4708, "step": 21830 }, { "epoch": 34.83, "grad_norm": 0.5724729895591736, "learning_rate": 0.0006066985645933014, "loss": 0.4992, "step": 21840 }, { "epoch": 34.85, "grad_norm": 0.5912214517593384, "learning_rate": 0.0006060606060606061, "loss": 0.3571, "step": 21850 }, { "epoch": 34.86, "grad_norm": 1.9406144618988037, "learning_rate": 0.0006054226475279106, "loss": 0.4081, "step": 21860 }, { "epoch": 34.88, "grad_norm": 0.6928081512451172, "learning_rate": 0.0006047846889952153, "loss": 0.3651, "step": 21870 }, { "epoch": 34.9, "grad_norm": 1.4750044345855713, "learning_rate": 0.00060414673046252, "loss": 0.3445, "step": 21880 }, { "epoch": 34.91, "grad_norm": 1.0808309316635132, "learning_rate": 0.0006035087719298245, "loss": 0.3738, "step": 21890 }, { "epoch": 34.93, "grad_norm": 0.8171405792236328, "learning_rate": 0.0006028708133971292, "loss": 0.3313, "step": 21900 }, { "epoch": 34.94, "grad_norm": 0.9406991004943848, "learning_rate": 0.0006022328548644338, "loss": 0.3488, "step": 21910 }, { "epoch": 34.96, "grad_norm": 0.7322232127189636, "learning_rate": 0.0006015948963317384, "loss": 0.3948, "step": 21920 }, { "epoch": 34.98, "grad_norm": 1.1117455959320068, "learning_rate": 0.000600956937799043, "loss": 0.385, "step": 21930 }, { "epoch": 34.99, "grad_norm": 1.0977877378463745, "learning_rate": 0.0006003189792663477, "loss": 0.419, "step": 21940 }, { "epoch": 35.01, "grad_norm": 0.726335346698761, "learning_rate": 0.0005996810207336522, "loss": 0.3169, "step": 21950 }, { "epoch": 35.02, "grad_norm": 0.49332767724990845, "learning_rate": 0.0005990430622009569, "loss": 0.2902, "step": 21960 }, { "epoch": 35.04, "grad_norm": 0.9816588759422302, "learning_rate": 0.0005984051036682616, "loss": 0.3322, "step": 21970 }, { "epoch": 35.06, "grad_norm": 0.8066359162330627, "learning_rate": 0.0005977671451355661, "loss": 0.3058, "step": 21980 }, { "epoch": 35.07, "grad_norm": 0.38948720693588257, "learning_rate": 0.0005971291866028708, "loss": 0.2839, "step": 21990 }, { "epoch": 35.09, "grad_norm": 0.2944769561290741, "learning_rate": 0.0005964912280701754, "loss": 0.3656, "step": 22000 }, { "epoch": 35.1, "grad_norm": 0.3112677335739136, "learning_rate": 0.00059585326953748, "loss": 0.2358, "step": 22010 }, { "epoch": 35.12, "grad_norm": 2.4940788745880127, "learning_rate": 0.0005952153110047846, "loss": 0.3809, "step": 22020 }, { "epoch": 35.14, "grad_norm": 0.9833939671516418, "learning_rate": 0.0005945773524720894, "loss": 0.3437, "step": 22030 }, { "epoch": 35.15, "grad_norm": 1.0946290493011475, "learning_rate": 0.000593939393939394, "loss": 0.3872, "step": 22040 }, { "epoch": 35.17, "grad_norm": 1.2367923259735107, "learning_rate": 0.0005933014354066986, "loss": 0.3041, "step": 22050 }, { "epoch": 35.18, "grad_norm": 1.032891035079956, "learning_rate": 0.0005926634768740033, "loss": 0.2409, "step": 22060 }, { "epoch": 35.2, "grad_norm": 0.7659148573875427, "learning_rate": 0.0005920255183413078, "loss": 0.3928, "step": 22070 }, { "epoch": 35.22, "grad_norm": 0.337522953748703, "learning_rate": 0.0005913875598086125, "loss": 0.3563, "step": 22080 }, { "epoch": 35.23, "grad_norm": 0.6713753342628479, "learning_rate": 0.0005907496012759171, "loss": 0.3728, "step": 22090 }, { "epoch": 35.25, "grad_norm": 1.1470608711242676, "learning_rate": 0.0005901116427432218, "loss": 0.3121, "step": 22100 }, { "epoch": 35.26, "grad_norm": 0.5234013199806213, "learning_rate": 0.0005894736842105263, "loss": 0.42, "step": 22110 }, { "epoch": 35.28, "grad_norm": 0.6255330443382263, "learning_rate": 0.000588835725677831, "loss": 0.2818, "step": 22120 }, { "epoch": 35.3, "grad_norm": 1.1830130815505981, "learning_rate": 0.0005881977671451357, "loss": 0.3217, "step": 22130 }, { "epoch": 35.31, "grad_norm": 0.47124946117401123, "learning_rate": 0.0005875598086124402, "loss": 0.3054, "step": 22140 }, { "epoch": 35.33, "grad_norm": 0.5270739793777466, "learning_rate": 0.0005869218500797449, "loss": 0.3684, "step": 22150 }, { "epoch": 35.34, "grad_norm": 1.9852588176727295, "learning_rate": 0.0005862838915470495, "loss": 0.3598, "step": 22160 }, { "epoch": 35.36, "grad_norm": 1.0637511014938354, "learning_rate": 0.0005856459330143541, "loss": 0.3553, "step": 22170 }, { "epoch": 35.37, "grad_norm": 0.7305306792259216, "learning_rate": 0.0005850079744816587, "loss": 0.3361, "step": 22180 }, { "epoch": 35.39, "grad_norm": 1.0449053049087524, "learning_rate": 0.0005843700159489634, "loss": 0.45, "step": 22190 }, { "epoch": 35.41, "grad_norm": 0.3895207643508911, "learning_rate": 0.000583732057416268, "loss": 0.38, "step": 22200 }, { "epoch": 35.42, "grad_norm": 0.8981882333755493, "learning_rate": 0.0005830940988835726, "loss": 0.376, "step": 22210 }, { "epoch": 35.44, "grad_norm": 1.1853015422821045, "learning_rate": 0.0005824561403508773, "loss": 0.4054, "step": 22220 }, { "epoch": 35.45, "grad_norm": 0.6197064518928528, "learning_rate": 0.0005818181818181818, "loss": 0.3198, "step": 22230 }, { "epoch": 35.47, "grad_norm": 0.5569806694984436, "learning_rate": 0.0005811802232854865, "loss": 0.4118, "step": 22240 }, { "epoch": 35.49, "grad_norm": 0.48562178015708923, "learning_rate": 0.0005805422647527911, "loss": 0.2063, "step": 22250 }, { "epoch": 35.5, "grad_norm": 0.5743929743766785, "learning_rate": 0.0005799043062200957, "loss": 0.2263, "step": 22260 }, { "epoch": 35.52, "grad_norm": 0.5665689706802368, "learning_rate": 0.0005792663476874004, "loss": 0.3017, "step": 22270 }, { "epoch": 35.53, "grad_norm": 0.7719668745994568, "learning_rate": 0.000578628389154705, "loss": 0.3591, "step": 22280 }, { "epoch": 35.55, "grad_norm": 1.785213828086853, "learning_rate": 0.0005779904306220096, "loss": 0.3357, "step": 22290 }, { "epoch": 35.57, "grad_norm": 0.3386642336845398, "learning_rate": 0.0005773524720893142, "loss": 0.3164, "step": 22300 }, { "epoch": 35.58, "grad_norm": 0.8696405291557312, "learning_rate": 0.0005767145135566189, "loss": 0.3237, "step": 22310 }, { "epoch": 35.6, "grad_norm": 0.32794955372810364, "learning_rate": 0.0005760765550239234, "loss": 0.2756, "step": 22320 }, { "epoch": 35.61, "grad_norm": 0.3796286880970001, "learning_rate": 0.0005754385964912281, "loss": 0.3981, "step": 22330 }, { "epoch": 35.63, "grad_norm": 0.31685948371887207, "learning_rate": 0.0005748006379585327, "loss": 0.3877, "step": 22340 }, { "epoch": 35.65, "grad_norm": 0.3694205582141876, "learning_rate": 0.0005741626794258373, "loss": 0.2914, "step": 22350 }, { "epoch": 35.66, "grad_norm": 0.6097325086593628, "learning_rate": 0.000573524720893142, "loss": 0.2863, "step": 22360 }, { "epoch": 35.68, "grad_norm": 0.7454453110694885, "learning_rate": 0.0005728867623604466, "loss": 0.3531, "step": 22370 }, { "epoch": 35.69, "grad_norm": 0.4996640086174011, "learning_rate": 0.0005722488038277512, "loss": 0.3063, "step": 22380 }, { "epoch": 35.71, "grad_norm": 0.4868077337741852, "learning_rate": 0.0005716108452950558, "loss": 0.3087, "step": 22390 }, { "epoch": 35.73, "grad_norm": 0.3814201056957245, "learning_rate": 0.0005709728867623605, "loss": 0.3242, "step": 22400 }, { "epoch": 35.74, "grad_norm": 0.5458118915557861, "learning_rate": 0.000570334928229665, "loss": 0.2917, "step": 22410 }, { "epoch": 35.76, "grad_norm": 0.7367342114448547, "learning_rate": 0.0005696969696969697, "loss": 0.3678, "step": 22420 }, { "epoch": 35.77, "grad_norm": 0.876809298992157, "learning_rate": 0.0005690590111642744, "loss": 0.2996, "step": 22430 }, { "epoch": 35.79, "grad_norm": 0.392926424741745, "learning_rate": 0.0005684210526315789, "loss": 0.2191, "step": 22440 }, { "epoch": 35.81, "grad_norm": 0.5339792966842651, "learning_rate": 0.0005677830940988836, "loss": 0.3324, "step": 22450 }, { "epoch": 35.82, "grad_norm": 0.31976428627967834, "learning_rate": 0.0005671451355661882, "loss": 0.3723, "step": 22460 }, { "epoch": 35.84, "grad_norm": 1.3592702150344849, "learning_rate": 0.0005665071770334928, "loss": 0.3606, "step": 22470 }, { "epoch": 35.85, "grad_norm": 0.4233976900577545, "learning_rate": 0.0005658692185007974, "loss": 0.329, "step": 22480 }, { "epoch": 35.87, "grad_norm": 0.6980434656143188, "learning_rate": 0.0005652312599681021, "loss": 0.3132, "step": 22490 }, { "epoch": 35.89, "grad_norm": 0.766575813293457, "learning_rate": 0.0005645933014354066, "loss": 0.3447, "step": 22500 }, { "epoch": 35.9, "grad_norm": 0.6142354011535645, "learning_rate": 0.0005639553429027113, "loss": 0.2992, "step": 22510 }, { "epoch": 35.92, "grad_norm": 0.41867053508758545, "learning_rate": 0.000563317384370016, "loss": 0.3276, "step": 22520 }, { "epoch": 35.93, "grad_norm": 0.5943330526351929, "learning_rate": 0.0005626794258373205, "loss": 0.4112, "step": 22530 }, { "epoch": 35.95, "grad_norm": 1.2840982675552368, "learning_rate": 0.0005620414673046252, "loss": 0.406, "step": 22540 }, { "epoch": 35.96, "grad_norm": 0.5472711324691772, "learning_rate": 0.0005614035087719298, "loss": 0.3947, "step": 22550 }, { "epoch": 35.98, "grad_norm": 0.49946820735931396, "learning_rate": 0.0005607655502392345, "loss": 0.3561, "step": 22560 }, { "epoch": 36.0, "grad_norm": 0.5711825489997864, "learning_rate": 0.000560127591706539, "loss": 0.4207, "step": 22570 }, { "epoch": 36.01, "grad_norm": 2.452195882797241, "learning_rate": 0.0005594896331738437, "loss": 0.3378, "step": 22580 }, { "epoch": 36.03, "grad_norm": 0.39312276244163513, "learning_rate": 0.0005588516746411484, "loss": 0.3136, "step": 22590 }, { "epoch": 36.04, "grad_norm": 0.7896597981452942, "learning_rate": 0.0005582137161084529, "loss": 0.3815, "step": 22600 }, { "epoch": 36.06, "grad_norm": 0.5603874921798706, "learning_rate": 0.0005575757575757576, "loss": 0.3509, "step": 22610 }, { "epoch": 36.08, "grad_norm": 0.3025873899459839, "learning_rate": 0.0005569377990430622, "loss": 0.2575, "step": 22620 }, { "epoch": 36.09, "grad_norm": 0.2621009349822998, "learning_rate": 0.0005562998405103668, "loss": 0.2562, "step": 22630 }, { "epoch": 36.11, "grad_norm": 0.6688899397850037, "learning_rate": 0.0005556618819776714, "loss": 0.3194, "step": 22640 }, { "epoch": 36.12, "grad_norm": 2.3156378269195557, "learning_rate": 0.0005550239234449761, "loss": 0.3564, "step": 22650 }, { "epoch": 36.14, "grad_norm": 0.35387060046195984, "learning_rate": 0.0005543859649122806, "loss": 0.3054, "step": 22660 }, { "epoch": 36.16, "grad_norm": 0.3707694709300995, "learning_rate": 0.0005537480063795853, "loss": 0.3391, "step": 22670 }, { "epoch": 36.17, "grad_norm": 0.675459086894989, "learning_rate": 0.00055311004784689, "loss": 0.3202, "step": 22680 }, { "epoch": 36.19, "grad_norm": 0.46194231510162354, "learning_rate": 0.0005524720893141945, "loss": 0.2755, "step": 22690 }, { "epoch": 36.2, "grad_norm": 0.4732086956501007, "learning_rate": 0.0005518341307814992, "loss": 0.2891, "step": 22700 }, { "epoch": 36.22, "grad_norm": 0.5394445061683655, "learning_rate": 0.0005511961722488038, "loss": 0.2845, "step": 22710 }, { "epoch": 36.24, "grad_norm": 0.7429685592651367, "learning_rate": 0.0005505582137161084, "loss": 0.3268, "step": 22720 }, { "epoch": 36.25, "grad_norm": 0.4031120240688324, "learning_rate": 0.000549920255183413, "loss": 0.3197, "step": 22730 }, { "epoch": 36.27, "grad_norm": 1.3633867502212524, "learning_rate": 0.0005492822966507177, "loss": 0.2912, "step": 22740 }, { "epoch": 36.28, "grad_norm": 0.246135875582695, "learning_rate": 0.0005486443381180223, "loss": 0.3485, "step": 22750 }, { "epoch": 36.3, "grad_norm": 0.7717587351799011, "learning_rate": 0.0005480063795853269, "loss": 0.2938, "step": 22760 }, { "epoch": 36.32, "grad_norm": 0.5031578540802002, "learning_rate": 0.0005473684210526317, "loss": 0.3693, "step": 22770 }, { "epoch": 36.33, "grad_norm": 0.46057426929473877, "learning_rate": 0.0005467304625199362, "loss": 0.2782, "step": 22780 }, { "epoch": 36.35, "grad_norm": 0.33407339453697205, "learning_rate": 0.0005460925039872409, "loss": 0.3133, "step": 22790 }, { "epoch": 36.36, "grad_norm": 0.7417854070663452, "learning_rate": 0.0005454545454545455, "loss": 0.3441, "step": 22800 }, { "epoch": 36.38, "grad_norm": 0.3010425567626953, "learning_rate": 0.0005448165869218501, "loss": 0.314, "step": 22810 }, { "epoch": 36.4, "grad_norm": 0.5968150496482849, "learning_rate": 0.0005441786283891547, "loss": 0.3526, "step": 22820 }, { "epoch": 36.41, "grad_norm": 0.8175147771835327, "learning_rate": 0.0005435406698564594, "loss": 0.3451, "step": 22830 }, { "epoch": 36.43, "grad_norm": 1.3906422853469849, "learning_rate": 0.000542902711323764, "loss": 0.2813, "step": 22840 }, { "epoch": 36.44, "grad_norm": 0.47024595737457275, "learning_rate": 0.0005422647527910686, "loss": 0.3559, "step": 22850 }, { "epoch": 36.46, "grad_norm": 0.3460497558116913, "learning_rate": 0.0005416267942583733, "loss": 0.2783, "step": 22860 }, { "epoch": 36.48, "grad_norm": 0.5971447825431824, "learning_rate": 0.0005409888357256778, "loss": 0.3274, "step": 22870 }, { "epoch": 36.49, "grad_norm": 0.9573736190795898, "learning_rate": 0.0005403508771929825, "loss": 0.2442, "step": 22880 }, { "epoch": 36.51, "grad_norm": 0.4627261757850647, "learning_rate": 0.0005397129186602871, "loss": 0.2677, "step": 22890 }, { "epoch": 36.52, "grad_norm": 0.45995354652404785, "learning_rate": 0.0005390749601275918, "loss": 0.3892, "step": 22900 }, { "epoch": 36.54, "grad_norm": 0.2959776818752289, "learning_rate": 0.0005384370015948964, "loss": 0.3216, "step": 22910 }, { "epoch": 36.56, "grad_norm": 0.4786494970321655, "learning_rate": 0.000537799043062201, "loss": 0.3105, "step": 22920 }, { "epoch": 36.57, "grad_norm": 0.462162584066391, "learning_rate": 0.0005371610845295057, "loss": 0.4404, "step": 22930 }, { "epoch": 36.59, "grad_norm": 0.37563401460647583, "learning_rate": 0.0005365231259968102, "loss": 0.2949, "step": 22940 }, { "epoch": 36.6, "grad_norm": 0.4217167794704437, "learning_rate": 0.0005358851674641149, "loss": 0.425, "step": 22950 }, { "epoch": 36.62, "grad_norm": 0.5127308964729309, "learning_rate": 0.0005352472089314195, "loss": 0.3215, "step": 22960 }, { "epoch": 36.63, "grad_norm": 1.0700709819793701, "learning_rate": 0.0005346092503987241, "loss": 0.3379, "step": 22970 }, { "epoch": 36.65, "grad_norm": 0.6836196184158325, "learning_rate": 0.0005339712918660288, "loss": 0.3811, "step": 22980 }, { "epoch": 36.67, "grad_norm": 0.2946398854255676, "learning_rate": 0.0005333333333333334, "loss": 0.2362, "step": 22990 }, { "epoch": 36.68, "grad_norm": 0.38813692331314087, "learning_rate": 0.000532695374800638, "loss": 0.3188, "step": 23000 }, { "epoch": 36.7, "grad_norm": 0.483698308467865, "learning_rate": 0.0005320574162679426, "loss": 0.4241, "step": 23010 }, { "epoch": 36.71, "grad_norm": 0.5879315733909607, "learning_rate": 0.0005314194577352473, "loss": 0.3343, "step": 23020 }, { "epoch": 36.73, "grad_norm": 0.3913237154483795, "learning_rate": 0.0005307814992025518, "loss": 0.3438, "step": 23030 }, { "epoch": 36.75, "grad_norm": 0.9392869472503662, "learning_rate": 0.0005301435406698565, "loss": 0.3663, "step": 23040 }, { "epoch": 36.76, "grad_norm": 0.4291793704032898, "learning_rate": 0.0005295055821371611, "loss": 0.3078, "step": 23050 }, { "epoch": 36.78, "grad_norm": 0.6778882741928101, "learning_rate": 0.0005288676236044657, "loss": 0.3618, "step": 23060 }, { "epoch": 36.79, "grad_norm": 0.9089276194572449, "learning_rate": 0.0005282296650717704, "loss": 0.3424, "step": 23070 }, { "epoch": 36.81, "grad_norm": 0.6602213978767395, "learning_rate": 0.000527591706539075, "loss": 0.3148, "step": 23080 }, { "epoch": 36.83, "grad_norm": 0.4564104378223419, "learning_rate": 0.0005269537480063796, "loss": 0.4301, "step": 23090 }, { "epoch": 36.84, "grad_norm": 0.23501376807689667, "learning_rate": 0.0005263157894736842, "loss": 0.3264, "step": 23100 }, { "epoch": 36.86, "grad_norm": 1.654263973236084, "learning_rate": 0.0005256778309409889, "loss": 0.3627, "step": 23110 }, { "epoch": 36.87, "grad_norm": 0.8504493236541748, "learning_rate": 0.0005250398724082934, "loss": 0.3381, "step": 23120 }, { "epoch": 36.89, "grad_norm": 0.7040032744407654, "learning_rate": 0.0005244019138755981, "loss": 0.4432, "step": 23130 }, { "epoch": 36.91, "grad_norm": 0.5224348902702332, "learning_rate": 0.0005237639553429028, "loss": 0.3652, "step": 23140 }, { "epoch": 36.92, "grad_norm": 0.5879861116409302, "learning_rate": 0.0005231259968102073, "loss": 0.3216, "step": 23150 }, { "epoch": 36.94, "grad_norm": 0.3892087936401367, "learning_rate": 0.000522488038277512, "loss": 0.3308, "step": 23160 }, { "epoch": 36.95, "grad_norm": 0.2915053069591522, "learning_rate": 0.0005218500797448166, "loss": 0.3044, "step": 23170 }, { "epoch": 36.97, "grad_norm": 0.515186607837677, "learning_rate": 0.0005212121212121212, "loss": 0.4347, "step": 23180 }, { "epoch": 36.99, "grad_norm": 0.4125446677207947, "learning_rate": 0.0005205741626794258, "loss": 0.3828, "step": 23190 }, { "epoch": 37.0, "grad_norm": 0.4284899830818176, "learning_rate": 0.0005199362041467305, "loss": 0.4189, "step": 23200 }, { "epoch": 37.02, "grad_norm": 1.1735564470291138, "learning_rate": 0.000519298245614035, "loss": 0.3337, "step": 23210 }, { "epoch": 37.03, "grad_norm": 1.21298348903656, "learning_rate": 0.0005186602870813397, "loss": 0.3879, "step": 23220 }, { "epoch": 37.05, "grad_norm": 3.3211417198181152, "learning_rate": 0.0005180223285486444, "loss": 0.3724, "step": 23230 }, { "epoch": 37.07, "grad_norm": 0.5634852647781372, "learning_rate": 0.0005173843700159489, "loss": 0.2916, "step": 23240 }, { "epoch": 37.08, "grad_norm": 0.40934479236602783, "learning_rate": 0.0005167464114832536, "loss": 0.3133, "step": 23250 }, { "epoch": 37.1, "grad_norm": 0.6190032958984375, "learning_rate": 0.0005161084529505582, "loss": 0.2639, "step": 23260 }, { "epoch": 37.11, "grad_norm": 0.38555908203125, "learning_rate": 0.0005154704944178628, "loss": 0.3493, "step": 23270 }, { "epoch": 37.13, "grad_norm": 0.2890884280204773, "learning_rate": 0.0005148325358851674, "loss": 0.3042, "step": 23280 }, { "epoch": 37.15, "grad_norm": 0.3978734016418457, "learning_rate": 0.0005141945773524721, "loss": 0.276, "step": 23290 }, { "epoch": 37.16, "grad_norm": 0.6064948439598083, "learning_rate": 0.0005135566188197768, "loss": 0.3809, "step": 23300 }, { "epoch": 37.18, "grad_norm": 0.6788705587387085, "learning_rate": 0.0005129186602870813, "loss": 0.2914, "step": 23310 }, { "epoch": 37.19, "grad_norm": 0.4636113941669464, "learning_rate": 0.000512280701754386, "loss": 0.3822, "step": 23320 }, { "epoch": 37.21, "grad_norm": 0.6636508107185364, "learning_rate": 0.0005116427432216905, "loss": 0.3767, "step": 23330 }, { "epoch": 37.22, "grad_norm": 0.435531347990036, "learning_rate": 0.0005110047846889952, "loss": 0.4045, "step": 23340 }, { "epoch": 37.24, "grad_norm": 0.5816912651062012, "learning_rate": 0.0005103668261562998, "loss": 0.3083, "step": 23350 }, { "epoch": 37.26, "grad_norm": 0.8348118662834167, "learning_rate": 0.0005097288676236045, "loss": 0.2738, "step": 23360 }, { "epoch": 37.27, "grad_norm": 0.5250842571258545, "learning_rate": 0.000509090909090909, "loss": 0.3266, "step": 23370 }, { "epoch": 37.29, "grad_norm": 0.3116588592529297, "learning_rate": 0.0005084529505582137, "loss": 0.2874, "step": 23380 }, { "epoch": 37.3, "grad_norm": 0.5619212985038757, "learning_rate": 0.0005078149920255184, "loss": 0.2712, "step": 23390 }, { "epoch": 37.32, "grad_norm": 0.34848636388778687, "learning_rate": 0.0005071770334928229, "loss": 0.4232, "step": 23400 }, { "epoch": 37.34, "grad_norm": 0.38688668608665466, "learning_rate": 0.0005065390749601276, "loss": 0.2947, "step": 23410 }, { "epoch": 37.35, "grad_norm": 1.7782784700393677, "learning_rate": 0.0005059011164274322, "loss": 0.3313, "step": 23420 }, { "epoch": 37.37, "grad_norm": 0.2640959620475769, "learning_rate": 0.0005052631578947368, "loss": 0.2622, "step": 23430 }, { "epoch": 37.38, "grad_norm": 0.2727811932563782, "learning_rate": 0.0005046251993620414, "loss": 0.3136, "step": 23440 }, { "epoch": 37.4, "grad_norm": 0.5404552817344666, "learning_rate": 0.0005039872408293461, "loss": 0.3078, "step": 23450 }, { "epoch": 37.42, "grad_norm": 0.38602226972579956, "learning_rate": 0.0005033492822966507, "loss": 0.2807, "step": 23460 }, { "epoch": 37.43, "grad_norm": 0.30310848355293274, "learning_rate": 0.0005027113237639553, "loss": 0.3203, "step": 23470 }, { "epoch": 37.45, "grad_norm": 0.41210854053497314, "learning_rate": 0.00050207336523126, "loss": 0.3025, "step": 23480 }, { "epoch": 37.46, "grad_norm": 0.7113584876060486, "learning_rate": 0.0005014354066985645, "loss": 0.3876, "step": 23490 }, { "epoch": 37.48, "grad_norm": 0.8924645185470581, "learning_rate": 0.0005007974481658692, "loss": 0.2642, "step": 23500 }, { "epoch": 37.5, "grad_norm": 0.5637812614440918, "learning_rate": 0.0005001594896331738, "loss": 0.3732, "step": 23510 }, { "epoch": 37.51, "grad_norm": 0.34932073950767517, "learning_rate": 0.0004995215311004785, "loss": 0.2753, "step": 23520 }, { "epoch": 37.53, "grad_norm": 0.39498457312583923, "learning_rate": 0.0004988835725677831, "loss": 0.2949, "step": 23530 }, { "epoch": 37.54, "grad_norm": 0.4476890563964844, "learning_rate": 0.0004982456140350878, "loss": 0.318, "step": 23540 }, { "epoch": 37.56, "grad_norm": 0.3034002482891083, "learning_rate": 0.0004976076555023923, "loss": 0.2778, "step": 23550 }, { "epoch": 37.58, "grad_norm": 0.7696762084960938, "learning_rate": 0.000496969696969697, "loss": 0.3525, "step": 23560 }, { "epoch": 37.59, "grad_norm": 0.6639572978019714, "learning_rate": 0.0004963317384370016, "loss": 0.2929, "step": 23570 }, { "epoch": 37.61, "grad_norm": 0.8098918199539185, "learning_rate": 0.0004956937799043062, "loss": 0.3162, "step": 23580 }, { "epoch": 37.62, "grad_norm": 0.7061499357223511, "learning_rate": 0.0004950558213716109, "loss": 0.4247, "step": 23590 }, { "epoch": 37.64, "grad_norm": 0.7736586928367615, "learning_rate": 0.0004944178628389155, "loss": 0.3253, "step": 23600 }, { "epoch": 37.66, "grad_norm": 0.22601386904716492, "learning_rate": 0.0004937799043062201, "loss": 0.322, "step": 23610 }, { "epoch": 37.67, "grad_norm": 0.34596702456474304, "learning_rate": 0.0004931419457735247, "loss": 0.381, "step": 23620 }, { "epoch": 37.69, "grad_norm": 0.399099737405777, "learning_rate": 0.0004925039872408294, "loss": 0.367, "step": 23630 }, { "epoch": 37.7, "grad_norm": 0.4223106801509857, "learning_rate": 0.0004918660287081339, "loss": 0.3502, "step": 23640 }, { "epoch": 37.72, "grad_norm": 0.36701181530952454, "learning_rate": 0.0004912280701754386, "loss": 0.3696, "step": 23650 }, { "epoch": 37.74, "grad_norm": 1.0397878885269165, "learning_rate": 0.0004905901116427433, "loss": 0.2922, "step": 23660 }, { "epoch": 37.75, "grad_norm": 0.6061972975730896, "learning_rate": 0.0004899521531100478, "loss": 0.3406, "step": 23670 }, { "epoch": 37.77, "grad_norm": 0.46018704771995544, "learning_rate": 0.0004893141945773525, "loss": 0.3847, "step": 23680 }, { "epoch": 37.78, "grad_norm": 0.4098079204559326, "learning_rate": 0.0004886762360446571, "loss": 0.2944, "step": 23690 }, { "epoch": 37.8, "grad_norm": 0.33187025785446167, "learning_rate": 0.0004880382775119617, "loss": 0.3696, "step": 23700 }, { "epoch": 37.81, "grad_norm": 1.5086455345153809, "learning_rate": 0.00048740031897926637, "loss": 0.3415, "step": 23710 }, { "epoch": 37.83, "grad_norm": 0.1812012493610382, "learning_rate": 0.000486762360446571, "loss": 0.3317, "step": 23720 }, { "epoch": 37.85, "grad_norm": 0.4595651924610138, "learning_rate": 0.00048612440191387566, "loss": 0.3449, "step": 23730 }, { "epoch": 37.86, "grad_norm": 0.7050609588623047, "learning_rate": 0.0004854864433811803, "loss": 0.3273, "step": 23740 }, { "epoch": 37.88, "grad_norm": 0.4877799451351166, "learning_rate": 0.0004848484848484849, "loss": 0.2679, "step": 23750 }, { "epoch": 37.89, "grad_norm": 0.4837338328361511, "learning_rate": 0.0004842105263157895, "loss": 0.3203, "step": 23760 }, { "epoch": 37.91, "grad_norm": 0.5711174607276917, "learning_rate": 0.0004835725677830941, "loss": 0.3088, "step": 23770 }, { "epoch": 37.93, "grad_norm": 0.7363555431365967, "learning_rate": 0.00048293460925039874, "loss": 0.3494, "step": 23780 }, { "epoch": 37.94, "grad_norm": 0.4688860774040222, "learning_rate": 0.00048229665071770336, "loss": 0.406, "step": 23790 }, { "epoch": 37.96, "grad_norm": 0.22278854250907898, "learning_rate": 0.000481658692185008, "loss": 0.2775, "step": 23800 }, { "epoch": 37.97, "grad_norm": 0.5794351100921631, "learning_rate": 0.00048102073365231265, "loss": 0.3613, "step": 23810 }, { "epoch": 37.99, "grad_norm": 0.7034667730331421, "learning_rate": 0.00048038277511961726, "loss": 0.3417, "step": 23820 }, { "epoch": 38.01, "grad_norm": 0.5369040966033936, "learning_rate": 0.0004797448165869219, "loss": 0.2919, "step": 23830 }, { "epoch": 38.02, "grad_norm": 0.4583072066307068, "learning_rate": 0.0004791068580542265, "loss": 0.2711, "step": 23840 }, { "epoch": 38.04, "grad_norm": 0.32047978043556213, "learning_rate": 0.0004784688995215311, "loss": 0.3133, "step": 23850 }, { "epoch": 38.05, "grad_norm": 0.4489063024520874, "learning_rate": 0.00047783094098883573, "loss": 0.3027, "step": 23860 }, { "epoch": 38.07, "grad_norm": 0.29304754734039307, "learning_rate": 0.00047719298245614035, "loss": 0.2937, "step": 23870 }, { "epoch": 38.09, "grad_norm": 0.5141634345054626, "learning_rate": 0.00047655502392344496, "loss": 0.2787, "step": 23880 }, { "epoch": 38.1, "grad_norm": 0.6913502216339111, "learning_rate": 0.00047591706539074964, "loss": 0.3666, "step": 23890 }, { "epoch": 38.12, "grad_norm": 0.49919384717941284, "learning_rate": 0.00047527910685805425, "loss": 0.3182, "step": 23900 }, { "epoch": 38.13, "grad_norm": 0.27605143189430237, "learning_rate": 0.00047464114832535887, "loss": 0.2354, "step": 23910 }, { "epoch": 38.15, "grad_norm": 1.246079921722412, "learning_rate": 0.0004740031897926635, "loss": 0.3429, "step": 23920 }, { "epoch": 38.17, "grad_norm": 0.18399390578269958, "learning_rate": 0.0004733652312599681, "loss": 0.2916, "step": 23930 }, { "epoch": 38.18, "grad_norm": 0.3015744388103485, "learning_rate": 0.0004727272727272727, "loss": 0.3216, "step": 23940 }, { "epoch": 38.2, "grad_norm": 0.5281094312667847, "learning_rate": 0.00047208931419457734, "loss": 0.4193, "step": 23950 }, { "epoch": 38.21, "grad_norm": 0.6574485301971436, "learning_rate": 0.00047145135566188195, "loss": 0.318, "step": 23960 }, { "epoch": 38.23, "grad_norm": 0.5636985898017883, "learning_rate": 0.0004708133971291866, "loss": 0.3493, "step": 23970 }, { "epoch": 38.25, "grad_norm": 0.3899206817150116, "learning_rate": 0.00047017543859649124, "loss": 0.313, "step": 23980 }, { "epoch": 38.26, "grad_norm": 0.465703547000885, "learning_rate": 0.00046953748006379586, "loss": 0.3227, "step": 23990 }, { "epoch": 38.28, "grad_norm": 0.9873224496841431, "learning_rate": 0.0004688995215311005, "loss": 0.3093, "step": 24000 }, { "epoch": 38.29, "grad_norm": 0.545748233795166, "learning_rate": 0.0004682615629984051, "loss": 0.3427, "step": 24010 }, { "epoch": 38.31, "grad_norm": 1.6173831224441528, "learning_rate": 0.0004676236044657097, "loss": 0.3487, "step": 24020 }, { "epoch": 38.33, "grad_norm": 0.43845269083976746, "learning_rate": 0.0004669856459330143, "loss": 0.3368, "step": 24030 }, { "epoch": 38.34, "grad_norm": 0.6073929071426392, "learning_rate": 0.00046634768740031894, "loss": 0.4022, "step": 24040 }, { "epoch": 38.36, "grad_norm": 0.3742305636405945, "learning_rate": 0.0004657097288676236, "loss": 0.2748, "step": 24050 }, { "epoch": 38.37, "grad_norm": 0.2694351375102997, "learning_rate": 0.00046507177033492823, "loss": 0.3874, "step": 24060 }, { "epoch": 38.39, "grad_norm": 0.48228031396865845, "learning_rate": 0.00046443381180223285, "loss": 0.2613, "step": 24070 }, { "epoch": 38.41, "grad_norm": 0.7061280608177185, "learning_rate": 0.0004637958532695375, "loss": 0.3152, "step": 24080 }, { "epoch": 38.42, "grad_norm": 0.5890529751777649, "learning_rate": 0.00046315789473684214, "loss": 0.2593, "step": 24090 }, { "epoch": 38.44, "grad_norm": 0.5934563875198364, "learning_rate": 0.00046251993620414675, "loss": 0.3712, "step": 24100 }, { "epoch": 38.45, "grad_norm": 0.30438482761383057, "learning_rate": 0.00046188197767145137, "loss": 0.3082, "step": 24110 }, { "epoch": 38.47, "grad_norm": 0.1404085010290146, "learning_rate": 0.000461244019138756, "loss": 0.2458, "step": 24120 }, { "epoch": 38.48, "grad_norm": 0.45408958196640015, "learning_rate": 0.00046060606060606066, "loss": 0.302, "step": 24130 }, { "epoch": 38.5, "grad_norm": 0.4974878430366516, "learning_rate": 0.0004599681020733653, "loss": 0.3416, "step": 24140 }, { "epoch": 38.52, "grad_norm": 0.2546900510787964, "learning_rate": 0.0004593301435406699, "loss": 0.2762, "step": 24150 }, { "epoch": 38.53, "grad_norm": 0.5472551584243774, "learning_rate": 0.0004586921850079745, "loss": 0.3176, "step": 24160 }, { "epoch": 38.55, "grad_norm": 0.7795162200927734, "learning_rate": 0.0004580542264752791, "loss": 0.3699, "step": 24170 }, { "epoch": 38.56, "grad_norm": 0.4223695397377014, "learning_rate": 0.00045741626794258374, "loss": 0.3011, "step": 24180 }, { "epoch": 38.58, "grad_norm": 2.152009963989258, "learning_rate": 0.00045677830940988836, "loss": 0.36, "step": 24190 }, { "epoch": 38.6, "grad_norm": 0.477445513010025, "learning_rate": 0.000456140350877193, "loss": 0.279, "step": 24200 }, { "epoch": 38.61, "grad_norm": 0.546576738357544, "learning_rate": 0.00045550239234449765, "loss": 0.3519, "step": 24210 }, { "epoch": 38.63, "grad_norm": 0.3089749217033386, "learning_rate": 0.00045486443381180226, "loss": 0.3306, "step": 24220 }, { "epoch": 38.64, "grad_norm": 0.5986670851707458, "learning_rate": 0.0004542264752791069, "loss": 0.2692, "step": 24230 }, { "epoch": 38.66, "grad_norm": 1.1655359268188477, "learning_rate": 0.0004535885167464115, "loss": 0.3717, "step": 24240 }, { "epoch": 38.68, "grad_norm": 0.349162757396698, "learning_rate": 0.0004529505582137161, "loss": 0.4474, "step": 24250 }, { "epoch": 38.69, "grad_norm": 0.3474232256412506, "learning_rate": 0.00045231259968102073, "loss": 0.3711, "step": 24260 }, { "epoch": 38.71, "grad_norm": 0.38125041127204895, "learning_rate": 0.00045167464114832535, "loss": 0.3417, "step": 24270 }, { "epoch": 38.72, "grad_norm": 0.49059632420539856, "learning_rate": 0.00045103668261562996, "loss": 0.3728, "step": 24280 }, { "epoch": 38.74, "grad_norm": 0.34616127610206604, "learning_rate": 0.00045039872408293464, "loss": 0.3007, "step": 24290 }, { "epoch": 38.76, "grad_norm": 0.6310774087905884, "learning_rate": 0.00044976076555023925, "loss": 0.2896, "step": 24300 }, { "epoch": 38.77, "grad_norm": 1.5255939960479736, "learning_rate": 0.00044912280701754387, "loss": 0.3429, "step": 24310 }, { "epoch": 38.79, "grad_norm": 0.38608258962631226, "learning_rate": 0.0004484848484848485, "loss": 0.3438, "step": 24320 }, { "epoch": 38.8, "grad_norm": 1.0546627044677734, "learning_rate": 0.0004478468899521531, "loss": 0.3801, "step": 24330 }, { "epoch": 38.82, "grad_norm": 0.3056943118572235, "learning_rate": 0.0004472089314194577, "loss": 0.2759, "step": 24340 }, { "epoch": 38.84, "grad_norm": 0.7335503101348877, "learning_rate": 0.00044657097288676234, "loss": 0.3331, "step": 24350 }, { "epoch": 38.85, "grad_norm": 0.36230140924453735, "learning_rate": 0.00044593301435406695, "loss": 0.2871, "step": 24360 }, { "epoch": 38.87, "grad_norm": 0.3868005573749542, "learning_rate": 0.0004452950558213716, "loss": 0.3049, "step": 24370 }, { "epoch": 38.88, "grad_norm": 0.4695385992527008, "learning_rate": 0.00044465709728867624, "loss": 0.3754, "step": 24380 }, { "epoch": 38.9, "grad_norm": 0.2892504036426544, "learning_rate": 0.00044401913875598086, "loss": 0.3438, "step": 24390 }, { "epoch": 38.92, "grad_norm": 0.7235500812530518, "learning_rate": 0.0004433811802232855, "loss": 0.3345, "step": 24400 }, { "epoch": 38.93, "grad_norm": 0.48276352882385254, "learning_rate": 0.0004427432216905901, "loss": 0.3486, "step": 24410 }, { "epoch": 38.95, "grad_norm": 0.384084016084671, "learning_rate": 0.0004421052631578947, "loss": 0.3657, "step": 24420 }, { "epoch": 38.96, "grad_norm": 0.31639254093170166, "learning_rate": 0.0004414673046251993, "loss": 0.3392, "step": 24430 }, { "epoch": 38.98, "grad_norm": 0.3250158131122589, "learning_rate": 0.00044082934609250394, "loss": 0.3391, "step": 24440 }, { "epoch": 39.0, "grad_norm": 0.37524476647377014, "learning_rate": 0.00044019138755980867, "loss": 0.3313, "step": 24450 }, { "epoch": 39.01, "grad_norm": 0.5987895131111145, "learning_rate": 0.0004395534290271133, "loss": 0.2781, "step": 24460 }, { "epoch": 39.03, "grad_norm": 0.18153107166290283, "learning_rate": 0.0004389154704944179, "loss": 0.2496, "step": 24470 }, { "epoch": 39.04, "grad_norm": 0.34211575984954834, "learning_rate": 0.0004382775119617225, "loss": 0.272, "step": 24480 }, { "epoch": 39.06, "grad_norm": 0.391075074672699, "learning_rate": 0.00043763955342902714, "loss": 0.3307, "step": 24490 }, { "epoch": 39.07, "grad_norm": 0.2632424831390381, "learning_rate": 0.00043700159489633175, "loss": 0.2906, "step": 24500 }, { "epoch": 39.09, "grad_norm": 0.5995433926582336, "learning_rate": 0.00043636363636363637, "loss": 0.293, "step": 24510 }, { "epoch": 39.11, "grad_norm": 0.6448796987533569, "learning_rate": 0.00043572567783094104, "loss": 0.251, "step": 24520 }, { "epoch": 39.12, "grad_norm": 0.5249642729759216, "learning_rate": 0.00043508771929824566, "loss": 0.3272, "step": 24530 }, { "epoch": 39.14, "grad_norm": 0.8031821250915527, "learning_rate": 0.0004344497607655503, "loss": 0.3417, "step": 24540 }, { "epoch": 39.15, "grad_norm": 0.9898377656936646, "learning_rate": 0.0004338118022328549, "loss": 0.3799, "step": 24550 }, { "epoch": 39.17, "grad_norm": 0.301408976316452, "learning_rate": 0.0004331738437001595, "loss": 0.2618, "step": 24560 }, { "epoch": 39.19, "grad_norm": 0.3909609317779541, "learning_rate": 0.0004325358851674641, "loss": 0.3, "step": 24570 }, { "epoch": 39.2, "grad_norm": 0.3314201533794403, "learning_rate": 0.00043189792663476874, "loss": 0.3461, "step": 24580 }, { "epoch": 39.22, "grad_norm": 0.8803900480270386, "learning_rate": 0.00043125996810207336, "loss": 0.3438, "step": 24590 }, { "epoch": 39.23, "grad_norm": 0.3051396906375885, "learning_rate": 0.00043062200956937803, "loss": 0.3489, "step": 24600 }, { "epoch": 39.25, "grad_norm": 0.5020725131034851, "learning_rate": 0.00042998405103668265, "loss": 0.316, "step": 24610 }, { "epoch": 39.27, "grad_norm": 0.7016777396202087, "learning_rate": 0.00042934609250398726, "loss": 0.3582, "step": 24620 }, { "epoch": 39.28, "grad_norm": 0.21689297258853912, "learning_rate": 0.0004287081339712919, "loss": 0.2597, "step": 24630 }, { "epoch": 39.3, "grad_norm": 0.6638566851615906, "learning_rate": 0.0004280701754385965, "loss": 0.3905, "step": 24640 }, { "epoch": 39.31, "grad_norm": 0.24087496101856232, "learning_rate": 0.0004274322169059011, "loss": 0.3349, "step": 24650 }, { "epoch": 39.33, "grad_norm": 0.14746366441249847, "learning_rate": 0.00042679425837320573, "loss": 0.2942, "step": 24660 }, { "epoch": 39.35, "grad_norm": 0.3620028495788574, "learning_rate": 0.00042615629984051035, "loss": 0.2394, "step": 24670 }, { "epoch": 39.36, "grad_norm": 0.5359326004981995, "learning_rate": 0.000425518341307815, "loss": 0.3661, "step": 24680 }, { "epoch": 39.38, "grad_norm": 0.26914021372795105, "learning_rate": 0.00042488038277511964, "loss": 0.2544, "step": 24690 }, { "epoch": 39.39, "grad_norm": 0.22984707355499268, "learning_rate": 0.00042424242424242425, "loss": 0.2545, "step": 24700 }, { "epoch": 39.41, "grad_norm": 0.2788347601890564, "learning_rate": 0.00042360446570972887, "loss": 0.3354, "step": 24710 }, { "epoch": 39.43, "grad_norm": 0.24124827980995178, "learning_rate": 0.0004229665071770335, "loss": 0.2551, "step": 24720 }, { "epoch": 39.44, "grad_norm": 0.547863781452179, "learning_rate": 0.0004223285486443381, "loss": 0.2597, "step": 24730 }, { "epoch": 39.46, "grad_norm": 0.25198522210121155, "learning_rate": 0.0004216905901116427, "loss": 0.3179, "step": 24740 }, { "epoch": 39.47, "grad_norm": 0.3968208134174347, "learning_rate": 0.00042105263157894734, "loss": 0.2876, "step": 24750 }, { "epoch": 39.49, "grad_norm": 0.33785438537597656, "learning_rate": 0.000420414673046252, "loss": 0.3097, "step": 24760 }, { "epoch": 39.51, "grad_norm": 0.5009357333183289, "learning_rate": 0.0004197767145135566, "loss": 0.3202, "step": 24770 }, { "epoch": 39.52, "grad_norm": 0.4793984889984131, "learning_rate": 0.00041913875598086124, "loss": 0.3518, "step": 24780 }, { "epoch": 39.54, "grad_norm": 0.19300325214862823, "learning_rate": 0.00041850079744816586, "loss": 0.3672, "step": 24790 }, { "epoch": 39.55, "grad_norm": 0.5630788803100586, "learning_rate": 0.0004178628389154705, "loss": 0.3217, "step": 24800 }, { "epoch": 39.57, "grad_norm": 0.28488433361053467, "learning_rate": 0.0004172248803827751, "loss": 0.3063, "step": 24810 }, { "epoch": 39.59, "grad_norm": 0.25450441241264343, "learning_rate": 0.0004165869218500797, "loss": 0.3638, "step": 24820 }, { "epoch": 39.6, "grad_norm": 0.4360348880290985, "learning_rate": 0.0004159489633173844, "loss": 0.3122, "step": 24830 }, { "epoch": 39.62, "grad_norm": 0.5293656587600708, "learning_rate": 0.00041531100478468905, "loss": 0.3944, "step": 24840 }, { "epoch": 39.63, "grad_norm": 0.46485990285873413, "learning_rate": 0.00041467304625199367, "loss": 0.2323, "step": 24850 }, { "epoch": 39.65, "grad_norm": 0.501832127571106, "learning_rate": 0.0004140350877192983, "loss": 0.3502, "step": 24860 }, { "epoch": 39.67, "grad_norm": 0.4300176799297333, "learning_rate": 0.0004133971291866029, "loss": 0.2736, "step": 24870 }, { "epoch": 39.68, "grad_norm": 0.253682941198349, "learning_rate": 0.0004127591706539075, "loss": 0.3306, "step": 24880 }, { "epoch": 39.7, "grad_norm": 0.18599876761436462, "learning_rate": 0.00041212121212121214, "loss": 0.3534, "step": 24890 }, { "epoch": 39.71, "grad_norm": 0.21810634434223175, "learning_rate": 0.00041148325358851675, "loss": 0.2772, "step": 24900 }, { "epoch": 39.73, "grad_norm": 0.3228086233139038, "learning_rate": 0.00041084529505582137, "loss": 0.2433, "step": 24910 }, { "epoch": 39.74, "grad_norm": 0.30225640535354614, "learning_rate": 0.00041020733652312604, "loss": 0.2261, "step": 24920 }, { "epoch": 39.76, "grad_norm": 0.19185695052146912, "learning_rate": 0.00040956937799043066, "loss": 0.3096, "step": 24930 }, { "epoch": 39.78, "grad_norm": 0.40327930450439453, "learning_rate": 0.0004089314194577353, "loss": 0.3472, "step": 24940 }, { "epoch": 39.79, "grad_norm": 0.4578391909599304, "learning_rate": 0.0004082934609250399, "loss": 0.3177, "step": 24950 }, { "epoch": 39.81, "grad_norm": 0.24900272488594055, "learning_rate": 0.0004076555023923445, "loss": 0.3526, "step": 24960 }, { "epoch": 39.82, "grad_norm": 0.8984745144844055, "learning_rate": 0.0004070175438596491, "loss": 0.3307, "step": 24970 }, { "epoch": 39.84, "grad_norm": 0.2043074071407318, "learning_rate": 0.00040637958532695374, "loss": 0.3346, "step": 24980 }, { "epoch": 39.86, "grad_norm": 0.293965220451355, "learning_rate": 0.00040574162679425836, "loss": 0.3562, "step": 24990 }, { "epoch": 39.87, "grad_norm": 0.1676713526248932, "learning_rate": 0.00040510366826156303, "loss": 0.2771, "step": 25000 }, { "epoch": 39.89, "grad_norm": 0.7040833830833435, "learning_rate": 0.00040446570972886765, "loss": 0.3782, "step": 25010 }, { "epoch": 39.9, "grad_norm": 1.6222413778305054, "learning_rate": 0.00040382775119617226, "loss": 0.2836, "step": 25020 }, { "epoch": 39.92, "grad_norm": 0.3965054750442505, "learning_rate": 0.0004031897926634769, "loss": 0.4055, "step": 25030 }, { "epoch": 39.94, "grad_norm": 0.5142346024513245, "learning_rate": 0.0004025518341307815, "loss": 0.34, "step": 25040 }, { "epoch": 39.95, "grad_norm": 0.4719744622707367, "learning_rate": 0.0004019138755980861, "loss": 0.3643, "step": 25050 }, { "epoch": 39.97, "grad_norm": 0.29006433486938477, "learning_rate": 0.00040127591706539073, "loss": 0.3195, "step": 25060 }, { "epoch": 39.98, "grad_norm": 0.40275096893310547, "learning_rate": 0.00040063795853269535, "loss": 0.3247, "step": 25070 }, { "epoch": 40.0, "grad_norm": 0.19441524147987366, "learning_rate": 0.0004, "loss": 0.2905, "step": 25080 }, { "epoch": 40.02, "grad_norm": 0.30110710859298706, "learning_rate": 0.00039936204146730464, "loss": 0.2916, "step": 25090 }, { "epoch": 40.03, "grad_norm": 0.36237674951553345, "learning_rate": 0.00039872408293460925, "loss": 0.3144, "step": 25100 }, { "epoch": 40.05, "grad_norm": 0.4144202172756195, "learning_rate": 0.00039808612440191387, "loss": 0.2537, "step": 25110 }, { "epoch": 40.06, "grad_norm": 0.5469448566436768, "learning_rate": 0.0003974481658692185, "loss": 0.2778, "step": 25120 }, { "epoch": 40.08, "grad_norm": 0.6350633502006531, "learning_rate": 0.0003968102073365231, "loss": 0.3139, "step": 25130 }, { "epoch": 40.1, "grad_norm": 0.6425772905349731, "learning_rate": 0.0003961722488038277, "loss": 0.3137, "step": 25140 }, { "epoch": 40.11, "grad_norm": 0.5132192373275757, "learning_rate": 0.00039553429027113234, "loss": 0.3182, "step": 25150 }, { "epoch": 40.13, "grad_norm": 0.3655058443546295, "learning_rate": 0.000394896331738437, "loss": 0.3213, "step": 25160 }, { "epoch": 40.14, "grad_norm": 0.3207656145095825, "learning_rate": 0.0003942583732057416, "loss": 0.2839, "step": 25170 }, { "epoch": 40.16, "grad_norm": 0.4457024037837982, "learning_rate": 0.00039362041467304624, "loss": 0.3271, "step": 25180 }, { "epoch": 40.18, "grad_norm": 0.457660049200058, "learning_rate": 0.00039298245614035086, "loss": 0.3049, "step": 25190 }, { "epoch": 40.19, "grad_norm": 0.44609880447387695, "learning_rate": 0.00039234449760765553, "loss": 0.3159, "step": 25200 }, { "epoch": 40.21, "grad_norm": 0.14960619807243347, "learning_rate": 0.00039170653907496015, "loss": 0.2678, "step": 25210 }, { "epoch": 40.22, "grad_norm": 0.20554865896701813, "learning_rate": 0.00039106858054226476, "loss": 0.2969, "step": 25220 }, { "epoch": 40.24, "grad_norm": 0.25997835397720337, "learning_rate": 0.0003904306220095694, "loss": 0.2159, "step": 25230 }, { "epoch": 40.26, "grad_norm": 0.18251359462738037, "learning_rate": 0.00038979266347687405, "loss": 0.3482, "step": 25240 }, { "epoch": 40.27, "grad_norm": 0.3024716377258301, "learning_rate": 0.00038915470494417867, "loss": 0.3027, "step": 25250 }, { "epoch": 40.29, "grad_norm": 0.38427066802978516, "learning_rate": 0.0003885167464114833, "loss": 0.297, "step": 25260 }, { "epoch": 40.3, "grad_norm": 0.4605743885040283, "learning_rate": 0.0003878787878787879, "loss": 0.2966, "step": 25270 }, { "epoch": 40.32, "grad_norm": 0.3320145010948181, "learning_rate": 0.0003872408293460925, "loss": 0.2449, "step": 25280 }, { "epoch": 40.33, "grad_norm": 0.23880721628665924, "learning_rate": 0.00038660287081339714, "loss": 0.2779, "step": 25290 }, { "epoch": 40.35, "grad_norm": 0.442751407623291, "learning_rate": 0.00038596491228070175, "loss": 0.2729, "step": 25300 }, { "epoch": 40.37, "grad_norm": 0.2670186758041382, "learning_rate": 0.0003853269537480064, "loss": 0.3296, "step": 25310 }, { "epoch": 40.38, "grad_norm": 0.2149314135313034, "learning_rate": 0.00038468899521531104, "loss": 0.3094, "step": 25320 }, { "epoch": 40.4, "grad_norm": 0.15769945085048676, "learning_rate": 0.00038405103668261566, "loss": 0.2962, "step": 25330 }, { "epoch": 40.41, "grad_norm": 0.30012694001197815, "learning_rate": 0.0003834130781499203, "loss": 0.2864, "step": 25340 }, { "epoch": 40.43, "grad_norm": 0.6400253772735596, "learning_rate": 0.0003827751196172249, "loss": 0.4076, "step": 25350 }, { "epoch": 40.45, "grad_norm": 0.5464116334915161, "learning_rate": 0.0003821371610845295, "loss": 0.3281, "step": 25360 }, { "epoch": 40.46, "grad_norm": 0.463392972946167, "learning_rate": 0.0003814992025518341, "loss": 0.3192, "step": 25370 }, { "epoch": 40.48, "grad_norm": 0.1991080492734909, "learning_rate": 0.00038086124401913874, "loss": 0.2582, "step": 25380 }, { "epoch": 40.49, "grad_norm": 0.5955290198326111, "learning_rate": 0.0003802232854864434, "loss": 0.4031, "step": 25390 }, { "epoch": 40.51, "grad_norm": 0.22706195712089539, "learning_rate": 0.00037958532695374803, "loss": 0.2928, "step": 25400 }, { "epoch": 40.53, "grad_norm": 0.4163839817047119, "learning_rate": 0.00037894736842105265, "loss": 0.2956, "step": 25410 }, { "epoch": 40.54, "grad_norm": 0.2746015787124634, "learning_rate": 0.00037830940988835726, "loss": 0.2378, "step": 25420 }, { "epoch": 40.56, "grad_norm": 0.23401568830013275, "learning_rate": 0.0003776714513556619, "loss": 0.3618, "step": 25430 }, { "epoch": 40.57, "grad_norm": 1.4698227643966675, "learning_rate": 0.0003770334928229665, "loss": 0.3472, "step": 25440 }, { "epoch": 40.59, "grad_norm": 0.29799923300743103, "learning_rate": 0.0003763955342902711, "loss": 0.321, "step": 25450 }, { "epoch": 40.61, "grad_norm": 0.27735623717308044, "learning_rate": 0.00037575757575757573, "loss": 0.346, "step": 25460 }, { "epoch": 40.62, "grad_norm": 0.34145793318748474, "learning_rate": 0.0003751196172248804, "loss": 0.3426, "step": 25470 }, { "epoch": 40.64, "grad_norm": 0.24481597542762756, "learning_rate": 0.000374481658692185, "loss": 0.2791, "step": 25480 }, { "epoch": 40.65, "grad_norm": 0.5041400194168091, "learning_rate": 0.00037384370015948964, "loss": 0.2817, "step": 25490 }, { "epoch": 40.67, "grad_norm": 0.3849920332431793, "learning_rate": 0.00037320574162679425, "loss": 0.3522, "step": 25500 }, { "epoch": 40.69, "grad_norm": 0.4459153413772583, "learning_rate": 0.00037256778309409887, "loss": 0.3244, "step": 25510 }, { "epoch": 40.7, "grad_norm": 0.441022127866745, "learning_rate": 0.0003719298245614035, "loss": 0.2834, "step": 25520 }, { "epoch": 40.72, "grad_norm": 0.16988414525985718, "learning_rate": 0.0003712918660287081, "loss": 0.2639, "step": 25530 }, { "epoch": 40.73, "grad_norm": 0.3544873893260956, "learning_rate": 0.0003706539074960127, "loss": 0.3157, "step": 25540 }, { "epoch": 40.75, "grad_norm": 0.6139649152755737, "learning_rate": 0.0003700159489633174, "loss": 0.3729, "step": 25550 }, { "epoch": 40.77, "grad_norm": 0.22452673316001892, "learning_rate": 0.000369377990430622, "loss": 0.2209, "step": 25560 }, { "epoch": 40.78, "grad_norm": 0.458019495010376, "learning_rate": 0.0003687400318979267, "loss": 0.3738, "step": 25570 }, { "epoch": 40.8, "grad_norm": 0.24333609640598297, "learning_rate": 0.0003681020733652313, "loss": 0.3406, "step": 25580 }, { "epoch": 40.81, "grad_norm": 0.135534405708313, "learning_rate": 0.0003674641148325359, "loss": 0.2845, "step": 25590 }, { "epoch": 40.83, "grad_norm": 1.0264251232147217, "learning_rate": 0.00036682615629984053, "loss": 0.3014, "step": 25600 }, { "epoch": 40.85, "grad_norm": 0.5027388334274292, "learning_rate": 0.00036618819776714515, "loss": 0.2648, "step": 25610 }, { "epoch": 40.86, "grad_norm": 0.37629154324531555, "learning_rate": 0.00036555023923444976, "loss": 0.3263, "step": 25620 }, { "epoch": 40.88, "grad_norm": 0.16155029833316803, "learning_rate": 0.00036491228070175443, "loss": 0.2677, "step": 25630 }, { "epoch": 40.89, "grad_norm": 0.5950889587402344, "learning_rate": 0.00036427432216905905, "loss": 0.2737, "step": 25640 }, { "epoch": 40.91, "grad_norm": 1.288246750831604, "learning_rate": 0.00036363636363636367, "loss": 0.3458, "step": 25650 }, { "epoch": 40.93, "grad_norm": 0.21823683381080627, "learning_rate": 0.0003629984051036683, "loss": 0.2528, "step": 25660 }, { "epoch": 40.94, "grad_norm": 0.2102632224559784, "learning_rate": 0.0003623604465709729, "loss": 0.3346, "step": 25670 }, { "epoch": 40.96, "grad_norm": 0.753999650478363, "learning_rate": 0.0003617224880382775, "loss": 0.3758, "step": 25680 }, { "epoch": 40.97, "grad_norm": 0.20464596152305603, "learning_rate": 0.00036108452950558214, "loss": 0.3724, "step": 25690 }, { "epoch": 40.99, "grad_norm": 0.38693875074386597, "learning_rate": 0.00036044657097288675, "loss": 0.4987, "step": 25700 }, { "epoch": 41.0, "grad_norm": 1.1584486961364746, "learning_rate": 0.0003598086124401914, "loss": 0.3108, "step": 25710 }, { "epoch": 41.02, "grad_norm": 0.23398354649543762, "learning_rate": 0.00035917065390749604, "loss": 0.321, "step": 25720 }, { "epoch": 41.04, "grad_norm": 0.265209823846817, "learning_rate": 0.00035853269537480066, "loss": 0.379, "step": 25730 }, { "epoch": 41.05, "grad_norm": 0.5159454941749573, "learning_rate": 0.0003578947368421053, "loss": 0.2849, "step": 25740 }, { "epoch": 41.07, "grad_norm": 0.3185652792453766, "learning_rate": 0.0003572567783094099, "loss": 0.2885, "step": 25750 }, { "epoch": 41.08, "grad_norm": 0.6398610472679138, "learning_rate": 0.0003566188197767145, "loss": 0.3583, "step": 25760 }, { "epoch": 41.1, "grad_norm": 0.5768219232559204, "learning_rate": 0.0003559808612440191, "loss": 0.3427, "step": 25770 }, { "epoch": 41.12, "grad_norm": 0.5042071342468262, "learning_rate": 0.00035534290271132374, "loss": 0.3047, "step": 25780 }, { "epoch": 41.13, "grad_norm": 0.20871587097644806, "learning_rate": 0.0003547049441786284, "loss": 0.2634, "step": 25790 }, { "epoch": 41.15, "grad_norm": 0.20863570272922516, "learning_rate": 0.00035406698564593303, "loss": 0.3444, "step": 25800 }, { "epoch": 41.16, "grad_norm": 0.43497905135154724, "learning_rate": 0.00035342902711323765, "loss": 0.3717, "step": 25810 }, { "epoch": 41.18, "grad_norm": 0.5420474410057068, "learning_rate": 0.00035279106858054226, "loss": 0.2936, "step": 25820 }, { "epoch": 41.2, "grad_norm": 0.16857664287090302, "learning_rate": 0.0003521531100478469, "loss": 0.2317, "step": 25830 }, { "epoch": 41.21, "grad_norm": 0.176952064037323, "learning_rate": 0.0003515151515151515, "loss": 0.2895, "step": 25840 }, { "epoch": 41.23, "grad_norm": 0.3629634380340576, "learning_rate": 0.0003508771929824561, "loss": 0.3161, "step": 25850 }, { "epoch": 41.24, "grad_norm": 0.3649951219558716, "learning_rate": 0.00035023923444976073, "loss": 0.3293, "step": 25860 }, { "epoch": 41.26, "grad_norm": 0.2517475187778473, "learning_rate": 0.0003496012759170654, "loss": 0.3179, "step": 25870 }, { "epoch": 41.28, "grad_norm": 0.18728438019752502, "learning_rate": 0.00034896331738437, "loss": 0.278, "step": 25880 }, { "epoch": 41.29, "grad_norm": 0.3795156180858612, "learning_rate": 0.00034832535885167464, "loss": 0.3076, "step": 25890 }, { "epoch": 41.31, "grad_norm": 0.6630691289901733, "learning_rate": 0.00034768740031897925, "loss": 0.3059, "step": 25900 }, { "epoch": 41.32, "grad_norm": 0.5528631210327148, "learning_rate": 0.00034704944178628387, "loss": 0.287, "step": 25910 }, { "epoch": 41.34, "grad_norm": 0.43808212876319885, "learning_rate": 0.0003464114832535885, "loss": 0.2863, "step": 25920 }, { "epoch": 41.36, "grad_norm": 0.18791545927524567, "learning_rate": 0.0003457735247208931, "loss": 0.3391, "step": 25930 }, { "epoch": 41.37, "grad_norm": 0.3744913935661316, "learning_rate": 0.0003451355661881977, "loss": 0.3083, "step": 25940 }, { "epoch": 41.39, "grad_norm": 0.48115044832229614, "learning_rate": 0.00034449760765550245, "loss": 0.3296, "step": 25950 }, { "epoch": 41.4, "grad_norm": 0.38108351826667786, "learning_rate": 0.00034385964912280706, "loss": 0.2663, "step": 25960 }, { "epoch": 41.42, "grad_norm": 0.3938140869140625, "learning_rate": 0.0003432216905901117, "loss": 0.3353, "step": 25970 }, { "epoch": 41.44, "grad_norm": 0.2402111142873764, "learning_rate": 0.0003425837320574163, "loss": 0.3786, "step": 25980 }, { "epoch": 41.45, "grad_norm": 0.39668262004852295, "learning_rate": 0.0003419457735247209, "loss": 0.2367, "step": 25990 }, { "epoch": 41.47, "grad_norm": 0.3418915569782257, "learning_rate": 0.00034130781499202553, "loss": 0.2675, "step": 26000 }, { "epoch": 41.48, "grad_norm": 0.5036392211914062, "learning_rate": 0.00034066985645933015, "loss": 0.4014, "step": 26010 }, { "epoch": 41.5, "grad_norm": 0.4944436550140381, "learning_rate": 0.0003400318979266348, "loss": 0.3061, "step": 26020 }, { "epoch": 41.52, "grad_norm": 0.36498111486434937, "learning_rate": 0.00033939393939393943, "loss": 0.2843, "step": 26030 }, { "epoch": 41.53, "grad_norm": 0.4892807900905609, "learning_rate": 0.00033875598086124405, "loss": 0.349, "step": 26040 }, { "epoch": 41.55, "grad_norm": 0.23948755860328674, "learning_rate": 0.00033811802232854867, "loss": 0.2737, "step": 26050 }, { "epoch": 41.56, "grad_norm": 0.22319771349430084, "learning_rate": 0.0003374800637958533, "loss": 0.2962, "step": 26060 }, { "epoch": 41.58, "grad_norm": 0.3389337956905365, "learning_rate": 0.0003368421052631579, "loss": 0.3462, "step": 26070 }, { "epoch": 41.59, "grad_norm": 0.15749427676200867, "learning_rate": 0.0003362041467304625, "loss": 0.3707, "step": 26080 }, { "epoch": 41.61, "grad_norm": 0.5987353324890137, "learning_rate": 0.00033556618819776714, "loss": 0.3666, "step": 26090 }, { "epoch": 41.63, "grad_norm": 0.21494194865226746, "learning_rate": 0.0003349282296650718, "loss": 0.296, "step": 26100 }, { "epoch": 41.64, "grad_norm": 0.4202018976211548, "learning_rate": 0.0003342902711323764, "loss": 0.3, "step": 26110 }, { "epoch": 41.66, "grad_norm": 0.33832699060440063, "learning_rate": 0.00033365231259968104, "loss": 0.3593, "step": 26120 }, { "epoch": 41.67, "grad_norm": 0.18312333524227142, "learning_rate": 0.00033301435406698566, "loss": 0.2593, "step": 26130 }, { "epoch": 41.69, "grad_norm": 0.6346192359924316, "learning_rate": 0.0003323763955342903, "loss": 0.3646, "step": 26140 }, { "epoch": 41.71, "grad_norm": 0.2041671872138977, "learning_rate": 0.0003317384370015949, "loss": 0.2913, "step": 26150 }, { "epoch": 41.72, "grad_norm": 0.23247523605823517, "learning_rate": 0.0003311004784688995, "loss": 0.2864, "step": 26160 }, { "epoch": 41.74, "grad_norm": 0.6074626445770264, "learning_rate": 0.0003304625199362041, "loss": 0.2761, "step": 26170 }, { "epoch": 41.75, "grad_norm": 0.2906535267829895, "learning_rate": 0.0003298245614035088, "loss": 0.2929, "step": 26180 }, { "epoch": 41.77, "grad_norm": 0.36293816566467285, "learning_rate": 0.0003291866028708134, "loss": 0.2971, "step": 26190 }, { "epoch": 41.79, "grad_norm": 0.3410266041755676, "learning_rate": 0.00032854864433811803, "loss": 0.2934, "step": 26200 }, { "epoch": 41.8, "grad_norm": 0.5327407717704773, "learning_rate": 0.00032791068580542265, "loss": 0.3566, "step": 26210 }, { "epoch": 41.82, "grad_norm": 0.4243089556694031, "learning_rate": 0.00032727272727272726, "loss": 0.2588, "step": 26220 }, { "epoch": 41.83, "grad_norm": 0.3032602369785309, "learning_rate": 0.0003266347687400319, "loss": 0.2629, "step": 26230 }, { "epoch": 41.85, "grad_norm": 0.4830479621887207, "learning_rate": 0.0003259968102073365, "loss": 0.3939, "step": 26240 }, { "epoch": 41.87, "grad_norm": 0.11178059130907059, "learning_rate": 0.0003253588516746411, "loss": 0.2336, "step": 26250 }, { "epoch": 41.88, "grad_norm": 0.11337348073720932, "learning_rate": 0.0003247208931419458, "loss": 0.2311, "step": 26260 }, { "epoch": 41.9, "grad_norm": 0.42159444093704224, "learning_rate": 0.0003240829346092504, "loss": 0.3213, "step": 26270 }, { "epoch": 41.91, "grad_norm": 0.36887168884277344, "learning_rate": 0.000323444976076555, "loss": 0.2928, "step": 26280 }, { "epoch": 41.93, "grad_norm": 0.4706740081310272, "learning_rate": 0.00032280701754385964, "loss": 0.2848, "step": 26290 }, { "epoch": 41.95, "grad_norm": 0.5931901335716248, "learning_rate": 0.00032216905901116425, "loss": 0.3273, "step": 26300 }, { "epoch": 41.96, "grad_norm": 0.21357150375843048, "learning_rate": 0.00032153110047846887, "loss": 0.2859, "step": 26310 }, { "epoch": 41.98, "grad_norm": 0.48659244179725647, "learning_rate": 0.00032089314194577354, "loss": 0.3588, "step": 26320 }, { "epoch": 41.99, "grad_norm": 0.28712713718414307, "learning_rate": 0.00032025518341307816, "loss": 0.3016, "step": 26330 }, { "epoch": 42.01, "grad_norm": 0.5351189970970154, "learning_rate": 0.00031961722488038283, "loss": 0.3306, "step": 26340 }, { "epoch": 42.03, "grad_norm": 0.22305412590503693, "learning_rate": 0.00031897926634768745, "loss": 0.283, "step": 26350 }, { "epoch": 42.04, "grad_norm": 0.3026597797870636, "learning_rate": 0.00031834130781499206, "loss": 0.2912, "step": 26360 }, { "epoch": 42.06, "grad_norm": 0.3411235809326172, "learning_rate": 0.0003177033492822967, "loss": 0.3273, "step": 26370 }, { "epoch": 42.07, "grad_norm": 0.31902214884757996, "learning_rate": 0.0003170653907496013, "loss": 0.2511, "step": 26380 }, { "epoch": 42.09, "grad_norm": 0.2367999106645584, "learning_rate": 0.0003164274322169059, "loss": 0.2427, "step": 26390 }, { "epoch": 42.11, "grad_norm": 0.24773749709129333, "learning_rate": 0.00031578947368421053, "loss": 0.2991, "step": 26400 }, { "epoch": 42.12, "grad_norm": 0.33940422534942627, "learning_rate": 0.00031515151515151515, "loss": 0.2688, "step": 26410 }, { "epoch": 42.14, "grad_norm": 0.4297594130039215, "learning_rate": 0.0003145135566188198, "loss": 0.2882, "step": 26420 }, { "epoch": 42.15, "grad_norm": 0.38739773631095886, "learning_rate": 0.00031387559808612443, "loss": 0.299, "step": 26430 }, { "epoch": 42.17, "grad_norm": 0.19908225536346436, "learning_rate": 0.00031323763955342905, "loss": 0.2284, "step": 26440 }, { "epoch": 42.19, "grad_norm": 0.2024683952331543, "learning_rate": 0.00031259968102073367, "loss": 0.27, "step": 26450 }, { "epoch": 42.2, "grad_norm": 0.27837881445884705, "learning_rate": 0.0003119617224880383, "loss": 0.3353, "step": 26460 }, { "epoch": 42.22, "grad_norm": 0.25491103529930115, "learning_rate": 0.0003113237639553429, "loss": 0.3187, "step": 26470 }, { "epoch": 42.23, "grad_norm": 0.430846244096756, "learning_rate": 0.0003106858054226475, "loss": 0.2994, "step": 26480 }, { "epoch": 42.25, "grad_norm": 0.3018259108066559, "learning_rate": 0.00031004784688995214, "loss": 0.3171, "step": 26490 }, { "epoch": 42.26, "grad_norm": 0.29348355531692505, "learning_rate": 0.0003094098883572568, "loss": 0.3, "step": 26500 }, { "epoch": 42.28, "grad_norm": 0.3258605897426605, "learning_rate": 0.0003087719298245614, "loss": 0.2795, "step": 26510 }, { "epoch": 42.3, "grad_norm": 0.17465408146381378, "learning_rate": 0.00030813397129186604, "loss": 0.3106, "step": 26520 }, { "epoch": 42.31, "grad_norm": 0.2361348271369934, "learning_rate": 0.00030749601275917066, "loss": 0.2802, "step": 26530 }, { "epoch": 42.33, "grad_norm": 0.18255957961082458, "learning_rate": 0.0003068580542264753, "loss": 0.2396, "step": 26540 }, { "epoch": 42.34, "grad_norm": 0.5694864988327026, "learning_rate": 0.0003062200956937799, "loss": 0.3536, "step": 26550 }, { "epoch": 42.36, "grad_norm": 0.37303659319877625, "learning_rate": 0.0003055821371610845, "loss": 0.263, "step": 26560 }, { "epoch": 42.38, "grad_norm": 0.3398790657520294, "learning_rate": 0.0003049441786283891, "loss": 0.2204, "step": 26570 }, { "epoch": 42.39, "grad_norm": 0.28415796160697937, "learning_rate": 0.0003043062200956938, "loss": 0.3556, "step": 26580 }, { "epoch": 42.41, "grad_norm": 0.4093596637248993, "learning_rate": 0.0003036682615629984, "loss": 0.2996, "step": 26590 }, { "epoch": 42.42, "grad_norm": 0.25546014308929443, "learning_rate": 0.00030303030303030303, "loss": 0.3492, "step": 26600 }, { "epoch": 42.44, "grad_norm": 0.7774071097373962, "learning_rate": 0.00030239234449760765, "loss": 0.3175, "step": 26610 }, { "epoch": 42.46, "grad_norm": 0.7066117525100708, "learning_rate": 0.00030175438596491226, "loss": 0.3957, "step": 26620 }, { "epoch": 42.47, "grad_norm": 0.42754918336868286, "learning_rate": 0.0003011164274322169, "loss": 0.2983, "step": 26630 }, { "epoch": 42.49, "grad_norm": 0.5412092208862305, "learning_rate": 0.0003004784688995215, "loss": 0.3556, "step": 26640 }, { "epoch": 42.5, "grad_norm": 0.19610168039798737, "learning_rate": 0.0002998405103668261, "loss": 0.309, "step": 26650 }, { "epoch": 42.52, "grad_norm": 0.4178897738456726, "learning_rate": 0.0002992025518341308, "loss": 0.2874, "step": 26660 }, { "epoch": 42.54, "grad_norm": 0.24159128963947296, "learning_rate": 0.0002985645933014354, "loss": 0.3137, "step": 26670 }, { "epoch": 42.55, "grad_norm": 0.3273567259311676, "learning_rate": 0.00029792663476874, "loss": 0.3238, "step": 26680 }, { "epoch": 42.57, "grad_norm": 0.48245471715927124, "learning_rate": 0.0002972886762360447, "loss": 0.42, "step": 26690 }, { "epoch": 42.58, "grad_norm": 0.7114046216011047, "learning_rate": 0.0002966507177033493, "loss": 0.3498, "step": 26700 }, { "epoch": 42.6, "grad_norm": 0.6506601572036743, "learning_rate": 0.0002960127591706539, "loss": 0.3406, "step": 26710 }, { "epoch": 42.62, "grad_norm": 0.5458781719207764, "learning_rate": 0.00029537480063795854, "loss": 0.2737, "step": 26720 }, { "epoch": 42.63, "grad_norm": 0.19456742703914642, "learning_rate": 0.00029473684210526316, "loss": 0.31, "step": 26730 }, { "epoch": 42.65, "grad_norm": 0.178878054022789, "learning_rate": 0.00029409888357256783, "loss": 0.3827, "step": 26740 }, { "epoch": 42.66, "grad_norm": 0.40357646346092224, "learning_rate": 0.00029346092503987245, "loss": 0.2755, "step": 26750 }, { "epoch": 42.68, "grad_norm": 0.5037977695465088, "learning_rate": 0.00029282296650717706, "loss": 0.3228, "step": 26760 }, { "epoch": 42.7, "grad_norm": 0.20705698430538177, "learning_rate": 0.0002921850079744817, "loss": 0.2309, "step": 26770 }, { "epoch": 42.71, "grad_norm": 0.22491195797920227, "learning_rate": 0.0002915470494417863, "loss": 0.2569, "step": 26780 }, { "epoch": 42.73, "grad_norm": 0.270967036485672, "learning_rate": 0.0002909090909090909, "loss": 0.2846, "step": 26790 }, { "epoch": 42.74, "grad_norm": 0.1675962209701538, "learning_rate": 0.00029027113237639553, "loss": 0.3479, "step": 26800 }, { "epoch": 42.76, "grad_norm": 0.24002137780189514, "learning_rate": 0.0002896331738437002, "loss": 0.3825, "step": 26810 }, { "epoch": 42.78, "grad_norm": 0.7108230590820312, "learning_rate": 0.0002889952153110048, "loss": 0.4057, "step": 26820 }, { "epoch": 42.79, "grad_norm": 0.5931742787361145, "learning_rate": 0.00028835725677830943, "loss": 0.3256, "step": 26830 }, { "epoch": 42.81, "grad_norm": 0.4527370035648346, "learning_rate": 0.00028771929824561405, "loss": 0.2943, "step": 26840 }, { "epoch": 42.82, "grad_norm": 0.6159200072288513, "learning_rate": 0.00028708133971291867, "loss": 0.3117, "step": 26850 }, { "epoch": 42.84, "grad_norm": 0.1614978313446045, "learning_rate": 0.0002864433811802233, "loss": 0.2834, "step": 26860 }, { "epoch": 42.85, "grad_norm": 0.37030118703842163, "learning_rate": 0.0002858054226475279, "loss": 0.3029, "step": 26870 }, { "epoch": 42.87, "grad_norm": 0.13131965696811676, "learning_rate": 0.0002851674641148325, "loss": 0.27, "step": 26880 }, { "epoch": 42.89, "grad_norm": 0.42525768280029297, "learning_rate": 0.0002845295055821372, "loss": 0.3307, "step": 26890 }, { "epoch": 42.9, "grad_norm": 0.17870941758155823, "learning_rate": 0.0002838915470494418, "loss": 0.2564, "step": 26900 }, { "epoch": 42.92, "grad_norm": 0.7622866630554199, "learning_rate": 0.0002832535885167464, "loss": 0.3327, "step": 26910 }, { "epoch": 42.93, "grad_norm": 0.5731341242790222, "learning_rate": 0.00028261562998405104, "loss": 0.3194, "step": 26920 }, { "epoch": 42.95, "grad_norm": 0.3763886094093323, "learning_rate": 0.00028197767145135566, "loss": 0.2775, "step": 26930 }, { "epoch": 42.97, "grad_norm": 0.33604711294174194, "learning_rate": 0.0002813397129186603, "loss": 0.2825, "step": 26940 }, { "epoch": 42.98, "grad_norm": 0.2752174437046051, "learning_rate": 0.0002807017543859649, "loss": 0.3769, "step": 26950 }, { "epoch": 43.0, "grad_norm": 0.4602324962615967, "learning_rate": 0.0002800637958532695, "loss": 0.3297, "step": 26960 }, { "epoch": 43.01, "grad_norm": 0.263231486082077, "learning_rate": 0.0002794258373205742, "loss": 0.226, "step": 26970 }, { "epoch": 43.03, "grad_norm": 0.085409976541996, "learning_rate": 0.0002787878787878788, "loss": 0.2887, "step": 26980 }, { "epoch": 43.05, "grad_norm": 0.3499665856361389, "learning_rate": 0.0002781499202551834, "loss": 0.3344, "step": 26990 }, { "epoch": 43.06, "grad_norm": 0.6164402365684509, "learning_rate": 0.00027751196172248803, "loss": 0.3555, "step": 27000 }, { "epoch": 43.08, "grad_norm": 0.22411352396011353, "learning_rate": 0.00027687400318979265, "loss": 0.3044, "step": 27010 }, { "epoch": 43.09, "grad_norm": 0.5322696566581726, "learning_rate": 0.00027623604465709726, "loss": 0.2515, "step": 27020 }, { "epoch": 43.11, "grad_norm": 0.382097989320755, "learning_rate": 0.0002755980861244019, "loss": 0.399, "step": 27030 }, { "epoch": 43.13, "grad_norm": 0.13839659094810486, "learning_rate": 0.0002749601275917065, "loss": 0.3297, "step": 27040 }, { "epoch": 43.14, "grad_norm": 0.22009891271591187, "learning_rate": 0.00027432216905901117, "loss": 0.277, "step": 27050 }, { "epoch": 43.16, "grad_norm": 0.41159576177597046, "learning_rate": 0.00027368421052631584, "loss": 0.3107, "step": 27060 }, { "epoch": 43.17, "grad_norm": 0.21699748933315277, "learning_rate": 0.00027304625199362046, "loss": 0.2765, "step": 27070 }, { "epoch": 43.19, "grad_norm": 0.21291545033454895, "learning_rate": 0.0002724082934609251, "loss": 0.2479, "step": 27080 }, { "epoch": 43.21, "grad_norm": 0.20848800241947174, "learning_rate": 0.0002717703349282297, "loss": 0.316, "step": 27090 }, { "epoch": 43.22, "grad_norm": 0.41950148344039917, "learning_rate": 0.0002711323763955343, "loss": 0.3588, "step": 27100 }, { "epoch": 43.24, "grad_norm": 0.33547741174697876, "learning_rate": 0.0002704944178628389, "loss": 0.2836, "step": 27110 }, { "epoch": 43.25, "grad_norm": 1.4663803577423096, "learning_rate": 0.00026985645933014354, "loss": 0.3852, "step": 27120 }, { "epoch": 43.27, "grad_norm": 0.2404787391424179, "learning_rate": 0.0002692185007974482, "loss": 0.2629, "step": 27130 }, { "epoch": 43.29, "grad_norm": 0.08930987864732742, "learning_rate": 0.00026858054226475283, "loss": 0.1914, "step": 27140 }, { "epoch": 43.3, "grad_norm": 0.3522126376628876, "learning_rate": 0.00026794258373205745, "loss": 0.3222, "step": 27150 }, { "epoch": 43.32, "grad_norm": 2.05954909324646, "learning_rate": 0.00026730462519936206, "loss": 0.3058, "step": 27160 }, { "epoch": 43.33, "grad_norm": 0.36962321400642395, "learning_rate": 0.0002666666666666667, "loss": 0.2829, "step": 27170 }, { "epoch": 43.35, "grad_norm": 0.18911263346672058, "learning_rate": 0.0002660287081339713, "loss": 0.3731, "step": 27180 }, { "epoch": 43.37, "grad_norm": 0.19024628400802612, "learning_rate": 0.0002653907496012759, "loss": 0.2726, "step": 27190 }, { "epoch": 43.38, "grad_norm": 0.20783045887947083, "learning_rate": 0.00026475279106858053, "loss": 0.2787, "step": 27200 }, { "epoch": 43.4, "grad_norm": 1.8203842639923096, "learning_rate": 0.0002641148325358852, "loss": 0.2755, "step": 27210 }, { "epoch": 43.41, "grad_norm": 0.41969624161720276, "learning_rate": 0.0002634768740031898, "loss": 0.3315, "step": 27220 }, { "epoch": 43.43, "grad_norm": 0.17119190096855164, "learning_rate": 0.00026283891547049443, "loss": 0.2718, "step": 27230 }, { "epoch": 43.44, "grad_norm": 0.24514427781105042, "learning_rate": 0.00026220095693779905, "loss": 0.2831, "step": 27240 }, { "epoch": 43.46, "grad_norm": 0.24649424850940704, "learning_rate": 0.00026156299840510367, "loss": 0.2647, "step": 27250 }, { "epoch": 43.48, "grad_norm": 0.3236254155635834, "learning_rate": 0.0002609250398724083, "loss": 0.2733, "step": 27260 }, { "epoch": 43.49, "grad_norm": 0.4180354177951813, "learning_rate": 0.0002602870813397129, "loss": 0.3352, "step": 27270 }, { "epoch": 43.51, "grad_norm": 0.4652386009693146, "learning_rate": 0.0002596491228070175, "loss": 0.301, "step": 27280 }, { "epoch": 43.52, "grad_norm": 0.30387723445892334, "learning_rate": 0.0002590111642743222, "loss": 0.2286, "step": 27290 }, { "epoch": 43.54, "grad_norm": 0.13368535041809082, "learning_rate": 0.0002583732057416268, "loss": 0.2803, "step": 27300 }, { "epoch": 43.56, "grad_norm": 0.10856983065605164, "learning_rate": 0.0002577352472089314, "loss": 0.3184, "step": 27310 }, { "epoch": 43.57, "grad_norm": 0.3115447759628296, "learning_rate": 0.00025709728867623604, "loss": 0.3274, "step": 27320 }, { "epoch": 43.59, "grad_norm": 0.3176775276660919, "learning_rate": 0.00025645933014354066, "loss": 0.3476, "step": 27330 }, { "epoch": 43.6, "grad_norm": 0.17715303599834442, "learning_rate": 0.0002558213716108453, "loss": 0.2684, "step": 27340 }, { "epoch": 43.62, "grad_norm": 0.39675870537757874, "learning_rate": 0.0002551834130781499, "loss": 0.317, "step": 27350 }, { "epoch": 43.64, "grad_norm": 0.29539576172828674, "learning_rate": 0.0002545454545454545, "loss": 0.3124, "step": 27360 }, { "epoch": 43.65, "grad_norm": 0.1516566276550293, "learning_rate": 0.0002539074960127592, "loss": 0.3537, "step": 27370 }, { "epoch": 43.67, "grad_norm": 0.3762792944908142, "learning_rate": 0.0002532695374800638, "loss": 0.3026, "step": 27380 }, { "epoch": 43.68, "grad_norm": 0.15428495407104492, "learning_rate": 0.0002526315789473684, "loss": 0.2849, "step": 27390 }, { "epoch": 43.7, "grad_norm": 0.22668874263763428, "learning_rate": 0.00025199362041467303, "loss": 0.2174, "step": 27400 }, { "epoch": 43.72, "grad_norm": 0.12878923118114471, "learning_rate": 0.00025135566188197765, "loss": 0.2367, "step": 27410 }, { "epoch": 43.73, "grad_norm": 0.1742442101240158, "learning_rate": 0.00025071770334928226, "loss": 0.3225, "step": 27420 }, { "epoch": 43.75, "grad_norm": 0.2178335189819336, "learning_rate": 0.0002500797448165869, "loss": 0.3674, "step": 27430 }, { "epoch": 43.76, "grad_norm": 0.07598412036895752, "learning_rate": 0.00024944178628389155, "loss": 0.3825, "step": 27440 }, { "epoch": 43.78, "grad_norm": 0.3597804605960846, "learning_rate": 0.00024880382775119617, "loss": 0.2679, "step": 27450 }, { "epoch": 43.8, "grad_norm": 0.5584509968757629, "learning_rate": 0.0002481658692185008, "loss": 0.3459, "step": 27460 }, { "epoch": 43.81, "grad_norm": 0.19587256014347076, "learning_rate": 0.00024752791068580546, "loss": 0.3732, "step": 27470 }, { "epoch": 43.83, "grad_norm": 0.4442209303379059, "learning_rate": 0.0002468899521531101, "loss": 0.3049, "step": 27480 }, { "epoch": 43.84, "grad_norm": 0.259143590927124, "learning_rate": 0.0002462519936204147, "loss": 0.2928, "step": 27490 }, { "epoch": 43.86, "grad_norm": 0.19528359174728394, "learning_rate": 0.0002456140350877193, "loss": 0.2799, "step": 27500 }, { "epoch": 43.88, "grad_norm": 0.47608378529548645, "learning_rate": 0.0002449760765550239, "loss": 0.3079, "step": 27510 }, { "epoch": 43.89, "grad_norm": 0.2542645335197449, "learning_rate": 0.00024433811802232854, "loss": 0.2834, "step": 27520 }, { "epoch": 43.91, "grad_norm": 0.37310686707496643, "learning_rate": 0.00024370015948963318, "loss": 0.3171, "step": 27530 }, { "epoch": 43.92, "grad_norm": 0.6291790008544922, "learning_rate": 0.00024306220095693783, "loss": 0.3439, "step": 27540 }, { "epoch": 43.94, "grad_norm": 0.5721063613891602, "learning_rate": 0.00024242424242424245, "loss": 0.3039, "step": 27550 }, { "epoch": 43.96, "grad_norm": 0.1536693423986435, "learning_rate": 0.00024178628389154706, "loss": 0.2981, "step": 27560 }, { "epoch": 43.97, "grad_norm": 0.3179001212120056, "learning_rate": 0.00024114832535885168, "loss": 0.2213, "step": 27570 }, { "epoch": 43.99, "grad_norm": 0.19436044991016388, "learning_rate": 0.00024051036682615632, "loss": 0.2756, "step": 27580 }, { "epoch": 44.0, "grad_norm": 0.21824301779270172, "learning_rate": 0.00023987240829346094, "loss": 0.2627, "step": 27590 }, { "epoch": 44.02, "grad_norm": 0.38110193610191345, "learning_rate": 0.00023923444976076556, "loss": 0.248, "step": 27600 }, { "epoch": 44.04, "grad_norm": 0.2160405069589615, "learning_rate": 0.00023859649122807017, "loss": 0.2404, "step": 27610 }, { "epoch": 44.05, "grad_norm": 0.3136873245239258, "learning_rate": 0.00023795853269537482, "loss": 0.3034, "step": 27620 }, { "epoch": 44.07, "grad_norm": 0.21699780225753784, "learning_rate": 0.00023732057416267943, "loss": 0.3371, "step": 27630 }, { "epoch": 44.08, "grad_norm": 0.3122328221797943, "learning_rate": 0.00023668261562998405, "loss": 0.2569, "step": 27640 }, { "epoch": 44.1, "grad_norm": 0.45483753085136414, "learning_rate": 0.00023604465709728867, "loss": 0.2873, "step": 27650 }, { "epoch": 44.11, "grad_norm": 0.39906224608421326, "learning_rate": 0.0002354066985645933, "loss": 0.3227, "step": 27660 }, { "epoch": 44.13, "grad_norm": 0.24932830035686493, "learning_rate": 0.00023476874003189793, "loss": 0.2343, "step": 27670 }, { "epoch": 44.15, "grad_norm": 0.09502261132001877, "learning_rate": 0.00023413078149920255, "loss": 0.3065, "step": 27680 }, { "epoch": 44.16, "grad_norm": 0.3910047709941864, "learning_rate": 0.00023349282296650716, "loss": 0.3902, "step": 27690 }, { "epoch": 44.18, "grad_norm": 0.2578485310077667, "learning_rate": 0.0002328548644338118, "loss": 0.3062, "step": 27700 }, { "epoch": 44.19, "grad_norm": 0.40186047554016113, "learning_rate": 0.00023221690590111642, "loss": 0.3129, "step": 27710 }, { "epoch": 44.21, "grad_norm": 0.8674927353858948, "learning_rate": 0.00023157894736842107, "loss": 0.3522, "step": 27720 }, { "epoch": 44.23, "grad_norm": 0.1684367060661316, "learning_rate": 0.00023094098883572568, "loss": 0.2683, "step": 27730 }, { "epoch": 44.24, "grad_norm": 0.34888872504234314, "learning_rate": 0.00023030303030303033, "loss": 0.2477, "step": 27740 }, { "epoch": 44.26, "grad_norm": 0.5431171655654907, "learning_rate": 0.00022966507177033495, "loss": 0.35, "step": 27750 }, { "epoch": 44.27, "grad_norm": 0.3396085202693939, "learning_rate": 0.00022902711323763956, "loss": 0.3115, "step": 27760 }, { "epoch": 44.29, "grad_norm": 0.327421635389328, "learning_rate": 0.00022838915470494418, "loss": 0.3153, "step": 27770 }, { "epoch": 44.31, "grad_norm": 0.34646356105804443, "learning_rate": 0.00022775119617224882, "loss": 0.3603, "step": 27780 }, { "epoch": 44.32, "grad_norm": 0.3496292233467102, "learning_rate": 0.00022711323763955344, "loss": 0.2877, "step": 27790 }, { "epoch": 44.34, "grad_norm": 0.19173116981983185, "learning_rate": 0.00022647527910685806, "loss": 0.2755, "step": 27800 }, { "epoch": 44.35, "grad_norm": 0.6964245438575745, "learning_rate": 0.00022583732057416267, "loss": 0.3568, "step": 27810 }, { "epoch": 44.37, "grad_norm": 0.283237099647522, "learning_rate": 0.00022519936204146732, "loss": 0.3004, "step": 27820 }, { "epoch": 44.39, "grad_norm": 0.3077571988105774, "learning_rate": 0.00022456140350877193, "loss": 0.3139, "step": 27830 }, { "epoch": 44.4, "grad_norm": 0.44178569316864014, "learning_rate": 0.00022392344497607655, "loss": 0.2253, "step": 27840 }, { "epoch": 44.42, "grad_norm": 0.23611438274383545, "learning_rate": 0.00022328548644338117, "loss": 0.3357, "step": 27850 }, { "epoch": 44.43, "grad_norm": 0.402852326631546, "learning_rate": 0.0002226475279106858, "loss": 0.3024, "step": 27860 }, { "epoch": 44.45, "grad_norm": 0.5001922249794006, "learning_rate": 0.00022200956937799043, "loss": 0.2424, "step": 27870 }, { "epoch": 44.47, "grad_norm": 0.5164135098457336, "learning_rate": 0.00022137161084529505, "loss": 0.2952, "step": 27880 }, { "epoch": 44.48, "grad_norm": 0.35648113489151, "learning_rate": 0.00022073365231259966, "loss": 0.2745, "step": 27890 }, { "epoch": 44.5, "grad_norm": 0.6341779232025146, "learning_rate": 0.00022009569377990433, "loss": 0.3428, "step": 27900 }, { "epoch": 44.51, "grad_norm": 0.5282499194145203, "learning_rate": 0.00021945773524720895, "loss": 0.3385, "step": 27910 }, { "epoch": 44.53, "grad_norm": 0.34089717268943787, "learning_rate": 0.00021881977671451357, "loss": 0.3049, "step": 27920 }, { "epoch": 44.55, "grad_norm": 0.44440943002700806, "learning_rate": 0.00021818181818181818, "loss": 0.3613, "step": 27930 }, { "epoch": 44.56, "grad_norm": 0.3817773461341858, "learning_rate": 0.00021754385964912283, "loss": 0.3126, "step": 27940 }, { "epoch": 44.58, "grad_norm": 0.24557062983512878, "learning_rate": 0.00021690590111642745, "loss": 0.2949, "step": 27950 }, { "epoch": 44.59, "grad_norm": 0.7320693135261536, "learning_rate": 0.00021626794258373206, "loss": 0.3664, "step": 27960 }, { "epoch": 44.61, "grad_norm": 0.4003210663795471, "learning_rate": 0.00021562998405103668, "loss": 0.3629, "step": 27970 }, { "epoch": 44.63, "grad_norm": 0.30994275212287903, "learning_rate": 0.00021499202551834132, "loss": 0.2988, "step": 27980 }, { "epoch": 44.64, "grad_norm": 0.2852626442909241, "learning_rate": 0.00021435406698564594, "loss": 0.2767, "step": 27990 }, { "epoch": 44.66, "grad_norm": 0.2598101794719696, "learning_rate": 0.00021371610845295056, "loss": 0.3016, "step": 28000 }, { "epoch": 44.67, "grad_norm": 0.1722613275051117, "learning_rate": 0.00021307814992025517, "loss": 0.1633, "step": 28010 }, { "epoch": 44.69, "grad_norm": 0.2119804471731186, "learning_rate": 0.00021244019138755982, "loss": 0.2715, "step": 28020 }, { "epoch": 44.7, "grad_norm": 0.27806442975997925, "learning_rate": 0.00021180223285486443, "loss": 0.2886, "step": 28030 }, { "epoch": 44.72, "grad_norm": 0.47258105874061584, "learning_rate": 0.00021116427432216905, "loss": 0.2836, "step": 28040 }, { "epoch": 44.74, "grad_norm": 0.295608252286911, "learning_rate": 0.00021052631578947367, "loss": 0.306, "step": 28050 }, { "epoch": 44.75, "grad_norm": 0.2584683895111084, "learning_rate": 0.0002098883572567783, "loss": 0.3159, "step": 28060 }, { "epoch": 44.77, "grad_norm": 0.41258344054222107, "learning_rate": 0.00020925039872408293, "loss": 0.3437, "step": 28070 }, { "epoch": 44.78, "grad_norm": 0.13248884677886963, "learning_rate": 0.00020861244019138755, "loss": 0.3474, "step": 28080 }, { "epoch": 44.8, "grad_norm": 0.2799845337867737, "learning_rate": 0.0002079744816586922, "loss": 0.3041, "step": 28090 }, { "epoch": 44.82, "grad_norm": 0.34866270422935486, "learning_rate": 0.00020733652312599683, "loss": 0.2709, "step": 28100 }, { "epoch": 44.83, "grad_norm": 0.22995953261852264, "learning_rate": 0.00020669856459330145, "loss": 0.3103, "step": 28110 }, { "epoch": 44.85, "grad_norm": 0.6735871434211731, "learning_rate": 0.00020606060606060607, "loss": 0.3272, "step": 28120 }, { "epoch": 44.86, "grad_norm": 0.8896424770355225, "learning_rate": 0.00020542264752791068, "loss": 0.2944, "step": 28130 }, { "epoch": 44.88, "grad_norm": 0.12463853508234024, "learning_rate": 0.00020478468899521533, "loss": 0.3212, "step": 28140 }, { "epoch": 44.9, "grad_norm": 0.29286473989486694, "learning_rate": 0.00020414673046251995, "loss": 0.2537, "step": 28150 }, { "epoch": 44.91, "grad_norm": 0.4333782494068146, "learning_rate": 0.00020350877192982456, "loss": 0.2964, "step": 28160 }, { "epoch": 44.93, "grad_norm": 0.25017327070236206, "learning_rate": 0.00020287081339712918, "loss": 0.3095, "step": 28170 }, { "epoch": 44.94, "grad_norm": 0.18606650829315186, "learning_rate": 0.00020223285486443382, "loss": 0.2461, "step": 28180 }, { "epoch": 44.96, "grad_norm": 0.14284665882587433, "learning_rate": 0.00020159489633173844, "loss": 0.2787, "step": 28190 }, { "epoch": 44.98, "grad_norm": 0.6224771738052368, "learning_rate": 0.00020095693779904306, "loss": 0.3598, "step": 28200 }, { "epoch": 44.99, "grad_norm": 0.32806506752967834, "learning_rate": 0.00020031897926634767, "loss": 0.2615, "step": 28210 }, { "epoch": 45.01, "grad_norm": 0.45343583822250366, "learning_rate": 0.00019968102073365232, "loss": 0.3322, "step": 28220 }, { "epoch": 45.02, "grad_norm": 0.18727990984916687, "learning_rate": 0.00019904306220095693, "loss": 0.2696, "step": 28230 }, { "epoch": 45.04, "grad_norm": 0.28035393357276917, "learning_rate": 0.00019840510366826155, "loss": 0.256, "step": 28240 }, { "epoch": 45.06, "grad_norm": 0.37490570545196533, "learning_rate": 0.00019776714513556617, "loss": 0.3105, "step": 28250 }, { "epoch": 45.07, "grad_norm": 0.27727392315864563, "learning_rate": 0.0001971291866028708, "loss": 0.2573, "step": 28260 }, { "epoch": 45.09, "grad_norm": 0.2856091856956482, "learning_rate": 0.00019649122807017543, "loss": 0.297, "step": 28270 }, { "epoch": 45.1, "grad_norm": 0.3423827886581421, "learning_rate": 0.00019585326953748007, "loss": 0.3219, "step": 28280 }, { "epoch": 45.12, "grad_norm": 0.2217862457036972, "learning_rate": 0.0001952153110047847, "loss": 0.2413, "step": 28290 }, { "epoch": 45.14, "grad_norm": 0.49296557903289795, "learning_rate": 0.00019457735247208933, "loss": 0.2936, "step": 28300 }, { "epoch": 45.15, "grad_norm": 0.28188827633857727, "learning_rate": 0.00019393939393939395, "loss": 0.2937, "step": 28310 }, { "epoch": 45.17, "grad_norm": 0.6118289232254028, "learning_rate": 0.00019330143540669857, "loss": 0.3172, "step": 28320 }, { "epoch": 45.18, "grad_norm": 0.38920632004737854, "learning_rate": 0.0001926634768740032, "loss": 0.3467, "step": 28330 }, { "epoch": 45.2, "grad_norm": 0.2669709324836731, "learning_rate": 0.00019202551834130783, "loss": 0.2941, "step": 28340 }, { "epoch": 45.22, "grad_norm": 0.17795272171497345, "learning_rate": 0.00019138755980861245, "loss": 0.3487, "step": 28350 }, { "epoch": 45.23, "grad_norm": 0.3200840651988983, "learning_rate": 0.00019074960127591706, "loss": 0.3224, "step": 28360 }, { "epoch": 45.25, "grad_norm": 0.3185681700706482, "learning_rate": 0.0001901116427432217, "loss": 0.2754, "step": 28370 }, { "epoch": 45.26, "grad_norm": 0.35010969638824463, "learning_rate": 0.00018947368421052632, "loss": 0.2843, "step": 28380 }, { "epoch": 45.28, "grad_norm": 0.19338567554950714, "learning_rate": 0.00018883572567783094, "loss": 0.3434, "step": 28390 }, { "epoch": 45.3, "grad_norm": 0.13185134530067444, "learning_rate": 0.00018819776714513556, "loss": 0.2991, "step": 28400 }, { "epoch": 45.31, "grad_norm": 0.2024078220129013, "learning_rate": 0.0001875598086124402, "loss": 0.3248, "step": 28410 }, { "epoch": 45.33, "grad_norm": 0.22243604063987732, "learning_rate": 0.00018692185007974482, "loss": 0.2409, "step": 28420 }, { "epoch": 45.34, "grad_norm": 0.5372808575630188, "learning_rate": 0.00018628389154704943, "loss": 0.2738, "step": 28430 }, { "epoch": 45.36, "grad_norm": 0.17532573640346527, "learning_rate": 0.00018564593301435405, "loss": 0.2954, "step": 28440 }, { "epoch": 45.37, "grad_norm": 0.2568674087524414, "learning_rate": 0.0001850079744816587, "loss": 0.2461, "step": 28450 }, { "epoch": 45.39, "grad_norm": 0.36683690547943115, "learning_rate": 0.00018437001594896334, "loss": 0.301, "step": 28460 }, { "epoch": 45.41, "grad_norm": 0.32988253235816956, "learning_rate": 0.00018373205741626796, "loss": 0.2522, "step": 28470 }, { "epoch": 45.42, "grad_norm": 0.28334781527519226, "learning_rate": 0.00018309409888357257, "loss": 0.2795, "step": 28480 }, { "epoch": 45.44, "grad_norm": 0.26257234811782837, "learning_rate": 0.00018245614035087722, "loss": 0.3357, "step": 28490 }, { "epoch": 45.45, "grad_norm": 0.376924067735672, "learning_rate": 0.00018181818181818183, "loss": 0.3157, "step": 28500 }, { "epoch": 45.47, "grad_norm": 0.06856755167245865, "learning_rate": 0.00018118022328548645, "loss": 0.3126, "step": 28510 }, { "epoch": 45.49, "grad_norm": 0.3555695414543152, "learning_rate": 0.00018054226475279107, "loss": 0.2759, "step": 28520 }, { "epoch": 45.5, "grad_norm": 0.44711726903915405, "learning_rate": 0.0001799043062200957, "loss": 0.3947, "step": 28530 }, { "epoch": 45.52, "grad_norm": 0.5563350319862366, "learning_rate": 0.00017926634768740033, "loss": 0.3077, "step": 28540 }, { "epoch": 45.53, "grad_norm": 0.22353103756904602, "learning_rate": 0.00017862838915470495, "loss": 0.319, "step": 28550 }, { "epoch": 45.55, "grad_norm": 0.23482950031757355, "learning_rate": 0.00017799043062200956, "loss": 0.2164, "step": 28560 }, { "epoch": 45.57, "grad_norm": 0.3976686894893646, "learning_rate": 0.0001773524720893142, "loss": 0.2903, "step": 28570 }, { "epoch": 45.58, "grad_norm": 0.31743720173835754, "learning_rate": 0.00017671451355661882, "loss": 0.3855, "step": 28580 }, { "epoch": 45.6, "grad_norm": 0.2157888561487198, "learning_rate": 0.00017607655502392344, "loss": 0.2198, "step": 28590 }, { "epoch": 45.61, "grad_norm": 0.42237186431884766, "learning_rate": 0.00017543859649122806, "loss": 0.3859, "step": 28600 }, { "epoch": 45.63, "grad_norm": 0.1533055305480957, "learning_rate": 0.0001748006379585327, "loss": 0.2198, "step": 28610 }, { "epoch": 45.65, "grad_norm": 0.16389824450016022, "learning_rate": 0.00017416267942583732, "loss": 0.2789, "step": 28620 }, { "epoch": 45.66, "grad_norm": 0.4902271032333374, "learning_rate": 0.00017352472089314193, "loss": 0.3184, "step": 28630 }, { "epoch": 45.68, "grad_norm": 0.31961241364479065, "learning_rate": 0.00017288676236044655, "loss": 0.2653, "step": 28640 }, { "epoch": 45.69, "grad_norm": 1.2578412294387817, "learning_rate": 0.00017224880382775122, "loss": 0.2537, "step": 28650 }, { "epoch": 45.71, "grad_norm": 0.19706355035305023, "learning_rate": 0.00017161084529505584, "loss": 0.2666, "step": 28660 }, { "epoch": 45.73, "grad_norm": 0.17647922039031982, "learning_rate": 0.00017097288676236046, "loss": 0.2909, "step": 28670 }, { "epoch": 45.74, "grad_norm": 0.20171548426151276, "learning_rate": 0.00017033492822966507, "loss": 0.3303, "step": 28680 }, { "epoch": 45.76, "grad_norm": 0.1995372623205185, "learning_rate": 0.00016969696969696972, "loss": 0.2621, "step": 28690 }, { "epoch": 45.77, "grad_norm": 0.23527149856090546, "learning_rate": 0.00016905901116427433, "loss": 0.3213, "step": 28700 }, { "epoch": 45.79, "grad_norm": 0.2143118530511856, "learning_rate": 0.00016842105263157895, "loss": 0.2584, "step": 28710 }, { "epoch": 45.81, "grad_norm": 0.05645094811916351, "learning_rate": 0.00016778309409888357, "loss": 0.2731, "step": 28720 }, { "epoch": 45.82, "grad_norm": 0.6314740777015686, "learning_rate": 0.0001671451355661882, "loss": 0.3243, "step": 28730 }, { "epoch": 45.84, "grad_norm": 0.15495331585407257, "learning_rate": 0.00016650717703349283, "loss": 0.2936, "step": 28740 }, { "epoch": 45.85, "grad_norm": 0.47223085165023804, "learning_rate": 0.00016586921850079745, "loss": 0.2497, "step": 28750 }, { "epoch": 45.87, "grad_norm": 0.3611065447330475, "learning_rate": 0.00016523125996810206, "loss": 0.356, "step": 28760 }, { "epoch": 45.89, "grad_norm": 0.38601604104042053, "learning_rate": 0.0001645933014354067, "loss": 0.2902, "step": 28770 }, { "epoch": 45.9, "grad_norm": 0.36279168725013733, "learning_rate": 0.00016395534290271132, "loss": 0.2713, "step": 28780 }, { "epoch": 45.92, "grad_norm": 0.15732410550117493, "learning_rate": 0.00016331738437001594, "loss": 0.323, "step": 28790 }, { "epoch": 45.93, "grad_norm": 0.21142350137233734, "learning_rate": 0.00016267942583732056, "loss": 0.2383, "step": 28800 }, { "epoch": 45.95, "grad_norm": 0.17822466790676117, "learning_rate": 0.0001620414673046252, "loss": 0.2362, "step": 28810 }, { "epoch": 45.96, "grad_norm": 0.21047089993953705, "learning_rate": 0.00016140350877192982, "loss": 0.3138, "step": 28820 }, { "epoch": 45.98, "grad_norm": 0.4122728109359741, "learning_rate": 0.00016076555023923443, "loss": 0.2932, "step": 28830 }, { "epoch": 46.0, "grad_norm": 0.27697211503982544, "learning_rate": 0.00016012759170653908, "loss": 0.3276, "step": 28840 }, { "epoch": 46.01, "grad_norm": 0.32727357745170593, "learning_rate": 0.00015948963317384372, "loss": 0.2403, "step": 28850 }, { "epoch": 46.03, "grad_norm": 0.26314985752105713, "learning_rate": 0.00015885167464114834, "loss": 0.2484, "step": 28860 }, { "epoch": 46.04, "grad_norm": 0.12502922117710114, "learning_rate": 0.00015821371610845296, "loss": 0.2411, "step": 28870 }, { "epoch": 46.06, "grad_norm": 0.35499653220176697, "learning_rate": 0.00015757575757575757, "loss": 0.2731, "step": 28880 }, { "epoch": 46.08, "grad_norm": 0.4001838266849518, "learning_rate": 0.00015693779904306222, "loss": 0.256, "step": 28890 }, { "epoch": 46.09, "grad_norm": 0.49199333786964417, "learning_rate": 0.00015629984051036683, "loss": 0.3412, "step": 28900 }, { "epoch": 46.11, "grad_norm": 0.22476720809936523, "learning_rate": 0.00015566188197767145, "loss": 0.2704, "step": 28910 }, { "epoch": 46.12, "grad_norm": 0.42547646164894104, "learning_rate": 0.00015502392344497607, "loss": 0.3282, "step": 28920 }, { "epoch": 46.14, "grad_norm": 0.14458052814006805, "learning_rate": 0.0001543859649122807, "loss": 0.3036, "step": 28930 }, { "epoch": 46.16, "grad_norm": 0.23600299656391144, "learning_rate": 0.00015374800637958533, "loss": 0.3372, "step": 28940 }, { "epoch": 46.17, "grad_norm": 0.31214261054992676, "learning_rate": 0.00015311004784688995, "loss": 0.2513, "step": 28950 }, { "epoch": 46.19, "grad_norm": 0.6175329685211182, "learning_rate": 0.00015247208931419456, "loss": 0.297, "step": 28960 }, { "epoch": 46.2, "grad_norm": 0.3160916864871979, "learning_rate": 0.0001518341307814992, "loss": 0.3279, "step": 28970 }, { "epoch": 46.22, "grad_norm": 0.37880146503448486, "learning_rate": 0.00015119617224880382, "loss": 0.253, "step": 28980 }, { "epoch": 46.24, "grad_norm": 0.16760538518428802, "learning_rate": 0.00015055821371610844, "loss": 0.2691, "step": 28990 }, { "epoch": 46.25, "grad_norm": 0.34026291966438293, "learning_rate": 0.00014992025518341306, "loss": 0.4, "step": 29000 }, { "epoch": 46.27, "grad_norm": 0.22685553133487701, "learning_rate": 0.0001492822966507177, "loss": 0.2779, "step": 29010 }, { "epoch": 46.28, "grad_norm": 0.3551049530506134, "learning_rate": 0.00014864433811802235, "loss": 0.3183, "step": 29020 }, { "epoch": 46.3, "grad_norm": 0.5112924575805664, "learning_rate": 0.00014800637958532696, "loss": 0.3357, "step": 29030 }, { "epoch": 46.32, "grad_norm": 0.4620679020881653, "learning_rate": 0.00014736842105263158, "loss": 0.2737, "step": 29040 }, { "epoch": 46.33, "grad_norm": 0.25304239988327026, "learning_rate": 0.00014673046251993622, "loss": 0.2402, "step": 29050 }, { "epoch": 46.35, "grad_norm": 0.20634669065475464, "learning_rate": 0.00014609250398724084, "loss": 0.2407, "step": 29060 }, { "epoch": 46.36, "grad_norm": 0.4903095066547394, "learning_rate": 0.00014545454545454546, "loss": 0.3264, "step": 29070 }, { "epoch": 46.38, "grad_norm": 0.09498465806245804, "learning_rate": 0.0001448165869218501, "loss": 0.2885, "step": 29080 }, { "epoch": 46.4, "grad_norm": 0.292100191116333, "learning_rate": 0.00014417862838915472, "loss": 0.3582, "step": 29090 }, { "epoch": 46.41, "grad_norm": 0.23083628714084625, "learning_rate": 0.00014354066985645933, "loss": 0.2363, "step": 29100 }, { "epoch": 46.43, "grad_norm": 0.3492584228515625, "learning_rate": 0.00014290271132376395, "loss": 0.3224, "step": 29110 }, { "epoch": 46.44, "grad_norm": 0.5817916393280029, "learning_rate": 0.0001422647527910686, "loss": 0.3166, "step": 29120 }, { "epoch": 46.46, "grad_norm": 0.3647211194038391, "learning_rate": 0.0001416267942583732, "loss": 0.2748, "step": 29130 }, { "epoch": 46.48, "grad_norm": 0.46294817328453064, "learning_rate": 0.00014098883572567783, "loss": 0.3511, "step": 29140 }, { "epoch": 46.49, "grad_norm": 0.09461899846792221, "learning_rate": 0.00014035087719298245, "loss": 0.2238, "step": 29150 }, { "epoch": 46.51, "grad_norm": 0.3371366262435913, "learning_rate": 0.0001397129186602871, "loss": 0.314, "step": 29160 }, { "epoch": 46.52, "grad_norm": 0.4762924313545227, "learning_rate": 0.0001390749601275917, "loss": 0.2712, "step": 29170 }, { "epoch": 46.54, "grad_norm": 0.22956405580043793, "learning_rate": 0.00013843700159489632, "loss": 0.3876, "step": 29180 }, { "epoch": 46.56, "grad_norm": 0.11843441426753998, "learning_rate": 0.00013779904306220094, "loss": 0.2649, "step": 29190 }, { "epoch": 46.57, "grad_norm": 0.38579946756362915, "learning_rate": 0.00013716108452950558, "loss": 0.2696, "step": 29200 }, { "epoch": 46.59, "grad_norm": 0.33288395404815674, "learning_rate": 0.00013652312599681023, "loss": 0.3079, "step": 29210 }, { "epoch": 46.6, "grad_norm": 0.2519354224205017, "learning_rate": 0.00013588516746411485, "loss": 0.3065, "step": 29220 }, { "epoch": 46.62, "grad_norm": 0.1389375776052475, "learning_rate": 0.00013524720893141946, "loss": 0.2599, "step": 29230 }, { "epoch": 46.63, "grad_norm": 0.33025360107421875, "learning_rate": 0.0001346092503987241, "loss": 0.3191, "step": 29240 }, { "epoch": 46.65, "grad_norm": 0.1264023780822754, "learning_rate": 0.00013397129186602872, "loss": 0.2695, "step": 29250 }, { "epoch": 46.67, "grad_norm": 0.4093076288700104, "learning_rate": 0.00013333333333333334, "loss": 0.2471, "step": 29260 }, { "epoch": 46.68, "grad_norm": 0.722159743309021, "learning_rate": 0.00013269537480063796, "loss": 0.28, "step": 29270 }, { "epoch": 46.7, "grad_norm": 0.4530355632305145, "learning_rate": 0.0001320574162679426, "loss": 0.3053, "step": 29280 }, { "epoch": 46.71, "grad_norm": 0.10739301890134811, "learning_rate": 0.00013141945773524722, "loss": 0.2983, "step": 29290 }, { "epoch": 46.73, "grad_norm": 0.1723739355802536, "learning_rate": 0.00013078149920255183, "loss": 0.2239, "step": 29300 }, { "epoch": 46.75, "grad_norm": 0.2270219624042511, "learning_rate": 0.00013014354066985645, "loss": 0.3093, "step": 29310 }, { "epoch": 46.76, "grad_norm": 0.6445925831794739, "learning_rate": 0.0001295055821371611, "loss": 0.3212, "step": 29320 }, { "epoch": 46.78, "grad_norm": 0.22848792374134064, "learning_rate": 0.0001288676236044657, "loss": 0.3183, "step": 29330 }, { "epoch": 46.79, "grad_norm": 0.3686947822570801, "learning_rate": 0.00012822966507177033, "loss": 0.2628, "step": 29340 }, { "epoch": 46.81, "grad_norm": 0.27950429916381836, "learning_rate": 0.00012759170653907495, "loss": 0.312, "step": 29350 }, { "epoch": 46.83, "grad_norm": 0.13954879343509674, "learning_rate": 0.0001269537480063796, "loss": 0.315, "step": 29360 }, { "epoch": 46.84, "grad_norm": 0.314480185508728, "learning_rate": 0.0001263157894736842, "loss": 0.3344, "step": 29370 }, { "epoch": 46.86, "grad_norm": 0.3248406946659088, "learning_rate": 0.00012567783094098882, "loss": 0.3106, "step": 29380 }, { "epoch": 46.87, "grad_norm": 0.3097328543663025, "learning_rate": 0.00012503987240829344, "loss": 0.2933, "step": 29390 }, { "epoch": 46.89, "grad_norm": 0.4338608384132385, "learning_rate": 0.00012440191387559808, "loss": 0.2809, "step": 29400 }, { "epoch": 46.91, "grad_norm": 0.35394251346588135, "learning_rate": 0.00012376395534290273, "loss": 0.3051, "step": 29410 }, { "epoch": 46.92, "grad_norm": 0.07790148258209229, "learning_rate": 0.00012312599681020735, "loss": 0.3595, "step": 29420 }, { "epoch": 46.94, "grad_norm": 0.2738390564918518, "learning_rate": 0.00012248803827751196, "loss": 0.2533, "step": 29430 }, { "epoch": 46.95, "grad_norm": 0.19870556890964508, "learning_rate": 0.00012185007974481659, "loss": 0.2967, "step": 29440 }, { "epoch": 46.97, "grad_norm": 0.15914097428321838, "learning_rate": 0.00012121212121212122, "loss": 0.3222, "step": 29450 }, { "epoch": 46.99, "grad_norm": 0.22630850970745087, "learning_rate": 0.00012057416267942584, "loss": 0.2994, "step": 29460 }, { "epoch": 47.0, "grad_norm": 0.32556214928627014, "learning_rate": 0.00011993620414673047, "loss": 0.2816, "step": 29470 }, { "epoch": 47.02, "grad_norm": 0.274972528219223, "learning_rate": 0.00011929824561403509, "loss": 0.2624, "step": 29480 }, { "epoch": 47.03, "grad_norm": 0.3284093737602234, "learning_rate": 0.00011866028708133972, "loss": 0.2596, "step": 29490 }, { "epoch": 47.05, "grad_norm": 0.2033546268939972, "learning_rate": 0.00011802232854864433, "loss": 0.2444, "step": 29500 }, { "epoch": 47.07, "grad_norm": 0.3881695866584778, "learning_rate": 0.00011738437001594896, "loss": 0.2642, "step": 29510 }, { "epoch": 47.08, "grad_norm": 0.3856006860733032, "learning_rate": 0.00011674641148325358, "loss": 0.2641, "step": 29520 }, { "epoch": 47.1, "grad_norm": 0.3555915355682373, "learning_rate": 0.00011610845295055821, "loss": 0.2547, "step": 29530 }, { "epoch": 47.11, "grad_norm": 0.39494889974594116, "learning_rate": 0.00011547049441786284, "loss": 0.3229, "step": 29540 }, { "epoch": 47.13, "grad_norm": 0.39036959409713745, "learning_rate": 0.00011483253588516747, "loss": 0.2395, "step": 29550 }, { "epoch": 47.15, "grad_norm": 0.14146322011947632, "learning_rate": 0.00011419457735247209, "loss": 0.1988, "step": 29560 }, { "epoch": 47.16, "grad_norm": 0.22183720767498016, "learning_rate": 0.00011355661881977672, "loss": 0.3234, "step": 29570 }, { "epoch": 47.18, "grad_norm": 0.19865743815898895, "learning_rate": 0.00011291866028708134, "loss": 0.2811, "step": 29580 }, { "epoch": 47.19, "grad_norm": 0.457445353269577, "learning_rate": 0.00011228070175438597, "loss": 0.2627, "step": 29590 }, { "epoch": 47.21, "grad_norm": 0.28596189618110657, "learning_rate": 0.00011164274322169058, "loss": 0.2277, "step": 29600 }, { "epoch": 47.22, "grad_norm": 0.4201318025588989, "learning_rate": 0.00011100478468899521, "loss": 0.3182, "step": 29610 }, { "epoch": 47.24, "grad_norm": 0.31965920329093933, "learning_rate": 0.00011036682615629983, "loss": 0.2838, "step": 29620 }, { "epoch": 47.26, "grad_norm": 0.10794230550527573, "learning_rate": 0.00010972886762360448, "loss": 0.2655, "step": 29630 }, { "epoch": 47.27, "grad_norm": 0.13269487023353577, "learning_rate": 0.00010909090909090909, "loss": 0.3302, "step": 29640 }, { "epoch": 47.29, "grad_norm": 0.5231210589408875, "learning_rate": 0.00010845295055821372, "loss": 0.3293, "step": 29650 }, { "epoch": 47.3, "grad_norm": 0.158706933259964, "learning_rate": 0.00010781499202551834, "loss": 0.2234, "step": 29660 }, { "epoch": 47.32, "grad_norm": 0.2540994882583618, "learning_rate": 0.00010717703349282297, "loss": 0.3066, "step": 29670 }, { "epoch": 47.34, "grad_norm": 0.32114022970199585, "learning_rate": 0.00010653907496012759, "loss": 0.26, "step": 29680 }, { "epoch": 47.35, "grad_norm": 0.14222322404384613, "learning_rate": 0.00010590111642743222, "loss": 0.1941, "step": 29690 }, { "epoch": 47.37, "grad_norm": 0.33291783928871155, "learning_rate": 0.00010526315789473683, "loss": 0.3337, "step": 29700 }, { "epoch": 47.38, "grad_norm": 0.21735547482967377, "learning_rate": 0.00010462519936204146, "loss": 0.2755, "step": 29710 }, { "epoch": 47.4, "grad_norm": 0.37341004610061646, "learning_rate": 0.0001039872408293461, "loss": 0.2765, "step": 29720 }, { "epoch": 47.42, "grad_norm": 0.13885751366615295, "learning_rate": 0.00010334928229665073, "loss": 0.3081, "step": 29730 }, { "epoch": 47.43, "grad_norm": 0.6437707543373108, "learning_rate": 0.00010271132376395534, "loss": 0.322, "step": 29740 }, { "epoch": 47.45, "grad_norm": 0.13305498659610748, "learning_rate": 0.00010207336523125997, "loss": 0.2947, "step": 29750 }, { "epoch": 47.46, "grad_norm": 0.6117695569992065, "learning_rate": 0.00010143540669856459, "loss": 0.3218, "step": 29760 }, { "epoch": 47.48, "grad_norm": 0.34142374992370605, "learning_rate": 0.00010079744816586922, "loss": 0.3295, "step": 29770 }, { "epoch": 47.5, "grad_norm": 0.37447261810302734, "learning_rate": 0.00010015948963317384, "loss": 0.2824, "step": 29780 }, { "epoch": 47.51, "grad_norm": 0.14651019871234894, "learning_rate": 9.952153110047847e-05, "loss": 0.2739, "step": 29790 }, { "epoch": 47.53, "grad_norm": 0.14142945408821106, "learning_rate": 9.888357256778308e-05, "loss": 0.217, "step": 29800 }, { "epoch": 47.54, "grad_norm": 0.3807011544704437, "learning_rate": 9.824561403508771e-05, "loss": 0.3054, "step": 29810 }, { "epoch": 47.56, "grad_norm": 0.2842819392681122, "learning_rate": 9.760765550239235e-05, "loss": 0.3078, "step": 29820 }, { "epoch": 47.58, "grad_norm": 0.7402997016906738, "learning_rate": 9.696969696969698e-05, "loss": 0.3118, "step": 29830 }, { "epoch": 47.59, "grad_norm": 0.13281618058681488, "learning_rate": 9.63317384370016e-05, "loss": 0.3055, "step": 29840 }, { "epoch": 47.61, "grad_norm": 0.3724515736103058, "learning_rate": 9.569377990430622e-05, "loss": 0.3274, "step": 29850 }, { "epoch": 47.62, "grad_norm": 0.33854445815086365, "learning_rate": 9.505582137161085e-05, "loss": 0.2533, "step": 29860 }, { "epoch": 47.64, "grad_norm": 0.42690280079841614, "learning_rate": 9.441786283891547e-05, "loss": 0.3262, "step": 29870 }, { "epoch": 47.66, "grad_norm": 0.6151228547096252, "learning_rate": 9.37799043062201e-05, "loss": 0.2889, "step": 29880 }, { "epoch": 47.67, "grad_norm": 0.26469776034355164, "learning_rate": 9.314194577352472e-05, "loss": 0.3036, "step": 29890 }, { "epoch": 47.69, "grad_norm": 0.2703404426574707, "learning_rate": 9.250398724082935e-05, "loss": 0.2673, "step": 29900 }, { "epoch": 47.7, "grad_norm": 0.3791040778160095, "learning_rate": 9.186602870813398e-05, "loss": 0.3244, "step": 29910 }, { "epoch": 47.72, "grad_norm": 0.29400941729545593, "learning_rate": 9.122807017543861e-05, "loss": 0.2538, "step": 29920 }, { "epoch": 47.74, "grad_norm": 0.4795028567314148, "learning_rate": 9.059011164274323e-05, "loss": 0.2482, "step": 29930 }, { "epoch": 47.75, "grad_norm": 0.36813196539878845, "learning_rate": 8.995215311004786e-05, "loss": 0.2548, "step": 29940 }, { "epoch": 47.77, "grad_norm": 0.22788019478321075, "learning_rate": 8.931419457735247e-05, "loss": 0.2961, "step": 29950 }, { "epoch": 47.78, "grad_norm": 0.24274033308029175, "learning_rate": 8.86762360446571e-05, "loss": 0.2933, "step": 29960 }, { "epoch": 47.8, "grad_norm": 0.4556421637535095, "learning_rate": 8.803827751196172e-05, "loss": 0.3562, "step": 29970 }, { "epoch": 47.81, "grad_norm": 0.5702171921730042, "learning_rate": 8.740031897926635e-05, "loss": 0.3005, "step": 29980 }, { "epoch": 47.83, "grad_norm": 0.32142460346221924, "learning_rate": 8.676236044657097e-05, "loss": 0.2663, "step": 29990 }, { "epoch": 47.85, "grad_norm": 0.17863740026950836, "learning_rate": 8.612440191387561e-05, "loss": 0.2509, "step": 30000 }, { "epoch": 47.86, "grad_norm": 0.09491372853517532, "learning_rate": 8.548644338118023e-05, "loss": 0.3576, "step": 30010 }, { "epoch": 47.88, "grad_norm": 0.4455479085445404, "learning_rate": 8.484848484848486e-05, "loss": 0.4137, "step": 30020 }, { "epoch": 47.89, "grad_norm": 0.33983567357063293, "learning_rate": 8.421052631578948e-05, "loss": 0.2838, "step": 30030 }, { "epoch": 47.91, "grad_norm": 0.4801020622253418, "learning_rate": 8.35725677830941e-05, "loss": 0.2983, "step": 30040 }, { "epoch": 47.93, "grad_norm": 0.33874234557151794, "learning_rate": 8.293460925039872e-05, "loss": 0.2689, "step": 30050 }, { "epoch": 47.94, "grad_norm": 0.269828736782074, "learning_rate": 8.229665071770335e-05, "loss": 0.3258, "step": 30060 }, { "epoch": 47.96, "grad_norm": 0.0987486019730568, "learning_rate": 8.165869218500797e-05, "loss": 0.2461, "step": 30070 }, { "epoch": 47.97, "grad_norm": 0.3457973897457123, "learning_rate": 8.10207336523126e-05, "loss": 0.2783, "step": 30080 }, { "epoch": 47.99, "grad_norm": 0.10124126076698303, "learning_rate": 8.038277511961722e-05, "loss": 0.2794, "step": 30090 }, { "epoch": 48.01, "grad_norm": 0.40085652470588684, "learning_rate": 7.974481658692186e-05, "loss": 0.326, "step": 30100 }, { "epoch": 48.02, "grad_norm": 0.184198796749115, "learning_rate": 7.910685805422648e-05, "loss": 0.2469, "step": 30110 }, { "epoch": 48.04, "grad_norm": 0.2005092054605484, "learning_rate": 7.846889952153111e-05, "loss": 0.3044, "step": 30120 }, { "epoch": 48.05, "grad_norm": 0.35767000913619995, "learning_rate": 7.783094098883573e-05, "loss": 0.2981, "step": 30130 }, { "epoch": 48.07, "grad_norm": 0.38873291015625, "learning_rate": 7.719298245614036e-05, "loss": 0.3039, "step": 30140 }, { "epoch": 48.09, "grad_norm": 0.22854940593242645, "learning_rate": 7.655502392344497e-05, "loss": 0.2068, "step": 30150 }, { "epoch": 48.1, "grad_norm": 0.1659734845161438, "learning_rate": 7.59170653907496e-05, "loss": 0.277, "step": 30160 }, { "epoch": 48.12, "grad_norm": 0.1869482696056366, "learning_rate": 7.527910685805422e-05, "loss": 0.2194, "step": 30170 }, { "epoch": 48.13, "grad_norm": 0.08279826492071152, "learning_rate": 7.464114832535885e-05, "loss": 0.2824, "step": 30180 }, { "epoch": 48.15, "grad_norm": 0.4725863039493561, "learning_rate": 7.400318979266348e-05, "loss": 0.2503, "step": 30190 }, { "epoch": 48.17, "grad_norm": 0.172104611992836, "learning_rate": 7.336523125996811e-05, "loss": 0.2658, "step": 30200 }, { "epoch": 48.18, "grad_norm": 0.21676242351531982, "learning_rate": 7.272727272727273e-05, "loss": 0.2658, "step": 30210 }, { "epoch": 48.2, "grad_norm": 0.3602610230445862, "learning_rate": 7.208931419457736e-05, "loss": 0.2367, "step": 30220 }, { "epoch": 48.21, "grad_norm": 0.3500073552131653, "learning_rate": 7.145135566188198e-05, "loss": 0.3014, "step": 30230 }, { "epoch": 48.23, "grad_norm": 0.3083650469779968, "learning_rate": 7.08133971291866e-05, "loss": 0.3093, "step": 30240 }, { "epoch": 48.25, "grad_norm": 0.20540174841880798, "learning_rate": 7.017543859649122e-05, "loss": 0.2626, "step": 30250 }, { "epoch": 48.26, "grad_norm": 0.26233381032943726, "learning_rate": 6.953748006379585e-05, "loss": 0.3743, "step": 30260 }, { "epoch": 48.28, "grad_norm": 0.6622065901756287, "learning_rate": 6.889952153110047e-05, "loss": 0.3261, "step": 30270 }, { "epoch": 48.29, "grad_norm": 0.50579833984375, "learning_rate": 6.826156299840511e-05, "loss": 0.2812, "step": 30280 }, { "epoch": 48.31, "grad_norm": 0.22522664070129395, "learning_rate": 6.762360446570973e-05, "loss": 0.247, "step": 30290 }, { "epoch": 48.33, "grad_norm": 0.334440141916275, "learning_rate": 6.698564593301436e-05, "loss": 0.2827, "step": 30300 }, { "epoch": 48.34, "grad_norm": 0.10422962158918381, "learning_rate": 6.634768740031898e-05, "loss": 0.2475, "step": 30310 }, { "epoch": 48.36, "grad_norm": 0.409278005361557, "learning_rate": 6.570972886762361e-05, "loss": 0.3053, "step": 30320 }, { "epoch": 48.37, "grad_norm": 0.15748478472232819, "learning_rate": 6.507177033492823e-05, "loss": 0.3292, "step": 30330 }, { "epoch": 48.39, "grad_norm": 0.1966976523399353, "learning_rate": 6.443381180223286e-05, "loss": 0.2462, "step": 30340 }, { "epoch": 48.41, "grad_norm": 0.34300366044044495, "learning_rate": 6.379585326953747e-05, "loss": 0.3215, "step": 30350 }, { "epoch": 48.42, "grad_norm": 0.15784505009651184, "learning_rate": 6.31578947368421e-05, "loss": 0.2191, "step": 30360 }, { "epoch": 48.44, "grad_norm": 0.1942838877439499, "learning_rate": 6.251993620414672e-05, "loss": 0.2964, "step": 30370 }, { "epoch": 48.45, "grad_norm": 0.23638346791267395, "learning_rate": 6.188197767145136e-05, "loss": 0.2913, "step": 30380 }, { "epoch": 48.47, "grad_norm": 0.18222945928573608, "learning_rate": 6.124401913875598e-05, "loss": 0.2863, "step": 30390 }, { "epoch": 48.48, "grad_norm": 0.13442523777484894, "learning_rate": 6.060606060606061e-05, "loss": 0.3137, "step": 30400 }, { "epoch": 48.5, "grad_norm": 0.09403583407402039, "learning_rate": 5.9968102073365235e-05, "loss": 0.2212, "step": 30410 }, { "epoch": 48.52, "grad_norm": 0.21507516503334045, "learning_rate": 5.933014354066986e-05, "loss": 0.2613, "step": 30420 }, { "epoch": 48.53, "grad_norm": 0.41693365573883057, "learning_rate": 5.869218500797448e-05, "loss": 0.2824, "step": 30430 }, { "epoch": 48.55, "grad_norm": 0.23327617347240448, "learning_rate": 5.8054226475279106e-05, "loss": 0.3104, "step": 30440 }, { "epoch": 48.56, "grad_norm": 0.6092672348022461, "learning_rate": 5.7416267942583736e-05, "loss": 0.309, "step": 30450 }, { "epoch": 48.58, "grad_norm": 0.14301355183124542, "learning_rate": 5.677830940988836e-05, "loss": 0.2445, "step": 30460 }, { "epoch": 48.6, "grad_norm": 0.42832037806510925, "learning_rate": 5.6140350877192984e-05, "loss": 0.2872, "step": 30470 }, { "epoch": 48.61, "grad_norm": 0.25466400384902954, "learning_rate": 5.550239234449761e-05, "loss": 0.2659, "step": 30480 }, { "epoch": 48.63, "grad_norm": 0.2657581865787506, "learning_rate": 5.486443381180224e-05, "loss": 0.2374, "step": 30490 }, { "epoch": 48.64, "grad_norm": 0.40479007363319397, "learning_rate": 5.422647527910686e-05, "loss": 0.3537, "step": 30500 }, { "epoch": 48.66, "grad_norm": 0.28331390023231506, "learning_rate": 5.3588516746411485e-05, "loss": 0.3156, "step": 30510 }, { "epoch": 48.68, "grad_norm": 0.27074429392814636, "learning_rate": 5.295055821371611e-05, "loss": 0.2869, "step": 30520 }, { "epoch": 48.69, "grad_norm": 0.21443207561969757, "learning_rate": 5.231259968102073e-05, "loss": 0.2715, "step": 30530 }, { "epoch": 48.71, "grad_norm": 0.28873592615127563, "learning_rate": 5.167464114832536e-05, "loss": 0.283, "step": 30540 }, { "epoch": 48.72, "grad_norm": 0.2248823344707489, "learning_rate": 5.1036682615629986e-05, "loss": 0.2609, "step": 30550 }, { "epoch": 48.74, "grad_norm": 0.16412192583084106, "learning_rate": 5.039872408293461e-05, "loss": 0.2673, "step": 30560 }, { "epoch": 48.76, "grad_norm": 0.37860092520713806, "learning_rate": 4.9760765550239234e-05, "loss": 0.2795, "step": 30570 }, { "epoch": 48.77, "grad_norm": 0.5846998691558838, "learning_rate": 4.912280701754386e-05, "loss": 0.3345, "step": 30580 }, { "epoch": 48.79, "grad_norm": 0.4207826554775238, "learning_rate": 4.848484848484849e-05, "loss": 0.2574, "step": 30590 }, { "epoch": 48.8, "grad_norm": 0.2351989895105362, "learning_rate": 4.784688995215311e-05, "loss": 0.2994, "step": 30600 }, { "epoch": 48.82, "grad_norm": 0.29773497581481934, "learning_rate": 4.7208931419457735e-05, "loss": 0.3491, "step": 30610 }, { "epoch": 48.84, "grad_norm": 0.3682696521282196, "learning_rate": 4.657097288676236e-05, "loss": 0.2976, "step": 30620 }, { "epoch": 48.85, "grad_norm": 0.33122923970222473, "learning_rate": 4.593301435406699e-05, "loss": 0.314, "step": 30630 }, { "epoch": 48.87, "grad_norm": 0.3438310921192169, "learning_rate": 4.529505582137161e-05, "loss": 0.3169, "step": 30640 }, { "epoch": 48.88, "grad_norm": 0.32344672083854675, "learning_rate": 4.4657097288676236e-05, "loss": 0.2184, "step": 30650 }, { "epoch": 48.9, "grad_norm": 0.4275621771812439, "learning_rate": 4.401913875598086e-05, "loss": 0.3592, "step": 30660 }, { "epoch": 48.92, "grad_norm": 0.514369785785675, "learning_rate": 4.3381180223285484e-05, "loss": 0.2393, "step": 30670 }, { "epoch": 48.93, "grad_norm": 0.23344865441322327, "learning_rate": 4.2743221690590114e-05, "loss": 0.3637, "step": 30680 }, { "epoch": 48.95, "grad_norm": 0.2496626079082489, "learning_rate": 4.210526315789474e-05, "loss": 0.3157, "step": 30690 }, { "epoch": 48.96, "grad_norm": 0.15069235861301422, "learning_rate": 4.146730462519936e-05, "loss": 0.2731, "step": 30700 }, { "epoch": 48.98, "grad_norm": 0.5047960877418518, "learning_rate": 4.0829346092503985e-05, "loss": 0.2811, "step": 30710 }, { "epoch": 49.0, "grad_norm": 0.34830254316329956, "learning_rate": 4.019138755980861e-05, "loss": 0.2925, "step": 30720 }, { "epoch": 49.01, "grad_norm": 0.4893124997615814, "learning_rate": 3.955342902711324e-05, "loss": 0.308, "step": 30730 }, { "epoch": 49.03, "grad_norm": 0.3630107045173645, "learning_rate": 3.891547049441786e-05, "loss": 0.2671, "step": 30740 }, { "epoch": 49.04, "grad_norm": 0.16974857449531555, "learning_rate": 3.8277511961722486e-05, "loss": 0.3295, "step": 30750 }, { "epoch": 49.06, "grad_norm": 0.34105682373046875, "learning_rate": 3.763955342902711e-05, "loss": 0.2895, "step": 30760 }, { "epoch": 49.07, "grad_norm": 0.47773271799087524, "learning_rate": 3.700159489633174e-05, "loss": 0.2591, "step": 30770 }, { "epoch": 49.09, "grad_norm": 0.3436296582221985, "learning_rate": 3.6363636363636364e-05, "loss": 0.2138, "step": 30780 }, { "epoch": 49.11, "grad_norm": 0.1262790709733963, "learning_rate": 3.572567783094099e-05, "loss": 0.2472, "step": 30790 }, { "epoch": 49.12, "grad_norm": 0.2755976915359497, "learning_rate": 3.508771929824561e-05, "loss": 0.2532, "step": 30800 }, { "epoch": 49.14, "grad_norm": 0.16442789137363434, "learning_rate": 3.4449760765550235e-05, "loss": 0.2813, "step": 30810 }, { "epoch": 49.15, "grad_norm": 0.29541754722595215, "learning_rate": 3.3811802232854866e-05, "loss": 0.3036, "step": 30820 }, { "epoch": 49.17, "grad_norm": 0.07406118512153625, "learning_rate": 3.317384370015949e-05, "loss": 0.2552, "step": 30830 }, { "epoch": 49.19, "grad_norm": 0.413967102766037, "learning_rate": 3.253588516746411e-05, "loss": 0.3118, "step": 30840 }, { "epoch": 49.2, "grad_norm": 0.567054808139801, "learning_rate": 3.1897926634768736e-05, "loss": 0.259, "step": 30850 }, { "epoch": 49.22, "grad_norm": 0.19133225083351135, "learning_rate": 3.125996810207336e-05, "loss": 0.216, "step": 30860 }, { "epoch": 49.23, "grad_norm": 0.35869938135147095, "learning_rate": 3.062200956937799e-05, "loss": 0.3244, "step": 30870 }, { "epoch": 49.25, "grad_norm": 0.3546787202358246, "learning_rate": 2.9984051036682618e-05, "loss": 0.3345, "step": 30880 }, { "epoch": 49.27, "grad_norm": 0.3473091721534729, "learning_rate": 2.934609250398724e-05, "loss": 0.2133, "step": 30890 }, { "epoch": 49.28, "grad_norm": 0.4771929979324341, "learning_rate": 2.8708133971291868e-05, "loss": 0.3053, "step": 30900 }, { "epoch": 49.3, "grad_norm": 0.3776096701622009, "learning_rate": 2.8070175438596492e-05, "loss": 0.2765, "step": 30910 }, { "epoch": 49.31, "grad_norm": 0.2937834560871124, "learning_rate": 2.743221690590112e-05, "loss": 0.2581, "step": 30920 }, { "epoch": 49.33, "grad_norm": 0.2534268796443939, "learning_rate": 2.6794258373205743e-05, "loss": 0.2985, "step": 30930 }, { "epoch": 49.35, "grad_norm": 0.18742942810058594, "learning_rate": 2.6156299840510366e-05, "loss": 0.2159, "step": 30940 }, { "epoch": 49.36, "grad_norm": 0.3918183147907257, "learning_rate": 2.5518341307814993e-05, "loss": 0.3054, "step": 30950 }, { "epoch": 49.38, "grad_norm": 0.33097043633461, "learning_rate": 2.4880382775119617e-05, "loss": 0.3201, "step": 30960 }, { "epoch": 49.39, "grad_norm": 0.37174108624458313, "learning_rate": 2.4242424242424244e-05, "loss": 0.2708, "step": 30970 }, { "epoch": 49.41, "grad_norm": 0.27249741554260254, "learning_rate": 2.3604465709728868e-05, "loss": 0.2347, "step": 30980 }, { "epoch": 49.43, "grad_norm": 0.7410305738449097, "learning_rate": 2.2966507177033495e-05, "loss": 0.3456, "step": 30990 }, { "epoch": 49.44, "grad_norm": 0.4471137225627899, "learning_rate": 2.2328548644338118e-05, "loss": 0.2472, "step": 31000 } ], "logging_steps": 10, "max_steps": 31350, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "total_flos": 8.371975248433152e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }