{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.081212657667626, "eval_steps": 500, "global_step": 36000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019670035160187848, "grad_norm": 0.8963498473167419, "learning_rate": 3.0257186081694406e-08, "loss": 0.9638, "step": 1 }, { "epoch": 0.0019670035160187847, "grad_norm": 2.033263683319092, "learning_rate": 3.0257186081694407e-07, "loss": 0.8112, "step": 10 }, { "epoch": 0.003934007032037569, "grad_norm": 1.7033007144927979, "learning_rate": 6.051437216338881e-07, "loss": 0.7522, "step": 20 }, { "epoch": 0.005901010548056355, "grad_norm": 1.5076667070388794, "learning_rate": 9.077155824508321e-07, "loss": 0.9369, "step": 30 }, { "epoch": 0.007868014064075139, "grad_norm": 1.0427778959274292, "learning_rate": 1.2102874432677763e-06, "loss": 0.7922, "step": 40 }, { "epoch": 0.009835017580093924, "grad_norm": 2.3299496173858643, "learning_rate": 1.5128593040847204e-06, "loss": 0.7739, "step": 50 }, { "epoch": 0.01180202109611271, "grad_norm": 0.9595165252685547, "learning_rate": 1.8154311649016642e-06, "loss": 0.7738, "step": 60 }, { "epoch": 0.013769024612131494, "grad_norm": 1.5153745412826538, "learning_rate": 2.118003025718608e-06, "loss": 0.7583, "step": 70 }, { "epoch": 0.015736028128150278, "grad_norm": 1.1440826654434204, "learning_rate": 2.4205748865355526e-06, "loss": 0.7769, "step": 80 }, { "epoch": 0.017703031644169063, "grad_norm": 2.1569674015045166, "learning_rate": 2.7231467473524962e-06, "loss": 0.7269, "step": 90 }, { "epoch": 0.019670035160187848, "grad_norm": 1.6127907037734985, "learning_rate": 3.0257186081694407e-06, "loss": 0.6396, "step": 100 }, { "epoch": 0.021637038676206633, "grad_norm": 1.2766520977020264, "learning_rate": 3.3282904689863844e-06, "loss": 0.6954, "step": 110 }, { "epoch": 0.02360404219222542, "grad_norm": 1.3288867473602295, "learning_rate": 3.6308623298033285e-06, "loss": 0.7179, "step": 120 }, { "epoch": 0.025571045708244203, "grad_norm": 0.6131573915481567, "learning_rate": 3.933434190620273e-06, "loss": 0.6522, "step": 130 }, { "epoch": 0.02753804922426299, "grad_norm": 0.6934183239936829, "learning_rate": 4.236006051437216e-06, "loss": 0.6974, "step": 140 }, { "epoch": 0.029505052740281774, "grad_norm": 0.6823475360870361, "learning_rate": 4.53857791225416e-06, "loss": 0.6047, "step": 150 }, { "epoch": 0.031472056256300555, "grad_norm": 1.4353721141815186, "learning_rate": 4.841149773071105e-06, "loss": 0.5891, "step": 160 }, { "epoch": 0.03343905977231934, "grad_norm": 0.7245773077011108, "learning_rate": 5.143721633888049e-06, "loss": 0.6713, "step": 170 }, { "epoch": 0.035406063288338126, "grad_norm": 0.84307461977005, "learning_rate": 5.4462934947049925e-06, "loss": 0.5529, "step": 180 }, { "epoch": 0.03737306680435691, "grad_norm": 0.6518082618713379, "learning_rate": 5.7488653555219365e-06, "loss": 0.6697, "step": 190 }, { "epoch": 0.039340070320375696, "grad_norm": 0.724744439125061, "learning_rate": 6.0514372163388815e-06, "loss": 0.6472, "step": 200 }, { "epoch": 0.04130707383639448, "grad_norm": 0.6548435091972351, "learning_rate": 6.354009077155825e-06, "loss": 0.5387, "step": 210 }, { "epoch": 0.043274077352413266, "grad_norm": 0.372646689414978, "learning_rate": 6.656580937972769e-06, "loss": 0.565, "step": 220 }, { "epoch": 0.04524108086843205, "grad_norm": 0.5441346168518066, "learning_rate": 6.959152798789714e-06, "loss": 0.5121, "step": 230 }, { "epoch": 0.04720808438445084, "grad_norm": 0.9839174151420593, "learning_rate": 7.261724659606657e-06, "loss": 0.5849, "step": 240 }, { "epoch": 0.04917508790046962, "grad_norm": 1.2899951934814453, "learning_rate": 7.564296520423601e-06, "loss": 0.5569, "step": 250 }, { "epoch": 0.05114209141648841, "grad_norm": 0.5471493601799011, "learning_rate": 7.866868381240546e-06, "loss": 0.5766, "step": 260 }, { "epoch": 0.05310909493250719, "grad_norm": 0.7858185768127441, "learning_rate": 8.169440242057489e-06, "loss": 0.4571, "step": 270 }, { "epoch": 0.05507609844852598, "grad_norm": 0.7448890209197998, "learning_rate": 8.472012102874432e-06, "loss": 0.4962, "step": 280 }, { "epoch": 0.05704310196454476, "grad_norm": 0.7200512886047363, "learning_rate": 8.774583963691377e-06, "loss": 0.56, "step": 290 }, { "epoch": 0.05901010548056355, "grad_norm": 0.725430965423584, "learning_rate": 9.07715582450832e-06, "loss": 0.593, "step": 300 }, { "epoch": 0.06097710899658233, "grad_norm": 0.9160416126251221, "learning_rate": 9.379727685325265e-06, "loss": 0.6143, "step": 310 }, { "epoch": 0.06294411251260111, "grad_norm": 0.6902405619621277, "learning_rate": 9.68229954614221e-06, "loss": 0.6567, "step": 320 }, { "epoch": 0.0649111160286199, "grad_norm": 0.47368934750556946, "learning_rate": 9.984871406959154e-06, "loss": 0.601, "step": 330 }, { "epoch": 0.06687811954463868, "grad_norm": 0.9510943293571472, "learning_rate": 1.0287443267776098e-05, "loss": 0.6315, "step": 340 }, { "epoch": 0.06884512306065747, "grad_norm": 0.5105342268943787, "learning_rate": 1.059001512859304e-05, "loss": 0.5069, "step": 350 }, { "epoch": 0.07081212657667625, "grad_norm": 0.7906575202941895, "learning_rate": 1.0892586989409985e-05, "loss": 0.6008, "step": 360 }, { "epoch": 0.07277913009269504, "grad_norm": 0.6110881567001343, "learning_rate": 1.119515885022693e-05, "loss": 0.6122, "step": 370 }, { "epoch": 0.07474613360871382, "grad_norm": 0.6386378407478333, "learning_rate": 1.1497730711043873e-05, "loss": 0.6167, "step": 380 }, { "epoch": 0.07671313712473261, "grad_norm": 0.7307829856872559, "learning_rate": 1.1800302571860818e-05, "loss": 0.6236, "step": 390 }, { "epoch": 0.07868014064075139, "grad_norm": 0.8760218620300293, "learning_rate": 1.2102874432677763e-05, "loss": 0.5672, "step": 400 }, { "epoch": 0.08064714415677018, "grad_norm": 0.4042556881904602, "learning_rate": 1.2405446293494704e-05, "loss": 0.4975, "step": 410 }, { "epoch": 0.08261414767278896, "grad_norm": 0.4333750903606415, "learning_rate": 1.270801815431165e-05, "loss": 0.634, "step": 420 }, { "epoch": 0.08458115118880775, "grad_norm": 0.36654746532440186, "learning_rate": 1.3010590015128594e-05, "loss": 0.6007, "step": 430 }, { "epoch": 0.08654815470482653, "grad_norm": 0.7794342041015625, "learning_rate": 1.3313161875945538e-05, "loss": 0.6959, "step": 440 }, { "epoch": 0.08851515822084532, "grad_norm": 1.202374815940857, "learning_rate": 1.3615733736762482e-05, "loss": 0.4908, "step": 450 }, { "epoch": 0.0904821617368641, "grad_norm": 1.0065367221832275, "learning_rate": 1.3918305597579427e-05, "loss": 0.5246, "step": 460 }, { "epoch": 0.0924491652528829, "grad_norm": 0.7841054797172546, "learning_rate": 1.4220877458396369e-05, "loss": 0.6303, "step": 470 }, { "epoch": 0.09441616876890167, "grad_norm": 0.49154767394065857, "learning_rate": 1.4523449319213314e-05, "loss": 0.4197, "step": 480 }, { "epoch": 0.09638317228492047, "grad_norm": 0.7281906604766846, "learning_rate": 1.4826021180030259e-05, "loss": 0.5574, "step": 490 }, { "epoch": 0.09835017580093924, "grad_norm": 0.46101605892181396, "learning_rate": 1.5128593040847202e-05, "loss": 0.5686, "step": 500 }, { "epoch": 0.09835017580093924, "eval_loss": 0.3269956707954407, "eval_runtime": 8.9032, "eval_samples_per_second": 5.616, "eval_steps_per_second": 2.808, "step": 500 }, { "epoch": 0.10031717931695804, "grad_norm": 0.7430837750434875, "learning_rate": 1.5431164901664147e-05, "loss": 0.6991, "step": 510 }, { "epoch": 0.10228418283297681, "grad_norm": 0.4325253367424011, "learning_rate": 1.5733736762481092e-05, "loss": 0.6192, "step": 520 }, { "epoch": 0.10425118634899559, "grad_norm": 0.45161041617393494, "learning_rate": 1.6036308623298033e-05, "loss": 0.5495, "step": 530 }, { "epoch": 0.10621818986501438, "grad_norm": 0.43786877393722534, "learning_rate": 1.6338880484114978e-05, "loss": 0.5687, "step": 540 }, { "epoch": 0.10818519338103316, "grad_norm": 0.5659717917442322, "learning_rate": 1.6641452344931923e-05, "loss": 0.5838, "step": 550 }, { "epoch": 0.11015219689705195, "grad_norm": 0.6761313676834106, "learning_rate": 1.6944024205748865e-05, "loss": 0.5822, "step": 560 }, { "epoch": 0.11211920041307073, "grad_norm": 0.4713389575481415, "learning_rate": 1.724659606656581e-05, "loss": 0.5271, "step": 570 }, { "epoch": 0.11408620392908952, "grad_norm": 0.7082213759422302, "learning_rate": 1.7549167927382755e-05, "loss": 0.5033, "step": 580 }, { "epoch": 0.1160532074451083, "grad_norm": 0.6817394495010376, "learning_rate": 1.78517397881997e-05, "loss": 0.6074, "step": 590 }, { "epoch": 0.1180202109611271, "grad_norm": 0.5328401923179626, "learning_rate": 1.815431164901664e-05, "loss": 0.6559, "step": 600 }, { "epoch": 0.11998721447714587, "grad_norm": 0.5534793138504028, "learning_rate": 1.8456883509833586e-05, "loss": 0.5362, "step": 610 }, { "epoch": 0.12195421799316467, "grad_norm": 0.3978525996208191, "learning_rate": 1.875945537065053e-05, "loss": 0.5953, "step": 620 }, { "epoch": 0.12392122150918344, "grad_norm": 0.7429622411727905, "learning_rate": 1.9062027231467476e-05, "loss": 0.5147, "step": 630 }, { "epoch": 0.12588822502520222, "grad_norm": 0.4334476888179779, "learning_rate": 1.936459909228442e-05, "loss": 0.5867, "step": 640 }, { "epoch": 0.12785522854122103, "grad_norm": 0.5612448453903198, "learning_rate": 1.9667170953101362e-05, "loss": 0.471, "step": 650 }, { "epoch": 0.1298222320572398, "grad_norm": 0.6829410791397095, "learning_rate": 1.9969742813918307e-05, "loss": 0.4989, "step": 660 }, { "epoch": 0.13178923557325858, "grad_norm": 0.42095333337783813, "learning_rate": 1.9997248463725582e-05, "loss": 0.5226, "step": 670 }, { "epoch": 0.13375623908927736, "grad_norm": 0.4745030105113983, "learning_rate": 1.999419120119845e-05, "loss": 0.49, "step": 680 }, { "epoch": 0.13572324260529617, "grad_norm": 0.6165034770965576, "learning_rate": 1.9991133938671313e-05, "loss": 0.5754, "step": 690 }, { "epoch": 0.13769024612131495, "grad_norm": 0.5824708342552185, "learning_rate": 1.9988076676144183e-05, "loss": 0.442, "step": 700 }, { "epoch": 0.13965724963733372, "grad_norm": 0.764433741569519, "learning_rate": 1.998501941361705e-05, "loss": 0.557, "step": 710 }, { "epoch": 0.1416242531533525, "grad_norm": 0.5732194185256958, "learning_rate": 1.9981962151089914e-05, "loss": 0.5354, "step": 720 }, { "epoch": 0.1435912566693713, "grad_norm": 0.4292770028114319, "learning_rate": 1.9978904888562783e-05, "loss": 0.5574, "step": 730 }, { "epoch": 0.1455582601853901, "grad_norm": 0.9347184300422668, "learning_rate": 1.997584762603565e-05, "loss": 0.5045, "step": 740 }, { "epoch": 0.14752526370140887, "grad_norm": 0.7119300365447998, "learning_rate": 1.9972790363508515e-05, "loss": 0.4441, "step": 750 }, { "epoch": 0.14949226721742764, "grad_norm": 0.48641037940979004, "learning_rate": 1.9969733100981384e-05, "loss": 0.6329, "step": 760 }, { "epoch": 0.15145927073344645, "grad_norm": 0.7848897576332092, "learning_rate": 1.996667583845425e-05, "loss": 0.4577, "step": 770 }, { "epoch": 0.15342627424946523, "grad_norm": 0.6484516263008118, "learning_rate": 1.996361857592712e-05, "loss": 0.4841, "step": 780 }, { "epoch": 0.155393277765484, "grad_norm": 0.7445515394210815, "learning_rate": 1.9960561313399985e-05, "loss": 0.4113, "step": 790 }, { "epoch": 0.15736028128150278, "grad_norm": 0.6570308804512024, "learning_rate": 1.995750405087285e-05, "loss": 0.4707, "step": 800 }, { "epoch": 0.15932728479752156, "grad_norm": 0.7139286994934082, "learning_rate": 1.9954446788345716e-05, "loss": 0.5357, "step": 810 }, { "epoch": 0.16129428831354037, "grad_norm": 0.8385933637619019, "learning_rate": 1.9951389525818582e-05, "loss": 0.4862, "step": 820 }, { "epoch": 0.16326129182955915, "grad_norm": 0.55597984790802, "learning_rate": 1.994833226329145e-05, "loss": 0.4843, "step": 830 }, { "epoch": 0.16522829534557792, "grad_norm": 0.6878874897956848, "learning_rate": 1.9945275000764317e-05, "loss": 0.5163, "step": 840 }, { "epoch": 0.1671952988615967, "grad_norm": 0.8103552460670471, "learning_rate": 1.9942217738237183e-05, "loss": 0.5898, "step": 850 }, { "epoch": 0.1691623023776155, "grad_norm": 0.6601850390434265, "learning_rate": 1.9939160475710052e-05, "loss": 0.4824, "step": 860 }, { "epoch": 0.1711293058936343, "grad_norm": 0.6179708242416382, "learning_rate": 1.9936103213182918e-05, "loss": 0.5301, "step": 870 }, { "epoch": 0.17309630940965307, "grad_norm": 0.5425893664360046, "learning_rate": 1.9933045950655784e-05, "loss": 0.5353, "step": 880 }, { "epoch": 0.17506331292567184, "grad_norm": 0.533669650554657, "learning_rate": 1.9929988688128653e-05, "loss": 0.4981, "step": 890 }, { "epoch": 0.17703031644169065, "grad_norm": 0.5814224481582642, "learning_rate": 1.992693142560152e-05, "loss": 0.6519, "step": 900 }, { "epoch": 0.17899731995770943, "grad_norm": 0.8669481873512268, "learning_rate": 1.9923874163074388e-05, "loss": 0.6702, "step": 910 }, { "epoch": 0.1809643234737282, "grad_norm": 0.8260190486907959, "learning_rate": 1.992081690054725e-05, "loss": 0.551, "step": 920 }, { "epoch": 0.18293132698974698, "grad_norm": 0.8739385008811951, "learning_rate": 1.991775963802012e-05, "loss": 0.4831, "step": 930 }, { "epoch": 0.1848983305057658, "grad_norm": 1.0561296939849854, "learning_rate": 1.9914702375492985e-05, "loss": 0.5674, "step": 940 }, { "epoch": 0.18686533402178457, "grad_norm": 0.6565110683441162, "learning_rate": 1.991164511296585e-05, "loss": 0.39, "step": 950 }, { "epoch": 0.18883233753780335, "grad_norm": 0.45213982462882996, "learning_rate": 1.990858785043872e-05, "loss": 0.4733, "step": 960 }, { "epoch": 0.19079934105382212, "grad_norm": 0.5731518268585205, "learning_rate": 1.9905530587911586e-05, "loss": 0.5276, "step": 970 }, { "epoch": 0.19276634456984093, "grad_norm": 0.5559749603271484, "learning_rate": 1.990247332538445e-05, "loss": 0.5062, "step": 980 }, { "epoch": 0.1947333480858597, "grad_norm": 0.7155632376670837, "learning_rate": 1.989941606285732e-05, "loss": 0.5511, "step": 990 }, { "epoch": 0.1967003516018785, "grad_norm": 0.7516645193099976, "learning_rate": 1.9896358800330186e-05, "loss": 0.4663, "step": 1000 }, { "epoch": 0.1967003516018785, "eval_loss": 0.2955791652202606, "eval_runtime": 8.8701, "eval_samples_per_second": 5.637, "eval_steps_per_second": 2.818, "step": 1000 }, { "epoch": 0.19866735511789727, "grad_norm": 0.6724827885627747, "learning_rate": 1.9893301537803052e-05, "loss": 0.3468, "step": 1010 }, { "epoch": 0.20063435863391607, "grad_norm": 0.8730838298797607, "learning_rate": 1.989024427527592e-05, "loss": 0.4836, "step": 1020 }, { "epoch": 0.20260136214993485, "grad_norm": 0.9460917711257935, "learning_rate": 1.9887187012748784e-05, "loss": 0.4645, "step": 1030 }, { "epoch": 0.20456836566595363, "grad_norm": 1.7044711112976074, "learning_rate": 1.9884129750221653e-05, "loss": 0.6025, "step": 1040 }, { "epoch": 0.2065353691819724, "grad_norm": 1.0980366468429565, "learning_rate": 1.988107248769452e-05, "loss": 0.5283, "step": 1050 }, { "epoch": 0.20850237269799118, "grad_norm": 0.7067188024520874, "learning_rate": 1.9878015225167388e-05, "loss": 0.4825, "step": 1060 }, { "epoch": 0.21046937621401, "grad_norm": 0.6638745665550232, "learning_rate": 1.9874957962640254e-05, "loss": 0.4808, "step": 1070 }, { "epoch": 0.21243637973002877, "grad_norm": 0.6948477029800415, "learning_rate": 1.987190070011312e-05, "loss": 0.5871, "step": 1080 }, { "epoch": 0.21440338324604755, "grad_norm": 0.7186503410339355, "learning_rate": 1.986884343758599e-05, "loss": 0.6654, "step": 1090 }, { "epoch": 0.21637038676206632, "grad_norm": 0.737399160861969, "learning_rate": 1.9865786175058854e-05, "loss": 0.5239, "step": 1100 }, { "epoch": 0.21833739027808513, "grad_norm": 0.7045579552650452, "learning_rate": 1.986272891253172e-05, "loss": 0.4644, "step": 1110 }, { "epoch": 0.2203043937941039, "grad_norm": 0.6651056408882141, "learning_rate": 1.985967165000459e-05, "loss": 0.4719, "step": 1120 }, { "epoch": 0.2222713973101227, "grad_norm": 0.6420923471450806, "learning_rate": 1.9856614387477455e-05, "loss": 0.4595, "step": 1130 }, { "epoch": 0.22423840082614147, "grad_norm": 0.5954447984695435, "learning_rate": 1.985355712495032e-05, "loss": 0.4917, "step": 1140 }, { "epoch": 0.22620540434216027, "grad_norm": 0.6256354451179504, "learning_rate": 1.9850499862423187e-05, "loss": 0.4765, "step": 1150 }, { "epoch": 0.22817240785817905, "grad_norm": 1.5549144744873047, "learning_rate": 1.9847442599896052e-05, "loss": 0.62, "step": 1160 }, { "epoch": 0.23013941137419783, "grad_norm": 1.0412925481796265, "learning_rate": 1.984438533736892e-05, "loss": 0.5452, "step": 1170 }, { "epoch": 0.2321064148902166, "grad_norm": 0.7398366332054138, "learning_rate": 1.9841328074841787e-05, "loss": 0.6219, "step": 1180 }, { "epoch": 0.2340734184062354, "grad_norm": 0.9944019317626953, "learning_rate": 1.9838270812314657e-05, "loss": 0.5009, "step": 1190 }, { "epoch": 0.2360404219222542, "grad_norm": 0.9399623870849609, "learning_rate": 1.9835213549787522e-05, "loss": 0.4176, "step": 1200 }, { "epoch": 0.23800742543827297, "grad_norm": 0.6136744022369385, "learning_rate": 1.9832156287260388e-05, "loss": 0.5097, "step": 1210 }, { "epoch": 0.23997442895429175, "grad_norm": 0.657649576663971, "learning_rate": 1.9829099024733257e-05, "loss": 0.3794, "step": 1220 }, { "epoch": 0.24194143247031055, "grad_norm": 0.6419724822044373, "learning_rate": 1.9826041762206123e-05, "loss": 0.6161, "step": 1230 }, { "epoch": 0.24390843598632933, "grad_norm": 0.43958067893981934, "learning_rate": 1.982298449967899e-05, "loss": 0.5344, "step": 1240 }, { "epoch": 0.2458754395023481, "grad_norm": 0.9898470044136047, "learning_rate": 1.9819927237151858e-05, "loss": 0.5375, "step": 1250 }, { "epoch": 0.2478424430183669, "grad_norm": 1.0850481986999512, "learning_rate": 1.981686997462472e-05, "loss": 0.5602, "step": 1260 }, { "epoch": 0.2498094465343857, "grad_norm": 2.405172109603882, "learning_rate": 1.981381271209759e-05, "loss": 0.4474, "step": 1270 }, { "epoch": 0.25177645005040444, "grad_norm": 0.7804758548736572, "learning_rate": 1.9810755449570455e-05, "loss": 0.5501, "step": 1280 }, { "epoch": 0.25374345356642325, "grad_norm": 0.706414520740509, "learning_rate": 1.980769818704332e-05, "loss": 0.5222, "step": 1290 }, { "epoch": 0.25571045708244206, "grad_norm": 0.8361694812774658, "learning_rate": 1.980464092451619e-05, "loss": 0.559, "step": 1300 }, { "epoch": 0.2576774605984608, "grad_norm": 0.8130835890769958, "learning_rate": 1.9801583661989056e-05, "loss": 0.5284, "step": 1310 }, { "epoch": 0.2596444641144796, "grad_norm": 1.4580860137939453, "learning_rate": 1.9798526399461925e-05, "loss": 0.4438, "step": 1320 }, { "epoch": 0.26161146763049836, "grad_norm": 0.7845149636268616, "learning_rate": 1.979546913693479e-05, "loss": 0.4895, "step": 1330 }, { "epoch": 0.26357847114651717, "grad_norm": 0.851684033870697, "learning_rate": 1.9792411874407657e-05, "loss": 0.5788, "step": 1340 }, { "epoch": 0.265545474662536, "grad_norm": 1.1903982162475586, "learning_rate": 1.9789354611880526e-05, "loss": 0.5113, "step": 1350 }, { "epoch": 0.2675124781785547, "grad_norm": 0.595227837562561, "learning_rate": 1.978629734935339e-05, "loss": 0.4556, "step": 1360 }, { "epoch": 0.26947948169457353, "grad_norm": 0.6648783087730408, "learning_rate": 1.9783240086826257e-05, "loss": 0.3519, "step": 1370 }, { "epoch": 0.27144648521059234, "grad_norm": 0.5223883986473083, "learning_rate": 1.9780182824299123e-05, "loss": 0.6364, "step": 1380 }, { "epoch": 0.2734134887266111, "grad_norm": 0.96187824010849, "learning_rate": 1.977712556177199e-05, "loss": 0.4932, "step": 1390 }, { "epoch": 0.2753804922426299, "grad_norm": 0.8614581227302551, "learning_rate": 1.9774068299244858e-05, "loss": 0.5926, "step": 1400 }, { "epoch": 0.27734749575864864, "grad_norm": 1.119659423828125, "learning_rate": 1.9771011036717724e-05, "loss": 0.3975, "step": 1410 }, { "epoch": 0.27931449927466745, "grad_norm": 0.9885017275810242, "learning_rate": 1.976795377419059e-05, "loss": 0.5982, "step": 1420 }, { "epoch": 0.28128150279068626, "grad_norm": 1.0573168992996216, "learning_rate": 1.976489651166346e-05, "loss": 0.5111, "step": 1430 }, { "epoch": 0.283248506306705, "grad_norm": 0.9164770245552063, "learning_rate": 1.9761839249136325e-05, "loss": 0.6055, "step": 1440 }, { "epoch": 0.2852155098227238, "grad_norm": 0.7315860986709595, "learning_rate": 1.9758781986609194e-05, "loss": 0.4542, "step": 1450 }, { "epoch": 0.2871825133387426, "grad_norm": 0.679958701133728, "learning_rate": 1.975572472408206e-05, "loss": 0.5706, "step": 1460 }, { "epoch": 0.28914951685476137, "grad_norm": 0.8225506544113159, "learning_rate": 1.9752667461554925e-05, "loss": 0.5128, "step": 1470 }, { "epoch": 0.2911165203707802, "grad_norm": 0.3660297095775604, "learning_rate": 1.974961019902779e-05, "loss": 0.6373, "step": 1480 }, { "epoch": 0.2930835238867989, "grad_norm": 0.6732741594314575, "learning_rate": 1.9746552936500657e-05, "loss": 0.577, "step": 1490 }, { "epoch": 0.29505052740281773, "grad_norm": 0.38270995020866394, "learning_rate": 1.9743495673973526e-05, "loss": 0.5341, "step": 1500 }, { "epoch": 0.29505052740281773, "eval_loss": 0.2840212285518646, "eval_runtime": 8.8894, "eval_samples_per_second": 5.625, "eval_steps_per_second": 2.812, "step": 1500 }, { "epoch": 0.29701753091883654, "grad_norm": 0.7969959378242493, "learning_rate": 1.9740438411446392e-05, "loss": 0.4869, "step": 1510 }, { "epoch": 0.2989845344348553, "grad_norm": 0.6881989240646362, "learning_rate": 1.9737381148919258e-05, "loss": 0.4047, "step": 1520 }, { "epoch": 0.3009515379508741, "grad_norm": 0.5518563389778137, "learning_rate": 1.9734323886392127e-05, "loss": 0.5494, "step": 1530 }, { "epoch": 0.3029185414668929, "grad_norm": 0.6757585406303406, "learning_rate": 1.9731266623864993e-05, "loss": 0.6683, "step": 1540 }, { "epoch": 0.30488554498291165, "grad_norm": 1.0032838582992554, "learning_rate": 1.972820936133786e-05, "loss": 0.442, "step": 1550 }, { "epoch": 0.30685254849893046, "grad_norm": 0.6686198711395264, "learning_rate": 1.9725152098810728e-05, "loss": 0.4487, "step": 1560 }, { "epoch": 0.3088195520149492, "grad_norm": 0.6934469938278198, "learning_rate": 1.9722094836283593e-05, "loss": 0.5372, "step": 1570 }, { "epoch": 0.310786555530968, "grad_norm": 1.1455458402633667, "learning_rate": 1.9719037573756463e-05, "loss": 0.5002, "step": 1580 }, { "epoch": 0.3127535590469868, "grad_norm": 0.7301696538925171, "learning_rate": 1.9715980311229328e-05, "loss": 0.4819, "step": 1590 }, { "epoch": 0.31472056256300557, "grad_norm": 0.905255138874054, "learning_rate": 1.9712923048702194e-05, "loss": 0.5643, "step": 1600 }, { "epoch": 0.3166875660790244, "grad_norm": 0.869118869304657, "learning_rate": 1.970986578617506e-05, "loss": 0.6479, "step": 1610 }, { "epoch": 0.3186545695950431, "grad_norm": 0.6825863122940063, "learning_rate": 1.9706808523647926e-05, "loss": 0.4414, "step": 1620 }, { "epoch": 0.32062157311106193, "grad_norm": 0.8974255323410034, "learning_rate": 1.9703751261120795e-05, "loss": 0.3885, "step": 1630 }, { "epoch": 0.32258857662708074, "grad_norm": 0.6362448930740356, "learning_rate": 1.970069399859366e-05, "loss": 0.5692, "step": 1640 }, { "epoch": 0.3245555801430995, "grad_norm": 0.6962127685546875, "learning_rate": 1.9697636736066526e-05, "loss": 0.5196, "step": 1650 }, { "epoch": 0.3265225836591183, "grad_norm": 0.8557025790214539, "learning_rate": 1.9694579473539396e-05, "loss": 0.476, "step": 1660 }, { "epoch": 0.3284895871751371, "grad_norm": 0.439887672662735, "learning_rate": 1.969152221101226e-05, "loss": 0.6215, "step": 1670 }, { "epoch": 0.33045659069115585, "grad_norm": 0.5827410817146301, "learning_rate": 1.9688464948485127e-05, "loss": 0.4626, "step": 1680 }, { "epoch": 0.33242359420717466, "grad_norm": 0.8372606635093689, "learning_rate": 1.9685407685957996e-05, "loss": 0.4751, "step": 1690 }, { "epoch": 0.3343905977231934, "grad_norm": 0.7339947819709778, "learning_rate": 1.9682350423430862e-05, "loss": 0.5446, "step": 1700 }, { "epoch": 0.3363576012392122, "grad_norm": 0.8041804432868958, "learning_rate": 1.9679293160903728e-05, "loss": 0.5248, "step": 1710 }, { "epoch": 0.338324604755231, "grad_norm": 0.6950403451919556, "learning_rate": 1.9676235898376594e-05, "loss": 0.6077, "step": 1720 }, { "epoch": 0.34029160827124977, "grad_norm": 0.8875169157981873, "learning_rate": 1.9673178635849463e-05, "loss": 0.5625, "step": 1730 }, { "epoch": 0.3422586117872686, "grad_norm": 1.0996932983398438, "learning_rate": 1.967012137332233e-05, "loss": 0.5417, "step": 1740 }, { "epoch": 0.3442256153032874, "grad_norm": 0.685312807559967, "learning_rate": 1.9667064110795194e-05, "loss": 0.4577, "step": 1750 }, { "epoch": 0.34619261881930613, "grad_norm": 0.6270304322242737, "learning_rate": 1.9664006848268063e-05, "loss": 0.5995, "step": 1760 }, { "epoch": 0.34815962233532494, "grad_norm": 0.49572035670280457, "learning_rate": 1.966094958574093e-05, "loss": 0.5602, "step": 1770 }, { "epoch": 0.3501266258513437, "grad_norm": 0.8444635272026062, "learning_rate": 1.9657892323213795e-05, "loss": 0.536, "step": 1780 }, { "epoch": 0.3520936293673625, "grad_norm": 0.7252330780029297, "learning_rate": 1.9654835060686664e-05, "loss": 0.5329, "step": 1790 }, { "epoch": 0.3540606328833813, "grad_norm": 1.0123865604400635, "learning_rate": 1.965177779815953e-05, "loss": 0.5508, "step": 1800 }, { "epoch": 0.35602763639940005, "grad_norm": 0.6840813159942627, "learning_rate": 1.9648720535632396e-05, "loss": 0.4869, "step": 1810 }, { "epoch": 0.35799463991541886, "grad_norm": 0.9481569528579712, "learning_rate": 1.964566327310526e-05, "loss": 0.4472, "step": 1820 }, { "epoch": 0.35996164343143766, "grad_norm": 0.7030127048492432, "learning_rate": 1.964260601057813e-05, "loss": 0.4563, "step": 1830 }, { "epoch": 0.3619286469474564, "grad_norm": 1.2299069166183472, "learning_rate": 1.9639548748050996e-05, "loss": 0.3174, "step": 1840 }, { "epoch": 0.3638956504634752, "grad_norm": 0.9066298007965088, "learning_rate": 1.9636491485523862e-05, "loss": 0.4359, "step": 1850 }, { "epoch": 0.36586265397949397, "grad_norm": 0.9739953279495239, "learning_rate": 1.963343422299673e-05, "loss": 0.3874, "step": 1860 }, { "epoch": 0.3678296574955128, "grad_norm": 0.6393256187438965, "learning_rate": 1.9630376960469597e-05, "loss": 0.4747, "step": 1870 }, { "epoch": 0.3697966610115316, "grad_norm": 0.8060562014579773, "learning_rate": 1.9627319697942463e-05, "loss": 0.6424, "step": 1880 }, { "epoch": 0.37176366452755033, "grad_norm": 0.8817802667617798, "learning_rate": 1.9624262435415332e-05, "loss": 0.4514, "step": 1890 }, { "epoch": 0.37373066804356914, "grad_norm": 1.1933926343917847, "learning_rate": 1.9621205172888198e-05, "loss": 0.5605, "step": 1900 }, { "epoch": 0.3756976715595879, "grad_norm": 1.1892311573028564, "learning_rate": 1.9618147910361064e-05, "loss": 0.4527, "step": 1910 }, { "epoch": 0.3776646750756067, "grad_norm": 0.7793095111846924, "learning_rate": 1.9615090647833933e-05, "loss": 0.5498, "step": 1920 }, { "epoch": 0.3796316785916255, "grad_norm": 0.4772995114326477, "learning_rate": 1.9612033385306795e-05, "loss": 0.4916, "step": 1930 }, { "epoch": 0.38159868210764425, "grad_norm": 0.8411799669265747, "learning_rate": 1.9608976122779664e-05, "loss": 0.4298, "step": 1940 }, { "epoch": 0.38356568562366306, "grad_norm": 1.2099697589874268, "learning_rate": 1.960591886025253e-05, "loss": 0.5165, "step": 1950 }, { "epoch": 0.38553268913968186, "grad_norm": 1.0067557096481323, "learning_rate": 1.96028615977254e-05, "loss": 0.5543, "step": 1960 }, { "epoch": 0.3874996926557006, "grad_norm": 1.0297846794128418, "learning_rate": 1.9599804335198265e-05, "loss": 0.4319, "step": 1970 }, { "epoch": 0.3894666961717194, "grad_norm": 0.788569450378418, "learning_rate": 1.959674707267113e-05, "loss": 0.4775, "step": 1980 }, { "epoch": 0.39143369968773817, "grad_norm": 0.9311039447784424, "learning_rate": 1.9593689810144e-05, "loss": 0.4415, "step": 1990 }, { "epoch": 0.393400703203757, "grad_norm": 0.8891676068305969, "learning_rate": 1.9590632547616866e-05, "loss": 0.4075, "step": 2000 }, { "epoch": 0.393400703203757, "eval_loss": 0.2848837673664093, "eval_runtime": 8.8626, "eval_samples_per_second": 5.642, "eval_steps_per_second": 2.821, "step": 2000 }, { "epoch": 0.3953677067197758, "grad_norm": 0.6306418180465698, "learning_rate": 1.958757528508973e-05, "loss": 0.5114, "step": 2010 }, { "epoch": 0.39733471023579453, "grad_norm": 0.8960371613502502, "learning_rate": 1.95845180225626e-05, "loss": 0.386, "step": 2020 }, { "epoch": 0.39930171375181334, "grad_norm": 1.0953959226608276, "learning_rate": 1.9581460760035467e-05, "loss": 0.5311, "step": 2030 }, { "epoch": 0.40126871726783214, "grad_norm": 0.8647001385688782, "learning_rate": 1.9578403497508332e-05, "loss": 0.4544, "step": 2040 }, { "epoch": 0.4032357207838509, "grad_norm": 0.9456301927566528, "learning_rate": 1.9575346234981198e-05, "loss": 0.456, "step": 2050 }, { "epoch": 0.4052027242998697, "grad_norm": 0.7155416011810303, "learning_rate": 1.9572288972454064e-05, "loss": 0.5354, "step": 2060 }, { "epoch": 0.40716972781588845, "grad_norm": 1.0676209926605225, "learning_rate": 1.9569231709926933e-05, "loss": 0.4509, "step": 2070 }, { "epoch": 0.40913673133190726, "grad_norm": 1.194039225578308, "learning_rate": 1.95661744473998e-05, "loss": 0.6663, "step": 2080 }, { "epoch": 0.41110373484792606, "grad_norm": 0.9243388175964355, "learning_rate": 1.9563117184872668e-05, "loss": 0.4106, "step": 2090 }, { "epoch": 0.4130707383639448, "grad_norm": 0.9473809599876404, "learning_rate": 1.9560059922345534e-05, "loss": 0.4455, "step": 2100 }, { "epoch": 0.4150377418799636, "grad_norm": 0.6198266744613647, "learning_rate": 1.95570026598184e-05, "loss": 0.4656, "step": 2110 }, { "epoch": 0.41700474539598237, "grad_norm": 0.6981731057167053, "learning_rate": 1.955394539729127e-05, "loss": 0.4844, "step": 2120 }, { "epoch": 0.4189717489120012, "grad_norm": 0.9984627366065979, "learning_rate": 1.9550888134764134e-05, "loss": 0.4839, "step": 2130 }, { "epoch": 0.42093875242802, "grad_norm": 1.2150676250457764, "learning_rate": 1.9547830872237e-05, "loss": 0.5235, "step": 2140 }, { "epoch": 0.42290575594403873, "grad_norm": 0.801630973815918, "learning_rate": 1.954477360970987e-05, "loss": 0.5282, "step": 2150 }, { "epoch": 0.42487275946005754, "grad_norm": 1.599314570426941, "learning_rate": 1.9541716347182732e-05, "loss": 0.4939, "step": 2160 }, { "epoch": 0.42683976297607634, "grad_norm": 0.885888397693634, "learning_rate": 1.95386590846556e-05, "loss": 0.5504, "step": 2170 }, { "epoch": 0.4288067664920951, "grad_norm": 0.7052297592163086, "learning_rate": 1.9535601822128467e-05, "loss": 0.4576, "step": 2180 }, { "epoch": 0.4307737700081139, "grad_norm": 0.635510265827179, "learning_rate": 1.9532544559601333e-05, "loss": 0.5811, "step": 2190 }, { "epoch": 0.43274077352413265, "grad_norm": 0.619910478591919, "learning_rate": 1.9529487297074202e-05, "loss": 0.4898, "step": 2200 }, { "epoch": 0.43470777704015146, "grad_norm": 0.8020810484886169, "learning_rate": 1.9526430034547067e-05, "loss": 0.4318, "step": 2210 }, { "epoch": 0.43667478055617026, "grad_norm": 1.2118501663208008, "learning_rate": 1.9523372772019937e-05, "loss": 0.4156, "step": 2220 }, { "epoch": 0.438641784072189, "grad_norm": 0.6296743154525757, "learning_rate": 1.9520315509492802e-05, "loss": 0.4371, "step": 2230 }, { "epoch": 0.4406087875882078, "grad_norm": 0.7382553219795227, "learning_rate": 1.9517258246965668e-05, "loss": 0.4819, "step": 2240 }, { "epoch": 0.4425757911042266, "grad_norm": 0.9509519338607788, "learning_rate": 1.9514200984438537e-05, "loss": 0.5008, "step": 2250 }, { "epoch": 0.4445427946202454, "grad_norm": 1.098402976989746, "learning_rate": 1.9511143721911403e-05, "loss": 0.4785, "step": 2260 }, { "epoch": 0.4465097981362642, "grad_norm": 0.6164669990539551, "learning_rate": 1.950808645938427e-05, "loss": 0.4082, "step": 2270 }, { "epoch": 0.44847680165228293, "grad_norm": 0.9613497257232666, "learning_rate": 1.9505029196857135e-05, "loss": 0.3826, "step": 2280 }, { "epoch": 0.45044380516830174, "grad_norm": 0.8639736175537109, "learning_rate": 1.950197193433e-05, "loss": 0.537, "step": 2290 }, { "epoch": 0.45241080868432054, "grad_norm": 1.09634530544281, "learning_rate": 1.949891467180287e-05, "loss": 0.5444, "step": 2300 }, { "epoch": 0.4543778122003393, "grad_norm": 0.7725170254707336, "learning_rate": 1.9495857409275735e-05, "loss": 0.3995, "step": 2310 }, { "epoch": 0.4563448157163581, "grad_norm": 0.5570437908172607, "learning_rate": 1.94928001467486e-05, "loss": 0.4471, "step": 2320 }, { "epoch": 0.4583118192323769, "grad_norm": 0.8947836756706238, "learning_rate": 1.948974288422147e-05, "loss": 0.4954, "step": 2330 }, { "epoch": 0.46027882274839566, "grad_norm": 0.6316766738891602, "learning_rate": 1.9486685621694336e-05, "loss": 0.5009, "step": 2340 }, { "epoch": 0.46224582626441446, "grad_norm": 0.6918854117393494, "learning_rate": 1.9483628359167205e-05, "loss": 0.5725, "step": 2350 }, { "epoch": 0.4642128297804332, "grad_norm": 1.0366955995559692, "learning_rate": 1.948057109664007e-05, "loss": 0.5113, "step": 2360 }, { "epoch": 0.466179833296452, "grad_norm": 1.386698603630066, "learning_rate": 1.9477513834112937e-05, "loss": 0.481, "step": 2370 }, { "epoch": 0.4681468368124708, "grad_norm": 0.4483737349510193, "learning_rate": 1.9474456571585806e-05, "loss": 0.4917, "step": 2380 }, { "epoch": 0.4701138403284896, "grad_norm": 0.7257867455482483, "learning_rate": 1.947139930905867e-05, "loss": 0.5398, "step": 2390 }, { "epoch": 0.4720808438445084, "grad_norm": 1.0875515937805176, "learning_rate": 1.9468342046531538e-05, "loss": 0.3761, "step": 2400 }, { "epoch": 0.47404784736052713, "grad_norm": 1.1179277896881104, "learning_rate": 1.9465284784004403e-05, "loss": 0.3777, "step": 2410 }, { "epoch": 0.47601485087654594, "grad_norm": 1.132418155670166, "learning_rate": 1.946222752147727e-05, "loss": 0.4218, "step": 2420 }, { "epoch": 0.47798185439256474, "grad_norm": 0.5543782114982605, "learning_rate": 1.945917025895014e-05, "loss": 0.3894, "step": 2430 }, { "epoch": 0.4799488579085835, "grad_norm": 0.7127739191055298, "learning_rate": 1.9456112996423004e-05, "loss": 0.5318, "step": 2440 }, { "epoch": 0.4819158614246023, "grad_norm": 0.6442409157752991, "learning_rate": 1.945305573389587e-05, "loss": 0.4879, "step": 2450 }, { "epoch": 0.4838828649406211, "grad_norm": 0.7430324554443359, "learning_rate": 1.944999847136874e-05, "loss": 0.4242, "step": 2460 }, { "epoch": 0.48584986845663986, "grad_norm": 1.3987452983856201, "learning_rate": 1.9446941208841605e-05, "loss": 0.5224, "step": 2470 }, { "epoch": 0.48781687197265866, "grad_norm": 1.1393516063690186, "learning_rate": 1.9443883946314474e-05, "loss": 0.4801, "step": 2480 }, { "epoch": 0.4897838754886774, "grad_norm": 1.146475911140442, "learning_rate": 1.944082668378734e-05, "loss": 0.4509, "step": 2490 }, { "epoch": 0.4917508790046962, "grad_norm": 0.9160381555557251, "learning_rate": 1.9437769421260206e-05, "loss": 0.4539, "step": 2500 }, { "epoch": 0.4917508790046962, "eval_loss": 0.27316009998321533, "eval_runtime": 8.8733, "eval_samples_per_second": 5.635, "eval_steps_per_second": 2.817, "step": 2500 }, { "epoch": 0.493717882520715, "grad_norm": 0.8414424657821655, "learning_rate": 1.943471215873307e-05, "loss": 0.4398, "step": 2510 }, { "epoch": 0.4956848860367338, "grad_norm": 1.182061791419983, "learning_rate": 1.9431654896205937e-05, "loss": 0.4886, "step": 2520 }, { "epoch": 0.4976518895527526, "grad_norm": 0.6704056262969971, "learning_rate": 1.9428597633678806e-05, "loss": 0.479, "step": 2530 }, { "epoch": 0.4996188930687714, "grad_norm": 1.2297146320343018, "learning_rate": 1.9425540371151672e-05, "loss": 0.3661, "step": 2540 }, { "epoch": 0.5015858965847901, "grad_norm": 0.8071584701538086, "learning_rate": 1.9422483108624538e-05, "loss": 0.4658, "step": 2550 }, { "epoch": 0.5035529001008089, "grad_norm": 0.942072868347168, "learning_rate": 1.9419425846097407e-05, "loss": 0.4275, "step": 2560 }, { "epoch": 0.5055199036168277, "grad_norm": 0.8354616761207581, "learning_rate": 1.9416368583570273e-05, "loss": 0.4297, "step": 2570 }, { "epoch": 0.5074869071328465, "grad_norm": 0.7049144506454468, "learning_rate": 1.941331132104314e-05, "loss": 0.4462, "step": 2580 }, { "epoch": 0.5094539106488652, "grad_norm": 0.7258726358413696, "learning_rate": 1.9410254058516008e-05, "loss": 0.6207, "step": 2590 }, { "epoch": 0.5114209141648841, "grad_norm": 0.611747682094574, "learning_rate": 1.9407196795988873e-05, "loss": 0.3919, "step": 2600 }, { "epoch": 0.5133879176809029, "grad_norm": 1.1328556537628174, "learning_rate": 1.940413953346174e-05, "loss": 0.4299, "step": 2610 }, { "epoch": 0.5153549211969216, "grad_norm": 1.74727463722229, "learning_rate": 1.9401082270934605e-05, "loss": 0.5916, "step": 2620 }, { "epoch": 0.5173219247129405, "grad_norm": 1.2672849893569946, "learning_rate": 1.9398025008407474e-05, "loss": 0.4927, "step": 2630 }, { "epoch": 0.5192889282289592, "grad_norm": 1.1896312236785889, "learning_rate": 1.939496774588034e-05, "loss": 0.4934, "step": 2640 }, { "epoch": 0.521255931744978, "grad_norm": 0.9513353109359741, "learning_rate": 1.9391910483353206e-05, "loss": 0.4687, "step": 2650 }, { "epoch": 0.5232229352609967, "grad_norm": 1.061252474784851, "learning_rate": 1.9388853220826075e-05, "loss": 0.5134, "step": 2660 }, { "epoch": 0.5251899387770156, "grad_norm": 0.498430997133255, "learning_rate": 1.938579595829894e-05, "loss": 0.4358, "step": 2670 }, { "epoch": 0.5271569422930343, "grad_norm": 0.8934110999107361, "learning_rate": 1.9382738695771806e-05, "loss": 0.4829, "step": 2680 }, { "epoch": 0.5291239458090531, "grad_norm": 0.6681143045425415, "learning_rate": 1.9379681433244676e-05, "loss": 0.5481, "step": 2690 }, { "epoch": 0.531090949325072, "grad_norm": 0.9681861400604248, "learning_rate": 1.937662417071754e-05, "loss": 0.5768, "step": 2700 }, { "epoch": 0.5330579528410907, "grad_norm": 0.7599331140518188, "learning_rate": 1.9373566908190407e-05, "loss": 0.4921, "step": 2710 }, { "epoch": 0.5350249563571094, "grad_norm": 0.8120267391204834, "learning_rate": 1.9370509645663276e-05, "loss": 0.4022, "step": 2720 }, { "epoch": 0.5369919598731283, "grad_norm": 1.1031180620193481, "learning_rate": 1.9367452383136142e-05, "loss": 0.4954, "step": 2730 }, { "epoch": 0.5389589633891471, "grad_norm": 1.092254877090454, "learning_rate": 1.9364395120609008e-05, "loss": 0.4775, "step": 2740 }, { "epoch": 0.5409259669051658, "grad_norm": 1.720622181892395, "learning_rate": 1.9361337858081874e-05, "loss": 0.5302, "step": 2750 }, { "epoch": 0.5428929704211847, "grad_norm": 0.5872963070869446, "learning_rate": 1.9358280595554743e-05, "loss": 0.4608, "step": 2760 }, { "epoch": 0.5448599739372034, "grad_norm": 1.1749262809753418, "learning_rate": 1.935522333302761e-05, "loss": 0.4118, "step": 2770 }, { "epoch": 0.5468269774532222, "grad_norm": 1.0945734977722168, "learning_rate": 1.9352166070500474e-05, "loss": 0.3325, "step": 2780 }, { "epoch": 0.548793980969241, "grad_norm": 1.1984425783157349, "learning_rate": 1.9349108807973344e-05, "loss": 0.4699, "step": 2790 }, { "epoch": 0.5507609844852598, "grad_norm": 1.0518896579742432, "learning_rate": 1.934605154544621e-05, "loss": 0.5218, "step": 2800 }, { "epoch": 0.5527279880012785, "grad_norm": 1.1263470649719238, "learning_rate": 1.9342994282919075e-05, "loss": 0.3322, "step": 2810 }, { "epoch": 0.5546949915172973, "grad_norm": 0.9612912535667419, "learning_rate": 1.9339937020391944e-05, "loss": 0.486, "step": 2820 }, { "epoch": 0.5566619950333161, "grad_norm": 0.9742056131362915, "learning_rate": 1.933687975786481e-05, "loss": 0.4987, "step": 2830 }, { "epoch": 0.5586289985493349, "grad_norm": 1.2318615913391113, "learning_rate": 1.9333822495337676e-05, "loss": 0.5631, "step": 2840 }, { "epoch": 0.5605960020653536, "grad_norm": 0.7405826449394226, "learning_rate": 1.933076523281054e-05, "loss": 0.427, "step": 2850 }, { "epoch": 0.5625630055813725, "grad_norm": 1.1158024072647095, "learning_rate": 1.932770797028341e-05, "loss": 0.54, "step": 2860 }, { "epoch": 0.5645300090973913, "grad_norm": 1.1322060823440552, "learning_rate": 1.9324650707756277e-05, "loss": 0.4481, "step": 2870 }, { "epoch": 0.56649701261341, "grad_norm": 0.7638188004493713, "learning_rate": 1.9321593445229142e-05, "loss": 0.4681, "step": 2880 }, { "epoch": 0.5684640161294289, "grad_norm": 0.7837921977043152, "learning_rate": 1.931853618270201e-05, "loss": 0.4704, "step": 2890 }, { "epoch": 0.5704310196454476, "grad_norm": 0.8661313056945801, "learning_rate": 1.9315478920174877e-05, "loss": 0.4953, "step": 2900 }, { "epoch": 0.5723980231614664, "grad_norm": 1.4302645921707153, "learning_rate": 1.9312421657647743e-05, "loss": 0.4387, "step": 2910 }, { "epoch": 0.5743650266774852, "grad_norm": 1.5569978952407837, "learning_rate": 1.9309364395120612e-05, "loss": 0.4401, "step": 2920 }, { "epoch": 0.576332030193504, "grad_norm": 1.0568249225616455, "learning_rate": 1.9306307132593478e-05, "loss": 0.5414, "step": 2930 }, { "epoch": 0.5782990337095227, "grad_norm": 1.0306973457336426, "learning_rate": 1.9303249870066344e-05, "loss": 0.4364, "step": 2940 }, { "epoch": 0.5802660372255415, "grad_norm": 1.2997857332229614, "learning_rate": 1.930019260753921e-05, "loss": 0.4826, "step": 2950 }, { "epoch": 0.5822330407415603, "grad_norm": 1.0597947835922241, "learning_rate": 1.9297135345012075e-05, "loss": 0.4344, "step": 2960 }, { "epoch": 0.5842000442575791, "grad_norm": 0.6990482211112976, "learning_rate": 1.9294078082484945e-05, "loss": 0.5371, "step": 2970 }, { "epoch": 0.5861670477735978, "grad_norm": 1.2495508193969727, "learning_rate": 1.929102081995781e-05, "loss": 0.5635, "step": 2980 }, { "epoch": 0.5881340512896167, "grad_norm": 1.3085463047027588, "learning_rate": 1.928796355743068e-05, "loss": 0.6532, "step": 2990 }, { "epoch": 0.5901010548056355, "grad_norm": 0.8088992834091187, "learning_rate": 1.9284906294903545e-05, "loss": 0.4938, "step": 3000 }, { "epoch": 0.5901010548056355, "eval_loss": 0.26935356855392456, "eval_runtime": 8.8729, "eval_samples_per_second": 5.635, "eval_steps_per_second": 2.818, "step": 3000 }, { "epoch": 0.5920680583216542, "grad_norm": 0.9487748742103577, "learning_rate": 1.928184903237641e-05, "loss": 0.4492, "step": 3010 }, { "epoch": 0.5940350618376731, "grad_norm": 0.9440038800239563, "learning_rate": 1.927879176984928e-05, "loss": 0.4755, "step": 3020 }, { "epoch": 0.5960020653536918, "grad_norm": 0.7290757298469543, "learning_rate": 1.9275734507322146e-05, "loss": 0.425, "step": 3030 }, { "epoch": 0.5979690688697106, "grad_norm": 1.2127468585968018, "learning_rate": 1.9272677244795012e-05, "loss": 0.4825, "step": 3040 }, { "epoch": 0.5999360723857294, "grad_norm": 1.1375706195831299, "learning_rate": 1.926961998226788e-05, "loss": 0.4468, "step": 3050 }, { "epoch": 0.6019030759017482, "grad_norm": 0.8501954078674316, "learning_rate": 1.9266562719740747e-05, "loss": 0.4823, "step": 3060 }, { "epoch": 0.6038700794177669, "grad_norm": 0.7859975099563599, "learning_rate": 1.9263505457213612e-05, "loss": 0.5721, "step": 3070 }, { "epoch": 0.6058370829337858, "grad_norm": 0.6325510144233704, "learning_rate": 1.9260448194686478e-05, "loss": 0.3817, "step": 3080 }, { "epoch": 0.6078040864498045, "grad_norm": 0.8559825420379639, "learning_rate": 1.9257390932159344e-05, "loss": 0.3648, "step": 3090 }, { "epoch": 0.6097710899658233, "grad_norm": 1.0981616973876953, "learning_rate": 1.9254333669632213e-05, "loss": 0.5788, "step": 3100 }, { "epoch": 0.611738093481842, "grad_norm": 0.7142055630683899, "learning_rate": 1.925127640710508e-05, "loss": 0.4691, "step": 3110 }, { "epoch": 0.6137050969978609, "grad_norm": 0.9014882445335388, "learning_rate": 1.9248219144577948e-05, "loss": 0.4137, "step": 3120 }, { "epoch": 0.6156721005138797, "grad_norm": 1.2298983335494995, "learning_rate": 1.9245161882050814e-05, "loss": 0.4547, "step": 3130 }, { "epoch": 0.6176391040298984, "grad_norm": 0.9861557483673096, "learning_rate": 1.924210461952368e-05, "loss": 0.546, "step": 3140 }, { "epoch": 0.6196061075459173, "grad_norm": 0.9207095503807068, "learning_rate": 1.923904735699655e-05, "loss": 0.6043, "step": 3150 }, { "epoch": 0.621573111061936, "grad_norm": 0.9119741320610046, "learning_rate": 1.9235990094469415e-05, "loss": 0.3983, "step": 3160 }, { "epoch": 0.6235401145779548, "grad_norm": 0.946865975856781, "learning_rate": 1.923293283194228e-05, "loss": 0.3886, "step": 3170 }, { "epoch": 0.6255071180939736, "grad_norm": 0.8447843790054321, "learning_rate": 1.9229875569415146e-05, "loss": 0.4373, "step": 3180 }, { "epoch": 0.6274741216099924, "grad_norm": 1.0305899381637573, "learning_rate": 1.9226818306888012e-05, "loss": 0.612, "step": 3190 }, { "epoch": 0.6294411251260111, "grad_norm": 0.8156121373176575, "learning_rate": 1.922376104436088e-05, "loss": 0.3686, "step": 3200 }, { "epoch": 0.63140812864203, "grad_norm": 0.884971559047699, "learning_rate": 1.9220703781833747e-05, "loss": 0.52, "step": 3210 }, { "epoch": 0.6333751321580487, "grad_norm": 1.0552936792373657, "learning_rate": 1.9217646519306613e-05, "loss": 0.529, "step": 3220 }, { "epoch": 0.6353421356740675, "grad_norm": 0.8704593777656555, "learning_rate": 1.9214589256779482e-05, "loss": 0.5397, "step": 3230 }, { "epoch": 0.6373091391900862, "grad_norm": 1.075453281402588, "learning_rate": 1.9211531994252348e-05, "loss": 0.5448, "step": 3240 }, { "epoch": 0.6392761427061051, "grad_norm": 0.6276763677597046, "learning_rate": 1.9208474731725217e-05, "loss": 0.4736, "step": 3250 }, { "epoch": 0.6412431462221239, "grad_norm": 1.0386992692947388, "learning_rate": 1.9205417469198083e-05, "loss": 0.516, "step": 3260 }, { "epoch": 0.6432101497381426, "grad_norm": 1.1383495330810547, "learning_rate": 1.920236020667095e-05, "loss": 0.5796, "step": 3270 }, { "epoch": 0.6451771532541615, "grad_norm": 0.5847461819648743, "learning_rate": 1.9199302944143818e-05, "loss": 0.5272, "step": 3280 }, { "epoch": 0.6471441567701802, "grad_norm": 0.7087602019309998, "learning_rate": 1.919624568161668e-05, "loss": 0.4371, "step": 3290 }, { "epoch": 0.649111160286199, "grad_norm": 1.0593681335449219, "learning_rate": 1.919318841908955e-05, "loss": 0.4453, "step": 3300 }, { "epoch": 0.6510781638022178, "grad_norm": 0.9373090863227844, "learning_rate": 1.9190131156562415e-05, "loss": 0.5262, "step": 3310 }, { "epoch": 0.6530451673182366, "grad_norm": 1.0394222736358643, "learning_rate": 1.918707389403528e-05, "loss": 0.4508, "step": 3320 }, { "epoch": 0.6550121708342553, "grad_norm": 1.1478387117385864, "learning_rate": 1.918401663150815e-05, "loss": 0.4628, "step": 3330 }, { "epoch": 0.6569791743502742, "grad_norm": 1.0079622268676758, "learning_rate": 1.9180959368981016e-05, "loss": 0.5326, "step": 3340 }, { "epoch": 0.658946177866293, "grad_norm": 1.0284887552261353, "learning_rate": 1.917790210645388e-05, "loss": 0.3955, "step": 3350 }, { "epoch": 0.6609131813823117, "grad_norm": 0.9022873640060425, "learning_rate": 1.917484484392675e-05, "loss": 0.3859, "step": 3360 }, { "epoch": 0.6628801848983306, "grad_norm": 1.5396238565444946, "learning_rate": 1.9171787581399616e-05, "loss": 0.5984, "step": 3370 }, { "epoch": 0.6648471884143493, "grad_norm": 2.1132633686065674, "learning_rate": 1.9168730318872485e-05, "loss": 0.5249, "step": 3380 }, { "epoch": 0.6668141919303681, "grad_norm": 0.8763299584388733, "learning_rate": 1.916567305634535e-05, "loss": 0.4281, "step": 3390 }, { "epoch": 0.6687811954463868, "grad_norm": 1.1695775985717773, "learning_rate": 1.9162615793818217e-05, "loss": 0.4416, "step": 3400 }, { "epoch": 0.6707481989624057, "grad_norm": 1.1734123229980469, "learning_rate": 1.9159558531291083e-05, "loss": 0.4649, "step": 3410 }, { "epoch": 0.6727152024784244, "grad_norm": 1.1211220026016235, "learning_rate": 1.915650126876395e-05, "loss": 0.4709, "step": 3420 }, { "epoch": 0.6746822059944432, "grad_norm": 0.763123631477356, "learning_rate": 1.9153444006236818e-05, "loss": 0.3934, "step": 3430 }, { "epoch": 0.676649209510462, "grad_norm": 0.5031880140304565, "learning_rate": 1.9150386743709684e-05, "loss": 0.5802, "step": 3440 }, { "epoch": 0.6786162130264808, "grad_norm": 1.1181432008743286, "learning_rate": 1.914732948118255e-05, "loss": 0.4726, "step": 3450 }, { "epoch": 0.6805832165424995, "grad_norm": 1.1385880708694458, "learning_rate": 1.914427221865542e-05, "loss": 0.5769, "step": 3460 }, { "epoch": 0.6825502200585184, "grad_norm": 1.035854697227478, "learning_rate": 1.9141214956128284e-05, "loss": 0.5229, "step": 3470 }, { "epoch": 0.6845172235745371, "grad_norm": 0.8765487670898438, "learning_rate": 1.913815769360115e-05, "loss": 0.5184, "step": 3480 }, { "epoch": 0.6864842270905559, "grad_norm": 0.7699954509735107, "learning_rate": 1.913510043107402e-05, "loss": 0.5459, "step": 3490 }, { "epoch": 0.6884512306065748, "grad_norm": 1.094295859336853, "learning_rate": 1.9132043168546885e-05, "loss": 0.4648, "step": 3500 }, { "epoch": 0.6884512306065748, "eval_loss": 0.26830142736434937, "eval_runtime": 8.8663, "eval_samples_per_second": 5.639, "eval_steps_per_second": 2.82, "step": 3500 }, { "epoch": 0.6904182341225935, "grad_norm": 0.8461592793464661, "learning_rate": 1.9128985906019754e-05, "loss": 0.4858, "step": 3510 }, { "epoch": 0.6923852376386123, "grad_norm": 1.2828164100646973, "learning_rate": 1.9125928643492617e-05, "loss": 0.5672, "step": 3520 }, { "epoch": 0.694352241154631, "grad_norm": 1.2198454141616821, "learning_rate": 1.9122871380965486e-05, "loss": 0.5289, "step": 3530 }, { "epoch": 0.6963192446706499, "grad_norm": 1.0390441417694092, "learning_rate": 1.911981411843835e-05, "loss": 0.5492, "step": 3540 }, { "epoch": 0.6982862481866686, "grad_norm": 1.178147792816162, "learning_rate": 1.9116756855911217e-05, "loss": 0.4086, "step": 3550 }, { "epoch": 0.7002532517026874, "grad_norm": 1.009112000465393, "learning_rate": 1.9113699593384086e-05, "loss": 0.45, "step": 3560 }, { "epoch": 0.7022202552187062, "grad_norm": 1.2733867168426514, "learning_rate": 1.9110642330856952e-05, "loss": 0.499, "step": 3570 }, { "epoch": 0.704187258734725, "grad_norm": 1.1256855726242065, "learning_rate": 1.9107585068329818e-05, "loss": 0.4129, "step": 3580 }, { "epoch": 0.7061542622507437, "grad_norm": 1.1128904819488525, "learning_rate": 1.9104527805802687e-05, "loss": 0.5093, "step": 3590 }, { "epoch": 0.7081212657667626, "grad_norm": 0.9144822955131531, "learning_rate": 1.9101470543275553e-05, "loss": 0.3973, "step": 3600 }, { "epoch": 0.7100882692827813, "grad_norm": 0.6767692565917969, "learning_rate": 1.909841328074842e-05, "loss": 0.4236, "step": 3610 }, { "epoch": 0.7120552727988001, "grad_norm": 0.9001137614250183, "learning_rate": 1.9095356018221288e-05, "loss": 0.501, "step": 3620 }, { "epoch": 0.714022276314819, "grad_norm": 0.9917561411857605, "learning_rate": 1.909229875569415e-05, "loss": 0.36, "step": 3630 }, { "epoch": 0.7159892798308377, "grad_norm": 0.810536801815033, "learning_rate": 1.908924149316702e-05, "loss": 0.5213, "step": 3640 }, { "epoch": 0.7179562833468565, "grad_norm": 1.5590412616729736, "learning_rate": 1.9086184230639885e-05, "loss": 0.6036, "step": 3650 }, { "epoch": 0.7199232868628753, "grad_norm": 0.597394585609436, "learning_rate": 1.9083126968112754e-05, "loss": 0.4566, "step": 3660 }, { "epoch": 0.7218902903788941, "grad_norm": 0.692649781703949, "learning_rate": 1.908006970558562e-05, "loss": 0.4622, "step": 3670 }, { "epoch": 0.7238572938949128, "grad_norm": 0.9043506383895874, "learning_rate": 1.9077012443058486e-05, "loss": 0.4407, "step": 3680 }, { "epoch": 0.7258242974109316, "grad_norm": 1.1840672492980957, "learning_rate": 1.9073955180531355e-05, "loss": 0.4605, "step": 3690 }, { "epoch": 0.7277913009269504, "grad_norm": 0.9953239560127258, "learning_rate": 1.907089791800422e-05, "loss": 0.5095, "step": 3700 }, { "epoch": 0.7297583044429692, "grad_norm": 0.6179831624031067, "learning_rate": 1.9067840655477087e-05, "loss": 0.4551, "step": 3710 }, { "epoch": 0.7317253079589879, "grad_norm": 1.8376891613006592, "learning_rate": 1.9064783392949956e-05, "loss": 0.399, "step": 3720 }, { "epoch": 0.7336923114750068, "grad_norm": 1.0583064556121826, "learning_rate": 1.906172613042282e-05, "loss": 0.465, "step": 3730 }, { "epoch": 0.7356593149910255, "grad_norm": 1.0016286373138428, "learning_rate": 1.9058668867895687e-05, "loss": 0.5169, "step": 3740 }, { "epoch": 0.7376263185070443, "grad_norm": 1.5237940549850464, "learning_rate": 1.9055611605368553e-05, "loss": 0.5857, "step": 3750 }, { "epoch": 0.7395933220230632, "grad_norm": 1.1710821390151978, "learning_rate": 1.905255434284142e-05, "loss": 0.4861, "step": 3760 }, { "epoch": 0.7415603255390819, "grad_norm": 0.7797861695289612, "learning_rate": 1.9049497080314288e-05, "loss": 0.5841, "step": 3770 }, { "epoch": 0.7435273290551007, "grad_norm": 1.0958030223846436, "learning_rate": 1.9046439817787154e-05, "loss": 0.4966, "step": 3780 }, { "epoch": 0.7454943325711195, "grad_norm": 0.7116280794143677, "learning_rate": 1.9043382555260023e-05, "loss": 0.4492, "step": 3790 }, { "epoch": 0.7474613360871383, "grad_norm": 0.9613781571388245, "learning_rate": 1.904032529273289e-05, "loss": 0.5049, "step": 3800 }, { "epoch": 0.749428339603157, "grad_norm": 0.9669978618621826, "learning_rate": 1.9037268030205755e-05, "loss": 0.5129, "step": 3810 }, { "epoch": 0.7513953431191758, "grad_norm": 0.8262606263160706, "learning_rate": 1.9034210767678624e-05, "loss": 0.5172, "step": 3820 }, { "epoch": 0.7533623466351946, "grad_norm": 2.477900743484497, "learning_rate": 1.903115350515149e-05, "loss": 0.537, "step": 3830 }, { "epoch": 0.7553293501512134, "grad_norm": 0.6476783752441406, "learning_rate": 1.9028096242624355e-05, "loss": 0.5639, "step": 3840 }, { "epoch": 0.7572963536672321, "grad_norm": 1.1109529733657837, "learning_rate": 1.9025038980097224e-05, "loss": 0.4391, "step": 3850 }, { "epoch": 0.759263357183251, "grad_norm": 0.8402903079986572, "learning_rate": 1.9021981717570087e-05, "loss": 0.389, "step": 3860 }, { "epoch": 0.7612303606992697, "grad_norm": 1.0447739362716675, "learning_rate": 1.9018924455042956e-05, "loss": 0.4631, "step": 3870 }, { "epoch": 0.7631973642152885, "grad_norm": 1.4362361431121826, "learning_rate": 1.9015867192515822e-05, "loss": 0.4975, "step": 3880 }, { "epoch": 0.7651643677313074, "grad_norm": 1.702034831047058, "learning_rate": 1.9012809929988688e-05, "loss": 0.5585, "step": 3890 }, { "epoch": 0.7671313712473261, "grad_norm": 1.469128966331482, "learning_rate": 1.9009752667461557e-05, "loss": 0.5571, "step": 3900 }, { "epoch": 0.7690983747633449, "grad_norm": 0.793484628200531, "learning_rate": 1.9006695404934423e-05, "loss": 0.4067, "step": 3910 }, { "epoch": 0.7710653782793637, "grad_norm": 0.775711715221405, "learning_rate": 1.900363814240729e-05, "loss": 0.5249, "step": 3920 }, { "epoch": 0.7730323817953825, "grad_norm": 0.528615415096283, "learning_rate": 1.9000580879880157e-05, "loss": 0.3714, "step": 3930 }, { "epoch": 0.7749993853114012, "grad_norm": 0.9931712746620178, "learning_rate": 1.8997523617353023e-05, "loss": 0.4286, "step": 3940 }, { "epoch": 0.77696638882742, "grad_norm": 1.9543815851211548, "learning_rate": 1.8994466354825892e-05, "loss": 0.4738, "step": 3950 }, { "epoch": 0.7789333923434388, "grad_norm": 1.5863524675369263, "learning_rate": 1.8991409092298758e-05, "loss": 0.518, "step": 3960 }, { "epoch": 0.7809003958594576, "grad_norm": 1.0554157495498657, "learning_rate": 1.8988351829771624e-05, "loss": 0.5735, "step": 3970 }, { "epoch": 0.7828673993754763, "grad_norm": 1.5502032041549683, "learning_rate": 1.898529456724449e-05, "loss": 0.4925, "step": 3980 }, { "epoch": 0.7848344028914952, "grad_norm": 0.9096987843513489, "learning_rate": 1.8982237304717356e-05, "loss": 0.2723, "step": 3990 }, { "epoch": 0.786801406407514, "grad_norm": 1.4501546621322632, "learning_rate": 1.8979180042190225e-05, "loss": 0.4634, "step": 4000 }, { "epoch": 0.786801406407514, "eval_loss": 0.2581852972507477, "eval_runtime": 8.8766, "eval_samples_per_second": 5.633, "eval_steps_per_second": 2.816, "step": 4000 }, { "epoch": 0.7887684099235327, "grad_norm": 0.8492615818977356, "learning_rate": 1.897612277966309e-05, "loss": 0.4806, "step": 4010 }, { "epoch": 0.7907354134395516, "grad_norm": 1.0982277393341064, "learning_rate": 1.8973065517135956e-05, "loss": 0.481, "step": 4020 }, { "epoch": 0.7927024169555703, "grad_norm": 1.1932698488235474, "learning_rate": 1.8970008254608825e-05, "loss": 0.4803, "step": 4030 }, { "epoch": 0.7946694204715891, "grad_norm": 0.9135488867759705, "learning_rate": 1.896695099208169e-05, "loss": 0.5505, "step": 4040 }, { "epoch": 0.7966364239876079, "grad_norm": 1.5870565176010132, "learning_rate": 1.896389372955456e-05, "loss": 0.4184, "step": 4050 }, { "epoch": 0.7986034275036267, "grad_norm": 0.9987393021583557, "learning_rate": 1.8960836467027426e-05, "loss": 0.5769, "step": 4060 }, { "epoch": 0.8005704310196454, "grad_norm": 1.0902693271636963, "learning_rate": 1.8957779204500292e-05, "loss": 0.4934, "step": 4070 }, { "epoch": 0.8025374345356643, "grad_norm": 1.2201869487762451, "learning_rate": 1.895472194197316e-05, "loss": 0.4007, "step": 4080 }, { "epoch": 0.804504438051683, "grad_norm": 1.425352692604065, "learning_rate": 1.8951664679446023e-05, "loss": 0.6055, "step": 4090 }, { "epoch": 0.8064714415677018, "grad_norm": 1.107489824295044, "learning_rate": 1.8948607416918893e-05, "loss": 0.5402, "step": 4100 }, { "epoch": 0.8084384450837205, "grad_norm": 0.8518027067184448, "learning_rate": 1.894555015439176e-05, "loss": 0.4802, "step": 4110 }, { "epoch": 0.8104054485997394, "grad_norm": 0.9588760137557983, "learning_rate": 1.8942492891864624e-05, "loss": 0.4809, "step": 4120 }, { "epoch": 0.8123724521157581, "grad_norm": 1.3297154903411865, "learning_rate": 1.8939435629337493e-05, "loss": 0.4471, "step": 4130 }, { "epoch": 0.8143394556317769, "grad_norm": 0.9255673885345459, "learning_rate": 1.893637836681036e-05, "loss": 0.4714, "step": 4140 }, { "epoch": 0.8163064591477958, "grad_norm": 1.3429300785064697, "learning_rate": 1.8933321104283225e-05, "loss": 0.5203, "step": 4150 }, { "epoch": 0.8182734626638145, "grad_norm": 1.7732727527618408, "learning_rate": 1.8930263841756094e-05, "loss": 0.5918, "step": 4160 }, { "epoch": 0.8202404661798333, "grad_norm": 1.1229453086853027, "learning_rate": 1.892720657922896e-05, "loss": 0.449, "step": 4170 }, { "epoch": 0.8222074696958521, "grad_norm": 1.2160098552703857, "learning_rate": 1.892414931670183e-05, "loss": 0.4994, "step": 4180 }, { "epoch": 0.8241744732118709, "grad_norm": 1.6124922037124634, "learning_rate": 1.8921092054174695e-05, "loss": 0.4356, "step": 4190 }, { "epoch": 0.8261414767278896, "grad_norm": 1.0741393566131592, "learning_rate": 1.891803479164756e-05, "loss": 0.427, "step": 4200 }, { "epoch": 0.8281084802439085, "grad_norm": 1.2140878438949585, "learning_rate": 1.8914977529120426e-05, "loss": 0.5503, "step": 4210 }, { "epoch": 0.8300754837599272, "grad_norm": 1.016489028930664, "learning_rate": 1.8911920266593292e-05, "loss": 0.5485, "step": 4220 }, { "epoch": 0.832042487275946, "grad_norm": 1.1552870273590088, "learning_rate": 1.890886300406616e-05, "loss": 0.4611, "step": 4230 }, { "epoch": 0.8340094907919647, "grad_norm": 0.9830273985862732, "learning_rate": 1.8905805741539027e-05, "loss": 0.4794, "step": 4240 }, { "epoch": 0.8359764943079836, "grad_norm": 0.9099911451339722, "learning_rate": 1.8902748479011893e-05, "loss": 0.4479, "step": 4250 }, { "epoch": 0.8379434978240023, "grad_norm": 1.2491271495819092, "learning_rate": 1.8899691216484762e-05, "loss": 0.5845, "step": 4260 }, { "epoch": 0.8399105013400211, "grad_norm": 1.144546627998352, "learning_rate": 1.8896633953957628e-05, "loss": 0.4979, "step": 4270 }, { "epoch": 0.84187750485604, "grad_norm": 0.45869743824005127, "learning_rate": 1.8893576691430494e-05, "loss": 0.4119, "step": 4280 }, { "epoch": 0.8438445083720587, "grad_norm": 1.6647731065750122, "learning_rate": 1.8890519428903363e-05, "loss": 0.5917, "step": 4290 }, { "epoch": 0.8458115118880775, "grad_norm": 0.6388562321662903, "learning_rate": 1.888746216637623e-05, "loss": 0.534, "step": 4300 }, { "epoch": 0.8477785154040963, "grad_norm": 1.499766230583191, "learning_rate": 1.8884404903849094e-05, "loss": 0.6057, "step": 4310 }, { "epoch": 0.8497455189201151, "grad_norm": 1.1714287996292114, "learning_rate": 1.888134764132196e-05, "loss": 0.4296, "step": 4320 }, { "epoch": 0.8517125224361338, "grad_norm": 0.9272406697273254, "learning_rate": 1.887829037879483e-05, "loss": 0.4614, "step": 4330 }, { "epoch": 0.8536795259521527, "grad_norm": 0.9848095774650574, "learning_rate": 1.8875233116267695e-05, "loss": 0.4652, "step": 4340 }, { "epoch": 0.8556465294681714, "grad_norm": 1.2984775304794312, "learning_rate": 1.887217585374056e-05, "loss": 0.4715, "step": 4350 }, { "epoch": 0.8576135329841902, "grad_norm": 0.8881934285163879, "learning_rate": 1.886911859121343e-05, "loss": 0.3736, "step": 4360 }, { "epoch": 0.859580536500209, "grad_norm": 1.8501014709472656, "learning_rate": 1.8866061328686296e-05, "loss": 0.4502, "step": 4370 }, { "epoch": 0.8615475400162278, "grad_norm": 1.3918039798736572, "learning_rate": 1.886300406615916e-05, "loss": 0.5603, "step": 4380 }, { "epoch": 0.8635145435322465, "grad_norm": 1.2267512083053589, "learning_rate": 1.885994680363203e-05, "loss": 0.4656, "step": 4390 }, { "epoch": 0.8654815470482653, "grad_norm": 1.0257207155227661, "learning_rate": 1.8856889541104896e-05, "loss": 0.3862, "step": 4400 }, { "epoch": 0.8674485505642842, "grad_norm": 0.5115770101547241, "learning_rate": 1.8853832278577762e-05, "loss": 0.4928, "step": 4410 }, { "epoch": 0.8694155540803029, "grad_norm": 0.8817374110221863, "learning_rate": 1.885077501605063e-05, "loss": 0.4431, "step": 4420 }, { "epoch": 0.8713825575963217, "grad_norm": 1.37067449092865, "learning_rate": 1.8847717753523497e-05, "loss": 0.5065, "step": 4430 }, { "epoch": 0.8733495611123405, "grad_norm": 0.4619062840938568, "learning_rate": 1.8844660490996363e-05, "loss": 0.4052, "step": 4440 }, { "epoch": 0.8753165646283593, "grad_norm": 0.8775585293769836, "learning_rate": 1.884160322846923e-05, "loss": 0.4538, "step": 4450 }, { "epoch": 0.877283568144378, "grad_norm": 0.6405948996543884, "learning_rate": 1.8838545965942098e-05, "loss": 0.4881, "step": 4460 }, { "epoch": 0.8792505716603969, "grad_norm": 1.11896812915802, "learning_rate": 1.8835488703414964e-05, "loss": 0.3966, "step": 4470 }, { "epoch": 0.8812175751764156, "grad_norm": 1.493742823600769, "learning_rate": 1.883243144088783e-05, "loss": 0.4512, "step": 4480 }, { "epoch": 0.8831845786924344, "grad_norm": 1.34097158908844, "learning_rate": 1.88293741783607e-05, "loss": 0.3983, "step": 4490 }, { "epoch": 0.8851515822084532, "grad_norm": 1.2133510112762451, "learning_rate": 1.8826316915833564e-05, "loss": 0.4512, "step": 4500 }, { "epoch": 0.8851515822084532, "eval_loss": 0.25500819087028503, "eval_runtime": 8.9175, "eval_samples_per_second": 5.607, "eval_steps_per_second": 2.803, "step": 4500 }, { "epoch": 0.887118585724472, "grad_norm": 0.8433915376663208, "learning_rate": 1.882325965330643e-05, "loss": 0.4379, "step": 4510 }, { "epoch": 0.8890855892404907, "grad_norm": 1.048805832862854, "learning_rate": 1.88202023907793e-05, "loss": 0.3957, "step": 4520 }, { "epoch": 0.8910525927565095, "grad_norm": 1.5707403421401978, "learning_rate": 1.8817145128252165e-05, "loss": 0.4594, "step": 4530 }, { "epoch": 0.8930195962725284, "grad_norm": 1.2741488218307495, "learning_rate": 1.881408786572503e-05, "loss": 0.5692, "step": 4540 }, { "epoch": 0.8949865997885471, "grad_norm": 0.6873851418495178, "learning_rate": 1.8811030603197897e-05, "loss": 0.3774, "step": 4550 }, { "epoch": 0.8969536033045659, "grad_norm": 0.9358246922492981, "learning_rate": 1.8807973340670766e-05, "loss": 0.5593, "step": 4560 }, { "epoch": 0.8989206068205847, "grad_norm": 1.3726028203964233, "learning_rate": 1.880491607814363e-05, "loss": 0.4497, "step": 4570 }, { "epoch": 0.9008876103366035, "grad_norm": 0.9805541634559631, "learning_rate": 1.8801858815616497e-05, "loss": 0.3744, "step": 4580 }, { "epoch": 0.9028546138526222, "grad_norm": 1.2838389873504639, "learning_rate": 1.8798801553089367e-05, "loss": 0.4415, "step": 4590 }, { "epoch": 0.9048216173686411, "grad_norm": 0.782386064529419, "learning_rate": 1.8795744290562232e-05, "loss": 0.4138, "step": 4600 }, { "epoch": 0.9067886208846598, "grad_norm": 1.1255232095718384, "learning_rate": 1.8792687028035098e-05, "loss": 0.5175, "step": 4610 }, { "epoch": 0.9087556244006786, "grad_norm": 1.0745489597320557, "learning_rate": 1.8789629765507967e-05, "loss": 0.432, "step": 4620 }, { "epoch": 0.9107226279166974, "grad_norm": 0.6713242530822754, "learning_rate": 1.8786572502980833e-05, "loss": 0.5976, "step": 4630 }, { "epoch": 0.9126896314327162, "grad_norm": 1.3857566118240356, "learning_rate": 1.87835152404537e-05, "loss": 0.4972, "step": 4640 }, { "epoch": 0.914656634948735, "grad_norm": 1.6660428047180176, "learning_rate": 1.8780457977926565e-05, "loss": 0.456, "step": 4650 }, { "epoch": 0.9166236384647538, "grad_norm": 1.1957042217254639, "learning_rate": 1.877740071539943e-05, "loss": 0.5614, "step": 4660 }, { "epoch": 0.9185906419807726, "grad_norm": 1.4520841836929321, "learning_rate": 1.87743434528723e-05, "loss": 0.5019, "step": 4670 }, { "epoch": 0.9205576454967913, "grad_norm": 1.6292874813079834, "learning_rate": 1.8771286190345165e-05, "loss": 0.4422, "step": 4680 }, { "epoch": 0.9225246490128101, "grad_norm": 1.2969595193862915, "learning_rate": 1.8768228927818035e-05, "loss": 0.4567, "step": 4690 }, { "epoch": 0.9244916525288289, "grad_norm": 0.8778219223022461, "learning_rate": 1.87651716652909e-05, "loss": 0.6256, "step": 4700 }, { "epoch": 0.9264586560448477, "grad_norm": 0.725437343120575, "learning_rate": 1.8762114402763766e-05, "loss": 0.5737, "step": 4710 }, { "epoch": 0.9284256595608664, "grad_norm": 0.9920393228530884, "learning_rate": 1.8759057140236635e-05, "loss": 0.4377, "step": 4720 }, { "epoch": 0.9303926630768853, "grad_norm": 1.587246298789978, "learning_rate": 1.87559998777095e-05, "loss": 0.4066, "step": 4730 }, { "epoch": 0.932359666592904, "grad_norm": 1.5838128328323364, "learning_rate": 1.8752942615182367e-05, "loss": 0.4921, "step": 4740 }, { "epoch": 0.9343266701089228, "grad_norm": 2.2074408531188965, "learning_rate": 1.8749885352655236e-05, "loss": 0.4415, "step": 4750 }, { "epoch": 0.9362936736249416, "grad_norm": 0.7175712585449219, "learning_rate": 1.8746828090128102e-05, "loss": 0.5967, "step": 4760 }, { "epoch": 0.9382606771409604, "grad_norm": 0.672035276889801, "learning_rate": 1.8743770827600968e-05, "loss": 0.4765, "step": 4770 }, { "epoch": 0.9402276806569791, "grad_norm": 1.2136248350143433, "learning_rate": 1.8740713565073833e-05, "loss": 0.4918, "step": 4780 }, { "epoch": 0.942194684172998, "grad_norm": 1.1846280097961426, "learning_rate": 1.87376563025467e-05, "loss": 0.5204, "step": 4790 }, { "epoch": 0.9441616876890168, "grad_norm": 0.9919416904449463, "learning_rate": 1.8734599040019568e-05, "loss": 0.5251, "step": 4800 }, { "epoch": 0.9461286912050355, "grad_norm": 0.9183461666107178, "learning_rate": 1.8731541777492434e-05, "loss": 0.3683, "step": 4810 }, { "epoch": 0.9480956947210543, "grad_norm": 1.5398882627487183, "learning_rate": 1.8728484514965303e-05, "loss": 0.4656, "step": 4820 }, { "epoch": 0.9500626982370731, "grad_norm": 2.7431869506835938, "learning_rate": 1.872542725243817e-05, "loss": 0.4459, "step": 4830 }, { "epoch": 0.9520297017530919, "grad_norm": 1.4181733131408691, "learning_rate": 1.8722369989911035e-05, "loss": 0.4347, "step": 4840 }, { "epoch": 0.9539967052691106, "grad_norm": 1.1322598457336426, "learning_rate": 1.8719312727383904e-05, "loss": 0.4761, "step": 4850 }, { "epoch": 0.9559637087851295, "grad_norm": 1.0552661418914795, "learning_rate": 1.871625546485677e-05, "loss": 0.4593, "step": 4860 }, { "epoch": 0.9579307123011482, "grad_norm": 1.7677165269851685, "learning_rate": 1.8713198202329635e-05, "loss": 0.516, "step": 4870 }, { "epoch": 0.959897715817167, "grad_norm": 1.1124646663665771, "learning_rate": 1.87101409398025e-05, "loss": 0.4594, "step": 4880 }, { "epoch": 0.9618647193331858, "grad_norm": 0.9659914970397949, "learning_rate": 1.8707083677275367e-05, "loss": 0.4844, "step": 4890 }, { "epoch": 0.9638317228492046, "grad_norm": 0.9251281023025513, "learning_rate": 1.8704026414748236e-05, "loss": 0.5587, "step": 4900 }, { "epoch": 0.9657987263652233, "grad_norm": 1.2297642230987549, "learning_rate": 1.8700969152221102e-05, "loss": 0.5739, "step": 4910 }, { "epoch": 0.9677657298812422, "grad_norm": 1.329194188117981, "learning_rate": 1.8697911889693968e-05, "loss": 0.5016, "step": 4920 }, { "epoch": 0.969732733397261, "grad_norm": 0.8818217515945435, "learning_rate": 1.8694854627166837e-05, "loss": 0.4428, "step": 4930 }, { "epoch": 0.9716997369132797, "grad_norm": 1.1140741109848022, "learning_rate": 1.8691797364639703e-05, "loss": 0.5713, "step": 4940 }, { "epoch": 0.9736667404292986, "grad_norm": 0.5966857075691223, "learning_rate": 1.8688740102112572e-05, "loss": 0.4876, "step": 4950 }, { "epoch": 0.9756337439453173, "grad_norm": 0.8906331062316895, "learning_rate": 1.8685682839585438e-05, "loss": 0.4157, "step": 4960 }, { "epoch": 0.9776007474613361, "grad_norm": 1.6256823539733887, "learning_rate": 1.8682625577058303e-05, "loss": 0.4524, "step": 4970 }, { "epoch": 0.9795677509773548, "grad_norm": 1.4493746757507324, "learning_rate": 1.8679568314531173e-05, "loss": 0.5159, "step": 4980 }, { "epoch": 0.9815347544933737, "grad_norm": 0.9830152988433838, "learning_rate": 1.8676511052004035e-05, "loss": 0.3848, "step": 4990 }, { "epoch": 0.9835017580093924, "grad_norm": 1.2543774843215942, "learning_rate": 1.8673453789476904e-05, "loss": 0.5435, "step": 5000 }, { "epoch": 0.9835017580093924, "eval_loss": 0.2498249113559723, "eval_runtime": 8.8851, "eval_samples_per_second": 5.627, "eval_steps_per_second": 2.814, "step": 5000 }, { "epoch": 0.9854687615254112, "grad_norm": 1.0389529466629028, "learning_rate": 1.867039652694977e-05, "loss": 0.4089, "step": 5010 }, { "epoch": 0.98743576504143, "grad_norm": 0.9256637692451477, "learning_rate": 1.8667339264422636e-05, "loss": 0.5185, "step": 5020 }, { "epoch": 0.9894027685574488, "grad_norm": 1.2138028144836426, "learning_rate": 1.8664282001895505e-05, "loss": 0.5416, "step": 5030 }, { "epoch": 0.9913697720734675, "grad_norm": 1.4770766496658325, "learning_rate": 1.866122473936837e-05, "loss": 0.4611, "step": 5040 }, { "epoch": 0.9933367755894864, "grad_norm": 0.783585250377655, "learning_rate": 1.8658167476841236e-05, "loss": 0.4212, "step": 5050 }, { "epoch": 0.9953037791055052, "grad_norm": 1.2547729015350342, "learning_rate": 1.8655110214314106e-05, "loss": 0.5168, "step": 5060 }, { "epoch": 0.9972707826215239, "grad_norm": 0.9369317293167114, "learning_rate": 1.865205295178697e-05, "loss": 0.5653, "step": 5070 }, { "epoch": 0.9992377861375428, "grad_norm": 1.1755496263504028, "learning_rate": 1.864899568925984e-05, "loss": 0.5023, "step": 5080 }, { "epoch": 1.0012047896535614, "grad_norm": 0.9298199415206909, "learning_rate": 1.8645938426732706e-05, "loss": 0.5254, "step": 5090 }, { "epoch": 1.0031717931695803, "grad_norm": 1.0964889526367188, "learning_rate": 1.8642881164205572e-05, "loss": 0.367, "step": 5100 }, { "epoch": 1.0051387966855991, "grad_norm": 1.8859641551971436, "learning_rate": 1.8639823901678438e-05, "loss": 0.4357, "step": 5110 }, { "epoch": 1.0071058002016178, "grad_norm": 1.2474465370178223, "learning_rate": 1.8636766639151304e-05, "loss": 0.4071, "step": 5120 }, { "epoch": 1.0090728037176366, "grad_norm": 0.820743203163147, "learning_rate": 1.8633709376624173e-05, "loss": 0.4397, "step": 5130 }, { "epoch": 1.0110398072336555, "grad_norm": 1.2931946516036987, "learning_rate": 1.863065211409704e-05, "loss": 0.3876, "step": 5140 }, { "epoch": 1.0130068107496741, "grad_norm": 0.6955686211585999, "learning_rate": 1.8627594851569904e-05, "loss": 0.408, "step": 5150 }, { "epoch": 1.014973814265693, "grad_norm": 0.8109415769577026, "learning_rate": 1.8624537589042773e-05, "loss": 0.4783, "step": 5160 }, { "epoch": 1.0169408177817119, "grad_norm": 0.7668949961662292, "learning_rate": 1.862148032651564e-05, "loss": 0.5177, "step": 5170 }, { "epoch": 1.0189078212977305, "grad_norm": 1.3539754152297974, "learning_rate": 1.8618423063988505e-05, "loss": 0.4552, "step": 5180 }, { "epoch": 1.0208748248137494, "grad_norm": 1.7623764276504517, "learning_rate": 1.8615365801461374e-05, "loss": 0.4712, "step": 5190 }, { "epoch": 1.0228418283297682, "grad_norm": 1.3476308584213257, "learning_rate": 1.861230853893424e-05, "loss": 0.3688, "step": 5200 }, { "epoch": 1.0248088318457869, "grad_norm": 0.7044321298599243, "learning_rate": 1.860925127640711e-05, "loss": 0.5159, "step": 5210 }, { "epoch": 1.0267758353618057, "grad_norm": 1.0632188320159912, "learning_rate": 1.860619401387997e-05, "loss": 0.4336, "step": 5220 }, { "epoch": 1.0287428388778246, "grad_norm": 1.7851991653442383, "learning_rate": 1.860313675135284e-05, "loss": 0.4671, "step": 5230 }, { "epoch": 1.0307098423938432, "grad_norm": 0.7615410685539246, "learning_rate": 1.8600079488825706e-05, "loss": 0.4408, "step": 5240 }, { "epoch": 1.032676845909862, "grad_norm": 1.042166829109192, "learning_rate": 1.8597022226298572e-05, "loss": 0.3965, "step": 5250 }, { "epoch": 1.034643849425881, "grad_norm": 0.7487517595291138, "learning_rate": 1.859396496377144e-05, "loss": 0.3477, "step": 5260 }, { "epoch": 1.0366108529418996, "grad_norm": 0.9106249213218689, "learning_rate": 1.8590907701244307e-05, "loss": 0.601, "step": 5270 }, { "epoch": 1.0385778564579184, "grad_norm": 1.1695805788040161, "learning_rate": 1.8587850438717173e-05, "loss": 0.4192, "step": 5280 }, { "epoch": 1.0405448599739373, "grad_norm": 0.398007333278656, "learning_rate": 1.8584793176190042e-05, "loss": 0.4124, "step": 5290 }, { "epoch": 1.042511863489956, "grad_norm": 0.6286818981170654, "learning_rate": 1.8581735913662908e-05, "loss": 0.4796, "step": 5300 }, { "epoch": 1.0444788670059748, "grad_norm": 1.6384111642837524, "learning_rate": 1.8578678651135774e-05, "loss": 0.4816, "step": 5310 }, { "epoch": 1.0464458705219934, "grad_norm": 0.9090391397476196, "learning_rate": 1.8575621388608643e-05, "loss": 0.4333, "step": 5320 }, { "epoch": 1.0484128740380123, "grad_norm": 1.0161601305007935, "learning_rate": 1.857256412608151e-05, "loss": 0.3375, "step": 5330 }, { "epoch": 1.0503798775540312, "grad_norm": 1.1942650079727173, "learning_rate": 1.8569506863554374e-05, "loss": 0.411, "step": 5340 }, { "epoch": 1.0523468810700498, "grad_norm": 1.3584073781967163, "learning_rate": 1.856644960102724e-05, "loss": 0.4614, "step": 5350 }, { "epoch": 1.0543138845860687, "grad_norm": 1.0407729148864746, "learning_rate": 1.856339233850011e-05, "loss": 0.578, "step": 5360 }, { "epoch": 1.0562808881020875, "grad_norm": 1.0869169235229492, "learning_rate": 1.8560335075972975e-05, "loss": 0.3789, "step": 5370 }, { "epoch": 1.0582478916181062, "grad_norm": 0.9861720204353333, "learning_rate": 1.855727781344584e-05, "loss": 0.5299, "step": 5380 }, { "epoch": 1.060214895134125, "grad_norm": 0.9632128477096558, "learning_rate": 1.855422055091871e-05, "loss": 0.4665, "step": 5390 }, { "epoch": 1.062181898650144, "grad_norm": 1.5414925813674927, "learning_rate": 1.8551163288391576e-05, "loss": 0.4346, "step": 5400 }, { "epoch": 1.0641489021661625, "grad_norm": 2.2143287658691406, "learning_rate": 1.854810602586444e-05, "loss": 0.3353, "step": 5410 }, { "epoch": 1.0661159056821814, "grad_norm": 1.193596363067627, "learning_rate": 1.854504876333731e-05, "loss": 0.3978, "step": 5420 }, { "epoch": 1.0680829091982003, "grad_norm": 1.4474225044250488, "learning_rate": 1.8541991500810177e-05, "loss": 0.3549, "step": 5430 }, { "epoch": 1.070049912714219, "grad_norm": 0.8442851305007935, "learning_rate": 1.8538934238283042e-05, "loss": 0.4715, "step": 5440 }, { "epoch": 1.0720169162302378, "grad_norm": 1.0033780336380005, "learning_rate": 1.8535876975755908e-05, "loss": 0.6038, "step": 5450 }, { "epoch": 1.0739839197462566, "grad_norm": 0.9648568630218506, "learning_rate": 1.8532819713228777e-05, "loss": 0.385, "step": 5460 }, { "epoch": 1.0759509232622753, "grad_norm": 1.1449640989303589, "learning_rate": 1.8529762450701643e-05, "loss": 0.4313, "step": 5470 }, { "epoch": 1.0779179267782941, "grad_norm": 1.7114028930664062, "learning_rate": 1.852670518817451e-05, "loss": 0.4804, "step": 5480 }, { "epoch": 1.079884930294313, "grad_norm": 1.1060335636138916, "learning_rate": 1.8523647925647378e-05, "loss": 0.3977, "step": 5490 }, { "epoch": 1.0818519338103316, "grad_norm": 1.1767979860305786, "learning_rate": 1.8520590663120244e-05, "loss": 0.4773, "step": 5500 }, { "epoch": 1.0818519338103316, "eval_loss": 0.25869181752204895, "eval_runtime": 8.8591, "eval_samples_per_second": 5.644, "eval_steps_per_second": 2.822, "step": 5500 }, { "epoch": 1.0838189373263505, "grad_norm": 1.1900354623794556, "learning_rate": 1.851753340059311e-05, "loss": 0.4315, "step": 5510 }, { "epoch": 1.0857859408423693, "grad_norm": 1.4605563879013062, "learning_rate": 1.851447613806598e-05, "loss": 0.3617, "step": 5520 }, { "epoch": 1.087752944358388, "grad_norm": 0.6865226030349731, "learning_rate": 1.8511418875538845e-05, "loss": 0.45, "step": 5530 }, { "epoch": 1.0897199478744068, "grad_norm": 1.535517930984497, "learning_rate": 1.850836161301171e-05, "loss": 0.4856, "step": 5540 }, { "epoch": 1.0916869513904257, "grad_norm": 1.234263300895691, "learning_rate": 1.850530435048458e-05, "loss": 0.4462, "step": 5550 }, { "epoch": 1.0936539549064443, "grad_norm": 0.8305632472038269, "learning_rate": 1.8502247087957442e-05, "loss": 0.3267, "step": 5560 }, { "epoch": 1.0956209584224632, "grad_norm": 0.8066553473472595, "learning_rate": 1.849918982543031e-05, "loss": 0.5352, "step": 5570 }, { "epoch": 1.097587961938482, "grad_norm": 1.5289993286132812, "learning_rate": 1.8496132562903177e-05, "loss": 0.4365, "step": 5580 }, { "epoch": 1.0995549654545007, "grad_norm": 1.027649164199829, "learning_rate": 1.8493075300376046e-05, "loss": 0.4864, "step": 5590 }, { "epoch": 1.1015219689705196, "grad_norm": 0.9802488088607788, "learning_rate": 1.8490018037848912e-05, "loss": 0.4012, "step": 5600 }, { "epoch": 1.1034889724865384, "grad_norm": 0.9458854794502258, "learning_rate": 1.8486960775321778e-05, "loss": 0.3817, "step": 5610 }, { "epoch": 1.105455976002557, "grad_norm": 0.6298452615737915, "learning_rate": 1.8483903512794647e-05, "loss": 0.4332, "step": 5620 }, { "epoch": 1.107422979518576, "grad_norm": 1.1540119647979736, "learning_rate": 1.8480846250267512e-05, "loss": 0.4173, "step": 5630 }, { "epoch": 1.1093899830345946, "grad_norm": 1.4845781326293945, "learning_rate": 1.8477788987740378e-05, "loss": 0.3793, "step": 5640 }, { "epoch": 1.1113569865506134, "grad_norm": 1.618861436843872, "learning_rate": 1.8474731725213247e-05, "loss": 0.4487, "step": 5650 }, { "epoch": 1.1133239900666323, "grad_norm": 1.1488289833068848, "learning_rate": 1.8471674462686113e-05, "loss": 0.3696, "step": 5660 }, { "epoch": 1.115290993582651, "grad_norm": 0.8757450580596924, "learning_rate": 1.846861720015898e-05, "loss": 0.4138, "step": 5670 }, { "epoch": 1.1172579970986698, "grad_norm": 0.7664826512336731, "learning_rate": 1.8465559937631845e-05, "loss": 0.5212, "step": 5680 }, { "epoch": 1.1192250006146887, "grad_norm": 0.9353389143943787, "learning_rate": 1.846250267510471e-05, "loss": 0.4855, "step": 5690 }, { "epoch": 1.1211920041307073, "grad_norm": 1.3535192012786865, "learning_rate": 1.845944541257758e-05, "loss": 0.4871, "step": 5700 }, { "epoch": 1.1231590076467262, "grad_norm": 0.7864957451820374, "learning_rate": 1.8456388150050445e-05, "loss": 0.4506, "step": 5710 }, { "epoch": 1.125126011162745, "grad_norm": 1.1982388496398926, "learning_rate": 1.8453330887523315e-05, "loss": 0.4044, "step": 5720 }, { "epoch": 1.1270930146787637, "grad_norm": 0.6584609746932983, "learning_rate": 1.845027362499618e-05, "loss": 0.4408, "step": 5730 }, { "epoch": 1.1290600181947825, "grad_norm": 1.2081828117370605, "learning_rate": 1.8447216362469046e-05, "loss": 0.4274, "step": 5740 }, { "epoch": 1.1310270217108014, "grad_norm": 1.024104356765747, "learning_rate": 1.8444159099941915e-05, "loss": 0.4062, "step": 5750 }, { "epoch": 1.13299402522682, "grad_norm": 1.289136290550232, "learning_rate": 1.844110183741478e-05, "loss": 0.4041, "step": 5760 }, { "epoch": 1.1349610287428389, "grad_norm": 1.1129745244979858, "learning_rate": 1.8438044574887647e-05, "loss": 0.3489, "step": 5770 }, { "epoch": 1.1369280322588577, "grad_norm": 0.7395710945129395, "learning_rate": 1.8434987312360516e-05, "loss": 0.4426, "step": 5780 }, { "epoch": 1.1388950357748764, "grad_norm": 0.6231014728546143, "learning_rate": 1.843193004983338e-05, "loss": 0.6161, "step": 5790 }, { "epoch": 1.1408620392908952, "grad_norm": 1.7715225219726562, "learning_rate": 1.8428872787306248e-05, "loss": 0.4273, "step": 5800 }, { "epoch": 1.142829042806914, "grad_norm": 0.9418803453445435, "learning_rate": 1.8425815524779113e-05, "loss": 0.4091, "step": 5810 }, { "epoch": 1.1447960463229327, "grad_norm": 1.3185018301010132, "learning_rate": 1.842275826225198e-05, "loss": 0.4431, "step": 5820 }, { "epoch": 1.1467630498389516, "grad_norm": 1.9988341331481934, "learning_rate": 1.841970099972485e-05, "loss": 0.4172, "step": 5830 }, { "epoch": 1.1487300533549702, "grad_norm": 1.2586854696273804, "learning_rate": 1.8416643737197714e-05, "loss": 0.3204, "step": 5840 }, { "epoch": 1.150697056870989, "grad_norm": 1.1469558477401733, "learning_rate": 1.8413586474670583e-05, "loss": 0.4413, "step": 5850 }, { "epoch": 1.152664060387008, "grad_norm": 0.7499154806137085, "learning_rate": 1.841052921214345e-05, "loss": 0.4259, "step": 5860 }, { "epoch": 1.1546310639030266, "grad_norm": 1.0914968252182007, "learning_rate": 1.8407471949616315e-05, "loss": 0.4374, "step": 5870 }, { "epoch": 1.1565980674190455, "grad_norm": 1.812558889389038, "learning_rate": 1.8404414687089184e-05, "loss": 0.4405, "step": 5880 }, { "epoch": 1.1585650709350643, "grad_norm": 0.8804789185523987, "learning_rate": 1.840135742456205e-05, "loss": 0.5325, "step": 5890 }, { "epoch": 1.160532074451083, "grad_norm": 1.3734694719314575, "learning_rate": 1.8398300162034916e-05, "loss": 0.3723, "step": 5900 }, { "epoch": 1.1624990779671018, "grad_norm": 1.260372519493103, "learning_rate": 1.839524289950778e-05, "loss": 0.4549, "step": 5910 }, { "epoch": 1.1644660814831207, "grad_norm": 1.2938246726989746, "learning_rate": 1.8392185636980647e-05, "loss": 0.4422, "step": 5920 }, { "epoch": 1.1664330849991393, "grad_norm": 1.4152106046676636, "learning_rate": 1.8389128374453516e-05, "loss": 0.3273, "step": 5930 }, { "epoch": 1.1684000885151582, "grad_norm": 0.9573132395744324, "learning_rate": 1.8386071111926382e-05, "loss": 0.6119, "step": 5940 }, { "epoch": 1.170367092031177, "grad_norm": 1.479049801826477, "learning_rate": 1.8383013849399248e-05, "loss": 0.5705, "step": 5950 }, { "epoch": 1.1723340955471957, "grad_norm": 1.8711466789245605, "learning_rate": 1.8379956586872117e-05, "loss": 0.422, "step": 5960 }, { "epoch": 1.1743010990632146, "grad_norm": 1.1066919565200806, "learning_rate": 1.8376899324344983e-05, "loss": 0.5835, "step": 5970 }, { "epoch": 1.1762681025792334, "grad_norm": 1.9158978462219238, "learning_rate": 1.8373842061817852e-05, "loss": 0.3937, "step": 5980 }, { "epoch": 1.178235106095252, "grad_norm": 1.785532832145691, "learning_rate": 1.8370784799290718e-05, "loss": 0.3908, "step": 5990 }, { "epoch": 1.180202109611271, "grad_norm": 1.3090122938156128, "learning_rate": 1.8367727536763584e-05, "loss": 0.408, "step": 6000 }, { "epoch": 1.180202109611271, "eval_loss": 0.2541753053665161, "eval_runtime": 8.8707, "eval_samples_per_second": 5.637, "eval_steps_per_second": 2.818, "step": 6000 }, { "epoch": 1.1821691131272898, "grad_norm": 1.0158838033676147, "learning_rate": 1.836467027423645e-05, "loss": 0.5237, "step": 6010 }, { "epoch": 1.1841361166433084, "grad_norm": 1.9016451835632324, "learning_rate": 1.8361613011709315e-05, "loss": 0.419, "step": 6020 }, { "epoch": 1.1861031201593273, "grad_norm": 0.9506327509880066, "learning_rate": 1.8358555749182184e-05, "loss": 0.4917, "step": 6030 }, { "epoch": 1.1880701236753461, "grad_norm": 0.9373012781143188, "learning_rate": 1.835549848665505e-05, "loss": 0.3754, "step": 6040 }, { "epoch": 1.1900371271913648, "grad_norm": 1.2526496648788452, "learning_rate": 1.8352441224127916e-05, "loss": 0.4037, "step": 6050 }, { "epoch": 1.1920041307073836, "grad_norm": 1.1698423624038696, "learning_rate": 1.8349383961600785e-05, "loss": 0.5628, "step": 6060 }, { "epoch": 1.1939711342234025, "grad_norm": 1.2731508016586304, "learning_rate": 1.834632669907365e-05, "loss": 0.4187, "step": 6070 }, { "epoch": 1.1959381377394211, "grad_norm": 1.4930040836334229, "learning_rate": 1.8343269436546517e-05, "loss": 0.4448, "step": 6080 }, { "epoch": 1.19790514125544, "grad_norm": 0.7274429202079773, "learning_rate": 1.8340212174019386e-05, "loss": 0.5058, "step": 6090 }, { "epoch": 1.1998721447714589, "grad_norm": 0.5345088839530945, "learning_rate": 1.833715491149225e-05, "loss": 0.451, "step": 6100 }, { "epoch": 1.2018391482874775, "grad_norm": 0.7829896807670593, "learning_rate": 1.833409764896512e-05, "loss": 0.3958, "step": 6110 }, { "epoch": 1.2038061518034964, "grad_norm": 1.2394695281982422, "learning_rate": 1.8331040386437986e-05, "loss": 0.3365, "step": 6120 }, { "epoch": 1.2057731553195152, "grad_norm": 0.8492904901504517, "learning_rate": 1.8327983123910852e-05, "loss": 0.414, "step": 6130 }, { "epoch": 1.2077401588355339, "grad_norm": 1.0777946710586548, "learning_rate": 1.8324925861383718e-05, "loss": 0.4816, "step": 6140 }, { "epoch": 1.2097071623515527, "grad_norm": 2.155651569366455, "learning_rate": 1.8321868598856584e-05, "loss": 0.5321, "step": 6150 }, { "epoch": 1.2116741658675716, "grad_norm": 2.0705020427703857, "learning_rate": 1.8318811336329453e-05, "loss": 0.4464, "step": 6160 }, { "epoch": 1.2136411693835902, "grad_norm": 1.2468976974487305, "learning_rate": 1.831575407380232e-05, "loss": 0.5157, "step": 6170 }, { "epoch": 1.215608172899609, "grad_norm": 0.6635984778404236, "learning_rate": 1.8312696811275184e-05, "loss": 0.4052, "step": 6180 }, { "epoch": 1.217575176415628, "grad_norm": 1.6718881130218506, "learning_rate": 1.8309639548748054e-05, "loss": 0.4765, "step": 6190 }, { "epoch": 1.2195421799316466, "grad_norm": 1.2337743043899536, "learning_rate": 1.830658228622092e-05, "loss": 0.3758, "step": 6200 }, { "epoch": 1.2215091834476655, "grad_norm": 1.3416141271591187, "learning_rate": 1.8303525023693785e-05, "loss": 0.4676, "step": 6210 }, { "epoch": 1.2234761869636843, "grad_norm": 1.2023998498916626, "learning_rate": 1.8300467761166654e-05, "loss": 0.4034, "step": 6220 }, { "epoch": 1.225443190479703, "grad_norm": 1.1398640871047974, "learning_rate": 1.829741049863952e-05, "loss": 0.5946, "step": 6230 }, { "epoch": 1.2274101939957218, "grad_norm": 0.9996533989906311, "learning_rate": 1.8294353236112386e-05, "loss": 0.3983, "step": 6240 }, { "epoch": 1.2293771975117405, "grad_norm": 1.0722390413284302, "learning_rate": 1.8291295973585252e-05, "loss": 0.4448, "step": 6250 }, { "epoch": 1.2313442010277593, "grad_norm": 1.5299232006072998, "learning_rate": 1.828823871105812e-05, "loss": 0.5033, "step": 6260 }, { "epoch": 1.2333112045437782, "grad_norm": 1.0260642766952515, "learning_rate": 1.8285181448530987e-05, "loss": 0.3642, "step": 6270 }, { "epoch": 1.2352782080597968, "grad_norm": 1.560285210609436, "learning_rate": 1.8282124186003852e-05, "loss": 0.503, "step": 6280 }, { "epoch": 1.2372452115758157, "grad_norm": 1.4115056991577148, "learning_rate": 1.827906692347672e-05, "loss": 0.3812, "step": 6290 }, { "epoch": 1.2392122150918345, "grad_norm": 0.7947004437446594, "learning_rate": 1.8276009660949587e-05, "loss": 0.4024, "step": 6300 }, { "epoch": 1.2411792186078532, "grad_norm": 0.9930680394172668, "learning_rate": 1.8272952398422453e-05, "loss": 0.441, "step": 6310 }, { "epoch": 1.243146222123872, "grad_norm": 2.014035940170288, "learning_rate": 1.8269895135895322e-05, "loss": 0.4854, "step": 6320 }, { "epoch": 1.245113225639891, "grad_norm": 1.7918622493743896, "learning_rate": 1.8266837873368188e-05, "loss": 0.4661, "step": 6330 }, { "epoch": 1.2470802291559095, "grad_norm": 0.7752891182899475, "learning_rate": 1.8263780610841054e-05, "loss": 0.5, "step": 6340 }, { "epoch": 1.2490472326719284, "grad_norm": 0.6974585056304932, "learning_rate": 1.826072334831392e-05, "loss": 0.4312, "step": 6350 }, { "epoch": 1.251014236187947, "grad_norm": 1.6854921579360962, "learning_rate": 1.825766608578679e-05, "loss": 0.3643, "step": 6360 }, { "epoch": 1.252981239703966, "grad_norm": 0.9491457343101501, "learning_rate": 1.8254608823259655e-05, "loss": 0.4874, "step": 6370 }, { "epoch": 1.2549482432199848, "grad_norm": 1.1778289079666138, "learning_rate": 1.825155156073252e-05, "loss": 0.4696, "step": 6380 }, { "epoch": 1.2569152467360034, "grad_norm": 3.334805488586426, "learning_rate": 1.824849429820539e-05, "loss": 0.475, "step": 6390 }, { "epoch": 1.2588822502520223, "grad_norm": 0.7782668471336365, "learning_rate": 1.8245437035678255e-05, "loss": 0.3803, "step": 6400 }, { "epoch": 1.2608492537680411, "grad_norm": 1.3739856481552124, "learning_rate": 1.824237977315112e-05, "loss": 0.4898, "step": 6410 }, { "epoch": 1.2628162572840598, "grad_norm": 1.7571340799331665, "learning_rate": 1.823932251062399e-05, "loss": 0.5217, "step": 6420 }, { "epoch": 1.2647832608000786, "grad_norm": 0.6563398241996765, "learning_rate": 1.8236265248096856e-05, "loss": 0.3598, "step": 6430 }, { "epoch": 1.2667502643160975, "grad_norm": 0.8935515284538269, "learning_rate": 1.8233207985569722e-05, "loss": 0.4735, "step": 6440 }, { "epoch": 1.2687172678321161, "grad_norm": 0.8654441833496094, "learning_rate": 1.823015072304259e-05, "loss": 0.4369, "step": 6450 }, { "epoch": 1.270684271348135, "grad_norm": 1.5720266103744507, "learning_rate": 1.8227093460515457e-05, "loss": 0.3759, "step": 6460 }, { "epoch": 1.2726512748641539, "grad_norm": 1.4234576225280762, "learning_rate": 1.8224036197988323e-05, "loss": 0.5163, "step": 6470 }, { "epoch": 1.2746182783801725, "grad_norm": 1.3795866966247559, "learning_rate": 1.822097893546119e-05, "loss": 0.4891, "step": 6480 }, { "epoch": 1.2765852818961914, "grad_norm": 2.2530128955841064, "learning_rate": 1.8217921672934054e-05, "loss": 0.5406, "step": 6490 }, { "epoch": 1.2785522854122102, "grad_norm": 1.2608258724212646, "learning_rate": 1.8214864410406923e-05, "loss": 0.4731, "step": 6500 }, { "epoch": 1.2785522854122102, "eval_loss": 0.2500273585319519, "eval_runtime": 8.9303, "eval_samples_per_second": 5.599, "eval_steps_per_second": 2.799, "step": 6500 }, { "epoch": 1.2805192889282289, "grad_norm": 1.2853199243545532, "learning_rate": 1.821180714787979e-05, "loss": 0.4657, "step": 6510 }, { "epoch": 1.2824862924442477, "grad_norm": 0.6123810410499573, "learning_rate": 1.8208749885352658e-05, "loss": 0.6216, "step": 6520 }, { "epoch": 1.2844532959602666, "grad_norm": 0.8398245573043823, "learning_rate": 1.8205692622825524e-05, "loss": 0.4216, "step": 6530 }, { "epoch": 1.2864202994762852, "grad_norm": 1.0161961317062378, "learning_rate": 1.820263536029839e-05, "loss": 0.4325, "step": 6540 }, { "epoch": 1.288387302992304, "grad_norm": 1.7545063495635986, "learning_rate": 1.819957809777126e-05, "loss": 0.5181, "step": 6550 }, { "epoch": 1.290354306508323, "grad_norm": 0.9450379014015198, "learning_rate": 1.8196520835244125e-05, "loss": 0.4417, "step": 6560 }, { "epoch": 1.2923213100243416, "grad_norm": 1.3141273260116577, "learning_rate": 1.819346357271699e-05, "loss": 0.5343, "step": 6570 }, { "epoch": 1.2942883135403604, "grad_norm": 1.6922868490219116, "learning_rate": 1.8190406310189856e-05, "loss": 0.4544, "step": 6580 }, { "epoch": 1.2962553170563793, "grad_norm": 1.2070684432983398, "learning_rate": 1.8187349047662722e-05, "loss": 0.3989, "step": 6590 }, { "epoch": 1.298222320572398, "grad_norm": 1.0390480756759644, "learning_rate": 1.818429178513559e-05, "loss": 0.3849, "step": 6600 }, { "epoch": 1.3001893240884168, "grad_norm": 1.3022695779800415, "learning_rate": 1.8181234522608457e-05, "loss": 0.3879, "step": 6610 }, { "epoch": 1.3021563276044357, "grad_norm": 2.036789655685425, "learning_rate": 1.8178177260081323e-05, "loss": 0.4393, "step": 6620 }, { "epoch": 1.3041233311204543, "grad_norm": 1.0028927326202393, "learning_rate": 1.8175119997554192e-05, "loss": 0.5197, "step": 6630 }, { "epoch": 1.3060903346364732, "grad_norm": 0.9908479452133179, "learning_rate": 1.8172062735027058e-05, "loss": 0.3509, "step": 6640 }, { "epoch": 1.308057338152492, "grad_norm": 1.617487907409668, "learning_rate": 1.8169005472499927e-05, "loss": 0.5052, "step": 6650 }, { "epoch": 1.3100243416685107, "grad_norm": 1.229190468788147, "learning_rate": 1.8165948209972793e-05, "loss": 0.4564, "step": 6660 }, { "epoch": 1.3119913451845295, "grad_norm": 1.5831772089004517, "learning_rate": 1.816289094744566e-05, "loss": 0.4283, "step": 6670 }, { "epoch": 1.3139583487005484, "grad_norm": 1.4236372709274292, "learning_rate": 1.8159833684918528e-05, "loss": 0.4583, "step": 6680 }, { "epoch": 1.315925352216567, "grad_norm": 0.9847359657287598, "learning_rate": 1.815677642239139e-05, "loss": 0.5584, "step": 6690 }, { "epoch": 1.317892355732586, "grad_norm": 1.2194691896438599, "learning_rate": 1.815371915986426e-05, "loss": 0.4529, "step": 6700 }, { "epoch": 1.3198593592486048, "grad_norm": 1.4143697023391724, "learning_rate": 1.8150661897337125e-05, "loss": 0.4783, "step": 6710 }, { "epoch": 1.3218263627646234, "grad_norm": 1.3399523496627808, "learning_rate": 1.814760463480999e-05, "loss": 0.3955, "step": 6720 }, { "epoch": 1.3237933662806423, "grad_norm": 0.7370299100875854, "learning_rate": 1.814454737228286e-05, "loss": 0.4647, "step": 6730 }, { "epoch": 1.3257603697966611, "grad_norm": 1.4617048501968384, "learning_rate": 1.8141490109755726e-05, "loss": 0.4911, "step": 6740 }, { "epoch": 1.3277273733126798, "grad_norm": 0.836281418800354, "learning_rate": 1.813843284722859e-05, "loss": 0.442, "step": 6750 }, { "epoch": 1.3296943768286986, "grad_norm": 0.901801586151123, "learning_rate": 1.813537558470146e-05, "loss": 0.3387, "step": 6760 }, { "epoch": 1.3316613803447175, "grad_norm": 1.461903691291809, "learning_rate": 1.8132318322174326e-05, "loss": 0.4815, "step": 6770 }, { "epoch": 1.3336283838607361, "grad_norm": 1.2641518115997314, "learning_rate": 1.8129261059647196e-05, "loss": 0.4465, "step": 6780 }, { "epoch": 1.335595387376755, "grad_norm": 2.127981662750244, "learning_rate": 1.812620379712006e-05, "loss": 0.4972, "step": 6790 }, { "epoch": 1.3375623908927738, "grad_norm": 1.0643235445022583, "learning_rate": 1.8123146534592927e-05, "loss": 0.4823, "step": 6800 }, { "epoch": 1.3395293944087925, "grad_norm": 1.392342448234558, "learning_rate": 1.8120089272065793e-05, "loss": 0.4488, "step": 6810 }, { "epoch": 1.3414963979248113, "grad_norm": 1.372171401977539, "learning_rate": 1.811703200953866e-05, "loss": 0.5518, "step": 6820 }, { "epoch": 1.3434634014408302, "grad_norm": 1.0841792821884155, "learning_rate": 1.8113974747011528e-05, "loss": 0.4059, "step": 6830 }, { "epoch": 1.3454304049568488, "grad_norm": 1.0114047527313232, "learning_rate": 1.8110917484484394e-05, "loss": 0.4705, "step": 6840 }, { "epoch": 1.3473974084728677, "grad_norm": 1.600369930267334, "learning_rate": 1.810786022195726e-05, "loss": 0.5064, "step": 6850 }, { "epoch": 1.3493644119888863, "grad_norm": 1.3853987455368042, "learning_rate": 1.810480295943013e-05, "loss": 0.5352, "step": 6860 }, { "epoch": 1.3513314155049052, "grad_norm": 1.627324104309082, "learning_rate": 1.8101745696902994e-05, "loss": 0.6512, "step": 6870 }, { "epoch": 1.353298419020924, "grad_norm": 1.6808173656463623, "learning_rate": 1.809868843437586e-05, "loss": 0.4856, "step": 6880 }, { "epoch": 1.3552654225369427, "grad_norm": 1.0722404718399048, "learning_rate": 1.809563117184873e-05, "loss": 0.4883, "step": 6890 }, { "epoch": 1.3572324260529616, "grad_norm": 1.3000462055206299, "learning_rate": 1.8092573909321595e-05, "loss": 0.459, "step": 6900 }, { "epoch": 1.3591994295689804, "grad_norm": 1.0217570066452026, "learning_rate": 1.8089516646794464e-05, "loss": 0.4908, "step": 6910 }, { "epoch": 1.361166433084999, "grad_norm": 2.192474603652954, "learning_rate": 1.8086459384267327e-05, "loss": 0.5604, "step": 6920 }, { "epoch": 1.363133436601018, "grad_norm": 1.4930495023727417, "learning_rate": 1.8083402121740196e-05, "loss": 0.412, "step": 6930 }, { "epoch": 1.3651004401170366, "grad_norm": 0.9470030069351196, "learning_rate": 1.808034485921306e-05, "loss": 0.5397, "step": 6940 }, { "epoch": 1.3670674436330554, "grad_norm": 1.439299464225769, "learning_rate": 1.8077287596685927e-05, "loss": 0.6071, "step": 6950 }, { "epoch": 1.3690344471490743, "grad_norm": 1.0739001035690308, "learning_rate": 1.8074230334158796e-05, "loss": 0.4911, "step": 6960 }, { "epoch": 1.371001450665093, "grad_norm": 2.3669240474700928, "learning_rate": 1.8071173071631662e-05, "loss": 0.5202, "step": 6970 }, { "epoch": 1.3729684541811118, "grad_norm": 0.8354695439338684, "learning_rate": 1.8068115809104528e-05, "loss": 0.453, "step": 6980 }, { "epoch": 1.3749354576971307, "grad_norm": 0.8748831748962402, "learning_rate": 1.8065058546577397e-05, "loss": 0.4235, "step": 6990 }, { "epoch": 1.3769024612131493, "grad_norm": 0.8159099221229553, "learning_rate": 1.8062001284050263e-05, "loss": 0.4364, "step": 7000 }, { "epoch": 1.3769024612131493, "eval_loss": 0.2508287727832794, "eval_runtime": 8.8467, "eval_samples_per_second": 5.652, "eval_steps_per_second": 2.826, "step": 7000 }, { "epoch": 1.3788694647291682, "grad_norm": 0.868539035320282, "learning_rate": 1.805894402152313e-05, "loss": 0.621, "step": 7010 }, { "epoch": 1.380836468245187, "grad_norm": 1.5465404987335205, "learning_rate": 1.8055886758995998e-05, "loss": 0.402, "step": 7020 }, { "epoch": 1.3828034717612057, "grad_norm": 2.190101146697998, "learning_rate": 1.8052829496468864e-05, "loss": 0.5353, "step": 7030 }, { "epoch": 1.3847704752772245, "grad_norm": 0.7492843866348267, "learning_rate": 1.804977223394173e-05, "loss": 0.517, "step": 7040 }, { "epoch": 1.3867374787932434, "grad_norm": 0.9948645234107971, "learning_rate": 1.8046714971414595e-05, "loss": 0.5211, "step": 7050 }, { "epoch": 1.388704482309262, "grad_norm": 1.594095230102539, "learning_rate": 1.8043657708887464e-05, "loss": 0.4808, "step": 7060 }, { "epoch": 1.3906714858252809, "grad_norm": 1.165153980255127, "learning_rate": 1.804060044636033e-05, "loss": 0.3884, "step": 7070 }, { "epoch": 1.3926384893412997, "grad_norm": 2.061393976211548, "learning_rate": 1.8037543183833196e-05, "loss": 0.4339, "step": 7080 }, { "epoch": 1.3946054928573184, "grad_norm": 1.4817818403244019, "learning_rate": 1.8034485921306065e-05, "loss": 0.4482, "step": 7090 }, { "epoch": 1.3965724963733372, "grad_norm": 0.8791131377220154, "learning_rate": 1.803142865877893e-05, "loss": 0.4498, "step": 7100 }, { "epoch": 1.398539499889356, "grad_norm": 0.8255666494369507, "learning_rate": 1.8028371396251797e-05, "loss": 0.4166, "step": 7110 }, { "epoch": 1.4005065034053747, "grad_norm": 0.8668946027755737, "learning_rate": 1.8025314133724666e-05, "loss": 0.5698, "step": 7120 }, { "epoch": 1.4024735069213936, "grad_norm": 1.2157158851623535, "learning_rate": 1.802225687119753e-05, "loss": 0.4634, "step": 7130 }, { "epoch": 1.4044405104374125, "grad_norm": 1.54035484790802, "learning_rate": 1.8019199608670397e-05, "loss": 0.3969, "step": 7140 }, { "epoch": 1.406407513953431, "grad_norm": 0.8776307702064514, "learning_rate": 1.8016142346143263e-05, "loss": 0.5061, "step": 7150 }, { "epoch": 1.40837451746945, "grad_norm": 1.1032741069793701, "learning_rate": 1.8013085083616132e-05, "loss": 0.3971, "step": 7160 }, { "epoch": 1.4103415209854688, "grad_norm": 1.0241215229034424, "learning_rate": 1.8010027821088998e-05, "loss": 0.4959, "step": 7170 }, { "epoch": 1.4123085245014875, "grad_norm": 1.6800438165664673, "learning_rate": 1.8006970558561864e-05, "loss": 0.4738, "step": 7180 }, { "epoch": 1.4142755280175063, "grad_norm": 1.5936535596847534, "learning_rate": 1.8003913296034733e-05, "loss": 0.4572, "step": 7190 }, { "epoch": 1.4162425315335252, "grad_norm": 1.2243019342422485, "learning_rate": 1.80008560335076e-05, "loss": 0.3557, "step": 7200 }, { "epoch": 1.4182095350495438, "grad_norm": 0.8471090197563171, "learning_rate": 1.7997798770980465e-05, "loss": 0.3765, "step": 7210 }, { "epoch": 1.4201765385655627, "grad_norm": 1.2940151691436768, "learning_rate": 1.7994741508453334e-05, "loss": 0.5282, "step": 7220 }, { "epoch": 1.4221435420815816, "grad_norm": 0.7135307192802429, "learning_rate": 1.79916842459262e-05, "loss": 0.4611, "step": 7230 }, { "epoch": 1.4241105455976002, "grad_norm": 0.9937286972999573, "learning_rate": 1.7988626983399065e-05, "loss": 0.502, "step": 7240 }, { "epoch": 1.426077549113619, "grad_norm": 1.5111737251281738, "learning_rate": 1.7985569720871935e-05, "loss": 0.4421, "step": 7250 }, { "epoch": 1.428044552629638, "grad_norm": 1.29506516456604, "learning_rate": 1.7982512458344797e-05, "loss": 0.5282, "step": 7260 }, { "epoch": 1.4300115561456566, "grad_norm": 1.1045817136764526, "learning_rate": 1.7979455195817666e-05, "loss": 0.4594, "step": 7270 }, { "epoch": 1.4319785596616754, "grad_norm": 1.3081828355789185, "learning_rate": 1.7976397933290532e-05, "loss": 0.451, "step": 7280 }, { "epoch": 1.4339455631776943, "grad_norm": 1.3445611000061035, "learning_rate": 1.79733406707634e-05, "loss": 0.4904, "step": 7290 }, { "epoch": 1.435912566693713, "grad_norm": 1.8529108762741089, "learning_rate": 1.7970283408236267e-05, "loss": 0.537, "step": 7300 }, { "epoch": 1.4378795702097318, "grad_norm": 1.6065847873687744, "learning_rate": 1.7967226145709133e-05, "loss": 0.3289, "step": 7310 }, { "epoch": 1.4398465737257506, "grad_norm": 1.597464680671692, "learning_rate": 1.7964168883182002e-05, "loss": 0.4434, "step": 7320 }, { "epoch": 1.4418135772417693, "grad_norm": 1.5186161994934082, "learning_rate": 1.7961111620654868e-05, "loss": 0.5233, "step": 7330 }, { "epoch": 1.4437805807577881, "grad_norm": 2.497467279434204, "learning_rate": 1.7958054358127733e-05, "loss": 0.4332, "step": 7340 }, { "epoch": 1.445747584273807, "grad_norm": 1.7080587148666382, "learning_rate": 1.7954997095600602e-05, "loss": 0.4665, "step": 7350 }, { "epoch": 1.4477145877898256, "grad_norm": 1.3408483266830444, "learning_rate": 1.7951939833073468e-05, "loss": 0.4356, "step": 7360 }, { "epoch": 1.4496815913058445, "grad_norm": 0.8652418255805969, "learning_rate": 1.7948882570546334e-05, "loss": 0.3355, "step": 7370 }, { "epoch": 1.4516485948218634, "grad_norm": 1.6720373630523682, "learning_rate": 1.79458253080192e-05, "loss": 0.4702, "step": 7380 }, { "epoch": 1.453615598337882, "grad_norm": 1.1792707443237305, "learning_rate": 1.7942768045492066e-05, "loss": 0.4056, "step": 7390 }, { "epoch": 1.4555826018539009, "grad_norm": 0.5109983682632446, "learning_rate": 1.7939710782964935e-05, "loss": 0.4889, "step": 7400 }, { "epoch": 1.4575496053699197, "grad_norm": 0.864371120929718, "learning_rate": 1.79366535204378e-05, "loss": 0.5084, "step": 7410 }, { "epoch": 1.4595166088859384, "grad_norm": 1.4081306457519531, "learning_rate": 1.793359625791067e-05, "loss": 0.4884, "step": 7420 }, { "epoch": 1.4614836124019572, "grad_norm": 0.5403172969818115, "learning_rate": 1.7930538995383535e-05, "loss": 0.4762, "step": 7430 }, { "epoch": 1.4634506159179759, "grad_norm": 2.7623186111450195, "learning_rate": 1.79274817328564e-05, "loss": 0.4962, "step": 7440 }, { "epoch": 1.4654176194339947, "grad_norm": 1.38148832321167, "learning_rate": 1.792442447032927e-05, "loss": 0.4259, "step": 7450 }, { "epoch": 1.4673846229500136, "grad_norm": 1.2702479362487793, "learning_rate": 1.7921367207802136e-05, "loss": 0.4708, "step": 7460 }, { "epoch": 1.4693516264660322, "grad_norm": 1.7170747518539429, "learning_rate": 1.7918309945275002e-05, "loss": 0.5886, "step": 7470 }, { "epoch": 1.471318629982051, "grad_norm": 2.262479543685913, "learning_rate": 1.791525268274787e-05, "loss": 0.47, "step": 7480 }, { "epoch": 1.47328563349807, "grad_norm": 0.977174699306488, "learning_rate": 1.7912195420220734e-05, "loss": 0.3943, "step": 7490 }, { "epoch": 1.4752526370140886, "grad_norm": 1.1417875289916992, "learning_rate": 1.7909138157693603e-05, "loss": 0.3999, "step": 7500 }, { "epoch": 1.4752526370140886, "eval_loss": 0.2466391772031784, "eval_runtime": 8.9007, "eval_samples_per_second": 5.618, "eval_steps_per_second": 2.809, "step": 7500 }, { "epoch": 1.4772196405301075, "grad_norm": 0.9850865602493286, "learning_rate": 1.790608089516647e-05, "loss": 0.4194, "step": 7510 }, { "epoch": 1.479186644046126, "grad_norm": 0.9605159759521484, "learning_rate": 1.7903023632639334e-05, "loss": 0.5008, "step": 7520 }, { "epoch": 1.481153647562145, "grad_norm": 0.9875272512435913, "learning_rate": 1.7899966370112203e-05, "loss": 0.384, "step": 7530 }, { "epoch": 1.4831206510781638, "grad_norm": 1.9756124019622803, "learning_rate": 1.789690910758507e-05, "loss": 0.4649, "step": 7540 }, { "epoch": 1.4850876545941825, "grad_norm": 1.7961291074752808, "learning_rate": 1.789385184505794e-05, "loss": 0.5256, "step": 7550 }, { "epoch": 1.4870546581102013, "grad_norm": 1.253611445426941, "learning_rate": 1.7890794582530804e-05, "loss": 0.4242, "step": 7560 }, { "epoch": 1.4890216616262202, "grad_norm": 1.7903774976730347, "learning_rate": 1.788773732000367e-05, "loss": 0.4745, "step": 7570 }, { "epoch": 1.4909886651422388, "grad_norm": 2.0283164978027344, "learning_rate": 1.788468005747654e-05, "loss": 0.4065, "step": 7580 }, { "epoch": 1.4929556686582577, "grad_norm": 1.0774911642074585, "learning_rate": 1.7881622794949405e-05, "loss": 0.5152, "step": 7590 }, { "epoch": 1.4949226721742765, "grad_norm": 2.157869815826416, "learning_rate": 1.787856553242227e-05, "loss": 0.4008, "step": 7600 }, { "epoch": 1.4968896756902952, "grad_norm": 1.0705121755599976, "learning_rate": 1.7875508269895136e-05, "loss": 0.4837, "step": 7610 }, { "epoch": 1.498856679206314, "grad_norm": 1.2112722396850586, "learning_rate": 1.7872451007368002e-05, "loss": 0.4164, "step": 7620 }, { "epoch": 1.500823682722333, "grad_norm": 0.9930263161659241, "learning_rate": 1.786939374484087e-05, "loss": 0.2991, "step": 7630 }, { "epoch": 1.5027906862383515, "grad_norm": 1.176189661026001, "learning_rate": 1.7866336482313737e-05, "loss": 0.3959, "step": 7640 }, { "epoch": 1.5047576897543704, "grad_norm": 2.2921009063720703, "learning_rate": 1.7863279219786603e-05, "loss": 0.4512, "step": 7650 }, { "epoch": 1.5067246932703893, "grad_norm": 0.8029620051383972, "learning_rate": 1.7860221957259472e-05, "loss": 0.4313, "step": 7660 }, { "epoch": 1.508691696786408, "grad_norm": 0.9565465450286865, "learning_rate": 1.7857164694732338e-05, "loss": 0.4258, "step": 7670 }, { "epoch": 1.5106587003024268, "grad_norm": 1.5121804475784302, "learning_rate": 1.7854107432205207e-05, "loss": 0.4782, "step": 7680 }, { "epoch": 1.5126257038184456, "grad_norm": 0.9973114132881165, "learning_rate": 1.7851050169678073e-05, "loss": 0.3971, "step": 7690 }, { "epoch": 1.5145927073344643, "grad_norm": 0.9934736490249634, "learning_rate": 1.784799290715094e-05, "loss": 0.4563, "step": 7700 }, { "epoch": 1.5165597108504831, "grad_norm": 1.394128441810608, "learning_rate": 1.7844935644623804e-05, "loss": 0.4807, "step": 7710 }, { "epoch": 1.518526714366502, "grad_norm": 1.5065066814422607, "learning_rate": 1.784187838209667e-05, "loss": 0.4271, "step": 7720 }, { "epoch": 1.5204937178825206, "grad_norm": 1.175439476966858, "learning_rate": 1.783882111956954e-05, "loss": 0.5912, "step": 7730 }, { "epoch": 1.5224607213985395, "grad_norm": 1.7413098812103271, "learning_rate": 1.7835763857042405e-05, "loss": 0.4099, "step": 7740 }, { "epoch": 1.5244277249145584, "grad_norm": 2.1757400035858154, "learning_rate": 1.783270659451527e-05, "loss": 0.4179, "step": 7750 }, { "epoch": 1.526394728430577, "grad_norm": 1.3303672075271606, "learning_rate": 1.782964933198814e-05, "loss": 0.2962, "step": 7760 }, { "epoch": 1.5283617319465959, "grad_norm": 1.1518796682357788, "learning_rate": 1.7826592069461006e-05, "loss": 0.3631, "step": 7770 }, { "epoch": 1.5303287354626147, "grad_norm": 1.4014183282852173, "learning_rate": 1.782353480693387e-05, "loss": 0.407, "step": 7780 }, { "epoch": 1.5322957389786334, "grad_norm": 1.4308451414108276, "learning_rate": 1.782047754440674e-05, "loss": 0.5349, "step": 7790 }, { "epoch": 1.5342627424946522, "grad_norm": 2.0231456756591797, "learning_rate": 1.7817420281879607e-05, "loss": 0.4317, "step": 7800 }, { "epoch": 1.536229746010671, "grad_norm": 0.7665293216705322, "learning_rate": 1.7814363019352476e-05, "loss": 0.602, "step": 7810 }, { "epoch": 1.5381967495266897, "grad_norm": 1.1104438304901123, "learning_rate": 1.781130575682534e-05, "loss": 0.52, "step": 7820 }, { "epoch": 1.5401637530427086, "grad_norm": 1.0240403413772583, "learning_rate": 1.7808248494298207e-05, "loss": 0.476, "step": 7830 }, { "epoch": 1.5421307565587274, "grad_norm": 0.6431864500045776, "learning_rate": 1.7805191231771073e-05, "loss": 0.4132, "step": 7840 }, { "epoch": 1.544097760074746, "grad_norm": 1.3472301959991455, "learning_rate": 1.780213396924394e-05, "loss": 0.5485, "step": 7850 }, { "epoch": 1.546064763590765, "grad_norm": 1.5425872802734375, "learning_rate": 1.7799076706716808e-05, "loss": 0.4777, "step": 7860 }, { "epoch": 1.5480317671067838, "grad_norm": 0.8339068293571472, "learning_rate": 1.7796019444189674e-05, "loss": 0.6139, "step": 7870 }, { "epoch": 1.5499987706228024, "grad_norm": 1.1242626905441284, "learning_rate": 1.779296218166254e-05, "loss": 0.4117, "step": 7880 }, { "epoch": 1.5519657741388213, "grad_norm": 0.9733056426048279, "learning_rate": 1.778990491913541e-05, "loss": 0.4415, "step": 7890 }, { "epoch": 1.5539327776548402, "grad_norm": 1.3571527004241943, "learning_rate": 1.7786847656608274e-05, "loss": 0.4764, "step": 7900 }, { "epoch": 1.5558997811708588, "grad_norm": 2.1205756664276123, "learning_rate": 1.778379039408114e-05, "loss": 0.4691, "step": 7910 }, { "epoch": 1.5578667846868777, "grad_norm": 1.7528705596923828, "learning_rate": 1.778073313155401e-05, "loss": 0.4773, "step": 7920 }, { "epoch": 1.5598337882028965, "grad_norm": 1.2486058473587036, "learning_rate": 1.7777675869026875e-05, "loss": 0.5109, "step": 7930 }, { "epoch": 1.5618007917189152, "grad_norm": 1.1096512079238892, "learning_rate": 1.777461860649974e-05, "loss": 0.2977, "step": 7940 }, { "epoch": 1.563767795234934, "grad_norm": 1.3998329639434814, "learning_rate": 1.7771561343972607e-05, "loss": 0.5104, "step": 7950 }, { "epoch": 1.565734798750953, "grad_norm": 1.1133865118026733, "learning_rate": 1.7768504081445476e-05, "loss": 0.4394, "step": 7960 }, { "epoch": 1.5677018022669715, "grad_norm": 1.2691013813018799, "learning_rate": 1.776544681891834e-05, "loss": 0.3867, "step": 7970 }, { "epoch": 1.5696688057829902, "grad_norm": 0.6371937990188599, "learning_rate": 1.7762389556391207e-05, "loss": 0.4909, "step": 7980 }, { "epoch": 1.5716358092990093, "grad_norm": 1.969839096069336, "learning_rate": 1.7759332293864077e-05, "loss": 0.4578, "step": 7990 }, { "epoch": 1.573602812815028, "grad_norm": 1.014076590538025, "learning_rate": 1.7756275031336942e-05, "loss": 0.4606, "step": 8000 }, { "epoch": 1.573602812815028, "eval_loss": 0.24194316565990448, "eval_runtime": 8.8643, "eval_samples_per_second": 5.641, "eval_steps_per_second": 2.82, "step": 8000 }, { "epoch": 1.5755698163310465, "grad_norm": 1.0506935119628906, "learning_rate": 1.7753217768809808e-05, "loss": 0.4692, "step": 8010 }, { "epoch": 1.5775368198470656, "grad_norm": 0.9741007089614868, "learning_rate": 1.7750160506282677e-05, "loss": 0.4414, "step": 8020 }, { "epoch": 1.5795038233630843, "grad_norm": 1.311547875404358, "learning_rate": 1.7747103243755543e-05, "loss": 0.5126, "step": 8030 }, { "epoch": 1.581470826879103, "grad_norm": 1.2044017314910889, "learning_rate": 1.774404598122841e-05, "loss": 0.4694, "step": 8040 }, { "epoch": 1.583437830395122, "grad_norm": 1.728625774383545, "learning_rate": 1.7740988718701275e-05, "loss": 0.3995, "step": 8050 }, { "epoch": 1.5854048339111406, "grad_norm": 0.7783991694450378, "learning_rate": 1.7737931456174144e-05, "loss": 0.4051, "step": 8060 }, { "epoch": 1.5873718374271593, "grad_norm": 1.6007112264633179, "learning_rate": 1.773487419364701e-05, "loss": 0.3564, "step": 8070 }, { "epoch": 1.5893388409431783, "grad_norm": 1.422080159187317, "learning_rate": 1.7731816931119875e-05, "loss": 0.4121, "step": 8080 }, { "epoch": 1.591305844459197, "grad_norm": 0.9315184950828552, "learning_rate": 1.7728759668592745e-05, "loss": 0.5115, "step": 8090 }, { "epoch": 1.5932728479752156, "grad_norm": 1.5067553520202637, "learning_rate": 1.772570240606561e-05, "loss": 0.4785, "step": 8100 }, { "epoch": 1.5952398514912347, "grad_norm": 0.9890210628509521, "learning_rate": 1.7722645143538476e-05, "loss": 0.3946, "step": 8110 }, { "epoch": 1.5972068550072533, "grad_norm": 1.6322187185287476, "learning_rate": 1.7719587881011345e-05, "loss": 0.3755, "step": 8120 }, { "epoch": 1.599173858523272, "grad_norm": 1.1822806596755981, "learning_rate": 1.771653061848421e-05, "loss": 0.3832, "step": 8130 }, { "epoch": 1.6011408620392908, "grad_norm": 0.8035289645195007, "learning_rate": 1.7713473355957077e-05, "loss": 0.3896, "step": 8140 }, { "epoch": 1.6031078655553097, "grad_norm": 1.2947710752487183, "learning_rate": 1.7710416093429946e-05, "loss": 0.4154, "step": 8150 }, { "epoch": 1.6050748690713283, "grad_norm": 1.4459134340286255, "learning_rate": 1.7707358830902812e-05, "loss": 0.4079, "step": 8160 }, { "epoch": 1.6070418725873472, "grad_norm": 2.4247255325317383, "learning_rate": 1.7704301568375678e-05, "loss": 0.4353, "step": 8170 }, { "epoch": 1.609008876103366, "grad_norm": 1.7483638525009155, "learning_rate": 1.7701244305848543e-05, "loss": 0.4049, "step": 8180 }, { "epoch": 1.6109758796193847, "grad_norm": 1.2426694631576538, "learning_rate": 1.7698187043321413e-05, "loss": 0.3751, "step": 8190 }, { "epoch": 1.6129428831354036, "grad_norm": 1.3838658332824707, "learning_rate": 1.7695129780794278e-05, "loss": 0.3779, "step": 8200 }, { "epoch": 1.6149098866514224, "grad_norm": 1.3950488567352295, "learning_rate": 1.7692072518267144e-05, "loss": 0.4883, "step": 8210 }, { "epoch": 1.616876890167441, "grad_norm": 1.1213529109954834, "learning_rate": 1.7689015255740013e-05, "loss": 0.4647, "step": 8220 }, { "epoch": 1.61884389368346, "grad_norm": 1.6724748611450195, "learning_rate": 1.768595799321288e-05, "loss": 0.6121, "step": 8230 }, { "epoch": 1.6208108971994788, "grad_norm": 1.7976254224777222, "learning_rate": 1.7682900730685745e-05, "loss": 0.51, "step": 8240 }, { "epoch": 1.6227779007154974, "grad_norm": 1.0217626094818115, "learning_rate": 1.7679843468158614e-05, "loss": 0.3698, "step": 8250 }, { "epoch": 1.6247449042315163, "grad_norm": 2.1370279788970947, "learning_rate": 1.767678620563148e-05, "loss": 0.4984, "step": 8260 }, { "epoch": 1.6267119077475352, "grad_norm": 1.4108999967575073, "learning_rate": 1.7673728943104345e-05, "loss": 0.507, "step": 8270 }, { "epoch": 1.6286789112635538, "grad_norm": 0.9152798652648926, "learning_rate": 1.767067168057721e-05, "loss": 0.4259, "step": 8280 }, { "epoch": 1.6306459147795727, "grad_norm": 1.3696630001068115, "learning_rate": 1.7667614418050077e-05, "loss": 0.482, "step": 8290 }, { "epoch": 1.6326129182955915, "grad_norm": 1.8380405902862549, "learning_rate": 1.7664557155522946e-05, "loss": 0.3949, "step": 8300 }, { "epoch": 1.6345799218116102, "grad_norm": 1.0962872505187988, "learning_rate": 1.7661499892995812e-05, "loss": 0.3266, "step": 8310 }, { "epoch": 1.636546925327629, "grad_norm": 1.6513553857803345, "learning_rate": 1.765844263046868e-05, "loss": 0.3018, "step": 8320 }, { "epoch": 1.6385139288436479, "grad_norm": 2.0764846801757812, "learning_rate": 1.7655385367941547e-05, "loss": 0.3792, "step": 8330 }, { "epoch": 1.6404809323596665, "grad_norm": 1.0733838081359863, "learning_rate": 1.7652328105414413e-05, "loss": 0.5205, "step": 8340 }, { "epoch": 1.6424479358756854, "grad_norm": 1.3473206758499146, "learning_rate": 1.7649270842887282e-05, "loss": 0.4039, "step": 8350 }, { "epoch": 1.6444149393917042, "grad_norm": 1.290519118309021, "learning_rate": 1.7646213580360148e-05, "loss": 0.4938, "step": 8360 }, { "epoch": 1.6463819429077229, "grad_norm": 1.4205468893051147, "learning_rate": 1.7643156317833013e-05, "loss": 0.5465, "step": 8370 }, { "epoch": 1.6483489464237417, "grad_norm": 1.6838555335998535, "learning_rate": 1.7640099055305883e-05, "loss": 0.343, "step": 8380 }, { "epoch": 1.6503159499397606, "grad_norm": 1.086775302886963, "learning_rate": 1.7637041792778745e-05, "loss": 0.6068, "step": 8390 }, { "epoch": 1.6522829534557792, "grad_norm": 2.012615442276001, "learning_rate": 1.7633984530251614e-05, "loss": 0.3777, "step": 8400 }, { "epoch": 1.654249956971798, "grad_norm": 1.1604315042495728, "learning_rate": 1.763092726772448e-05, "loss": 0.4205, "step": 8410 }, { "epoch": 1.656216960487817, "grad_norm": 1.714273452758789, "learning_rate": 1.7627870005197346e-05, "loss": 0.4793, "step": 8420 }, { "epoch": 1.6581839640038356, "grad_norm": 1.1449471712112427, "learning_rate": 1.7624812742670215e-05, "loss": 0.4119, "step": 8430 }, { "epoch": 1.6601509675198545, "grad_norm": 1.364147663116455, "learning_rate": 1.762175548014308e-05, "loss": 0.4533, "step": 8440 }, { "epoch": 1.6621179710358733, "grad_norm": 1.7727055549621582, "learning_rate": 1.761869821761595e-05, "loss": 0.4756, "step": 8450 }, { "epoch": 1.664084974551892, "grad_norm": 1.1421890258789062, "learning_rate": 1.7615640955088816e-05, "loss": 0.4338, "step": 8460 }, { "epoch": 1.6660519780679108, "grad_norm": 1.056373953819275, "learning_rate": 1.761258369256168e-05, "loss": 0.3724, "step": 8470 }, { "epoch": 1.6680189815839297, "grad_norm": 2.1806955337524414, "learning_rate": 1.760952643003455e-05, "loss": 0.5298, "step": 8480 }, { "epoch": 1.6699859850999483, "grad_norm": 1.1213524341583252, "learning_rate": 1.7606469167507416e-05, "loss": 0.4571, "step": 8490 }, { "epoch": 1.6719529886159672, "grad_norm": 1.1251106262207031, "learning_rate": 1.7603411904980282e-05, "loss": 0.3943, "step": 8500 }, { "epoch": 1.6719529886159672, "eval_loss": 0.24524246156215668, "eval_runtime": 8.8832, "eval_samples_per_second": 5.629, "eval_steps_per_second": 2.814, "step": 8500 }, { "epoch": 1.673919992131986, "grad_norm": 2.213970184326172, "learning_rate": 1.7600354642453148e-05, "loss": 0.4906, "step": 8510 }, { "epoch": 1.6758869956480047, "grad_norm": 1.1807043552398682, "learning_rate": 1.7597297379926014e-05, "loss": 0.4446, "step": 8520 }, { "epoch": 1.6778539991640236, "grad_norm": 1.5438140630722046, "learning_rate": 1.7594240117398883e-05, "loss": 0.4429, "step": 8530 }, { "epoch": 1.6798210026800424, "grad_norm": 1.0600336790084839, "learning_rate": 1.759118285487175e-05, "loss": 0.5285, "step": 8540 }, { "epoch": 1.681788006196061, "grad_norm": 0.9455369710922241, "learning_rate": 1.7588125592344614e-05, "loss": 0.4963, "step": 8550 }, { "epoch": 1.6837550097120797, "grad_norm": 1.0323309898376465, "learning_rate": 1.7585068329817484e-05, "loss": 0.4413, "step": 8560 }, { "epoch": 1.6857220132280988, "grad_norm": 1.9019440412521362, "learning_rate": 1.758201106729035e-05, "loss": 0.4316, "step": 8570 }, { "epoch": 1.6876890167441174, "grad_norm": 1.4282838106155396, "learning_rate": 1.757895380476322e-05, "loss": 0.501, "step": 8580 }, { "epoch": 1.689656020260136, "grad_norm": 1.0056709051132202, "learning_rate": 1.7575896542236084e-05, "loss": 0.4711, "step": 8590 }, { "epoch": 1.6916230237761551, "grad_norm": 1.0339857339859009, "learning_rate": 1.757283927970895e-05, "loss": 0.5852, "step": 8600 }, { "epoch": 1.6935900272921738, "grad_norm": 1.7100114822387695, "learning_rate": 1.756978201718182e-05, "loss": 0.3511, "step": 8610 }, { "epoch": 1.6955570308081924, "grad_norm": 0.7833712697029114, "learning_rate": 1.756672475465468e-05, "loss": 0.5208, "step": 8620 }, { "epoch": 1.6975240343242115, "grad_norm": 0.9590697884559631, "learning_rate": 1.756366749212755e-05, "loss": 0.4618, "step": 8630 }, { "epoch": 1.6994910378402301, "grad_norm": 1.0845881700515747, "learning_rate": 1.7560610229600417e-05, "loss": 0.4621, "step": 8640 }, { "epoch": 1.7014580413562488, "grad_norm": 1.404337763786316, "learning_rate": 1.7557552967073282e-05, "loss": 0.4138, "step": 8650 }, { "epoch": 1.7034250448722679, "grad_norm": 0.9446793794631958, "learning_rate": 1.755449570454615e-05, "loss": 0.4518, "step": 8660 }, { "epoch": 1.7053920483882865, "grad_norm": 2.2327349185943604, "learning_rate": 1.7551438442019017e-05, "loss": 0.3963, "step": 8670 }, { "epoch": 1.7073590519043051, "grad_norm": 1.1843819618225098, "learning_rate": 1.7548381179491883e-05, "loss": 0.386, "step": 8680 }, { "epoch": 1.709326055420324, "grad_norm": 0.536867082118988, "learning_rate": 1.7545323916964752e-05, "loss": 0.4608, "step": 8690 }, { "epoch": 1.7112930589363429, "grad_norm": 1.349477767944336, "learning_rate": 1.7542266654437618e-05, "loss": 0.5574, "step": 8700 }, { "epoch": 1.7132600624523615, "grad_norm": 0.8360427618026733, "learning_rate": 1.7539209391910487e-05, "loss": 0.4974, "step": 8710 }, { "epoch": 1.7152270659683804, "grad_norm": 1.3188480138778687, "learning_rate": 1.7536152129383353e-05, "loss": 0.4416, "step": 8720 }, { "epoch": 1.7171940694843992, "grad_norm": 0.8572363257408142, "learning_rate": 1.753309486685622e-05, "loss": 0.513, "step": 8730 }, { "epoch": 1.7191610730004179, "grad_norm": 0.8236428499221802, "learning_rate": 1.7530037604329084e-05, "loss": 0.4828, "step": 8740 }, { "epoch": 1.7211280765164367, "grad_norm": 1.0772844552993774, "learning_rate": 1.752698034180195e-05, "loss": 0.3824, "step": 8750 }, { "epoch": 1.7230950800324556, "grad_norm": 1.132460355758667, "learning_rate": 1.752392307927482e-05, "loss": 0.4269, "step": 8760 }, { "epoch": 1.7250620835484742, "grad_norm": 1.9588900804519653, "learning_rate": 1.7520865816747685e-05, "loss": 0.4036, "step": 8770 }, { "epoch": 1.727029087064493, "grad_norm": 0.9417825937271118, "learning_rate": 1.751780855422055e-05, "loss": 0.4554, "step": 8780 }, { "epoch": 1.728996090580512, "grad_norm": 0.6179748773574829, "learning_rate": 1.751475129169342e-05, "loss": 0.4016, "step": 8790 }, { "epoch": 1.7309630940965306, "grad_norm": 1.306562900543213, "learning_rate": 1.7511694029166286e-05, "loss": 0.4883, "step": 8800 }, { "epoch": 1.7329300976125495, "grad_norm": 1.5696890354156494, "learning_rate": 1.7508636766639152e-05, "loss": 0.4443, "step": 8810 }, { "epoch": 1.7348971011285683, "grad_norm": 1.0068707466125488, "learning_rate": 1.750557950411202e-05, "loss": 0.4914, "step": 8820 }, { "epoch": 1.736864104644587, "grad_norm": 1.382118821144104, "learning_rate": 1.7502522241584887e-05, "loss": 0.3365, "step": 8830 }, { "epoch": 1.7388311081606058, "grad_norm": 2.175328493118286, "learning_rate": 1.7499464979057756e-05, "loss": 0.3458, "step": 8840 }, { "epoch": 1.7407981116766247, "grad_norm": 1.2995758056640625, "learning_rate": 1.7496407716530618e-05, "loss": 0.5217, "step": 8850 }, { "epoch": 1.7427651151926433, "grad_norm": 1.4120404720306396, "learning_rate": 1.7493350454003487e-05, "loss": 0.3349, "step": 8860 }, { "epoch": 1.7447321187086622, "grad_norm": 1.544440507888794, "learning_rate": 1.7490293191476353e-05, "loss": 0.4898, "step": 8870 }, { "epoch": 1.746699122224681, "grad_norm": 1.823754072189331, "learning_rate": 1.748723592894922e-05, "loss": 0.5335, "step": 8880 }, { "epoch": 1.7486661257406997, "grad_norm": 2.340019464492798, "learning_rate": 1.7484178666422088e-05, "loss": 0.4089, "step": 8890 }, { "epoch": 1.7506331292567185, "grad_norm": 1.16437828540802, "learning_rate": 1.7481121403894954e-05, "loss": 0.3566, "step": 8900 }, { "epoch": 1.7526001327727374, "grad_norm": 0.9248781800270081, "learning_rate": 1.747806414136782e-05, "loss": 0.4785, "step": 8910 }, { "epoch": 1.754567136288756, "grad_norm": 0.8662049770355225, "learning_rate": 1.747500687884069e-05, "loss": 0.4731, "step": 8920 }, { "epoch": 1.756534139804775, "grad_norm": 2.055873394012451, "learning_rate": 1.7471949616313555e-05, "loss": 0.548, "step": 8930 }, { "epoch": 1.7585011433207938, "grad_norm": 1.322381854057312, "learning_rate": 1.746889235378642e-05, "loss": 0.4159, "step": 8940 }, { "epoch": 1.7604681468368124, "grad_norm": 0.811429500579834, "learning_rate": 1.746583509125929e-05, "loss": 0.4281, "step": 8950 }, { "epoch": 1.7624351503528313, "grad_norm": 1.911391258239746, "learning_rate": 1.7462777828732155e-05, "loss": 0.4722, "step": 8960 }, { "epoch": 1.7644021538688501, "grad_norm": 1.6919752359390259, "learning_rate": 1.745972056620502e-05, "loss": 0.4453, "step": 8970 }, { "epoch": 1.7663691573848688, "grad_norm": 0.8237192630767822, "learning_rate": 1.7456663303677887e-05, "loss": 0.3616, "step": 8980 }, { "epoch": 1.7683361609008876, "grad_norm": 1.7030389308929443, "learning_rate": 1.7453606041150756e-05, "loss": 0.4525, "step": 8990 }, { "epoch": 1.7703031644169065, "grad_norm": 2.035853147506714, "learning_rate": 1.7450548778623622e-05, "loss": 0.4024, "step": 9000 }, { "epoch": 1.7703031644169065, "eval_loss": 0.23471477627754211, "eval_runtime": 8.9179, "eval_samples_per_second": 5.607, "eval_steps_per_second": 2.803, "step": 9000 }, { "epoch": 1.7722701679329251, "grad_norm": 1.2445576190948486, "learning_rate": 1.7447491516096488e-05, "loss": 0.4392, "step": 9010 }, { "epoch": 1.774237171448944, "grad_norm": 2.278787612915039, "learning_rate": 1.7444434253569357e-05, "loss": 0.4082, "step": 9020 }, { "epoch": 1.7762041749649629, "grad_norm": 1.5828843116760254, "learning_rate": 1.7441376991042223e-05, "loss": 0.4268, "step": 9030 }, { "epoch": 1.7781711784809815, "grad_norm": 1.3761073350906372, "learning_rate": 1.743831972851509e-05, "loss": 0.5517, "step": 9040 }, { "epoch": 1.7801381819970004, "grad_norm": 1.5714308023452759, "learning_rate": 1.7435262465987957e-05, "loss": 0.3992, "step": 9050 }, { "epoch": 1.7821051855130192, "grad_norm": 1.2077587842941284, "learning_rate": 1.7432205203460823e-05, "loss": 0.5408, "step": 9060 }, { "epoch": 1.7840721890290379, "grad_norm": 1.1043856143951416, "learning_rate": 1.742914794093369e-05, "loss": 0.4638, "step": 9070 }, { "epoch": 1.7860391925450567, "grad_norm": 2.701866388320923, "learning_rate": 1.7426090678406555e-05, "loss": 0.47, "step": 9080 }, { "epoch": 1.7880061960610756, "grad_norm": 0.7272081971168518, "learning_rate": 1.7423033415879424e-05, "loss": 0.4197, "step": 9090 }, { "epoch": 1.7899731995770942, "grad_norm": 1.856882929801941, "learning_rate": 1.741997615335229e-05, "loss": 0.4344, "step": 9100 }, { "epoch": 1.791940203093113, "grad_norm": 1.0670031309127808, "learning_rate": 1.7416918890825156e-05, "loss": 0.453, "step": 9110 }, { "epoch": 1.793907206609132, "grad_norm": 1.773953914642334, "learning_rate": 1.7413861628298025e-05, "loss": 0.4737, "step": 9120 }, { "epoch": 1.7958742101251506, "grad_norm": 2.5238022804260254, "learning_rate": 1.741080436577089e-05, "loss": 0.4638, "step": 9130 }, { "epoch": 1.7978412136411692, "grad_norm": 1.11234450340271, "learning_rate": 1.7407747103243756e-05, "loss": 0.3236, "step": 9140 }, { "epoch": 1.7998082171571883, "grad_norm": 0.8358986973762512, "learning_rate": 1.7404689840716625e-05, "loss": 0.5479, "step": 9150 }, { "epoch": 1.801775220673207, "grad_norm": 1.2683533430099487, "learning_rate": 1.740163257818949e-05, "loss": 0.4049, "step": 9160 }, { "epoch": 1.8037422241892256, "grad_norm": 1.2132279872894287, "learning_rate": 1.7398575315662357e-05, "loss": 0.4431, "step": 9170 }, { "epoch": 1.8057092277052447, "grad_norm": 2.021444082260132, "learning_rate": 1.7395518053135226e-05, "loss": 0.3557, "step": 9180 }, { "epoch": 1.8076762312212633, "grad_norm": 2.0817456245422363, "learning_rate": 1.739246079060809e-05, "loss": 0.4853, "step": 9190 }, { "epoch": 1.809643234737282, "grad_norm": 1.6060495376586914, "learning_rate": 1.7389403528080958e-05, "loss": 0.4487, "step": 9200 }, { "epoch": 1.811610238253301, "grad_norm": 1.1672077178955078, "learning_rate": 1.7386346265553823e-05, "loss": 0.4644, "step": 9210 }, { "epoch": 1.8135772417693197, "grad_norm": 1.7596189975738525, "learning_rate": 1.7383289003026693e-05, "loss": 0.4474, "step": 9220 }, { "epoch": 1.8155442452853383, "grad_norm": 1.368772268295288, "learning_rate": 1.738023174049956e-05, "loss": 0.3707, "step": 9230 }, { "epoch": 1.8175112488013574, "grad_norm": 1.8463257551193237, "learning_rate": 1.7377174477972424e-05, "loss": 0.426, "step": 9240 }, { "epoch": 1.819478252317376, "grad_norm": 2.007481813430786, "learning_rate": 1.7374117215445293e-05, "loss": 0.4587, "step": 9250 }, { "epoch": 1.8214452558333947, "grad_norm": 1.2451355457305908, "learning_rate": 1.737105995291816e-05, "loss": 0.3996, "step": 9260 }, { "epoch": 1.8234122593494135, "grad_norm": 0.9296106696128845, "learning_rate": 1.7368002690391025e-05, "loss": 0.5172, "step": 9270 }, { "epoch": 1.8253792628654324, "grad_norm": 1.6007474660873413, "learning_rate": 1.7364945427863894e-05, "loss": 0.4304, "step": 9280 }, { "epoch": 1.827346266381451, "grad_norm": 1.849847674369812, "learning_rate": 1.736188816533676e-05, "loss": 0.4836, "step": 9290 }, { "epoch": 1.82931326989747, "grad_norm": 1.7262097597122192, "learning_rate": 1.7358830902809626e-05, "loss": 0.4367, "step": 9300 }, { "epoch": 1.8312802734134888, "grad_norm": 1.2491943836212158, "learning_rate": 1.735577364028249e-05, "loss": 0.4727, "step": 9310 }, { "epoch": 1.8332472769295074, "grad_norm": 0.9708894491195679, "learning_rate": 1.7352716377755357e-05, "loss": 0.347, "step": 9320 }, { "epoch": 1.8352142804455263, "grad_norm": 1.439257025718689, "learning_rate": 1.7349659115228226e-05, "loss": 0.5331, "step": 9330 }, { "epoch": 1.8371812839615451, "grad_norm": 1.9630393981933594, "learning_rate": 1.7346601852701092e-05, "loss": 0.3824, "step": 9340 }, { "epoch": 1.8391482874775638, "grad_norm": 0.876420259475708, "learning_rate": 1.734354459017396e-05, "loss": 0.3851, "step": 9350 }, { "epoch": 1.8411152909935826, "grad_norm": 1.0410975217819214, "learning_rate": 1.7340487327646827e-05, "loss": 0.3504, "step": 9360 }, { "epoch": 1.8430822945096015, "grad_norm": 1.2595709562301636, "learning_rate": 1.7337430065119693e-05, "loss": 0.4824, "step": 9370 }, { "epoch": 1.8450492980256201, "grad_norm": 0.9842739701271057, "learning_rate": 1.7334372802592562e-05, "loss": 0.4738, "step": 9380 }, { "epoch": 1.847016301541639, "grad_norm": 0.8385689854621887, "learning_rate": 1.7331315540065428e-05, "loss": 0.487, "step": 9390 }, { "epoch": 1.8489833050576578, "grad_norm": 0.9482077360153198, "learning_rate": 1.7328258277538294e-05, "loss": 0.5305, "step": 9400 }, { "epoch": 1.8509503085736765, "grad_norm": 2.2385172843933105, "learning_rate": 1.732520101501116e-05, "loss": 0.4707, "step": 9410 }, { "epoch": 1.8529173120896953, "grad_norm": 1.1302319765090942, "learning_rate": 1.7322143752484025e-05, "loss": 0.4294, "step": 9420 }, { "epoch": 1.8548843156057142, "grad_norm": 1.530410885810852, "learning_rate": 1.7319086489956894e-05, "loss": 0.5544, "step": 9430 }, { "epoch": 1.8568513191217328, "grad_norm": 0.8635900020599365, "learning_rate": 1.731602922742976e-05, "loss": 0.4827, "step": 9440 }, { "epoch": 1.8588183226377517, "grad_norm": 1.119480848312378, "learning_rate": 1.7312971964902626e-05, "loss": 0.3917, "step": 9450 }, { "epoch": 1.8607853261537706, "grad_norm": 1.0276856422424316, "learning_rate": 1.7309914702375495e-05, "loss": 0.3687, "step": 9460 }, { "epoch": 1.8627523296697892, "grad_norm": 1.7624456882476807, "learning_rate": 1.730685743984836e-05, "loss": 0.4276, "step": 9470 }, { "epoch": 1.864719333185808, "grad_norm": 1.3257324695587158, "learning_rate": 1.730380017732123e-05, "loss": 0.3444, "step": 9480 }, { "epoch": 1.866686336701827, "grad_norm": 1.4196683168411255, "learning_rate": 1.7300742914794096e-05, "loss": 0.3877, "step": 9490 }, { "epoch": 1.8686533402178456, "grad_norm": 1.4340052604675293, "learning_rate": 1.729768565226696e-05, "loss": 0.3258, "step": 9500 }, { "epoch": 1.8686533402178456, "eval_loss": 0.21660512685775757, "eval_runtime": 8.8832, "eval_samples_per_second": 5.629, "eval_steps_per_second": 2.814, "step": 9500 }, { "epoch": 1.8706203437338644, "grad_norm": 1.190011978149414, "learning_rate": 1.729462838973983e-05, "loss": 0.3835, "step": 9510 }, { "epoch": 1.8725873472498833, "grad_norm": 1.5729475021362305, "learning_rate": 1.7291571127212696e-05, "loss": 0.3737, "step": 9520 }, { "epoch": 1.874554350765902, "grad_norm": 1.9554622173309326, "learning_rate": 1.7288513864685562e-05, "loss": 0.5472, "step": 9530 }, { "epoch": 1.8765213542819208, "grad_norm": 1.2588348388671875, "learning_rate": 1.7285456602158428e-05, "loss": 0.3402, "step": 9540 }, { "epoch": 1.8784883577979397, "grad_norm": 1.588701844215393, "learning_rate": 1.7282399339631294e-05, "loss": 0.5021, "step": 9550 }, { "epoch": 1.8804553613139583, "grad_norm": 1.0969479084014893, "learning_rate": 1.7279342077104163e-05, "loss": 0.4521, "step": 9560 }, { "epoch": 1.8824223648299772, "grad_norm": 0.828027606010437, "learning_rate": 1.727628481457703e-05, "loss": 0.5221, "step": 9570 }, { "epoch": 1.884389368345996, "grad_norm": 0.9304088950157166, "learning_rate": 1.7273227552049895e-05, "loss": 0.4416, "step": 9580 }, { "epoch": 1.8863563718620147, "grad_norm": 1.506238341331482, "learning_rate": 1.7270170289522764e-05, "loss": 0.2849, "step": 9590 }, { "epoch": 1.8883233753780335, "grad_norm": 2.2380335330963135, "learning_rate": 1.726711302699563e-05, "loss": 0.3606, "step": 9600 }, { "epoch": 1.8902903788940524, "grad_norm": 1.978633165359497, "learning_rate": 1.72640557644685e-05, "loss": 0.4753, "step": 9610 }, { "epoch": 1.892257382410071, "grad_norm": 2.0024452209472656, "learning_rate": 1.7260998501941364e-05, "loss": 0.43, "step": 9620 }, { "epoch": 1.8942243859260899, "grad_norm": 1.901621699333191, "learning_rate": 1.725794123941423e-05, "loss": 0.3712, "step": 9630 }, { "epoch": 1.8961913894421087, "grad_norm": 1.4384682178497314, "learning_rate": 1.7254883976887096e-05, "loss": 0.4753, "step": 9640 }, { "epoch": 1.8981583929581274, "grad_norm": 1.5139740705490112, "learning_rate": 1.7251826714359962e-05, "loss": 0.5545, "step": 9650 }, { "epoch": 1.9001253964741462, "grad_norm": 1.0212291479110718, "learning_rate": 1.724876945183283e-05, "loss": 0.3801, "step": 9660 }, { "epoch": 1.902092399990165, "grad_norm": 0.6466912031173706, "learning_rate": 1.7245712189305697e-05, "loss": 0.4541, "step": 9670 }, { "epoch": 1.9040594035061837, "grad_norm": 1.1486016511917114, "learning_rate": 1.7242654926778562e-05, "loss": 0.3891, "step": 9680 }, { "epoch": 1.9060264070222026, "grad_norm": 1.5629327297210693, "learning_rate": 1.723959766425143e-05, "loss": 0.4214, "step": 9690 }, { "epoch": 1.9079934105382215, "grad_norm": 1.9606152772903442, "learning_rate": 1.7236540401724297e-05, "loss": 0.4845, "step": 9700 }, { "epoch": 1.90996041405424, "grad_norm": 1.438989281654358, "learning_rate": 1.7233483139197163e-05, "loss": 0.399, "step": 9710 }, { "epoch": 1.9119274175702587, "grad_norm": 1.5821136236190796, "learning_rate": 1.7230425876670032e-05, "loss": 0.4228, "step": 9720 }, { "epoch": 1.9138944210862778, "grad_norm": 1.7703495025634766, "learning_rate": 1.7227368614142898e-05, "loss": 0.5136, "step": 9730 }, { "epoch": 1.9158614246022965, "grad_norm": 0.7840451598167419, "learning_rate": 1.7224311351615767e-05, "loss": 0.5638, "step": 9740 }, { "epoch": 1.917828428118315, "grad_norm": 1.6046384572982788, "learning_rate": 1.722125408908863e-05, "loss": 0.5003, "step": 9750 }, { "epoch": 1.9197954316343342, "grad_norm": 1.572587013244629, "learning_rate": 1.72181968265615e-05, "loss": 0.4326, "step": 9760 }, { "epoch": 1.9217624351503528, "grad_norm": 1.9076180458068848, "learning_rate": 1.7215139564034365e-05, "loss": 0.4825, "step": 9770 }, { "epoch": 1.9237294386663715, "grad_norm": 0.769214928150177, "learning_rate": 1.721208230150723e-05, "loss": 0.4463, "step": 9780 }, { "epoch": 1.9256964421823906, "grad_norm": 1.7923251390457153, "learning_rate": 1.72090250389801e-05, "loss": 0.4338, "step": 9790 }, { "epoch": 1.9276634456984092, "grad_norm": 1.6408928632736206, "learning_rate": 1.7205967776452965e-05, "loss": 0.534, "step": 9800 }, { "epoch": 1.9296304492144278, "grad_norm": 0.9936132431030273, "learning_rate": 1.720291051392583e-05, "loss": 0.5031, "step": 9810 }, { "epoch": 1.931597452730447, "grad_norm": 1.5383307933807373, "learning_rate": 1.71998532513987e-05, "loss": 0.4663, "step": 9820 }, { "epoch": 1.9335644562464656, "grad_norm": 1.4885003566741943, "learning_rate": 1.7196795988871566e-05, "loss": 0.3868, "step": 9830 }, { "epoch": 1.9355314597624842, "grad_norm": 1.8248869180679321, "learning_rate": 1.7193738726344432e-05, "loss": 0.6038, "step": 9840 }, { "epoch": 1.937498463278503, "grad_norm": 0.8678923845291138, "learning_rate": 1.71906814638173e-05, "loss": 0.5078, "step": 9850 }, { "epoch": 1.939465466794522, "grad_norm": 1.997922420501709, "learning_rate": 1.7187624201290167e-05, "loss": 0.4789, "step": 9860 }, { "epoch": 1.9414324703105406, "grad_norm": 1.5485014915466309, "learning_rate": 1.7184566938763033e-05, "loss": 0.5152, "step": 9870 }, { "epoch": 1.9433994738265594, "grad_norm": 1.3265610933303833, "learning_rate": 1.71815096762359e-05, "loss": 0.4607, "step": 9880 }, { "epoch": 1.9453664773425783, "grad_norm": 0.9985576868057251, "learning_rate": 1.7178452413708768e-05, "loss": 0.4812, "step": 9890 }, { "epoch": 1.947333480858597, "grad_norm": 1.0717341899871826, "learning_rate": 1.7175395151181633e-05, "loss": 0.3841, "step": 9900 }, { "epoch": 1.9493004843746158, "grad_norm": 2.604396343231201, "learning_rate": 1.71723378886545e-05, "loss": 0.5168, "step": 9910 }, { "epoch": 1.9512674878906346, "grad_norm": 1.9883261919021606, "learning_rate": 1.7169280626127368e-05, "loss": 0.4564, "step": 9920 }, { "epoch": 1.9532344914066533, "grad_norm": 1.710569977760315, "learning_rate": 1.7166223363600234e-05, "loss": 0.4084, "step": 9930 }, { "epoch": 1.9552014949226721, "grad_norm": 1.1840331554412842, "learning_rate": 1.71631661010731e-05, "loss": 0.517, "step": 9940 }, { "epoch": 1.957168498438691, "grad_norm": 1.052003264427185, "learning_rate": 1.716010883854597e-05, "loss": 0.5016, "step": 9950 }, { "epoch": 1.9591355019547096, "grad_norm": 2.5739831924438477, "learning_rate": 1.7157051576018835e-05, "loss": 0.4217, "step": 9960 }, { "epoch": 1.9611025054707285, "grad_norm": 1.5411380529403687, "learning_rate": 1.71539943134917e-05, "loss": 0.3677, "step": 9970 }, { "epoch": 1.9630695089867474, "grad_norm": 1.0582150220870972, "learning_rate": 1.7150937050964566e-05, "loss": 0.4218, "step": 9980 }, { "epoch": 1.965036512502766, "grad_norm": 1.0257982015609741, "learning_rate": 1.7147879788437432e-05, "loss": 0.4642, "step": 9990 }, { "epoch": 1.9670035160187849, "grad_norm": 1.5949499607086182, "learning_rate": 1.71448225259103e-05, "loss": 0.4337, "step": 10000 }, { "epoch": 1.9670035160187849, "eval_loss": 0.2338702380657196, "eval_runtime": 8.8698, "eval_samples_per_second": 5.637, "eval_steps_per_second": 2.819, "step": 10000 }, { "epoch": 1.9689705195348037, "grad_norm": 1.4009411334991455, "learning_rate": 1.7141765263383167e-05, "loss": 0.433, "step": 10010 }, { "epoch": 1.9709375230508224, "grad_norm": 1.0874663591384888, "learning_rate": 1.7138708000856036e-05, "loss": 0.3726, "step": 10020 }, { "epoch": 1.9729045265668412, "grad_norm": 1.939491868019104, "learning_rate": 1.7135650738328902e-05, "loss": 0.479, "step": 10030 }, { "epoch": 1.97487153008286, "grad_norm": 1.9109244346618652, "learning_rate": 1.7132593475801768e-05, "loss": 0.4814, "step": 10040 }, { "epoch": 1.9768385335988787, "grad_norm": 1.6677470207214355, "learning_rate": 1.7129536213274637e-05, "loss": 0.3483, "step": 10050 }, { "epoch": 1.9788055371148976, "grad_norm": 0.8189138770103455, "learning_rate": 1.7126478950747503e-05, "loss": 0.5348, "step": 10060 }, { "epoch": 1.9807725406309165, "grad_norm": 2.4953317642211914, "learning_rate": 1.712342168822037e-05, "loss": 0.4601, "step": 10070 }, { "epoch": 1.982739544146935, "grad_norm": 1.0296778678894043, "learning_rate": 1.7120364425693238e-05, "loss": 0.387, "step": 10080 }, { "epoch": 1.984706547662954, "grad_norm": 1.6703166961669922, "learning_rate": 1.71173071631661e-05, "loss": 0.4055, "step": 10090 }, { "epoch": 1.9866735511789728, "grad_norm": 1.5256836414337158, "learning_rate": 1.711424990063897e-05, "loss": 0.5053, "step": 10100 }, { "epoch": 1.9886405546949915, "grad_norm": 1.264963984489441, "learning_rate": 1.7111192638111835e-05, "loss": 0.5051, "step": 10110 }, { "epoch": 1.9906075582110103, "grad_norm": 0.8529186248779297, "learning_rate": 1.71081353755847e-05, "loss": 0.5238, "step": 10120 }, { "epoch": 1.9925745617270292, "grad_norm": 1.695892333984375, "learning_rate": 1.710507811305757e-05, "loss": 0.3558, "step": 10130 }, { "epoch": 1.9945415652430478, "grad_norm": 1.855906367301941, "learning_rate": 1.7102020850530436e-05, "loss": 0.4037, "step": 10140 }, { "epoch": 1.9965085687590667, "grad_norm": 2.48172926902771, "learning_rate": 1.7098963588003305e-05, "loss": 0.5684, "step": 10150 }, { "epoch": 1.9984755722750855, "grad_norm": 2.117180585861206, "learning_rate": 1.709590632547617e-05, "loss": 0.426, "step": 10160 }, { "epoch": 2.000442575791104, "grad_norm": 1.000969409942627, "learning_rate": 1.7092849062949036e-05, "loss": 0.366, "step": 10170 }, { "epoch": 2.002409579307123, "grad_norm": 1.5670902729034424, "learning_rate": 1.7089791800421906e-05, "loss": 0.3538, "step": 10180 }, { "epoch": 2.004376582823142, "grad_norm": 2.4152426719665527, "learning_rate": 1.708673453789477e-05, "loss": 0.3628, "step": 10190 }, { "epoch": 2.0063435863391605, "grad_norm": 1.6392732858657837, "learning_rate": 1.7083677275367637e-05, "loss": 0.4825, "step": 10200 }, { "epoch": 2.008310589855179, "grad_norm": 1.2263078689575195, "learning_rate": 1.7080620012840503e-05, "loss": 0.3793, "step": 10210 }, { "epoch": 2.0102775933711983, "grad_norm": 0.8084795475006104, "learning_rate": 1.707756275031337e-05, "loss": 0.4204, "step": 10220 }, { "epoch": 2.012244596887217, "grad_norm": 1.0626640319824219, "learning_rate": 1.7074505487786238e-05, "loss": 0.4433, "step": 10230 }, { "epoch": 2.0142116004032355, "grad_norm": 1.165838360786438, "learning_rate": 1.7071448225259104e-05, "loss": 0.4568, "step": 10240 }, { "epoch": 2.0161786039192546, "grad_norm": 1.4117451906204224, "learning_rate": 1.706839096273197e-05, "loss": 0.4427, "step": 10250 }, { "epoch": 2.0181456074352733, "grad_norm": 1.1329997777938843, "learning_rate": 1.706533370020484e-05, "loss": 0.4228, "step": 10260 }, { "epoch": 2.020112610951292, "grad_norm": 2.336711883544922, "learning_rate": 1.7062276437677704e-05, "loss": 0.4538, "step": 10270 }, { "epoch": 2.022079614467311, "grad_norm": 0.8653255701065063, "learning_rate": 1.7059219175150574e-05, "loss": 0.432, "step": 10280 }, { "epoch": 2.0240466179833296, "grad_norm": 1.8122618198394775, "learning_rate": 1.705616191262344e-05, "loss": 0.4071, "step": 10290 }, { "epoch": 2.0260136214993483, "grad_norm": 0.9676852226257324, "learning_rate": 1.7053104650096305e-05, "loss": 0.4196, "step": 10300 }, { "epoch": 2.0279806250153674, "grad_norm": 0.8326351046562195, "learning_rate": 1.7050047387569174e-05, "loss": 0.41, "step": 10310 }, { "epoch": 2.029947628531386, "grad_norm": 0.9962462782859802, "learning_rate": 1.7046990125042037e-05, "loss": 0.4066, "step": 10320 }, { "epoch": 2.0319146320474046, "grad_norm": 2.096683979034424, "learning_rate": 1.7043932862514906e-05, "loss": 0.4446, "step": 10330 }, { "epoch": 2.0338816355634237, "grad_norm": 1.3960990905761719, "learning_rate": 1.704087559998777e-05, "loss": 0.4261, "step": 10340 }, { "epoch": 2.0358486390794424, "grad_norm": 1.7559967041015625, "learning_rate": 1.7037818337460637e-05, "loss": 0.4218, "step": 10350 }, { "epoch": 2.037815642595461, "grad_norm": 1.368927240371704, "learning_rate": 1.7034761074933507e-05, "loss": 0.3458, "step": 10360 }, { "epoch": 2.03978264611148, "grad_norm": 1.6407620906829834, "learning_rate": 1.7031703812406372e-05, "loss": 0.5405, "step": 10370 }, { "epoch": 2.0417496496274987, "grad_norm": 1.106787085533142, "learning_rate": 1.7028646549879238e-05, "loss": 0.5497, "step": 10380 }, { "epoch": 2.0437166531435174, "grad_norm": 1.9294019937515259, "learning_rate": 1.7025589287352107e-05, "loss": 0.5084, "step": 10390 }, { "epoch": 2.0456836566595364, "grad_norm": 0.8011900186538696, "learning_rate": 1.7022532024824973e-05, "loss": 0.5933, "step": 10400 }, { "epoch": 2.047650660175555, "grad_norm": 1.3763272762298584, "learning_rate": 1.7019474762297842e-05, "loss": 0.4236, "step": 10410 }, { "epoch": 2.0496176636915737, "grad_norm": 1.301306128501892, "learning_rate": 1.7016417499770708e-05, "loss": 0.4095, "step": 10420 }, { "epoch": 2.051584667207593, "grad_norm": 2.0290560722351074, "learning_rate": 1.7013360237243574e-05, "loss": 0.4528, "step": 10430 }, { "epoch": 2.0535516707236114, "grad_norm": 0.6603031754493713, "learning_rate": 1.701030297471644e-05, "loss": 0.4511, "step": 10440 }, { "epoch": 2.05551867423963, "grad_norm": 1.3809963464736938, "learning_rate": 1.7007245712189305e-05, "loss": 0.4434, "step": 10450 }, { "epoch": 2.057485677755649, "grad_norm": 1.5114200115203857, "learning_rate": 1.7004188449662174e-05, "loss": 0.5448, "step": 10460 }, { "epoch": 2.059452681271668, "grad_norm": 2.5838265419006348, "learning_rate": 1.700113118713504e-05, "loss": 0.4025, "step": 10470 }, { "epoch": 2.0614196847876864, "grad_norm": 1.1007928848266602, "learning_rate": 1.6998073924607906e-05, "loss": 0.4732, "step": 10480 }, { "epoch": 2.0633866883037055, "grad_norm": 1.4720772504806519, "learning_rate": 1.6995016662080775e-05, "loss": 0.4076, "step": 10490 }, { "epoch": 2.065353691819724, "grad_norm": 1.3407565355300903, "learning_rate": 1.699195939955364e-05, "loss": 0.3955, "step": 10500 }, { "epoch": 2.065353691819724, "eval_loss": 0.22142630815505981, "eval_runtime": 8.865, "eval_samples_per_second": 5.64, "eval_steps_per_second": 2.82, "step": 10500 }, { "epoch": 2.067320695335743, "grad_norm": 1.8334215879440308, "learning_rate": 1.6988902137026507e-05, "loss": 0.4093, "step": 10510 }, { "epoch": 2.069287698851762, "grad_norm": 1.8577845096588135, "learning_rate": 1.6985844874499376e-05, "loss": 0.344, "step": 10520 }, { "epoch": 2.0712547023677805, "grad_norm": 1.6269792318344116, "learning_rate": 1.698278761197224e-05, "loss": 0.3725, "step": 10530 }, { "epoch": 2.073221705883799, "grad_norm": 2.4148001670837402, "learning_rate": 1.697973034944511e-05, "loss": 0.347, "step": 10540 }, { "epoch": 2.0751887093998183, "grad_norm": 2.106750965118408, "learning_rate": 1.6976673086917973e-05, "loss": 0.4462, "step": 10550 }, { "epoch": 2.077155712915837, "grad_norm": 1.6390737295150757, "learning_rate": 1.6973615824390842e-05, "loss": 0.44, "step": 10560 }, { "epoch": 2.0791227164318555, "grad_norm": 1.192014455795288, "learning_rate": 1.6970558561863708e-05, "loss": 0.4548, "step": 10570 }, { "epoch": 2.0810897199478746, "grad_norm": 0.9265616536140442, "learning_rate": 1.6967501299336574e-05, "loss": 0.5047, "step": 10580 }, { "epoch": 2.0830567234638933, "grad_norm": 1.0966876745224, "learning_rate": 1.6964444036809443e-05, "loss": 0.3367, "step": 10590 }, { "epoch": 2.085023726979912, "grad_norm": 1.5295296907424927, "learning_rate": 1.696138677428231e-05, "loss": 0.3238, "step": 10600 }, { "epoch": 2.0869907304959305, "grad_norm": 1.5133509635925293, "learning_rate": 1.6958329511755175e-05, "loss": 0.5125, "step": 10610 }, { "epoch": 2.0889577340119496, "grad_norm": 1.9635529518127441, "learning_rate": 1.6955272249228044e-05, "loss": 0.3324, "step": 10620 }, { "epoch": 2.0909247375279683, "grad_norm": 1.3140554428100586, "learning_rate": 1.695221498670091e-05, "loss": 0.3871, "step": 10630 }, { "epoch": 2.092891741043987, "grad_norm": 1.486924171447754, "learning_rate": 1.6949157724173775e-05, "loss": 0.3921, "step": 10640 }, { "epoch": 2.094858744560006, "grad_norm": 1.5189197063446045, "learning_rate": 1.6946100461646645e-05, "loss": 0.3921, "step": 10650 }, { "epoch": 2.0968257480760246, "grad_norm": 2.586416006088257, "learning_rate": 1.694304319911951e-05, "loss": 0.4417, "step": 10660 }, { "epoch": 2.0987927515920433, "grad_norm": 1.2350314855575562, "learning_rate": 1.6939985936592376e-05, "loss": 0.4176, "step": 10670 }, { "epoch": 2.1007597551080623, "grad_norm": 1.220737099647522, "learning_rate": 1.6936928674065242e-05, "loss": 0.4954, "step": 10680 }, { "epoch": 2.102726758624081, "grad_norm": 0.970892608165741, "learning_rate": 1.693387141153811e-05, "loss": 0.3041, "step": 10690 }, { "epoch": 2.1046937621400996, "grad_norm": 1.9684553146362305, "learning_rate": 1.6930814149010977e-05, "loss": 0.4289, "step": 10700 }, { "epoch": 2.1066607656561187, "grad_norm": 1.4467488527297974, "learning_rate": 1.6927756886483843e-05, "loss": 0.4797, "step": 10710 }, { "epoch": 2.1086277691721373, "grad_norm": 1.9098058938980103, "learning_rate": 1.6924699623956712e-05, "loss": 0.3886, "step": 10720 }, { "epoch": 2.110594772688156, "grad_norm": 1.4749529361724854, "learning_rate": 1.6921642361429578e-05, "loss": 0.3511, "step": 10730 }, { "epoch": 2.112561776204175, "grad_norm": 1.6586591005325317, "learning_rate": 1.6918585098902443e-05, "loss": 0.4309, "step": 10740 }, { "epoch": 2.1145287797201937, "grad_norm": 1.833769679069519, "learning_rate": 1.6915527836375313e-05, "loss": 0.4512, "step": 10750 }, { "epoch": 2.1164957832362123, "grad_norm": 0.7013012766838074, "learning_rate": 1.6912470573848178e-05, "loss": 0.3429, "step": 10760 }, { "epoch": 2.1184627867522314, "grad_norm": 0.8748033046722412, "learning_rate": 1.6909413311321044e-05, "loss": 0.4688, "step": 10770 }, { "epoch": 2.12042979026825, "grad_norm": 1.659879207611084, "learning_rate": 1.690635604879391e-05, "loss": 0.4047, "step": 10780 }, { "epoch": 2.1223967937842687, "grad_norm": 1.2329697608947754, "learning_rate": 1.690329878626678e-05, "loss": 0.4941, "step": 10790 }, { "epoch": 2.124363797300288, "grad_norm": 1.6511222124099731, "learning_rate": 1.6900241523739645e-05, "loss": 0.32, "step": 10800 }, { "epoch": 2.1263308008163064, "grad_norm": 2.3250954151153564, "learning_rate": 1.689718426121251e-05, "loss": 0.4237, "step": 10810 }, { "epoch": 2.128297804332325, "grad_norm": 1.3927966356277466, "learning_rate": 1.689412699868538e-05, "loss": 0.3225, "step": 10820 }, { "epoch": 2.130264807848344, "grad_norm": 1.1719884872436523, "learning_rate": 1.6891069736158246e-05, "loss": 0.4178, "step": 10830 }, { "epoch": 2.132231811364363, "grad_norm": 0.8177443742752075, "learning_rate": 1.688801247363111e-05, "loss": 0.455, "step": 10840 }, { "epoch": 2.1341988148803814, "grad_norm": 2.0233986377716064, "learning_rate": 1.688495521110398e-05, "loss": 0.4306, "step": 10850 }, { "epoch": 2.1361658183964005, "grad_norm": 2.01068377494812, "learning_rate": 1.6881897948576846e-05, "loss": 0.4518, "step": 10860 }, { "epoch": 2.138132821912419, "grad_norm": 1.5686466693878174, "learning_rate": 1.6878840686049712e-05, "loss": 0.4131, "step": 10870 }, { "epoch": 2.140099825428438, "grad_norm": 1.2859925031661987, "learning_rate": 1.687578342352258e-05, "loss": 0.4335, "step": 10880 }, { "epoch": 2.142066828944457, "grad_norm": 1.372796893119812, "learning_rate": 1.6872726160995444e-05, "loss": 0.4711, "step": 10890 }, { "epoch": 2.1440338324604755, "grad_norm": 1.5391151905059814, "learning_rate": 1.6869668898468313e-05, "loss": 0.4958, "step": 10900 }, { "epoch": 2.146000835976494, "grad_norm": 1.3352559804916382, "learning_rate": 1.686661163594118e-05, "loss": 0.6165, "step": 10910 }, { "epoch": 2.1479678394925132, "grad_norm": 2.093535900115967, "learning_rate": 1.6863554373414048e-05, "loss": 0.4023, "step": 10920 }, { "epoch": 2.149934843008532, "grad_norm": 1.734489917755127, "learning_rate": 1.6860497110886913e-05, "loss": 0.4107, "step": 10930 }, { "epoch": 2.1519018465245505, "grad_norm": 1.1061903238296509, "learning_rate": 1.685743984835978e-05, "loss": 0.387, "step": 10940 }, { "epoch": 2.1538688500405696, "grad_norm": 0.8513095378875732, "learning_rate": 1.685438258583265e-05, "loss": 0.3447, "step": 10950 }, { "epoch": 2.1558358535565882, "grad_norm": 1.3149404525756836, "learning_rate": 1.6851325323305514e-05, "loss": 0.4679, "step": 10960 }, { "epoch": 2.157802857072607, "grad_norm": 0.6072118282318115, "learning_rate": 1.684826806077838e-05, "loss": 0.4193, "step": 10970 }, { "epoch": 2.159769860588626, "grad_norm": 1.0672342777252197, "learning_rate": 1.684521079825125e-05, "loss": 0.4421, "step": 10980 }, { "epoch": 2.1617368641046446, "grad_norm": 1.4868695735931396, "learning_rate": 1.6842153535724115e-05, "loss": 0.4104, "step": 10990 }, { "epoch": 2.1637038676206632, "grad_norm": 2.028120279312134, "learning_rate": 1.683909627319698e-05, "loss": 0.3666, "step": 11000 }, { "epoch": 2.1637038676206632, "eval_loss": 0.2037489265203476, "eval_runtime": 8.8865, "eval_samples_per_second": 5.627, "eval_steps_per_second": 2.813, "step": 11000 }, { "epoch": 2.1656708711366823, "grad_norm": 1.9844077825546265, "learning_rate": 1.6836039010669846e-05, "loss": 0.3107, "step": 11010 }, { "epoch": 2.167637874652701, "grad_norm": 0.890990674495697, "learning_rate": 1.6832981748142712e-05, "loss": 0.4025, "step": 11020 }, { "epoch": 2.1696048781687196, "grad_norm": 1.8403328657150269, "learning_rate": 1.682992448561558e-05, "loss": 0.3271, "step": 11030 }, { "epoch": 2.1715718816847387, "grad_norm": 1.9692051410675049, "learning_rate": 1.6826867223088447e-05, "loss": 0.4018, "step": 11040 }, { "epoch": 2.1735388852007573, "grad_norm": 1.9689651727676392, "learning_rate": 1.6823809960561316e-05, "loss": 0.4714, "step": 11050 }, { "epoch": 2.175505888716776, "grad_norm": 1.6802412271499634, "learning_rate": 1.6820752698034182e-05, "loss": 0.4456, "step": 11060 }, { "epoch": 2.177472892232795, "grad_norm": 0.762005090713501, "learning_rate": 1.6817695435507048e-05, "loss": 0.4683, "step": 11070 }, { "epoch": 2.1794398957488137, "grad_norm": 1.079770803451538, "learning_rate": 1.6814638172979917e-05, "loss": 0.4442, "step": 11080 }, { "epoch": 2.1814068992648323, "grad_norm": 1.6594971418380737, "learning_rate": 1.6811580910452783e-05, "loss": 0.3854, "step": 11090 }, { "epoch": 2.1833739027808514, "grad_norm": 2.0914525985717773, "learning_rate": 1.680852364792565e-05, "loss": 0.4987, "step": 11100 }, { "epoch": 2.18534090629687, "grad_norm": 0.9866094589233398, "learning_rate": 1.6805466385398514e-05, "loss": 0.4544, "step": 11110 }, { "epoch": 2.1873079098128887, "grad_norm": 1.063025712966919, "learning_rate": 1.680240912287138e-05, "loss": 0.3954, "step": 11120 }, { "epoch": 2.189274913328908, "grad_norm": 1.4042121171951294, "learning_rate": 1.679935186034425e-05, "loss": 0.5461, "step": 11130 }, { "epoch": 2.1912419168449264, "grad_norm": 1.1642961502075195, "learning_rate": 1.6796294597817115e-05, "loss": 0.4287, "step": 11140 }, { "epoch": 2.193208920360945, "grad_norm": 1.1392892599105835, "learning_rate": 1.679323733528998e-05, "loss": 0.4508, "step": 11150 }, { "epoch": 2.195175923876964, "grad_norm": 1.2517368793487549, "learning_rate": 1.679018007276285e-05, "loss": 0.4513, "step": 11160 }, { "epoch": 2.197142927392983, "grad_norm": 1.0235626697540283, "learning_rate": 1.6787122810235716e-05, "loss": 0.4612, "step": 11170 }, { "epoch": 2.1991099309090014, "grad_norm": 0.9338393211364746, "learning_rate": 1.6784065547708585e-05, "loss": 0.4623, "step": 11180 }, { "epoch": 2.2010769344250205, "grad_norm": 1.9318597316741943, "learning_rate": 1.678100828518145e-05, "loss": 0.5033, "step": 11190 }, { "epoch": 2.203043937941039, "grad_norm": 1.3232470750808716, "learning_rate": 1.6777951022654317e-05, "loss": 0.5158, "step": 11200 }, { "epoch": 2.205010941457058, "grad_norm": 1.7988661527633667, "learning_rate": 1.6774893760127186e-05, "loss": 0.5245, "step": 11210 }, { "epoch": 2.206977944973077, "grad_norm": 1.9910075664520264, "learning_rate": 1.677183649760005e-05, "loss": 0.4869, "step": 11220 }, { "epoch": 2.2089449484890955, "grad_norm": 0.9067610502243042, "learning_rate": 1.6768779235072917e-05, "loss": 0.4429, "step": 11230 }, { "epoch": 2.210911952005114, "grad_norm": 1.7104099988937378, "learning_rate": 1.6765721972545783e-05, "loss": 0.3101, "step": 11240 }, { "epoch": 2.2128789555211332, "grad_norm": 1.0749773979187012, "learning_rate": 1.676266471001865e-05, "loss": 0.6143, "step": 11250 }, { "epoch": 2.214845959037152, "grad_norm": 1.470632791519165, "learning_rate": 1.6759607447491518e-05, "loss": 0.2978, "step": 11260 }, { "epoch": 2.2168129625531705, "grad_norm": 1.5304147005081177, "learning_rate": 1.6756550184964384e-05, "loss": 0.4798, "step": 11270 }, { "epoch": 2.218779966069189, "grad_norm": 1.8575870990753174, "learning_rate": 1.675349292243725e-05, "loss": 0.3762, "step": 11280 }, { "epoch": 2.2207469695852082, "grad_norm": 1.5989304780960083, "learning_rate": 1.675043565991012e-05, "loss": 0.388, "step": 11290 }, { "epoch": 2.222713973101227, "grad_norm": 0.9317789077758789, "learning_rate": 1.6747378397382985e-05, "loss": 0.5609, "step": 11300 }, { "epoch": 2.2246809766172455, "grad_norm": 1.4716814756393433, "learning_rate": 1.6744321134855854e-05, "loss": 0.4376, "step": 11310 }, { "epoch": 2.2266479801332646, "grad_norm": 0.7102442383766174, "learning_rate": 1.674126387232872e-05, "loss": 0.5373, "step": 11320 }, { "epoch": 2.2286149836492832, "grad_norm": 0.704011857509613, "learning_rate": 1.6738206609801585e-05, "loss": 0.3751, "step": 11330 }, { "epoch": 2.230581987165302, "grad_norm": 1.789819598197937, "learning_rate": 1.673514934727445e-05, "loss": 0.4251, "step": 11340 }, { "epoch": 2.232548990681321, "grad_norm": 2.0482563972473145, "learning_rate": 1.6732092084747317e-05, "loss": 0.3117, "step": 11350 }, { "epoch": 2.2345159941973396, "grad_norm": 1.1816494464874268, "learning_rate": 1.6729034822220186e-05, "loss": 0.4754, "step": 11360 }, { "epoch": 2.2364829977133582, "grad_norm": 0.9137541055679321, "learning_rate": 1.6725977559693052e-05, "loss": 0.3389, "step": 11370 }, { "epoch": 2.2384500012293773, "grad_norm": 3.108690023422241, "learning_rate": 1.6722920297165918e-05, "loss": 0.502, "step": 11380 }, { "epoch": 2.240417004745396, "grad_norm": 1.4583312273025513, "learning_rate": 1.6719863034638787e-05, "loss": 0.3701, "step": 11390 }, { "epoch": 2.2423840082614146, "grad_norm": 0.734485387802124, "learning_rate": 1.6716805772111652e-05, "loss": 0.4625, "step": 11400 }, { "epoch": 2.2443510117774337, "grad_norm": 1.41990327835083, "learning_rate": 1.6713748509584518e-05, "loss": 0.4124, "step": 11410 }, { "epoch": 2.2463180152934523, "grad_norm": 1.3609710931777954, "learning_rate": 1.6710691247057387e-05, "loss": 0.3965, "step": 11420 }, { "epoch": 2.248285018809471, "grad_norm": 0.8547394871711731, "learning_rate": 1.6707633984530253e-05, "loss": 0.4872, "step": 11430 }, { "epoch": 2.25025202232549, "grad_norm": 0.9560080170631409, "learning_rate": 1.6704576722003122e-05, "loss": 0.4059, "step": 11440 }, { "epoch": 2.2522190258415087, "grad_norm": 1.0539902448654175, "learning_rate": 1.6701519459475985e-05, "loss": 0.3862, "step": 11450 }, { "epoch": 2.2541860293575273, "grad_norm": 1.8969827890396118, "learning_rate": 1.6698462196948854e-05, "loss": 0.3707, "step": 11460 }, { "epoch": 2.2561530328735464, "grad_norm": 1.145606517791748, "learning_rate": 1.669540493442172e-05, "loss": 0.3504, "step": 11470 }, { "epoch": 2.258120036389565, "grad_norm": 1.8824901580810547, "learning_rate": 1.6692347671894585e-05, "loss": 0.3876, "step": 11480 }, { "epoch": 2.2600870399055837, "grad_norm": 2.9286253452301025, "learning_rate": 1.6689290409367455e-05, "loss": 0.4352, "step": 11490 }, { "epoch": 2.2620540434216028, "grad_norm": 1.540687918663025, "learning_rate": 1.668623314684032e-05, "loss": 0.4271, "step": 11500 }, { "epoch": 2.2620540434216028, "eval_loss": 0.20973175764083862, "eval_runtime": 8.8684, "eval_samples_per_second": 5.638, "eval_steps_per_second": 2.819, "step": 11500 }, { "epoch": 2.2640210469376214, "grad_norm": 1.8869454860687256, "learning_rate": 1.6683175884313186e-05, "loss": 0.3653, "step": 11510 }, { "epoch": 2.26598805045364, "grad_norm": 1.647462010383606, "learning_rate": 1.6680118621786055e-05, "loss": 0.3849, "step": 11520 }, { "epoch": 2.267955053969659, "grad_norm": 1.2821617126464844, "learning_rate": 1.667706135925892e-05, "loss": 0.4158, "step": 11530 }, { "epoch": 2.2699220574856778, "grad_norm": 0.9892310500144958, "learning_rate": 1.6674004096731787e-05, "loss": 0.4286, "step": 11540 }, { "epoch": 2.2718890610016964, "grad_norm": 1.099701166152954, "learning_rate": 1.6670946834204656e-05, "loss": 0.5114, "step": 11550 }, { "epoch": 2.2738560645177155, "grad_norm": 1.2315559387207031, "learning_rate": 1.6667889571677522e-05, "loss": 0.3438, "step": 11560 }, { "epoch": 2.275823068033734, "grad_norm": 1.3679817914962769, "learning_rate": 1.6664832309150388e-05, "loss": 0.3972, "step": 11570 }, { "epoch": 2.2777900715497528, "grad_norm": 1.3526530265808105, "learning_rate": 1.6661775046623253e-05, "loss": 0.5465, "step": 11580 }, { "epoch": 2.279757075065772, "grad_norm": 2.072378396987915, "learning_rate": 1.6658717784096123e-05, "loss": 0.4384, "step": 11590 }, { "epoch": 2.2817240785817905, "grad_norm": 2.052748918533325, "learning_rate": 1.665566052156899e-05, "loss": 0.4023, "step": 11600 }, { "epoch": 2.283691082097809, "grad_norm": 2.0281856060028076, "learning_rate": 1.6652603259041854e-05, "loss": 0.3589, "step": 11610 }, { "epoch": 2.285658085613828, "grad_norm": 1.2389588356018066, "learning_rate": 1.6649545996514723e-05, "loss": 0.3531, "step": 11620 }, { "epoch": 2.287625089129847, "grad_norm": 1.8695019483566284, "learning_rate": 1.664648873398759e-05, "loss": 0.4811, "step": 11630 }, { "epoch": 2.2895920926458655, "grad_norm": 1.843996524810791, "learning_rate": 1.6643431471460455e-05, "loss": 0.4894, "step": 11640 }, { "epoch": 2.2915590961618846, "grad_norm": 1.7340086698532104, "learning_rate": 1.6640374208933324e-05, "loss": 0.3934, "step": 11650 }, { "epoch": 2.293526099677903, "grad_norm": 0.9214049577713013, "learning_rate": 1.663731694640619e-05, "loss": 0.4839, "step": 11660 }, { "epoch": 2.295493103193922, "grad_norm": 1.3762463331222534, "learning_rate": 1.6634259683879056e-05, "loss": 0.376, "step": 11670 }, { "epoch": 2.2974601067099405, "grad_norm": 1.5327290296554565, "learning_rate": 1.663120242135192e-05, "loss": 0.4463, "step": 11680 }, { "epoch": 2.2994271102259596, "grad_norm": 1.4228308200836182, "learning_rate": 1.662814515882479e-05, "loss": 0.4595, "step": 11690 }, { "epoch": 2.301394113741978, "grad_norm": 0.9541878700256348, "learning_rate": 1.6625087896297656e-05, "loss": 0.4507, "step": 11700 }, { "epoch": 2.303361117257997, "grad_norm": 1.2874113321304321, "learning_rate": 1.6622030633770522e-05, "loss": 0.4727, "step": 11710 }, { "epoch": 2.305328120774016, "grad_norm": 1.3238129615783691, "learning_rate": 1.661897337124339e-05, "loss": 0.4588, "step": 11720 }, { "epoch": 2.3072951242900346, "grad_norm": 1.3692721128463745, "learning_rate": 1.6615916108716257e-05, "loss": 0.4373, "step": 11730 }, { "epoch": 2.309262127806053, "grad_norm": 2.104457139968872, "learning_rate": 1.6612858846189123e-05, "loss": 0.3013, "step": 11740 }, { "epoch": 2.3112291313220723, "grad_norm": 1.6918872594833374, "learning_rate": 1.6609801583661992e-05, "loss": 0.3407, "step": 11750 }, { "epoch": 2.313196134838091, "grad_norm": 1.5859813690185547, "learning_rate": 1.6606744321134858e-05, "loss": 0.2991, "step": 11760 }, { "epoch": 2.3151631383541096, "grad_norm": 2.3670969009399414, "learning_rate": 1.6603687058607723e-05, "loss": 0.3871, "step": 11770 }, { "epoch": 2.3171301418701287, "grad_norm": 1.1432586908340454, "learning_rate": 1.6600629796080593e-05, "loss": 0.4183, "step": 11780 }, { "epoch": 2.3190971453861473, "grad_norm": 2.7241263389587402, "learning_rate": 1.6597572533553455e-05, "loss": 0.3574, "step": 11790 }, { "epoch": 2.321064148902166, "grad_norm": 1.253374695777893, "learning_rate": 1.6594515271026324e-05, "loss": 0.4658, "step": 11800 }, { "epoch": 2.323031152418185, "grad_norm": 1.5033408403396606, "learning_rate": 1.659145800849919e-05, "loss": 0.4508, "step": 11810 }, { "epoch": 2.3249981559342037, "grad_norm": 2.561887502670288, "learning_rate": 1.658840074597206e-05, "loss": 0.4595, "step": 11820 }, { "epoch": 2.3269651594502223, "grad_norm": 1.2708535194396973, "learning_rate": 1.6585343483444925e-05, "loss": 0.5024, "step": 11830 }, { "epoch": 2.3289321629662414, "grad_norm": 1.9025499820709229, "learning_rate": 1.658228622091779e-05, "loss": 0.365, "step": 11840 }, { "epoch": 2.33089916648226, "grad_norm": 1.0760164260864258, "learning_rate": 1.657922895839066e-05, "loss": 0.4055, "step": 11850 }, { "epoch": 2.3328661699982787, "grad_norm": 0.8068252801895142, "learning_rate": 1.6576171695863526e-05, "loss": 0.4234, "step": 11860 }, { "epoch": 2.3348331735142978, "grad_norm": 1.0687072277069092, "learning_rate": 1.657311443333639e-05, "loss": 0.5491, "step": 11870 }, { "epoch": 2.3368001770303164, "grad_norm": 1.7262513637542725, "learning_rate": 1.657005717080926e-05, "loss": 0.4301, "step": 11880 }, { "epoch": 2.338767180546335, "grad_norm": 1.9340697526931763, "learning_rate": 1.6566999908282126e-05, "loss": 0.4087, "step": 11890 }, { "epoch": 2.340734184062354, "grad_norm": 1.6383976936340332, "learning_rate": 1.6563942645754992e-05, "loss": 0.3496, "step": 11900 }, { "epoch": 2.3427011875783728, "grad_norm": 1.1806261539459229, "learning_rate": 1.6560885383227858e-05, "loss": 0.2965, "step": 11910 }, { "epoch": 2.3446681910943914, "grad_norm": 1.8395899534225464, "learning_rate": 1.6557828120700724e-05, "loss": 0.5577, "step": 11920 }, { "epoch": 2.3466351946104105, "grad_norm": 1.3617602586746216, "learning_rate": 1.6554770858173593e-05, "loss": 0.4576, "step": 11930 }, { "epoch": 2.348602198126429, "grad_norm": 1.1605844497680664, "learning_rate": 1.655171359564646e-05, "loss": 0.3198, "step": 11940 }, { "epoch": 2.3505692016424478, "grad_norm": 1.2354375123977661, "learning_rate": 1.6548656333119328e-05, "loss": 0.568, "step": 11950 }, { "epoch": 2.352536205158467, "grad_norm": 1.2012954950332642, "learning_rate": 1.6545599070592194e-05, "loss": 0.3795, "step": 11960 }, { "epoch": 2.3545032086744855, "grad_norm": 2.271904945373535, "learning_rate": 1.654254180806506e-05, "loss": 0.3787, "step": 11970 }, { "epoch": 2.356470212190504, "grad_norm": 1.7770686149597168, "learning_rate": 1.653948454553793e-05, "loss": 0.3701, "step": 11980 }, { "epoch": 2.358437215706523, "grad_norm": 1.3162378072738647, "learning_rate": 1.6536427283010794e-05, "loss": 0.3635, "step": 11990 }, { "epoch": 2.360404219222542, "grad_norm": 0.8531973958015442, "learning_rate": 1.653337002048366e-05, "loss": 0.4221, "step": 12000 }, { "epoch": 2.360404219222542, "eval_loss": 0.21400800347328186, "eval_runtime": 8.8552, "eval_samples_per_second": 5.646, "eval_steps_per_second": 2.823, "step": 12000 }, { "epoch": 2.3623712227385605, "grad_norm": 1.781295657157898, "learning_rate": 1.653031275795653e-05, "loss": 0.3358, "step": 12010 }, { "epoch": 2.3643382262545796, "grad_norm": 2.028844118118286, "learning_rate": 1.652725549542939e-05, "loss": 0.4255, "step": 12020 }, { "epoch": 2.366305229770598, "grad_norm": 2.3487181663513184, "learning_rate": 1.652419823290226e-05, "loss": 0.5234, "step": 12030 }, { "epoch": 2.368272233286617, "grad_norm": 2.8350348472595215, "learning_rate": 1.6521140970375127e-05, "loss": 0.3041, "step": 12040 }, { "epoch": 2.370239236802636, "grad_norm": 1.8248299360275269, "learning_rate": 1.6518083707847992e-05, "loss": 0.3711, "step": 12050 }, { "epoch": 2.3722062403186546, "grad_norm": 1.7937493324279785, "learning_rate": 1.651502644532086e-05, "loss": 0.4739, "step": 12060 }, { "epoch": 2.374173243834673, "grad_norm": 1.0475170612335205, "learning_rate": 1.6511969182793727e-05, "loss": 0.4552, "step": 12070 }, { "epoch": 2.3761402473506923, "grad_norm": 1.3136638402938843, "learning_rate": 1.6508911920266596e-05, "loss": 0.3586, "step": 12080 }, { "epoch": 2.378107250866711, "grad_norm": 1.4082086086273193, "learning_rate": 1.6505854657739462e-05, "loss": 0.4826, "step": 12090 }, { "epoch": 2.3800742543827296, "grad_norm": 1.2185932397842407, "learning_rate": 1.6502797395212328e-05, "loss": 0.3852, "step": 12100 }, { "epoch": 2.3820412578987487, "grad_norm": 2.0192642211914062, "learning_rate": 1.6499740132685197e-05, "loss": 0.4272, "step": 12110 }, { "epoch": 2.3840082614147673, "grad_norm": 2.2088992595672607, "learning_rate": 1.6496682870158063e-05, "loss": 0.4102, "step": 12120 }, { "epoch": 2.385975264930786, "grad_norm": 1.1546714305877686, "learning_rate": 1.649362560763093e-05, "loss": 0.4554, "step": 12130 }, { "epoch": 2.387942268446805, "grad_norm": 0.7382022142410278, "learning_rate": 1.6490568345103795e-05, "loss": 0.3757, "step": 12140 }, { "epoch": 2.3899092719628237, "grad_norm": 1.110977053642273, "learning_rate": 1.648751108257666e-05, "loss": 0.4407, "step": 12150 }, { "epoch": 2.3918762754788423, "grad_norm": 1.028681755065918, "learning_rate": 1.648445382004953e-05, "loss": 0.3124, "step": 12160 }, { "epoch": 2.3938432789948614, "grad_norm": 1.3879059553146362, "learning_rate": 1.6481396557522395e-05, "loss": 0.4393, "step": 12170 }, { "epoch": 2.39581028251088, "grad_norm": 1.3907514810562134, "learning_rate": 1.647833929499526e-05, "loss": 0.4351, "step": 12180 }, { "epoch": 2.3977772860268987, "grad_norm": 1.410379409790039, "learning_rate": 1.647528203246813e-05, "loss": 0.5897, "step": 12190 }, { "epoch": 2.3997442895429177, "grad_norm": 2.0820980072021484, "learning_rate": 1.6472224769940996e-05, "loss": 0.4466, "step": 12200 }, { "epoch": 2.4017112930589364, "grad_norm": 1.685351014137268, "learning_rate": 1.6469167507413865e-05, "loss": 0.4181, "step": 12210 }, { "epoch": 2.403678296574955, "grad_norm": 2.2443206310272217, "learning_rate": 1.646611024488673e-05, "loss": 0.4247, "step": 12220 }, { "epoch": 2.405645300090974, "grad_norm": 1.3944865465164185, "learning_rate": 1.6463052982359597e-05, "loss": 0.3066, "step": 12230 }, { "epoch": 2.4076123036069927, "grad_norm": 1.7855195999145508, "learning_rate": 1.6459995719832466e-05, "loss": 0.5395, "step": 12240 }, { "epoch": 2.4095793071230114, "grad_norm": 1.5307120084762573, "learning_rate": 1.6456938457305328e-05, "loss": 0.3447, "step": 12250 }, { "epoch": 2.4115463106390305, "grad_norm": 2.71352219581604, "learning_rate": 1.6453881194778197e-05, "loss": 0.4779, "step": 12260 }, { "epoch": 2.413513314155049, "grad_norm": 1.4388123750686646, "learning_rate": 1.6450823932251063e-05, "loss": 0.4417, "step": 12270 }, { "epoch": 2.4154803176710677, "grad_norm": 1.343959093093872, "learning_rate": 1.644776666972393e-05, "loss": 0.5612, "step": 12280 }, { "epoch": 2.417447321187087, "grad_norm": 1.7309019565582275, "learning_rate": 1.6444709407196798e-05, "loss": 0.5677, "step": 12290 }, { "epoch": 2.4194143247031055, "grad_norm": 0.48004379868507385, "learning_rate": 1.6441652144669664e-05, "loss": 0.4653, "step": 12300 }, { "epoch": 2.421381328219124, "grad_norm": 1.704228162765503, "learning_rate": 1.643859488214253e-05, "loss": 0.414, "step": 12310 }, { "epoch": 2.423348331735143, "grad_norm": 1.2886383533477783, "learning_rate": 1.64355376196154e-05, "loss": 0.4166, "step": 12320 }, { "epoch": 2.425315335251162, "grad_norm": 1.781337857246399, "learning_rate": 1.6432480357088265e-05, "loss": 0.3569, "step": 12330 }, { "epoch": 2.4272823387671805, "grad_norm": 2.4359853267669678, "learning_rate": 1.6429423094561134e-05, "loss": 0.4891, "step": 12340 }, { "epoch": 2.4292493422831996, "grad_norm": 1.3055243492126465, "learning_rate": 1.6426365832034e-05, "loss": 0.4029, "step": 12350 }, { "epoch": 2.431216345799218, "grad_norm": 0.97089022397995, "learning_rate": 1.6423308569506865e-05, "loss": 0.4751, "step": 12360 }, { "epoch": 2.433183349315237, "grad_norm": 0.9612852931022644, "learning_rate": 1.642025130697973e-05, "loss": 0.4231, "step": 12370 }, { "epoch": 2.435150352831256, "grad_norm": 3.4028701782226562, "learning_rate": 1.6417194044452597e-05, "loss": 0.3678, "step": 12380 }, { "epoch": 2.4371173563472746, "grad_norm": 1.2526423931121826, "learning_rate": 1.6414136781925466e-05, "loss": 0.5883, "step": 12390 }, { "epoch": 2.439084359863293, "grad_norm": 1.2844873666763306, "learning_rate": 1.6411079519398332e-05, "loss": 0.4305, "step": 12400 }, { "epoch": 2.4410513633793123, "grad_norm": 0.8970216512680054, "learning_rate": 1.6408022256871198e-05, "loss": 0.3743, "step": 12410 }, { "epoch": 2.443018366895331, "grad_norm": 2.136035203933716, "learning_rate": 1.6404964994344067e-05, "loss": 0.5527, "step": 12420 }, { "epoch": 2.4449853704113496, "grad_norm": 1.0382180213928223, "learning_rate": 1.6401907731816933e-05, "loss": 0.5142, "step": 12430 }, { "epoch": 2.4469523739273686, "grad_norm": 1.2471837997436523, "learning_rate": 1.63988504692898e-05, "loss": 0.4031, "step": 12440 }, { "epoch": 2.4489193774433873, "grad_norm": 1.7783029079437256, "learning_rate": 1.6395793206762668e-05, "loss": 0.5711, "step": 12450 }, { "epoch": 2.450886380959406, "grad_norm": 2.7205777168273926, "learning_rate": 1.6392735944235533e-05, "loss": 0.5073, "step": 12460 }, { "epoch": 2.4528533844754246, "grad_norm": 0.9302681088447571, "learning_rate": 1.63896786817084e-05, "loss": 0.3801, "step": 12470 }, { "epoch": 2.4548203879914436, "grad_norm": 2.4945271015167236, "learning_rate": 1.6386621419181265e-05, "loss": 0.4482, "step": 12480 }, { "epoch": 2.4567873915074623, "grad_norm": 1.396541953086853, "learning_rate": 1.6383564156654134e-05, "loss": 0.4727, "step": 12490 }, { "epoch": 2.458754395023481, "grad_norm": 1.2617021799087524, "learning_rate": 1.6380506894127e-05, "loss": 0.5215, "step": 12500 }, { "epoch": 2.458754395023481, "eval_loss": 0.20947669446468353, "eval_runtime": 8.8627, "eval_samples_per_second": 5.642, "eval_steps_per_second": 2.821, "step": 12500 }, { "epoch": 2.4607213985395, "grad_norm": 1.8948769569396973, "learning_rate": 1.6377449631599866e-05, "loss": 0.3883, "step": 12510 }, { "epoch": 2.4626884020555186, "grad_norm": 2.941626787185669, "learning_rate": 1.6374392369072735e-05, "loss": 0.4237, "step": 12520 }, { "epoch": 2.4646554055715373, "grad_norm": 0.9059364795684814, "learning_rate": 1.63713351065456e-05, "loss": 0.3822, "step": 12530 }, { "epoch": 2.4666224090875564, "grad_norm": 1.0603015422821045, "learning_rate": 1.6368277844018466e-05, "loss": 0.3393, "step": 12540 }, { "epoch": 2.468589412603575, "grad_norm": 2.579197645187378, "learning_rate": 1.6365220581491335e-05, "loss": 0.3045, "step": 12550 }, { "epoch": 2.4705564161195936, "grad_norm": 1.5118027925491333, "learning_rate": 1.63621633189642e-05, "loss": 0.5017, "step": 12560 }, { "epoch": 2.4725234196356127, "grad_norm": 0.8895286321640015, "learning_rate": 1.6359106056437067e-05, "loss": 0.3775, "step": 12570 }, { "epoch": 2.4744904231516314, "grad_norm": 2.36152982711792, "learning_rate": 1.6356048793909933e-05, "loss": 0.3886, "step": 12580 }, { "epoch": 2.47645742666765, "grad_norm": 0.6294612884521484, "learning_rate": 1.63529915313828e-05, "loss": 0.3861, "step": 12590 }, { "epoch": 2.478424430183669, "grad_norm": 1.4751849174499512, "learning_rate": 1.6349934268855668e-05, "loss": 0.5084, "step": 12600 }, { "epoch": 2.4803914336996877, "grad_norm": 1.883037805557251, "learning_rate": 1.6346877006328534e-05, "loss": 0.3831, "step": 12610 }, { "epoch": 2.4823584372157064, "grad_norm": 0.8708747029304504, "learning_rate": 1.6343819743801403e-05, "loss": 0.489, "step": 12620 }, { "epoch": 2.4843254407317255, "grad_norm": 1.8524725437164307, "learning_rate": 1.634076248127427e-05, "loss": 0.4233, "step": 12630 }, { "epoch": 2.486292444247744, "grad_norm": 1.2213215827941895, "learning_rate": 1.6337705218747134e-05, "loss": 0.5392, "step": 12640 }, { "epoch": 2.4882594477637627, "grad_norm": 2.4590892791748047, "learning_rate": 1.6334647956220003e-05, "loss": 0.3953, "step": 12650 }, { "epoch": 2.490226451279782, "grad_norm": 1.7708888053894043, "learning_rate": 1.633159069369287e-05, "loss": 0.3311, "step": 12660 }, { "epoch": 2.4921934547958005, "grad_norm": 2.163320541381836, "learning_rate": 1.6328533431165735e-05, "loss": 0.3635, "step": 12670 }, { "epoch": 2.494160458311819, "grad_norm": 1.4477022886276245, "learning_rate": 1.6325476168638604e-05, "loss": 0.4243, "step": 12680 }, { "epoch": 2.496127461827838, "grad_norm": 1.0409399271011353, "learning_rate": 1.632241890611147e-05, "loss": 0.4322, "step": 12690 }, { "epoch": 2.498094465343857, "grad_norm": 1.2920570373535156, "learning_rate": 1.6319361643584336e-05, "loss": 0.3994, "step": 12700 }, { "epoch": 2.5000614688598755, "grad_norm": 1.5400962829589844, "learning_rate": 1.63163043810572e-05, "loss": 0.299, "step": 12710 }, { "epoch": 2.502028472375894, "grad_norm": 1.4039868116378784, "learning_rate": 1.6313247118530067e-05, "loss": 0.3499, "step": 12720 }, { "epoch": 2.503995475891913, "grad_norm": 1.696679949760437, "learning_rate": 1.6310189856002936e-05, "loss": 0.5174, "step": 12730 }, { "epoch": 2.505962479407932, "grad_norm": 1.6985901594161987, "learning_rate": 1.6307132593475802e-05, "loss": 0.4112, "step": 12740 }, { "epoch": 2.5079294829239505, "grad_norm": 0.7867997288703918, "learning_rate": 1.630407533094867e-05, "loss": 0.4532, "step": 12750 }, { "epoch": 2.5098964864399695, "grad_norm": 1.9344456195831299, "learning_rate": 1.6301018068421537e-05, "loss": 0.4035, "step": 12760 }, { "epoch": 2.511863489955988, "grad_norm": 1.574959635734558, "learning_rate": 1.6297960805894403e-05, "loss": 0.3104, "step": 12770 }, { "epoch": 2.513830493472007, "grad_norm": 1.628767967224121, "learning_rate": 1.6294903543367272e-05, "loss": 0.5006, "step": 12780 }, { "epoch": 2.515797496988026, "grad_norm": 1.0452278852462769, "learning_rate": 1.6291846280840138e-05, "loss": 0.3795, "step": 12790 }, { "epoch": 2.5177645005040445, "grad_norm": 1.067253589630127, "learning_rate": 1.6288789018313004e-05, "loss": 0.3938, "step": 12800 }, { "epoch": 2.519731504020063, "grad_norm": 2.536316156387329, "learning_rate": 1.628573175578587e-05, "loss": 0.4488, "step": 12810 }, { "epoch": 2.5216985075360823, "grad_norm": 2.0406346321105957, "learning_rate": 1.6282674493258735e-05, "loss": 0.3586, "step": 12820 }, { "epoch": 2.523665511052101, "grad_norm": 2.524869680404663, "learning_rate": 1.6279617230731604e-05, "loss": 0.3906, "step": 12830 }, { "epoch": 2.5256325145681195, "grad_norm": 1.8386890888214111, "learning_rate": 1.627655996820447e-05, "loss": 0.6009, "step": 12840 }, { "epoch": 2.5275995180841386, "grad_norm": 0.31160733103752136, "learning_rate": 1.6273502705677336e-05, "loss": 0.4166, "step": 12850 }, { "epoch": 2.5295665216001573, "grad_norm": 1.9621902704238892, "learning_rate": 1.6270445443150205e-05, "loss": 0.4007, "step": 12860 }, { "epoch": 2.531533525116176, "grad_norm": 1.396183967590332, "learning_rate": 1.626738818062307e-05, "loss": 0.4049, "step": 12870 }, { "epoch": 2.533500528632195, "grad_norm": 1.2113840579986572, "learning_rate": 1.626433091809594e-05, "loss": 0.4187, "step": 12880 }, { "epoch": 2.5354675321482136, "grad_norm": 0.6130431294441223, "learning_rate": 1.6261273655568806e-05, "loss": 0.3547, "step": 12890 }, { "epoch": 2.5374345356642323, "grad_norm": 1.4548547267913818, "learning_rate": 1.625821639304167e-05, "loss": 0.5581, "step": 12900 }, { "epoch": 2.5394015391802514, "grad_norm": 1.243184208869934, "learning_rate": 1.625515913051454e-05, "loss": 0.3434, "step": 12910 }, { "epoch": 2.54136854269627, "grad_norm": 1.3352422714233398, "learning_rate": 1.6252101867987403e-05, "loss": 0.3148, "step": 12920 }, { "epoch": 2.5433355462122886, "grad_norm": 0.9811519980430603, "learning_rate": 1.6249044605460272e-05, "loss": 0.4075, "step": 12930 }, { "epoch": 2.5453025497283077, "grad_norm": 1.1707277297973633, "learning_rate": 1.6245987342933138e-05, "loss": 0.4329, "step": 12940 }, { "epoch": 2.5472695532443264, "grad_norm": 0.9138590693473816, "learning_rate": 1.6242930080406004e-05, "loss": 0.355, "step": 12950 }, { "epoch": 2.549236556760345, "grad_norm": 1.6511414051055908, "learning_rate": 1.6239872817878873e-05, "loss": 0.3969, "step": 12960 }, { "epoch": 2.551203560276364, "grad_norm": 1.2512931823730469, "learning_rate": 1.623681555535174e-05, "loss": 0.4485, "step": 12970 }, { "epoch": 2.5531705637923827, "grad_norm": 2.979414701461792, "learning_rate": 1.6233758292824605e-05, "loss": 0.3468, "step": 12980 }, { "epoch": 2.5551375673084014, "grad_norm": 2.5046603679656982, "learning_rate": 1.6230701030297474e-05, "loss": 0.438, "step": 12990 }, { "epoch": 2.5571045708244204, "grad_norm": 1.1281431913375854, "learning_rate": 1.622764376777034e-05, "loss": 0.3836, "step": 13000 }, { "epoch": 2.5571045708244204, "eval_loss": 0.20514748990535736, "eval_runtime": 8.8701, "eval_samples_per_second": 5.637, "eval_steps_per_second": 2.818, "step": 13000 }, { "epoch": 2.559071574340439, "grad_norm": 1.0448176860809326, "learning_rate": 1.622458650524321e-05, "loss": 0.5373, "step": 13010 }, { "epoch": 2.5610385778564577, "grad_norm": 1.2854679822921753, "learning_rate": 1.6221529242716074e-05, "loss": 0.4245, "step": 13020 }, { "epoch": 2.563005581372477, "grad_norm": 1.9112696647644043, "learning_rate": 1.621847198018894e-05, "loss": 0.4786, "step": 13030 }, { "epoch": 2.5649725848884954, "grad_norm": 1.9691932201385498, "learning_rate": 1.6215414717661806e-05, "loss": 0.4054, "step": 13040 }, { "epoch": 2.566939588404514, "grad_norm": 2.541759490966797, "learning_rate": 1.6212357455134672e-05, "loss": 0.4093, "step": 13050 }, { "epoch": 2.568906591920533, "grad_norm": 1.623146414756775, "learning_rate": 1.620930019260754e-05, "loss": 0.4011, "step": 13060 }, { "epoch": 2.570873595436552, "grad_norm": 1.0942410230636597, "learning_rate": 1.6206242930080407e-05, "loss": 0.3456, "step": 13070 }, { "epoch": 2.5728405989525704, "grad_norm": 2.006178140640259, "learning_rate": 1.6203185667553273e-05, "loss": 0.4755, "step": 13080 }, { "epoch": 2.5748076024685895, "grad_norm": 1.132165789604187, "learning_rate": 1.6200128405026142e-05, "loss": 0.4762, "step": 13090 }, { "epoch": 2.576774605984608, "grad_norm": 0.8494846224784851, "learning_rate": 1.6197071142499007e-05, "loss": 0.5175, "step": 13100 }, { "epoch": 2.578741609500627, "grad_norm": 1.8866459131240845, "learning_rate": 1.6194013879971873e-05, "loss": 0.5924, "step": 13110 }, { "epoch": 2.580708613016646, "grad_norm": 0.9108813405036926, "learning_rate": 1.6190956617444742e-05, "loss": 0.4604, "step": 13120 }, { "epoch": 2.5826756165326645, "grad_norm": 1.544240951538086, "learning_rate": 1.6187899354917608e-05, "loss": 0.4162, "step": 13130 }, { "epoch": 2.584642620048683, "grad_norm": 2.693819046020508, "learning_rate": 1.6184842092390477e-05, "loss": 0.4684, "step": 13140 }, { "epoch": 2.5866096235647023, "grad_norm": 2.237470865249634, "learning_rate": 1.618178482986334e-05, "loss": 0.3965, "step": 13150 }, { "epoch": 2.588576627080721, "grad_norm": 1.9089361429214478, "learning_rate": 1.617872756733621e-05, "loss": 0.4684, "step": 13160 }, { "epoch": 2.5905436305967395, "grad_norm": 1.8669962882995605, "learning_rate": 1.6175670304809075e-05, "loss": 0.5052, "step": 13170 }, { "epoch": 2.5925106341127586, "grad_norm": 1.4717791080474854, "learning_rate": 1.617261304228194e-05, "loss": 0.4677, "step": 13180 }, { "epoch": 2.5944776376287773, "grad_norm": 1.2365857362747192, "learning_rate": 1.616955577975481e-05, "loss": 0.516, "step": 13190 }, { "epoch": 2.596444641144796, "grad_norm": 1.2148315906524658, "learning_rate": 1.6166498517227675e-05, "loss": 0.4646, "step": 13200 }, { "epoch": 2.598411644660815, "grad_norm": 1.734046459197998, "learning_rate": 1.616344125470054e-05, "loss": 0.3972, "step": 13210 }, { "epoch": 2.6003786481768336, "grad_norm": 1.1978328227996826, "learning_rate": 1.616038399217341e-05, "loss": 0.4358, "step": 13220 }, { "epoch": 2.6023456516928523, "grad_norm": 1.2464817762374878, "learning_rate": 1.6157326729646276e-05, "loss": 0.3132, "step": 13230 }, { "epoch": 2.6043126552088713, "grad_norm": 1.3837008476257324, "learning_rate": 1.6154269467119142e-05, "loss": 0.4221, "step": 13240 }, { "epoch": 2.60627965872489, "grad_norm": 1.9459171295166016, "learning_rate": 1.615121220459201e-05, "loss": 0.3874, "step": 13250 }, { "epoch": 2.6082466622409086, "grad_norm": 1.4665859937667847, "learning_rate": 1.6148154942064877e-05, "loss": 0.4835, "step": 13260 }, { "epoch": 2.6102136657569277, "grad_norm": 1.916831612586975, "learning_rate": 1.6145097679537743e-05, "loss": 0.3224, "step": 13270 }, { "epoch": 2.6121806692729463, "grad_norm": 1.1361554861068726, "learning_rate": 1.614204041701061e-05, "loss": 0.2798, "step": 13280 }, { "epoch": 2.614147672788965, "grad_norm": 1.8515132665634155, "learning_rate": 1.6138983154483478e-05, "loss": 0.4076, "step": 13290 }, { "epoch": 2.616114676304984, "grad_norm": 1.1301642656326294, "learning_rate": 1.6135925891956343e-05, "loss": 0.4611, "step": 13300 }, { "epoch": 2.6180816798210027, "grad_norm": 1.3350282907485962, "learning_rate": 1.613286862942921e-05, "loss": 0.3452, "step": 13310 }, { "epoch": 2.6200486833370213, "grad_norm": 3.7476720809936523, "learning_rate": 1.6129811366902078e-05, "loss": 0.3489, "step": 13320 }, { "epoch": 2.6220156868530404, "grad_norm": 2.181448459625244, "learning_rate": 1.6126754104374944e-05, "loss": 0.4138, "step": 13330 }, { "epoch": 2.623982690369059, "grad_norm": 0.9513285756111145, "learning_rate": 1.612369684184781e-05, "loss": 0.5012, "step": 13340 }, { "epoch": 2.6259496938850777, "grad_norm": 1.0185880661010742, "learning_rate": 1.612063957932068e-05, "loss": 0.3593, "step": 13350 }, { "epoch": 2.627916697401097, "grad_norm": 1.5732872486114502, "learning_rate": 1.6117582316793545e-05, "loss": 0.5159, "step": 13360 }, { "epoch": 2.6298837009171154, "grad_norm": 0.9264469742774963, "learning_rate": 1.611452505426641e-05, "loss": 0.3632, "step": 13370 }, { "epoch": 2.631850704433134, "grad_norm": 1.364571213722229, "learning_rate": 1.6111467791739276e-05, "loss": 0.3872, "step": 13380 }, { "epoch": 2.633817707949153, "grad_norm": 0.6632816195487976, "learning_rate": 1.6108410529212146e-05, "loss": 0.3853, "step": 13390 }, { "epoch": 2.635784711465172, "grad_norm": 1.6225327253341675, "learning_rate": 1.610535326668501e-05, "loss": 0.4046, "step": 13400 }, { "epoch": 2.6377517149811904, "grad_norm": 1.5951011180877686, "learning_rate": 1.6102296004157877e-05, "loss": 0.5356, "step": 13410 }, { "epoch": 2.6397187184972095, "grad_norm": 1.490448236465454, "learning_rate": 1.6099238741630746e-05, "loss": 0.3891, "step": 13420 }, { "epoch": 2.641685722013228, "grad_norm": 0.9575764536857605, "learning_rate": 1.6096181479103612e-05, "loss": 0.5333, "step": 13430 }, { "epoch": 2.643652725529247, "grad_norm": 1.7446562051773071, "learning_rate": 1.6093124216576478e-05, "loss": 0.4368, "step": 13440 }, { "epoch": 2.645619729045266, "grad_norm": 2.8371479511260986, "learning_rate": 1.6090066954049347e-05, "loss": 0.4029, "step": 13450 }, { "epoch": 2.6475867325612845, "grad_norm": 0.8692865371704102, "learning_rate": 1.6087009691522213e-05, "loss": 0.4666, "step": 13460 }, { "epoch": 2.649553736077303, "grad_norm": 0.8409749269485474, "learning_rate": 1.608395242899508e-05, "loss": 0.3958, "step": 13470 }, { "epoch": 2.6515207395933222, "grad_norm": 3.647979974746704, "learning_rate": 1.6080895166467948e-05, "loss": 0.3191, "step": 13480 }, { "epoch": 2.653487743109341, "grad_norm": 0.7913485169410706, "learning_rate": 1.607783790394081e-05, "loss": 0.3484, "step": 13490 }, { "epoch": 2.6554547466253595, "grad_norm": 1.9412989616394043, "learning_rate": 1.607478064141368e-05, "loss": 0.3757, "step": 13500 }, { "epoch": 2.6554547466253595, "eval_loss": 0.1958555430173874, "eval_runtime": 8.9086, "eval_samples_per_second": 5.613, "eval_steps_per_second": 2.806, "step": 13500 }, { "epoch": 2.6574217501413786, "grad_norm": 3.0294971466064453, "learning_rate": 1.6071723378886545e-05, "loss": 0.4377, "step": 13510 }, { "epoch": 2.6593887536573972, "grad_norm": 1.4375206232070923, "learning_rate": 1.6068666116359414e-05, "loss": 0.4421, "step": 13520 }, { "epoch": 2.661355757173416, "grad_norm": 2.2551164627075195, "learning_rate": 1.606560885383228e-05, "loss": 0.3295, "step": 13530 }, { "epoch": 2.663322760689435, "grad_norm": 0.9871407747268677, "learning_rate": 1.6062551591305146e-05, "loss": 0.4265, "step": 13540 }, { "epoch": 2.6652897642054536, "grad_norm": 2.210333824157715, "learning_rate": 1.6059494328778015e-05, "loss": 0.3181, "step": 13550 }, { "epoch": 2.6672567677214722, "grad_norm": 1.154691219329834, "learning_rate": 1.605643706625088e-05, "loss": 0.5528, "step": 13560 }, { "epoch": 2.6692237712374913, "grad_norm": 1.9619114398956299, "learning_rate": 1.6053379803723746e-05, "loss": 0.512, "step": 13570 }, { "epoch": 2.67119077475351, "grad_norm": 1.5608044862747192, "learning_rate": 1.6050322541196616e-05, "loss": 0.4688, "step": 13580 }, { "epoch": 2.6731577782695286, "grad_norm": 1.3780293464660645, "learning_rate": 1.604726527866948e-05, "loss": 0.3468, "step": 13590 }, { "epoch": 2.6751247817855477, "grad_norm": 1.2746591567993164, "learning_rate": 1.6044208016142347e-05, "loss": 0.5138, "step": 13600 }, { "epoch": 2.6770917853015663, "grad_norm": 1.2642594575881958, "learning_rate": 1.6041150753615213e-05, "loss": 0.4725, "step": 13610 }, { "epoch": 2.679058788817585, "grad_norm": 0.9786370992660522, "learning_rate": 1.603809349108808e-05, "loss": 0.4651, "step": 13620 }, { "epoch": 2.681025792333604, "grad_norm": 0.7670680284500122, "learning_rate": 1.6035036228560948e-05, "loss": 0.4079, "step": 13630 }, { "epoch": 2.6829927958496227, "grad_norm": 1.5032764673233032, "learning_rate": 1.6031978966033814e-05, "loss": 0.4281, "step": 13640 }, { "epoch": 2.6849597993656413, "grad_norm": 0.8874984383583069, "learning_rate": 1.6028921703506683e-05, "loss": 0.5697, "step": 13650 }, { "epoch": 2.6869268028816604, "grad_norm": 1.297289252281189, "learning_rate": 1.602586444097955e-05, "loss": 0.4133, "step": 13660 }, { "epoch": 2.688893806397679, "grad_norm": 1.6247835159301758, "learning_rate": 1.6022807178452414e-05, "loss": 0.3557, "step": 13670 }, { "epoch": 2.6908608099136977, "grad_norm": 1.0644588470458984, "learning_rate": 1.6019749915925284e-05, "loss": 0.342, "step": 13680 }, { "epoch": 2.6928278134297168, "grad_norm": 1.2811824083328247, "learning_rate": 1.601669265339815e-05, "loss": 0.4606, "step": 13690 }, { "epoch": 2.6947948169457354, "grad_norm": 0.8294884562492371, "learning_rate": 1.6013635390871015e-05, "loss": 0.4232, "step": 13700 }, { "epoch": 2.696761820461754, "grad_norm": 1.221997618675232, "learning_rate": 1.6010578128343884e-05, "loss": 0.4509, "step": 13710 }, { "epoch": 2.6987288239777727, "grad_norm": 1.3223415613174438, "learning_rate": 1.6007520865816747e-05, "loss": 0.4079, "step": 13720 }, { "epoch": 2.700695827493792, "grad_norm": 1.2597076892852783, "learning_rate": 1.6004463603289616e-05, "loss": 0.3619, "step": 13730 }, { "epoch": 2.7026628310098104, "grad_norm": 1.868239402770996, "learning_rate": 1.600140634076248e-05, "loss": 0.3647, "step": 13740 }, { "epoch": 2.704629834525829, "grad_norm": 2.783144950866699, "learning_rate": 1.5998349078235347e-05, "loss": 0.3667, "step": 13750 }, { "epoch": 2.706596838041848, "grad_norm": 1.2698179483413696, "learning_rate": 1.5995291815708217e-05, "loss": 0.3574, "step": 13760 }, { "epoch": 2.708563841557867, "grad_norm": 1.1568933725357056, "learning_rate": 1.5992234553181082e-05, "loss": 0.3992, "step": 13770 }, { "epoch": 2.7105308450738854, "grad_norm": 1.916214108467102, "learning_rate": 1.598917729065395e-05, "loss": 0.4522, "step": 13780 }, { "epoch": 2.7124978485899045, "grad_norm": 1.5209614038467407, "learning_rate": 1.5986120028126817e-05, "loss": 0.4657, "step": 13790 }, { "epoch": 2.714464852105923, "grad_norm": 1.1678006649017334, "learning_rate": 1.5983062765599683e-05, "loss": 0.3623, "step": 13800 }, { "epoch": 2.716431855621942, "grad_norm": 1.426422357559204, "learning_rate": 1.5980005503072552e-05, "loss": 0.3642, "step": 13810 }, { "epoch": 2.718398859137961, "grad_norm": 1.0496702194213867, "learning_rate": 1.5976948240545418e-05, "loss": 0.3638, "step": 13820 }, { "epoch": 2.7203658626539795, "grad_norm": 1.1832960844039917, "learning_rate": 1.5973890978018284e-05, "loss": 0.4138, "step": 13830 }, { "epoch": 2.722332866169998, "grad_norm": 1.9487724304199219, "learning_rate": 1.597083371549115e-05, "loss": 0.3491, "step": 13840 }, { "epoch": 2.7242998696860172, "grad_norm": 2.9646143913269043, "learning_rate": 1.5967776452964015e-05, "loss": 0.3892, "step": 13850 }, { "epoch": 2.726266873202036, "grad_norm": 2.2359533309936523, "learning_rate": 1.5964719190436885e-05, "loss": 0.4241, "step": 13860 }, { "epoch": 2.7282338767180545, "grad_norm": 1.4290543794631958, "learning_rate": 1.596166192790975e-05, "loss": 0.4208, "step": 13870 }, { "epoch": 2.730200880234073, "grad_norm": 0.8418980240821838, "learning_rate": 1.5958604665382616e-05, "loss": 0.4289, "step": 13880 }, { "epoch": 2.7321678837500922, "grad_norm": 1.897002100944519, "learning_rate": 1.5955547402855485e-05, "loss": 0.3951, "step": 13890 }, { "epoch": 2.734134887266111, "grad_norm": 1.466009497642517, "learning_rate": 1.595249014032835e-05, "loss": 0.4793, "step": 13900 }, { "epoch": 2.7361018907821295, "grad_norm": 1.0053349733352661, "learning_rate": 1.594943287780122e-05, "loss": 0.3532, "step": 13910 }, { "epoch": 2.7380688942981486, "grad_norm": 1.6650139093399048, "learning_rate": 1.5946375615274086e-05, "loss": 0.3795, "step": 13920 }, { "epoch": 2.7400358978141672, "grad_norm": 0.7641739845275879, "learning_rate": 1.5943318352746952e-05, "loss": 0.4616, "step": 13930 }, { "epoch": 2.742002901330186, "grad_norm": 1.5295542478561401, "learning_rate": 1.5940261090219818e-05, "loss": 0.3941, "step": 13940 }, { "epoch": 2.743969904846205, "grad_norm": 1.3583757877349854, "learning_rate": 1.5937203827692683e-05, "loss": 0.4931, "step": 13950 }, { "epoch": 2.7459369083622236, "grad_norm": 1.5385262966156006, "learning_rate": 1.5934146565165552e-05, "loss": 0.5179, "step": 13960 }, { "epoch": 2.7479039118782422, "grad_norm": 1.2843902111053467, "learning_rate": 1.5931089302638418e-05, "loss": 0.3554, "step": 13970 }, { "epoch": 2.7498709153942613, "grad_norm": 1.251584529876709, "learning_rate": 1.5928032040111284e-05, "loss": 0.3453, "step": 13980 }, { "epoch": 2.75183791891028, "grad_norm": 1.3551993370056152, "learning_rate": 1.5924974777584153e-05, "loss": 0.3805, "step": 13990 }, { "epoch": 2.7538049224262986, "grad_norm": 0.8944595456123352, "learning_rate": 1.592191751505702e-05, "loss": 0.4317, "step": 14000 }, { "epoch": 2.7538049224262986, "eval_loss": 0.19482757151126862, "eval_runtime": 8.8976, "eval_samples_per_second": 5.62, "eval_steps_per_second": 2.81, "step": 14000 }, { "epoch": 2.7557719259423177, "grad_norm": 1.8329250812530518, "learning_rate": 1.5918860252529885e-05, "loss": 0.4319, "step": 14010 }, { "epoch": 2.7577389294583363, "grad_norm": 0.7794898748397827, "learning_rate": 1.5915802990002754e-05, "loss": 0.4589, "step": 14020 }, { "epoch": 2.759705932974355, "grad_norm": 2.663428544998169, "learning_rate": 1.591274572747562e-05, "loss": 0.35, "step": 14030 }, { "epoch": 2.761672936490374, "grad_norm": 1.2760578393936157, "learning_rate": 1.590968846494849e-05, "loss": 0.3787, "step": 14040 }, { "epoch": 2.7636399400063927, "grad_norm": 0.9405483603477478, "learning_rate": 1.5906631202421355e-05, "loss": 0.3884, "step": 14050 }, { "epoch": 2.7656069435224113, "grad_norm": 0.6196711659431458, "learning_rate": 1.590357393989422e-05, "loss": 0.307, "step": 14060 }, { "epoch": 2.7675739470384304, "grad_norm": 0.8994119763374329, "learning_rate": 1.5900516677367086e-05, "loss": 0.4833, "step": 14070 }, { "epoch": 2.769540950554449, "grad_norm": 1.5671933889389038, "learning_rate": 1.5897459414839952e-05, "loss": 0.4233, "step": 14080 }, { "epoch": 2.7715079540704677, "grad_norm": 1.0534205436706543, "learning_rate": 1.589440215231282e-05, "loss": 0.5757, "step": 14090 }, { "epoch": 2.7734749575864868, "grad_norm": 1.036082148551941, "learning_rate": 1.5891344889785687e-05, "loss": 0.4192, "step": 14100 }, { "epoch": 2.7754419611025054, "grad_norm": 1.4796607494354248, "learning_rate": 1.5888287627258553e-05, "loss": 0.3789, "step": 14110 }, { "epoch": 2.777408964618524, "grad_norm": 0.9435361623764038, "learning_rate": 1.5885230364731422e-05, "loss": 0.4361, "step": 14120 }, { "epoch": 2.779375968134543, "grad_norm": 1.5666536092758179, "learning_rate": 1.5882173102204288e-05, "loss": 0.3985, "step": 14130 }, { "epoch": 2.7813429716505618, "grad_norm": 1.0621715784072876, "learning_rate": 1.5879115839677153e-05, "loss": 0.3678, "step": 14140 }, { "epoch": 2.7833099751665804, "grad_norm": 1.2044368982315063, "learning_rate": 1.5876058577150023e-05, "loss": 0.393, "step": 14150 }, { "epoch": 2.7852769786825995, "grad_norm": 1.5973824262619019, "learning_rate": 1.587300131462289e-05, "loss": 0.3445, "step": 14160 }, { "epoch": 2.787243982198618, "grad_norm": 0.9761004447937012, "learning_rate": 1.5869944052095754e-05, "loss": 0.5057, "step": 14170 }, { "epoch": 2.7892109857146368, "grad_norm": 1.2603461742401123, "learning_rate": 1.586688678956862e-05, "loss": 0.3153, "step": 14180 }, { "epoch": 2.791177989230656, "grad_norm": 1.165386438369751, "learning_rate": 1.586382952704149e-05, "loss": 0.4833, "step": 14190 }, { "epoch": 2.7931449927466745, "grad_norm": 2.618959903717041, "learning_rate": 1.5860772264514355e-05, "loss": 0.4365, "step": 14200 }, { "epoch": 2.795111996262693, "grad_norm": 1.329698920249939, "learning_rate": 1.585771500198722e-05, "loss": 0.5081, "step": 14210 }, { "epoch": 2.797078999778712, "grad_norm": 1.7404251098632812, "learning_rate": 1.585465773946009e-05, "loss": 0.3237, "step": 14220 }, { "epoch": 2.799046003294731, "grad_norm": 1.8908653259277344, "learning_rate": 1.5851600476932956e-05, "loss": 0.3845, "step": 14230 }, { "epoch": 2.8010130068107495, "grad_norm": 4.6009907722473145, "learning_rate": 1.584854321440582e-05, "loss": 0.3441, "step": 14240 }, { "epoch": 2.8029800103267686, "grad_norm": 1.369461178779602, "learning_rate": 1.584548595187869e-05, "loss": 0.3429, "step": 14250 }, { "epoch": 2.804947013842787, "grad_norm": 1.4235303401947021, "learning_rate": 1.5842428689351556e-05, "loss": 0.3876, "step": 14260 }, { "epoch": 2.806914017358806, "grad_norm": 1.7543208599090576, "learning_rate": 1.5839371426824422e-05, "loss": 0.3113, "step": 14270 }, { "epoch": 2.808881020874825, "grad_norm": 1.4244465827941895, "learning_rate": 1.5836314164297288e-05, "loss": 0.3924, "step": 14280 }, { "epoch": 2.8108480243908436, "grad_norm": 1.5675772428512573, "learning_rate": 1.5833256901770157e-05, "loss": 0.5018, "step": 14290 }, { "epoch": 2.812815027906862, "grad_norm": 1.6984508037567139, "learning_rate": 1.5830199639243023e-05, "loss": 0.4307, "step": 14300 }, { "epoch": 2.8147820314228813, "grad_norm": 1.7523006200790405, "learning_rate": 1.582714237671589e-05, "loss": 0.3569, "step": 14310 }, { "epoch": 2.8167490349389, "grad_norm": 1.2517403364181519, "learning_rate": 1.5824085114188758e-05, "loss": 0.4189, "step": 14320 }, { "epoch": 2.8187160384549186, "grad_norm": 1.5489752292633057, "learning_rate": 1.5821027851661624e-05, "loss": 0.4768, "step": 14330 }, { "epoch": 2.8206830419709377, "grad_norm": 1.4975192546844482, "learning_rate": 1.581797058913449e-05, "loss": 0.4721, "step": 14340 }, { "epoch": 2.8226500454869563, "grad_norm": 1.262944221496582, "learning_rate": 1.581491332660736e-05, "loss": 0.4397, "step": 14350 }, { "epoch": 2.824617049002975, "grad_norm": 1.9267690181732178, "learning_rate": 1.5811856064080224e-05, "loss": 0.3516, "step": 14360 }, { "epoch": 2.826584052518994, "grad_norm": 2.1961848735809326, "learning_rate": 1.580879880155309e-05, "loss": 0.5012, "step": 14370 }, { "epoch": 2.8285510560350127, "grad_norm": 2.383462905883789, "learning_rate": 1.580574153902596e-05, "loss": 0.4569, "step": 14380 }, { "epoch": 2.8305180595510313, "grad_norm": 1.119138240814209, "learning_rate": 1.5802684276498825e-05, "loss": 0.4949, "step": 14390 }, { "epoch": 2.8324850630670504, "grad_norm": 2.2834503650665283, "learning_rate": 1.579962701397169e-05, "loss": 0.3773, "step": 14400 }, { "epoch": 2.834452066583069, "grad_norm": 1.1680762767791748, "learning_rate": 1.5796569751444557e-05, "loss": 0.3865, "step": 14410 }, { "epoch": 2.8364190700990877, "grad_norm": 1.060563564300537, "learning_rate": 1.5793512488917426e-05, "loss": 0.3549, "step": 14420 }, { "epoch": 2.8383860736151068, "grad_norm": 1.6449270248413086, "learning_rate": 1.579045522639029e-05, "loss": 0.3986, "step": 14430 }, { "epoch": 2.8403530771311254, "grad_norm": 1.8094450235366821, "learning_rate": 1.5787397963863157e-05, "loss": 0.4567, "step": 14440 }, { "epoch": 2.842320080647144, "grad_norm": 1.4255567789077759, "learning_rate": 1.5784340701336026e-05, "loss": 0.3615, "step": 14450 }, { "epoch": 2.844287084163163, "grad_norm": 1.9378465414047241, "learning_rate": 1.5781283438808892e-05, "loss": 0.4763, "step": 14460 }, { "epoch": 2.8462540876791818, "grad_norm": 1.6682242155075073, "learning_rate": 1.5778226176281758e-05, "loss": 0.391, "step": 14470 }, { "epoch": 2.8482210911952004, "grad_norm": 1.55122709274292, "learning_rate": 1.5775168913754627e-05, "loss": 0.3561, "step": 14480 }, { "epoch": 2.8501880947112195, "grad_norm": 1.6349620819091797, "learning_rate": 1.5772111651227493e-05, "loss": 0.4355, "step": 14490 }, { "epoch": 2.852155098227238, "grad_norm": 1.3233345746994019, "learning_rate": 1.576905438870036e-05, "loss": 0.4785, "step": 14500 }, { "epoch": 2.852155098227238, "eval_loss": 0.1918381303548813, "eval_runtime": 8.8922, "eval_samples_per_second": 5.623, "eval_steps_per_second": 2.811, "step": 14500 }, { "epoch": 2.8541221017432568, "grad_norm": 1.6185466051101685, "learning_rate": 1.5765997126173224e-05, "loss": 0.4041, "step": 14510 }, { "epoch": 2.856089105259276, "grad_norm": 1.1693966388702393, "learning_rate": 1.576293986364609e-05, "loss": 0.4266, "step": 14520 }, { "epoch": 2.8580561087752945, "grad_norm": 1.2927526235580444, "learning_rate": 1.575988260111896e-05, "loss": 0.4679, "step": 14530 }, { "epoch": 2.860023112291313, "grad_norm": 1.0233153104782104, "learning_rate": 1.5756825338591825e-05, "loss": 0.4132, "step": 14540 }, { "epoch": 2.861990115807332, "grad_norm": 1.5996166467666626, "learning_rate": 1.5753768076064694e-05, "loss": 0.5182, "step": 14550 }, { "epoch": 2.863957119323351, "grad_norm": 1.4874346256256104, "learning_rate": 1.575071081353756e-05, "loss": 0.4805, "step": 14560 }, { "epoch": 2.8659241228393695, "grad_norm": 0.8183672428131104, "learning_rate": 1.5747653551010426e-05, "loss": 0.3395, "step": 14570 }, { "epoch": 2.8678911263553886, "grad_norm": 1.1641387939453125, "learning_rate": 1.5744596288483295e-05, "loss": 0.4027, "step": 14580 }, { "epoch": 2.869858129871407, "grad_norm": 0.9957535862922668, "learning_rate": 1.574153902595616e-05, "loss": 0.4174, "step": 14590 }, { "epoch": 2.871825133387426, "grad_norm": 2.292351007461548, "learning_rate": 1.5738481763429027e-05, "loss": 0.4162, "step": 14600 }, { "epoch": 2.873792136903445, "grad_norm": 1.9051101207733154, "learning_rate": 1.5735424500901896e-05, "loss": 0.3827, "step": 14610 }, { "epoch": 2.8757591404194636, "grad_norm": 2.993645668029785, "learning_rate": 1.5732367238374758e-05, "loss": 0.4401, "step": 14620 }, { "epoch": 2.877726143935482, "grad_norm": 1.3731578588485718, "learning_rate": 1.5729309975847627e-05, "loss": 0.4085, "step": 14630 }, { "epoch": 2.8796931474515013, "grad_norm": 1.1569373607635498, "learning_rate": 1.5726252713320493e-05, "loss": 0.4803, "step": 14640 }, { "epoch": 2.88166015096752, "grad_norm": 1.196911334991455, "learning_rate": 1.572319545079336e-05, "loss": 0.5282, "step": 14650 }, { "epoch": 2.8836271544835386, "grad_norm": 1.6104505062103271, "learning_rate": 1.5720138188266228e-05, "loss": 0.5026, "step": 14660 }, { "epoch": 2.8855941579995577, "grad_norm": 1.437827229499817, "learning_rate": 1.5717080925739094e-05, "loss": 0.5583, "step": 14670 }, { "epoch": 2.8875611615155763, "grad_norm": 1.507562518119812, "learning_rate": 1.5714023663211963e-05, "loss": 0.3675, "step": 14680 }, { "epoch": 2.889528165031595, "grad_norm": 1.2080801725387573, "learning_rate": 1.571096640068483e-05, "loss": 0.4364, "step": 14690 }, { "epoch": 2.891495168547614, "grad_norm": 1.4376025199890137, "learning_rate": 1.5707909138157695e-05, "loss": 0.3281, "step": 14700 }, { "epoch": 2.8934621720636327, "grad_norm": 0.8100415468215942, "learning_rate": 1.5704851875630564e-05, "loss": 0.343, "step": 14710 }, { "epoch": 2.8954291755796513, "grad_norm": 1.3493585586547852, "learning_rate": 1.570179461310343e-05, "loss": 0.5444, "step": 14720 }, { "epoch": 2.8973961790956704, "grad_norm": 0.9113426208496094, "learning_rate": 1.5698737350576295e-05, "loss": 0.3933, "step": 14730 }, { "epoch": 2.899363182611689, "grad_norm": 0.9956138134002686, "learning_rate": 1.569568008804916e-05, "loss": 0.3959, "step": 14740 }, { "epoch": 2.9013301861277077, "grad_norm": 1.8397066593170166, "learning_rate": 1.5692622825522027e-05, "loss": 0.4507, "step": 14750 }, { "epoch": 2.9032971896437267, "grad_norm": 1.2363187074661255, "learning_rate": 1.5689565562994896e-05, "loss": 0.4155, "step": 14760 }, { "epoch": 2.9052641931597454, "grad_norm": 1.7232961654663086, "learning_rate": 1.5686508300467762e-05, "loss": 0.3256, "step": 14770 }, { "epoch": 2.907231196675764, "grad_norm": 2.252438545227051, "learning_rate": 1.5683451037940628e-05, "loss": 0.6026, "step": 14780 }, { "epoch": 2.909198200191783, "grad_norm": 3.091703414916992, "learning_rate": 1.5680393775413497e-05, "loss": 0.5315, "step": 14790 }, { "epoch": 2.9111652037078017, "grad_norm": 0.9878594279289246, "learning_rate": 1.5677336512886362e-05, "loss": 0.4185, "step": 14800 }, { "epoch": 2.9131322072238204, "grad_norm": 2.0725889205932617, "learning_rate": 1.567427925035923e-05, "loss": 0.4641, "step": 14810 }, { "epoch": 2.9150992107398395, "grad_norm": 0.8847138285636902, "learning_rate": 1.5671221987832097e-05, "loss": 0.4097, "step": 14820 }, { "epoch": 2.917066214255858, "grad_norm": 2.7409422397613525, "learning_rate": 1.5668164725304963e-05, "loss": 0.3762, "step": 14830 }, { "epoch": 2.9190332177718767, "grad_norm": 1.3184597492218018, "learning_rate": 1.5665107462777832e-05, "loss": 0.3953, "step": 14840 }, { "epoch": 2.921000221287896, "grad_norm": 1.068154215812683, "learning_rate": 1.5662050200250695e-05, "loss": 0.5554, "step": 14850 }, { "epoch": 2.9229672248039145, "grad_norm": 1.1467418670654297, "learning_rate": 1.5658992937723564e-05, "loss": 0.4592, "step": 14860 }, { "epoch": 2.924934228319933, "grad_norm": 1.156706690788269, "learning_rate": 1.565593567519643e-05, "loss": 0.3614, "step": 14870 }, { "epoch": 2.9269012318359517, "grad_norm": 1.4045031070709229, "learning_rate": 1.5652878412669295e-05, "loss": 0.3408, "step": 14880 }, { "epoch": 2.928868235351971, "grad_norm": 1.043555498123169, "learning_rate": 1.5649821150142165e-05, "loss": 0.3368, "step": 14890 }, { "epoch": 2.9308352388679895, "grad_norm": 1.4246408939361572, "learning_rate": 1.564676388761503e-05, "loss": 0.3734, "step": 14900 }, { "epoch": 2.932802242384008, "grad_norm": 1.4614734649658203, "learning_rate": 1.5643706625087896e-05, "loss": 0.4164, "step": 14910 }, { "epoch": 2.934769245900027, "grad_norm": 1.4192919731140137, "learning_rate": 1.5640649362560765e-05, "loss": 0.487, "step": 14920 }, { "epoch": 2.936736249416046, "grad_norm": 1.1312637329101562, "learning_rate": 1.563759210003363e-05, "loss": 0.4026, "step": 14930 }, { "epoch": 2.9387032529320645, "grad_norm": 1.9668546915054321, "learning_rate": 1.56345348375065e-05, "loss": 0.4553, "step": 14940 }, { "epoch": 2.9406702564480836, "grad_norm": 0.8810634613037109, "learning_rate": 1.5631477574979366e-05, "loss": 0.429, "step": 14950 }, { "epoch": 2.942637259964102, "grad_norm": 1.1697512865066528, "learning_rate": 1.5628420312452232e-05, "loss": 0.3331, "step": 14960 }, { "epoch": 2.944604263480121, "grad_norm": 0.7395417094230652, "learning_rate": 1.5625363049925098e-05, "loss": 0.3326, "step": 14970 }, { "epoch": 2.94657126699614, "grad_norm": 1.3220033645629883, "learning_rate": 1.5622305787397963e-05, "loss": 0.5424, "step": 14980 }, { "epoch": 2.9485382705121586, "grad_norm": 1.2082237005233765, "learning_rate": 1.5619248524870833e-05, "loss": 0.3298, "step": 14990 }, { "epoch": 2.950505274028177, "grad_norm": 1.1049548387527466, "learning_rate": 1.56161912623437e-05, "loss": 0.4071, "step": 15000 }, { "epoch": 2.950505274028177, "eval_loss": 0.19573713839054108, "eval_runtime": 8.8769, "eval_samples_per_second": 5.633, "eval_steps_per_second": 2.816, "step": 15000 }, { "epoch": 2.952472277544196, "grad_norm": 1.520330786705017, "learning_rate": 1.5613133999816564e-05, "loss": 0.4873, "step": 15010 }, { "epoch": 2.954439281060215, "grad_norm": 2.7328922748565674, "learning_rate": 1.5610076737289433e-05, "loss": 0.3784, "step": 15020 }, { "epoch": 2.9564062845762336, "grad_norm": 1.440152645111084, "learning_rate": 1.56070194747623e-05, "loss": 0.3922, "step": 15030 }, { "epoch": 2.958373288092252, "grad_norm": 3.501024007797241, "learning_rate": 1.5603962212235165e-05, "loss": 0.3912, "step": 15040 }, { "epoch": 2.9603402916082713, "grad_norm": 2.0499727725982666, "learning_rate": 1.5600904949708034e-05, "loss": 0.403, "step": 15050 }, { "epoch": 2.96230729512429, "grad_norm": 1.093933343887329, "learning_rate": 1.55978476871809e-05, "loss": 0.4449, "step": 15060 }, { "epoch": 2.9642742986403086, "grad_norm": 2.806871175765991, "learning_rate": 1.559479042465377e-05, "loss": 0.3951, "step": 15070 }, { "epoch": 2.9662413021563276, "grad_norm": 1.279954433441162, "learning_rate": 1.559173316212663e-05, "loss": 0.3745, "step": 15080 }, { "epoch": 2.9682083056723463, "grad_norm": 1.3171770572662354, "learning_rate": 1.55886758995995e-05, "loss": 0.6198, "step": 15090 }, { "epoch": 2.970175309188365, "grad_norm": 1.5054320096969604, "learning_rate": 1.5585618637072366e-05, "loss": 0.4462, "step": 15100 }, { "epoch": 2.972142312704384, "grad_norm": 2.4180641174316406, "learning_rate": 1.5582561374545232e-05, "loss": 0.3919, "step": 15110 }, { "epoch": 2.9741093162204026, "grad_norm": 1.1554861068725586, "learning_rate": 1.55795041120181e-05, "loss": 0.4175, "step": 15120 }, { "epoch": 2.9760763197364213, "grad_norm": 1.2998076677322388, "learning_rate": 1.5576446849490967e-05, "loss": 0.3055, "step": 15130 }, { "epoch": 2.9780433232524404, "grad_norm": 1.6554224491119385, "learning_rate": 1.5573389586963833e-05, "loss": 0.3742, "step": 15140 }, { "epoch": 2.980010326768459, "grad_norm": 1.5794579982757568, "learning_rate": 1.5570332324436702e-05, "loss": 0.3416, "step": 15150 }, { "epoch": 2.9819773302844776, "grad_norm": 1.393416404724121, "learning_rate": 1.5567275061909568e-05, "loss": 0.3949, "step": 15160 }, { "epoch": 2.9839443338004967, "grad_norm": 1.9517128467559814, "learning_rate": 1.5564217799382434e-05, "loss": 0.4206, "step": 15170 }, { "epoch": 2.9859113373165154, "grad_norm": 1.381700873374939, "learning_rate": 1.5561160536855303e-05, "loss": 0.3732, "step": 15180 }, { "epoch": 2.987878340832534, "grad_norm": 2.750070095062256, "learning_rate": 1.555810327432817e-05, "loss": 0.4142, "step": 15190 }, { "epoch": 2.989845344348553, "grad_norm": 1.1984093189239502, "learning_rate": 1.5555046011801034e-05, "loss": 0.5847, "step": 15200 }, { "epoch": 2.9918123478645717, "grad_norm": 0.7176037430763245, "learning_rate": 1.55519887492739e-05, "loss": 0.324, "step": 15210 }, { "epoch": 2.9937793513805904, "grad_norm": 1.6049420833587646, "learning_rate": 1.554893148674677e-05, "loss": 0.4589, "step": 15220 }, { "epoch": 2.9957463548966095, "grad_norm": 0.9500333070755005, "learning_rate": 1.5545874224219635e-05, "loss": 0.3222, "step": 15230 }, { "epoch": 2.997713358412628, "grad_norm": 2.22035551071167, "learning_rate": 1.55428169616925e-05, "loss": 0.4337, "step": 15240 }, { "epoch": 2.9996803619286467, "grad_norm": 0.9914708733558655, "learning_rate": 1.553975969916537e-05, "loss": 0.435, "step": 15250 }, { "epoch": 3.001647365444666, "grad_norm": 1.351025938987732, "learning_rate": 1.5536702436638236e-05, "loss": 0.3849, "step": 15260 }, { "epoch": 3.0036143689606845, "grad_norm": 1.5382455587387085, "learning_rate": 1.55336451741111e-05, "loss": 0.3485, "step": 15270 }, { "epoch": 3.005581372476703, "grad_norm": 1.7761247158050537, "learning_rate": 1.553058791158397e-05, "loss": 0.3946, "step": 15280 }, { "epoch": 3.007548375992722, "grad_norm": 1.7743333578109741, "learning_rate": 1.5527530649056836e-05, "loss": 0.4532, "step": 15290 }, { "epoch": 3.009515379508741, "grad_norm": 1.4908400774002075, "learning_rate": 1.5524473386529702e-05, "loss": 0.3707, "step": 15300 }, { "epoch": 3.0114823830247595, "grad_norm": 1.8430463075637817, "learning_rate": 1.5521416124002568e-05, "loss": 0.5074, "step": 15310 }, { "epoch": 3.0134493865407785, "grad_norm": 1.3990598917007446, "learning_rate": 1.5518358861475437e-05, "loss": 0.5073, "step": 15320 }, { "epoch": 3.015416390056797, "grad_norm": 1.564259648323059, "learning_rate": 1.5515301598948303e-05, "loss": 0.3635, "step": 15330 }, { "epoch": 3.017383393572816, "grad_norm": 2.144291877746582, "learning_rate": 1.551224433642117e-05, "loss": 0.408, "step": 15340 }, { "epoch": 3.019350397088835, "grad_norm": 0.6599649786949158, "learning_rate": 1.5509187073894038e-05, "loss": 0.4184, "step": 15350 }, { "epoch": 3.0213174006048535, "grad_norm": 1.5132403373718262, "learning_rate": 1.5506129811366904e-05, "loss": 0.3938, "step": 15360 }, { "epoch": 3.023284404120872, "grad_norm": 1.6984745264053345, "learning_rate": 1.550307254883977e-05, "loss": 0.5159, "step": 15370 }, { "epoch": 3.0252514076368913, "grad_norm": 0.92393958568573, "learning_rate": 1.550001528631264e-05, "loss": 0.399, "step": 15380 }, { "epoch": 3.02721841115291, "grad_norm": 1.139657974243164, "learning_rate": 1.5496958023785504e-05, "loss": 0.3417, "step": 15390 }, { "epoch": 3.0291854146689285, "grad_norm": 0.8047592043876648, "learning_rate": 1.549390076125837e-05, "loss": 0.4166, "step": 15400 }, { "epoch": 3.0311524181849476, "grad_norm": 2.0489370822906494, "learning_rate": 1.549084349873124e-05, "loss": 0.3989, "step": 15410 }, { "epoch": 3.0331194217009663, "grad_norm": 1.3521877527236938, "learning_rate": 1.5487786236204102e-05, "loss": 0.3956, "step": 15420 }, { "epoch": 3.035086425216985, "grad_norm": 1.4160988330841064, "learning_rate": 1.548472897367697e-05, "loss": 0.3439, "step": 15430 }, { "epoch": 3.037053428733004, "grad_norm": 1.4726520776748657, "learning_rate": 1.5481671711149837e-05, "loss": 0.3935, "step": 15440 }, { "epoch": 3.0390204322490226, "grad_norm": 0.754563570022583, "learning_rate": 1.5478614448622702e-05, "loss": 0.5188, "step": 15450 }, { "epoch": 3.0409874357650413, "grad_norm": 3.205728769302368, "learning_rate": 1.547555718609557e-05, "loss": 0.3125, "step": 15460 }, { "epoch": 3.0429544392810604, "grad_norm": 2.270024538040161, "learning_rate": 1.5472499923568437e-05, "loss": 0.512, "step": 15470 }, { "epoch": 3.044921442797079, "grad_norm": 2.011733055114746, "learning_rate": 1.5469442661041307e-05, "loss": 0.3769, "step": 15480 }, { "epoch": 3.0468884463130976, "grad_norm": 1.1816586256027222, "learning_rate": 1.5466385398514172e-05, "loss": 0.3847, "step": 15490 }, { "epoch": 3.0488554498291167, "grad_norm": 1.7889635562896729, "learning_rate": 1.5463328135987038e-05, "loss": 0.3609, "step": 15500 }, { "epoch": 3.0488554498291167, "eval_loss": 0.1888391375541687, "eval_runtime": 8.8733, "eval_samples_per_second": 5.635, "eval_steps_per_second": 2.817, "step": 15500 }, { "epoch": 3.0508224533451354, "grad_norm": 1.3694669008255005, "learning_rate": 1.5460270873459907e-05, "loss": 0.4962, "step": 15510 }, { "epoch": 3.052789456861154, "grad_norm": 0.9643653035163879, "learning_rate": 1.5457213610932773e-05, "loss": 0.4182, "step": 15520 }, { "epoch": 3.054756460377173, "grad_norm": 1.9267834424972534, "learning_rate": 1.545415634840564e-05, "loss": 0.4511, "step": 15530 }, { "epoch": 3.0567234638931917, "grad_norm": 1.3095884323120117, "learning_rate": 1.5451099085878505e-05, "loss": 0.3976, "step": 15540 }, { "epoch": 3.0586904674092104, "grad_norm": 0.8985733389854431, "learning_rate": 1.544804182335137e-05, "loss": 0.4533, "step": 15550 }, { "epoch": 3.0606574709252294, "grad_norm": 1.9100348949432373, "learning_rate": 1.544498456082424e-05, "loss": 0.3699, "step": 15560 }, { "epoch": 3.062624474441248, "grad_norm": 2.1582112312316895, "learning_rate": 1.5441927298297105e-05, "loss": 0.3652, "step": 15570 }, { "epoch": 3.0645914779572667, "grad_norm": 0.8692134022712708, "learning_rate": 1.543887003576997e-05, "loss": 0.4368, "step": 15580 }, { "epoch": 3.066558481473286, "grad_norm": 0.8661279082298279, "learning_rate": 1.543581277324284e-05, "loss": 0.3397, "step": 15590 }, { "epoch": 3.0685254849893044, "grad_norm": 0.8110594153404236, "learning_rate": 1.5432755510715706e-05, "loss": 0.3904, "step": 15600 }, { "epoch": 3.070492488505323, "grad_norm": 1.5086452960968018, "learning_rate": 1.5429698248188575e-05, "loss": 0.3595, "step": 15610 }, { "epoch": 3.072459492021342, "grad_norm": 1.3329867124557495, "learning_rate": 1.542664098566144e-05, "loss": 0.4274, "step": 15620 }, { "epoch": 3.074426495537361, "grad_norm": 1.3834015130996704, "learning_rate": 1.5423583723134307e-05, "loss": 0.3732, "step": 15630 }, { "epoch": 3.0763934990533794, "grad_norm": 1.0691471099853516, "learning_rate": 1.5420526460607173e-05, "loss": 0.4107, "step": 15640 }, { "epoch": 3.0783605025693985, "grad_norm": 2.663893938064575, "learning_rate": 1.541746919808004e-05, "loss": 0.4178, "step": 15650 }, { "epoch": 3.080327506085417, "grad_norm": 1.1573967933654785, "learning_rate": 1.5414411935552907e-05, "loss": 0.3972, "step": 15660 }, { "epoch": 3.082294509601436, "grad_norm": 1.3600716590881348, "learning_rate": 1.5411354673025773e-05, "loss": 0.361, "step": 15670 }, { "epoch": 3.084261513117455, "grad_norm": 1.168461799621582, "learning_rate": 1.540829741049864e-05, "loss": 0.4059, "step": 15680 }, { "epoch": 3.0862285166334735, "grad_norm": 1.0328609943389893, "learning_rate": 1.5405240147971508e-05, "loss": 0.491, "step": 15690 }, { "epoch": 3.088195520149492, "grad_norm": 1.7007650136947632, "learning_rate": 1.5402182885444374e-05, "loss": 0.3999, "step": 15700 }, { "epoch": 3.0901625236655113, "grad_norm": 1.2272205352783203, "learning_rate": 1.539912562291724e-05, "loss": 0.2953, "step": 15710 }, { "epoch": 3.09212952718153, "grad_norm": 1.079796314239502, "learning_rate": 1.539606836039011e-05, "loss": 0.3475, "step": 15720 }, { "epoch": 3.0940965306975485, "grad_norm": 0.5963640213012695, "learning_rate": 1.5393011097862975e-05, "loss": 0.4842, "step": 15730 }, { "epoch": 3.0960635342135676, "grad_norm": 1.291596531867981, "learning_rate": 1.5389953835335844e-05, "loss": 0.4387, "step": 15740 }, { "epoch": 3.0980305377295863, "grad_norm": 1.0582354068756104, "learning_rate": 1.538689657280871e-05, "loss": 0.3131, "step": 15750 }, { "epoch": 3.099997541245605, "grad_norm": 1.327975869178772, "learning_rate": 1.5383839310281575e-05, "loss": 0.3869, "step": 15760 }, { "epoch": 3.101964544761624, "grad_norm": 3.4642159938812256, "learning_rate": 1.538078204775444e-05, "loss": 0.4543, "step": 15770 }, { "epoch": 3.1039315482776426, "grad_norm": 1.4730603694915771, "learning_rate": 1.5377724785227307e-05, "loss": 0.3647, "step": 15780 }, { "epoch": 3.1058985517936613, "grad_norm": 3.070542573928833, "learning_rate": 1.5374667522700176e-05, "loss": 0.409, "step": 15790 }, { "epoch": 3.1078655553096803, "grad_norm": 1.379279375076294, "learning_rate": 1.5371610260173042e-05, "loss": 0.2932, "step": 15800 }, { "epoch": 3.109832558825699, "grad_norm": 1.639320969581604, "learning_rate": 1.5368552997645908e-05, "loss": 0.3491, "step": 15810 }, { "epoch": 3.1117995623417176, "grad_norm": 1.58949875831604, "learning_rate": 1.5365495735118777e-05, "loss": 0.3182, "step": 15820 }, { "epoch": 3.1137665658577363, "grad_norm": 1.9512661695480347, "learning_rate": 1.5362438472591643e-05, "loss": 0.342, "step": 15830 }, { "epoch": 3.1157335693737553, "grad_norm": 1.656012773513794, "learning_rate": 1.535938121006451e-05, "loss": 0.409, "step": 15840 }, { "epoch": 3.117700572889774, "grad_norm": 1.1831879615783691, "learning_rate": 1.5356323947537378e-05, "loss": 0.321, "step": 15850 }, { "epoch": 3.1196675764057926, "grad_norm": 1.4055463075637817, "learning_rate": 1.5353266685010243e-05, "loss": 0.3609, "step": 15860 }, { "epoch": 3.1216345799218117, "grad_norm": 1.3060204982757568, "learning_rate": 1.535020942248311e-05, "loss": 0.3488, "step": 15870 }, { "epoch": 3.1236015834378303, "grad_norm": 1.2678416967391968, "learning_rate": 1.5347152159955975e-05, "loss": 0.3194, "step": 15880 }, { "epoch": 3.125568586953849, "grad_norm": 0.9447997808456421, "learning_rate": 1.5344094897428844e-05, "loss": 0.4053, "step": 15890 }, { "epoch": 3.127535590469868, "grad_norm": 1.1538068056106567, "learning_rate": 1.534103763490171e-05, "loss": 0.3742, "step": 15900 }, { "epoch": 3.1295025939858867, "grad_norm": 1.4947891235351562, "learning_rate": 1.5337980372374576e-05, "loss": 0.3913, "step": 15910 }, { "epoch": 3.1314695975019053, "grad_norm": 1.4566258192062378, "learning_rate": 1.5334923109847445e-05, "loss": 0.3162, "step": 15920 }, { "epoch": 3.1334366010179244, "grad_norm": 1.3656127452850342, "learning_rate": 1.533186584732031e-05, "loss": 0.2902, "step": 15930 }, { "epoch": 3.135403604533943, "grad_norm": 0.8082563877105713, "learning_rate": 1.5328808584793176e-05, "loss": 0.3822, "step": 15940 }, { "epoch": 3.1373706080499617, "grad_norm": 0.9663358330726624, "learning_rate": 1.5325751322266046e-05, "loss": 0.3285, "step": 15950 }, { "epoch": 3.139337611565981, "grad_norm": 1.7113450765609741, "learning_rate": 1.532269405973891e-05, "loss": 0.3592, "step": 15960 }, { "epoch": 3.1413046150819994, "grad_norm": 2.219865322113037, "learning_rate": 1.5319636797211777e-05, "loss": 0.469, "step": 15970 }, { "epoch": 3.143271618598018, "grad_norm": 1.3224714994430542, "learning_rate": 1.5316579534684643e-05, "loss": 0.4137, "step": 15980 }, { "epoch": 3.145238622114037, "grad_norm": 2.301541328430176, "learning_rate": 1.5313522272157512e-05, "loss": 0.3628, "step": 15990 }, { "epoch": 3.147205625630056, "grad_norm": 1.2435368299484253, "learning_rate": 1.5310465009630378e-05, "loss": 0.3875, "step": 16000 }, { "epoch": 3.147205625630056, "eval_loss": 0.18709848821163177, "eval_runtime": 8.8741, "eval_samples_per_second": 5.634, "eval_steps_per_second": 2.817, "step": 16000 }, { "epoch": 3.1491726291460744, "grad_norm": 1.5594910383224487, "learning_rate": 1.5307407747103244e-05, "loss": 0.4676, "step": 16010 }, { "epoch": 3.1511396326620935, "grad_norm": 2.1316165924072266, "learning_rate": 1.5304350484576113e-05, "loss": 0.4096, "step": 16020 }, { "epoch": 3.153106636178112, "grad_norm": 1.002728819847107, "learning_rate": 1.530129322204898e-05, "loss": 0.3481, "step": 16030 }, { "epoch": 3.155073639694131, "grad_norm": 1.6484594345092773, "learning_rate": 1.5298235959521844e-05, "loss": 0.4777, "step": 16040 }, { "epoch": 3.15704064321015, "grad_norm": 1.0128759145736694, "learning_rate": 1.5295178696994713e-05, "loss": 0.3421, "step": 16050 }, { "epoch": 3.1590076467261685, "grad_norm": 1.290736436843872, "learning_rate": 1.529212143446758e-05, "loss": 0.3759, "step": 16060 }, { "epoch": 3.160974650242187, "grad_norm": 2.4858880043029785, "learning_rate": 1.5289064171940445e-05, "loss": 0.3818, "step": 16070 }, { "epoch": 3.1629416537582062, "grad_norm": 1.9373151063919067, "learning_rate": 1.5286006909413314e-05, "loss": 0.364, "step": 16080 }, { "epoch": 3.164908657274225, "grad_norm": 2.771146297454834, "learning_rate": 1.528294964688618e-05, "loss": 0.3584, "step": 16090 }, { "epoch": 3.1668756607902435, "grad_norm": 1.1857792139053345, "learning_rate": 1.5279892384359046e-05, "loss": 0.4443, "step": 16100 }, { "epoch": 3.1688426643062626, "grad_norm": 1.4357842206954956, "learning_rate": 1.527683512183191e-05, "loss": 0.3047, "step": 16110 }, { "epoch": 3.1708096678222812, "grad_norm": 1.4366753101348877, "learning_rate": 1.527377785930478e-05, "loss": 0.4793, "step": 16120 }, { "epoch": 3.1727766713383, "grad_norm": 1.1929391622543335, "learning_rate": 1.5270720596777646e-05, "loss": 0.4433, "step": 16130 }, { "epoch": 3.174743674854319, "grad_norm": 1.8114970922470093, "learning_rate": 1.5267663334250512e-05, "loss": 0.3533, "step": 16140 }, { "epoch": 3.1767106783703376, "grad_norm": 2.2348251342773438, "learning_rate": 1.526460607172338e-05, "loss": 0.3287, "step": 16150 }, { "epoch": 3.1786776818863562, "grad_norm": 2.0907199382781982, "learning_rate": 1.5261548809196247e-05, "loss": 0.5437, "step": 16160 }, { "epoch": 3.1806446854023753, "grad_norm": 1.5697036981582642, "learning_rate": 1.5258491546669115e-05, "loss": 0.3234, "step": 16170 }, { "epoch": 3.182611688918394, "grad_norm": 2.318769931793213, "learning_rate": 1.525543428414198e-05, "loss": 0.3702, "step": 16180 }, { "epoch": 3.1845786924344126, "grad_norm": 2.415572166442871, "learning_rate": 1.5252377021614848e-05, "loss": 0.4578, "step": 16190 }, { "epoch": 3.1865456959504317, "grad_norm": 0.8824933767318726, "learning_rate": 1.5249319759087715e-05, "loss": 0.3238, "step": 16200 }, { "epoch": 3.1885126994664503, "grad_norm": 2.243199348449707, "learning_rate": 1.524626249656058e-05, "loss": 0.3848, "step": 16210 }, { "epoch": 3.190479702982469, "grad_norm": 1.778119683265686, "learning_rate": 1.5243205234033447e-05, "loss": 0.4075, "step": 16220 }, { "epoch": 3.192446706498488, "grad_norm": 1.3491222858428955, "learning_rate": 1.5240147971506314e-05, "loss": 0.3983, "step": 16230 }, { "epoch": 3.1944137100145067, "grad_norm": 1.729386329650879, "learning_rate": 1.523709070897918e-05, "loss": 0.3341, "step": 16240 }, { "epoch": 3.1963807135305253, "grad_norm": 1.2898648977279663, "learning_rate": 1.5234033446452048e-05, "loss": 0.3996, "step": 16250 }, { "epoch": 3.1983477170465444, "grad_norm": 1.3081196546554565, "learning_rate": 1.5230976183924915e-05, "loss": 0.5638, "step": 16260 }, { "epoch": 3.200314720562563, "grad_norm": 1.233986258506775, "learning_rate": 1.5227918921397783e-05, "loss": 0.2604, "step": 16270 }, { "epoch": 3.2022817240785817, "grad_norm": 1.209220290184021, "learning_rate": 1.5224861658870648e-05, "loss": 0.4057, "step": 16280 }, { "epoch": 3.2042487275946008, "grad_norm": 1.7715145349502563, "learning_rate": 1.5221804396343516e-05, "loss": 0.5158, "step": 16290 }, { "epoch": 3.2062157311106194, "grad_norm": 2.0262582302093506, "learning_rate": 1.5218747133816383e-05, "loss": 0.2559, "step": 16300 }, { "epoch": 3.208182734626638, "grad_norm": 0.9263052940368652, "learning_rate": 1.5215689871289249e-05, "loss": 0.4124, "step": 16310 }, { "epoch": 3.2101497381426567, "grad_norm": 0.5316019058227539, "learning_rate": 1.5212632608762115e-05, "loss": 0.3539, "step": 16320 }, { "epoch": 3.212116741658676, "grad_norm": 1.329001784324646, "learning_rate": 1.5209575346234982e-05, "loss": 0.3256, "step": 16330 }, { "epoch": 3.2140837451746944, "grad_norm": 2.238600254058838, "learning_rate": 1.5206518083707848e-05, "loss": 0.483, "step": 16340 }, { "epoch": 3.216050748690713, "grad_norm": 1.8471485376358032, "learning_rate": 1.5203460821180716e-05, "loss": 0.4705, "step": 16350 }, { "epoch": 3.218017752206732, "grad_norm": 1.351696252822876, "learning_rate": 1.5200403558653583e-05, "loss": 0.4801, "step": 16360 }, { "epoch": 3.219984755722751, "grad_norm": 1.284036636352539, "learning_rate": 1.5197346296126449e-05, "loss": 0.397, "step": 16370 }, { "epoch": 3.2219517592387694, "grad_norm": 1.4418704509735107, "learning_rate": 1.5194289033599316e-05, "loss": 0.4128, "step": 16380 }, { "epoch": 3.2239187627547885, "grad_norm": 0.7759731411933899, "learning_rate": 1.5191231771072184e-05, "loss": 0.3414, "step": 16390 }, { "epoch": 3.225885766270807, "grad_norm": 1.1827300786972046, "learning_rate": 1.5188174508545051e-05, "loss": 0.4214, "step": 16400 }, { "epoch": 3.227852769786826, "grad_norm": 1.2514878511428833, "learning_rate": 1.5185117246017917e-05, "loss": 0.3382, "step": 16410 }, { "epoch": 3.229819773302845, "grad_norm": 0.9900833964347839, "learning_rate": 1.5182059983490785e-05, "loss": 0.3742, "step": 16420 }, { "epoch": 3.2317867768188635, "grad_norm": 0.8759545087814331, "learning_rate": 1.5179002720963652e-05, "loss": 0.4368, "step": 16430 }, { "epoch": 3.233753780334882, "grad_norm": 2.74102520942688, "learning_rate": 1.5175945458436516e-05, "loss": 0.4181, "step": 16440 }, { "epoch": 3.2357207838509012, "grad_norm": 1.1985174417495728, "learning_rate": 1.5172888195909384e-05, "loss": 0.3676, "step": 16450 }, { "epoch": 3.23768778736692, "grad_norm": 1.1097427606582642, "learning_rate": 1.5169830933382251e-05, "loss": 0.3544, "step": 16460 }, { "epoch": 3.2396547908829385, "grad_norm": 1.6733139753341675, "learning_rate": 1.5166773670855117e-05, "loss": 0.5471, "step": 16470 }, { "epoch": 3.2416217943989576, "grad_norm": 0.8993484973907471, "learning_rate": 1.5163716408327984e-05, "loss": 0.4655, "step": 16480 }, { "epoch": 3.2435887979149762, "grad_norm": 1.7301872968673706, "learning_rate": 1.5160659145800852e-05, "loss": 0.3459, "step": 16490 }, { "epoch": 3.245555801430995, "grad_norm": 1.0023419857025146, "learning_rate": 1.5157601883273718e-05, "loss": 0.3609, "step": 16500 }, { "epoch": 3.245555801430995, "eval_loss": 0.18538357317447662, "eval_runtime": 8.898, "eval_samples_per_second": 5.619, "eval_steps_per_second": 2.81, "step": 16500 }, { "epoch": 3.247522804947014, "grad_norm": 1.1783928871154785, "learning_rate": 1.5154544620746585e-05, "loss": 0.4928, "step": 16510 }, { "epoch": 3.2494898084630326, "grad_norm": 1.1952968835830688, "learning_rate": 1.5151487358219452e-05, "loss": 0.5126, "step": 16520 }, { "epoch": 3.2514568119790512, "grad_norm": 1.4290196895599365, "learning_rate": 1.514843009569232e-05, "loss": 0.4627, "step": 16530 }, { "epoch": 3.2534238154950703, "grad_norm": 4.141489028930664, "learning_rate": 1.5145372833165186e-05, "loss": 0.4236, "step": 16540 }, { "epoch": 3.255390819011089, "grad_norm": 3.053148031234741, "learning_rate": 1.5142315570638051e-05, "loss": 0.4596, "step": 16550 }, { "epoch": 3.2573578225271076, "grad_norm": 1.426658034324646, "learning_rate": 1.5139258308110917e-05, "loss": 0.3954, "step": 16560 }, { "epoch": 3.2593248260431267, "grad_norm": 4.583922386169434, "learning_rate": 1.5136201045583785e-05, "loss": 0.4003, "step": 16570 }, { "epoch": 3.2612918295591453, "grad_norm": 0.754277765750885, "learning_rate": 1.5133143783056652e-05, "loss": 0.3205, "step": 16580 }, { "epoch": 3.263258833075164, "grad_norm": 1.0575964450836182, "learning_rate": 1.513008652052952e-05, "loss": 0.3348, "step": 16590 }, { "epoch": 3.265225836591183, "grad_norm": 1.7032275199890137, "learning_rate": 1.5127029258002385e-05, "loss": 0.4254, "step": 16600 }, { "epoch": 3.2671928401072017, "grad_norm": 0.8603004813194275, "learning_rate": 1.5123971995475253e-05, "loss": 0.349, "step": 16610 }, { "epoch": 3.2691598436232203, "grad_norm": 2.109483480453491, "learning_rate": 1.512091473294812e-05, "loss": 0.5604, "step": 16620 }, { "epoch": 3.2711268471392394, "grad_norm": 2.1563918590545654, "learning_rate": 1.5117857470420986e-05, "loss": 0.516, "step": 16630 }, { "epoch": 3.273093850655258, "grad_norm": 3.7578446865081787, "learning_rate": 1.5114800207893854e-05, "loss": 0.3768, "step": 16640 }, { "epoch": 3.2750608541712767, "grad_norm": 1.2201038599014282, "learning_rate": 1.5111742945366721e-05, "loss": 0.3314, "step": 16650 }, { "epoch": 3.2770278576872958, "grad_norm": 1.9109632968902588, "learning_rate": 1.5108685682839585e-05, "loss": 0.3612, "step": 16660 }, { "epoch": 3.2789948612033144, "grad_norm": 1.967887043952942, "learning_rate": 1.5105628420312453e-05, "loss": 0.4012, "step": 16670 }, { "epoch": 3.280961864719333, "grad_norm": 1.129434585571289, "learning_rate": 1.510257115778532e-05, "loss": 0.3515, "step": 16680 }, { "epoch": 3.282928868235352, "grad_norm": 3.1458957195281982, "learning_rate": 1.5099513895258186e-05, "loss": 0.3917, "step": 16690 }, { "epoch": 3.2848958717513708, "grad_norm": 1.294286847114563, "learning_rate": 1.5096456632731053e-05, "loss": 0.3951, "step": 16700 }, { "epoch": 3.2868628752673894, "grad_norm": 1.4721623659133911, "learning_rate": 1.5093399370203921e-05, "loss": 0.3145, "step": 16710 }, { "epoch": 3.2888298787834085, "grad_norm": 1.3779369592666626, "learning_rate": 1.5090342107676788e-05, "loss": 0.492, "step": 16720 }, { "epoch": 3.290796882299427, "grad_norm": 2.5460314750671387, "learning_rate": 1.5087284845149654e-05, "loss": 0.4616, "step": 16730 }, { "epoch": 3.2927638858154458, "grad_norm": 1.3833627700805664, "learning_rate": 1.5084227582622522e-05, "loss": 0.4766, "step": 16740 }, { "epoch": 3.294730889331465, "grad_norm": 0.9380312561988831, "learning_rate": 1.5081170320095389e-05, "loss": 0.449, "step": 16750 }, { "epoch": 3.2966978928474835, "grad_norm": 1.3430695533752441, "learning_rate": 1.5078113057568255e-05, "loss": 0.5169, "step": 16760 }, { "epoch": 3.298664896363502, "grad_norm": 1.6764286756515503, "learning_rate": 1.5075055795041122e-05, "loss": 0.5024, "step": 16770 }, { "epoch": 3.300631899879521, "grad_norm": 1.2297427654266357, "learning_rate": 1.5071998532513988e-05, "loss": 0.4217, "step": 16780 }, { "epoch": 3.30259890339554, "grad_norm": 1.2263567447662354, "learning_rate": 1.5068941269986854e-05, "loss": 0.4577, "step": 16790 }, { "epoch": 3.3045659069115585, "grad_norm": 1.4633382558822632, "learning_rate": 1.5065884007459721e-05, "loss": 0.3677, "step": 16800 }, { "epoch": 3.3065329104275776, "grad_norm": 1.3397003412246704, "learning_rate": 1.5062826744932589e-05, "loss": 0.3227, "step": 16810 }, { "epoch": 3.308499913943596, "grad_norm": 1.438306450843811, "learning_rate": 1.5059769482405455e-05, "loss": 0.3175, "step": 16820 }, { "epoch": 3.310466917459615, "grad_norm": 1.435363531112671, "learning_rate": 1.5056712219878322e-05, "loss": 0.3392, "step": 16830 }, { "epoch": 3.312433920975634, "grad_norm": 2.12419056892395, "learning_rate": 1.505365495735119e-05, "loss": 0.5618, "step": 16840 }, { "epoch": 3.3144009244916526, "grad_norm": 1.4096325635910034, "learning_rate": 1.5050597694824057e-05, "loss": 0.3874, "step": 16850 }, { "epoch": 3.316367928007671, "grad_norm": 2.04353404045105, "learning_rate": 1.5047540432296923e-05, "loss": 0.4306, "step": 16860 }, { "epoch": 3.3183349315236903, "grad_norm": 1.5693720579147339, "learning_rate": 1.504448316976979e-05, "loss": 0.4537, "step": 16870 }, { "epoch": 3.320301935039709, "grad_norm": 1.255326271057129, "learning_rate": 1.5041425907242658e-05, "loss": 0.4625, "step": 16880 }, { "epoch": 3.3222689385557276, "grad_norm": 2.1881473064422607, "learning_rate": 1.5038368644715522e-05, "loss": 0.3993, "step": 16890 }, { "epoch": 3.3242359420717467, "grad_norm": 1.0848050117492676, "learning_rate": 1.503531138218839e-05, "loss": 0.3404, "step": 16900 }, { "epoch": 3.3262029455877653, "grad_norm": 1.1328253746032715, "learning_rate": 1.5032254119661255e-05, "loss": 0.367, "step": 16910 }, { "epoch": 3.328169949103784, "grad_norm": 2.038686513900757, "learning_rate": 1.5029196857134123e-05, "loss": 0.4136, "step": 16920 }, { "epoch": 3.330136952619803, "grad_norm": 1.5437724590301514, "learning_rate": 1.502613959460699e-05, "loss": 0.3227, "step": 16930 }, { "epoch": 3.3321039561358217, "grad_norm": 1.6073527336120605, "learning_rate": 1.5023082332079857e-05, "loss": 0.4443, "step": 16940 }, { "epoch": 3.3340709596518403, "grad_norm": 0.997283935546875, "learning_rate": 1.5020025069552723e-05, "loss": 0.5583, "step": 16950 }, { "epoch": 3.3360379631678594, "grad_norm": 1.7358487844467163, "learning_rate": 1.501696780702559e-05, "loss": 0.4091, "step": 16960 }, { "epoch": 3.338004966683878, "grad_norm": 1.2565847635269165, "learning_rate": 1.5013910544498458e-05, "loss": 0.4082, "step": 16970 }, { "epoch": 3.3399719701998967, "grad_norm": 2.5975379943847656, "learning_rate": 1.5010853281971326e-05, "loss": 0.4562, "step": 16980 }, { "epoch": 3.3419389737159158, "grad_norm": 0.9561290740966797, "learning_rate": 1.5007796019444191e-05, "loss": 0.4909, "step": 16990 }, { "epoch": 3.3439059772319344, "grad_norm": 1.0989880561828613, "learning_rate": 1.5004738756917057e-05, "loss": 0.4527, "step": 17000 }, { "epoch": 3.3439059772319344, "eval_loss": 0.17282529175281525, "eval_runtime": 8.8713, "eval_samples_per_second": 5.636, "eval_steps_per_second": 2.818, "step": 17000 }, { "epoch": 3.345872980747953, "grad_norm": 1.0213874578475952, "learning_rate": 1.5001681494389923e-05, "loss": 0.3839, "step": 17010 }, { "epoch": 3.347839984263972, "grad_norm": 0.748192548751831, "learning_rate": 1.499862423186279e-05, "loss": 0.3286, "step": 17020 }, { "epoch": 3.3498069877799908, "grad_norm": 2.626722574234009, "learning_rate": 1.4995566969335658e-05, "loss": 0.3294, "step": 17030 }, { "epoch": 3.3517739912960094, "grad_norm": 2.035220146179199, "learning_rate": 1.4992509706808524e-05, "loss": 0.3095, "step": 17040 }, { "epoch": 3.3537409948120285, "grad_norm": 0.9364410638809204, "learning_rate": 1.4989452444281391e-05, "loss": 0.5119, "step": 17050 }, { "epoch": 3.355707998328047, "grad_norm": 1.4533653259277344, "learning_rate": 1.4986395181754259e-05, "loss": 0.392, "step": 17060 }, { "epoch": 3.3576750018440658, "grad_norm": 0.9131273627281189, "learning_rate": 1.4983337919227126e-05, "loss": 0.4791, "step": 17070 }, { "epoch": 3.3596420053600844, "grad_norm": 1.4228838682174683, "learning_rate": 1.4980280656699992e-05, "loss": 0.4465, "step": 17080 }, { "epoch": 3.3616090088761035, "grad_norm": 1.7861515283584595, "learning_rate": 1.497722339417286e-05, "loss": 0.3783, "step": 17090 }, { "epoch": 3.363576012392122, "grad_norm": 2.334275484085083, "learning_rate": 1.4974166131645727e-05, "loss": 0.4166, "step": 17100 }, { "epoch": 3.3655430159081408, "grad_norm": 1.2254345417022705, "learning_rate": 1.4971108869118594e-05, "loss": 0.3472, "step": 17110 }, { "epoch": 3.36751001942416, "grad_norm": 0.9154657125473022, "learning_rate": 1.4968051606591458e-05, "loss": 0.4692, "step": 17120 }, { "epoch": 3.3694770229401785, "grad_norm": 1.3035657405853271, "learning_rate": 1.4964994344064326e-05, "loss": 0.3889, "step": 17130 }, { "epoch": 3.371444026456197, "grad_norm": 1.2203218936920166, "learning_rate": 1.4961937081537192e-05, "loss": 0.3086, "step": 17140 }, { "epoch": 3.373411029972216, "grad_norm": 0.7467970848083496, "learning_rate": 1.4958879819010059e-05, "loss": 0.4192, "step": 17150 }, { "epoch": 3.375378033488235, "grad_norm": 1.9781105518341064, "learning_rate": 1.4955822556482927e-05, "loss": 0.3967, "step": 17160 }, { "epoch": 3.3773450370042535, "grad_norm": 1.0278807878494263, "learning_rate": 1.4952765293955792e-05, "loss": 0.4609, "step": 17170 }, { "epoch": 3.3793120405202726, "grad_norm": 1.5243288278579712, "learning_rate": 1.494970803142866e-05, "loss": 0.3332, "step": 17180 }, { "epoch": 3.381279044036291, "grad_norm": 1.0082272291183472, "learning_rate": 1.4946650768901527e-05, "loss": 0.2864, "step": 17190 }, { "epoch": 3.38324604755231, "grad_norm": 1.5018178224563599, "learning_rate": 1.4943593506374395e-05, "loss": 0.4893, "step": 17200 }, { "epoch": 3.385213051068329, "grad_norm": 1.1987298727035522, "learning_rate": 1.494053624384726e-05, "loss": 0.2966, "step": 17210 }, { "epoch": 3.3871800545843476, "grad_norm": 3.0867018699645996, "learning_rate": 1.4937478981320128e-05, "loss": 0.4841, "step": 17220 }, { "epoch": 3.389147058100366, "grad_norm": 1.738324761390686, "learning_rate": 1.4934421718792992e-05, "loss": 0.3599, "step": 17230 }, { "epoch": 3.3911140616163853, "grad_norm": 1.9667898416519165, "learning_rate": 1.493136445626586e-05, "loss": 0.3224, "step": 17240 }, { "epoch": 3.393081065132404, "grad_norm": 1.2552376985549927, "learning_rate": 1.4928307193738727e-05, "loss": 0.4395, "step": 17250 }, { "epoch": 3.3950480686484226, "grad_norm": 2.6338891983032227, "learning_rate": 1.4925249931211595e-05, "loss": 0.4038, "step": 17260 }, { "epoch": 3.3970150721644417, "grad_norm": 1.44743013381958, "learning_rate": 1.492219266868446e-05, "loss": 0.2454, "step": 17270 }, { "epoch": 3.3989820756804603, "grad_norm": 1.3355497121810913, "learning_rate": 1.4919135406157328e-05, "loss": 0.2837, "step": 17280 }, { "epoch": 3.400949079196479, "grad_norm": 2.0968546867370605, "learning_rate": 1.4916078143630195e-05, "loss": 0.2952, "step": 17290 }, { "epoch": 3.402916082712498, "grad_norm": 1.6755894422531128, "learning_rate": 1.4913020881103061e-05, "loss": 0.3963, "step": 17300 }, { "epoch": 3.4048830862285167, "grad_norm": 2.201287031173706, "learning_rate": 1.4909963618575929e-05, "loss": 0.4035, "step": 17310 }, { "epoch": 3.4068500897445353, "grad_norm": 1.1484105587005615, "learning_rate": 1.4906906356048796e-05, "loss": 0.4353, "step": 17320 }, { "epoch": 3.4088170932605544, "grad_norm": 1.558431625366211, "learning_rate": 1.4903849093521663e-05, "loss": 0.4465, "step": 17330 }, { "epoch": 3.410784096776573, "grad_norm": 1.5346964597702026, "learning_rate": 1.4900791830994528e-05, "loss": 0.4069, "step": 17340 }, { "epoch": 3.4127511002925917, "grad_norm": 1.7767252922058105, "learning_rate": 1.4897734568467395e-05, "loss": 0.3883, "step": 17350 }, { "epoch": 3.4147181038086107, "grad_norm": 0.967298686504364, "learning_rate": 1.489467730594026e-05, "loss": 0.3695, "step": 17360 }, { "epoch": 3.4166851073246294, "grad_norm": 0.9392523765563965, "learning_rate": 1.4891620043413128e-05, "loss": 0.4026, "step": 17370 }, { "epoch": 3.418652110840648, "grad_norm": 1.7647945880889893, "learning_rate": 1.4888562780885996e-05, "loss": 0.3074, "step": 17380 }, { "epoch": 3.420619114356667, "grad_norm": 1.1813808679580688, "learning_rate": 1.4885505518358863e-05, "loss": 0.381, "step": 17390 }, { "epoch": 3.4225861178726857, "grad_norm": 1.2232673168182373, "learning_rate": 1.4882448255831729e-05, "loss": 0.48, "step": 17400 }, { "epoch": 3.4245531213887044, "grad_norm": 1.052878975868225, "learning_rate": 1.4879390993304596e-05, "loss": 0.4123, "step": 17410 }, { "epoch": 3.426520124904723, "grad_norm": 1.2868292331695557, "learning_rate": 1.4876333730777464e-05, "loss": 0.3772, "step": 17420 }, { "epoch": 3.428487128420742, "grad_norm": 1.3638333082199097, "learning_rate": 1.487327646825033e-05, "loss": 0.3819, "step": 17430 }, { "epoch": 3.4304541319367607, "grad_norm": 2.3324482440948486, "learning_rate": 1.4870219205723197e-05, "loss": 0.4775, "step": 17440 }, { "epoch": 3.4324211354527794, "grad_norm": 1.1716792583465576, "learning_rate": 1.4867161943196065e-05, "loss": 0.3989, "step": 17450 }, { "epoch": 3.4343881389687985, "grad_norm": 1.5580689907073975, "learning_rate": 1.4864104680668929e-05, "loss": 0.4771, "step": 17460 }, { "epoch": 3.436355142484817, "grad_norm": 1.3105270862579346, "learning_rate": 1.4861047418141796e-05, "loss": 0.378, "step": 17470 }, { "epoch": 3.4383221460008357, "grad_norm": 1.6902211904525757, "learning_rate": 1.4857990155614664e-05, "loss": 0.4194, "step": 17480 }, { "epoch": 3.440289149516855, "grad_norm": 0.9721977114677429, "learning_rate": 1.485493289308753e-05, "loss": 0.5298, "step": 17490 }, { "epoch": 3.4422561530328735, "grad_norm": 1.1658124923706055, "learning_rate": 1.4851875630560397e-05, "loss": 0.3992, "step": 17500 }, { "epoch": 3.4422561530328735, "eval_loss": 0.17830701172351837, "eval_runtime": 8.8662, "eval_samples_per_second": 5.639, "eval_steps_per_second": 2.82, "step": 17500 }, { "epoch": 3.444223156548892, "grad_norm": 0.9558307528495789, "learning_rate": 1.4848818368033264e-05, "loss": 0.4587, "step": 17510 }, { "epoch": 3.446190160064911, "grad_norm": 1.4307557344436646, "learning_rate": 1.4845761105506132e-05, "loss": 0.3265, "step": 17520 }, { "epoch": 3.44815716358093, "grad_norm": 1.1415700912475586, "learning_rate": 1.4842703842978998e-05, "loss": 0.3661, "step": 17530 }, { "epoch": 3.4501241670969485, "grad_norm": 3.2358052730560303, "learning_rate": 1.4839646580451865e-05, "loss": 0.4427, "step": 17540 }, { "epoch": 3.4520911706129676, "grad_norm": 1.6495846509933472, "learning_rate": 1.4836589317924733e-05, "loss": 0.3998, "step": 17550 }, { "epoch": 3.454058174128986, "grad_norm": 1.502959966659546, "learning_rate": 1.4833532055397598e-05, "loss": 0.3918, "step": 17560 }, { "epoch": 3.456025177645005, "grad_norm": 2.2317111492156982, "learning_rate": 1.4830474792870464e-05, "loss": 0.4154, "step": 17570 }, { "epoch": 3.457992181161024, "grad_norm": 1.9064170122146606, "learning_rate": 1.4827417530343332e-05, "loss": 0.4326, "step": 17580 }, { "epoch": 3.4599591846770426, "grad_norm": 1.7507431507110596, "learning_rate": 1.4824360267816197e-05, "loss": 0.4027, "step": 17590 }, { "epoch": 3.461926188193061, "grad_norm": 1.6282731294631958, "learning_rate": 1.4821303005289065e-05, "loss": 0.386, "step": 17600 }, { "epoch": 3.4638931917090803, "grad_norm": 1.266846776008606, "learning_rate": 1.4818245742761932e-05, "loss": 0.256, "step": 17610 }, { "epoch": 3.465860195225099, "grad_norm": 2.4402332305908203, "learning_rate": 1.4815188480234798e-05, "loss": 0.4506, "step": 17620 }, { "epoch": 3.4678271987411176, "grad_norm": 1.072805404663086, "learning_rate": 1.4812131217707666e-05, "loss": 0.298, "step": 17630 }, { "epoch": 3.4697942022571366, "grad_norm": 2.0623440742492676, "learning_rate": 1.4809073955180533e-05, "loss": 0.4893, "step": 17640 }, { "epoch": 3.4717612057731553, "grad_norm": 2.03657865524292, "learning_rate": 1.48060166926534e-05, "loss": 0.4633, "step": 17650 }, { "epoch": 3.473728209289174, "grad_norm": 1.6767804622650146, "learning_rate": 1.4802959430126266e-05, "loss": 0.3956, "step": 17660 }, { "epoch": 3.475695212805193, "grad_norm": 1.3445123434066772, "learning_rate": 1.4799902167599134e-05, "loss": 0.4233, "step": 17670 }, { "epoch": 3.4776622163212116, "grad_norm": 0.9934619665145874, "learning_rate": 1.4796844905071998e-05, "loss": 0.4358, "step": 17680 }, { "epoch": 3.4796292198372303, "grad_norm": 1.1431872844696045, "learning_rate": 1.4793787642544865e-05, "loss": 0.4817, "step": 17690 }, { "epoch": 3.4815962233532494, "grad_norm": 2.3636295795440674, "learning_rate": 1.4790730380017733e-05, "loss": 0.3428, "step": 17700 }, { "epoch": 3.483563226869268, "grad_norm": 1.1688228845596313, "learning_rate": 1.47876731174906e-05, "loss": 0.4124, "step": 17710 }, { "epoch": 3.4855302303852866, "grad_norm": 1.8074513673782349, "learning_rate": 1.4784615854963466e-05, "loss": 0.4571, "step": 17720 }, { "epoch": 3.4874972339013057, "grad_norm": 1.1620044708251953, "learning_rate": 1.4781558592436334e-05, "loss": 0.3975, "step": 17730 }, { "epoch": 3.4894642374173244, "grad_norm": 2.0841927528381348, "learning_rate": 1.4778501329909201e-05, "loss": 0.3895, "step": 17740 }, { "epoch": 3.491431240933343, "grad_norm": 2.7444779872894287, "learning_rate": 1.4775444067382067e-05, "loss": 0.4395, "step": 17750 }, { "epoch": 3.493398244449362, "grad_norm": 2.4333293437957764, "learning_rate": 1.4772386804854934e-05, "loss": 0.3026, "step": 17760 }, { "epoch": 3.4953652479653807, "grad_norm": 1.7708234786987305, "learning_rate": 1.4769329542327802e-05, "loss": 0.4695, "step": 17770 }, { "epoch": 3.4973322514813994, "grad_norm": 1.2430254220962524, "learning_rate": 1.476627227980067e-05, "loss": 0.3391, "step": 17780 }, { "epoch": 3.4992992549974185, "grad_norm": 1.132948875427246, "learning_rate": 1.4763215017273535e-05, "loss": 0.4641, "step": 17790 }, { "epoch": 3.501266258513437, "grad_norm": 0.7967662811279297, "learning_rate": 1.47601577547464e-05, "loss": 0.4284, "step": 17800 }, { "epoch": 3.5032332620294557, "grad_norm": 1.9069840908050537, "learning_rate": 1.4757100492219267e-05, "loss": 0.293, "step": 17810 }, { "epoch": 3.505200265545475, "grad_norm": 1.1667743921279907, "learning_rate": 1.4754043229692134e-05, "loss": 0.4267, "step": 17820 }, { "epoch": 3.5071672690614935, "grad_norm": 1.3001110553741455, "learning_rate": 1.4750985967165002e-05, "loss": 0.4152, "step": 17830 }, { "epoch": 3.509134272577512, "grad_norm": 1.6212232112884521, "learning_rate": 1.4747928704637869e-05, "loss": 0.2784, "step": 17840 }, { "epoch": 3.511101276093531, "grad_norm": 1.6824311017990112, "learning_rate": 1.4744871442110735e-05, "loss": 0.5409, "step": 17850 }, { "epoch": 3.51306827960955, "grad_norm": 2.316366672515869, "learning_rate": 1.4741814179583602e-05, "loss": 0.3553, "step": 17860 }, { "epoch": 3.5150352831255685, "grad_norm": 1.079383373260498, "learning_rate": 1.473875691705647e-05, "loss": 0.4142, "step": 17870 }, { "epoch": 3.5170022866415875, "grad_norm": 1.539841651916504, "learning_rate": 1.4735699654529335e-05, "loss": 0.3635, "step": 17880 }, { "epoch": 3.518969290157606, "grad_norm": 1.3688745498657227, "learning_rate": 1.4732642392002203e-05, "loss": 0.3125, "step": 17890 }, { "epoch": 3.520936293673625, "grad_norm": 1.4701616764068604, "learning_rate": 1.472958512947507e-05, "loss": 0.3808, "step": 17900 }, { "epoch": 3.522903297189644, "grad_norm": 1.2537261247634888, "learning_rate": 1.4726527866947935e-05, "loss": 0.3833, "step": 17910 }, { "epoch": 3.5248703007056625, "grad_norm": 1.4147868156433105, "learning_rate": 1.4723470604420802e-05, "loss": 0.3304, "step": 17920 }, { "epoch": 3.526837304221681, "grad_norm": 2.0437426567077637, "learning_rate": 1.472041334189367e-05, "loss": 0.4108, "step": 17930 }, { "epoch": 3.5288043077377003, "grad_norm": 0.944835364818573, "learning_rate": 1.4717356079366535e-05, "loss": 0.4279, "step": 17940 }, { "epoch": 3.530771311253719, "grad_norm": 0.7478554844856262, "learning_rate": 1.4714298816839403e-05, "loss": 0.4301, "step": 17950 }, { "epoch": 3.5327383147697375, "grad_norm": 1.308043360710144, "learning_rate": 1.471124155431227e-05, "loss": 0.4849, "step": 17960 }, { "epoch": 3.5347053182857566, "grad_norm": 0.990397572517395, "learning_rate": 1.4708184291785138e-05, "loss": 0.473, "step": 17970 }, { "epoch": 3.5366723218017753, "grad_norm": 0.9142557978630066, "learning_rate": 1.4705127029258003e-05, "loss": 0.6008, "step": 17980 }, { "epoch": 3.538639325317794, "grad_norm": 2.228318452835083, "learning_rate": 1.4702069766730871e-05, "loss": 0.4516, "step": 17990 }, { "epoch": 3.540606328833813, "grad_norm": 1.151518702507019, "learning_rate": 1.4699012504203738e-05, "loss": 0.5171, "step": 18000 }, { "epoch": 3.540606328833813, "eval_loss": 0.18341459333896637, "eval_runtime": 8.9046, "eval_samples_per_second": 5.615, "eval_steps_per_second": 2.808, "step": 18000 }, { "epoch": 3.5425733323498316, "grad_norm": 1.794045329093933, "learning_rate": 1.4695955241676604e-05, "loss": 0.3875, "step": 18010 }, { "epoch": 3.5445403358658503, "grad_norm": 1.164614200592041, "learning_rate": 1.469289797914947e-05, "loss": 0.4417, "step": 18020 }, { "epoch": 3.5465073393818694, "grad_norm": 1.6620136499404907, "learning_rate": 1.4689840716622337e-05, "loss": 0.3842, "step": 18030 }, { "epoch": 3.548474342897888, "grad_norm": 1.4214859008789062, "learning_rate": 1.4686783454095203e-05, "loss": 0.4269, "step": 18040 }, { "epoch": 3.5504413464139066, "grad_norm": 3.659273386001587, "learning_rate": 1.468372619156807e-05, "loss": 0.3654, "step": 18050 }, { "epoch": 3.5524083499299257, "grad_norm": 1.5434625148773193, "learning_rate": 1.4680668929040938e-05, "loss": 0.4587, "step": 18060 }, { "epoch": 3.5543753534459444, "grad_norm": 1.2292109727859497, "learning_rate": 1.4677611666513804e-05, "loss": 0.3767, "step": 18070 }, { "epoch": 3.556342356961963, "grad_norm": 1.5507662296295166, "learning_rate": 1.4674554403986671e-05, "loss": 0.3647, "step": 18080 }, { "epoch": 3.558309360477982, "grad_norm": 1.2681255340576172, "learning_rate": 1.4671497141459539e-05, "loss": 0.3738, "step": 18090 }, { "epoch": 3.5602763639940007, "grad_norm": 1.5174590349197388, "learning_rate": 1.4668439878932406e-05, "loss": 0.3405, "step": 18100 }, { "epoch": 3.5622433675100194, "grad_norm": 0.6839548945426941, "learning_rate": 1.4665382616405272e-05, "loss": 0.2135, "step": 18110 }, { "epoch": 3.5642103710260384, "grad_norm": 2.3022007942199707, "learning_rate": 1.466232535387814e-05, "loss": 0.4563, "step": 18120 }, { "epoch": 3.566177374542057, "grad_norm": 2.41715407371521, "learning_rate": 1.4659268091351007e-05, "loss": 0.3429, "step": 18130 }, { "epoch": 3.5681443780580757, "grad_norm": 2.4926366806030273, "learning_rate": 1.4656210828823871e-05, "loss": 0.3883, "step": 18140 }, { "epoch": 3.570111381574095, "grad_norm": 0.43145233392715454, "learning_rate": 1.4653153566296739e-05, "loss": 0.3409, "step": 18150 }, { "epoch": 3.5720783850901134, "grad_norm": 1.6929571628570557, "learning_rate": 1.4650096303769606e-05, "loss": 0.3721, "step": 18160 }, { "epoch": 3.574045388606132, "grad_norm": 1.570227026939392, "learning_rate": 1.4647039041242472e-05, "loss": 0.4791, "step": 18170 }, { "epoch": 3.576012392122151, "grad_norm": 0.941781222820282, "learning_rate": 1.464398177871534e-05, "loss": 0.4605, "step": 18180 }, { "epoch": 3.57797939563817, "grad_norm": 1.1423379182815552, "learning_rate": 1.4640924516188207e-05, "loss": 0.3748, "step": 18190 }, { "epoch": 3.5799463991541884, "grad_norm": 1.7628474235534668, "learning_rate": 1.4637867253661073e-05, "loss": 0.4167, "step": 18200 }, { "epoch": 3.5819134026702075, "grad_norm": 1.6465951204299927, "learning_rate": 1.463480999113394e-05, "loss": 0.4413, "step": 18210 }, { "epoch": 3.583880406186226, "grad_norm": 1.6730773448944092, "learning_rate": 1.4631752728606807e-05, "loss": 0.3454, "step": 18220 }, { "epoch": 3.585847409702245, "grad_norm": 1.4015109539031982, "learning_rate": 1.4628695466079675e-05, "loss": 0.4662, "step": 18230 }, { "epoch": 3.587814413218264, "grad_norm": 1.1885541677474976, "learning_rate": 1.462563820355254e-05, "loss": 0.3946, "step": 18240 }, { "epoch": 3.5897814167342825, "grad_norm": 0.859453022480011, "learning_rate": 1.4622580941025407e-05, "loss": 0.4901, "step": 18250 }, { "epoch": 3.591748420250301, "grad_norm": 2.0021779537200928, "learning_rate": 1.4619523678498272e-05, "loss": 0.3715, "step": 18260 }, { "epoch": 3.5937154237663202, "grad_norm": 1.107502818107605, "learning_rate": 1.461646641597114e-05, "loss": 0.4452, "step": 18270 }, { "epoch": 3.595682427282339, "grad_norm": 1.370747447013855, "learning_rate": 1.4613409153444007e-05, "loss": 0.4037, "step": 18280 }, { "epoch": 3.5976494307983575, "grad_norm": 2.8257832527160645, "learning_rate": 1.4610351890916875e-05, "loss": 0.3248, "step": 18290 }, { "epoch": 3.5996164343143766, "grad_norm": 2.032862901687622, "learning_rate": 1.460729462838974e-05, "loss": 0.4332, "step": 18300 }, { "epoch": 3.6015834378303953, "grad_norm": 1.1066280603408813, "learning_rate": 1.4604237365862608e-05, "loss": 0.4214, "step": 18310 }, { "epoch": 3.603550441346414, "grad_norm": 0.9145089983940125, "learning_rate": 1.4601180103335475e-05, "loss": 0.4127, "step": 18320 }, { "epoch": 3.6055174448624325, "grad_norm": 1.2561908960342407, "learning_rate": 1.4598122840808341e-05, "loss": 0.3641, "step": 18330 }, { "epoch": 3.6074844483784516, "grad_norm": 1.9700093269348145, "learning_rate": 1.4595065578281209e-05, "loss": 0.348, "step": 18340 }, { "epoch": 3.6094514518944703, "grad_norm": 1.7856909036636353, "learning_rate": 1.4592008315754076e-05, "loss": 0.401, "step": 18350 }, { "epoch": 3.611418455410489, "grad_norm": 0.8354535102844238, "learning_rate": 1.458895105322694e-05, "loss": 0.3844, "step": 18360 }, { "epoch": 3.613385458926508, "grad_norm": 1.3362191915512085, "learning_rate": 1.4585893790699808e-05, "loss": 0.3867, "step": 18370 }, { "epoch": 3.6153524624425266, "grad_norm": 1.0379217863082886, "learning_rate": 1.4582836528172675e-05, "loss": 0.424, "step": 18380 }, { "epoch": 3.6173194659585453, "grad_norm": 2.015969753265381, "learning_rate": 1.4579779265645541e-05, "loss": 0.4422, "step": 18390 }, { "epoch": 3.6192864694745643, "grad_norm": 1.8602677583694458, "learning_rate": 1.4576722003118408e-05, "loss": 0.2997, "step": 18400 }, { "epoch": 3.621253472990583, "grad_norm": 1.1193724870681763, "learning_rate": 1.4573664740591276e-05, "loss": 0.3575, "step": 18410 }, { "epoch": 3.6232204765066016, "grad_norm": 2.074537992477417, "learning_rate": 1.4570607478064143e-05, "loss": 0.4189, "step": 18420 }, { "epoch": 3.6251874800226207, "grad_norm": 2.3834993839263916, "learning_rate": 1.456755021553701e-05, "loss": 0.4037, "step": 18430 }, { "epoch": 3.6271544835386393, "grad_norm": 1.3813104629516602, "learning_rate": 1.4564492953009877e-05, "loss": 0.451, "step": 18440 }, { "epoch": 3.629121487054658, "grad_norm": 1.1835546493530273, "learning_rate": 1.4561435690482744e-05, "loss": 0.3707, "step": 18450 }, { "epoch": 3.6310884905706766, "grad_norm": 0.9354956150054932, "learning_rate": 1.455837842795561e-05, "loss": 0.3163, "step": 18460 }, { "epoch": 3.6330554940866957, "grad_norm": 2.104048490524292, "learning_rate": 1.4555321165428477e-05, "loss": 0.2469, "step": 18470 }, { "epoch": 3.6350224976027143, "grad_norm": 1.4287692308425903, "learning_rate": 1.4552263902901343e-05, "loss": 0.4124, "step": 18480 }, { "epoch": 3.636989501118733, "grad_norm": 1.1007471084594727, "learning_rate": 1.4549206640374209e-05, "loss": 0.3916, "step": 18490 }, { "epoch": 3.638956504634752, "grad_norm": 2.9746992588043213, "learning_rate": 1.4546149377847076e-05, "loss": 0.4604, "step": 18500 }, { "epoch": 3.638956504634752, "eval_loss": 0.18906159698963165, "eval_runtime": 8.8794, "eval_samples_per_second": 5.631, "eval_steps_per_second": 2.816, "step": 18500 }, { "epoch": 3.6409235081507707, "grad_norm": 1.591049313545227, "learning_rate": 1.4543092115319944e-05, "loss": 0.4539, "step": 18510 }, { "epoch": 3.6428905116667893, "grad_norm": 1.869845986366272, "learning_rate": 1.454003485279281e-05, "loss": 0.4262, "step": 18520 }, { "epoch": 3.6448575151828084, "grad_norm": 2.037968873977661, "learning_rate": 1.4536977590265677e-05, "loss": 0.435, "step": 18530 }, { "epoch": 3.646824518698827, "grad_norm": 1.494908094406128, "learning_rate": 1.4533920327738545e-05, "loss": 0.3022, "step": 18540 }, { "epoch": 3.6487915222148457, "grad_norm": 1.9231337308883667, "learning_rate": 1.4530863065211412e-05, "loss": 0.3857, "step": 18550 }, { "epoch": 3.650758525730865, "grad_norm": 1.4916620254516602, "learning_rate": 1.4527805802684278e-05, "loss": 0.4307, "step": 18560 }, { "epoch": 3.6527255292468834, "grad_norm": 1.293621301651001, "learning_rate": 1.4524748540157145e-05, "loss": 0.3926, "step": 18570 }, { "epoch": 3.654692532762902, "grad_norm": 1.8174000978469849, "learning_rate": 1.4521691277630013e-05, "loss": 0.4001, "step": 18580 }, { "epoch": 3.656659536278921, "grad_norm": 1.4056103229522705, "learning_rate": 1.4518634015102877e-05, "loss": 0.4308, "step": 18590 }, { "epoch": 3.65862653979494, "grad_norm": 1.1704431772232056, "learning_rate": 1.4515576752575744e-05, "loss": 0.3702, "step": 18600 }, { "epoch": 3.6605935433109584, "grad_norm": 2.3189661502838135, "learning_rate": 1.4512519490048612e-05, "loss": 0.3239, "step": 18610 }, { "epoch": 3.6625605468269775, "grad_norm": 2.15673828125, "learning_rate": 1.4509462227521478e-05, "loss": 0.3059, "step": 18620 }, { "epoch": 3.664527550342996, "grad_norm": 2.0306997299194336, "learning_rate": 1.4506404964994345e-05, "loss": 0.3696, "step": 18630 }, { "epoch": 3.666494553859015, "grad_norm": 1.7248976230621338, "learning_rate": 1.4503347702467213e-05, "loss": 0.3235, "step": 18640 }, { "epoch": 3.668461557375034, "grad_norm": 1.1028352975845337, "learning_rate": 1.4500290439940078e-05, "loss": 0.407, "step": 18650 }, { "epoch": 3.6704285608910525, "grad_norm": 1.5350819826126099, "learning_rate": 1.4497233177412946e-05, "loss": 0.3412, "step": 18660 }, { "epoch": 3.672395564407071, "grad_norm": 1.2633845806121826, "learning_rate": 1.4494175914885813e-05, "loss": 0.4628, "step": 18670 }, { "epoch": 3.6743625679230902, "grad_norm": 0.8121323585510254, "learning_rate": 1.449111865235868e-05, "loss": 0.3931, "step": 18680 }, { "epoch": 3.676329571439109, "grad_norm": 2.546295404434204, "learning_rate": 1.4488061389831546e-05, "loss": 0.3877, "step": 18690 }, { "epoch": 3.6782965749551275, "grad_norm": 1.443408489227295, "learning_rate": 1.4485004127304412e-05, "loss": 0.3466, "step": 18700 }, { "epoch": 3.6802635784711466, "grad_norm": 2.1941025257110596, "learning_rate": 1.4481946864777278e-05, "loss": 0.317, "step": 18710 }, { "epoch": 3.6822305819871652, "grad_norm": 1.4300298690795898, "learning_rate": 1.4478889602250146e-05, "loss": 0.515, "step": 18720 }, { "epoch": 3.684197585503184, "grad_norm": 1.361330270767212, "learning_rate": 1.4475832339723013e-05, "loss": 0.3974, "step": 18730 }, { "epoch": 3.686164589019203, "grad_norm": 1.9021443128585815, "learning_rate": 1.447277507719588e-05, "loss": 0.3661, "step": 18740 }, { "epoch": 3.6881315925352216, "grad_norm": 1.5542855262756348, "learning_rate": 1.4469717814668746e-05, "loss": 0.4889, "step": 18750 }, { "epoch": 3.6900985960512402, "grad_norm": 1.2739965915679932, "learning_rate": 1.4466660552141614e-05, "loss": 0.4009, "step": 18760 }, { "epoch": 3.6920655995672593, "grad_norm": 2.3175907135009766, "learning_rate": 1.4463603289614481e-05, "loss": 0.4487, "step": 18770 }, { "epoch": 3.694032603083278, "grad_norm": 4.204349994659424, "learning_rate": 1.4460546027087347e-05, "loss": 0.3568, "step": 18780 }, { "epoch": 3.6959996065992966, "grad_norm": 1.1765432357788086, "learning_rate": 1.4457488764560214e-05, "loss": 0.3488, "step": 18790 }, { "epoch": 3.6979666101153157, "grad_norm": 1.1111738681793213, "learning_rate": 1.4454431502033082e-05, "loss": 0.3493, "step": 18800 }, { "epoch": 3.6999336136313343, "grad_norm": 0.8277744650840759, "learning_rate": 1.445137423950595e-05, "loss": 0.3763, "step": 18810 }, { "epoch": 3.701900617147353, "grad_norm": 1.207329511642456, "learning_rate": 1.4448316976978813e-05, "loss": 0.4516, "step": 18820 }, { "epoch": 3.703867620663372, "grad_norm": 1.5290664434432983, "learning_rate": 1.4445259714451681e-05, "loss": 0.3653, "step": 18830 }, { "epoch": 3.7058346241793907, "grad_norm": 1.277742862701416, "learning_rate": 1.4442202451924547e-05, "loss": 0.471, "step": 18840 }, { "epoch": 3.7078016276954093, "grad_norm": 1.5165798664093018, "learning_rate": 1.4439145189397414e-05, "loss": 0.2837, "step": 18850 }, { "epoch": 3.7097686312114284, "grad_norm": 1.6007874011993408, "learning_rate": 1.4436087926870282e-05, "loss": 0.4247, "step": 18860 }, { "epoch": 3.711735634727447, "grad_norm": 1.5536600351333618, "learning_rate": 1.4433030664343149e-05, "loss": 0.3162, "step": 18870 }, { "epoch": 3.7137026382434657, "grad_norm": 1.0041977167129517, "learning_rate": 1.4429973401816015e-05, "loss": 0.4459, "step": 18880 }, { "epoch": 3.7156696417594848, "grad_norm": 1.6984635591506958, "learning_rate": 1.4426916139288882e-05, "loss": 0.3537, "step": 18890 }, { "epoch": 3.7176366452755034, "grad_norm": 1.9706878662109375, "learning_rate": 1.442385887676175e-05, "loss": 0.4942, "step": 18900 }, { "epoch": 3.719603648791522, "grad_norm": 0.7841013669967651, "learning_rate": 1.4420801614234616e-05, "loss": 0.3564, "step": 18910 }, { "epoch": 3.721570652307541, "grad_norm": 1.4458509683609009, "learning_rate": 1.4417744351707483e-05, "loss": 0.3577, "step": 18920 }, { "epoch": 3.7235376558235598, "grad_norm": 2.219909906387329, "learning_rate": 1.4414687089180349e-05, "loss": 0.5433, "step": 18930 }, { "epoch": 3.7255046593395784, "grad_norm": 0.8798016905784607, "learning_rate": 1.4411629826653215e-05, "loss": 0.3507, "step": 18940 }, { "epoch": 3.7274716628555975, "grad_norm": 1.1821753978729248, "learning_rate": 1.4408572564126082e-05, "loss": 0.4787, "step": 18950 }, { "epoch": 3.729438666371616, "grad_norm": 0.8908816576004028, "learning_rate": 1.440551530159895e-05, "loss": 0.358, "step": 18960 }, { "epoch": 3.731405669887635, "grad_norm": 0.9876505732536316, "learning_rate": 1.4402458039071815e-05, "loss": 0.4758, "step": 18970 }, { "epoch": 3.733372673403654, "grad_norm": 0.9503434896469116, "learning_rate": 1.4399400776544683e-05, "loss": 0.3929, "step": 18980 }, { "epoch": 3.7353396769196725, "grad_norm": 0.9545804262161255, "learning_rate": 1.439634351401755e-05, "loss": 0.4286, "step": 18990 }, { "epoch": 3.737306680435691, "grad_norm": 2.3993990421295166, "learning_rate": 1.4393286251490418e-05, "loss": 0.3785, "step": 19000 }, { "epoch": 3.737306680435691, "eval_loss": 0.17065879702568054, "eval_runtime": 8.8951, "eval_samples_per_second": 5.621, "eval_steps_per_second": 2.811, "step": 19000 }, { "epoch": 3.7392736839517102, "grad_norm": 2.2503209114074707, "learning_rate": 1.4390228988963284e-05, "loss": 0.4456, "step": 19010 }, { "epoch": 3.741240687467729, "grad_norm": 1.7953013181686401, "learning_rate": 1.4387171726436151e-05, "loss": 0.3437, "step": 19020 }, { "epoch": 3.7432076909837475, "grad_norm": 3.9343481063842773, "learning_rate": 1.4384114463909019e-05, "loss": 0.4451, "step": 19030 }, { "epoch": 3.7451746944997666, "grad_norm": 1.1897847652435303, "learning_rate": 1.4381057201381883e-05, "loss": 0.4459, "step": 19040 }, { "epoch": 3.7471416980157852, "grad_norm": 1.6288838386535645, "learning_rate": 1.437799993885475e-05, "loss": 0.4209, "step": 19050 }, { "epoch": 3.749108701531804, "grad_norm": 1.8541220426559448, "learning_rate": 1.4374942676327618e-05, "loss": 0.4737, "step": 19060 }, { "epoch": 3.751075705047823, "grad_norm": 1.3628418445587158, "learning_rate": 1.4371885413800483e-05, "loss": 0.4486, "step": 19070 }, { "epoch": 3.7530427085638416, "grad_norm": 1.4050499200820923, "learning_rate": 1.436882815127335e-05, "loss": 0.39, "step": 19080 }, { "epoch": 3.7550097120798602, "grad_norm": 0.8367191553115845, "learning_rate": 1.4365770888746218e-05, "loss": 0.3072, "step": 19090 }, { "epoch": 3.7569767155958793, "grad_norm": 1.374824047088623, "learning_rate": 1.4362713626219084e-05, "loss": 0.3958, "step": 19100 }, { "epoch": 3.758943719111898, "grad_norm": 1.165436029434204, "learning_rate": 1.4359656363691952e-05, "loss": 0.4409, "step": 19110 }, { "epoch": 3.7609107226279166, "grad_norm": 1.6920998096466064, "learning_rate": 1.4356599101164819e-05, "loss": 0.4824, "step": 19120 }, { "epoch": 3.7628777261439357, "grad_norm": 0.9310535788536072, "learning_rate": 1.4353541838637686e-05, "loss": 0.4144, "step": 19130 }, { "epoch": 3.7648447296599543, "grad_norm": 1.4429125785827637, "learning_rate": 1.4350484576110552e-05, "loss": 0.2829, "step": 19140 }, { "epoch": 3.766811733175973, "grad_norm": 1.7485815286636353, "learning_rate": 1.434742731358342e-05, "loss": 0.5212, "step": 19150 }, { "epoch": 3.768778736691992, "grad_norm": 1.283400535583496, "learning_rate": 1.4344370051056284e-05, "loss": 0.4345, "step": 19160 }, { "epoch": 3.7707457402080107, "grad_norm": 1.8467475175857544, "learning_rate": 1.4341312788529151e-05, "loss": 0.2751, "step": 19170 }, { "epoch": 3.7727127437240293, "grad_norm": 1.880601406097412, "learning_rate": 1.4338255526002019e-05, "loss": 0.347, "step": 19180 }, { "epoch": 3.7746797472400484, "grad_norm": 2.1124298572540283, "learning_rate": 1.4335198263474886e-05, "loss": 0.5249, "step": 19190 }, { "epoch": 3.776646750756067, "grad_norm": 1.1820361614227295, "learning_rate": 1.4332141000947752e-05, "loss": 0.3955, "step": 19200 }, { "epoch": 3.7786137542720857, "grad_norm": 1.8350051641464233, "learning_rate": 1.432908373842062e-05, "loss": 0.5027, "step": 19210 }, { "epoch": 3.7805807577881048, "grad_norm": 1.0418702363967896, "learning_rate": 1.4326026475893487e-05, "loss": 0.4091, "step": 19220 }, { "epoch": 3.7825477613041234, "grad_norm": 1.2310411930084229, "learning_rate": 1.4322969213366353e-05, "loss": 0.3877, "step": 19230 }, { "epoch": 3.784514764820142, "grad_norm": 1.0506771802902222, "learning_rate": 1.431991195083922e-05, "loss": 0.3961, "step": 19240 }, { "epoch": 3.786481768336161, "grad_norm": 1.3733229637145996, "learning_rate": 1.4316854688312088e-05, "loss": 0.3588, "step": 19250 }, { "epoch": 3.7884487718521798, "grad_norm": 1.6931480169296265, "learning_rate": 1.4313797425784955e-05, "loss": 0.4726, "step": 19260 }, { "epoch": 3.7904157753681984, "grad_norm": 1.191991925239563, "learning_rate": 1.431074016325782e-05, "loss": 0.3923, "step": 19270 }, { "epoch": 3.7923827788842175, "grad_norm": 2.010301351547241, "learning_rate": 1.4307682900730687e-05, "loss": 0.4324, "step": 19280 }, { "epoch": 3.794349782400236, "grad_norm": 0.7601318359375, "learning_rate": 1.4304625638203552e-05, "loss": 0.5484, "step": 19290 }, { "epoch": 3.7963167859162548, "grad_norm": 1.0302248001098633, "learning_rate": 1.430156837567642e-05, "loss": 0.4386, "step": 19300 }, { "epoch": 3.798283789432274, "grad_norm": 1.992654800415039, "learning_rate": 1.4298511113149287e-05, "loss": 0.34, "step": 19310 }, { "epoch": 3.8002507929482925, "grad_norm": 1.9000381231307983, "learning_rate": 1.4295453850622155e-05, "loss": 0.5453, "step": 19320 }, { "epoch": 3.802217796464311, "grad_norm": 1.3942879438400269, "learning_rate": 1.429239658809502e-05, "loss": 0.4681, "step": 19330 }, { "epoch": 3.80418479998033, "grad_norm": 0.8642085790634155, "learning_rate": 1.4289339325567888e-05, "loss": 0.3664, "step": 19340 }, { "epoch": 3.806151803496349, "grad_norm": 1.3701379299163818, "learning_rate": 1.4286282063040756e-05, "loss": 0.3545, "step": 19350 }, { "epoch": 3.8081188070123675, "grad_norm": 1.2130693197250366, "learning_rate": 1.4283224800513621e-05, "loss": 0.353, "step": 19360 }, { "epoch": 3.8100858105283866, "grad_norm": 1.9979584217071533, "learning_rate": 1.4280167537986489e-05, "loss": 0.4111, "step": 19370 }, { "epoch": 3.812052814044405, "grad_norm": 1.0861486196517944, "learning_rate": 1.4277110275459355e-05, "loss": 0.3785, "step": 19380 }, { "epoch": 3.814019817560424, "grad_norm": 1.5842684507369995, "learning_rate": 1.427405301293222e-05, "loss": 0.3934, "step": 19390 }, { "epoch": 3.815986821076443, "grad_norm": 1.0454131364822388, "learning_rate": 1.4270995750405088e-05, "loss": 0.4427, "step": 19400 }, { "epoch": 3.8179538245924616, "grad_norm": 0.670005202293396, "learning_rate": 1.4267938487877955e-05, "loss": 0.4881, "step": 19410 }, { "epoch": 3.81992082810848, "grad_norm": 0.9665763974189758, "learning_rate": 1.4264881225350821e-05, "loss": 0.3345, "step": 19420 }, { "epoch": 3.8218878316244993, "grad_norm": 0.7225205302238464, "learning_rate": 1.4261823962823689e-05, "loss": 0.4946, "step": 19430 }, { "epoch": 3.823854835140518, "grad_norm": 1.0303906202316284, "learning_rate": 1.4258766700296556e-05, "loss": 0.4415, "step": 19440 }, { "epoch": 3.8258218386565366, "grad_norm": 1.3895282745361328, "learning_rate": 1.4255709437769424e-05, "loss": 0.3707, "step": 19450 }, { "epoch": 3.827788842172555, "grad_norm": 1.1078003644943237, "learning_rate": 1.425265217524229e-05, "loss": 0.4026, "step": 19460 }, { "epoch": 3.8297558456885743, "grad_norm": 2.3170745372772217, "learning_rate": 1.4249594912715157e-05, "loss": 0.341, "step": 19470 }, { "epoch": 3.831722849204593, "grad_norm": 1.1369905471801758, "learning_rate": 1.4246537650188024e-05, "loss": 0.4851, "step": 19480 }, { "epoch": 3.8336898527206116, "grad_norm": 1.0877054929733276, "learning_rate": 1.424348038766089e-05, "loss": 0.4159, "step": 19490 }, { "epoch": 3.8356568562366307, "grad_norm": 1.8775924444198608, "learning_rate": 1.4240423125133756e-05, "loss": 0.3514, "step": 19500 }, { "epoch": 3.8356568562366307, "eval_loss": 0.18426425755023956, "eval_runtime": 8.8833, "eval_samples_per_second": 5.629, "eval_steps_per_second": 2.814, "step": 19500 }, { "epoch": 3.8376238597526493, "grad_norm": 1.6352936029434204, "learning_rate": 1.4237365862606623e-05, "loss": 0.6251, "step": 19510 }, { "epoch": 3.839590863268668, "grad_norm": 1.4518588781356812, "learning_rate": 1.4234308600079489e-05, "loss": 0.3573, "step": 19520 }, { "epoch": 3.841557866784687, "grad_norm": 1.4944310188293457, "learning_rate": 1.4231251337552357e-05, "loss": 0.4555, "step": 19530 }, { "epoch": 3.8435248703007057, "grad_norm": 1.3529949188232422, "learning_rate": 1.4228194075025224e-05, "loss": 0.3766, "step": 19540 }, { "epoch": 3.8454918738167243, "grad_norm": 1.6837838888168335, "learning_rate": 1.422513681249809e-05, "loss": 0.4189, "step": 19550 }, { "epoch": 3.8474588773327434, "grad_norm": 0.719254732131958, "learning_rate": 1.4222079549970957e-05, "loss": 0.4939, "step": 19560 }, { "epoch": 3.849425880848762, "grad_norm": 1.1821671724319458, "learning_rate": 1.4219022287443825e-05, "loss": 0.4553, "step": 19570 }, { "epoch": 3.8513928843647807, "grad_norm": 1.264253854751587, "learning_rate": 1.4215965024916692e-05, "loss": 0.52, "step": 19580 }, { "epoch": 3.8533598878807998, "grad_norm": 1.4270673990249634, "learning_rate": 1.4212907762389558e-05, "loss": 0.3172, "step": 19590 }, { "epoch": 3.8553268913968184, "grad_norm": 1.0974931716918945, "learning_rate": 1.4209850499862425e-05, "loss": 0.4648, "step": 19600 }, { "epoch": 3.857293894912837, "grad_norm": 1.9445582628250122, "learning_rate": 1.420679323733529e-05, "loss": 0.3901, "step": 19610 }, { "epoch": 3.8592608984288557, "grad_norm": 1.3331164121627808, "learning_rate": 1.4203735974808157e-05, "loss": 0.394, "step": 19620 }, { "epoch": 3.8612279019448748, "grad_norm": 1.6156160831451416, "learning_rate": 1.4200678712281024e-05, "loss": 0.352, "step": 19630 }, { "epoch": 3.8631949054608934, "grad_norm": 1.9054970741271973, "learning_rate": 1.4197621449753892e-05, "loss": 0.455, "step": 19640 }, { "epoch": 3.865161908976912, "grad_norm": 1.2336102724075317, "learning_rate": 1.4194564187226758e-05, "loss": 0.3731, "step": 19650 }, { "epoch": 3.867128912492931, "grad_norm": 1.1675928831100464, "learning_rate": 1.4191506924699625e-05, "loss": 0.3594, "step": 19660 }, { "epoch": 3.8690959160089498, "grad_norm": 1.6791987419128418, "learning_rate": 1.4188449662172493e-05, "loss": 0.3734, "step": 19670 }, { "epoch": 3.8710629195249684, "grad_norm": 1.1804096698760986, "learning_rate": 1.4185392399645358e-05, "loss": 0.3333, "step": 19680 }, { "epoch": 3.8730299230409875, "grad_norm": 0.658993661403656, "learning_rate": 1.4182335137118226e-05, "loss": 0.27, "step": 19690 }, { "epoch": 3.874996926557006, "grad_norm": 1.942895770072937, "learning_rate": 1.4179277874591093e-05, "loss": 0.2867, "step": 19700 }, { "epoch": 3.8769639300730248, "grad_norm": 2.464463233947754, "learning_rate": 1.4176220612063961e-05, "loss": 0.4725, "step": 19710 }, { "epoch": 3.878930933589044, "grad_norm": 1.6073675155639648, "learning_rate": 1.4173163349536825e-05, "loss": 0.3144, "step": 19720 }, { "epoch": 3.8808979371050625, "grad_norm": 1.7023011445999146, "learning_rate": 1.4170106087009692e-05, "loss": 0.3217, "step": 19730 }, { "epoch": 3.882864940621081, "grad_norm": 3.136507749557495, "learning_rate": 1.4167048824482558e-05, "loss": 0.4172, "step": 19740 }, { "epoch": 3.8848319441371, "grad_norm": 1.6217586994171143, "learning_rate": 1.4163991561955426e-05, "loss": 0.5113, "step": 19750 }, { "epoch": 3.886798947653119, "grad_norm": 1.7035020589828491, "learning_rate": 1.4160934299428293e-05, "loss": 0.379, "step": 19760 }, { "epoch": 3.8887659511691375, "grad_norm": 1.4003608226776123, "learning_rate": 1.415787703690116e-05, "loss": 0.3828, "step": 19770 }, { "epoch": 3.8907329546851566, "grad_norm": 1.6719714403152466, "learning_rate": 1.4154819774374026e-05, "loss": 0.3379, "step": 19780 }, { "epoch": 3.892699958201175, "grad_norm": 1.173722743988037, "learning_rate": 1.4151762511846894e-05, "loss": 0.3918, "step": 19790 }, { "epoch": 3.894666961717194, "grad_norm": 0.8642592430114746, "learning_rate": 1.4148705249319761e-05, "loss": 0.4654, "step": 19800 }, { "epoch": 3.896633965233213, "grad_norm": 2.1731534004211426, "learning_rate": 1.4145647986792627e-05, "loss": 0.4032, "step": 19810 }, { "epoch": 3.8986009687492316, "grad_norm": 1.903075933456421, "learning_rate": 1.4142590724265495e-05, "loss": 0.3913, "step": 19820 }, { "epoch": 3.90056797226525, "grad_norm": 2.3391592502593994, "learning_rate": 1.4139533461738362e-05, "loss": 0.3946, "step": 19830 }, { "epoch": 3.9025349757812693, "grad_norm": 1.1577038764953613, "learning_rate": 1.4136476199211226e-05, "loss": 0.4448, "step": 19840 }, { "epoch": 3.904501979297288, "grad_norm": 1.8592039346694946, "learning_rate": 1.4133418936684094e-05, "loss": 0.3715, "step": 19850 }, { "epoch": 3.9064689828133066, "grad_norm": 1.1040771007537842, "learning_rate": 1.4130361674156961e-05, "loss": 0.3806, "step": 19860 }, { "epoch": 3.9084359863293257, "grad_norm": 1.3957312107086182, "learning_rate": 1.4127304411629827e-05, "loss": 0.431, "step": 19870 }, { "epoch": 3.9104029898453443, "grad_norm": 1.2022626399993896, "learning_rate": 1.4124247149102694e-05, "loss": 0.2885, "step": 19880 }, { "epoch": 3.912369993361363, "grad_norm": 2.126249074935913, "learning_rate": 1.4121189886575562e-05, "loss": 0.45, "step": 19890 }, { "epoch": 3.914336996877382, "grad_norm": 2.9966373443603516, "learning_rate": 1.411813262404843e-05, "loss": 0.5084, "step": 19900 }, { "epoch": 3.9163040003934007, "grad_norm": 0.9211567640304565, "learning_rate": 1.4115075361521295e-05, "loss": 0.4613, "step": 19910 }, { "epoch": 3.9182710039094193, "grad_norm": 1.7329440116882324, "learning_rate": 1.4112018098994163e-05, "loss": 0.3291, "step": 19920 }, { "epoch": 3.9202380074254384, "grad_norm": 2.0354349613189697, "learning_rate": 1.410896083646703e-05, "loss": 0.4231, "step": 19930 }, { "epoch": 3.922205010941457, "grad_norm": 19.046695709228516, "learning_rate": 1.4105903573939896e-05, "loss": 0.4676, "step": 19940 }, { "epoch": 3.9241720144574757, "grad_norm": 1.4789849519729614, "learning_rate": 1.4102846311412762e-05, "loss": 0.5275, "step": 19950 }, { "epoch": 3.9261390179734947, "grad_norm": 0.89300936460495, "learning_rate": 1.4099789048885627e-05, "loss": 0.4568, "step": 19960 }, { "epoch": 3.9281060214895134, "grad_norm": 1.1245640516281128, "learning_rate": 1.4096731786358495e-05, "loss": 0.4998, "step": 19970 }, { "epoch": 3.930073025005532, "grad_norm": 1.9542558193206787, "learning_rate": 1.4093674523831362e-05, "loss": 0.4368, "step": 19980 }, { "epoch": 3.932040028521551, "grad_norm": 1.956928014755249, "learning_rate": 1.409061726130423e-05, "loss": 0.4154, "step": 19990 }, { "epoch": 3.9340070320375697, "grad_norm": 2.864295721054077, "learning_rate": 1.4087559998777096e-05, "loss": 0.3798, "step": 20000 }, { "epoch": 3.9340070320375697, "eval_loss": 0.16804826259613037, "eval_runtime": 8.8889, "eval_samples_per_second": 5.625, "eval_steps_per_second": 2.812, "step": 20000 }, { "epoch": 3.9359740355535884, "grad_norm": 1.1259634494781494, "learning_rate": 1.4084502736249963e-05, "loss": 0.3126, "step": 20010 }, { "epoch": 3.9379410390696075, "grad_norm": 1.4045112133026123, "learning_rate": 1.408144547372283e-05, "loss": 0.3549, "step": 20020 }, { "epoch": 3.939908042585626, "grad_norm": 1.2622108459472656, "learning_rate": 1.4078388211195698e-05, "loss": 0.3867, "step": 20030 }, { "epoch": 3.9418750461016447, "grad_norm": 1.6983309984207153, "learning_rate": 1.4075330948668564e-05, "loss": 0.3751, "step": 20040 }, { "epoch": 3.943842049617664, "grad_norm": 1.3586803674697876, "learning_rate": 1.4072273686141431e-05, "loss": 0.6043, "step": 20050 }, { "epoch": 3.9458090531336825, "grad_norm": 0.9066451191902161, "learning_rate": 1.4069216423614295e-05, "loss": 0.3718, "step": 20060 }, { "epoch": 3.947776056649701, "grad_norm": 1.4924407005310059, "learning_rate": 1.4066159161087163e-05, "loss": 0.3764, "step": 20070 }, { "epoch": 3.94974306016572, "grad_norm": 1.226970911026001, "learning_rate": 1.406310189856003e-05, "loss": 0.3908, "step": 20080 }, { "epoch": 3.951710063681739, "grad_norm": 0.9943327307701111, "learning_rate": 1.4060044636032896e-05, "loss": 0.4514, "step": 20090 }, { "epoch": 3.9536770671977575, "grad_norm": 1.371333360671997, "learning_rate": 1.4056987373505763e-05, "loss": 0.4506, "step": 20100 }, { "epoch": 3.9556440707137766, "grad_norm": 1.717349648475647, "learning_rate": 1.4053930110978631e-05, "loss": 0.3844, "step": 20110 }, { "epoch": 3.957611074229795, "grad_norm": 2.133634090423584, "learning_rate": 1.4050872848451498e-05, "loss": 0.4074, "step": 20120 }, { "epoch": 3.959578077745814, "grad_norm": 1.379530906677246, "learning_rate": 1.4047815585924364e-05, "loss": 0.4095, "step": 20130 }, { "epoch": 3.961545081261833, "grad_norm": 1.0096360445022583, "learning_rate": 1.4044758323397232e-05, "loss": 0.3714, "step": 20140 }, { "epoch": 3.9635120847778516, "grad_norm": 2.1940183639526367, "learning_rate": 1.4041701060870099e-05, "loss": 0.4436, "step": 20150 }, { "epoch": 3.96547908829387, "grad_norm": 1.4452601671218872, "learning_rate": 1.4038643798342967e-05, "loss": 0.4699, "step": 20160 }, { "epoch": 3.9674460918098893, "grad_norm": 1.9705860614776611, "learning_rate": 1.4035586535815832e-05, "loss": 0.386, "step": 20170 }, { "epoch": 3.969413095325908, "grad_norm": 1.5604002475738525, "learning_rate": 1.4032529273288698e-05, "loss": 0.4451, "step": 20180 }, { "epoch": 3.9713800988419266, "grad_norm": 1.4916459321975708, "learning_rate": 1.4029472010761564e-05, "loss": 0.3558, "step": 20190 }, { "epoch": 3.9733471023579456, "grad_norm": 2.563976287841797, "learning_rate": 1.4026414748234431e-05, "loss": 0.3958, "step": 20200 }, { "epoch": 3.9753141058739643, "grad_norm": 2.008441209793091, "learning_rate": 1.4023357485707299e-05, "loss": 0.4369, "step": 20210 }, { "epoch": 3.977281109389983, "grad_norm": 1.1274158954620361, "learning_rate": 1.4020300223180165e-05, "loss": 0.2473, "step": 20220 }, { "epoch": 3.979248112906002, "grad_norm": 1.1995351314544678, "learning_rate": 1.4017242960653032e-05, "loss": 0.4072, "step": 20230 }, { "epoch": 3.9812151164220206, "grad_norm": 1.7753493785858154, "learning_rate": 1.40141856981259e-05, "loss": 0.3449, "step": 20240 }, { "epoch": 3.9831821199380393, "grad_norm": 2.2701807022094727, "learning_rate": 1.4011128435598767e-05, "loss": 0.4204, "step": 20250 }, { "epoch": 3.9851491234540584, "grad_norm": 1.2807681560516357, "learning_rate": 1.4008071173071633e-05, "loss": 0.2937, "step": 20260 }, { "epoch": 3.987116126970077, "grad_norm": 0.9695661664009094, "learning_rate": 1.40050139105445e-05, "loss": 0.382, "step": 20270 }, { "epoch": 3.9890831304860956, "grad_norm": 2.4322593212127686, "learning_rate": 1.4001956648017368e-05, "loss": 0.4999, "step": 20280 }, { "epoch": 3.9910501340021147, "grad_norm": 1.0660431385040283, "learning_rate": 1.3998899385490232e-05, "loss": 0.406, "step": 20290 }, { "epoch": 3.9930171375181334, "grad_norm": 1.3648877143859863, "learning_rate": 1.39958421229631e-05, "loss": 0.4379, "step": 20300 }, { "epoch": 3.994984141034152, "grad_norm": 1.5317339897155762, "learning_rate": 1.3992784860435967e-05, "loss": 0.4288, "step": 20310 }, { "epoch": 3.996951144550171, "grad_norm": 1.7145333290100098, "learning_rate": 1.3989727597908833e-05, "loss": 0.3317, "step": 20320 }, { "epoch": 3.9989181480661897, "grad_norm": 3.5187952518463135, "learning_rate": 1.39866703353817e-05, "loss": 0.4305, "step": 20330 }, { "epoch": 4.000885151582208, "grad_norm": 0.9262216091156006, "learning_rate": 1.3983613072854568e-05, "loss": 0.3561, "step": 20340 }, { "epoch": 4.0028521550982274, "grad_norm": 1.3652830123901367, "learning_rate": 1.3980555810327433e-05, "loss": 0.4874, "step": 20350 }, { "epoch": 4.004819158614246, "grad_norm": 0.9129854440689087, "learning_rate": 1.39774985478003e-05, "loss": 0.3942, "step": 20360 }, { "epoch": 4.006786162130265, "grad_norm": 1.0334409475326538, "learning_rate": 1.3974441285273168e-05, "loss": 0.3191, "step": 20370 }, { "epoch": 4.008753165646284, "grad_norm": 1.292149543762207, "learning_rate": 1.3971384022746036e-05, "loss": 0.4056, "step": 20380 }, { "epoch": 4.010720169162302, "grad_norm": 1.2655421495437622, "learning_rate": 1.3968326760218902e-05, "loss": 0.2615, "step": 20390 }, { "epoch": 4.012687172678321, "grad_norm": 3.0423643589019775, "learning_rate": 1.3965269497691767e-05, "loss": 0.3888, "step": 20400 }, { "epoch": 4.01465417619434, "grad_norm": 1.6518824100494385, "learning_rate": 1.3962212235164633e-05, "loss": 0.3371, "step": 20410 }, { "epoch": 4.016621179710358, "grad_norm": 1.5549308061599731, "learning_rate": 1.39591549726375e-05, "loss": 0.4344, "step": 20420 }, { "epoch": 4.0185881832263775, "grad_norm": 1.3199830055236816, "learning_rate": 1.3956097710110368e-05, "loss": 0.4384, "step": 20430 }, { "epoch": 4.0205551867423965, "grad_norm": 1.8251094818115234, "learning_rate": 1.3953040447583235e-05, "loss": 0.3351, "step": 20440 }, { "epoch": 4.022522190258415, "grad_norm": 1.6318509578704834, "learning_rate": 1.3949983185056101e-05, "loss": 0.4346, "step": 20450 }, { "epoch": 4.024489193774434, "grad_norm": 1.016904354095459, "learning_rate": 1.3946925922528969e-05, "loss": 0.3728, "step": 20460 }, { "epoch": 4.026456197290453, "grad_norm": 1.3480511903762817, "learning_rate": 1.3943868660001836e-05, "loss": 0.2728, "step": 20470 }, { "epoch": 4.028423200806471, "grad_norm": 1.0706782341003418, "learning_rate": 1.3940811397474702e-05, "loss": 0.4899, "step": 20480 }, { "epoch": 4.03039020432249, "grad_norm": 2.8634934425354004, "learning_rate": 1.393775413494757e-05, "loss": 0.4906, "step": 20490 }, { "epoch": 4.032357207838509, "grad_norm": 1.0132216215133667, "learning_rate": 1.3934696872420437e-05, "loss": 0.3943, "step": 20500 }, { "epoch": 4.032357207838509, "eval_loss": 0.16937392950057983, "eval_runtime": 8.8749, "eval_samples_per_second": 5.634, "eval_steps_per_second": 2.817, "step": 20500 }, { "epoch": 4.0343242113545275, "grad_norm": 1.4437650442123413, "learning_rate": 1.3931639609893304e-05, "loss": 0.3685, "step": 20510 }, { "epoch": 4.0362912148705465, "grad_norm": 1.7009004354476929, "learning_rate": 1.3928582347366168e-05, "loss": 0.4131, "step": 20520 }, { "epoch": 4.038258218386566, "grad_norm": 1.1112629175186157, "learning_rate": 1.3925525084839036e-05, "loss": 0.4157, "step": 20530 }, { "epoch": 4.040225221902584, "grad_norm": 1.0249930620193481, "learning_rate": 1.3922467822311902e-05, "loss": 0.4676, "step": 20540 }, { "epoch": 4.042192225418603, "grad_norm": 1.9239535331726074, "learning_rate": 1.391941055978477e-05, "loss": 0.3815, "step": 20550 }, { "epoch": 4.044159228934622, "grad_norm": 1.6564403772354126, "learning_rate": 1.3916353297257637e-05, "loss": 0.3594, "step": 20560 }, { "epoch": 4.04612623245064, "grad_norm": 1.2980719804763794, "learning_rate": 1.3913296034730504e-05, "loss": 0.2283, "step": 20570 }, { "epoch": 4.048093235966659, "grad_norm": 0.801551878452301, "learning_rate": 1.391023877220337e-05, "loss": 0.3371, "step": 20580 }, { "epoch": 4.050060239482678, "grad_norm": 2.342517852783203, "learning_rate": 1.3907181509676237e-05, "loss": 0.3808, "step": 20590 }, { "epoch": 4.0520272429986965, "grad_norm": 2.5508110523223877, "learning_rate": 1.3904124247149105e-05, "loss": 0.4242, "step": 20600 }, { "epoch": 4.053994246514716, "grad_norm": 1.4144970178604126, "learning_rate": 1.390106698462197e-05, "loss": 0.3226, "step": 20610 }, { "epoch": 4.055961250030735, "grad_norm": 1.0900161266326904, "learning_rate": 1.3898009722094838e-05, "loss": 0.283, "step": 20620 }, { "epoch": 4.057928253546753, "grad_norm": 1.0960556268692017, "learning_rate": 1.3894952459567704e-05, "loss": 0.3846, "step": 20630 }, { "epoch": 4.059895257062772, "grad_norm": 1.9161323308944702, "learning_rate": 1.389189519704057e-05, "loss": 0.5165, "step": 20640 }, { "epoch": 4.061862260578791, "grad_norm": 0.9415732026100159, "learning_rate": 1.3888837934513437e-05, "loss": 0.4541, "step": 20650 }, { "epoch": 4.063829264094809, "grad_norm": 1.3521498441696167, "learning_rate": 1.3885780671986305e-05, "loss": 0.3138, "step": 20660 }, { "epoch": 4.065796267610828, "grad_norm": 1.2440180778503418, "learning_rate": 1.388272340945917e-05, "loss": 0.4493, "step": 20670 }, { "epoch": 4.067763271126847, "grad_norm": 1.1224850416183472, "learning_rate": 1.3879666146932038e-05, "loss": 0.3012, "step": 20680 }, { "epoch": 4.069730274642866, "grad_norm": 1.1821485757827759, "learning_rate": 1.3876608884404905e-05, "loss": 0.4457, "step": 20690 }, { "epoch": 4.071697278158885, "grad_norm": 1.7971569299697876, "learning_rate": 1.3873551621877773e-05, "loss": 0.3576, "step": 20700 }, { "epoch": 4.073664281674904, "grad_norm": 1.944004774093628, "learning_rate": 1.3870494359350639e-05, "loss": 0.4334, "step": 20710 }, { "epoch": 4.075631285190922, "grad_norm": 0.9380494952201843, "learning_rate": 1.3867437096823506e-05, "loss": 0.3865, "step": 20720 }, { "epoch": 4.077598288706941, "grad_norm": 0.9753469824790955, "learning_rate": 1.3864379834296374e-05, "loss": 0.3999, "step": 20730 }, { "epoch": 4.07956529222296, "grad_norm": 1.8330798149108887, "learning_rate": 1.3861322571769238e-05, "loss": 0.3551, "step": 20740 }, { "epoch": 4.081532295738978, "grad_norm": 1.0402212142944336, "learning_rate": 1.3858265309242105e-05, "loss": 0.3628, "step": 20750 }, { "epoch": 4.083499299254997, "grad_norm": 0.5755205154418945, "learning_rate": 1.3855208046714973e-05, "loss": 0.4447, "step": 20760 }, { "epoch": 4.0854663027710165, "grad_norm": 1.3143962621688843, "learning_rate": 1.3852150784187838e-05, "loss": 0.3869, "step": 20770 }, { "epoch": 4.087433306287035, "grad_norm": 1.2326987981796265, "learning_rate": 1.3849093521660706e-05, "loss": 0.377, "step": 20780 }, { "epoch": 4.089400309803054, "grad_norm": 2.1556975841522217, "learning_rate": 1.3846036259133573e-05, "loss": 0.4335, "step": 20790 }, { "epoch": 4.091367313319073, "grad_norm": 1.4288907051086426, "learning_rate": 1.3842978996606439e-05, "loss": 0.243, "step": 20800 }, { "epoch": 4.093334316835091, "grad_norm": 1.218528151512146, "learning_rate": 1.3839921734079307e-05, "loss": 0.4571, "step": 20810 }, { "epoch": 4.09530132035111, "grad_norm": 0.7718478441238403, "learning_rate": 1.3836864471552174e-05, "loss": 0.3369, "step": 20820 }, { "epoch": 4.097268323867129, "grad_norm": 1.171607494354248, "learning_rate": 1.3833807209025041e-05, "loss": 0.3596, "step": 20830 }, { "epoch": 4.099235327383147, "grad_norm": 0.9774153232574463, "learning_rate": 1.3830749946497907e-05, "loss": 0.2873, "step": 20840 }, { "epoch": 4.1012023308991665, "grad_norm": 2.146676540374756, "learning_rate": 1.3827692683970775e-05, "loss": 0.3206, "step": 20850 }, { "epoch": 4.103169334415186, "grad_norm": 1.7797375917434692, "learning_rate": 1.3824635421443639e-05, "loss": 0.2919, "step": 20860 }, { "epoch": 4.105136337931204, "grad_norm": 1.1288076639175415, "learning_rate": 1.3821578158916506e-05, "loss": 0.3807, "step": 20870 }, { "epoch": 4.107103341447223, "grad_norm": 1.2551835775375366, "learning_rate": 1.3818520896389374e-05, "loss": 0.4071, "step": 20880 }, { "epoch": 4.109070344963242, "grad_norm": 1.2019129991531372, "learning_rate": 1.3815463633862241e-05, "loss": 0.377, "step": 20890 }, { "epoch": 4.11103734847926, "grad_norm": 2.886444091796875, "learning_rate": 1.3812406371335107e-05, "loss": 0.4303, "step": 20900 }, { "epoch": 4.113004351995279, "grad_norm": 2.8083629608154297, "learning_rate": 1.3809349108807974e-05, "loss": 0.3066, "step": 20910 }, { "epoch": 4.114971355511298, "grad_norm": 1.7566733360290527, "learning_rate": 1.3806291846280842e-05, "loss": 0.4121, "step": 20920 }, { "epoch": 4.1169383590273165, "grad_norm": 1.5556622743606567, "learning_rate": 1.3803234583753708e-05, "loss": 0.3334, "step": 20930 }, { "epoch": 4.118905362543336, "grad_norm": 0.984484851360321, "learning_rate": 1.3800177321226575e-05, "loss": 0.3508, "step": 20940 }, { "epoch": 4.120872366059355, "grad_norm": 1.2625389099121094, "learning_rate": 1.3797120058699443e-05, "loss": 0.2859, "step": 20950 }, { "epoch": 4.122839369575373, "grad_norm": 2.6469364166259766, "learning_rate": 1.379406279617231e-05, "loss": 0.331, "step": 20960 }, { "epoch": 4.124806373091392, "grad_norm": 0.9096398949623108, "learning_rate": 1.3791005533645174e-05, "loss": 0.3237, "step": 20970 }, { "epoch": 4.126773376607411, "grad_norm": 1.5150424242019653, "learning_rate": 1.3787948271118042e-05, "loss": 0.3276, "step": 20980 }, { "epoch": 4.128740380123429, "grad_norm": 1.083422064781189, "learning_rate": 1.3784891008590907e-05, "loss": 0.3369, "step": 20990 }, { "epoch": 4.130707383639448, "grad_norm": 1.2050446271896362, "learning_rate": 1.3781833746063775e-05, "loss": 0.2082, "step": 21000 }, { "epoch": 4.130707383639448, "eval_loss": 0.15738588571548462, "eval_runtime": 8.8899, "eval_samples_per_second": 5.624, "eval_steps_per_second": 2.812, "step": 21000 }, { "epoch": 4.132674387155467, "grad_norm": 0.7613298892974854, "learning_rate": 1.3778776483536642e-05, "loss": 0.4471, "step": 21010 }, { "epoch": 4.134641390671486, "grad_norm": 0.7388508915901184, "learning_rate": 1.377571922100951e-05, "loss": 0.4445, "step": 21020 }, { "epoch": 4.136608394187505, "grad_norm": 1.4384886026382446, "learning_rate": 1.3772661958482376e-05, "loss": 0.3718, "step": 21030 }, { "epoch": 4.138575397703524, "grad_norm": 1.3622539043426514, "learning_rate": 1.3769604695955243e-05, "loss": 0.3525, "step": 21040 }, { "epoch": 4.140542401219542, "grad_norm": 1.0458835363388062, "learning_rate": 1.376654743342811e-05, "loss": 0.3866, "step": 21050 }, { "epoch": 4.142509404735561, "grad_norm": 3.151491641998291, "learning_rate": 1.3763490170900976e-05, "loss": 0.3231, "step": 21060 }, { "epoch": 4.14447640825158, "grad_norm": 1.1052993535995483, "learning_rate": 1.3760432908373844e-05, "loss": 0.3444, "step": 21070 }, { "epoch": 4.146443411767598, "grad_norm": 0.982151210308075, "learning_rate": 1.375737564584671e-05, "loss": 0.3724, "step": 21080 }, { "epoch": 4.148410415283617, "grad_norm": 1.6932227611541748, "learning_rate": 1.3754318383319575e-05, "loss": 0.4338, "step": 21090 }, { "epoch": 4.1503774187996365, "grad_norm": 0.9574220180511475, "learning_rate": 1.3751261120792443e-05, "loss": 0.4565, "step": 21100 }, { "epoch": 4.152344422315655, "grad_norm": 1.4933550357818604, "learning_rate": 1.374820385826531e-05, "loss": 0.4857, "step": 21110 }, { "epoch": 4.154311425831674, "grad_norm": 3.230536699295044, "learning_rate": 1.3745146595738176e-05, "loss": 0.3114, "step": 21120 }, { "epoch": 4.156278429347693, "grad_norm": 0.8398426175117493, "learning_rate": 1.3742089333211044e-05, "loss": 0.5186, "step": 21130 }, { "epoch": 4.158245432863711, "grad_norm": 1.4111764430999756, "learning_rate": 1.3739032070683911e-05, "loss": 0.3243, "step": 21140 }, { "epoch": 4.16021243637973, "grad_norm": 0.9417469501495361, "learning_rate": 1.3735974808156779e-05, "loss": 0.2852, "step": 21150 }, { "epoch": 4.162179439895749, "grad_norm": 0.43298131227493286, "learning_rate": 1.3732917545629644e-05, "loss": 0.313, "step": 21160 }, { "epoch": 4.164146443411767, "grad_norm": 1.1655223369598389, "learning_rate": 1.3729860283102512e-05, "loss": 0.422, "step": 21170 }, { "epoch": 4.1661134469277865, "grad_norm": 3.525562047958374, "learning_rate": 1.372680302057538e-05, "loss": 0.3823, "step": 21180 }, { "epoch": 4.168080450443806, "grad_norm": 1.0082656145095825, "learning_rate": 1.3723745758048245e-05, "loss": 0.3787, "step": 21190 }, { "epoch": 4.170047453959824, "grad_norm": 1.1629912853240967, "learning_rate": 1.372068849552111e-05, "loss": 0.4133, "step": 21200 }, { "epoch": 4.172014457475843, "grad_norm": 0.9328269958496094, "learning_rate": 1.3717631232993978e-05, "loss": 0.4058, "step": 21210 }, { "epoch": 4.173981460991861, "grad_norm": 1.1281689405441284, "learning_rate": 1.3714573970466844e-05, "loss": 0.4844, "step": 21220 }, { "epoch": 4.17594846450788, "grad_norm": 0.7831975221633911, "learning_rate": 1.3711516707939712e-05, "loss": 0.4359, "step": 21230 }, { "epoch": 4.177915468023899, "grad_norm": 1.9914780855178833, "learning_rate": 1.3708459445412579e-05, "loss": 0.3736, "step": 21240 }, { "epoch": 4.179882471539917, "grad_norm": 1.0189566612243652, "learning_rate": 1.3705402182885445e-05, "loss": 0.3837, "step": 21250 }, { "epoch": 4.1818494750559365, "grad_norm": 1.1360441446304321, "learning_rate": 1.3702344920358312e-05, "loss": 0.3335, "step": 21260 }, { "epoch": 4.183816478571956, "grad_norm": 1.208349585533142, "learning_rate": 1.369928765783118e-05, "loss": 0.387, "step": 21270 }, { "epoch": 4.185783482087974, "grad_norm": 1.4240858554840088, "learning_rate": 1.3696230395304047e-05, "loss": 0.4941, "step": 21280 }, { "epoch": 4.187750485603993, "grad_norm": 1.1946526765823364, "learning_rate": 1.3693173132776913e-05, "loss": 0.3432, "step": 21290 }, { "epoch": 4.189717489120012, "grad_norm": 1.1512346267700195, "learning_rate": 1.369011587024978e-05, "loss": 0.3967, "step": 21300 }, { "epoch": 4.19168449263603, "grad_norm": 1.373772144317627, "learning_rate": 1.3687058607722645e-05, "loss": 0.503, "step": 21310 }, { "epoch": 4.193651496152049, "grad_norm": 1.0407246351242065, "learning_rate": 1.3684001345195512e-05, "loss": 0.4068, "step": 21320 }, { "epoch": 4.195618499668068, "grad_norm": 1.2824262380599976, "learning_rate": 1.368094408266838e-05, "loss": 0.3464, "step": 21330 }, { "epoch": 4.1975855031840865, "grad_norm": 1.0151809453964233, "learning_rate": 1.3677886820141247e-05, "loss": 0.4902, "step": 21340 }, { "epoch": 4.199552506700106, "grad_norm": 0.9720301032066345, "learning_rate": 1.3674829557614113e-05, "loss": 0.3562, "step": 21350 }, { "epoch": 4.201519510216125, "grad_norm": 1.015008807182312, "learning_rate": 1.367177229508698e-05, "loss": 0.4697, "step": 21360 }, { "epoch": 4.203486513732143, "grad_norm": 1.029425859451294, "learning_rate": 1.3668715032559848e-05, "loss": 0.4514, "step": 21370 }, { "epoch": 4.205453517248162, "grad_norm": 1.1204736232757568, "learning_rate": 1.3665657770032713e-05, "loss": 0.3916, "step": 21380 }, { "epoch": 4.207420520764181, "grad_norm": 1.102508544921875, "learning_rate": 1.3662600507505581e-05, "loss": 0.4383, "step": 21390 }, { "epoch": 4.209387524280199, "grad_norm": 0.8906152248382568, "learning_rate": 1.3659543244978448e-05, "loss": 0.4674, "step": 21400 }, { "epoch": 4.211354527796218, "grad_norm": 1.9512590169906616, "learning_rate": 1.3656485982451316e-05, "loss": 0.3862, "step": 21410 }, { "epoch": 4.213321531312237, "grad_norm": 1.4732189178466797, "learning_rate": 1.365342871992418e-05, "loss": 0.3456, "step": 21420 }, { "epoch": 4.215288534828256, "grad_norm": 1.1794780492782593, "learning_rate": 1.3650371457397047e-05, "loss": 0.33, "step": 21430 }, { "epoch": 4.217255538344275, "grad_norm": 2.1888999938964844, "learning_rate": 1.3647314194869913e-05, "loss": 0.3866, "step": 21440 }, { "epoch": 4.219222541860294, "grad_norm": 1.7608472108840942, "learning_rate": 1.364425693234278e-05, "loss": 0.3101, "step": 21450 }, { "epoch": 4.221189545376312, "grad_norm": 2.152985095977783, "learning_rate": 1.3641199669815648e-05, "loss": 0.4059, "step": 21460 }, { "epoch": 4.223156548892331, "grad_norm": 1.4136911630630493, "learning_rate": 1.3638142407288516e-05, "loss": 0.4259, "step": 21470 }, { "epoch": 4.22512355240835, "grad_norm": 1.411493182182312, "learning_rate": 1.3635085144761381e-05, "loss": 0.4125, "step": 21480 }, { "epoch": 4.227090555924368, "grad_norm": 1.3379024267196655, "learning_rate": 1.3632027882234249e-05, "loss": 0.4389, "step": 21490 }, { "epoch": 4.229057559440387, "grad_norm": 0.7551414966583252, "learning_rate": 1.3628970619707116e-05, "loss": 0.3998, "step": 21500 }, { "epoch": 4.229057559440387, "eval_loss": 0.16503483057022095, "eval_runtime": 8.8997, "eval_samples_per_second": 5.618, "eval_steps_per_second": 2.809, "step": 21500 }, { "epoch": 4.2310245629564065, "grad_norm": 1.2745224237442017, "learning_rate": 1.3625913357179982e-05, "loss": 0.3906, "step": 21510 }, { "epoch": 4.232991566472425, "grad_norm": 3.2287611961364746, "learning_rate": 1.362285609465285e-05, "loss": 0.4169, "step": 21520 }, { "epoch": 4.234958569988444, "grad_norm": 1.4224720001220703, "learning_rate": 1.3619798832125717e-05, "loss": 0.2925, "step": 21530 }, { "epoch": 4.236925573504463, "grad_norm": 2.0160717964172363, "learning_rate": 1.3616741569598581e-05, "loss": 0.3777, "step": 21540 }, { "epoch": 4.238892577020481, "grad_norm": 0.9775928854942322, "learning_rate": 1.3613684307071449e-05, "loss": 0.3709, "step": 21550 }, { "epoch": 4.2408595805365, "grad_norm": 1.825934886932373, "learning_rate": 1.3610627044544316e-05, "loss": 0.4786, "step": 21560 }, { "epoch": 4.242826584052519, "grad_norm": 3.071707248687744, "learning_rate": 1.3607569782017182e-05, "loss": 0.2793, "step": 21570 }, { "epoch": 4.244793587568537, "grad_norm": 1.1243879795074463, "learning_rate": 1.360451251949005e-05, "loss": 0.249, "step": 21580 }, { "epoch": 4.2467605910845565, "grad_norm": 0.7699998021125793, "learning_rate": 1.3601455256962917e-05, "loss": 0.3444, "step": 21590 }, { "epoch": 4.248727594600576, "grad_norm": 1.3177696466445923, "learning_rate": 1.3598397994435784e-05, "loss": 0.4581, "step": 21600 }, { "epoch": 4.250694598116594, "grad_norm": 1.4589474201202393, "learning_rate": 1.359534073190865e-05, "loss": 0.5666, "step": 21610 }, { "epoch": 4.252661601632613, "grad_norm": 0.6593105792999268, "learning_rate": 1.3592283469381518e-05, "loss": 0.4002, "step": 21620 }, { "epoch": 4.254628605148632, "grad_norm": 2.0840675830841064, "learning_rate": 1.3589226206854385e-05, "loss": 0.4659, "step": 21630 }, { "epoch": 4.25659560866465, "grad_norm": 1.280366063117981, "learning_rate": 1.358616894432725e-05, "loss": 0.3954, "step": 21640 }, { "epoch": 4.258562612180669, "grad_norm": 1.2666908502578735, "learning_rate": 1.3583111681800117e-05, "loss": 0.4636, "step": 21650 }, { "epoch": 4.260529615696688, "grad_norm": 1.0792936086654663, "learning_rate": 1.3580054419272984e-05, "loss": 0.4839, "step": 21660 }, { "epoch": 4.2624966192127065, "grad_norm": 0.5781777501106262, "learning_rate": 1.357699715674585e-05, "loss": 0.3682, "step": 21670 }, { "epoch": 4.264463622728726, "grad_norm": 2.4557440280914307, "learning_rate": 1.3573939894218717e-05, "loss": 0.3156, "step": 21680 }, { "epoch": 4.266430626244745, "grad_norm": 2.1623849868774414, "learning_rate": 1.3570882631691585e-05, "loss": 0.3706, "step": 21690 }, { "epoch": 4.268397629760763, "grad_norm": 1.0489895343780518, "learning_rate": 1.356782536916445e-05, "loss": 0.426, "step": 21700 }, { "epoch": 4.270364633276782, "grad_norm": 0.9245631694793701, "learning_rate": 1.3564768106637318e-05, "loss": 0.3085, "step": 21710 }, { "epoch": 4.272331636792801, "grad_norm": 2.06693696975708, "learning_rate": 1.3561710844110185e-05, "loss": 0.3848, "step": 21720 }, { "epoch": 4.274298640308819, "grad_norm": 1.9542118310928345, "learning_rate": 1.3558653581583053e-05, "loss": 0.4136, "step": 21730 }, { "epoch": 4.276265643824838, "grad_norm": 0.6859905123710632, "learning_rate": 1.3555596319055919e-05, "loss": 0.357, "step": 21740 }, { "epoch": 4.278232647340857, "grad_norm": 1.3181674480438232, "learning_rate": 1.3552539056528786e-05, "loss": 0.3272, "step": 21750 }, { "epoch": 4.280199650856876, "grad_norm": 2.021536350250244, "learning_rate": 1.354948179400165e-05, "loss": 0.424, "step": 21760 }, { "epoch": 4.282166654372895, "grad_norm": 1.2987251281738281, "learning_rate": 1.3546424531474518e-05, "loss": 0.327, "step": 21770 }, { "epoch": 4.284133657888914, "grad_norm": 1.4116606712341309, "learning_rate": 1.3543367268947385e-05, "loss": 0.3835, "step": 21780 }, { "epoch": 4.286100661404932, "grad_norm": 1.1729151010513306, "learning_rate": 1.3540310006420253e-05, "loss": 0.3466, "step": 21790 }, { "epoch": 4.288067664920951, "grad_norm": 1.3461573123931885, "learning_rate": 1.3537252743893118e-05, "loss": 0.3492, "step": 21800 }, { "epoch": 4.29003466843697, "grad_norm": 1.5087522268295288, "learning_rate": 1.3534195481365986e-05, "loss": 0.3127, "step": 21810 }, { "epoch": 4.292001671952988, "grad_norm": 0.7613654136657715, "learning_rate": 1.3531138218838853e-05, "loss": 0.4316, "step": 21820 }, { "epoch": 4.293968675469007, "grad_norm": 2.289320230484009, "learning_rate": 1.352808095631172e-05, "loss": 0.3419, "step": 21830 }, { "epoch": 4.2959356789850265, "grad_norm": 1.3401827812194824, "learning_rate": 1.3525023693784587e-05, "loss": 0.4404, "step": 21840 }, { "epoch": 4.297902682501045, "grad_norm": 1.1575355529785156, "learning_rate": 1.3521966431257454e-05, "loss": 0.3049, "step": 21850 }, { "epoch": 4.299869686017064, "grad_norm": 1.0929820537567139, "learning_rate": 1.3518909168730322e-05, "loss": 0.4202, "step": 21860 }, { "epoch": 4.301836689533083, "grad_norm": 1.3887335062026978, "learning_rate": 1.3515851906203187e-05, "loss": 0.4188, "step": 21870 }, { "epoch": 4.303803693049101, "grad_norm": 4.001399040222168, "learning_rate": 1.3512794643676053e-05, "loss": 0.3887, "step": 21880 }, { "epoch": 4.30577069656512, "grad_norm": 2.886561155319214, "learning_rate": 1.3509737381148919e-05, "loss": 0.4775, "step": 21890 }, { "epoch": 4.307737700081139, "grad_norm": 1.860848307609558, "learning_rate": 1.3506680118621786e-05, "loss": 0.4234, "step": 21900 }, { "epoch": 4.309704703597157, "grad_norm": 3.108816146850586, "learning_rate": 1.3503622856094654e-05, "loss": 0.4309, "step": 21910 }, { "epoch": 4.3116717071131765, "grad_norm": 2.461402654647827, "learning_rate": 1.3500565593567521e-05, "loss": 0.2674, "step": 21920 }, { "epoch": 4.313638710629196, "grad_norm": 0.9968726634979248, "learning_rate": 1.3497508331040387e-05, "loss": 0.5311, "step": 21930 }, { "epoch": 4.315605714145214, "grad_norm": 1.9840035438537598, "learning_rate": 1.3494451068513255e-05, "loss": 0.448, "step": 21940 }, { "epoch": 4.317572717661233, "grad_norm": 1.257011890411377, "learning_rate": 1.3491393805986122e-05, "loss": 0.3892, "step": 21950 }, { "epoch": 4.319539721177252, "grad_norm": 1.2027013301849365, "learning_rate": 1.3488336543458988e-05, "loss": 0.3691, "step": 21960 }, { "epoch": 4.32150672469327, "grad_norm": 1.6953173875808716, "learning_rate": 1.3485279280931855e-05, "loss": 0.306, "step": 21970 }, { "epoch": 4.323473728209289, "grad_norm": 1.4018330574035645, "learning_rate": 1.3482222018404723e-05, "loss": 0.4501, "step": 21980 }, { "epoch": 4.325440731725308, "grad_norm": 1.5370357036590576, "learning_rate": 1.3479164755877587e-05, "loss": 0.3796, "step": 21990 }, { "epoch": 4.3274077352413265, "grad_norm": 1.5908989906311035, "learning_rate": 1.3476107493350454e-05, "loss": 0.3807, "step": 22000 }, { "epoch": 4.3274077352413265, "eval_loss": 0.16674765944480896, "eval_runtime": 8.8966, "eval_samples_per_second": 5.62, "eval_steps_per_second": 2.81, "step": 22000 }, { "epoch": 4.329374738757346, "grad_norm": 1.3100334405899048, "learning_rate": 1.3473050230823322e-05, "loss": 0.2926, "step": 22010 }, { "epoch": 4.331341742273365, "grad_norm": 1.27474844455719, "learning_rate": 1.3469992968296188e-05, "loss": 0.3087, "step": 22020 }, { "epoch": 4.333308745789383, "grad_norm": 2.377283811569214, "learning_rate": 1.3466935705769055e-05, "loss": 0.3764, "step": 22030 }, { "epoch": 4.335275749305402, "grad_norm": 0.9264887571334839, "learning_rate": 1.3463878443241923e-05, "loss": 0.2962, "step": 22040 }, { "epoch": 4.337242752821421, "grad_norm": 1.8624486923217773, "learning_rate": 1.346082118071479e-05, "loss": 0.3453, "step": 22050 }, { "epoch": 4.339209756337439, "grad_norm": 2.4104392528533936, "learning_rate": 1.3457763918187656e-05, "loss": 0.3549, "step": 22060 }, { "epoch": 4.341176759853458, "grad_norm": 1.0291612148284912, "learning_rate": 1.3454706655660523e-05, "loss": 0.3597, "step": 22070 }, { "epoch": 4.343143763369477, "grad_norm": 1.2357598543167114, "learning_rate": 1.345164939313339e-05, "loss": 0.5817, "step": 22080 }, { "epoch": 4.345110766885496, "grad_norm": 1.3580961227416992, "learning_rate": 1.3448592130606257e-05, "loss": 0.2892, "step": 22090 }, { "epoch": 4.347077770401515, "grad_norm": 0.9541229605674744, "learning_rate": 1.3445534868079122e-05, "loss": 0.3649, "step": 22100 }, { "epoch": 4.349044773917534, "grad_norm": 1.903846025466919, "learning_rate": 1.344247760555199e-05, "loss": 0.2888, "step": 22110 }, { "epoch": 4.351011777433552, "grad_norm": 1.6936200857162476, "learning_rate": 1.3439420343024856e-05, "loss": 0.4391, "step": 22120 }, { "epoch": 4.352978780949571, "grad_norm": 1.6819368600845337, "learning_rate": 1.3436363080497723e-05, "loss": 0.2995, "step": 22130 }, { "epoch": 4.35494578446559, "grad_norm": 1.0277513265609741, "learning_rate": 1.343330581797059e-05, "loss": 0.4502, "step": 22140 }, { "epoch": 4.356912787981608, "grad_norm": 1.3117585182189941, "learning_rate": 1.3430248555443456e-05, "loss": 0.4085, "step": 22150 }, { "epoch": 4.358879791497627, "grad_norm": 2.2798779010772705, "learning_rate": 1.3427191292916324e-05, "loss": 0.2803, "step": 22160 }, { "epoch": 4.3608467950136465, "grad_norm": 0.8277081251144409, "learning_rate": 1.3424134030389191e-05, "loss": 0.3755, "step": 22170 }, { "epoch": 4.362813798529665, "grad_norm": 1.4714564085006714, "learning_rate": 1.3421076767862059e-05, "loss": 0.446, "step": 22180 }, { "epoch": 4.364780802045684, "grad_norm": 1.0377657413482666, "learning_rate": 1.3418019505334924e-05, "loss": 0.4361, "step": 22190 }, { "epoch": 4.366747805561703, "grad_norm": 1.56504487991333, "learning_rate": 1.3414962242807792e-05, "loss": 0.4736, "step": 22200 }, { "epoch": 4.368714809077721, "grad_norm": 1.3587467670440674, "learning_rate": 1.341190498028066e-05, "loss": 0.4469, "step": 22210 }, { "epoch": 4.37068181259374, "grad_norm": 2.5480990409851074, "learning_rate": 1.3408847717753524e-05, "loss": 0.395, "step": 22220 }, { "epoch": 4.372648816109759, "grad_norm": 0.9612480401992798, "learning_rate": 1.3405790455226391e-05, "loss": 0.2845, "step": 22230 }, { "epoch": 4.374615819625777, "grad_norm": 3.587682008743286, "learning_rate": 1.3402733192699258e-05, "loss": 0.4049, "step": 22240 }, { "epoch": 4.3765828231417965, "grad_norm": 1.1484252214431763, "learning_rate": 1.3399675930172124e-05, "loss": 0.4146, "step": 22250 }, { "epoch": 4.378549826657816, "grad_norm": 1.5784882307052612, "learning_rate": 1.3396618667644992e-05, "loss": 0.3146, "step": 22260 }, { "epoch": 4.380516830173834, "grad_norm": 0.8098098039627075, "learning_rate": 1.339356140511786e-05, "loss": 0.3316, "step": 22270 }, { "epoch": 4.382483833689853, "grad_norm": 1.1505993604660034, "learning_rate": 1.3390504142590725e-05, "loss": 0.3938, "step": 22280 }, { "epoch": 4.384450837205872, "grad_norm": 1.157180905342102, "learning_rate": 1.3387446880063592e-05, "loss": 0.3029, "step": 22290 }, { "epoch": 4.38641784072189, "grad_norm": 0.7741907835006714, "learning_rate": 1.338438961753646e-05, "loss": 0.4229, "step": 22300 }, { "epoch": 4.388384844237909, "grad_norm": 2.2423791885375977, "learning_rate": 1.3381332355009327e-05, "loss": 0.3072, "step": 22310 }, { "epoch": 4.390351847753928, "grad_norm": 1.6292169094085693, "learning_rate": 1.3378275092482193e-05, "loss": 0.4627, "step": 22320 }, { "epoch": 4.3923188512699465, "grad_norm": 1.3712037801742554, "learning_rate": 1.3375217829955059e-05, "loss": 0.3798, "step": 22330 }, { "epoch": 4.394285854785966, "grad_norm": 1.2356741428375244, "learning_rate": 1.3372160567427925e-05, "loss": 0.4468, "step": 22340 }, { "epoch": 4.396252858301985, "grad_norm": 1.029561996459961, "learning_rate": 1.3369103304900792e-05, "loss": 0.3884, "step": 22350 }, { "epoch": 4.398219861818003, "grad_norm": 1.0780653953552246, "learning_rate": 1.336604604237366e-05, "loss": 0.3785, "step": 22360 }, { "epoch": 4.400186865334022, "grad_norm": 1.105497121810913, "learning_rate": 1.3362988779846527e-05, "loss": 0.373, "step": 22370 }, { "epoch": 4.402153868850041, "grad_norm": 1.8972887992858887, "learning_rate": 1.3359931517319393e-05, "loss": 0.3213, "step": 22380 }, { "epoch": 4.404120872366059, "grad_norm": 1.4248191118240356, "learning_rate": 1.335687425479226e-05, "loss": 0.2402, "step": 22390 }, { "epoch": 4.406087875882078, "grad_norm": 2.0353078842163086, "learning_rate": 1.3353816992265128e-05, "loss": 0.331, "step": 22400 }, { "epoch": 4.408054879398097, "grad_norm": 1.0217580795288086, "learning_rate": 1.3350759729737994e-05, "loss": 0.4146, "step": 22410 }, { "epoch": 4.410021882914116, "grad_norm": 3.259878635406494, "learning_rate": 1.3347702467210861e-05, "loss": 0.435, "step": 22420 }, { "epoch": 4.411988886430135, "grad_norm": 2.661207437515259, "learning_rate": 1.3344645204683729e-05, "loss": 0.3198, "step": 22430 }, { "epoch": 4.413955889946154, "grad_norm": 1.1849759817123413, "learning_rate": 1.3341587942156593e-05, "loss": 0.5211, "step": 22440 }, { "epoch": 4.415922893462172, "grad_norm": 1.2727833986282349, "learning_rate": 1.333853067962946e-05, "loss": 0.3356, "step": 22450 }, { "epoch": 4.417889896978191, "grad_norm": 1.231123685836792, "learning_rate": 1.3335473417102328e-05, "loss": 0.367, "step": 22460 }, { "epoch": 4.41985690049421, "grad_norm": 2.1626877784729004, "learning_rate": 1.3332416154575193e-05, "loss": 0.438, "step": 22470 }, { "epoch": 4.421823904010228, "grad_norm": 1.1750102043151855, "learning_rate": 1.332935889204806e-05, "loss": 0.382, "step": 22480 }, { "epoch": 4.423790907526247, "grad_norm": 2.559382438659668, "learning_rate": 1.3326301629520928e-05, "loss": 0.3909, "step": 22490 }, { "epoch": 4.4257579110422665, "grad_norm": 1.5452768802642822, "learning_rate": 1.3323244366993796e-05, "loss": 0.4676, "step": 22500 }, { "epoch": 4.4257579110422665, "eval_loss": 0.16149091720581055, "eval_runtime": 8.8776, "eval_samples_per_second": 5.632, "eval_steps_per_second": 2.816, "step": 22500 }, { "epoch": 4.427724914558285, "grad_norm": 1.9061115980148315, "learning_rate": 1.3320187104466662e-05, "loss": 0.4678, "step": 22510 }, { "epoch": 4.429691918074304, "grad_norm": 2.9274566173553467, "learning_rate": 1.3317129841939529e-05, "loss": 0.3, "step": 22520 }, { "epoch": 4.431658921590322, "grad_norm": 1.1326109170913696, "learning_rate": 1.3314072579412397e-05, "loss": 0.4364, "step": 22530 }, { "epoch": 4.433625925106341, "grad_norm": 0.9025644659996033, "learning_rate": 1.3311015316885262e-05, "loss": 0.3303, "step": 22540 }, { "epoch": 4.43559292862236, "grad_norm": 1.5909408330917358, "learning_rate": 1.330795805435813e-05, "loss": 0.3363, "step": 22550 }, { "epoch": 4.437559932138378, "grad_norm": 0.7686471939086914, "learning_rate": 1.3304900791830996e-05, "loss": 0.4155, "step": 22560 }, { "epoch": 4.439526935654397, "grad_norm": 0.9131650328636169, "learning_rate": 1.3301843529303861e-05, "loss": 0.483, "step": 22570 }, { "epoch": 4.4414939391704165, "grad_norm": 0.9710274338722229, "learning_rate": 1.3298786266776729e-05, "loss": 0.3863, "step": 22580 }, { "epoch": 4.443460942686435, "grad_norm": 2.079197645187378, "learning_rate": 1.3295729004249596e-05, "loss": 0.4701, "step": 22590 }, { "epoch": 4.445427946202454, "grad_norm": 1.4030396938323975, "learning_rate": 1.3292671741722462e-05, "loss": 0.413, "step": 22600 }, { "epoch": 4.447394949718473, "grad_norm": 1.82746160030365, "learning_rate": 1.328961447919533e-05, "loss": 0.5094, "step": 22610 }, { "epoch": 4.449361953234491, "grad_norm": 1.0365689992904663, "learning_rate": 1.3286557216668197e-05, "loss": 0.3536, "step": 22620 }, { "epoch": 4.45132895675051, "grad_norm": 1.287792682647705, "learning_rate": 1.3283499954141064e-05, "loss": 0.3274, "step": 22630 }, { "epoch": 4.453295960266529, "grad_norm": 2.0417988300323486, "learning_rate": 1.328044269161393e-05, "loss": 0.432, "step": 22640 }, { "epoch": 4.455262963782547, "grad_norm": 1.6449002027511597, "learning_rate": 1.3277385429086798e-05, "loss": 0.3048, "step": 22650 }, { "epoch": 4.4572299672985665, "grad_norm": 1.3553321361541748, "learning_rate": 1.3274328166559665e-05, "loss": 0.4551, "step": 22660 }, { "epoch": 4.4591969708145855, "grad_norm": 0.7964408993721008, "learning_rate": 1.327127090403253e-05, "loss": 0.4138, "step": 22670 }, { "epoch": 4.461163974330604, "grad_norm": 2.022167682647705, "learning_rate": 1.3268213641505397e-05, "loss": 0.3209, "step": 22680 }, { "epoch": 4.463130977846623, "grad_norm": 1.58821702003479, "learning_rate": 1.3265156378978264e-05, "loss": 0.4652, "step": 22690 }, { "epoch": 4.465097981362642, "grad_norm": 2.6645796298980713, "learning_rate": 1.326209911645113e-05, "loss": 0.4101, "step": 22700 }, { "epoch": 4.46706498487866, "grad_norm": 0.9394934177398682, "learning_rate": 1.3259041853923997e-05, "loss": 0.3394, "step": 22710 }, { "epoch": 4.469031988394679, "grad_norm": 1.1719261407852173, "learning_rate": 1.3255984591396865e-05, "loss": 0.3804, "step": 22720 }, { "epoch": 4.470998991910698, "grad_norm": 1.7606682777404785, "learning_rate": 1.325292732886973e-05, "loss": 0.4182, "step": 22730 }, { "epoch": 4.4729659954267165, "grad_norm": 1.0615359544754028, "learning_rate": 1.3249870066342598e-05, "loss": 0.3121, "step": 22740 }, { "epoch": 4.4749329989427356, "grad_norm": 1.0118271112442017, "learning_rate": 1.3246812803815466e-05, "loss": 0.4423, "step": 22750 }, { "epoch": 4.476900002458755, "grad_norm": 1.467221736907959, "learning_rate": 1.3243755541288333e-05, "loss": 0.4035, "step": 22760 }, { "epoch": 4.478867005974773, "grad_norm": 0.9635478854179382, "learning_rate": 1.3240698278761199e-05, "loss": 0.4424, "step": 22770 }, { "epoch": 4.480834009490792, "grad_norm": 2.118645429611206, "learning_rate": 1.3237641016234065e-05, "loss": 0.467, "step": 22780 }, { "epoch": 4.482801013006811, "grad_norm": 1.9423701763153076, "learning_rate": 1.323458375370693e-05, "loss": 0.3502, "step": 22790 }, { "epoch": 4.484768016522829, "grad_norm": 2.8893489837646484, "learning_rate": 1.3231526491179798e-05, "loss": 0.376, "step": 22800 }, { "epoch": 4.486735020038848, "grad_norm": 2.3034305572509766, "learning_rate": 1.3228469228652665e-05, "loss": 0.3, "step": 22810 }, { "epoch": 4.488702023554867, "grad_norm": 2.976353168487549, "learning_rate": 1.3225411966125533e-05, "loss": 0.3781, "step": 22820 }, { "epoch": 4.4906690270708856, "grad_norm": 1.0142730474472046, "learning_rate": 1.3222354703598399e-05, "loss": 0.4124, "step": 22830 }, { "epoch": 4.492636030586905, "grad_norm": 2.81199312210083, "learning_rate": 1.3219297441071266e-05, "loss": 0.3022, "step": 22840 }, { "epoch": 4.494603034102924, "grad_norm": 1.7177281379699707, "learning_rate": 1.3216240178544134e-05, "loss": 0.2978, "step": 22850 }, { "epoch": 4.496570037618942, "grad_norm": 1.6933962106704712, "learning_rate": 1.3213182916017e-05, "loss": 0.4975, "step": 22860 }, { "epoch": 4.498537041134961, "grad_norm": 1.574341893196106, "learning_rate": 1.3210125653489867e-05, "loss": 0.3842, "step": 22870 }, { "epoch": 4.50050404465098, "grad_norm": 1.6971651315689087, "learning_rate": 1.3207068390962734e-05, "loss": 0.3666, "step": 22880 }, { "epoch": 4.502471048166998, "grad_norm": 1.8739854097366333, "learning_rate": 1.3204011128435598e-05, "loss": 0.4354, "step": 22890 }, { "epoch": 4.504438051683017, "grad_norm": 1.7573387622833252, "learning_rate": 1.3200953865908466e-05, "loss": 0.3319, "step": 22900 }, { "epoch": 4.5064050551990364, "grad_norm": 1.734623670578003, "learning_rate": 1.3197896603381333e-05, "loss": 0.3508, "step": 22910 }, { "epoch": 4.508372058715055, "grad_norm": 1.4408247470855713, "learning_rate": 1.3194839340854199e-05, "loss": 0.398, "step": 22920 }, { "epoch": 4.510339062231074, "grad_norm": 1.2463972568511963, "learning_rate": 1.3191782078327067e-05, "loss": 0.4155, "step": 22930 }, { "epoch": 4.512306065747093, "grad_norm": 1.2701466083526611, "learning_rate": 1.3188724815799934e-05, "loss": 0.3238, "step": 22940 }, { "epoch": 4.514273069263111, "grad_norm": 2.2108230590820312, "learning_rate": 1.3185667553272802e-05, "loss": 0.3183, "step": 22950 }, { "epoch": 4.51624007277913, "grad_norm": 1.100644588470459, "learning_rate": 1.3182610290745667e-05, "loss": 0.4386, "step": 22960 }, { "epoch": 4.518207076295149, "grad_norm": 1.379320740699768, "learning_rate": 1.3179553028218535e-05, "loss": 0.3571, "step": 22970 }, { "epoch": 4.520174079811167, "grad_norm": 0.7771784663200378, "learning_rate": 1.3176495765691402e-05, "loss": 0.5327, "step": 22980 }, { "epoch": 4.5221410833271865, "grad_norm": 1.1977689266204834, "learning_rate": 1.3173438503164268e-05, "loss": 0.3398, "step": 22990 }, { "epoch": 4.5241080868432055, "grad_norm": 0.870847225189209, "learning_rate": 1.3170381240637135e-05, "loss": 0.4439, "step": 23000 }, { "epoch": 4.5241080868432055, "eval_loss": 0.17048148810863495, "eval_runtime": 8.9021, "eval_samples_per_second": 5.617, "eval_steps_per_second": 2.808, "step": 23000 }, { "epoch": 4.526075090359224, "grad_norm": 1.433722734451294, "learning_rate": 1.316732397811e-05, "loss": 0.4748, "step": 23010 }, { "epoch": 4.528042093875243, "grad_norm": 1.5698235034942627, "learning_rate": 1.3164266715582867e-05, "loss": 0.3629, "step": 23020 }, { "epoch": 4.530009097391262, "grad_norm": 1.342296838760376, "learning_rate": 1.3161209453055735e-05, "loss": 0.4292, "step": 23030 }, { "epoch": 4.53197610090728, "grad_norm": 1.4127466678619385, "learning_rate": 1.3158152190528602e-05, "loss": 0.3382, "step": 23040 }, { "epoch": 4.533943104423299, "grad_norm": 0.9064677357673645, "learning_rate": 1.3155094928001468e-05, "loss": 0.3593, "step": 23050 }, { "epoch": 4.535910107939318, "grad_norm": 1.4167490005493164, "learning_rate": 1.3152037665474335e-05, "loss": 0.3732, "step": 23060 }, { "epoch": 4.5378771114553365, "grad_norm": 1.2913470268249512, "learning_rate": 1.3148980402947203e-05, "loss": 0.3262, "step": 23070 }, { "epoch": 4.5398441149713555, "grad_norm": 1.5516128540039062, "learning_rate": 1.3145923140420068e-05, "loss": 0.381, "step": 23080 }, { "epoch": 4.541811118487375, "grad_norm": 1.0643260478973389, "learning_rate": 1.3142865877892936e-05, "loss": 0.5163, "step": 23090 }, { "epoch": 4.543778122003393, "grad_norm": 1.4565191268920898, "learning_rate": 1.3139808615365803e-05, "loss": 0.3783, "step": 23100 }, { "epoch": 4.545745125519412, "grad_norm": 1.2845790386199951, "learning_rate": 1.3136751352838671e-05, "loss": 0.2615, "step": 23110 }, { "epoch": 4.547712129035431, "grad_norm": 1.2871668338775635, "learning_rate": 1.3133694090311535e-05, "loss": 0.3775, "step": 23120 }, { "epoch": 4.549679132551449, "grad_norm": 1.8691515922546387, "learning_rate": 1.3130636827784402e-05, "loss": 0.3405, "step": 23130 }, { "epoch": 4.551646136067468, "grad_norm": 0.7955536842346191, "learning_rate": 1.3127579565257268e-05, "loss": 0.3757, "step": 23140 }, { "epoch": 4.553613139583487, "grad_norm": 1.201189398765564, "learning_rate": 1.3124522302730136e-05, "loss": 0.4091, "step": 23150 }, { "epoch": 4.5555801430995055, "grad_norm": 2.295210361480713, "learning_rate": 1.3121465040203003e-05, "loss": 0.3536, "step": 23160 }, { "epoch": 4.557547146615525, "grad_norm": 3.4921929836273193, "learning_rate": 1.311840777767587e-05, "loss": 0.3792, "step": 23170 }, { "epoch": 4.559514150131544, "grad_norm": 1.5654789209365845, "learning_rate": 1.3115350515148736e-05, "loss": 0.407, "step": 23180 }, { "epoch": 4.561481153647562, "grad_norm": 1.4418089389801025, "learning_rate": 1.3112293252621604e-05, "loss": 0.3842, "step": 23190 }, { "epoch": 4.563448157163581, "grad_norm": 1.6266974210739136, "learning_rate": 1.3109235990094471e-05, "loss": 0.3039, "step": 23200 }, { "epoch": 4.5654151606796, "grad_norm": 2.377856731414795, "learning_rate": 1.3106178727567337e-05, "loss": 0.4083, "step": 23210 }, { "epoch": 4.567382164195618, "grad_norm": 0.9909720420837402, "learning_rate": 1.3103121465040205e-05, "loss": 0.3155, "step": 23220 }, { "epoch": 4.569349167711637, "grad_norm": 1.2801272869110107, "learning_rate": 1.310006420251307e-05, "loss": 0.4249, "step": 23230 }, { "epoch": 4.571316171227656, "grad_norm": 1.7023869752883911, "learning_rate": 1.3097006939985936e-05, "loss": 0.2412, "step": 23240 }, { "epoch": 4.573283174743675, "grad_norm": 1.0783114433288574, "learning_rate": 1.3093949677458804e-05, "loss": 0.3342, "step": 23250 }, { "epoch": 4.575250178259694, "grad_norm": 2.0209755897521973, "learning_rate": 1.3090892414931671e-05, "loss": 0.3702, "step": 23260 }, { "epoch": 4.577217181775713, "grad_norm": 0.7944973111152649, "learning_rate": 1.3087835152404537e-05, "loss": 0.3751, "step": 23270 }, { "epoch": 4.579184185291731, "grad_norm": 1.268553614616394, "learning_rate": 1.3084777889877404e-05, "loss": 0.3624, "step": 23280 }, { "epoch": 4.58115118880775, "grad_norm": 0.9479203224182129, "learning_rate": 1.3081720627350272e-05, "loss": 0.3788, "step": 23290 }, { "epoch": 4.583118192323769, "grad_norm": 1.9021347761154175, "learning_rate": 1.307866336482314e-05, "loss": 0.4087, "step": 23300 }, { "epoch": 4.585085195839787, "grad_norm": 0.9418231844902039, "learning_rate": 1.3075606102296005e-05, "loss": 0.2752, "step": 23310 }, { "epoch": 4.587052199355806, "grad_norm": 1.3885140419006348, "learning_rate": 1.3072548839768873e-05, "loss": 0.3861, "step": 23320 }, { "epoch": 4.5890192028718255, "grad_norm": 1.2173035144805908, "learning_rate": 1.306949157724174e-05, "loss": 0.5022, "step": 23330 }, { "epoch": 4.590986206387844, "grad_norm": 0.7699891328811646, "learning_rate": 1.3066434314714606e-05, "loss": 0.3706, "step": 23340 }, { "epoch": 4.592953209903863, "grad_norm": 1.2280246019363403, "learning_rate": 1.3063377052187472e-05, "loss": 0.4889, "step": 23350 }, { "epoch": 4.594920213419881, "grad_norm": 3.758246660232544, "learning_rate": 1.3060319789660339e-05, "loss": 0.3761, "step": 23360 }, { "epoch": 4.5968872169359, "grad_norm": 1.604286551475525, "learning_rate": 1.3057262527133205e-05, "loss": 0.3536, "step": 23370 }, { "epoch": 4.598854220451919, "grad_norm": 1.3617031574249268, "learning_rate": 1.3054205264606072e-05, "loss": 0.3312, "step": 23380 }, { "epoch": 4.600821223967937, "grad_norm": 1.016905665397644, "learning_rate": 1.305114800207894e-05, "loss": 0.3307, "step": 23390 }, { "epoch": 4.602788227483956, "grad_norm": 0.9720826148986816, "learning_rate": 1.3048090739551806e-05, "loss": 0.4432, "step": 23400 }, { "epoch": 4.6047552309999755, "grad_norm": 1.73250150680542, "learning_rate": 1.3045033477024673e-05, "loss": 0.3725, "step": 23410 }, { "epoch": 4.606722234515994, "grad_norm": 1.7776602506637573, "learning_rate": 1.304197621449754e-05, "loss": 0.4515, "step": 23420 }, { "epoch": 4.608689238032013, "grad_norm": 1.3988317251205444, "learning_rate": 1.3038918951970408e-05, "loss": 0.357, "step": 23430 }, { "epoch": 4.610656241548032, "grad_norm": 1.8836336135864258, "learning_rate": 1.3035861689443274e-05, "loss": 0.3908, "step": 23440 }, { "epoch": 4.61262324506405, "grad_norm": 4.226807117462158, "learning_rate": 1.3032804426916141e-05, "loss": 0.3504, "step": 23450 }, { "epoch": 4.614590248580069, "grad_norm": 1.609248399734497, "learning_rate": 1.3029747164389005e-05, "loss": 0.4287, "step": 23460 }, { "epoch": 4.616557252096088, "grad_norm": 1.37553071975708, "learning_rate": 1.3026689901861873e-05, "loss": 0.3076, "step": 23470 }, { "epoch": 4.618524255612106, "grad_norm": 1.6155554056167603, "learning_rate": 1.302363263933474e-05, "loss": 0.398, "step": 23480 }, { "epoch": 4.6204912591281255, "grad_norm": 1.0259311199188232, "learning_rate": 1.3020575376807608e-05, "loss": 0.4076, "step": 23490 }, { "epoch": 4.622458262644145, "grad_norm": 1.4233862161636353, "learning_rate": 1.3017518114280474e-05, "loss": 0.2926, "step": 23500 }, { "epoch": 4.622458262644145, "eval_loss": 0.15736329555511475, "eval_runtime": 8.8744, "eval_samples_per_second": 5.634, "eval_steps_per_second": 2.817, "step": 23500 }, { "epoch": 4.624425266160163, "grad_norm": 1.5899595022201538, "learning_rate": 1.3014460851753341e-05, "loss": 0.4433, "step": 23510 }, { "epoch": 4.626392269676182, "grad_norm": 1.366363286972046, "learning_rate": 1.3011403589226208e-05, "loss": 0.3417, "step": 23520 }, { "epoch": 4.628359273192201, "grad_norm": 1.814328908920288, "learning_rate": 1.3008346326699074e-05, "loss": 0.3231, "step": 23530 }, { "epoch": 4.630326276708219, "grad_norm": 1.5949935913085938, "learning_rate": 1.3005289064171942e-05, "loss": 0.3993, "step": 23540 }, { "epoch": 4.632293280224238, "grad_norm": 0.9953024387359619, "learning_rate": 1.300223180164481e-05, "loss": 0.2743, "step": 23550 }, { "epoch": 4.634260283740257, "grad_norm": 1.2114768028259277, "learning_rate": 1.2999174539117677e-05, "loss": 0.5201, "step": 23560 }, { "epoch": 4.6362272872562755, "grad_norm": 1.964851975440979, "learning_rate": 1.299611727659054e-05, "loss": 0.5065, "step": 23570 }, { "epoch": 4.638194290772295, "grad_norm": 1.2670104503631592, "learning_rate": 1.2993060014063408e-05, "loss": 0.4855, "step": 23580 }, { "epoch": 4.640161294288314, "grad_norm": 0.9536296129226685, "learning_rate": 1.2990002751536274e-05, "loss": 0.4132, "step": 23590 }, { "epoch": 4.642128297804332, "grad_norm": 2.4617717266082764, "learning_rate": 1.2986945489009141e-05, "loss": 0.3878, "step": 23600 }, { "epoch": 4.644095301320351, "grad_norm": 2.0903079509735107, "learning_rate": 1.2983888226482009e-05, "loss": 0.3278, "step": 23610 }, { "epoch": 4.64606230483637, "grad_norm": 1.6212421655654907, "learning_rate": 1.2980830963954876e-05, "loss": 0.4473, "step": 23620 }, { "epoch": 4.648029308352388, "grad_norm": 1.0318902730941772, "learning_rate": 1.2977773701427742e-05, "loss": 0.2512, "step": 23630 }, { "epoch": 4.649996311868407, "grad_norm": 0.8654146194458008, "learning_rate": 1.297471643890061e-05, "loss": 0.4187, "step": 23640 }, { "epoch": 4.651963315384426, "grad_norm": 1.1589908599853516, "learning_rate": 1.2971659176373477e-05, "loss": 0.2749, "step": 23650 }, { "epoch": 4.653930318900445, "grad_norm": 0.9683002829551697, "learning_rate": 1.2968601913846343e-05, "loss": 0.3428, "step": 23660 }, { "epoch": 4.655897322416464, "grad_norm": 2.9495432376861572, "learning_rate": 1.296554465131921e-05, "loss": 0.3887, "step": 23670 }, { "epoch": 4.657864325932483, "grad_norm": 0.9485315084457397, "learning_rate": 1.2962487388792078e-05, "loss": 0.2573, "step": 23680 }, { "epoch": 4.659831329448501, "grad_norm": 0.9480960965156555, "learning_rate": 1.2959430126264942e-05, "loss": 0.3191, "step": 23690 }, { "epoch": 4.66179833296452, "grad_norm": 2.1239635944366455, "learning_rate": 1.295637286373781e-05, "loss": 0.4233, "step": 23700 }, { "epoch": 4.663765336480539, "grad_norm": 1.388307809829712, "learning_rate": 1.2953315601210677e-05, "loss": 0.3491, "step": 23710 }, { "epoch": 4.665732339996557, "grad_norm": 1.6451371908187866, "learning_rate": 1.2950258338683543e-05, "loss": 0.4707, "step": 23720 }, { "epoch": 4.667699343512576, "grad_norm": 1.3180400133132935, "learning_rate": 1.294720107615641e-05, "loss": 0.2803, "step": 23730 }, { "epoch": 4.6696663470285955, "grad_norm": 2.698408365249634, "learning_rate": 1.2944143813629278e-05, "loss": 0.3303, "step": 23740 }, { "epoch": 4.671633350544614, "grad_norm": 1.5707173347473145, "learning_rate": 1.2941086551102145e-05, "loss": 0.3554, "step": 23750 }, { "epoch": 4.673600354060633, "grad_norm": 2.129290819168091, "learning_rate": 1.293802928857501e-05, "loss": 0.4207, "step": 23760 }, { "epoch": 4.675567357576652, "grad_norm": 1.163686752319336, "learning_rate": 1.2934972026047878e-05, "loss": 0.3788, "step": 23770 }, { "epoch": 4.67753436109267, "grad_norm": 1.2933921813964844, "learning_rate": 1.2931914763520746e-05, "loss": 0.396, "step": 23780 }, { "epoch": 4.679501364608689, "grad_norm": 0.8011611700057983, "learning_rate": 1.2928857500993612e-05, "loss": 0.3598, "step": 23790 }, { "epoch": 4.681468368124708, "grad_norm": 0.9323627352714539, "learning_rate": 1.2925800238466477e-05, "loss": 0.3895, "step": 23800 }, { "epoch": 4.683435371640726, "grad_norm": 1.582582950592041, "learning_rate": 1.2922742975939345e-05, "loss": 0.4277, "step": 23810 }, { "epoch": 4.6854023751567455, "grad_norm": 1.743823528289795, "learning_rate": 1.291968571341221e-05, "loss": 0.3476, "step": 23820 }, { "epoch": 4.687369378672765, "grad_norm": 1.2965720891952515, "learning_rate": 1.2916628450885078e-05, "loss": 0.36, "step": 23830 }, { "epoch": 4.689336382188783, "grad_norm": 1.0615127086639404, "learning_rate": 1.2913571188357946e-05, "loss": 0.4398, "step": 23840 }, { "epoch": 4.691303385704802, "grad_norm": 1.2573604583740234, "learning_rate": 1.2910513925830811e-05, "loss": 0.4662, "step": 23850 }, { "epoch": 4.693270389220821, "grad_norm": 1.6875718832015991, "learning_rate": 1.2907456663303679e-05, "loss": 0.2507, "step": 23860 }, { "epoch": 4.695237392736839, "grad_norm": 1.415879249572754, "learning_rate": 1.2904399400776546e-05, "loss": 0.233, "step": 23870 }, { "epoch": 4.697204396252858, "grad_norm": 2.006418466567993, "learning_rate": 1.2901342138249414e-05, "loss": 0.3111, "step": 23880 }, { "epoch": 4.699171399768877, "grad_norm": 1.177172064781189, "learning_rate": 1.289828487572228e-05, "loss": 0.4526, "step": 23890 }, { "epoch": 4.7011384032848955, "grad_norm": 1.4981369972229004, "learning_rate": 1.2895227613195147e-05, "loss": 0.3032, "step": 23900 }, { "epoch": 4.703105406800915, "grad_norm": 1.1772596836090088, "learning_rate": 1.2892170350668011e-05, "loss": 0.2929, "step": 23910 }, { "epoch": 4.705072410316934, "grad_norm": 1.217176914215088, "learning_rate": 1.2889113088140879e-05, "loss": 0.3669, "step": 23920 }, { "epoch": 4.707039413832952, "grad_norm": 1.7104442119598389, "learning_rate": 1.2886055825613746e-05, "loss": 0.4283, "step": 23930 }, { "epoch": 4.709006417348971, "grad_norm": 1.0308455228805542, "learning_rate": 1.2882998563086613e-05, "loss": 0.3678, "step": 23940 }, { "epoch": 4.71097342086499, "grad_norm": 1.3772929906845093, "learning_rate": 1.287994130055948e-05, "loss": 0.2768, "step": 23950 }, { "epoch": 4.712940424381008, "grad_norm": 1.864748477935791, "learning_rate": 1.2876884038032347e-05, "loss": 0.446, "step": 23960 }, { "epoch": 4.714907427897027, "grad_norm": 1.0032296180725098, "learning_rate": 1.2873826775505214e-05, "loss": 0.4478, "step": 23970 }, { "epoch": 4.716874431413046, "grad_norm": 3.7188913822174072, "learning_rate": 1.287076951297808e-05, "loss": 0.3125, "step": 23980 }, { "epoch": 4.718841434929065, "grad_norm": 0.7291481494903564, "learning_rate": 1.2867712250450947e-05, "loss": 0.4573, "step": 23990 }, { "epoch": 4.720808438445084, "grad_norm": 3.503469944000244, "learning_rate": 1.2864654987923815e-05, "loss": 0.5222, "step": 24000 }, { "epoch": 4.720808438445084, "eval_loss": 0.15961362421512604, "eval_runtime": 8.865, "eval_samples_per_second": 5.64, "eval_steps_per_second": 2.82, "step": 24000 }, { "epoch": 4.722775441961103, "grad_norm": 1.0029255151748657, "learning_rate": 1.2861597725396682e-05, "loss": 0.4331, "step": 24010 }, { "epoch": 4.724742445477121, "grad_norm": 1.118600606918335, "learning_rate": 1.2858540462869548e-05, "loss": 0.4526, "step": 24020 }, { "epoch": 4.72670944899314, "grad_norm": 1.1678026914596558, "learning_rate": 1.2855483200342414e-05, "loss": 0.2711, "step": 24030 }, { "epoch": 4.728676452509159, "grad_norm": 1.6292004585266113, "learning_rate": 1.285242593781528e-05, "loss": 0.3166, "step": 24040 }, { "epoch": 4.730643456025177, "grad_norm": 0.6910290122032166, "learning_rate": 1.2849368675288147e-05, "loss": 0.4412, "step": 24050 }, { "epoch": 4.732610459541196, "grad_norm": 1.3265618085861206, "learning_rate": 1.2846311412761015e-05, "loss": 0.3109, "step": 24060 }, { "epoch": 4.7345774630572155, "grad_norm": 1.3849608898162842, "learning_rate": 1.2843254150233882e-05, "loss": 0.368, "step": 24070 }, { "epoch": 4.736544466573234, "grad_norm": 0.561913013458252, "learning_rate": 1.2840196887706748e-05, "loss": 0.3611, "step": 24080 }, { "epoch": 4.738511470089253, "grad_norm": 1.4088836908340454, "learning_rate": 1.2837139625179615e-05, "loss": 0.4978, "step": 24090 }, { "epoch": 4.740478473605272, "grad_norm": 1.5019394159317017, "learning_rate": 1.2834082362652483e-05, "loss": 0.2134, "step": 24100 }, { "epoch": 4.74244547712129, "grad_norm": 1.1798714399337769, "learning_rate": 1.2831025100125349e-05, "loss": 0.529, "step": 24110 }, { "epoch": 4.744412480637309, "grad_norm": 0.8506179451942444, "learning_rate": 1.2827967837598216e-05, "loss": 0.3883, "step": 24120 }, { "epoch": 4.746379484153328, "grad_norm": 1.204187273979187, "learning_rate": 1.2824910575071084e-05, "loss": 0.493, "step": 24130 }, { "epoch": 4.748346487669346, "grad_norm": 1.6845051050186157, "learning_rate": 1.2821853312543948e-05, "loss": 0.4189, "step": 24140 }, { "epoch": 4.7503134911853655, "grad_norm": 2.0980777740478516, "learning_rate": 1.2818796050016815e-05, "loss": 0.4742, "step": 24150 }, { "epoch": 4.752280494701385, "grad_norm": 2.7500083446502686, "learning_rate": 1.2815738787489683e-05, "loss": 0.3244, "step": 24160 }, { "epoch": 4.754247498217403, "grad_norm": 1.2423878908157349, "learning_rate": 1.2812681524962548e-05, "loss": 0.3138, "step": 24170 }, { "epoch": 4.756214501733422, "grad_norm": 1.9693214893341064, "learning_rate": 1.2809624262435416e-05, "loss": 0.4574, "step": 24180 }, { "epoch": 4.758181505249441, "grad_norm": 1.148080587387085, "learning_rate": 1.2806566999908283e-05, "loss": 0.4755, "step": 24190 }, { "epoch": 4.760148508765459, "grad_norm": 1.0471198558807373, "learning_rate": 1.280350973738115e-05, "loss": 0.3744, "step": 24200 }, { "epoch": 4.762115512281478, "grad_norm": 4.0511698722839355, "learning_rate": 1.2800452474854017e-05, "loss": 0.3998, "step": 24210 }, { "epoch": 4.764082515797497, "grad_norm": 0.9432234168052673, "learning_rate": 1.2797395212326884e-05, "loss": 0.2854, "step": 24220 }, { "epoch": 4.7660495193135155, "grad_norm": 1.4072970151901245, "learning_rate": 1.2794337949799752e-05, "loss": 0.3818, "step": 24230 }, { "epoch": 4.768016522829535, "grad_norm": 1.4117906093597412, "learning_rate": 1.2791280687272617e-05, "loss": 0.3295, "step": 24240 }, { "epoch": 4.769983526345554, "grad_norm": 1.4891633987426758, "learning_rate": 1.2788223424745483e-05, "loss": 0.5198, "step": 24250 }, { "epoch": 4.771950529861572, "grad_norm": 1.3312366008758545, "learning_rate": 1.278516616221835e-05, "loss": 0.3685, "step": 24260 }, { "epoch": 4.773917533377591, "grad_norm": 1.0341721773147583, "learning_rate": 1.2782108899691216e-05, "loss": 0.3587, "step": 24270 }, { "epoch": 4.77588453689361, "grad_norm": 1.457330346107483, "learning_rate": 1.2779051637164084e-05, "loss": 0.4167, "step": 24280 }, { "epoch": 4.777851540409628, "grad_norm": 1.134305715560913, "learning_rate": 1.2775994374636951e-05, "loss": 0.2711, "step": 24290 }, { "epoch": 4.779818543925647, "grad_norm": 1.0785402059555054, "learning_rate": 1.2772937112109817e-05, "loss": 0.4465, "step": 24300 }, { "epoch": 4.781785547441666, "grad_norm": 1.2902367115020752, "learning_rate": 1.2769879849582685e-05, "loss": 0.4601, "step": 24310 }, { "epoch": 4.783752550957685, "grad_norm": 1.6840447187423706, "learning_rate": 1.2766822587055552e-05, "loss": 0.3442, "step": 24320 }, { "epoch": 4.785719554473704, "grad_norm": 1.1976608037948608, "learning_rate": 1.276376532452842e-05, "loss": 0.2902, "step": 24330 }, { "epoch": 4.787686557989723, "grad_norm": 2.4353525638580322, "learning_rate": 1.2760708062001285e-05, "loss": 0.3511, "step": 24340 }, { "epoch": 4.789653561505741, "grad_norm": 1.9608737230300903, "learning_rate": 1.2757650799474153e-05, "loss": 0.334, "step": 24350 }, { "epoch": 4.79162056502176, "grad_norm": 1.517730951309204, "learning_rate": 1.275459353694702e-05, "loss": 0.246, "step": 24360 }, { "epoch": 4.793587568537779, "grad_norm": 3.3855648040771484, "learning_rate": 1.2751536274419884e-05, "loss": 0.4411, "step": 24370 }, { "epoch": 4.795554572053797, "grad_norm": 1.0907025337219238, "learning_rate": 1.2748479011892752e-05, "loss": 0.4161, "step": 24380 }, { "epoch": 4.797521575569816, "grad_norm": 2.2040274143218994, "learning_rate": 1.274542174936562e-05, "loss": 0.4809, "step": 24390 }, { "epoch": 4.7994885790858355, "grad_norm": 1.5226056575775146, "learning_rate": 1.2742364486838485e-05, "loss": 0.4644, "step": 24400 }, { "epoch": 4.801455582601854, "grad_norm": 1.3238670825958252, "learning_rate": 1.2739307224311352e-05, "loss": 0.3859, "step": 24410 }, { "epoch": 4.803422586117873, "grad_norm": 1.0836786031723022, "learning_rate": 1.273624996178422e-05, "loss": 0.4176, "step": 24420 }, { "epoch": 4.805389589633892, "grad_norm": 1.9304059743881226, "learning_rate": 1.2733192699257086e-05, "loss": 0.3487, "step": 24430 }, { "epoch": 4.80735659314991, "grad_norm": 3.5189664363861084, "learning_rate": 1.2730135436729953e-05, "loss": 0.4214, "step": 24440 }, { "epoch": 4.809323596665929, "grad_norm": 1.5368452072143555, "learning_rate": 1.272707817420282e-05, "loss": 0.5255, "step": 24450 }, { "epoch": 4.811290600181948, "grad_norm": 2.091585159301758, "learning_rate": 1.2724020911675688e-05, "loss": 0.331, "step": 24460 }, { "epoch": 4.813257603697966, "grad_norm": 2.5375471115112305, "learning_rate": 1.2720963649148554e-05, "loss": 0.3701, "step": 24470 }, { "epoch": 4.8152246072139855, "grad_norm": 2.3465840816497803, "learning_rate": 1.271790638662142e-05, "loss": 0.4327, "step": 24480 }, { "epoch": 4.817191610730005, "grad_norm": 1.6373897790908813, "learning_rate": 1.2714849124094285e-05, "loss": 0.4704, "step": 24490 }, { "epoch": 4.819158614246023, "grad_norm": 1.7311253547668457, "learning_rate": 1.2711791861567153e-05, "loss": 0.4387, "step": 24500 }, { "epoch": 4.819158614246023, "eval_loss": 0.1739385724067688, "eval_runtime": 8.8844, "eval_samples_per_second": 5.628, "eval_steps_per_second": 2.814, "step": 24500 }, { "epoch": 4.821125617762042, "grad_norm": 1.9144916534423828, "learning_rate": 1.270873459904002e-05, "loss": 0.3141, "step": 24510 }, { "epoch": 4.823092621278061, "grad_norm": 1.6824641227722168, "learning_rate": 1.2705677336512888e-05, "loss": 0.4319, "step": 24520 }, { "epoch": 4.825059624794079, "grad_norm": 1.2311413288116455, "learning_rate": 1.2702620073985754e-05, "loss": 0.338, "step": 24530 }, { "epoch": 4.827026628310098, "grad_norm": 1.8054691553115845, "learning_rate": 1.2699562811458621e-05, "loss": 0.4764, "step": 24540 }, { "epoch": 4.828993631826117, "grad_norm": 2.1235504150390625, "learning_rate": 1.2696505548931489e-05, "loss": 0.3935, "step": 24550 }, { "epoch": 4.8309606353421355, "grad_norm": 1.0977386236190796, "learning_rate": 1.2693448286404354e-05, "loss": 0.3601, "step": 24560 }, { "epoch": 4.832927638858155, "grad_norm": 0.9977229833602905, "learning_rate": 1.2690391023877222e-05, "loss": 0.4251, "step": 24570 }, { "epoch": 4.834894642374174, "grad_norm": 1.337618350982666, "learning_rate": 1.268733376135009e-05, "loss": 0.3121, "step": 24580 }, { "epoch": 4.836861645890192, "grad_norm": 1.5812876224517822, "learning_rate": 1.2684276498822953e-05, "loss": 0.3465, "step": 24590 }, { "epoch": 4.838828649406211, "grad_norm": 1.1964857578277588, "learning_rate": 1.2681219236295821e-05, "loss": 0.3805, "step": 24600 }, { "epoch": 4.84079565292223, "grad_norm": 1.448060393333435, "learning_rate": 1.2678161973768688e-05, "loss": 0.3723, "step": 24610 }, { "epoch": 4.842762656438248, "grad_norm": 1.366903305053711, "learning_rate": 1.2675104711241554e-05, "loss": 0.4319, "step": 24620 }, { "epoch": 4.844729659954267, "grad_norm": 1.6274155378341675, "learning_rate": 1.2672047448714422e-05, "loss": 0.371, "step": 24630 }, { "epoch": 4.846696663470286, "grad_norm": 2.308397054672241, "learning_rate": 1.2668990186187289e-05, "loss": 0.4486, "step": 24640 }, { "epoch": 4.848663666986305, "grad_norm": 0.8979236483573914, "learning_rate": 1.2665932923660157e-05, "loss": 0.3777, "step": 24650 }, { "epoch": 4.850630670502324, "grad_norm": 1.4962645769119263, "learning_rate": 1.2662875661133022e-05, "loss": 0.4427, "step": 24660 }, { "epoch": 4.852597674018343, "grad_norm": 1.5937800407409668, "learning_rate": 1.265981839860589e-05, "loss": 0.4127, "step": 24670 }, { "epoch": 4.854564677534361, "grad_norm": 0.9097000360488892, "learning_rate": 1.2656761136078757e-05, "loss": 0.4768, "step": 24680 }, { "epoch": 4.85653168105038, "grad_norm": 1.4313491582870483, "learning_rate": 1.2653703873551623e-05, "loss": 0.5025, "step": 24690 }, { "epoch": 4.858498684566399, "grad_norm": 1.1728724241256714, "learning_rate": 1.265064661102449e-05, "loss": 0.3964, "step": 24700 }, { "epoch": 4.860465688082417, "grad_norm": 2.7858197689056396, "learning_rate": 1.2647589348497356e-05, "loss": 0.395, "step": 24710 }, { "epoch": 4.862432691598436, "grad_norm": 0.7792349457740784, "learning_rate": 1.2644532085970222e-05, "loss": 0.4374, "step": 24720 }, { "epoch": 4.8643996951144555, "grad_norm": 1.8455607891082764, "learning_rate": 1.264147482344309e-05, "loss": 0.4001, "step": 24730 }, { "epoch": 4.866366698630474, "grad_norm": 2.4060239791870117, "learning_rate": 1.2638417560915957e-05, "loss": 0.4162, "step": 24740 }, { "epoch": 4.868333702146493, "grad_norm": 1.1814616918563843, "learning_rate": 1.2635360298388823e-05, "loss": 0.5639, "step": 24750 }, { "epoch": 4.870300705662512, "grad_norm": 1.329742670059204, "learning_rate": 1.263230303586169e-05, "loss": 0.5668, "step": 24760 }, { "epoch": 4.87226770917853, "grad_norm": 1.1229251623153687, "learning_rate": 1.2629245773334558e-05, "loss": 0.5126, "step": 24770 }, { "epoch": 4.874234712694549, "grad_norm": 1.2327066659927368, "learning_rate": 1.2626188510807425e-05, "loss": 0.3026, "step": 24780 }, { "epoch": 4.876201716210568, "grad_norm": 1.2386301755905151, "learning_rate": 1.2623131248280291e-05, "loss": 0.3811, "step": 24790 }, { "epoch": 4.878168719726586, "grad_norm": 1.8087528944015503, "learning_rate": 1.2620073985753158e-05, "loss": 0.3896, "step": 24800 }, { "epoch": 4.8801357232426055, "grad_norm": 1.0072718858718872, "learning_rate": 1.2617016723226026e-05, "loss": 0.2802, "step": 24810 }, { "epoch": 4.8821027267586246, "grad_norm": 1.5886342525482178, "learning_rate": 1.261395946069889e-05, "loss": 0.4388, "step": 24820 }, { "epoch": 4.884069730274643, "grad_norm": 1.1532931327819824, "learning_rate": 1.2610902198171757e-05, "loss": 0.4262, "step": 24830 }, { "epoch": 4.886036733790662, "grad_norm": 1.3830076456069946, "learning_rate": 1.2607844935644625e-05, "loss": 0.3209, "step": 24840 }, { "epoch": 4.888003737306681, "grad_norm": 1.9003983736038208, "learning_rate": 1.260478767311749e-05, "loss": 0.3566, "step": 24850 }, { "epoch": 4.889970740822699, "grad_norm": 1.8158513307571411, "learning_rate": 1.2601730410590358e-05, "loss": 0.3776, "step": 24860 }, { "epoch": 4.891937744338718, "grad_norm": 1.6109884977340698, "learning_rate": 1.2598673148063226e-05, "loss": 0.3846, "step": 24870 }, { "epoch": 4.893904747854737, "grad_norm": 1.3823764324188232, "learning_rate": 1.2595615885536091e-05, "loss": 0.3041, "step": 24880 }, { "epoch": 4.8958717513707555, "grad_norm": 2.2418899536132812, "learning_rate": 1.2592558623008959e-05, "loss": 0.3792, "step": 24890 }, { "epoch": 4.897838754886775, "grad_norm": 0.8362561464309692, "learning_rate": 1.2589501360481826e-05, "loss": 0.3726, "step": 24900 }, { "epoch": 4.899805758402794, "grad_norm": 1.6469014883041382, "learning_rate": 1.2586444097954694e-05, "loss": 0.4788, "step": 24910 }, { "epoch": 4.901772761918812, "grad_norm": 1.3245086669921875, "learning_rate": 1.258338683542756e-05, "loss": 0.4149, "step": 24920 }, { "epoch": 4.903739765434831, "grad_norm": 0.9041739702224731, "learning_rate": 1.2580329572900425e-05, "loss": 0.3088, "step": 24930 }, { "epoch": 4.905706768950849, "grad_norm": 1.0792338848114014, "learning_rate": 1.2577272310373291e-05, "loss": 0.3612, "step": 24940 }, { "epoch": 4.907673772466868, "grad_norm": 2.269197940826416, "learning_rate": 1.2574215047846159e-05, "loss": 0.454, "step": 24950 }, { "epoch": 4.909640775982887, "grad_norm": 1.8159810304641724, "learning_rate": 1.2571157785319026e-05, "loss": 0.3962, "step": 24960 }, { "epoch": 4.9116077794989055, "grad_norm": 1.9534015655517578, "learning_rate": 1.2568100522791894e-05, "loss": 0.3694, "step": 24970 }, { "epoch": 4.913574783014925, "grad_norm": 2.8406848907470703, "learning_rate": 1.256504326026476e-05, "loss": 0.3452, "step": 24980 }, { "epoch": 4.915541786530944, "grad_norm": 3.0141844749450684, "learning_rate": 1.2561985997737627e-05, "loss": 0.4919, "step": 24990 }, { "epoch": 4.917508790046962, "grad_norm": 0.9889708161354065, "learning_rate": 1.2558928735210494e-05, "loss": 0.487, "step": 25000 }, { "epoch": 4.917508790046962, "eval_loss": 0.16260863840579987, "eval_runtime": 8.8961, "eval_samples_per_second": 5.62, "eval_steps_per_second": 2.81, "step": 25000 }, { "epoch": 4.919475793562981, "grad_norm": 0.8428569436073303, "learning_rate": 1.255587147268336e-05, "loss": 0.4327, "step": 25010 }, { "epoch": 4.921442797079, "grad_norm": 1.1603949069976807, "learning_rate": 1.2552814210156228e-05, "loss": 0.3681, "step": 25020 }, { "epoch": 4.923409800595018, "grad_norm": 1.6248499155044556, "learning_rate": 1.2549756947629095e-05, "loss": 0.331, "step": 25030 }, { "epoch": 4.925376804111037, "grad_norm": 1.5447056293487549, "learning_rate": 1.2546699685101963e-05, "loss": 0.4105, "step": 25040 }, { "epoch": 4.927343807627056, "grad_norm": 0.8895068168640137, "learning_rate": 1.2543642422574827e-05, "loss": 0.3633, "step": 25050 }, { "epoch": 4.929310811143075, "grad_norm": 2.2197225093841553, "learning_rate": 1.2540585160047694e-05, "loss": 0.4538, "step": 25060 }, { "epoch": 4.931277814659094, "grad_norm": 0.859494686126709, "learning_rate": 1.253752789752056e-05, "loss": 0.502, "step": 25070 }, { "epoch": 4.933244818175113, "grad_norm": 1.645679235458374, "learning_rate": 1.2534470634993427e-05, "loss": 0.4424, "step": 25080 }, { "epoch": 4.935211821691131, "grad_norm": 2.0506739616394043, "learning_rate": 1.2531413372466295e-05, "loss": 0.4194, "step": 25090 }, { "epoch": 4.93717882520715, "grad_norm": 0.7906273603439331, "learning_rate": 1.2528356109939162e-05, "loss": 0.3206, "step": 25100 }, { "epoch": 4.939145828723169, "grad_norm": 1.0917754173278809, "learning_rate": 1.2525298847412028e-05, "loss": 0.3554, "step": 25110 }, { "epoch": 4.941112832239187, "grad_norm": 2.7056996822357178, "learning_rate": 1.2522241584884896e-05, "loss": 0.3399, "step": 25120 }, { "epoch": 4.943079835755206, "grad_norm": 0.8214200735092163, "learning_rate": 1.2519184322357763e-05, "loss": 0.4405, "step": 25130 }, { "epoch": 4.9450468392712255, "grad_norm": 2.3560619354248047, "learning_rate": 1.2516127059830629e-05, "loss": 0.3374, "step": 25140 }, { "epoch": 4.947013842787244, "grad_norm": 0.8455098271369934, "learning_rate": 1.2513069797303496e-05, "loss": 0.4155, "step": 25150 }, { "epoch": 4.948980846303263, "grad_norm": 2.703481912612915, "learning_rate": 1.2510012534776362e-05, "loss": 0.4248, "step": 25160 }, { "epoch": 4.950947849819282, "grad_norm": 1.9255882501602173, "learning_rate": 1.2506955272249228e-05, "loss": 0.2887, "step": 25170 }, { "epoch": 4.9529148533353, "grad_norm": 0.7499654293060303, "learning_rate": 1.2503898009722095e-05, "loss": 0.4598, "step": 25180 }, { "epoch": 4.954881856851319, "grad_norm": 1.097519874572754, "learning_rate": 1.2500840747194963e-05, "loss": 0.3877, "step": 25190 }, { "epoch": 4.956848860367338, "grad_norm": 1.5604249238967896, "learning_rate": 1.2497783484667829e-05, "loss": 0.4214, "step": 25200 }, { "epoch": 4.958815863883356, "grad_norm": 1.6495370864868164, "learning_rate": 1.2494726222140696e-05, "loss": 0.3071, "step": 25210 }, { "epoch": 4.9607828673993755, "grad_norm": 1.7267638444900513, "learning_rate": 1.2491668959613563e-05, "loss": 0.3833, "step": 25220 }, { "epoch": 4.9627498709153945, "grad_norm": 0.581771731376648, "learning_rate": 1.2488611697086431e-05, "loss": 0.324, "step": 25230 }, { "epoch": 4.964716874431413, "grad_norm": 1.5528723001480103, "learning_rate": 1.2485554434559297e-05, "loss": 0.3987, "step": 25240 }, { "epoch": 4.966683877947432, "grad_norm": 1.1603542566299438, "learning_rate": 1.2482497172032164e-05, "loss": 0.2939, "step": 25250 }, { "epoch": 4.968650881463451, "grad_norm": 0.8113834261894226, "learning_rate": 1.2479439909505032e-05, "loss": 0.3677, "step": 25260 }, { "epoch": 4.970617884979469, "grad_norm": 1.3022969961166382, "learning_rate": 1.2476382646977896e-05, "loss": 0.4211, "step": 25270 }, { "epoch": 4.972584888495488, "grad_norm": 1.1333060264587402, "learning_rate": 1.2473325384450763e-05, "loss": 0.3824, "step": 25280 }, { "epoch": 4.974551892011507, "grad_norm": 1.6616308689117432, "learning_rate": 1.247026812192363e-05, "loss": 0.4234, "step": 25290 }, { "epoch": 4.9765188955275255, "grad_norm": 1.6122316122055054, "learning_rate": 1.2467210859396496e-05, "loss": 0.5025, "step": 25300 }, { "epoch": 4.9784858990435445, "grad_norm": 1.1408931016921997, "learning_rate": 1.2464153596869364e-05, "loss": 0.4242, "step": 25310 }, { "epoch": 4.980452902559564, "grad_norm": 0.966662585735321, "learning_rate": 1.2461096334342231e-05, "loss": 0.4244, "step": 25320 }, { "epoch": 4.982419906075582, "grad_norm": 1.6419168710708618, "learning_rate": 1.2458039071815097e-05, "loss": 0.3898, "step": 25330 }, { "epoch": 4.984386909591601, "grad_norm": 1.3502774238586426, "learning_rate": 1.2454981809287965e-05, "loss": 0.3736, "step": 25340 }, { "epoch": 4.98635391310762, "grad_norm": 1.3421791791915894, "learning_rate": 1.2451924546760832e-05, "loss": 0.4539, "step": 25350 }, { "epoch": 4.988320916623638, "grad_norm": 0.8233484625816345, "learning_rate": 1.24488672842337e-05, "loss": 0.4242, "step": 25360 }, { "epoch": 4.990287920139657, "grad_norm": 1.437248706817627, "learning_rate": 1.2445810021706565e-05, "loss": 0.3697, "step": 25370 }, { "epoch": 4.992254923655676, "grad_norm": 0.8841282725334167, "learning_rate": 1.2442752759179433e-05, "loss": 0.4423, "step": 25380 }, { "epoch": 4.9942219271716946, "grad_norm": 0.6750190258026123, "learning_rate": 1.2439695496652297e-05, "loss": 0.5867, "step": 25390 }, { "epoch": 4.996188930687714, "grad_norm": 1.2948499917984009, "learning_rate": 1.2436638234125164e-05, "loss": 0.4803, "step": 25400 }, { "epoch": 4.998155934203733, "grad_norm": 1.183539867401123, "learning_rate": 1.2433580971598032e-05, "loss": 0.4717, "step": 25410 }, { "epoch": 5.000122937719751, "grad_norm": 1.8746291399002075, "learning_rate": 1.24305237090709e-05, "loss": 0.4304, "step": 25420 }, { "epoch": 5.00208994123577, "grad_norm": 1.3940856456756592, "learning_rate": 1.2427466446543765e-05, "loss": 0.368, "step": 25430 }, { "epoch": 5.004056944751789, "grad_norm": 1.1577160358428955, "learning_rate": 1.2424409184016633e-05, "loss": 0.2709, "step": 25440 }, { "epoch": 5.006023948267807, "grad_norm": 0.8124995827674866, "learning_rate": 1.24213519214895e-05, "loss": 0.4429, "step": 25450 }, { "epoch": 5.007990951783826, "grad_norm": 0.6613554358482361, "learning_rate": 1.2418294658962366e-05, "loss": 0.3882, "step": 25460 }, { "epoch": 5.0099579552998454, "grad_norm": 1.7531118392944336, "learning_rate": 1.2415237396435233e-05, "loss": 0.3169, "step": 25470 }, { "epoch": 5.011924958815864, "grad_norm": 1.7000898122787476, "learning_rate": 1.24121801339081e-05, "loss": 0.3254, "step": 25480 }, { "epoch": 5.013891962331883, "grad_norm": 1.7405487298965454, "learning_rate": 1.2409122871380968e-05, "loss": 0.3976, "step": 25490 }, { "epoch": 5.015858965847902, "grad_norm": 1.5716254711151123, "learning_rate": 1.2406065608853832e-05, "loss": 0.3814, "step": 25500 }, { "epoch": 5.015858965847902, "eval_loss": 0.15875256061553955, "eval_runtime": 8.886, "eval_samples_per_second": 5.627, "eval_steps_per_second": 2.813, "step": 25500 }, { "epoch": 5.01782596936392, "grad_norm": 1.5281723737716675, "learning_rate": 1.24030083463267e-05, "loss": 0.3889, "step": 25510 }, { "epoch": 5.019792972879939, "grad_norm": 0.9451437592506409, "learning_rate": 1.2399951083799566e-05, "loss": 0.2675, "step": 25520 }, { "epoch": 5.021759976395958, "grad_norm": 1.0830464363098145, "learning_rate": 1.2396893821272433e-05, "loss": 0.4978, "step": 25530 }, { "epoch": 5.023726979911976, "grad_norm": 1.3201954364776611, "learning_rate": 1.23938365587453e-05, "loss": 0.3073, "step": 25540 }, { "epoch": 5.0256939834279954, "grad_norm": 1.1136363744735718, "learning_rate": 1.2390779296218168e-05, "loss": 0.3773, "step": 25550 }, { "epoch": 5.0276609869440145, "grad_norm": 1.7179425954818726, "learning_rate": 1.2387722033691034e-05, "loss": 0.463, "step": 25560 }, { "epoch": 5.029627990460033, "grad_norm": 1.7579281330108643, "learning_rate": 1.2384664771163901e-05, "loss": 0.2775, "step": 25570 }, { "epoch": 5.031594993976052, "grad_norm": 1.5655035972595215, "learning_rate": 1.2381607508636769e-05, "loss": 0.4461, "step": 25580 }, { "epoch": 5.033561997492071, "grad_norm": 1.0195614099502563, "learning_rate": 1.2378550246109635e-05, "loss": 0.4831, "step": 25590 }, { "epoch": 5.035529001008089, "grad_norm": 1.006157398223877, "learning_rate": 1.2375492983582502e-05, "loss": 0.3355, "step": 25600 }, { "epoch": 5.037496004524108, "grad_norm": 0.8941965103149414, "learning_rate": 1.2372435721055368e-05, "loss": 0.3134, "step": 25610 }, { "epoch": 5.039463008040127, "grad_norm": 1.5068615674972534, "learning_rate": 1.2369378458528234e-05, "loss": 0.3168, "step": 25620 }, { "epoch": 5.0414300115561455, "grad_norm": 1.5665245056152344, "learning_rate": 1.2366321196001101e-05, "loss": 0.438, "step": 25630 }, { "epoch": 5.0433970150721645, "grad_norm": 1.9946467876434326, "learning_rate": 1.2363263933473969e-05, "loss": 0.313, "step": 25640 }, { "epoch": 5.045364018588184, "grad_norm": 0.8916930556297302, "learning_rate": 1.2360206670946834e-05, "loss": 0.3134, "step": 25650 }, { "epoch": 5.047331022104202, "grad_norm": 1.5052319765090942, "learning_rate": 1.2357149408419702e-05, "loss": 0.234, "step": 25660 }, { "epoch": 5.049298025620221, "grad_norm": 1.262294888496399, "learning_rate": 1.235409214589257e-05, "loss": 0.4548, "step": 25670 }, { "epoch": 5.05126502913624, "grad_norm": 1.7379635572433472, "learning_rate": 1.2351034883365437e-05, "loss": 0.3546, "step": 25680 }, { "epoch": 5.053232032652258, "grad_norm": 0.9460011124610901, "learning_rate": 1.2347977620838302e-05, "loss": 0.3477, "step": 25690 }, { "epoch": 5.055199036168277, "grad_norm": 2.176176071166992, "learning_rate": 1.234492035831117e-05, "loss": 0.3871, "step": 25700 }, { "epoch": 5.057166039684296, "grad_norm": 1.6649911403656006, "learning_rate": 1.2341863095784037e-05, "loss": 0.472, "step": 25710 }, { "epoch": 5.0591330432003145, "grad_norm": 1.012190580368042, "learning_rate": 1.2338805833256903e-05, "loss": 0.4348, "step": 25720 }, { "epoch": 5.061100046716334, "grad_norm": 1.0883797407150269, "learning_rate": 1.2335748570729769e-05, "loss": 0.3849, "step": 25730 }, { "epoch": 5.063067050232353, "grad_norm": 1.2795614004135132, "learning_rate": 1.2332691308202636e-05, "loss": 0.4584, "step": 25740 }, { "epoch": 5.065034053748371, "grad_norm": 0.5763323903083801, "learning_rate": 1.2329634045675502e-05, "loss": 0.3416, "step": 25750 }, { "epoch": 5.06700105726439, "grad_norm": 2.406430959701538, "learning_rate": 1.232657678314837e-05, "loss": 0.2789, "step": 25760 }, { "epoch": 5.068968060780409, "grad_norm": 1.3768939971923828, "learning_rate": 1.2323519520621237e-05, "loss": 0.3298, "step": 25770 }, { "epoch": 5.070935064296427, "grad_norm": 1.4049491882324219, "learning_rate": 1.2320462258094103e-05, "loss": 0.3892, "step": 25780 }, { "epoch": 5.072902067812446, "grad_norm": 0.5275664329528809, "learning_rate": 1.231740499556697e-05, "loss": 0.3585, "step": 25790 }, { "epoch": 5.074869071328465, "grad_norm": 3.447392702102661, "learning_rate": 1.2314347733039838e-05, "loss": 0.4339, "step": 25800 }, { "epoch": 5.076836074844484, "grad_norm": 0.7431132793426514, "learning_rate": 1.2311290470512705e-05, "loss": 0.4617, "step": 25810 }, { "epoch": 5.078803078360503, "grad_norm": 0.9460954666137695, "learning_rate": 1.2308233207985571e-05, "loss": 0.3782, "step": 25820 }, { "epoch": 5.080770081876521, "grad_norm": 1.6559048891067505, "learning_rate": 1.2305175945458439e-05, "loss": 0.4792, "step": 25830 }, { "epoch": 5.08273708539254, "grad_norm": 1.7823586463928223, "learning_rate": 1.2302118682931303e-05, "loss": 0.3103, "step": 25840 }, { "epoch": 5.084704088908559, "grad_norm": 0.5632954835891724, "learning_rate": 1.229906142040417e-05, "loss": 0.2107, "step": 25850 }, { "epoch": 5.086671092424577, "grad_norm": 1.380471110343933, "learning_rate": 1.2296004157877038e-05, "loss": 0.4044, "step": 25860 }, { "epoch": 5.088638095940596, "grad_norm": 1.372969388961792, "learning_rate": 1.2292946895349903e-05, "loss": 0.3801, "step": 25870 }, { "epoch": 5.090605099456615, "grad_norm": 1.3248234987258911, "learning_rate": 1.2289889632822771e-05, "loss": 0.4301, "step": 25880 }, { "epoch": 5.092572102972634, "grad_norm": 1.0714397430419922, "learning_rate": 1.2286832370295638e-05, "loss": 0.4881, "step": 25890 }, { "epoch": 5.094539106488653, "grad_norm": 0.6456104516983032, "learning_rate": 1.2283775107768506e-05, "loss": 0.4173, "step": 25900 }, { "epoch": 5.096506110004672, "grad_norm": 1.3249002695083618, "learning_rate": 1.2280717845241372e-05, "loss": 0.283, "step": 25910 }, { "epoch": 5.09847311352069, "grad_norm": 0.9373102188110352, "learning_rate": 1.2277660582714239e-05, "loss": 0.3163, "step": 25920 }, { "epoch": 5.100440117036709, "grad_norm": 0.5609747171401978, "learning_rate": 1.2274603320187107e-05, "loss": 0.4604, "step": 25930 }, { "epoch": 5.102407120552728, "grad_norm": 0.5619985461235046, "learning_rate": 1.2271546057659974e-05, "loss": 0.2962, "step": 25940 }, { "epoch": 5.104374124068746, "grad_norm": 1.2047351598739624, "learning_rate": 1.2268488795132838e-05, "loss": 0.3264, "step": 25950 }, { "epoch": 5.106341127584765, "grad_norm": 0.7514748573303223, "learning_rate": 1.2265431532605706e-05, "loss": 0.392, "step": 25960 }, { "epoch": 5.1083081311007845, "grad_norm": 1.2202684879302979, "learning_rate": 1.2262374270078571e-05, "loss": 0.4654, "step": 25970 }, { "epoch": 5.110275134616803, "grad_norm": 1.5096663236618042, "learning_rate": 1.2259317007551439e-05, "loss": 0.3987, "step": 25980 }, { "epoch": 5.112242138132822, "grad_norm": 2.5822386741638184, "learning_rate": 1.2256259745024306e-05, "loss": 0.4966, "step": 25990 }, { "epoch": 5.114209141648841, "grad_norm": 1.6036911010742188, "learning_rate": 1.2253202482497172e-05, "loss": 0.3473, "step": 26000 }, { "epoch": 5.114209141648841, "eval_loss": 0.1639460325241089, "eval_runtime": 8.8667, "eval_samples_per_second": 5.639, "eval_steps_per_second": 2.82, "step": 26000 }, { "epoch": 5.116176145164859, "grad_norm": 5.09474515914917, "learning_rate": 1.225014521997004e-05, "loss": 0.3937, "step": 26010 }, { "epoch": 5.118143148680878, "grad_norm": 0.8852512240409851, "learning_rate": 1.2247087957442907e-05, "loss": 0.3332, "step": 26020 }, { "epoch": 5.120110152196897, "grad_norm": 1.108298420906067, "learning_rate": 1.2244030694915774e-05, "loss": 0.2213, "step": 26030 }, { "epoch": 5.122077155712915, "grad_norm": 1.3808269500732422, "learning_rate": 1.224097343238864e-05, "loss": 0.3315, "step": 26040 }, { "epoch": 5.1240441592289345, "grad_norm": 1.3293715715408325, "learning_rate": 1.2237916169861508e-05, "loss": 0.3907, "step": 26050 }, { "epoch": 5.126011162744954, "grad_norm": 2.228424310684204, "learning_rate": 1.2234858907334375e-05, "loss": 0.3814, "step": 26060 }, { "epoch": 5.127978166260972, "grad_norm": 0.8705452680587769, "learning_rate": 1.223180164480724e-05, "loss": 0.3779, "step": 26070 }, { "epoch": 5.129945169776991, "grad_norm": 1.1205730438232422, "learning_rate": 1.2228744382280107e-05, "loss": 0.3116, "step": 26080 }, { "epoch": 5.13191217329301, "grad_norm": 3.4011454582214355, "learning_rate": 1.2225687119752974e-05, "loss": 0.4678, "step": 26090 }, { "epoch": 5.133879176809028, "grad_norm": 1.2198089361190796, "learning_rate": 1.222262985722584e-05, "loss": 0.5545, "step": 26100 }, { "epoch": 5.135846180325047, "grad_norm": 1.411157488822937, "learning_rate": 1.2219572594698707e-05, "loss": 0.3571, "step": 26110 }, { "epoch": 5.137813183841066, "grad_norm": 1.0183827877044678, "learning_rate": 1.2216515332171575e-05, "loss": 0.2842, "step": 26120 }, { "epoch": 5.1397801873570845, "grad_norm": 0.849287211894989, "learning_rate": 1.221345806964444e-05, "loss": 0.3608, "step": 26130 }, { "epoch": 5.141747190873104, "grad_norm": 1.7928563356399536, "learning_rate": 1.2210400807117308e-05, "loss": 0.3096, "step": 26140 }, { "epoch": 5.143714194389123, "grad_norm": 0.9198914170265198, "learning_rate": 1.2207343544590176e-05, "loss": 0.2994, "step": 26150 }, { "epoch": 5.145681197905141, "grad_norm": 1.2599544525146484, "learning_rate": 1.2204286282063043e-05, "loss": 0.2084, "step": 26160 }, { "epoch": 5.14764820142116, "grad_norm": 0.6974878311157227, "learning_rate": 1.2201229019535909e-05, "loss": 0.4318, "step": 26170 }, { "epoch": 5.149615204937179, "grad_norm": 1.316336750984192, "learning_rate": 1.2198171757008775e-05, "loss": 0.382, "step": 26180 }, { "epoch": 5.151582208453197, "grad_norm": 1.2267037630081177, "learning_rate": 1.219511449448164e-05, "loss": 0.3681, "step": 26190 }, { "epoch": 5.153549211969216, "grad_norm": 3.4800543785095215, "learning_rate": 1.2192057231954508e-05, "loss": 0.4206, "step": 26200 }, { "epoch": 5.155516215485235, "grad_norm": 0.8750397562980652, "learning_rate": 1.2188999969427375e-05, "loss": 0.4165, "step": 26210 }, { "epoch": 5.157483219001254, "grad_norm": 1.767290711402893, "learning_rate": 1.2185942706900243e-05, "loss": 0.3753, "step": 26220 }, { "epoch": 5.159450222517273, "grad_norm": 1.1647064685821533, "learning_rate": 1.2182885444373109e-05, "loss": 0.3647, "step": 26230 }, { "epoch": 5.161417226033292, "grad_norm": 1.799381971359253, "learning_rate": 1.2179828181845976e-05, "loss": 0.3751, "step": 26240 }, { "epoch": 5.16338422954931, "grad_norm": 1.3957469463348389, "learning_rate": 1.2176770919318844e-05, "loss": 0.2916, "step": 26250 }, { "epoch": 5.165351233065329, "grad_norm": 1.7819017171859741, "learning_rate": 1.217371365679171e-05, "loss": 0.2783, "step": 26260 }, { "epoch": 5.167318236581348, "grad_norm": 1.2645076513290405, "learning_rate": 1.2170656394264577e-05, "loss": 0.4173, "step": 26270 }, { "epoch": 5.169285240097366, "grad_norm": 1.433780550956726, "learning_rate": 1.2167599131737444e-05, "loss": 0.463, "step": 26280 }, { "epoch": 5.171252243613385, "grad_norm": 1.155297040939331, "learning_rate": 1.2164541869210308e-05, "loss": 0.3507, "step": 26290 }, { "epoch": 5.1732192471294045, "grad_norm": 2.542121648788452, "learning_rate": 1.2161484606683176e-05, "loss": 0.3733, "step": 26300 }, { "epoch": 5.175186250645423, "grad_norm": 1.2050806283950806, "learning_rate": 1.2158427344156043e-05, "loss": 0.3418, "step": 26310 }, { "epoch": 5.177153254161442, "grad_norm": 0.7434633374214172, "learning_rate": 1.215537008162891e-05, "loss": 0.3878, "step": 26320 }, { "epoch": 5.179120257677461, "grad_norm": 1.3844777345657349, "learning_rate": 1.2152312819101777e-05, "loss": 0.3923, "step": 26330 }, { "epoch": 5.181087261193479, "grad_norm": 1.2223122119903564, "learning_rate": 1.2149255556574644e-05, "loss": 0.3741, "step": 26340 }, { "epoch": 5.183054264709498, "grad_norm": 1.156844139099121, "learning_rate": 1.2146198294047512e-05, "loss": 0.2896, "step": 26350 }, { "epoch": 5.185021268225517, "grad_norm": 0.49281927943229675, "learning_rate": 1.2143141031520377e-05, "loss": 0.3394, "step": 26360 }, { "epoch": 5.186988271741535, "grad_norm": 2.524413585662842, "learning_rate": 1.2140083768993245e-05, "loss": 0.4893, "step": 26370 }, { "epoch": 5.1889552752575545, "grad_norm": 3.383070230484009, "learning_rate": 1.2137026506466112e-05, "loss": 0.3611, "step": 26380 }, { "epoch": 5.190922278773574, "grad_norm": 1.3388442993164062, "learning_rate": 1.2133969243938978e-05, "loss": 0.3775, "step": 26390 }, { "epoch": 5.192889282289592, "grad_norm": 1.3757628202438354, "learning_rate": 1.2130911981411846e-05, "loss": 0.3374, "step": 26400 }, { "epoch": 5.194856285805611, "grad_norm": 1.150669813156128, "learning_rate": 1.2127854718884711e-05, "loss": 0.4746, "step": 26410 }, { "epoch": 5.19682328932163, "grad_norm": 0.8864215612411499, "learning_rate": 1.2124797456357577e-05, "loss": 0.3137, "step": 26420 }, { "epoch": 5.198790292837648, "grad_norm": 3.9217827320098877, "learning_rate": 1.2121740193830445e-05, "loss": 0.3472, "step": 26430 }, { "epoch": 5.200757296353667, "grad_norm": 1.4964346885681152, "learning_rate": 1.2118682931303312e-05, "loss": 0.4938, "step": 26440 }, { "epoch": 5.202724299869686, "grad_norm": 0.8897117972373962, "learning_rate": 1.2115625668776178e-05, "loss": 0.2581, "step": 26450 }, { "epoch": 5.2046913033857045, "grad_norm": 1.115395426750183, "learning_rate": 1.2112568406249045e-05, "loss": 0.4467, "step": 26460 }, { "epoch": 5.206658306901724, "grad_norm": 1.5817797183990479, "learning_rate": 1.2109511143721913e-05, "loss": 0.4972, "step": 26470 }, { "epoch": 5.208625310417743, "grad_norm": 1.0178269147872925, "learning_rate": 1.210645388119478e-05, "loss": 0.4131, "step": 26480 }, { "epoch": 5.210592313933761, "grad_norm": 2.237933874130249, "learning_rate": 1.2103396618667646e-05, "loss": 0.2859, "step": 26490 }, { "epoch": 5.21255931744978, "grad_norm": 1.0409702062606812, "learning_rate": 1.2100339356140513e-05, "loss": 0.2925, "step": 26500 }, { "epoch": 5.21255931744978, "eval_loss": 0.1575675904750824, "eval_runtime": 8.8783, "eval_samples_per_second": 5.632, "eval_steps_per_second": 2.816, "step": 26500 }, { "epoch": 5.214526320965799, "grad_norm": 1.6005821228027344, "learning_rate": 1.2097282093613381e-05, "loss": 0.3946, "step": 26510 }, { "epoch": 5.216493324481817, "grad_norm": 1.4268238544464111, "learning_rate": 1.2094224831086245e-05, "loss": 0.3797, "step": 26520 }, { "epoch": 5.218460327997836, "grad_norm": 1.2413865327835083, "learning_rate": 1.2091167568559113e-05, "loss": 0.319, "step": 26530 }, { "epoch": 5.220427331513855, "grad_norm": 0.78890460729599, "learning_rate": 1.208811030603198e-05, "loss": 0.3216, "step": 26540 }, { "epoch": 5.222394335029874, "grad_norm": 0.7617972493171692, "learning_rate": 1.2085053043504846e-05, "loss": 0.4423, "step": 26550 }, { "epoch": 5.224361338545893, "grad_norm": 1.5445433855056763, "learning_rate": 1.2081995780977713e-05, "loss": 0.2114, "step": 26560 }, { "epoch": 5.226328342061912, "grad_norm": 1.269335150718689, "learning_rate": 1.207893851845058e-05, "loss": 0.4142, "step": 26570 }, { "epoch": 5.22829534557793, "grad_norm": 1.3220189809799194, "learning_rate": 1.2075881255923446e-05, "loss": 0.2565, "step": 26580 }, { "epoch": 5.230262349093949, "grad_norm": 1.7140494585037231, "learning_rate": 1.2072823993396314e-05, "loss": 0.2602, "step": 26590 }, { "epoch": 5.232229352609968, "grad_norm": 1.852797508239746, "learning_rate": 1.2069766730869181e-05, "loss": 0.24, "step": 26600 }, { "epoch": 5.234196356125986, "grad_norm": 1.7020344734191895, "learning_rate": 1.2066709468342049e-05, "loss": 0.2881, "step": 26610 }, { "epoch": 5.236163359642005, "grad_norm": 0.5673180818557739, "learning_rate": 1.2063652205814915e-05, "loss": 0.3332, "step": 26620 }, { "epoch": 5.2381303631580245, "grad_norm": 0.687757670879364, "learning_rate": 1.206059494328778e-05, "loss": 0.3021, "step": 26630 }, { "epoch": 5.240097366674043, "grad_norm": 1.2419289350509644, "learning_rate": 1.2057537680760646e-05, "loss": 0.3224, "step": 26640 }, { "epoch": 5.242064370190062, "grad_norm": 1.7298426628112793, "learning_rate": 1.2054480418233514e-05, "loss": 0.3101, "step": 26650 }, { "epoch": 5.244031373706081, "grad_norm": 3.0599703788757324, "learning_rate": 1.2051423155706381e-05, "loss": 0.2711, "step": 26660 }, { "epoch": 5.245998377222099, "grad_norm": 0.3817830979824066, "learning_rate": 1.2048365893179249e-05, "loss": 0.4047, "step": 26670 }, { "epoch": 5.247965380738118, "grad_norm": 1.0103836059570312, "learning_rate": 1.2045308630652114e-05, "loss": 0.2704, "step": 26680 }, { "epoch": 5.249932384254137, "grad_norm": 1.6104187965393066, "learning_rate": 1.2042251368124982e-05, "loss": 0.2766, "step": 26690 }, { "epoch": 5.251899387770155, "grad_norm": 0.8950489163398743, "learning_rate": 1.203919410559785e-05, "loss": 0.458, "step": 26700 }, { "epoch": 5.2538663912861745, "grad_norm": 1.5272265672683716, "learning_rate": 1.2036136843070715e-05, "loss": 0.3857, "step": 26710 }, { "epoch": 5.255833394802194, "grad_norm": 1.1579011678695679, "learning_rate": 1.2033079580543583e-05, "loss": 0.4661, "step": 26720 }, { "epoch": 5.257800398318212, "grad_norm": 1.5543522834777832, "learning_rate": 1.203002231801645e-05, "loss": 0.3932, "step": 26730 }, { "epoch": 5.259767401834231, "grad_norm": 1.385425090789795, "learning_rate": 1.2026965055489318e-05, "loss": 0.368, "step": 26740 }, { "epoch": 5.26173440535025, "grad_norm": 0.7192743420600891, "learning_rate": 1.2023907792962182e-05, "loss": 0.3624, "step": 26750 }, { "epoch": 5.263701408866268, "grad_norm": 0.9118638634681702, "learning_rate": 1.2020850530435049e-05, "loss": 0.4483, "step": 26760 }, { "epoch": 5.265668412382287, "grad_norm": 1.5351526737213135, "learning_rate": 1.2017793267907915e-05, "loss": 0.3773, "step": 26770 }, { "epoch": 5.267635415898306, "grad_norm": 1.1610013246536255, "learning_rate": 1.2014736005380782e-05, "loss": 0.4611, "step": 26780 }, { "epoch": 5.2696024194143245, "grad_norm": 1.0738345384597778, "learning_rate": 1.201167874285365e-05, "loss": 0.3372, "step": 26790 }, { "epoch": 5.271569422930344, "grad_norm": 1.3111422061920166, "learning_rate": 1.2008621480326517e-05, "loss": 0.3991, "step": 26800 }, { "epoch": 5.273536426446363, "grad_norm": 1.3614267110824585, "learning_rate": 1.2005564217799383e-05, "loss": 0.4385, "step": 26810 }, { "epoch": 5.275503429962381, "grad_norm": 1.3455390930175781, "learning_rate": 1.200250695527225e-05, "loss": 0.3435, "step": 26820 }, { "epoch": 5.2774704334784, "grad_norm": 1.029335618019104, "learning_rate": 1.1999449692745118e-05, "loss": 0.3084, "step": 26830 }, { "epoch": 5.279437436994419, "grad_norm": 1.7655671834945679, "learning_rate": 1.1996392430217984e-05, "loss": 0.3199, "step": 26840 }, { "epoch": 5.281404440510437, "grad_norm": 1.5547866821289062, "learning_rate": 1.1993335167690851e-05, "loss": 0.3297, "step": 26850 }, { "epoch": 5.283371444026456, "grad_norm": 2.986433506011963, "learning_rate": 1.1990277905163717e-05, "loss": 0.3683, "step": 26860 }, { "epoch": 5.285338447542475, "grad_norm": 2.3336493968963623, "learning_rate": 1.1987220642636583e-05, "loss": 0.3083, "step": 26870 }, { "epoch": 5.287305451058494, "grad_norm": 1.0171363353729248, "learning_rate": 1.198416338010945e-05, "loss": 0.427, "step": 26880 }, { "epoch": 5.289272454574513, "grad_norm": 1.2654080390930176, "learning_rate": 1.1981106117582318e-05, "loss": 0.431, "step": 26890 }, { "epoch": 5.291239458090532, "grad_norm": 1.0362647771835327, "learning_rate": 1.1978048855055184e-05, "loss": 0.3448, "step": 26900 }, { "epoch": 5.29320646160655, "grad_norm": 0.8070294260978699, "learning_rate": 1.1974991592528051e-05, "loss": 0.3733, "step": 26910 }, { "epoch": 5.295173465122569, "grad_norm": 1.697971224784851, "learning_rate": 1.1971934330000919e-05, "loss": 0.4335, "step": 26920 }, { "epoch": 5.297140468638588, "grad_norm": 1.8027409315109253, "learning_rate": 1.1968877067473786e-05, "loss": 0.3142, "step": 26930 }, { "epoch": 5.299107472154606, "grad_norm": 1.0393763780593872, "learning_rate": 1.1965819804946652e-05, "loss": 0.403, "step": 26940 }, { "epoch": 5.301074475670625, "grad_norm": 0.7654443383216858, "learning_rate": 1.196276254241952e-05, "loss": 0.4825, "step": 26950 }, { "epoch": 5.3030414791866445, "grad_norm": 1.463889479637146, "learning_rate": 1.1959705279892387e-05, "loss": 0.4558, "step": 26960 }, { "epoch": 5.305008482702663, "grad_norm": 1.0815491676330566, "learning_rate": 1.195664801736525e-05, "loss": 0.2217, "step": 26970 }, { "epoch": 5.306975486218682, "grad_norm": 0.9593465924263, "learning_rate": 1.1953590754838118e-05, "loss": 0.3366, "step": 26980 }, { "epoch": 5.308942489734701, "grad_norm": 1.3412492275238037, "learning_rate": 1.1950533492310986e-05, "loss": 0.2579, "step": 26990 }, { "epoch": 5.310909493250719, "grad_norm": 1.191043496131897, "learning_rate": 1.1947476229783852e-05, "loss": 0.4985, "step": 27000 }, { "epoch": 5.310909493250719, "eval_loss": 0.15775032341480255, "eval_runtime": 8.861, "eval_samples_per_second": 5.643, "eval_steps_per_second": 2.821, "step": 27000 }, { "epoch": 5.312876496766738, "grad_norm": 2.1759085655212402, "learning_rate": 1.1944418967256719e-05, "loss": 0.366, "step": 27010 }, { "epoch": 5.314843500282757, "grad_norm": 1.6948908567428589, "learning_rate": 1.1941361704729586e-05, "loss": 0.414, "step": 27020 }, { "epoch": 5.316810503798775, "grad_norm": 1.4809417724609375, "learning_rate": 1.1938304442202452e-05, "loss": 0.3949, "step": 27030 }, { "epoch": 5.3187775073147945, "grad_norm": 2.090308904647827, "learning_rate": 1.193524717967532e-05, "loss": 0.4441, "step": 27040 }, { "epoch": 5.320744510830814, "grad_norm": 1.194943904876709, "learning_rate": 1.1932189917148187e-05, "loss": 0.4599, "step": 27050 }, { "epoch": 5.322711514346832, "grad_norm": 1.152958869934082, "learning_rate": 1.1929132654621055e-05, "loss": 0.4161, "step": 27060 }, { "epoch": 5.324678517862851, "grad_norm": 1.3262512683868408, "learning_rate": 1.192607539209392e-05, "loss": 0.4106, "step": 27070 }, { "epoch": 5.32664552137887, "grad_norm": 1.7198249101638794, "learning_rate": 1.1923018129566788e-05, "loss": 0.2689, "step": 27080 }, { "epoch": 5.328612524894888, "grad_norm": 0.76631760597229, "learning_rate": 1.1919960867039652e-05, "loss": 0.4833, "step": 27090 }, { "epoch": 5.330579528410907, "grad_norm": 1.0496758222579956, "learning_rate": 1.191690360451252e-05, "loss": 0.4318, "step": 27100 }, { "epoch": 5.332546531926925, "grad_norm": 0.9311888217926025, "learning_rate": 1.1913846341985387e-05, "loss": 0.4997, "step": 27110 }, { "epoch": 5.3345135354429445, "grad_norm": 1.919349193572998, "learning_rate": 1.1910789079458254e-05, "loss": 0.4288, "step": 27120 }, { "epoch": 5.336480538958964, "grad_norm": 1.031426191329956, "learning_rate": 1.190773181693112e-05, "loss": 0.3159, "step": 27130 }, { "epoch": 5.338447542474982, "grad_norm": 0.9320018291473389, "learning_rate": 1.1904674554403988e-05, "loss": 0.4373, "step": 27140 }, { "epoch": 5.340414545991001, "grad_norm": 0.9622688293457031, "learning_rate": 1.1901617291876855e-05, "loss": 0.4539, "step": 27150 }, { "epoch": 5.34238154950702, "grad_norm": 1.055091142654419, "learning_rate": 1.1898560029349721e-05, "loss": 0.556, "step": 27160 }, { "epoch": 5.344348553023038, "grad_norm": 1.1777045726776123, "learning_rate": 1.1895502766822588e-05, "loss": 0.2429, "step": 27170 }, { "epoch": 5.346315556539057, "grad_norm": 1.031323790550232, "learning_rate": 1.1892445504295456e-05, "loss": 0.2711, "step": 27180 }, { "epoch": 5.348282560055076, "grad_norm": 1.3598463535308838, "learning_rate": 1.1889388241768323e-05, "loss": 0.376, "step": 27190 }, { "epoch": 5.3502495635710945, "grad_norm": 0.9220485091209412, "learning_rate": 1.1886330979241187e-05, "loss": 0.4076, "step": 27200 }, { "epoch": 5.352216567087114, "grad_norm": 1.2869757413864136, "learning_rate": 1.1883273716714055e-05, "loss": 0.2967, "step": 27210 }, { "epoch": 5.354183570603133, "grad_norm": 1.306017279624939, "learning_rate": 1.188021645418692e-05, "loss": 0.3353, "step": 27220 }, { "epoch": 5.356150574119151, "grad_norm": 1.5376003980636597, "learning_rate": 1.1877159191659788e-05, "loss": 0.3431, "step": 27230 }, { "epoch": 5.35811757763517, "grad_norm": 1.538486361503601, "learning_rate": 1.1874101929132656e-05, "loss": 0.3858, "step": 27240 }, { "epoch": 5.360084581151189, "grad_norm": 1.009808897972107, "learning_rate": 1.1871044666605523e-05, "loss": 0.3684, "step": 27250 }, { "epoch": 5.362051584667207, "grad_norm": 1.3101840019226074, "learning_rate": 1.1867987404078389e-05, "loss": 0.3092, "step": 27260 }, { "epoch": 5.364018588183226, "grad_norm": 1.3292027711868286, "learning_rate": 1.1864930141551256e-05, "loss": 0.4574, "step": 27270 }, { "epoch": 5.365985591699245, "grad_norm": 2.2095041275024414, "learning_rate": 1.1861872879024124e-05, "loss": 0.3182, "step": 27280 }, { "epoch": 5.367952595215264, "grad_norm": 0.6917831301689148, "learning_rate": 1.185881561649699e-05, "loss": 0.2448, "step": 27290 }, { "epoch": 5.369919598731283, "grad_norm": 0.4980989396572113, "learning_rate": 1.1855758353969857e-05, "loss": 0.3643, "step": 27300 }, { "epoch": 5.371886602247302, "grad_norm": 1.0535064935684204, "learning_rate": 1.1852701091442723e-05, "loss": 0.482, "step": 27310 }, { "epoch": 5.37385360576332, "grad_norm": 1.0042133331298828, "learning_rate": 1.1849643828915589e-05, "loss": 0.3711, "step": 27320 }, { "epoch": 5.375820609279339, "grad_norm": 1.6627103090286255, "learning_rate": 1.1846586566388456e-05, "loss": 0.4113, "step": 27330 }, { "epoch": 5.377787612795358, "grad_norm": 1.3288803100585938, "learning_rate": 1.1843529303861324e-05, "loss": 0.4166, "step": 27340 }, { "epoch": 5.379754616311376, "grad_norm": 2.352654457092285, "learning_rate": 1.184047204133419e-05, "loss": 0.4147, "step": 27350 }, { "epoch": 5.381721619827395, "grad_norm": 1.5574378967285156, "learning_rate": 1.1837414778807057e-05, "loss": 0.4605, "step": 27360 }, { "epoch": 5.3836886233434145, "grad_norm": 0.9702771902084351, "learning_rate": 1.1834357516279924e-05, "loss": 0.5087, "step": 27370 }, { "epoch": 5.385655626859433, "grad_norm": 0.9326369762420654, "learning_rate": 1.1831300253752792e-05, "loss": 0.3388, "step": 27380 }, { "epoch": 5.387622630375452, "grad_norm": 0.9466789364814758, "learning_rate": 1.1828242991225658e-05, "loss": 0.3118, "step": 27390 }, { "epoch": 5.389589633891471, "grad_norm": 1.6036527156829834, "learning_rate": 1.1825185728698525e-05, "loss": 0.3969, "step": 27400 }, { "epoch": 5.391556637407489, "grad_norm": 1.7829012870788574, "learning_rate": 1.1822128466171392e-05, "loss": 0.3626, "step": 27410 }, { "epoch": 5.393523640923508, "grad_norm": 1.2935879230499268, "learning_rate": 1.1819071203644258e-05, "loss": 0.4981, "step": 27420 }, { "epoch": 5.395490644439527, "grad_norm": 2.1549735069274902, "learning_rate": 1.1816013941117124e-05, "loss": 0.3817, "step": 27430 }, { "epoch": 5.397457647955545, "grad_norm": 0.5140044689178467, "learning_rate": 1.1812956678589991e-05, "loss": 0.2148, "step": 27440 }, { "epoch": 5.3994246514715645, "grad_norm": 2.3934736251831055, "learning_rate": 1.1809899416062857e-05, "loss": 0.382, "step": 27450 }, { "epoch": 5.401391654987584, "grad_norm": 1.491032361984253, "learning_rate": 1.1806842153535725e-05, "loss": 0.2914, "step": 27460 }, { "epoch": 5.403358658503602, "grad_norm": 1.6353907585144043, "learning_rate": 1.1803784891008592e-05, "loss": 0.2502, "step": 27470 }, { "epoch": 5.405325662019621, "grad_norm": 1.7032148838043213, "learning_rate": 1.1800727628481458e-05, "loss": 0.5484, "step": 27480 }, { "epoch": 5.40729266553564, "grad_norm": 1.4025053977966309, "learning_rate": 1.1797670365954325e-05, "loss": 0.3196, "step": 27490 }, { "epoch": 5.409259669051658, "grad_norm": 1.2315196990966797, "learning_rate": 1.1794613103427193e-05, "loss": 0.4845, "step": 27500 }, { "epoch": 5.409259669051658, "eval_loss": 0.16736486554145813, "eval_runtime": 8.8667, "eval_samples_per_second": 5.639, "eval_steps_per_second": 2.82, "step": 27500 }, { "epoch": 5.411226672567677, "grad_norm": 1.790334701538086, "learning_rate": 1.179155584090006e-05, "loss": 0.3443, "step": 27510 }, { "epoch": 5.413193676083696, "grad_norm": 0.7707914113998413, "learning_rate": 1.1788498578372926e-05, "loss": 0.3989, "step": 27520 }, { "epoch": 5.4151606795997145, "grad_norm": 1.0538116693496704, "learning_rate": 1.1785441315845794e-05, "loss": 0.4151, "step": 27530 }, { "epoch": 5.417127683115734, "grad_norm": 1.2277323007583618, "learning_rate": 1.1782384053318658e-05, "loss": 0.4971, "step": 27540 }, { "epoch": 5.419094686631753, "grad_norm": 2.6161787509918213, "learning_rate": 1.1779326790791525e-05, "loss": 0.3905, "step": 27550 }, { "epoch": 5.421061690147771, "grad_norm": 1.2206039428710938, "learning_rate": 1.1776269528264393e-05, "loss": 0.3759, "step": 27560 }, { "epoch": 5.42302869366379, "grad_norm": 1.2194130420684814, "learning_rate": 1.177321226573726e-05, "loss": 0.4211, "step": 27570 }, { "epoch": 5.424995697179809, "grad_norm": 1.6475505828857422, "learning_rate": 1.1770155003210126e-05, "loss": 0.4336, "step": 27580 }, { "epoch": 5.426962700695827, "grad_norm": 1.873953938484192, "learning_rate": 1.1767097740682993e-05, "loss": 0.3902, "step": 27590 }, { "epoch": 5.428929704211846, "grad_norm": 0.45552027225494385, "learning_rate": 1.1764040478155861e-05, "loss": 0.3605, "step": 27600 }, { "epoch": 5.430896707727865, "grad_norm": 1.3343397378921509, "learning_rate": 1.1760983215628727e-05, "loss": 0.3379, "step": 27610 }, { "epoch": 5.432863711243884, "grad_norm": 1.5617072582244873, "learning_rate": 1.1757925953101594e-05, "loss": 0.3483, "step": 27620 }, { "epoch": 5.434830714759903, "grad_norm": 0.8592610359191895, "learning_rate": 1.1754868690574462e-05, "loss": 0.3791, "step": 27630 }, { "epoch": 5.436797718275922, "grad_norm": 2.699916362762451, "learning_rate": 1.1751811428047329e-05, "loss": 0.391, "step": 27640 }, { "epoch": 5.43876472179194, "grad_norm": 1.036454677581787, "learning_rate": 1.1748754165520193e-05, "loss": 0.4393, "step": 27650 }, { "epoch": 5.440731725307959, "grad_norm": 1.018223524093628, "learning_rate": 1.174569690299306e-05, "loss": 0.3592, "step": 27660 }, { "epoch": 5.442698728823978, "grad_norm": 0.7438509464263916, "learning_rate": 1.1742639640465926e-05, "loss": 0.3895, "step": 27670 }, { "epoch": 5.444665732339996, "grad_norm": 1.229840874671936, "learning_rate": 1.1739582377938794e-05, "loss": 0.4443, "step": 27680 }, { "epoch": 5.446632735856015, "grad_norm": 1.2053213119506836, "learning_rate": 1.1736525115411661e-05, "loss": 0.3692, "step": 27690 }, { "epoch": 5.4485997393720345, "grad_norm": 1.5629843473434448, "learning_rate": 1.1733467852884529e-05, "loss": 0.4367, "step": 27700 }, { "epoch": 5.450566742888053, "grad_norm": 1.0979888439178467, "learning_rate": 1.1730410590357395e-05, "loss": 0.3375, "step": 27710 }, { "epoch": 5.452533746404072, "grad_norm": 0.9558001160621643, "learning_rate": 1.1727353327830262e-05, "loss": 0.4945, "step": 27720 }, { "epoch": 5.454500749920091, "grad_norm": 3.2173619270324707, "learning_rate": 1.172429606530313e-05, "loss": 0.4662, "step": 27730 }, { "epoch": 5.456467753436109, "grad_norm": 1.0213227272033691, "learning_rate": 1.1721238802775995e-05, "loss": 0.3302, "step": 27740 }, { "epoch": 5.458434756952128, "grad_norm": 1.0318022966384888, "learning_rate": 1.1718181540248863e-05, "loss": 0.4005, "step": 27750 }, { "epoch": 5.460401760468147, "grad_norm": 1.205824613571167, "learning_rate": 1.171512427772173e-05, "loss": 0.3246, "step": 27760 }, { "epoch": 5.462368763984165, "grad_norm": 1.5735535621643066, "learning_rate": 1.1712067015194594e-05, "loss": 0.4247, "step": 27770 }, { "epoch": 5.4643357675001845, "grad_norm": 2.0467681884765625, "learning_rate": 1.1709009752667462e-05, "loss": 0.4287, "step": 27780 }, { "epoch": 5.4663027710162035, "grad_norm": 0.7347807288169861, "learning_rate": 1.170595249014033e-05, "loss": 0.4136, "step": 27790 }, { "epoch": 5.468269774532222, "grad_norm": 1.6695806980133057, "learning_rate": 1.1702895227613195e-05, "loss": 0.3893, "step": 27800 }, { "epoch": 5.470236778048241, "grad_norm": 1.408737063407898, "learning_rate": 1.1699837965086063e-05, "loss": 0.3863, "step": 27810 }, { "epoch": 5.47220378156426, "grad_norm": 2.0042405128479004, "learning_rate": 1.169678070255893e-05, "loss": 0.3712, "step": 27820 }, { "epoch": 5.474170785080278, "grad_norm": 1.8257968425750732, "learning_rate": 1.1693723440031797e-05, "loss": 0.3769, "step": 27830 }, { "epoch": 5.476137788596297, "grad_norm": 1.2176882028579712, "learning_rate": 1.1690666177504663e-05, "loss": 0.4373, "step": 27840 }, { "epoch": 5.478104792112316, "grad_norm": 2.558542013168335, "learning_rate": 1.168760891497753e-05, "loss": 0.3779, "step": 27850 }, { "epoch": 5.4800717956283345, "grad_norm": 1.4738448858261108, "learning_rate": 1.1684551652450398e-05, "loss": 0.3334, "step": 27860 }, { "epoch": 5.4820387991443535, "grad_norm": 1.189650058746338, "learning_rate": 1.1681494389923264e-05, "loss": 0.4015, "step": 27870 }, { "epoch": 5.484005802660373, "grad_norm": 2.8915460109710693, "learning_rate": 1.167843712739613e-05, "loss": 0.3709, "step": 27880 }, { "epoch": 5.485972806176391, "grad_norm": 1.3140373229980469, "learning_rate": 1.1675379864868997e-05, "loss": 0.3003, "step": 27890 }, { "epoch": 5.48793980969241, "grad_norm": 2.5268616676330566, "learning_rate": 1.1672322602341863e-05, "loss": 0.3986, "step": 27900 }, { "epoch": 5.489906813208429, "grad_norm": 1.513447880744934, "learning_rate": 1.166926533981473e-05, "loss": 0.3804, "step": 27910 }, { "epoch": 5.491873816724447, "grad_norm": 1.0257861614227295, "learning_rate": 1.1666208077287598e-05, "loss": 0.3423, "step": 27920 }, { "epoch": 5.493840820240466, "grad_norm": 3.507874011993408, "learning_rate": 1.1663150814760464e-05, "loss": 0.3601, "step": 27930 }, { "epoch": 5.495807823756485, "grad_norm": 0.7899544835090637, "learning_rate": 1.1660093552233331e-05, "loss": 0.3116, "step": 27940 }, { "epoch": 5.4977748272725036, "grad_norm": 0.9232675433158875, "learning_rate": 1.1657036289706199e-05, "loss": 0.3821, "step": 27950 }, { "epoch": 5.499741830788523, "grad_norm": 1.5610233545303345, "learning_rate": 1.1653979027179066e-05, "loss": 0.2642, "step": 27960 }, { "epoch": 5.501708834304541, "grad_norm": 1.6391479969024658, "learning_rate": 1.1650921764651932e-05, "loss": 0.3912, "step": 27970 }, { "epoch": 5.50367583782056, "grad_norm": 0.9958533644676208, "learning_rate": 1.16478645021248e-05, "loss": 0.3767, "step": 27980 }, { "epoch": 5.505642841336579, "grad_norm": 1.2634178400039673, "learning_rate": 1.1644807239597663e-05, "loss": 0.3419, "step": 27990 }, { "epoch": 5.507609844852597, "grad_norm": 1.0763895511627197, "learning_rate": 1.1641749977070531e-05, "loss": 0.3763, "step": 28000 }, { "epoch": 5.507609844852597, "eval_loss": 0.1603037267923355, "eval_runtime": 8.8616, "eval_samples_per_second": 5.642, "eval_steps_per_second": 2.821, "step": 28000 }, { "epoch": 5.509576848368616, "grad_norm": 2.0165274143218994, "learning_rate": 1.1638692714543398e-05, "loss": 0.3444, "step": 28010 }, { "epoch": 5.511543851884635, "grad_norm": 1.7106163501739502, "learning_rate": 1.1635635452016266e-05, "loss": 0.4143, "step": 28020 }, { "epoch": 5.5135108554006536, "grad_norm": 1.8056857585906982, "learning_rate": 1.1632578189489132e-05, "loss": 0.3066, "step": 28030 }, { "epoch": 5.515477858916673, "grad_norm": 0.7835657596588135, "learning_rate": 1.1629520926961999e-05, "loss": 0.383, "step": 28040 }, { "epoch": 5.517444862432692, "grad_norm": 0.7902368307113647, "learning_rate": 1.1626463664434867e-05, "loss": 0.568, "step": 28050 }, { "epoch": 5.51941186594871, "grad_norm": 1.101770281791687, "learning_rate": 1.1623406401907732e-05, "loss": 0.2871, "step": 28060 }, { "epoch": 5.521378869464729, "grad_norm": 1.5315871238708496, "learning_rate": 1.16203491393806e-05, "loss": 0.4072, "step": 28070 }, { "epoch": 5.523345872980748, "grad_norm": 1.7434234619140625, "learning_rate": 1.1617291876853467e-05, "loss": 0.4428, "step": 28080 }, { "epoch": 5.525312876496766, "grad_norm": 0.9678106904029846, "learning_rate": 1.1614234614326335e-05, "loss": 0.3465, "step": 28090 }, { "epoch": 5.527279880012785, "grad_norm": 1.8099299669265747, "learning_rate": 1.16111773517992e-05, "loss": 0.429, "step": 28100 }, { "epoch": 5.5292468835288044, "grad_norm": 1.2735828161239624, "learning_rate": 1.1608120089272066e-05, "loss": 0.3512, "step": 28110 }, { "epoch": 5.531213887044823, "grad_norm": 1.0866531133651733, "learning_rate": 1.1605062826744932e-05, "loss": 0.3188, "step": 28120 }, { "epoch": 5.533180890560842, "grad_norm": 1.9226776361465454, "learning_rate": 1.16020055642178e-05, "loss": 0.3112, "step": 28130 }, { "epoch": 5.535147894076861, "grad_norm": 0.8396310806274414, "learning_rate": 1.1598948301690667e-05, "loss": 0.2962, "step": 28140 }, { "epoch": 5.537114897592879, "grad_norm": 1.6284130811691284, "learning_rate": 1.1595891039163535e-05, "loss": 0.2717, "step": 28150 }, { "epoch": 5.539081901108898, "grad_norm": 0.705360472202301, "learning_rate": 1.15928337766364e-05, "loss": 0.3184, "step": 28160 }, { "epoch": 5.541048904624917, "grad_norm": 2.09700345993042, "learning_rate": 1.1589776514109268e-05, "loss": 0.4465, "step": 28170 }, { "epoch": 5.543015908140935, "grad_norm": 0.7859101891517639, "learning_rate": 1.1586719251582135e-05, "loss": 0.355, "step": 28180 }, { "epoch": 5.5449829116569544, "grad_norm": 2.0662646293640137, "learning_rate": 1.1583661989055001e-05, "loss": 0.4825, "step": 28190 }, { "epoch": 5.5469499151729735, "grad_norm": 1.131251335144043, "learning_rate": 1.1580604726527869e-05, "loss": 0.3399, "step": 28200 }, { "epoch": 5.548916918688992, "grad_norm": 1.2447280883789062, "learning_rate": 1.1577547464000736e-05, "loss": 0.4206, "step": 28210 }, { "epoch": 5.550883922205011, "grad_norm": 2.063737154006958, "learning_rate": 1.15744902014736e-05, "loss": 0.4005, "step": 28220 }, { "epoch": 5.55285092572103, "grad_norm": 1.5366238355636597, "learning_rate": 1.1571432938946468e-05, "loss": 0.378, "step": 28230 }, { "epoch": 5.554817929237048, "grad_norm": 1.0414916276931763, "learning_rate": 1.1568375676419335e-05, "loss": 0.3657, "step": 28240 }, { "epoch": 5.556784932753067, "grad_norm": 0.9085996150970459, "learning_rate": 1.15653184138922e-05, "loss": 0.4206, "step": 28250 }, { "epoch": 5.558751936269086, "grad_norm": 1.2168952226638794, "learning_rate": 1.1562261151365068e-05, "loss": 0.2791, "step": 28260 }, { "epoch": 5.5607189397851045, "grad_norm": 2.0721793174743652, "learning_rate": 1.1559203888837936e-05, "loss": 0.4165, "step": 28270 }, { "epoch": 5.5626859433011235, "grad_norm": 1.996752142906189, "learning_rate": 1.1556146626310803e-05, "loss": 0.3526, "step": 28280 }, { "epoch": 5.564652946817143, "grad_norm": 0.9958781003952026, "learning_rate": 1.1553089363783669e-05, "loss": 0.2766, "step": 28290 }, { "epoch": 5.566619950333161, "grad_norm": 1.2039257287979126, "learning_rate": 1.1550032101256536e-05, "loss": 0.3814, "step": 28300 }, { "epoch": 5.56858695384918, "grad_norm": 0.8608440160751343, "learning_rate": 1.1546974838729404e-05, "loss": 0.3909, "step": 28310 }, { "epoch": 5.570553957365199, "grad_norm": 2.0225956439971924, "learning_rate": 1.154391757620227e-05, "loss": 0.5571, "step": 28320 }, { "epoch": 5.572520960881217, "grad_norm": 1.0121216773986816, "learning_rate": 1.1540860313675135e-05, "loss": 0.3789, "step": 28330 }, { "epoch": 5.574487964397236, "grad_norm": 0.8687868714332581, "learning_rate": 1.1537803051148003e-05, "loss": 0.2573, "step": 28340 }, { "epoch": 5.576454967913255, "grad_norm": 1.3339784145355225, "learning_rate": 1.1534745788620869e-05, "loss": 0.4801, "step": 28350 }, { "epoch": 5.5784219714292735, "grad_norm": 1.0207067728042603, "learning_rate": 1.1531688526093736e-05, "loss": 0.4482, "step": 28360 }, { "epoch": 5.580388974945293, "grad_norm": 1.1519733667373657, "learning_rate": 1.1528631263566604e-05, "loss": 0.4116, "step": 28370 }, { "epoch": 5.582355978461312, "grad_norm": 2.023810386657715, "learning_rate": 1.152557400103947e-05, "loss": 0.3789, "step": 28380 }, { "epoch": 5.58432298197733, "grad_norm": 1.0238547325134277, "learning_rate": 1.1522516738512337e-05, "loss": 0.3157, "step": 28390 }, { "epoch": 5.586289985493349, "grad_norm": 0.9904107451438904, "learning_rate": 1.1519459475985204e-05, "loss": 0.4655, "step": 28400 }, { "epoch": 5.588256989009368, "grad_norm": 1.5001986026763916, "learning_rate": 1.1516402213458072e-05, "loss": 0.3845, "step": 28410 }, { "epoch": 5.590223992525386, "grad_norm": 1.6891433000564575, "learning_rate": 1.1513344950930938e-05, "loss": 0.4138, "step": 28420 }, { "epoch": 5.592190996041405, "grad_norm": 2.148615598678589, "learning_rate": 1.1510287688403805e-05, "loss": 0.3071, "step": 28430 }, { "epoch": 5.594157999557424, "grad_norm": 0.9641100764274597, "learning_rate": 1.1507230425876673e-05, "loss": 0.3471, "step": 28440 }, { "epoch": 5.596125003073443, "grad_norm": 0.9903495907783508, "learning_rate": 1.1504173163349537e-05, "loss": 0.3853, "step": 28450 }, { "epoch": 5.598092006589462, "grad_norm": 1.5597717761993408, "learning_rate": 1.1501115900822404e-05, "loss": 0.3705, "step": 28460 }, { "epoch": 5.600059010105481, "grad_norm": 1.0873743295669556, "learning_rate": 1.1498058638295272e-05, "loss": 0.3527, "step": 28470 }, { "epoch": 5.602026013621499, "grad_norm": 1.7205636501312256, "learning_rate": 1.1495001375768137e-05, "loss": 0.311, "step": 28480 }, { "epoch": 5.603993017137518, "grad_norm": 2.3332083225250244, "learning_rate": 1.1491944113241005e-05, "loss": 0.3583, "step": 28490 }, { "epoch": 5.605960020653537, "grad_norm": 1.5018432140350342, "learning_rate": 1.1488886850713872e-05, "loss": 0.2985, "step": 28500 }, { "epoch": 5.605960020653537, "eval_loss": 0.15660564601421356, "eval_runtime": 8.8811, "eval_samples_per_second": 5.63, "eval_steps_per_second": 2.815, "step": 28500 }, { "epoch": 5.607927024169555, "grad_norm": 0.7703841924667358, "learning_rate": 1.1485829588186738e-05, "loss": 0.486, "step": 28510 }, { "epoch": 5.609894027685574, "grad_norm": 2.638970375061035, "learning_rate": 1.1482772325659606e-05, "loss": 0.3021, "step": 28520 }, { "epoch": 5.6118610312015935, "grad_norm": 0.8547009229660034, "learning_rate": 1.1479715063132473e-05, "loss": 0.3226, "step": 28530 }, { "epoch": 5.613828034717612, "grad_norm": 1.1358122825622559, "learning_rate": 1.147665780060534e-05, "loss": 0.4475, "step": 28540 }, { "epoch": 5.615795038233631, "grad_norm": 1.3914819955825806, "learning_rate": 1.1473600538078206e-05, "loss": 0.3538, "step": 28550 }, { "epoch": 5.61776204174965, "grad_norm": 1.1753898859024048, "learning_rate": 1.1470543275551072e-05, "loss": 0.2922, "step": 28560 }, { "epoch": 5.619729045265668, "grad_norm": 1.4788683652877808, "learning_rate": 1.1467486013023938e-05, "loss": 0.3902, "step": 28570 }, { "epoch": 5.621696048781687, "grad_norm": 1.8976927995681763, "learning_rate": 1.1464428750496805e-05, "loss": 0.468, "step": 28580 }, { "epoch": 5.623663052297706, "grad_norm": 0.8130624294281006, "learning_rate": 1.1461371487969673e-05, "loss": 0.4195, "step": 28590 }, { "epoch": 5.625630055813724, "grad_norm": 1.275107741355896, "learning_rate": 1.145831422544254e-05, "loss": 0.3311, "step": 28600 }, { "epoch": 5.6275970593297435, "grad_norm": 1.0753655433654785, "learning_rate": 1.1455256962915406e-05, "loss": 0.3383, "step": 28610 }, { "epoch": 5.629564062845763, "grad_norm": 0.929724931716919, "learning_rate": 1.1452199700388274e-05, "loss": 0.4467, "step": 28620 }, { "epoch": 5.631531066361781, "grad_norm": 1.3278186321258545, "learning_rate": 1.1449142437861141e-05, "loss": 0.3923, "step": 28630 }, { "epoch": 5.6334980698778, "grad_norm": 1.7642334699630737, "learning_rate": 1.1446085175334007e-05, "loss": 0.3315, "step": 28640 }, { "epoch": 5.635465073393819, "grad_norm": 2.041316032409668, "learning_rate": 1.1443027912806874e-05, "loss": 0.3939, "step": 28650 }, { "epoch": 5.637432076909837, "grad_norm": 1.0631572008132935, "learning_rate": 1.1439970650279742e-05, "loss": 0.3989, "step": 28660 }, { "epoch": 5.639399080425856, "grad_norm": 0.8136534094810486, "learning_rate": 1.1436913387752606e-05, "loss": 0.3533, "step": 28670 }, { "epoch": 5.641366083941875, "grad_norm": 0.8498640656471252, "learning_rate": 1.1433856125225473e-05, "loss": 0.3885, "step": 28680 }, { "epoch": 5.6433330874578935, "grad_norm": 1.0519620180130005, "learning_rate": 1.143079886269834e-05, "loss": 0.3664, "step": 28690 }, { "epoch": 5.645300090973913, "grad_norm": 2.0294244289398193, "learning_rate": 1.1427741600171207e-05, "loss": 0.2631, "step": 28700 }, { "epoch": 5.647267094489932, "grad_norm": 1.1428825855255127, "learning_rate": 1.1424684337644074e-05, "loss": 0.3553, "step": 28710 }, { "epoch": 5.64923409800595, "grad_norm": 1.2105053663253784, "learning_rate": 1.1421627075116941e-05, "loss": 0.2884, "step": 28720 }, { "epoch": 5.651201101521969, "grad_norm": 0.7727178931236267, "learning_rate": 1.1418569812589809e-05, "loss": 0.462, "step": 28730 }, { "epoch": 5.653168105037988, "grad_norm": 0.9051704406738281, "learning_rate": 1.1415512550062675e-05, "loss": 0.3031, "step": 28740 }, { "epoch": 5.655135108554006, "grad_norm": 0.8865196704864502, "learning_rate": 1.1412455287535542e-05, "loss": 0.4039, "step": 28750 }, { "epoch": 5.657102112070025, "grad_norm": 1.78319251537323, "learning_rate": 1.140939802500841e-05, "loss": 0.3758, "step": 28760 }, { "epoch": 5.659069115586044, "grad_norm": 1.8657971620559692, "learning_rate": 1.1406340762481275e-05, "loss": 0.3919, "step": 28770 }, { "epoch": 5.661036119102063, "grad_norm": 1.4746910333633423, "learning_rate": 1.1403283499954143e-05, "loss": 0.4362, "step": 28780 }, { "epoch": 5.663003122618082, "grad_norm": 1.1907963752746582, "learning_rate": 1.1400226237427007e-05, "loss": 0.3671, "step": 28790 }, { "epoch": 5.664970126134101, "grad_norm": 0.8082676529884338, "learning_rate": 1.1397168974899874e-05, "loss": 0.4336, "step": 28800 }, { "epoch": 5.666937129650119, "grad_norm": 0.8014049530029297, "learning_rate": 1.1394111712372742e-05, "loss": 0.3373, "step": 28810 }, { "epoch": 5.668904133166138, "grad_norm": 1.8428163528442383, "learning_rate": 1.139105444984561e-05, "loss": 0.3643, "step": 28820 }, { "epoch": 5.670871136682157, "grad_norm": 0.9919217228889465, "learning_rate": 1.1387997187318475e-05, "loss": 0.4063, "step": 28830 }, { "epoch": 5.672838140198175, "grad_norm": 0.7838166952133179, "learning_rate": 1.1384939924791343e-05, "loss": 0.395, "step": 28840 }, { "epoch": 5.674805143714194, "grad_norm": 0.8631680011749268, "learning_rate": 1.138188266226421e-05, "loss": 0.3382, "step": 28850 }, { "epoch": 5.6767721472302135, "grad_norm": 0.9368540644645691, "learning_rate": 1.1378825399737078e-05, "loss": 0.2909, "step": 28860 }, { "epoch": 5.678739150746232, "grad_norm": 1.333701252937317, "learning_rate": 1.1375768137209943e-05, "loss": 0.3619, "step": 28870 }, { "epoch": 5.680706154262251, "grad_norm": 1.3428051471710205, "learning_rate": 1.1372710874682811e-05, "loss": 0.37, "step": 28880 }, { "epoch": 5.68267315777827, "grad_norm": 1.2829957008361816, "learning_rate": 1.1369653612155678e-05, "loss": 0.3244, "step": 28890 }, { "epoch": 5.684640161294288, "grad_norm": 2.1554901599884033, "learning_rate": 1.1366596349628542e-05, "loss": 0.2984, "step": 28900 }, { "epoch": 5.686607164810307, "grad_norm": 2.656132936477661, "learning_rate": 1.136353908710141e-05, "loss": 0.4132, "step": 28910 }, { "epoch": 5.688574168326326, "grad_norm": 0.8451870679855347, "learning_rate": 1.1360481824574276e-05, "loss": 0.4124, "step": 28920 }, { "epoch": 5.690541171842344, "grad_norm": 0.7421035170555115, "learning_rate": 1.1357424562047143e-05, "loss": 0.3635, "step": 28930 }, { "epoch": 5.6925081753583635, "grad_norm": 2.9646100997924805, "learning_rate": 1.135436729952001e-05, "loss": 0.4154, "step": 28940 }, { "epoch": 5.694475178874383, "grad_norm": 1.0777528285980225, "learning_rate": 1.1351310036992878e-05, "loss": 0.4434, "step": 28950 }, { "epoch": 5.696442182390401, "grad_norm": 1.156477689743042, "learning_rate": 1.1348252774465744e-05, "loss": 0.3711, "step": 28960 }, { "epoch": 5.69840918590642, "grad_norm": 2.4916858673095703, "learning_rate": 1.1345195511938611e-05, "loss": 0.3715, "step": 28970 }, { "epoch": 5.700376189422439, "grad_norm": 1.9382354021072388, "learning_rate": 1.1342138249411479e-05, "loss": 0.3031, "step": 28980 }, { "epoch": 5.702343192938457, "grad_norm": 0.9780879020690918, "learning_rate": 1.1339080986884346e-05, "loss": 0.4033, "step": 28990 }, { "epoch": 5.704310196454476, "grad_norm": 0.7445433139801025, "learning_rate": 1.1336023724357212e-05, "loss": 0.3011, "step": 29000 }, { "epoch": 5.704310196454476, "eval_loss": 0.15423063933849335, "eval_runtime": 8.8909, "eval_samples_per_second": 5.624, "eval_steps_per_second": 2.812, "step": 29000 }, { "epoch": 5.706277199970495, "grad_norm": 0.8199791312217712, "learning_rate": 1.1332966461830078e-05, "loss": 0.5285, "step": 29010 }, { "epoch": 5.7082442034865135, "grad_norm": 1.2353311777114868, "learning_rate": 1.1329909199302944e-05, "loss": 0.4156, "step": 29020 }, { "epoch": 5.710211207002533, "grad_norm": 0.6677939295768738, "learning_rate": 1.1326851936775811e-05, "loss": 0.3689, "step": 29030 }, { "epoch": 5.712178210518552, "grad_norm": 0.8684714436531067, "learning_rate": 1.1323794674248679e-05, "loss": 0.2528, "step": 29040 }, { "epoch": 5.71414521403457, "grad_norm": 1.1042633056640625, "learning_rate": 1.1320737411721544e-05, "loss": 0.4604, "step": 29050 }, { "epoch": 5.716112217550589, "grad_norm": 1.7766307592391968, "learning_rate": 1.1317680149194412e-05, "loss": 0.3185, "step": 29060 }, { "epoch": 5.718079221066608, "grad_norm": 1.5943020582199097, "learning_rate": 1.131462288666728e-05, "loss": 0.4921, "step": 29070 }, { "epoch": 5.720046224582626, "grad_norm": 0.9290481209754944, "learning_rate": 1.1311565624140147e-05, "loss": 0.3609, "step": 29080 }, { "epoch": 5.722013228098645, "grad_norm": 1.2227500677108765, "learning_rate": 1.1308508361613013e-05, "loss": 0.2812, "step": 29090 }, { "epoch": 5.723980231614664, "grad_norm": 1.03496515750885, "learning_rate": 1.130545109908588e-05, "loss": 0.4757, "step": 29100 }, { "epoch": 5.725947235130683, "grad_norm": 1.3645904064178467, "learning_rate": 1.1302393836558747e-05, "loss": 0.5169, "step": 29110 }, { "epoch": 5.727914238646702, "grad_norm": 1.264628529548645, "learning_rate": 1.1299336574031615e-05, "loss": 0.4321, "step": 29120 }, { "epoch": 5.729881242162721, "grad_norm": 1.108580231666565, "learning_rate": 1.1296279311504479e-05, "loss": 0.3469, "step": 29130 }, { "epoch": 5.731848245678739, "grad_norm": 1.319706678390503, "learning_rate": 1.1293222048977347e-05, "loss": 0.4453, "step": 29140 }, { "epoch": 5.733815249194758, "grad_norm": 1.1196752786636353, "learning_rate": 1.1290164786450212e-05, "loss": 0.5532, "step": 29150 }, { "epoch": 5.735782252710777, "grad_norm": 1.2848409414291382, "learning_rate": 1.128710752392308e-05, "loss": 0.4147, "step": 29160 }, { "epoch": 5.737749256226795, "grad_norm": 1.723487377166748, "learning_rate": 1.1284050261395947e-05, "loss": 0.3738, "step": 29170 }, { "epoch": 5.739716259742814, "grad_norm": 0.617231547832489, "learning_rate": 1.1280992998868813e-05, "loss": 0.368, "step": 29180 }, { "epoch": 5.7416832632588335, "grad_norm": 1.5116612911224365, "learning_rate": 1.127793573634168e-05, "loss": 0.4737, "step": 29190 }, { "epoch": 5.743650266774852, "grad_norm": 1.343389868736267, "learning_rate": 1.1274878473814548e-05, "loss": 0.3062, "step": 29200 }, { "epoch": 5.745617270290871, "grad_norm": 0.6800724864006042, "learning_rate": 1.1271821211287415e-05, "loss": 0.4265, "step": 29210 }, { "epoch": 5.74758427380689, "grad_norm": 1.4235820770263672, "learning_rate": 1.1268763948760281e-05, "loss": 0.4251, "step": 29220 }, { "epoch": 5.749551277322908, "grad_norm": 1.607358455657959, "learning_rate": 1.1265706686233149e-05, "loss": 0.3046, "step": 29230 }, { "epoch": 5.751518280838927, "grad_norm": 1.0604151487350464, "learning_rate": 1.1262649423706013e-05, "loss": 0.2405, "step": 29240 }, { "epoch": 5.753485284354946, "grad_norm": 1.6413496732711792, "learning_rate": 1.125959216117888e-05, "loss": 0.3785, "step": 29250 }, { "epoch": 5.755452287870964, "grad_norm": 2.484588146209717, "learning_rate": 1.1256534898651748e-05, "loss": 0.4283, "step": 29260 }, { "epoch": 5.7574192913869835, "grad_norm": 0.934377133846283, "learning_rate": 1.1253477636124615e-05, "loss": 0.4663, "step": 29270 }, { "epoch": 5.759386294903003, "grad_norm": 0.6259771585464478, "learning_rate": 1.1250420373597481e-05, "loss": 0.4307, "step": 29280 }, { "epoch": 5.761353298419021, "grad_norm": 1.0241243839263916, "learning_rate": 1.1247363111070348e-05, "loss": 0.3361, "step": 29290 }, { "epoch": 5.76332030193504, "grad_norm": 1.6852381229400635, "learning_rate": 1.1244305848543216e-05, "loss": 0.3302, "step": 29300 }, { "epoch": 5.765287305451059, "grad_norm": 1.2226417064666748, "learning_rate": 1.1241248586016082e-05, "loss": 0.2649, "step": 29310 }, { "epoch": 5.767254308967077, "grad_norm": 0.9367130398750305, "learning_rate": 1.1238191323488949e-05, "loss": 0.3448, "step": 29320 }, { "epoch": 5.769221312483096, "grad_norm": 1.9569463729858398, "learning_rate": 1.1235134060961817e-05, "loss": 0.291, "step": 29330 }, { "epoch": 5.771188315999115, "grad_norm": 1.3722015619277954, "learning_rate": 1.1232076798434684e-05, "loss": 0.3712, "step": 29340 }, { "epoch": 5.7731553195151335, "grad_norm": 1.0473368167877197, "learning_rate": 1.1229019535907548e-05, "loss": 0.3014, "step": 29350 }, { "epoch": 5.775122323031153, "grad_norm": 1.4684929847717285, "learning_rate": 1.1225962273380416e-05, "loss": 0.279, "step": 29360 }, { "epoch": 5.777089326547172, "grad_norm": 1.4670180082321167, "learning_rate": 1.1222905010853281e-05, "loss": 0.4775, "step": 29370 }, { "epoch": 5.77905633006319, "grad_norm": 1.68386709690094, "learning_rate": 1.1219847748326149e-05, "loss": 0.3514, "step": 29380 }, { "epoch": 5.781023333579209, "grad_norm": 1.5234280824661255, "learning_rate": 1.1216790485799016e-05, "loss": 0.5099, "step": 29390 }, { "epoch": 5.782990337095228, "grad_norm": 2.241706609725952, "learning_rate": 1.1213733223271884e-05, "loss": 0.4748, "step": 29400 }, { "epoch": 5.784957340611246, "grad_norm": 0.9543262124061584, "learning_rate": 1.121067596074475e-05, "loss": 0.3328, "step": 29410 }, { "epoch": 5.786924344127265, "grad_norm": 0.9000210165977478, "learning_rate": 1.1207618698217617e-05, "loss": 0.4743, "step": 29420 }, { "epoch": 5.788891347643284, "grad_norm": 1.6912353038787842, "learning_rate": 1.1204561435690485e-05, "loss": 0.4446, "step": 29430 }, { "epoch": 5.790858351159303, "grad_norm": 1.8903340101242065, "learning_rate": 1.120150417316335e-05, "loss": 0.3844, "step": 29440 }, { "epoch": 5.792825354675322, "grad_norm": 1.5042668581008911, "learning_rate": 1.1198446910636218e-05, "loss": 0.5255, "step": 29450 }, { "epoch": 5.794792358191341, "grad_norm": 1.3050025701522827, "learning_rate": 1.1195389648109085e-05, "loss": 0.391, "step": 29460 }, { "epoch": 5.796759361707359, "grad_norm": 1.804996371269226, "learning_rate": 1.119233238558195e-05, "loss": 0.4965, "step": 29470 }, { "epoch": 5.798726365223378, "grad_norm": 1.802411437034607, "learning_rate": 1.1189275123054817e-05, "loss": 0.3778, "step": 29480 }, { "epoch": 5.800693368739397, "grad_norm": 1.2030383348464966, "learning_rate": 1.1186217860527684e-05, "loss": 0.3617, "step": 29490 }, { "epoch": 5.802660372255415, "grad_norm": 1.6366369724273682, "learning_rate": 1.118316059800055e-05, "loss": 0.3232, "step": 29500 }, { "epoch": 5.802660372255415, "eval_loss": 0.1477995663881302, "eval_runtime": 8.8723, "eval_samples_per_second": 5.635, "eval_steps_per_second": 2.818, "step": 29500 }, { "epoch": 5.804627375771434, "grad_norm": 0.9681687951087952, "learning_rate": 1.1180103335473418e-05, "loss": 0.37, "step": 29510 }, { "epoch": 5.806594379287453, "grad_norm": 1.039945363998413, "learning_rate": 1.1177046072946285e-05, "loss": 0.3007, "step": 29520 }, { "epoch": 5.808561382803472, "grad_norm": 1.011750340461731, "learning_rate": 1.1173988810419152e-05, "loss": 0.362, "step": 29530 }, { "epoch": 5.810528386319491, "grad_norm": 1.4740638732910156, "learning_rate": 1.1170931547892018e-05, "loss": 0.4319, "step": 29540 }, { "epoch": 5.812495389835509, "grad_norm": 0.6471105813980103, "learning_rate": 1.1167874285364886e-05, "loss": 0.3625, "step": 29550 }, { "epoch": 5.814462393351528, "grad_norm": 0.8530930876731873, "learning_rate": 1.1164817022837753e-05, "loss": 0.2879, "step": 29560 }, { "epoch": 5.816429396867547, "grad_norm": 2.395834445953369, "learning_rate": 1.1161759760310619e-05, "loss": 0.3562, "step": 29570 }, { "epoch": 5.818396400383565, "grad_norm": 1.0060678720474243, "learning_rate": 1.1158702497783485e-05, "loss": 0.3603, "step": 29580 }, { "epoch": 5.820363403899584, "grad_norm": 0.6827914714813232, "learning_rate": 1.1155645235256352e-05, "loss": 0.3915, "step": 29590 }, { "epoch": 5.8223304074156035, "grad_norm": 1.3076701164245605, "learning_rate": 1.1152587972729218e-05, "loss": 0.3659, "step": 29600 }, { "epoch": 5.824297410931622, "grad_norm": 1.4336459636688232, "learning_rate": 1.1149530710202085e-05, "loss": 0.3897, "step": 29610 }, { "epoch": 5.826264414447641, "grad_norm": 2.1561131477355957, "learning_rate": 1.1146473447674953e-05, "loss": 0.5855, "step": 29620 }, { "epoch": 5.82823141796366, "grad_norm": 1.179270625114441, "learning_rate": 1.1143416185147819e-05, "loss": 0.3869, "step": 29630 }, { "epoch": 5.830198421479678, "grad_norm": 1.5720471143722534, "learning_rate": 1.1140358922620686e-05, "loss": 0.4048, "step": 29640 }, { "epoch": 5.832165424995697, "grad_norm": 2.540081024169922, "learning_rate": 1.1137301660093554e-05, "loss": 0.398, "step": 29650 }, { "epoch": 5.834132428511716, "grad_norm": 0.8215223550796509, "learning_rate": 1.1134244397566421e-05, "loss": 0.3492, "step": 29660 }, { "epoch": 5.836099432027734, "grad_norm": 1.6602963209152222, "learning_rate": 1.1131187135039287e-05, "loss": 0.4209, "step": 29670 }, { "epoch": 5.8380664355437535, "grad_norm": 1.6773931980133057, "learning_rate": 1.1128129872512154e-05, "loss": 0.3097, "step": 29680 }, { "epoch": 5.840033439059773, "grad_norm": 1.333003044128418, "learning_rate": 1.1125072609985018e-05, "loss": 0.3265, "step": 29690 }, { "epoch": 5.842000442575791, "grad_norm": 2.068171262741089, "learning_rate": 1.1122015347457886e-05, "loss": 0.26, "step": 29700 }, { "epoch": 5.84396744609181, "grad_norm": 0.8129364252090454, "learning_rate": 1.1118958084930753e-05, "loss": 0.2799, "step": 29710 }, { "epoch": 5.845934449607829, "grad_norm": 0.8760831952095032, "learning_rate": 1.1115900822403621e-05, "loss": 0.5451, "step": 29720 }, { "epoch": 5.847901453123847, "grad_norm": 1.9979091882705688, "learning_rate": 1.1112843559876487e-05, "loss": 0.3071, "step": 29730 }, { "epoch": 5.849868456639866, "grad_norm": 2.7981607913970947, "learning_rate": 1.1109786297349354e-05, "loss": 0.404, "step": 29740 }, { "epoch": 5.851835460155885, "grad_norm": 0.3538930416107178, "learning_rate": 1.1106729034822222e-05, "loss": 0.3743, "step": 29750 }, { "epoch": 5.8538024636719035, "grad_norm": 0.823993980884552, "learning_rate": 1.1103671772295087e-05, "loss": 0.298, "step": 29760 }, { "epoch": 5.855769467187923, "grad_norm": 0.8794128894805908, "learning_rate": 1.1100614509767955e-05, "loss": 0.3656, "step": 29770 }, { "epoch": 5.857736470703942, "grad_norm": 0.6292213797569275, "learning_rate": 1.1097557247240822e-05, "loss": 0.2991, "step": 29780 }, { "epoch": 5.85970347421996, "grad_norm": 1.622949481010437, "learning_rate": 1.109449998471369e-05, "loss": 0.4715, "step": 29790 }, { "epoch": 5.861670477735979, "grad_norm": 0.844419538974762, "learning_rate": 1.1091442722186556e-05, "loss": 0.3423, "step": 29800 }, { "epoch": 5.863637481251998, "grad_norm": 0.8577235341072083, "learning_rate": 1.1088385459659421e-05, "loss": 0.3279, "step": 29810 }, { "epoch": 5.865604484768016, "grad_norm": 1.3123620748519897, "learning_rate": 1.1085328197132287e-05, "loss": 0.3767, "step": 29820 }, { "epoch": 5.867571488284035, "grad_norm": 1.5416537523269653, "learning_rate": 1.1082270934605155e-05, "loss": 0.3926, "step": 29830 }, { "epoch": 5.869538491800054, "grad_norm": 1.229455590248108, "learning_rate": 1.1079213672078022e-05, "loss": 0.3863, "step": 29840 }, { "epoch": 5.871505495316073, "grad_norm": 1.6253336668014526, "learning_rate": 1.107615640955089e-05, "loss": 0.4067, "step": 29850 }, { "epoch": 5.873472498832092, "grad_norm": 1.830370306968689, "learning_rate": 1.1073099147023755e-05, "loss": 0.3894, "step": 29860 }, { "epoch": 5.875439502348111, "grad_norm": 1.366328239440918, "learning_rate": 1.1070041884496623e-05, "loss": 0.2714, "step": 29870 }, { "epoch": 5.877406505864129, "grad_norm": 0.6919177770614624, "learning_rate": 1.106698462196949e-05, "loss": 0.3454, "step": 29880 }, { "epoch": 5.879373509380148, "grad_norm": 0.7361364364624023, "learning_rate": 1.1063927359442356e-05, "loss": 0.4565, "step": 29890 }, { "epoch": 5.881340512896167, "grad_norm": 0.7951186299324036, "learning_rate": 1.1060870096915224e-05, "loss": 0.4556, "step": 29900 }, { "epoch": 5.883307516412185, "grad_norm": 2.3393678665161133, "learning_rate": 1.1057812834388091e-05, "loss": 0.4803, "step": 29910 }, { "epoch": 5.885274519928204, "grad_norm": 0.8017082810401917, "learning_rate": 1.1054755571860955e-05, "loss": 0.4325, "step": 29920 }, { "epoch": 5.8872415234442235, "grad_norm": 2.544628143310547, "learning_rate": 1.1051698309333823e-05, "loss": 0.3766, "step": 29930 }, { "epoch": 5.889208526960242, "grad_norm": 0.9725555777549744, "learning_rate": 1.104864104680669e-05, "loss": 0.3983, "step": 29940 }, { "epoch": 5.891175530476261, "grad_norm": 0.8429176807403564, "learning_rate": 1.1045583784279556e-05, "loss": 0.3185, "step": 29950 }, { "epoch": 5.89314253399228, "grad_norm": 1.1367381811141968, "learning_rate": 1.1042526521752423e-05, "loss": 0.2792, "step": 29960 }, { "epoch": 5.895109537508298, "grad_norm": 1.9774765968322754, "learning_rate": 1.103946925922529e-05, "loss": 0.4607, "step": 29970 }, { "epoch": 5.897076541024317, "grad_norm": 0.982170045375824, "learning_rate": 1.1036411996698158e-05, "loss": 0.3693, "step": 29980 }, { "epoch": 5.899043544540336, "grad_norm": 0.7595378160476685, "learning_rate": 1.1033354734171024e-05, "loss": 0.2639, "step": 29990 }, { "epoch": 5.901010548056354, "grad_norm": 1.9132200479507446, "learning_rate": 1.1030297471643891e-05, "loss": 0.2905, "step": 30000 }, { "epoch": 5.901010548056354, "eval_loss": 0.1469283252954483, "eval_runtime": 8.9116, "eval_samples_per_second": 5.611, "eval_steps_per_second": 2.805, "step": 30000 }, { "epoch": 5.9029775515723735, "grad_norm": 1.2966291904449463, "learning_rate": 1.1027240209116759e-05, "loss": 0.2171, "step": 30010 }, { "epoch": 5.9049445550883926, "grad_norm": 1.0475329160690308, "learning_rate": 1.1024182946589625e-05, "loss": 0.3754, "step": 30020 }, { "epoch": 5.906911558604411, "grad_norm": 1.6951876878738403, "learning_rate": 1.102112568406249e-05, "loss": 0.4807, "step": 30030 }, { "epoch": 5.90887856212043, "grad_norm": 1.9632104635238647, "learning_rate": 1.1018068421535358e-05, "loss": 0.339, "step": 30040 }, { "epoch": 5.910845565636449, "grad_norm": 1.9631885290145874, "learning_rate": 1.1015011159008224e-05, "loss": 0.3866, "step": 30050 }, { "epoch": 5.912812569152467, "grad_norm": 1.97544264793396, "learning_rate": 1.1011953896481091e-05, "loss": 0.3408, "step": 30060 }, { "epoch": 5.914779572668486, "grad_norm": 1.4105381965637207, "learning_rate": 1.1008896633953959e-05, "loss": 0.4061, "step": 30070 }, { "epoch": 5.916746576184505, "grad_norm": 1.1220061779022217, "learning_rate": 1.1005839371426824e-05, "loss": 0.4103, "step": 30080 }, { "epoch": 5.9187135797005235, "grad_norm": 0.9747940897941589, "learning_rate": 1.1002782108899692e-05, "loss": 0.3795, "step": 30090 }, { "epoch": 5.920680583216543, "grad_norm": 1.0602132081985474, "learning_rate": 1.099972484637256e-05, "loss": 0.3018, "step": 30100 }, { "epoch": 5.922647586732562, "grad_norm": 1.2885088920593262, "learning_rate": 1.0996667583845427e-05, "loss": 0.4286, "step": 30110 }, { "epoch": 5.92461459024858, "grad_norm": 0.9011586904525757, "learning_rate": 1.0993610321318293e-05, "loss": 0.393, "step": 30120 }, { "epoch": 5.926581593764599, "grad_norm": 1.460485816001892, "learning_rate": 1.099055305879116e-05, "loss": 0.3156, "step": 30130 }, { "epoch": 5.928548597280618, "grad_norm": 0.9864498972892761, "learning_rate": 1.0987495796264028e-05, "loss": 0.5503, "step": 30140 }, { "epoch": 5.930515600796636, "grad_norm": 1.3364967107772827, "learning_rate": 1.0984438533736892e-05, "loss": 0.3924, "step": 30150 }, { "epoch": 5.932482604312655, "grad_norm": 1.594724416732788, "learning_rate": 1.098138127120976e-05, "loss": 0.3058, "step": 30160 }, { "epoch": 5.934449607828674, "grad_norm": 1.7341972589492798, "learning_rate": 1.0978324008682627e-05, "loss": 0.5161, "step": 30170 }, { "epoch": 5.936416611344693, "grad_norm": 1.722612977027893, "learning_rate": 1.0975266746155492e-05, "loss": 0.4114, "step": 30180 }, { "epoch": 5.938383614860712, "grad_norm": 2.482858419418335, "learning_rate": 1.097220948362836e-05, "loss": 0.3087, "step": 30190 }, { "epoch": 5.940350618376731, "grad_norm": 1.0486516952514648, "learning_rate": 1.0969152221101227e-05, "loss": 0.3556, "step": 30200 }, { "epoch": 5.942317621892749, "grad_norm": 1.137136697769165, "learning_rate": 1.0966094958574093e-05, "loss": 0.4288, "step": 30210 }, { "epoch": 5.944284625408768, "grad_norm": 1.8614591360092163, "learning_rate": 1.096303769604696e-05, "loss": 0.3394, "step": 30220 }, { "epoch": 5.946251628924787, "grad_norm": 1.0514382123947144, "learning_rate": 1.0959980433519828e-05, "loss": 0.4297, "step": 30230 }, { "epoch": 5.948218632440805, "grad_norm": 0.6810349822044373, "learning_rate": 1.0956923170992696e-05, "loss": 0.3954, "step": 30240 }, { "epoch": 5.950185635956824, "grad_norm": 1.3290073871612549, "learning_rate": 1.0953865908465561e-05, "loss": 0.3973, "step": 30250 }, { "epoch": 5.952152639472843, "grad_norm": 1.5532108545303345, "learning_rate": 1.0950808645938427e-05, "loss": 0.3683, "step": 30260 }, { "epoch": 5.954119642988862, "grad_norm": 0.6228435635566711, "learning_rate": 1.0947751383411293e-05, "loss": 0.4073, "step": 30270 }, { "epoch": 5.956086646504881, "grad_norm": 1.2247480154037476, "learning_rate": 1.094469412088416e-05, "loss": 0.4175, "step": 30280 }, { "epoch": 5.958053650020899, "grad_norm": 1.8101277351379395, "learning_rate": 1.0941636858357028e-05, "loss": 0.3097, "step": 30290 }, { "epoch": 5.960020653536918, "grad_norm": 0.718777596950531, "learning_rate": 1.0938579595829895e-05, "loss": 0.2533, "step": 30300 }, { "epoch": 5.961987657052937, "grad_norm": 0.6376405954360962, "learning_rate": 1.0935522333302761e-05, "loss": 0.3094, "step": 30310 }, { "epoch": 5.963954660568955, "grad_norm": 1.3167697191238403, "learning_rate": 1.0932465070775629e-05, "loss": 0.3132, "step": 30320 }, { "epoch": 5.965921664084974, "grad_norm": 1.6995993852615356, "learning_rate": 1.0929407808248496e-05, "loss": 0.3559, "step": 30330 }, { "epoch": 5.9678886676009935, "grad_norm": 1.4039970636367798, "learning_rate": 1.0926350545721362e-05, "loss": 0.3164, "step": 30340 }, { "epoch": 5.969855671117012, "grad_norm": 2.015141487121582, "learning_rate": 1.092329328319423e-05, "loss": 0.3403, "step": 30350 }, { "epoch": 5.971822674633031, "grad_norm": 1.9489538669586182, "learning_rate": 1.0920236020667097e-05, "loss": 0.4236, "step": 30360 }, { "epoch": 5.97378967814905, "grad_norm": 1.7877446413040161, "learning_rate": 1.091717875813996e-05, "loss": 0.3764, "step": 30370 }, { "epoch": 5.975756681665068, "grad_norm": 1.0314801931381226, "learning_rate": 1.0914121495612828e-05, "loss": 0.3748, "step": 30380 }, { "epoch": 5.977723685181087, "grad_norm": 0.8923133611679077, "learning_rate": 1.0911064233085696e-05, "loss": 0.3876, "step": 30390 }, { "epoch": 5.979690688697106, "grad_norm": 0.8820732831954956, "learning_rate": 1.0908006970558562e-05, "loss": 0.3788, "step": 30400 }, { "epoch": 5.981657692213124, "grad_norm": 2.1888816356658936, "learning_rate": 1.0904949708031429e-05, "loss": 0.4475, "step": 30410 }, { "epoch": 5.9836246957291435, "grad_norm": 1.369350790977478, "learning_rate": 1.0901892445504297e-05, "loss": 0.4265, "step": 30420 }, { "epoch": 5.9855916992451625, "grad_norm": 0.6697467565536499, "learning_rate": 1.0898835182977164e-05, "loss": 0.4157, "step": 30430 }, { "epoch": 5.987558702761181, "grad_norm": 1.720997929573059, "learning_rate": 1.089577792045003e-05, "loss": 0.4813, "step": 30440 }, { "epoch": 5.9895257062772, "grad_norm": 0.8081413507461548, "learning_rate": 1.0892720657922897e-05, "loss": 0.4658, "step": 30450 }, { "epoch": 5.991492709793219, "grad_norm": 1.599493384361267, "learning_rate": 1.0889663395395765e-05, "loss": 0.3787, "step": 30460 }, { "epoch": 5.993459713309237, "grad_norm": 1.8009963035583496, "learning_rate": 1.088660613286863e-05, "loss": 0.3958, "step": 30470 }, { "epoch": 5.995426716825256, "grad_norm": 1.6714438199996948, "learning_rate": 1.0883548870341498e-05, "loss": 0.3767, "step": 30480 }, { "epoch": 5.997393720341275, "grad_norm": 0.8772445321083069, "learning_rate": 1.0880491607814364e-05, "loss": 0.3406, "step": 30490 }, { "epoch": 5.9993607238572935, "grad_norm": 1.883542776107788, "learning_rate": 1.087743434528723e-05, "loss": 0.2961, "step": 30500 }, { "epoch": 5.9993607238572935, "eval_loss": 0.14722052216529846, "eval_runtime": 8.9135, "eval_samples_per_second": 5.609, "eval_steps_per_second": 2.805, "step": 30500 }, { "epoch": 6.0013277273733125, "grad_norm": 2.056985378265381, "learning_rate": 1.0874377082760097e-05, "loss": 0.3812, "step": 30510 }, { "epoch": 6.003294730889332, "grad_norm": 1.3648536205291748, "learning_rate": 1.0871319820232964e-05, "loss": 0.3523, "step": 30520 }, { "epoch": 6.00526173440535, "grad_norm": 1.4668800830841064, "learning_rate": 1.086826255770583e-05, "loss": 0.3414, "step": 30530 }, { "epoch": 6.007228737921369, "grad_norm": 1.6990594863891602, "learning_rate": 1.0865205295178698e-05, "loss": 0.4042, "step": 30540 }, { "epoch": 6.009195741437388, "grad_norm": 1.3983603715896606, "learning_rate": 1.0862148032651565e-05, "loss": 0.3461, "step": 30550 }, { "epoch": 6.011162744953406, "grad_norm": 0.9408298134803772, "learning_rate": 1.0859090770124433e-05, "loss": 0.2446, "step": 30560 }, { "epoch": 6.013129748469425, "grad_norm": 0.7286332845687866, "learning_rate": 1.0856033507597298e-05, "loss": 0.4312, "step": 30570 }, { "epoch": 6.015096751985444, "grad_norm": 0.896022617816925, "learning_rate": 1.0852976245070166e-05, "loss": 0.297, "step": 30580 }, { "epoch": 6.0170637555014626, "grad_norm": 1.477001667022705, "learning_rate": 1.0849918982543033e-05, "loss": 0.4265, "step": 30590 }, { "epoch": 6.019030759017482, "grad_norm": 1.0923590660095215, "learning_rate": 1.0846861720015897e-05, "loss": 0.3933, "step": 30600 }, { "epoch": 6.020997762533501, "grad_norm": 0.9560948610305786, "learning_rate": 1.0843804457488765e-05, "loss": 0.3813, "step": 30610 }, { "epoch": 6.022964766049519, "grad_norm": 1.1864969730377197, "learning_rate": 1.0840747194961632e-05, "loss": 0.3759, "step": 30620 }, { "epoch": 6.024931769565538, "grad_norm": 0.9665082097053528, "learning_rate": 1.0837689932434498e-05, "loss": 0.3564, "step": 30630 }, { "epoch": 6.026898773081557, "grad_norm": 1.4572361707687378, "learning_rate": 1.0834632669907366e-05, "loss": 0.3752, "step": 30640 }, { "epoch": 6.028865776597575, "grad_norm": 1.0030657052993774, "learning_rate": 1.0831575407380233e-05, "loss": 0.2882, "step": 30650 }, { "epoch": 6.030832780113594, "grad_norm": 0.9930024147033691, "learning_rate": 1.0828518144853099e-05, "loss": 0.3293, "step": 30660 }, { "epoch": 6.032799783629613, "grad_norm": 2.801129102706909, "learning_rate": 1.0825460882325966e-05, "loss": 0.4869, "step": 30670 }, { "epoch": 6.034766787145632, "grad_norm": 1.1921310424804688, "learning_rate": 1.0822403619798834e-05, "loss": 0.3916, "step": 30680 }, { "epoch": 6.036733790661651, "grad_norm": 0.4918254017829895, "learning_rate": 1.0819346357271701e-05, "loss": 0.3047, "step": 30690 }, { "epoch": 6.03870079417767, "grad_norm": 0.8728923797607422, "learning_rate": 1.0816289094744567e-05, "loss": 0.3435, "step": 30700 }, { "epoch": 6.040667797693688, "grad_norm": 1.234466791152954, "learning_rate": 1.0813231832217433e-05, "loss": 0.5229, "step": 30710 }, { "epoch": 6.042634801209707, "grad_norm": 4.090738296508789, "learning_rate": 1.0810174569690299e-05, "loss": 0.3178, "step": 30720 }, { "epoch": 6.044601804725726, "grad_norm": 2.3472020626068115, "learning_rate": 1.0807117307163166e-05, "loss": 0.305, "step": 30730 }, { "epoch": 6.046568808241744, "grad_norm": 1.0088740587234497, "learning_rate": 1.0804060044636034e-05, "loss": 0.3439, "step": 30740 }, { "epoch": 6.0485358117577634, "grad_norm": 1.0725480318069458, "learning_rate": 1.0801002782108901e-05, "loss": 0.3794, "step": 30750 }, { "epoch": 6.0505028152737825, "grad_norm": 0.7405242919921875, "learning_rate": 1.0797945519581767e-05, "loss": 0.3402, "step": 30760 }, { "epoch": 6.052469818789801, "grad_norm": 1.314536690711975, "learning_rate": 1.0794888257054634e-05, "loss": 0.3956, "step": 30770 }, { "epoch": 6.05443682230582, "grad_norm": 0.7027092576026917, "learning_rate": 1.0791830994527502e-05, "loss": 0.3186, "step": 30780 }, { "epoch": 6.056403825821839, "grad_norm": 1.0569257736206055, "learning_rate": 1.0788773732000368e-05, "loss": 0.4329, "step": 30790 }, { "epoch": 6.058370829337857, "grad_norm": 1.2603479623794556, "learning_rate": 1.0785716469473235e-05, "loss": 0.3964, "step": 30800 }, { "epoch": 6.060337832853876, "grad_norm": 1.2180461883544922, "learning_rate": 1.0782659206946103e-05, "loss": 0.3594, "step": 30810 }, { "epoch": 6.062304836369895, "grad_norm": 1.306077003479004, "learning_rate": 1.077960194441897e-05, "loss": 0.4708, "step": 30820 }, { "epoch": 6.0642718398859135, "grad_norm": 1.0077178478240967, "learning_rate": 1.0776544681891834e-05, "loss": 0.3159, "step": 30830 }, { "epoch": 6.0662388434019325, "grad_norm": 0.9312611818313599, "learning_rate": 1.0773487419364702e-05, "loss": 0.3526, "step": 30840 }, { "epoch": 6.068205846917952, "grad_norm": 0.8618937134742737, "learning_rate": 1.0770430156837567e-05, "loss": 0.3417, "step": 30850 }, { "epoch": 6.07017285043397, "grad_norm": 0.825440526008606, "learning_rate": 1.0767372894310435e-05, "loss": 0.3364, "step": 30860 }, { "epoch": 6.072139853949989, "grad_norm": 0.8122813105583191, "learning_rate": 1.0764315631783302e-05, "loss": 0.421, "step": 30870 }, { "epoch": 6.074106857466008, "grad_norm": 1.007061243057251, "learning_rate": 1.076125836925617e-05, "loss": 0.341, "step": 30880 }, { "epoch": 6.076073860982026, "grad_norm": 0.4585067629814148, "learning_rate": 1.0758201106729035e-05, "loss": 0.3691, "step": 30890 }, { "epoch": 6.078040864498045, "grad_norm": 0.7221927046775818, "learning_rate": 1.0755143844201903e-05, "loss": 0.3342, "step": 30900 }, { "epoch": 6.080007868014064, "grad_norm": 1.0165441036224365, "learning_rate": 1.075208658167477e-05, "loss": 0.477, "step": 30910 }, { "epoch": 6.0819748715300825, "grad_norm": 1.446653962135315, "learning_rate": 1.0749029319147636e-05, "loss": 0.3424, "step": 30920 }, { "epoch": 6.083941875046102, "grad_norm": 0.7181447148323059, "learning_rate": 1.0745972056620504e-05, "loss": 0.2247, "step": 30930 }, { "epoch": 6.085908878562121, "grad_norm": 0.5309480428695679, "learning_rate": 1.074291479409337e-05, "loss": 0.4787, "step": 30940 }, { "epoch": 6.087875882078139, "grad_norm": 0.8217592835426331, "learning_rate": 1.0739857531566235e-05, "loss": 0.3421, "step": 30950 }, { "epoch": 6.089842885594158, "grad_norm": 1.1308156251907349, "learning_rate": 1.0736800269039103e-05, "loss": 0.3263, "step": 30960 }, { "epoch": 6.091809889110177, "grad_norm": 0.877742350101471, "learning_rate": 1.073374300651197e-05, "loss": 0.3078, "step": 30970 }, { "epoch": 6.093776892626195, "grad_norm": 1.2607346773147583, "learning_rate": 1.0730685743984836e-05, "loss": 0.51, "step": 30980 }, { "epoch": 6.095743896142214, "grad_norm": 1.0867124795913696, "learning_rate": 1.0727628481457703e-05, "loss": 0.2541, "step": 30990 }, { "epoch": 6.097710899658233, "grad_norm": 1.2764980792999268, "learning_rate": 1.0724571218930571e-05, "loss": 0.3536, "step": 31000 }, { "epoch": 6.097710899658233, "eval_loss": 0.14491592347621918, "eval_runtime": 8.8633, "eval_samples_per_second": 5.641, "eval_steps_per_second": 2.821, "step": 31000 }, { "epoch": 6.099677903174252, "grad_norm": 0.7683907747268677, "learning_rate": 1.0721513956403438e-05, "loss": 0.3945, "step": 31010 }, { "epoch": 6.101644906690271, "grad_norm": 1.1609355211257935, "learning_rate": 1.0718456693876304e-05, "loss": 0.337, "step": 31020 }, { "epoch": 6.10361191020629, "grad_norm": 0.5849171876907349, "learning_rate": 1.0715399431349172e-05, "loss": 0.3, "step": 31030 }, { "epoch": 6.105578913722308, "grad_norm": 1.1877692937850952, "learning_rate": 1.0712342168822039e-05, "loss": 0.3381, "step": 31040 }, { "epoch": 6.107545917238327, "grad_norm": 0.8901480436325073, "learning_rate": 1.0709284906294903e-05, "loss": 0.4137, "step": 31050 }, { "epoch": 6.109512920754346, "grad_norm": 1.0427314043045044, "learning_rate": 1.070622764376777e-05, "loss": 0.2717, "step": 31060 }, { "epoch": 6.111479924270364, "grad_norm": 1.543684959411621, "learning_rate": 1.0703170381240638e-05, "loss": 0.3706, "step": 31070 }, { "epoch": 6.113446927786383, "grad_norm": 1.26493501663208, "learning_rate": 1.0700113118713504e-05, "loss": 0.3729, "step": 31080 }, { "epoch": 6.1154139313024025, "grad_norm": 1.4996311664581299, "learning_rate": 1.0697055856186371e-05, "loss": 0.328, "step": 31090 }, { "epoch": 6.117380934818421, "grad_norm": 1.4721986055374146, "learning_rate": 1.0693998593659239e-05, "loss": 0.3299, "step": 31100 }, { "epoch": 6.11934793833444, "grad_norm": 3.207667350769043, "learning_rate": 1.0690941331132105e-05, "loss": 0.2632, "step": 31110 }, { "epoch": 6.121314941850459, "grad_norm": 0.6640446782112122, "learning_rate": 1.0687884068604972e-05, "loss": 0.3463, "step": 31120 }, { "epoch": 6.123281945366477, "grad_norm": 1.1663861274719238, "learning_rate": 1.068482680607784e-05, "loss": 0.3626, "step": 31130 }, { "epoch": 6.125248948882496, "grad_norm": 1.0878609418869019, "learning_rate": 1.0681769543550707e-05, "loss": 0.3306, "step": 31140 }, { "epoch": 6.127215952398515, "grad_norm": 1.0775517225265503, "learning_rate": 1.0678712281023573e-05, "loss": 0.3593, "step": 31150 }, { "epoch": 6.129182955914533, "grad_norm": 1.3499873876571655, "learning_rate": 1.067565501849644e-05, "loss": 0.303, "step": 31160 }, { "epoch": 6.1311499594305525, "grad_norm": 1.3294696807861328, "learning_rate": 1.0672597755969304e-05, "loss": 0.365, "step": 31170 }, { "epoch": 6.133116962946572, "grad_norm": 0.8494240641593933, "learning_rate": 1.0669540493442172e-05, "loss": 0.3471, "step": 31180 }, { "epoch": 6.13508396646259, "grad_norm": 2.2927815914154053, "learning_rate": 1.066648323091504e-05, "loss": 0.3805, "step": 31190 }, { "epoch": 6.137050969978609, "grad_norm": 1.4000989198684692, "learning_rate": 1.0663425968387907e-05, "loss": 0.2334, "step": 31200 }, { "epoch": 6.139017973494628, "grad_norm": 0.920910656452179, "learning_rate": 1.0660368705860773e-05, "loss": 0.2703, "step": 31210 }, { "epoch": 6.140984977010646, "grad_norm": 0.863723874092102, "learning_rate": 1.065731144333364e-05, "loss": 0.4328, "step": 31220 }, { "epoch": 6.142951980526665, "grad_norm": 1.2679826021194458, "learning_rate": 1.0654254180806508e-05, "loss": 0.3987, "step": 31230 }, { "epoch": 6.144918984042684, "grad_norm": 1.067123293876648, "learning_rate": 1.0651196918279373e-05, "loss": 0.3339, "step": 31240 }, { "epoch": 6.1468859875587025, "grad_norm": 1.1733587980270386, "learning_rate": 1.064813965575224e-05, "loss": 0.2214, "step": 31250 }, { "epoch": 6.148852991074722, "grad_norm": 1.0510685443878174, "learning_rate": 1.0645082393225108e-05, "loss": 0.3718, "step": 31260 }, { "epoch": 6.150819994590741, "grad_norm": 1.1614587306976318, "learning_rate": 1.0642025130697976e-05, "loss": 0.3971, "step": 31270 }, { "epoch": 6.152786998106759, "grad_norm": 1.8867449760437012, "learning_rate": 1.063896786817084e-05, "loss": 0.3666, "step": 31280 }, { "epoch": 6.154754001622778, "grad_norm": 1.8797495365142822, "learning_rate": 1.0635910605643707e-05, "loss": 0.3763, "step": 31290 }, { "epoch": 6.156721005138797, "grad_norm": 1.5775450468063354, "learning_rate": 1.0632853343116573e-05, "loss": 0.4743, "step": 31300 }, { "epoch": 6.158688008654815, "grad_norm": 1.456451416015625, "learning_rate": 1.062979608058944e-05, "loss": 0.4399, "step": 31310 }, { "epoch": 6.160655012170834, "grad_norm": 1.745969533920288, "learning_rate": 1.0626738818062308e-05, "loss": 0.2838, "step": 31320 }, { "epoch": 6.162622015686853, "grad_norm": 1.225696325302124, "learning_rate": 1.0623681555535175e-05, "loss": 0.3019, "step": 31330 }, { "epoch": 6.164589019202872, "grad_norm": 1.4007424116134644, "learning_rate": 1.0620624293008041e-05, "loss": 0.4124, "step": 31340 }, { "epoch": 6.166556022718891, "grad_norm": 0.6728320717811584, "learning_rate": 1.0617567030480909e-05, "loss": 0.4112, "step": 31350 }, { "epoch": 6.16852302623491, "grad_norm": 0.8944595456123352, "learning_rate": 1.0614509767953776e-05, "loss": 0.3456, "step": 31360 }, { "epoch": 6.170490029750928, "grad_norm": 1.4572259187698364, "learning_rate": 1.0611452505426642e-05, "loss": 0.4435, "step": 31370 }, { "epoch": 6.172457033266947, "grad_norm": 2.9878594875335693, "learning_rate": 1.060839524289951e-05, "loss": 0.4064, "step": 31380 }, { "epoch": 6.174424036782966, "grad_norm": 0.9966301321983337, "learning_rate": 1.0605337980372375e-05, "loss": 0.3786, "step": 31390 }, { "epoch": 6.176391040298984, "grad_norm": 0.930111289024353, "learning_rate": 1.0602280717845241e-05, "loss": 0.4371, "step": 31400 }, { "epoch": 6.178358043815003, "grad_norm": 1.0499768257141113, "learning_rate": 1.0599223455318108e-05, "loss": 0.3302, "step": 31410 }, { "epoch": 6.1803250473310225, "grad_norm": 1.316510796546936, "learning_rate": 1.0596166192790976e-05, "loss": 0.3829, "step": 31420 }, { "epoch": 6.182292050847041, "grad_norm": 1.1173617839813232, "learning_rate": 1.0593108930263842e-05, "loss": 0.4753, "step": 31430 }, { "epoch": 6.18425905436306, "grad_norm": 1.035115122795105, "learning_rate": 1.059005166773671e-05, "loss": 0.3719, "step": 31440 }, { "epoch": 6.186226057879079, "grad_norm": 1.8233667612075806, "learning_rate": 1.0586994405209577e-05, "loss": 0.4091, "step": 31450 }, { "epoch": 6.188193061395097, "grad_norm": 1.4853065013885498, "learning_rate": 1.0583937142682444e-05, "loss": 0.3037, "step": 31460 }, { "epoch": 6.190160064911116, "grad_norm": 1.442393183708191, "learning_rate": 1.058087988015531e-05, "loss": 0.4173, "step": 31470 }, { "epoch": 6.192127068427135, "grad_norm": 0.843182384967804, "learning_rate": 1.0577822617628177e-05, "loss": 0.4116, "step": 31480 }, { "epoch": 6.194094071943153, "grad_norm": 1.1940809488296509, "learning_rate": 1.0574765355101045e-05, "loss": 0.3755, "step": 31490 }, { "epoch": 6.1960610754591725, "grad_norm": 1.3931796550750732, "learning_rate": 1.057170809257391e-05, "loss": 0.279, "step": 31500 }, { "epoch": 6.1960610754591725, "eval_loss": 0.14727434515953064, "eval_runtime": 8.9002, "eval_samples_per_second": 5.618, "eval_steps_per_second": 2.809, "step": 31500 }, { "epoch": 6.198028078975192, "grad_norm": 1.3128204345703125, "learning_rate": 1.0568650830046776e-05, "loss": 0.3738, "step": 31510 }, { "epoch": 6.19999508249121, "grad_norm": 1.0435333251953125, "learning_rate": 1.0565593567519644e-05, "loss": 0.3621, "step": 31520 }, { "epoch": 6.201962086007229, "grad_norm": 1.0718704462051392, "learning_rate": 1.056253630499251e-05, "loss": 0.4154, "step": 31530 }, { "epoch": 6.203929089523248, "grad_norm": 1.5530494451522827, "learning_rate": 1.0559479042465377e-05, "loss": 0.3527, "step": 31540 }, { "epoch": 6.205896093039266, "grad_norm": 1.1111003160476685, "learning_rate": 1.0556421779938245e-05, "loss": 0.3764, "step": 31550 }, { "epoch": 6.207863096555285, "grad_norm": 1.1496018171310425, "learning_rate": 1.055336451741111e-05, "loss": 0.2756, "step": 31560 }, { "epoch": 6.209830100071304, "grad_norm": 1.0827871561050415, "learning_rate": 1.0550307254883978e-05, "loss": 0.2736, "step": 31570 }, { "epoch": 6.2117971035873225, "grad_norm": 1.5268453359603882, "learning_rate": 1.0547249992356845e-05, "loss": 0.4035, "step": 31580 }, { "epoch": 6.213764107103342, "grad_norm": 2.096116781234741, "learning_rate": 1.0544192729829713e-05, "loss": 0.3846, "step": 31590 }, { "epoch": 6.215731110619361, "grad_norm": 1.0222452878952026, "learning_rate": 1.0541135467302579e-05, "loss": 0.4076, "step": 31600 }, { "epoch": 6.217698114135379, "grad_norm": 1.360022783279419, "learning_rate": 1.0538078204775446e-05, "loss": 0.3092, "step": 31610 }, { "epoch": 6.219665117651398, "grad_norm": 0.7893637418746948, "learning_rate": 1.053502094224831e-05, "loss": 0.4069, "step": 31620 }, { "epoch": 6.221632121167417, "grad_norm": 1.6887398958206177, "learning_rate": 1.0531963679721178e-05, "loss": 0.4277, "step": 31630 }, { "epoch": 6.223599124683435, "grad_norm": 1.5129915475845337, "learning_rate": 1.0528906417194045e-05, "loss": 0.3649, "step": 31640 }, { "epoch": 6.225566128199454, "grad_norm": 1.1816556453704834, "learning_rate": 1.0525849154666913e-05, "loss": 0.3348, "step": 31650 }, { "epoch": 6.2275331317154725, "grad_norm": 1.3360129594802856, "learning_rate": 1.0522791892139778e-05, "loss": 0.3609, "step": 31660 }, { "epoch": 6.229500135231492, "grad_norm": 1.489035964012146, "learning_rate": 1.0519734629612646e-05, "loss": 0.3642, "step": 31670 }, { "epoch": 6.231467138747511, "grad_norm": 1.5423457622528076, "learning_rate": 1.0516677367085513e-05, "loss": 0.3332, "step": 31680 }, { "epoch": 6.233434142263529, "grad_norm": 1.2122647762298584, "learning_rate": 1.0513620104558379e-05, "loss": 0.4714, "step": 31690 }, { "epoch": 6.235401145779548, "grad_norm": 1.450034737586975, "learning_rate": 1.0510562842031247e-05, "loss": 0.3323, "step": 31700 }, { "epoch": 6.237368149295567, "grad_norm": 1.2099430561065674, "learning_rate": 1.0507505579504114e-05, "loss": 0.4134, "step": 31710 }, { "epoch": 6.239335152811585, "grad_norm": 2.170869827270508, "learning_rate": 1.0504448316976981e-05, "loss": 0.4279, "step": 31720 }, { "epoch": 6.241302156327604, "grad_norm": 0.630581796169281, "learning_rate": 1.0501391054449846e-05, "loss": 0.5135, "step": 31730 }, { "epoch": 6.243269159843623, "grad_norm": 2.2780323028564453, "learning_rate": 1.0498333791922713e-05, "loss": 0.423, "step": 31740 }, { "epoch": 6.245236163359642, "grad_norm": 1.5196157693862915, "learning_rate": 1.0495276529395579e-05, "loss": 0.2993, "step": 31750 }, { "epoch": 6.247203166875661, "grad_norm": 2.440230369567871, "learning_rate": 1.0492219266868446e-05, "loss": 0.3687, "step": 31760 }, { "epoch": 6.24917017039168, "grad_norm": 0.8297176957130432, "learning_rate": 1.0489162004341314e-05, "loss": 0.4382, "step": 31770 }, { "epoch": 6.251137173907698, "grad_norm": 1.7238305807113647, "learning_rate": 1.0486104741814181e-05, "loss": 0.3213, "step": 31780 }, { "epoch": 6.253104177423717, "grad_norm": 1.177493691444397, "learning_rate": 1.0483047479287047e-05, "loss": 0.2217, "step": 31790 }, { "epoch": 6.255071180939736, "grad_norm": 1.8509219884872437, "learning_rate": 1.0479990216759914e-05, "loss": 0.3354, "step": 31800 }, { "epoch": 6.257038184455754, "grad_norm": 0.8892121315002441, "learning_rate": 1.0476932954232782e-05, "loss": 0.3079, "step": 31810 }, { "epoch": 6.259005187971773, "grad_norm": 1.7575033903121948, "learning_rate": 1.0473875691705648e-05, "loss": 0.3453, "step": 31820 }, { "epoch": 6.2609721914877925, "grad_norm": 0.984065055847168, "learning_rate": 1.0470818429178515e-05, "loss": 0.2972, "step": 31830 }, { "epoch": 6.262939195003811, "grad_norm": 1.5092118978500366, "learning_rate": 1.0467761166651383e-05, "loss": 0.4259, "step": 31840 }, { "epoch": 6.26490619851983, "grad_norm": 1.1990478038787842, "learning_rate": 1.0464703904124247e-05, "loss": 0.3105, "step": 31850 }, { "epoch": 6.266873202035849, "grad_norm": 1.0917842388153076, "learning_rate": 1.0461646641597114e-05, "loss": 0.3538, "step": 31860 }, { "epoch": 6.268840205551867, "grad_norm": 1.0836409330368042, "learning_rate": 1.0458589379069982e-05, "loss": 0.2392, "step": 31870 }, { "epoch": 6.270807209067886, "grad_norm": 1.336677074432373, "learning_rate": 1.0455532116542847e-05, "loss": 0.3993, "step": 31880 }, { "epoch": 6.272774212583905, "grad_norm": 0.9778781533241272, "learning_rate": 1.0452474854015715e-05, "loss": 0.2921, "step": 31890 }, { "epoch": 6.274741216099923, "grad_norm": 1.270330786705017, "learning_rate": 1.0449417591488582e-05, "loss": 0.4029, "step": 31900 }, { "epoch": 6.2767082196159425, "grad_norm": 1.2101235389709473, "learning_rate": 1.044636032896145e-05, "loss": 0.3091, "step": 31910 }, { "epoch": 6.278675223131962, "grad_norm": 1.7291163206100464, "learning_rate": 1.0443303066434316e-05, "loss": 0.4171, "step": 31920 }, { "epoch": 6.28064222664798, "grad_norm": 1.4795582294464111, "learning_rate": 1.0440245803907183e-05, "loss": 0.3782, "step": 31930 }, { "epoch": 6.282609230163999, "grad_norm": 1.6377198696136475, "learning_rate": 1.043718854138005e-05, "loss": 0.3424, "step": 31940 }, { "epoch": 6.284576233680018, "grad_norm": 1.3972163200378418, "learning_rate": 1.0434131278852916e-05, "loss": 0.3734, "step": 31950 }, { "epoch": 6.286543237196036, "grad_norm": 1.3955159187316895, "learning_rate": 1.0431074016325782e-05, "loss": 0.4133, "step": 31960 }, { "epoch": 6.288510240712055, "grad_norm": 1.2619258165359497, "learning_rate": 1.0428016753798648e-05, "loss": 0.3035, "step": 31970 }, { "epoch": 6.290477244228074, "grad_norm": 1.8464090824127197, "learning_rate": 1.0424959491271515e-05, "loss": 0.4578, "step": 31980 }, { "epoch": 6.2924442477440925, "grad_norm": 1.2729978561401367, "learning_rate": 1.0421902228744383e-05, "loss": 0.4224, "step": 31990 }, { "epoch": 6.294411251260112, "grad_norm": 0.8992209434509277, "learning_rate": 1.041884496621725e-05, "loss": 0.3831, "step": 32000 }, { "epoch": 6.294411251260112, "eval_loss": 0.14667941629886627, "eval_runtime": 8.9005, "eval_samples_per_second": 5.618, "eval_steps_per_second": 2.809, "step": 32000 }, { "epoch": 6.296378254776131, "grad_norm": 0.7840712070465088, "learning_rate": 1.0415787703690116e-05, "loss": 0.3726, "step": 32010 }, { "epoch": 6.298345258292149, "grad_norm": 0.7266148924827576, "learning_rate": 1.0412730441162984e-05, "loss": 0.2885, "step": 32020 }, { "epoch": 6.300312261808168, "grad_norm": 1.2583742141723633, "learning_rate": 1.0409673178635851e-05, "loss": 0.2949, "step": 32030 }, { "epoch": 6.302279265324187, "grad_norm": 1.4370315074920654, "learning_rate": 1.0406615916108717e-05, "loss": 0.3672, "step": 32040 }, { "epoch": 6.304246268840205, "grad_norm": 0.9335483908653259, "learning_rate": 1.0403558653581584e-05, "loss": 0.3721, "step": 32050 }, { "epoch": 6.306213272356224, "grad_norm": 1.3395602703094482, "learning_rate": 1.0400501391054452e-05, "loss": 0.3631, "step": 32060 }, { "epoch": 6.308180275872243, "grad_norm": 1.8281170129776, "learning_rate": 1.0397444128527316e-05, "loss": 0.3413, "step": 32070 }, { "epoch": 6.310147279388262, "grad_norm": 1.3462482690811157, "learning_rate": 1.0394386866000183e-05, "loss": 0.3088, "step": 32080 }, { "epoch": 6.312114282904281, "grad_norm": 1.1540286540985107, "learning_rate": 1.039132960347305e-05, "loss": 0.42, "step": 32090 }, { "epoch": 6.3140812864203, "grad_norm": 1.4234508275985718, "learning_rate": 1.0388272340945917e-05, "loss": 0.3489, "step": 32100 }, { "epoch": 6.316048289936318, "grad_norm": 1.2908326387405396, "learning_rate": 1.0385215078418784e-05, "loss": 0.4033, "step": 32110 }, { "epoch": 6.318015293452337, "grad_norm": 0.784662127494812, "learning_rate": 1.0382157815891652e-05, "loss": 0.2647, "step": 32120 }, { "epoch": 6.319982296968356, "grad_norm": 1.8527098894119263, "learning_rate": 1.0379100553364519e-05, "loss": 0.3639, "step": 32130 }, { "epoch": 6.321949300484374, "grad_norm": 1.9791474342346191, "learning_rate": 1.0376043290837385e-05, "loss": 0.4099, "step": 32140 }, { "epoch": 6.323916304000393, "grad_norm": 1.3550399541854858, "learning_rate": 1.0372986028310252e-05, "loss": 0.3554, "step": 32150 }, { "epoch": 6.3258833075164125, "grad_norm": 0.712272047996521, "learning_rate": 1.036992876578312e-05, "loss": 0.2878, "step": 32160 }, { "epoch": 6.327850311032431, "grad_norm": 1.3497095108032227, "learning_rate": 1.0366871503255986e-05, "loss": 0.3627, "step": 32170 }, { "epoch": 6.32981731454845, "grad_norm": 1.3138824701309204, "learning_rate": 1.0363814240728853e-05, "loss": 0.3123, "step": 32180 }, { "epoch": 6.331784318064469, "grad_norm": 1.375718593597412, "learning_rate": 1.0360756978201719e-05, "loss": 0.3333, "step": 32190 }, { "epoch": 6.333751321580487, "grad_norm": 1.069199800491333, "learning_rate": 1.0357699715674585e-05, "loss": 0.294, "step": 32200 }, { "epoch": 6.335718325096506, "grad_norm": 1.2835943698883057, "learning_rate": 1.0354642453147452e-05, "loss": 0.3857, "step": 32210 }, { "epoch": 6.337685328612525, "grad_norm": 1.1084645986557007, "learning_rate": 1.035158519062032e-05, "loss": 0.3547, "step": 32220 }, { "epoch": 6.339652332128543, "grad_norm": 0.8232660293579102, "learning_rate": 1.0348527928093185e-05, "loss": 0.2852, "step": 32230 }, { "epoch": 6.3416193356445625, "grad_norm": 1.066155195236206, "learning_rate": 1.0345470665566053e-05, "loss": 0.391, "step": 32240 }, { "epoch": 6.343586339160582, "grad_norm": 1.3041104078292847, "learning_rate": 1.034241340303892e-05, "loss": 0.3893, "step": 32250 }, { "epoch": 6.3455533426766, "grad_norm": 0.8700298070907593, "learning_rate": 1.0339356140511788e-05, "loss": 0.4521, "step": 32260 }, { "epoch": 6.347520346192619, "grad_norm": 0.884760320186615, "learning_rate": 1.0336298877984653e-05, "loss": 0.3217, "step": 32270 }, { "epoch": 6.349487349708638, "grad_norm": 1.4652529954910278, "learning_rate": 1.0333241615457521e-05, "loss": 0.3204, "step": 32280 }, { "epoch": 6.351454353224656, "grad_norm": 1.0134848356246948, "learning_rate": 1.0330184352930388e-05, "loss": 0.3291, "step": 32290 }, { "epoch": 6.353421356740675, "grad_norm": 1.1539208889007568, "learning_rate": 1.0327127090403252e-05, "loss": 0.3689, "step": 32300 }, { "epoch": 6.355388360256694, "grad_norm": 1.202022910118103, "learning_rate": 1.032406982787612e-05, "loss": 0.3629, "step": 32310 }, { "epoch": 6.3573553637727125, "grad_norm": 0.8208584785461426, "learning_rate": 1.0321012565348987e-05, "loss": 0.329, "step": 32320 }, { "epoch": 6.359322367288732, "grad_norm": 0.9570119380950928, "learning_rate": 1.0317955302821853e-05, "loss": 0.3857, "step": 32330 }, { "epoch": 6.361289370804751, "grad_norm": 0.9772025942802429, "learning_rate": 1.031489804029472e-05, "loss": 0.5343, "step": 32340 }, { "epoch": 6.363256374320769, "grad_norm": 0.41304048895835876, "learning_rate": 1.0311840777767588e-05, "loss": 0.323, "step": 32350 }, { "epoch": 6.365223377836788, "grad_norm": 1.1537532806396484, "learning_rate": 1.0308783515240454e-05, "loss": 0.3416, "step": 32360 }, { "epoch": 6.367190381352807, "grad_norm": 1.2086857557296753, "learning_rate": 1.0305726252713321e-05, "loss": 0.3707, "step": 32370 }, { "epoch": 6.369157384868825, "grad_norm": 0.5529403686523438, "learning_rate": 1.0302668990186189e-05, "loss": 0.3776, "step": 32380 }, { "epoch": 6.371124388384844, "grad_norm": 0.9833685159683228, "learning_rate": 1.0299611727659056e-05, "loss": 0.4551, "step": 32390 }, { "epoch": 6.373091391900863, "grad_norm": 1.5811911821365356, "learning_rate": 1.0296554465131922e-05, "loss": 0.5041, "step": 32400 }, { "epoch": 6.375058395416882, "grad_norm": 0.6909576654434204, "learning_rate": 1.0293497202604788e-05, "loss": 0.3439, "step": 32410 }, { "epoch": 6.377025398932901, "grad_norm": 0.8619405031204224, "learning_rate": 1.0290439940077654e-05, "loss": 0.3578, "step": 32420 }, { "epoch": 6.37899240244892, "grad_norm": 0.9947187900543213, "learning_rate": 1.0287382677550521e-05, "loss": 0.4153, "step": 32430 }, { "epoch": 6.380959405964938, "grad_norm": 0.8003689050674438, "learning_rate": 1.0284325415023389e-05, "loss": 0.2718, "step": 32440 }, { "epoch": 6.382926409480957, "grad_norm": 1.235228419303894, "learning_rate": 1.0281268152496256e-05, "loss": 0.5116, "step": 32450 }, { "epoch": 6.384893412996976, "grad_norm": 1.0889111757278442, "learning_rate": 1.0278210889969122e-05, "loss": 0.4442, "step": 32460 }, { "epoch": 6.386860416512994, "grad_norm": 1.042790174484253, "learning_rate": 1.027515362744199e-05, "loss": 0.276, "step": 32470 }, { "epoch": 6.388827420029013, "grad_norm": 1.6201059818267822, "learning_rate": 1.0272096364914857e-05, "loss": 0.3747, "step": 32480 }, { "epoch": 6.3907944235450325, "grad_norm": 1.099861741065979, "learning_rate": 1.0269039102387723e-05, "loss": 0.4556, "step": 32490 }, { "epoch": 6.392761427061051, "grad_norm": 1.1103860139846802, "learning_rate": 1.026598183986059e-05, "loss": 0.2511, "step": 32500 }, { "epoch": 6.392761427061051, "eval_loss": 0.14782226085662842, "eval_runtime": 8.8932, "eval_samples_per_second": 5.622, "eval_steps_per_second": 2.811, "step": 32500 }, { "epoch": 6.39472843057707, "grad_norm": 0.9129806160926819, "learning_rate": 1.0262924577333458e-05, "loss": 0.4684, "step": 32510 }, { "epoch": 6.396695434093089, "grad_norm": 1.1795682907104492, "learning_rate": 1.0259867314806325e-05, "loss": 0.3383, "step": 32520 }, { "epoch": 6.398662437609107, "grad_norm": 2.0722591876983643, "learning_rate": 1.0256810052279189e-05, "loss": 0.3128, "step": 32530 }, { "epoch": 6.400629441125126, "grad_norm": 1.4256749153137207, "learning_rate": 1.0253752789752057e-05, "loss": 0.3892, "step": 32540 }, { "epoch": 6.402596444641145, "grad_norm": 2.753934383392334, "learning_rate": 1.0250695527224922e-05, "loss": 0.4021, "step": 32550 }, { "epoch": 6.404563448157163, "grad_norm": 1.4296075105667114, "learning_rate": 1.024763826469779e-05, "loss": 0.3808, "step": 32560 }, { "epoch": 6.4065304516731825, "grad_norm": 0.9655913710594177, "learning_rate": 1.0244581002170657e-05, "loss": 0.4817, "step": 32570 }, { "epoch": 6.4084974551892016, "grad_norm": 1.0583401918411255, "learning_rate": 1.0241523739643525e-05, "loss": 0.4399, "step": 32580 }, { "epoch": 6.41046445870522, "grad_norm": 0.9479387998580933, "learning_rate": 1.023846647711639e-05, "loss": 0.3651, "step": 32590 }, { "epoch": 6.412431462221239, "grad_norm": 0.7584673166275024, "learning_rate": 1.0235409214589258e-05, "loss": 0.3063, "step": 32600 }, { "epoch": 6.414398465737258, "grad_norm": 1.1318553686141968, "learning_rate": 1.0232351952062125e-05, "loss": 0.426, "step": 32610 }, { "epoch": 6.416365469253276, "grad_norm": 1.6314092874526978, "learning_rate": 1.0229294689534991e-05, "loss": 0.3312, "step": 32620 }, { "epoch": 6.418332472769295, "grad_norm": 0.7870709300041199, "learning_rate": 1.0226237427007859e-05, "loss": 0.3737, "step": 32630 }, { "epoch": 6.420299476285313, "grad_norm": 0.8929570913314819, "learning_rate": 1.0223180164480724e-05, "loss": 0.2973, "step": 32640 }, { "epoch": 6.4222664798013325, "grad_norm": 1.1901662349700928, "learning_rate": 1.022012290195359e-05, "loss": 0.3487, "step": 32650 }, { "epoch": 6.424233483317352, "grad_norm": 1.270427942276001, "learning_rate": 1.0217065639426458e-05, "loss": 0.4018, "step": 32660 }, { "epoch": 6.42620048683337, "grad_norm": 1.4906806945800781, "learning_rate": 1.0214008376899325e-05, "loss": 0.3262, "step": 32670 }, { "epoch": 6.428167490349389, "grad_norm": 1.2974570989608765, "learning_rate": 1.0210951114372191e-05, "loss": 0.3769, "step": 32680 }, { "epoch": 6.430134493865408, "grad_norm": 1.0146774053573608, "learning_rate": 1.0207893851845058e-05, "loss": 0.4504, "step": 32690 }, { "epoch": 6.432101497381426, "grad_norm": 1.3171179294586182, "learning_rate": 1.0204836589317926e-05, "loss": 0.3756, "step": 32700 }, { "epoch": 6.434068500897445, "grad_norm": 1.1145819425582886, "learning_rate": 1.0201779326790793e-05, "loss": 0.3819, "step": 32710 }, { "epoch": 6.436035504413464, "grad_norm": 1.547440528869629, "learning_rate": 1.019872206426366e-05, "loss": 0.4455, "step": 32720 }, { "epoch": 6.4380025079294825, "grad_norm": 1.3117337226867676, "learning_rate": 1.0195664801736527e-05, "loss": 0.453, "step": 32730 }, { "epoch": 6.439969511445502, "grad_norm": 1.771700382232666, "learning_rate": 1.0192607539209394e-05, "loss": 0.3808, "step": 32740 }, { "epoch": 6.441936514961521, "grad_norm": 0.7972196936607361, "learning_rate": 1.0189550276682258e-05, "loss": 0.5139, "step": 32750 }, { "epoch": 6.443903518477539, "grad_norm": 0.4077063798904419, "learning_rate": 1.0186493014155126e-05, "loss": 0.3401, "step": 32760 }, { "epoch": 6.445870521993558, "grad_norm": 1.2201696634292603, "learning_rate": 1.0183435751627993e-05, "loss": 0.3978, "step": 32770 }, { "epoch": 6.447837525509577, "grad_norm": 2.2601189613342285, "learning_rate": 1.0180378489100859e-05, "loss": 0.3514, "step": 32780 }, { "epoch": 6.449804529025595, "grad_norm": 1.1869385242462158, "learning_rate": 1.0177321226573726e-05, "loss": 0.4924, "step": 32790 }, { "epoch": 6.451771532541614, "grad_norm": 1.4132533073425293, "learning_rate": 1.0174263964046594e-05, "loss": 0.3294, "step": 32800 }, { "epoch": 6.453738536057633, "grad_norm": 1.091038465499878, "learning_rate": 1.017120670151946e-05, "loss": 0.4039, "step": 32810 }, { "epoch": 6.455705539573652, "grad_norm": 1.458566665649414, "learning_rate": 1.0168149438992327e-05, "loss": 0.3343, "step": 32820 }, { "epoch": 6.457672543089671, "grad_norm": 1.7340314388275146, "learning_rate": 1.0165092176465195e-05, "loss": 0.3251, "step": 32830 }, { "epoch": 6.45963954660569, "grad_norm": 2.443197250366211, "learning_rate": 1.0162034913938062e-05, "loss": 0.4722, "step": 32840 }, { "epoch": 6.461606550121708, "grad_norm": 0.9471251368522644, "learning_rate": 1.0158977651410928e-05, "loss": 0.3322, "step": 32850 }, { "epoch": 6.463573553637727, "grad_norm": 1.2571531534194946, "learning_rate": 1.0155920388883795e-05, "loss": 0.3339, "step": 32860 }, { "epoch": 6.465540557153746, "grad_norm": 0.7914329171180725, "learning_rate": 1.015286312635666e-05, "loss": 0.3534, "step": 32870 }, { "epoch": 6.467507560669764, "grad_norm": 1.7630119323730469, "learning_rate": 1.0149805863829527e-05, "loss": 0.4638, "step": 32880 }, { "epoch": 6.469474564185783, "grad_norm": 1.0704189538955688, "learning_rate": 1.0146748601302394e-05, "loss": 0.341, "step": 32890 }, { "epoch": 6.4714415677018025, "grad_norm": 1.5141534805297852, "learning_rate": 1.0143691338775262e-05, "loss": 0.4852, "step": 32900 }, { "epoch": 6.473408571217821, "grad_norm": 0.8697926998138428, "learning_rate": 1.0140634076248128e-05, "loss": 0.3165, "step": 32910 }, { "epoch": 6.47537557473384, "grad_norm": 1.7349079847335815, "learning_rate": 1.0137576813720995e-05, "loss": 0.2807, "step": 32920 }, { "epoch": 6.477342578249859, "grad_norm": 1.5268747806549072, "learning_rate": 1.0134519551193863e-05, "loss": 0.3529, "step": 32930 }, { "epoch": 6.479309581765877, "grad_norm": 1.3544410467147827, "learning_rate": 1.0131462288666728e-05, "loss": 0.2781, "step": 32940 }, { "epoch": 6.481276585281896, "grad_norm": 1.6031782627105713, "learning_rate": 1.0128405026139596e-05, "loss": 0.3772, "step": 32950 }, { "epoch": 6.483243588797915, "grad_norm": 0.9160840511322021, "learning_rate": 1.0125347763612463e-05, "loss": 0.2645, "step": 32960 }, { "epoch": 6.485210592313933, "grad_norm": 1.133995532989502, "learning_rate": 1.012229050108533e-05, "loss": 0.2998, "step": 32970 }, { "epoch": 6.4871775958299525, "grad_norm": 2.93790864944458, "learning_rate": 1.0119233238558195e-05, "loss": 0.3928, "step": 32980 }, { "epoch": 6.4891445993459715, "grad_norm": 2.064584255218506, "learning_rate": 1.0116175976031062e-05, "loss": 0.3689, "step": 32990 }, { "epoch": 6.49111160286199, "grad_norm": 0.8051519393920898, "learning_rate": 1.0113118713503928e-05, "loss": 0.4315, "step": 33000 }, { "epoch": 6.49111160286199, "eval_loss": 0.14803524315357208, "eval_runtime": 8.8266, "eval_samples_per_second": 5.665, "eval_steps_per_second": 2.832, "step": 33000 }, { "epoch": 6.493078606378009, "grad_norm": 1.1119964122772217, "learning_rate": 1.0110061450976796e-05, "loss": 0.4261, "step": 33010 }, { "epoch": 6.495045609894028, "grad_norm": 0.7975828051567078, "learning_rate": 1.0107004188449663e-05, "loss": 0.3573, "step": 33020 }, { "epoch": 6.497012613410046, "grad_norm": 1.5630384683609009, "learning_rate": 1.010394692592253e-05, "loss": 0.3167, "step": 33030 }, { "epoch": 6.498979616926065, "grad_norm": 0.8238933086395264, "learning_rate": 1.0100889663395396e-05, "loss": 0.4177, "step": 33040 }, { "epoch": 6.500946620442084, "grad_norm": 0.8332470655441284, "learning_rate": 1.0097832400868264e-05, "loss": 0.3705, "step": 33050 }, { "epoch": 6.5029136239581025, "grad_norm": 0.9948000907897949, "learning_rate": 1.0094775138341131e-05, "loss": 0.4374, "step": 33060 }, { "epoch": 6.5048806274741215, "grad_norm": 0.8258544206619263, "learning_rate": 1.0091717875813997e-05, "loss": 0.3398, "step": 33070 }, { "epoch": 6.506847630990141, "grad_norm": 0.8997102379798889, "learning_rate": 1.0088660613286864e-05, "loss": 0.2978, "step": 33080 }, { "epoch": 6.508814634506159, "grad_norm": 1.6043155193328857, "learning_rate": 1.008560335075973e-05, "loss": 0.4263, "step": 33090 }, { "epoch": 6.510781638022178, "grad_norm": 1.8473858833312988, "learning_rate": 1.0082546088232596e-05, "loss": 0.3009, "step": 33100 }, { "epoch": 6.512748641538197, "grad_norm": 1.3721176385879517, "learning_rate": 1.0079488825705463e-05, "loss": 0.331, "step": 33110 }, { "epoch": 6.514715645054215, "grad_norm": 1.10801100730896, "learning_rate": 1.0076431563178331e-05, "loss": 0.334, "step": 33120 }, { "epoch": 6.516682648570234, "grad_norm": 0.6907325983047485, "learning_rate": 1.0073374300651197e-05, "loss": 0.346, "step": 33130 }, { "epoch": 6.518649652086253, "grad_norm": 0.8262765407562256, "learning_rate": 1.0070317038124064e-05, "loss": 0.3339, "step": 33140 }, { "epoch": 6.5206166556022715, "grad_norm": 1.0150346755981445, "learning_rate": 1.0067259775596932e-05, "loss": 0.317, "step": 33150 }, { "epoch": 6.522583659118291, "grad_norm": 2.1446454524993896, "learning_rate": 1.00642025130698e-05, "loss": 0.2435, "step": 33160 }, { "epoch": 6.52455066263431, "grad_norm": 1.5295718908309937, "learning_rate": 1.0061145250542665e-05, "loss": 0.3208, "step": 33170 }, { "epoch": 6.526517666150328, "grad_norm": 0.38532695174217224, "learning_rate": 1.0058087988015532e-05, "loss": 0.322, "step": 33180 }, { "epoch": 6.528484669666347, "grad_norm": 0.6402590274810791, "learning_rate": 1.00550307254884e-05, "loss": 0.3332, "step": 33190 }, { "epoch": 6.530451673182366, "grad_norm": 1.3206236362457275, "learning_rate": 1.0051973462961266e-05, "loss": 0.3526, "step": 33200 }, { "epoch": 6.532418676698384, "grad_norm": 1.180548071861267, "learning_rate": 1.0048916200434131e-05, "loss": 0.3406, "step": 33210 }, { "epoch": 6.534385680214403, "grad_norm": 0.8427443504333496, "learning_rate": 1.0045858937906999e-05, "loss": 0.3626, "step": 33220 }, { "epoch": 6.536352683730422, "grad_norm": 1.1290903091430664, "learning_rate": 1.0042801675379865e-05, "loss": 0.3294, "step": 33230 }, { "epoch": 6.538319687246441, "grad_norm": 1.4042856693267822, "learning_rate": 1.0039744412852732e-05, "loss": 0.351, "step": 33240 }, { "epoch": 6.54028669076246, "grad_norm": 1.821936011314392, "learning_rate": 1.00366871503256e-05, "loss": 0.3147, "step": 33250 }, { "epoch": 6.542253694278479, "grad_norm": 1.4279309511184692, "learning_rate": 1.0033629887798465e-05, "loss": 0.3773, "step": 33260 }, { "epoch": 6.544220697794497, "grad_norm": 0.9054422378540039, "learning_rate": 1.0030572625271333e-05, "loss": 0.3617, "step": 33270 }, { "epoch": 6.546187701310516, "grad_norm": 1.0631873607635498, "learning_rate": 1.00275153627442e-05, "loss": 0.4193, "step": 33280 }, { "epoch": 6.548154704826535, "grad_norm": 1.4749603271484375, "learning_rate": 1.0024458100217068e-05, "loss": 0.3965, "step": 33290 }, { "epoch": 6.550121708342553, "grad_norm": 1.0832397937774658, "learning_rate": 1.0021400837689934e-05, "loss": 0.2609, "step": 33300 }, { "epoch": 6.5520887118585724, "grad_norm": 2.431015729904175, "learning_rate": 1.0018343575162801e-05, "loss": 0.3519, "step": 33310 }, { "epoch": 6.5540557153745915, "grad_norm": 1.274034023284912, "learning_rate": 1.0015286312635665e-05, "loss": 0.3999, "step": 33320 }, { "epoch": 6.55602271889061, "grad_norm": 2.3737590312957764, "learning_rate": 1.0012229050108533e-05, "loss": 0.4696, "step": 33330 }, { "epoch": 6.557989722406629, "grad_norm": 2.1949424743652344, "learning_rate": 1.00091717875814e-05, "loss": 0.3498, "step": 33340 }, { "epoch": 6.559956725922648, "grad_norm": 1.2474371194839478, "learning_rate": 1.0006114525054268e-05, "loss": 0.505, "step": 33350 }, { "epoch": 6.561923729438666, "grad_norm": 1.3272656202316284, "learning_rate": 1.0003057262527133e-05, "loss": 0.4109, "step": 33360 }, { "epoch": 6.563890732954685, "grad_norm": 0.6079081296920776, "learning_rate": 1e-05, "loss": 0.2928, "step": 33370 }, { "epoch": 6.565857736470704, "grad_norm": 0.39416030049324036, "learning_rate": 9.996942737472868e-06, "loss": 0.4067, "step": 33380 }, { "epoch": 6.5678247399867224, "grad_norm": 1.118470549583435, "learning_rate": 9.993885474945734e-06, "loss": 0.3543, "step": 33390 }, { "epoch": 6.5697917435027415, "grad_norm": 1.9530739784240723, "learning_rate": 9.9908282124186e-06, "loss": 0.4074, "step": 33400 }, { "epoch": 6.571758747018761, "grad_norm": 0.9212591052055359, "learning_rate": 9.987770949891467e-06, "loss": 0.4043, "step": 33410 }, { "epoch": 6.573725750534779, "grad_norm": 1.6362104415893555, "learning_rate": 9.984713687364335e-06, "loss": 0.3953, "step": 33420 }, { "epoch": 6.575692754050798, "grad_norm": 0.9912499189376831, "learning_rate": 9.981656424837202e-06, "loss": 0.2861, "step": 33430 }, { "epoch": 6.577659757566817, "grad_norm": 1.6725244522094727, "learning_rate": 9.978599162310068e-06, "loss": 0.3207, "step": 33440 }, { "epoch": 6.579626761082835, "grad_norm": 0.7188600301742554, "learning_rate": 9.975541899782936e-06, "loss": 0.2644, "step": 33450 }, { "epoch": 6.581593764598854, "grad_norm": 2.0234286785125732, "learning_rate": 9.972484637255801e-06, "loss": 0.3419, "step": 33460 }, { "epoch": 6.583560768114873, "grad_norm": 0.6678216457366943, "learning_rate": 9.969427374728669e-06, "loss": 0.374, "step": 33470 }, { "epoch": 6.5855277716308915, "grad_norm": 0.9582223892211914, "learning_rate": 9.966370112201536e-06, "loss": 0.3289, "step": 33480 }, { "epoch": 6.587494775146911, "grad_norm": 1.9727556705474854, "learning_rate": 9.963312849674402e-06, "loss": 0.3535, "step": 33490 }, { "epoch": 6.58946177866293, "grad_norm": 1.2631657123565674, "learning_rate": 9.96025558714727e-06, "loss": 0.3904, "step": 33500 }, { "epoch": 6.58946177866293, "eval_loss": 0.1484050750732422, "eval_runtime": 8.8797, "eval_samples_per_second": 5.631, "eval_steps_per_second": 2.815, "step": 33500 }, { "epoch": 6.591428782178948, "grad_norm": 0.944599449634552, "learning_rate": 9.957198324620137e-06, "loss": 0.3904, "step": 33510 }, { "epoch": 6.593395785694967, "grad_norm": 1.7309553623199463, "learning_rate": 9.954141062093003e-06, "loss": 0.4003, "step": 33520 }, { "epoch": 6.595362789210986, "grad_norm": 1.5342662334442139, "learning_rate": 9.951083799565869e-06, "loss": 0.3995, "step": 33530 }, { "epoch": 6.597329792727004, "grad_norm": 0.9480270743370056, "learning_rate": 9.948026537038736e-06, "loss": 0.2778, "step": 33540 }, { "epoch": 6.599296796243023, "grad_norm": 0.864125669002533, "learning_rate": 9.944969274511603e-06, "loss": 0.4614, "step": 33550 }, { "epoch": 6.601263799759042, "grad_norm": 3.487790107727051, "learning_rate": 9.941912011984471e-06, "loss": 0.4029, "step": 33560 }, { "epoch": 6.603230803275061, "grad_norm": 2.3038623332977295, "learning_rate": 9.938854749457337e-06, "loss": 0.3582, "step": 33570 }, { "epoch": 6.60519780679108, "grad_norm": 2.7051186561584473, "learning_rate": 9.935797486930202e-06, "loss": 0.3567, "step": 33580 }, { "epoch": 6.607164810307099, "grad_norm": 2.1781363487243652, "learning_rate": 9.93274022440307e-06, "loss": 0.4223, "step": 33590 }, { "epoch": 6.609131813823117, "grad_norm": 0.9622666239738464, "learning_rate": 9.929682961875937e-06, "loss": 0.3935, "step": 33600 }, { "epoch": 6.611098817339136, "grad_norm": 0.6262819170951843, "learning_rate": 9.926625699348805e-06, "loss": 0.2501, "step": 33610 }, { "epoch": 6.613065820855155, "grad_norm": 1.4548559188842773, "learning_rate": 9.92356843682167e-06, "loss": 0.5423, "step": 33620 }, { "epoch": 6.615032824371173, "grad_norm": 0.7134609222412109, "learning_rate": 9.920511174294536e-06, "loss": 0.3038, "step": 33630 }, { "epoch": 6.616999827887192, "grad_norm": 0.9675575494766235, "learning_rate": 9.917453911767404e-06, "loss": 0.313, "step": 33640 }, { "epoch": 6.6189668314032115, "grad_norm": 1.1765183210372925, "learning_rate": 9.914396649240271e-06, "loss": 0.4338, "step": 33650 }, { "epoch": 6.62093383491923, "grad_norm": 1.1487053632736206, "learning_rate": 9.911339386713137e-06, "loss": 0.4628, "step": 33660 }, { "epoch": 6.622900838435249, "grad_norm": 0.7740342020988464, "learning_rate": 9.908282124186005e-06, "loss": 0.3036, "step": 33670 }, { "epoch": 6.624867841951268, "grad_norm": 0.717975914478302, "learning_rate": 9.905224861658872e-06, "loss": 0.4258, "step": 33680 }, { "epoch": 6.626834845467286, "grad_norm": 1.3622957468032837, "learning_rate": 9.902167599131738e-06, "loss": 0.5035, "step": 33690 }, { "epoch": 6.628801848983305, "grad_norm": 0.8520593643188477, "learning_rate": 9.899110336604605e-06, "loss": 0.4059, "step": 33700 }, { "epoch": 6.630768852499324, "grad_norm": 1.1923712491989136, "learning_rate": 9.896053074077471e-06, "loss": 0.3483, "step": 33710 }, { "epoch": 6.632735856015342, "grad_norm": 1.4621663093566895, "learning_rate": 9.892995811550339e-06, "loss": 0.3298, "step": 33720 }, { "epoch": 6.6347028595313615, "grad_norm": 1.36235511302948, "learning_rate": 9.889938549023206e-06, "loss": 0.3386, "step": 33730 }, { "epoch": 6.636669863047381, "grad_norm": 0.9748831987380981, "learning_rate": 9.886881286496072e-06, "loss": 0.3864, "step": 33740 }, { "epoch": 6.638636866563399, "grad_norm": 0.5554082989692688, "learning_rate": 9.88382402396894e-06, "loss": 0.3939, "step": 33750 }, { "epoch": 6.640603870079418, "grad_norm": 1.192552924156189, "learning_rate": 9.880766761441805e-06, "loss": 0.4679, "step": 33760 }, { "epoch": 6.642570873595437, "grad_norm": 1.8474180698394775, "learning_rate": 9.877709498914673e-06, "loss": 0.4699, "step": 33770 }, { "epoch": 6.644537877111455, "grad_norm": 1.1618289947509766, "learning_rate": 9.87465223638754e-06, "loss": 0.3393, "step": 33780 }, { "epoch": 6.646504880627474, "grad_norm": 0.9800117015838623, "learning_rate": 9.871594973860406e-06, "loss": 0.3497, "step": 33790 }, { "epoch": 6.648471884143493, "grad_norm": 1.332341194152832, "learning_rate": 9.868537711333273e-06, "loss": 0.4472, "step": 33800 }, { "epoch": 6.6504388876595115, "grad_norm": 1.6299222707748413, "learning_rate": 9.865480448806139e-06, "loss": 0.4534, "step": 33810 }, { "epoch": 6.652405891175531, "grad_norm": 0.9234393239021301, "learning_rate": 9.862423186279007e-06, "loss": 0.3266, "step": 33820 }, { "epoch": 6.65437289469155, "grad_norm": 0.9883418679237366, "learning_rate": 9.859365923751874e-06, "loss": 0.2851, "step": 33830 }, { "epoch": 6.656339898207568, "grad_norm": 1.3818767070770264, "learning_rate": 9.85630866122474e-06, "loss": 0.3963, "step": 33840 }, { "epoch": 6.658306901723587, "grad_norm": 1.418997049331665, "learning_rate": 9.853251398697607e-06, "loss": 0.2881, "step": 33850 }, { "epoch": 6.660273905239606, "grad_norm": 1.2970802783966064, "learning_rate": 9.850194136170473e-06, "loss": 0.2917, "step": 33860 }, { "epoch": 6.662240908755624, "grad_norm": 1.2719851732254028, "learning_rate": 9.84713687364334e-06, "loss": 0.5373, "step": 33870 }, { "epoch": 6.664207912271643, "grad_norm": 0.9323065280914307, "learning_rate": 9.844079611116208e-06, "loss": 0.3705, "step": 33880 }, { "epoch": 6.666174915787662, "grad_norm": 1.5898349285125732, "learning_rate": 9.841022348589074e-06, "loss": 0.4218, "step": 33890 }, { "epoch": 6.668141919303681, "grad_norm": 1.950779676437378, "learning_rate": 9.837965086061941e-06, "loss": 0.4501, "step": 33900 }, { "epoch": 6.6701089228197, "grad_norm": 1.2006927728652954, "learning_rate": 9.834907823534807e-06, "loss": 0.2791, "step": 33910 }, { "epoch": 6.672075926335719, "grad_norm": 4.144277572631836, "learning_rate": 9.831850561007675e-06, "loss": 0.3079, "step": 33920 }, { "epoch": 6.674042929851737, "grad_norm": 0.7587095499038696, "learning_rate": 9.828793298480542e-06, "loss": 0.2736, "step": 33930 }, { "epoch": 6.676009933367756, "grad_norm": 1.2794342041015625, "learning_rate": 9.825736035953408e-06, "loss": 0.3481, "step": 33940 }, { "epoch": 6.677976936883775, "grad_norm": 1.8106298446655273, "learning_rate": 9.822678773426275e-06, "loss": 0.3745, "step": 33950 }, { "epoch": 6.679943940399793, "grad_norm": 1.3771394491195679, "learning_rate": 9.819621510899143e-06, "loss": 0.3761, "step": 33960 }, { "epoch": 6.681910943915812, "grad_norm": 2.139343023300171, "learning_rate": 9.816564248372008e-06, "loss": 0.3894, "step": 33970 }, { "epoch": 6.6838779474318315, "grad_norm": 1.1938695907592773, "learning_rate": 9.813506985844874e-06, "loss": 0.4358, "step": 33980 }, { "epoch": 6.68584495094785, "grad_norm": 1.9572737216949463, "learning_rate": 9.810449723317742e-06, "loss": 0.3695, "step": 33990 }, { "epoch": 6.687811954463869, "grad_norm": 1.5220359563827515, "learning_rate": 9.80739246079061e-06, "loss": 0.4523, "step": 34000 }, { "epoch": 6.687811954463869, "eval_loss": 0.14722880721092224, "eval_runtime": 8.8579, "eval_samples_per_second": 5.645, "eval_steps_per_second": 2.822, "step": 34000 }, { "epoch": 6.689778957979888, "grad_norm": 1.234450101852417, "learning_rate": 9.804335198263477e-06, "loss": 0.4261, "step": 34010 }, { "epoch": 6.691745961495906, "grad_norm": 3.4291653633117676, "learning_rate": 9.801277935736342e-06, "loss": 0.4619, "step": 34020 }, { "epoch": 6.693712965011925, "grad_norm": 1.2263903617858887, "learning_rate": 9.798220673209208e-06, "loss": 0.4102, "step": 34030 }, { "epoch": 6.695679968527944, "grad_norm": 1.5767312049865723, "learning_rate": 9.795163410682076e-06, "loss": 0.4185, "step": 34040 }, { "epoch": 6.697646972043962, "grad_norm": 1.0134525299072266, "learning_rate": 9.792106148154943e-06, "loss": 0.5154, "step": 34050 }, { "epoch": 6.6996139755599815, "grad_norm": 1.2162753343582153, "learning_rate": 9.78904888562781e-06, "loss": 0.3059, "step": 34060 }, { "epoch": 6.701580979076001, "grad_norm": 0.9273087382316589, "learning_rate": 9.785991623100676e-06, "loss": 0.5046, "step": 34070 }, { "epoch": 6.703547982592019, "grad_norm": 1.6081489324569702, "learning_rate": 9.782934360573542e-06, "loss": 0.3274, "step": 34080 }, { "epoch": 6.705514986108038, "grad_norm": 1.1477094888687134, "learning_rate": 9.77987709804641e-06, "loss": 0.3135, "step": 34090 }, { "epoch": 6.707481989624057, "grad_norm": 1.948773980140686, "learning_rate": 9.776819835519277e-06, "loss": 0.3611, "step": 34100 }, { "epoch": 6.709448993140075, "grad_norm": 0.7589294910430908, "learning_rate": 9.773762572992143e-06, "loss": 0.4675, "step": 34110 }, { "epoch": 6.711415996656094, "grad_norm": 1.0926454067230225, "learning_rate": 9.77070531046501e-06, "loss": 0.4786, "step": 34120 }, { "epoch": 6.713383000172112, "grad_norm": 1.9458638429641724, "learning_rate": 9.767648047937878e-06, "loss": 0.391, "step": 34130 }, { "epoch": 6.7153500036881315, "grad_norm": 1.7874311208724976, "learning_rate": 9.764590785410744e-06, "loss": 0.3388, "step": 34140 }, { "epoch": 6.717317007204151, "grad_norm": 1.0775543451309204, "learning_rate": 9.761533522883611e-06, "loss": 0.4601, "step": 34150 }, { "epoch": 6.719284010720169, "grad_norm": 1.0132941007614136, "learning_rate": 9.758476260356477e-06, "loss": 0.3121, "step": 34160 }, { "epoch": 6.721251014236188, "grad_norm": 0.6694428324699402, "learning_rate": 9.755418997829344e-06, "loss": 0.4602, "step": 34170 }, { "epoch": 6.723218017752207, "grad_norm": 4.015652656555176, "learning_rate": 9.752361735302212e-06, "loss": 0.3528, "step": 34180 }, { "epoch": 6.725185021268225, "grad_norm": 1.2746211290359497, "learning_rate": 9.74930447277508e-06, "loss": 0.2661, "step": 34190 }, { "epoch": 6.727152024784244, "grad_norm": 0.7422463893890381, "learning_rate": 9.746247210247945e-06, "loss": 0.2808, "step": 34200 }, { "epoch": 6.729119028300263, "grad_norm": 1.6404222249984741, "learning_rate": 9.743189947720811e-06, "loss": 0.2404, "step": 34210 }, { "epoch": 6.7310860318162815, "grad_norm": 1.4843297004699707, "learning_rate": 9.740132685193678e-06, "loss": 0.4713, "step": 34220 }, { "epoch": 6.733053035332301, "grad_norm": 0.8812487125396729, "learning_rate": 9.737075422666546e-06, "loss": 0.3954, "step": 34230 }, { "epoch": 6.73502003884832, "grad_norm": 1.0680122375488281, "learning_rate": 9.734018160139412e-06, "loss": 0.3453, "step": 34240 }, { "epoch": 6.736987042364338, "grad_norm": 0.6341516971588135, "learning_rate": 9.730960897612279e-06, "loss": 0.3423, "step": 34250 }, { "epoch": 6.738954045880357, "grad_norm": 1.3379762172698975, "learning_rate": 9.727903635085145e-06, "loss": 0.3971, "step": 34260 }, { "epoch": 6.740921049396376, "grad_norm": 0.9502032399177551, "learning_rate": 9.724846372558012e-06, "loss": 0.3367, "step": 34270 }, { "epoch": 6.742888052912394, "grad_norm": 1.1735727787017822, "learning_rate": 9.72178911003088e-06, "loss": 0.3365, "step": 34280 }, { "epoch": 6.744855056428413, "grad_norm": 1.3326324224472046, "learning_rate": 9.718731847503746e-06, "loss": 0.2607, "step": 34290 }, { "epoch": 6.746822059944432, "grad_norm": 1.5648033618927002, "learning_rate": 9.715674584976613e-06, "loss": 0.3734, "step": 34300 }, { "epoch": 6.748789063460451, "grad_norm": 1.3950005769729614, "learning_rate": 9.712617322449479e-06, "loss": 0.3373, "step": 34310 }, { "epoch": 6.75075606697647, "grad_norm": 1.2418285608291626, "learning_rate": 9.709560059922346e-06, "loss": 0.4781, "step": 34320 }, { "epoch": 6.752723070492489, "grad_norm": 1.5094794034957886, "learning_rate": 9.706502797395214e-06, "loss": 0.3345, "step": 34330 }, { "epoch": 6.754690074008507, "grad_norm": 1.1631951332092285, "learning_rate": 9.70344553486808e-06, "loss": 0.2247, "step": 34340 }, { "epoch": 6.756657077524526, "grad_norm": 2.2930450439453125, "learning_rate": 9.700388272340947e-06, "loss": 0.5198, "step": 34350 }, { "epoch": 6.758624081040545, "grad_norm": 1.347464919090271, "learning_rate": 9.697331009813814e-06, "loss": 0.2888, "step": 34360 }, { "epoch": 6.760591084556563, "grad_norm": 1.6012616157531738, "learning_rate": 9.69427374728668e-06, "loss": 0.4417, "step": 34370 }, { "epoch": 6.762558088072582, "grad_norm": 1.6863954067230225, "learning_rate": 9.691216484759548e-06, "loss": 0.2953, "step": 34380 }, { "epoch": 6.7645250915886015, "grad_norm": 1.4628210067749023, "learning_rate": 9.688159222232413e-06, "loss": 0.36, "step": 34390 }, { "epoch": 6.76649209510462, "grad_norm": 1.8912512063980103, "learning_rate": 9.685101959705281e-06, "loss": 0.3969, "step": 34400 }, { "epoch": 6.768459098620639, "grad_norm": 1.2532269954681396, "learning_rate": 9.682044697178148e-06, "loss": 0.4475, "step": 34410 }, { "epoch": 6.770426102136658, "grad_norm": 1.3984140157699585, "learning_rate": 9.678987434651014e-06, "loss": 0.3566, "step": 34420 }, { "epoch": 6.772393105652676, "grad_norm": 0.7679744958877563, "learning_rate": 9.67593017212388e-06, "loss": 0.3454, "step": 34430 }, { "epoch": 6.774360109168695, "grad_norm": 0.7441784739494324, "learning_rate": 9.672872909596747e-06, "loss": 0.3793, "step": 34440 }, { "epoch": 6.776327112684714, "grad_norm": 1.1137535572052002, "learning_rate": 9.669815647069615e-06, "loss": 0.4104, "step": 34450 }, { "epoch": 6.778294116200732, "grad_norm": 8.478157997131348, "learning_rate": 9.666758384542482e-06, "loss": 0.337, "step": 34460 }, { "epoch": 6.7802611197167515, "grad_norm": 0.7802620530128479, "learning_rate": 9.663701122015348e-06, "loss": 0.4132, "step": 34470 }, { "epoch": 6.782228123232771, "grad_norm": 1.2110645771026611, "learning_rate": 9.660643859488214e-06, "loss": 0.3649, "step": 34480 }, { "epoch": 6.784195126748789, "grad_norm": 0.8571938872337341, "learning_rate": 9.657586596961081e-06, "loss": 0.3435, "step": 34490 }, { "epoch": 6.786162130264808, "grad_norm": 0.9900618195533752, "learning_rate": 9.654529334433949e-06, "loss": 0.3434, "step": 34500 }, { "epoch": 6.786162130264808, "eval_loss": 0.14516228437423706, "eval_runtime": 8.8825, "eval_samples_per_second": 5.629, "eval_steps_per_second": 2.815, "step": 34500 }, { "epoch": 6.788129133780827, "grad_norm": 1.6814920902252197, "learning_rate": 9.651472071906816e-06, "loss": 0.4392, "step": 34510 }, { "epoch": 6.790096137296845, "grad_norm": 2.54838490486145, "learning_rate": 9.648414809379682e-06, "loss": 0.2895, "step": 34520 }, { "epoch": 6.792063140812864, "grad_norm": 1.4882313013076782, "learning_rate": 9.64535754685255e-06, "loss": 0.4699, "step": 34530 }, { "epoch": 6.794030144328883, "grad_norm": 1.2992831468582153, "learning_rate": 9.642300284325415e-06, "loss": 0.3009, "step": 34540 }, { "epoch": 6.7959971478449015, "grad_norm": 1.3498175144195557, "learning_rate": 9.639243021798283e-06, "loss": 0.5047, "step": 34550 }, { "epoch": 6.797964151360921, "grad_norm": 0.9251207113265991, "learning_rate": 9.636185759271149e-06, "loss": 0.3042, "step": 34560 }, { "epoch": 6.79993115487694, "grad_norm": 1.6040048599243164, "learning_rate": 9.633128496744016e-06, "loss": 0.3682, "step": 34570 }, { "epoch": 6.801898158392958, "grad_norm": 0.9295186996459961, "learning_rate": 9.630071234216884e-06, "loss": 0.2468, "step": 34580 }, { "epoch": 6.803865161908977, "grad_norm": 1.454932451248169, "learning_rate": 9.62701397168975e-06, "loss": 0.4661, "step": 34590 }, { "epoch": 6.805832165424996, "grad_norm": 0.9594427347183228, "learning_rate": 9.623956709162617e-06, "loss": 0.3591, "step": 34600 }, { "epoch": 6.807799168941014, "grad_norm": 1.4732391834259033, "learning_rate": 9.620899446635483e-06, "loss": 0.3109, "step": 34610 }, { "epoch": 6.809766172457033, "grad_norm": 1.8341666460037231, "learning_rate": 9.61784218410835e-06, "loss": 0.2608, "step": 34620 }, { "epoch": 6.811733175973052, "grad_norm": 1.3303409814834595, "learning_rate": 9.614784921581218e-06, "loss": 0.3969, "step": 34630 }, { "epoch": 6.813700179489071, "grad_norm": 2.279348850250244, "learning_rate": 9.611727659054085e-06, "loss": 0.3592, "step": 34640 }, { "epoch": 6.81566718300509, "grad_norm": 1.6453670263290405, "learning_rate": 9.60867039652695e-06, "loss": 0.4212, "step": 34650 }, { "epoch": 6.817634186521109, "grad_norm": 1.5812371969223022, "learning_rate": 9.605613133999817e-06, "loss": 0.3665, "step": 34660 }, { "epoch": 6.819601190037127, "grad_norm": 0.9130092263221741, "learning_rate": 9.602555871472684e-06, "loss": 0.362, "step": 34670 }, { "epoch": 6.821568193553146, "grad_norm": 0.9470576047897339, "learning_rate": 9.599498608945552e-06, "loss": 0.2942, "step": 34680 }, { "epoch": 6.823535197069165, "grad_norm": 0.9792674779891968, "learning_rate": 9.596441346418417e-06, "loss": 0.388, "step": 34690 }, { "epoch": 6.825502200585183, "grad_norm": 1.5535203218460083, "learning_rate": 9.593384083891285e-06, "loss": 0.321, "step": 34700 }, { "epoch": 6.827469204101202, "grad_norm": 1.3171846866607666, "learning_rate": 9.59032682136415e-06, "loss": 0.3896, "step": 34710 }, { "epoch": 6.8294362076172215, "grad_norm": 1.0758510828018188, "learning_rate": 9.587269558837018e-06, "loss": 0.3583, "step": 34720 }, { "epoch": 6.83140321113324, "grad_norm": 0.9125159978866577, "learning_rate": 9.584212296309886e-06, "loss": 0.3526, "step": 34730 }, { "epoch": 6.833370214649259, "grad_norm": 1.0016282796859741, "learning_rate": 9.581155033782751e-06, "loss": 0.2809, "step": 34740 }, { "epoch": 6.835337218165278, "grad_norm": 1.18870210647583, "learning_rate": 9.578097771255619e-06, "loss": 0.4357, "step": 34750 }, { "epoch": 6.837304221681296, "grad_norm": 1.2393519878387451, "learning_rate": 9.575040508728485e-06, "loss": 0.2914, "step": 34760 }, { "epoch": 6.839271225197315, "grad_norm": 0.9247721433639526, "learning_rate": 9.571983246201352e-06, "loss": 0.3452, "step": 34770 }, { "epoch": 6.841238228713334, "grad_norm": 0.6819230318069458, "learning_rate": 9.56892598367422e-06, "loss": 0.428, "step": 34780 }, { "epoch": 6.843205232229352, "grad_norm": 0.8827270269393921, "learning_rate": 9.565868721147085e-06, "loss": 0.4428, "step": 34790 }, { "epoch": 6.8451722357453715, "grad_norm": 1.2155219316482544, "learning_rate": 9.562811458619953e-06, "loss": 0.3088, "step": 34800 }, { "epoch": 6.847139239261391, "grad_norm": 1.844770908355713, "learning_rate": 9.55975419609282e-06, "loss": 0.3482, "step": 34810 }, { "epoch": 6.849106242777409, "grad_norm": 1.582555890083313, "learning_rate": 9.556696933565686e-06, "loss": 0.3297, "step": 34820 }, { "epoch": 6.851073246293428, "grad_norm": 0.9563121795654297, "learning_rate": 9.553639671038552e-06, "loss": 0.4846, "step": 34830 }, { "epoch": 6.853040249809446, "grad_norm": 1.4238522052764893, "learning_rate": 9.55058240851142e-06, "loss": 0.3762, "step": 34840 }, { "epoch": 6.855007253325465, "grad_norm": 1.6920660734176636, "learning_rate": 9.547525145984287e-06, "loss": 0.4887, "step": 34850 }, { "epoch": 6.856974256841484, "grad_norm": 1.3875815868377686, "learning_rate": 9.544467883457154e-06, "loss": 0.5668, "step": 34860 }, { "epoch": 6.858941260357502, "grad_norm": 0.7671670913696289, "learning_rate": 9.54141062093002e-06, "loss": 0.3215, "step": 34870 }, { "epoch": 6.8609082638735215, "grad_norm": 0.8364396691322327, "learning_rate": 9.538353358402886e-06, "loss": 0.3085, "step": 34880 }, { "epoch": 6.862875267389541, "grad_norm": 1.3606170415878296, "learning_rate": 9.535296095875753e-06, "loss": 0.2713, "step": 34890 }, { "epoch": 6.864842270905559, "grad_norm": 1.0086907148361206, "learning_rate": 9.53223883334862e-06, "loss": 0.3078, "step": 34900 }, { "epoch": 6.866809274421578, "grad_norm": 1.5053181648254395, "learning_rate": 9.529181570821488e-06, "loss": 0.4035, "step": 34910 }, { "epoch": 6.868776277937597, "grad_norm": 0.9070050716400146, "learning_rate": 9.526124308294354e-06, "loss": 0.343, "step": 34920 }, { "epoch": 6.870743281453615, "grad_norm": 1.2644065618515015, "learning_rate": 9.52306704576722e-06, "loss": 0.2343, "step": 34930 }, { "epoch": 6.872710284969634, "grad_norm": 0.9814834594726562, "learning_rate": 9.520009783240087e-06, "loss": 0.393, "step": 34940 }, { "epoch": 6.874677288485653, "grad_norm": 1.202989101409912, "learning_rate": 9.516952520712955e-06, "loss": 0.4422, "step": 34950 }, { "epoch": 6.8766442920016715, "grad_norm": 2.017094612121582, "learning_rate": 9.51389525818582e-06, "loss": 0.3723, "step": 34960 }, { "epoch": 6.878611295517691, "grad_norm": 1.194692850112915, "learning_rate": 9.510837995658688e-06, "loss": 0.3942, "step": 34970 }, { "epoch": 6.88057829903371, "grad_norm": 1.6615229845046997, "learning_rate": 9.507780733131555e-06, "loss": 0.3137, "step": 34980 }, { "epoch": 6.882545302549728, "grad_norm": 0.8167328238487244, "learning_rate": 9.504723470604421e-06, "loss": 0.3133, "step": 34990 }, { "epoch": 6.884512306065747, "grad_norm": 0.8303702473640442, "learning_rate": 9.501666208077289e-06, "loss": 0.3374, "step": 35000 }, { "epoch": 6.884512306065747, "eval_loss": 0.14276579022407532, "eval_runtime": 8.8481, "eval_samples_per_second": 5.651, "eval_steps_per_second": 2.825, "step": 35000 }, { "epoch": 6.886479309581766, "grad_norm": 1.779717206954956, "learning_rate": 9.498608945550154e-06, "loss": 0.4124, "step": 35010 }, { "epoch": 6.888446313097784, "grad_norm": 1.0585066080093384, "learning_rate": 9.495551683023022e-06, "loss": 0.3539, "step": 35020 }, { "epoch": 6.890413316613803, "grad_norm": 2.3185861110687256, "learning_rate": 9.49249442049589e-06, "loss": 0.3541, "step": 35030 }, { "epoch": 6.892380320129822, "grad_norm": 1.2141361236572266, "learning_rate": 9.489437157968757e-06, "loss": 0.3086, "step": 35040 }, { "epoch": 6.894347323645841, "grad_norm": 2.0195775032043457, "learning_rate": 9.486379895441623e-06, "loss": 0.4795, "step": 35050 }, { "epoch": 6.89631432716186, "grad_norm": 1.4040886163711548, "learning_rate": 9.483322632914488e-06, "loss": 0.3061, "step": 35060 }, { "epoch": 6.898281330677879, "grad_norm": 0.7142741084098816, "learning_rate": 9.480265370387356e-06, "loss": 0.278, "step": 35070 }, { "epoch": 6.900248334193897, "grad_norm": 1.6775517463684082, "learning_rate": 9.477208107860223e-06, "loss": 0.3032, "step": 35080 }, { "epoch": 6.902215337709916, "grad_norm": 1.7707325220108032, "learning_rate": 9.474150845333089e-06, "loss": 0.4228, "step": 35090 }, { "epoch": 6.904182341225935, "grad_norm": 1.264266848564148, "learning_rate": 9.471093582805957e-06, "loss": 0.4042, "step": 35100 }, { "epoch": 6.906149344741953, "grad_norm": 2.7818541526794434, "learning_rate": 9.468036320278822e-06, "loss": 0.4654, "step": 35110 }, { "epoch": 6.908116348257972, "grad_norm": 1.321373701095581, "learning_rate": 9.46497905775169e-06, "loss": 0.4151, "step": 35120 }, { "epoch": 6.9100833517739915, "grad_norm": 0.5779895186424255, "learning_rate": 9.461921795224557e-06, "loss": 0.3116, "step": 35130 }, { "epoch": 6.91205035529001, "grad_norm": 2.980159044265747, "learning_rate": 9.458864532697423e-06, "loss": 0.3251, "step": 35140 }, { "epoch": 6.914017358806029, "grad_norm": 0.6384599804878235, "learning_rate": 9.45580727017029e-06, "loss": 0.3456, "step": 35150 }, { "epoch": 6.915984362322048, "grad_norm": 0.8337164521217346, "learning_rate": 9.452750007643156e-06, "loss": 0.3414, "step": 35160 }, { "epoch": 6.917951365838066, "grad_norm": 0.8615796566009521, "learning_rate": 9.449692745116024e-06, "loss": 0.3019, "step": 35170 }, { "epoch": 6.919918369354085, "grad_norm": 1.3931223154067993, "learning_rate": 9.446635482588891e-06, "loss": 0.2995, "step": 35180 }, { "epoch": 6.921885372870104, "grad_norm": 0.6666834354400635, "learning_rate": 9.443578220061757e-06, "loss": 0.2891, "step": 35190 }, { "epoch": 6.923852376386122, "grad_norm": 2.166940927505493, "learning_rate": 9.440520957534625e-06, "loss": 0.4111, "step": 35200 }, { "epoch": 6.9258193799021415, "grad_norm": 1.2376841306686401, "learning_rate": 9.437463695007492e-06, "loss": 0.4181, "step": 35210 }, { "epoch": 6.9277863834181606, "grad_norm": 1.8692290782928467, "learning_rate": 9.434406432480358e-06, "loss": 0.3888, "step": 35220 }, { "epoch": 6.929753386934179, "grad_norm": 1.0836957693099976, "learning_rate": 9.431349169953225e-06, "loss": 0.4205, "step": 35230 }, { "epoch": 6.931720390450198, "grad_norm": 1.6166654825210571, "learning_rate": 9.428291907426091e-06, "loss": 0.4591, "step": 35240 }, { "epoch": 6.933687393966217, "grad_norm": 0.9476025104522705, "learning_rate": 9.425234644898958e-06, "loss": 0.3028, "step": 35250 }, { "epoch": 6.935654397482235, "grad_norm": 1.6330329179763794, "learning_rate": 9.422177382371826e-06, "loss": 0.3836, "step": 35260 }, { "epoch": 6.937621400998254, "grad_norm": 1.3539565801620483, "learning_rate": 9.419120119844692e-06, "loss": 0.3086, "step": 35270 }, { "epoch": 6.939588404514273, "grad_norm": 2.6535205841064453, "learning_rate": 9.416062857317558e-06, "loss": 0.3566, "step": 35280 }, { "epoch": 6.9415554080302915, "grad_norm": 0.5615221858024597, "learning_rate": 9.413005594790425e-06, "loss": 0.3996, "step": 35290 }, { "epoch": 6.943522411546311, "grad_norm": 1.2192131280899048, "learning_rate": 9.409948332263292e-06, "loss": 0.4368, "step": 35300 }, { "epoch": 6.94548941506233, "grad_norm": 3.1623566150665283, "learning_rate": 9.40689106973616e-06, "loss": 0.2673, "step": 35310 }, { "epoch": 6.947456418578348, "grad_norm": 0.8841261267662048, "learning_rate": 9.403833807209026e-06, "loss": 0.3659, "step": 35320 }, { "epoch": 6.949423422094367, "grad_norm": 1.4013664722442627, "learning_rate": 9.400776544681891e-06, "loss": 0.5024, "step": 35330 }, { "epoch": 6.951390425610386, "grad_norm": 1.311947226524353, "learning_rate": 9.397719282154759e-06, "loss": 0.4321, "step": 35340 }, { "epoch": 6.953357429126404, "grad_norm": 3.628854274749756, "learning_rate": 9.394662019627626e-06, "loss": 0.3072, "step": 35350 }, { "epoch": 6.955324432642423, "grad_norm": 1.5111637115478516, "learning_rate": 9.391604757100494e-06, "loss": 0.3502, "step": 35360 }, { "epoch": 6.957291436158442, "grad_norm": 1.093699336051941, "learning_rate": 9.38854749457336e-06, "loss": 0.5111, "step": 35370 }, { "epoch": 6.959258439674461, "grad_norm": 1.2105228900909424, "learning_rate": 9.385490232046227e-06, "loss": 0.5082, "step": 35380 }, { "epoch": 6.96122544319048, "grad_norm": 0.9326910376548767, "learning_rate": 9.382432969519093e-06, "loss": 0.263, "step": 35390 }, { "epoch": 6.963192446706499, "grad_norm": 2.3566551208496094, "learning_rate": 9.37937570699196e-06, "loss": 0.3236, "step": 35400 }, { "epoch": 6.965159450222517, "grad_norm": 0.6763259768486023, "learning_rate": 9.376318444464826e-06, "loss": 0.2762, "step": 35410 }, { "epoch": 6.967126453738536, "grad_norm": 1.633030891418457, "learning_rate": 9.373261181937694e-06, "loss": 0.4472, "step": 35420 }, { "epoch": 6.969093457254555, "grad_norm": 2.4877285957336426, "learning_rate": 9.370203919410561e-06, "loss": 0.2424, "step": 35430 }, { "epoch": 6.971060460770573, "grad_norm": 0.9804292917251587, "learning_rate": 9.367146656883427e-06, "loss": 0.3773, "step": 35440 }, { "epoch": 6.973027464286592, "grad_norm": 1.5808591842651367, "learning_rate": 9.364089394356294e-06, "loss": 0.3392, "step": 35450 }, { "epoch": 6.9749944678026115, "grad_norm": 1.5255887508392334, "learning_rate": 9.36103213182916e-06, "loss": 0.4773, "step": 35460 }, { "epoch": 6.97696147131863, "grad_norm": 0.9557861685752869, "learning_rate": 9.357974869302028e-06, "loss": 0.3166, "step": 35470 }, { "epoch": 6.978928474834649, "grad_norm": 1.788199782371521, "learning_rate": 9.354917606774895e-06, "loss": 0.4094, "step": 35480 }, { "epoch": 6.980895478350668, "grad_norm": 0.9672784805297852, "learning_rate": 9.351860344247763e-06, "loss": 0.3216, "step": 35490 }, { "epoch": 6.982862481866686, "grad_norm": 1.6953893899917603, "learning_rate": 9.348803081720628e-06, "loss": 0.5089, "step": 35500 }, { "epoch": 6.982862481866686, "eval_loss": 0.14116249978542328, "eval_runtime": 8.8785, "eval_samples_per_second": 5.632, "eval_steps_per_second": 2.816, "step": 35500 }, { "epoch": 6.984829485382705, "grad_norm": 0.8304757475852966, "learning_rate": 9.345745819193494e-06, "loss": 0.2787, "step": 35510 }, { "epoch": 6.986796488898724, "grad_norm": 0.7213247418403625, "learning_rate": 9.342688556666362e-06, "loss": 0.4252, "step": 35520 }, { "epoch": 6.988763492414742, "grad_norm": 2.7171456813812256, "learning_rate": 9.339631294139229e-06, "loss": 0.5031, "step": 35530 }, { "epoch": 6.9907304959307615, "grad_norm": 1.4926707744598389, "learning_rate": 9.336574031612095e-06, "loss": 0.3693, "step": 35540 }, { "epoch": 6.9926974994467805, "grad_norm": 1.0265848636627197, "learning_rate": 9.333516769084962e-06, "loss": 0.3407, "step": 35550 }, { "epoch": 6.994664502962799, "grad_norm": 1.8866267204284668, "learning_rate": 9.330459506557828e-06, "loss": 0.3216, "step": 35560 }, { "epoch": 6.996631506478818, "grad_norm": 3.09433913230896, "learning_rate": 9.327402244030696e-06, "loss": 0.3974, "step": 35570 }, { "epoch": 6.998598509994837, "grad_norm": 2.6518099308013916, "learning_rate": 9.324344981503563e-06, "loss": 0.2656, "step": 35580 }, { "epoch": 7.000565513510855, "grad_norm": 0.38323965668678284, "learning_rate": 9.321287718976429e-06, "loss": 0.3545, "step": 35590 }, { "epoch": 7.002532517026874, "grad_norm": 0.8496131896972656, "learning_rate": 9.318230456449296e-06, "loss": 0.2795, "step": 35600 }, { "epoch": 7.004499520542893, "grad_norm": 1.9276858568191528, "learning_rate": 9.315173193922162e-06, "loss": 0.3878, "step": 35610 }, { "epoch": 7.0064665240589115, "grad_norm": 0.9172375798225403, "learning_rate": 9.31211593139503e-06, "loss": 0.3019, "step": 35620 }, { "epoch": 7.0084335275749305, "grad_norm": 1.2532397508621216, "learning_rate": 9.309058668867897e-06, "loss": 0.37, "step": 35630 }, { "epoch": 7.01040053109095, "grad_norm": 1.1102087497711182, "learning_rate": 9.306001406340763e-06, "loss": 0.3783, "step": 35640 }, { "epoch": 7.012367534606968, "grad_norm": 1.4500218629837036, "learning_rate": 9.30294414381363e-06, "loss": 0.4186, "step": 35650 }, { "epoch": 7.014334538122987, "grad_norm": 1.0918534994125366, "learning_rate": 9.299886881286498e-06, "loss": 0.4503, "step": 35660 }, { "epoch": 7.016301541639006, "grad_norm": 0.8374407291412354, "learning_rate": 9.296829618759364e-06, "loss": 0.2792, "step": 35670 }, { "epoch": 7.018268545155024, "grad_norm": 0.924603283405304, "learning_rate": 9.293772356232231e-06, "loss": 0.4561, "step": 35680 }, { "epoch": 7.020235548671043, "grad_norm": 0.7381618618965149, "learning_rate": 9.290715093705097e-06, "loss": 0.3591, "step": 35690 }, { "epoch": 7.022202552187062, "grad_norm": 1.1523455381393433, "learning_rate": 9.287657831177964e-06, "loss": 0.2453, "step": 35700 }, { "epoch": 7.0241695557030805, "grad_norm": 1.0780432224273682, "learning_rate": 9.284600568650832e-06, "loss": 0.3303, "step": 35710 }, { "epoch": 7.0261365592191, "grad_norm": 0.5242741703987122, "learning_rate": 9.281543306123697e-06, "loss": 0.1654, "step": 35720 }, { "epoch": 7.028103562735119, "grad_norm": 0.6786366701126099, "learning_rate": 9.278486043596563e-06, "loss": 0.3589, "step": 35730 }, { "epoch": 7.030070566251137, "grad_norm": 0.8653206825256348, "learning_rate": 9.27542878106943e-06, "loss": 0.3865, "step": 35740 }, { "epoch": 7.032037569767156, "grad_norm": 1.2041491270065308, "learning_rate": 9.272371518542298e-06, "loss": 0.3778, "step": 35750 }, { "epoch": 7.034004573283175, "grad_norm": 0.9845544695854187, "learning_rate": 9.269314256015166e-06, "loss": 0.3827, "step": 35760 }, { "epoch": 7.035971576799193, "grad_norm": 0.5570571422576904, "learning_rate": 9.266256993488031e-06, "loss": 0.3254, "step": 35770 }, { "epoch": 7.037938580315212, "grad_norm": 0.9653236269950867, "learning_rate": 9.263199730960897e-06, "loss": 0.4093, "step": 35780 }, { "epoch": 7.039905583831231, "grad_norm": 1.872336745262146, "learning_rate": 9.260142468433765e-06, "loss": 0.3385, "step": 35790 }, { "epoch": 7.04187258734725, "grad_norm": 1.1234791278839111, "learning_rate": 9.257085205906632e-06, "loss": 0.3464, "step": 35800 }, { "epoch": 7.043839590863269, "grad_norm": 1.0836743116378784, "learning_rate": 9.2540279433795e-06, "loss": 0.3793, "step": 35810 }, { "epoch": 7.045806594379288, "grad_norm": 0.8646442890167236, "learning_rate": 9.250970680852365e-06, "loss": 0.3513, "step": 35820 }, { "epoch": 7.047773597895306, "grad_norm": 0.9303305745124817, "learning_rate": 9.247913418325233e-06, "loss": 0.4035, "step": 35830 }, { "epoch": 7.049740601411325, "grad_norm": 1.3007382154464722, "learning_rate": 9.244856155798099e-06, "loss": 0.3699, "step": 35840 }, { "epoch": 7.051707604927344, "grad_norm": 1.8843724727630615, "learning_rate": 9.241798893270966e-06, "loss": 0.3185, "step": 35850 }, { "epoch": 7.053674608443362, "grad_norm": 1.665401577949524, "learning_rate": 9.238741630743832e-06, "loss": 0.3567, "step": 35860 }, { "epoch": 7.055641611959381, "grad_norm": 0.9232697486877441, "learning_rate": 9.2356843682167e-06, "loss": 0.3165, "step": 35870 }, { "epoch": 7.0576086154754005, "grad_norm": 1.5284390449523926, "learning_rate": 9.232627105689567e-06, "loss": 0.2676, "step": 35880 }, { "epoch": 7.059575618991419, "grad_norm": 0.810941219329834, "learning_rate": 9.229569843162434e-06, "loss": 0.4656, "step": 35890 }, { "epoch": 7.061542622507438, "grad_norm": 0.7167800068855286, "learning_rate": 9.2265125806353e-06, "loss": 0.3008, "step": 35900 }, { "epoch": 7.063509626023457, "grad_norm": 0.9715979099273682, "learning_rate": 9.223455318108166e-06, "loss": 0.3422, "step": 35910 }, { "epoch": 7.065476629539475, "grad_norm": 2.699817657470703, "learning_rate": 9.220398055581033e-06, "loss": 0.35, "step": 35920 }, { "epoch": 7.067443633055494, "grad_norm": 1.0953922271728516, "learning_rate": 9.2173407930539e-06, "loss": 0.3478, "step": 35930 }, { "epoch": 7.069410636571513, "grad_norm": 0.5451570749282837, "learning_rate": 9.214283530526768e-06, "loss": 0.3762, "step": 35940 }, { "epoch": 7.0713776400875314, "grad_norm": 1.066961407661438, "learning_rate": 9.211226267999634e-06, "loss": 0.3103, "step": 35950 }, { "epoch": 7.0733446436035505, "grad_norm": 1.1819201707839966, "learning_rate": 9.2081690054725e-06, "loss": 0.3374, "step": 35960 }, { "epoch": 7.07531164711957, "grad_norm": 0.8258270621299744, "learning_rate": 9.205111742945367e-06, "loss": 0.3513, "step": 35970 }, { "epoch": 7.077278650635588, "grad_norm": 1.4958642721176147, "learning_rate": 9.202054480418235e-06, "loss": 0.3744, "step": 35980 }, { "epoch": 7.079245654151607, "grad_norm": 0.4954798221588135, "learning_rate": 9.1989972178911e-06, "loss": 0.3777, "step": 35990 }, { "epoch": 7.081212657667626, "grad_norm": 0.4899837076663971, "learning_rate": 9.195939955363968e-06, "loss": 0.243, "step": 36000 }, { "epoch": 7.081212657667626, "eval_loss": 0.14319463074207306, "eval_runtime": 8.8442, "eval_samples_per_second": 5.653, "eval_steps_per_second": 2.827, "step": 36000 } ], "logging_steps": 10, "max_steps": 66079, "num_input_tokens_seen": 0, "num_train_epochs": 13, "save_steps": 4000, "total_flos": 1.9414670099987497e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }