diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,26572 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9956467969329705, + "eval_steps": 500, + "global_step": 3789, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007918048201118424, + "grad_norm": 3.249665515279578, + "learning_rate": 2e-07, + "loss": 0.7945, + "step": 1 + }, + { + "epoch": 0.0015836096402236848, + "grad_norm": 2.8142684051052407, + "learning_rate": 4e-07, + "loss": 0.7499, + "step": 2 + }, + { + "epoch": 0.0023754144603355272, + "grad_norm": 3.1595653950203344, + "learning_rate": 6e-07, + "loss": 0.7657, + "step": 3 + }, + { + "epoch": 0.0031672192804473695, + "grad_norm": 3.077093768185348, + "learning_rate": 8e-07, + "loss": 0.7546, + "step": 4 + }, + { + "epoch": 0.003959024100559212, + "grad_norm": 2.991003227075916, + "learning_rate": 1e-06, + "loss": 0.7488, + "step": 5 + }, + { + "epoch": 0.0047508289206710545, + "grad_norm": 2.664419096676507, + "learning_rate": 1.2e-06, + "loss": 0.735, + "step": 6 + }, + { + "epoch": 0.005542633740782897, + "grad_norm": 2.447554072933163, + "learning_rate": 1.4e-06, + "loss": 0.7432, + "step": 7 + }, + { + "epoch": 0.006334438560894739, + "grad_norm": 1.8641292483217404, + "learning_rate": 1.6e-06, + "loss": 0.6963, + "step": 8 + }, + { + "epoch": 0.007126243381006582, + "grad_norm": 1.9602828612613827, + "learning_rate": 1.8e-06, + "loss": 0.7213, + "step": 9 + }, + { + "epoch": 0.007918048201118424, + "grad_norm": 1.839376447147316, + "learning_rate": 2e-06, + "loss": 0.7203, + "step": 10 + }, + { + "epoch": 0.008709853021230268, + "grad_norm": 1.0894689354185774, + "learning_rate": 1.999999654445987e-06, + "loss": 0.6868, + "step": 11 + }, + { + "epoch": 0.009501657841342109, + "grad_norm": 1.2516859974601064, + "learning_rate": 1.9999986177841868e-06, + "loss": 0.6474, + "step": 12 + }, + { + "epoch": 0.010293462661453952, + "grad_norm": 1.2990716624476766, + "learning_rate": 1.999996890015316e-06, + "loss": 0.6559, + "step": 13 + }, + { + "epoch": 0.011085267481565794, + "grad_norm": 1.271434335142783, + "learning_rate": 1.9999944711405684e-06, + "loss": 0.6491, + "step": 14 + }, + { + "epoch": 0.011877072301677637, + "grad_norm": 2.0436825661525506, + "learning_rate": 1.9999913611616162e-06, + "loss": 0.6452, + "step": 15 + }, + { + "epoch": 0.012668877121789478, + "grad_norm": 2.788044827948402, + "learning_rate": 1.999987560080608e-06, + "loss": 0.6572, + "step": 16 + }, + { + "epoch": 0.013460681941901321, + "grad_norm": 1.817419771671645, + "learning_rate": 1.9999830679001716e-06, + "loss": 0.6691, + "step": 17 + }, + { + "epoch": 0.014252486762013164, + "grad_norm": 1.06301349202853, + "learning_rate": 1.999977884623411e-06, + "loss": 0.6521, + "step": 18 + }, + { + "epoch": 0.015044291582125006, + "grad_norm": 0.7884086864832666, + "learning_rate": 1.999972010253908e-06, + "loss": 0.6435, + "step": 19 + }, + { + "epoch": 0.01583609640223685, + "grad_norm": 0.5927933324880149, + "learning_rate": 1.999965444795724e-06, + "loss": 0.6378, + "step": 20 + }, + { + "epoch": 0.016627901222348692, + "grad_norm": 0.515330764428853, + "learning_rate": 1.9999581882533944e-06, + "loss": 0.6451, + "step": 21 + }, + { + "epoch": 0.017419706042460535, + "grad_norm": 0.46993043798293804, + "learning_rate": 1.9999502406319355e-06, + "loss": 0.6276, + "step": 22 + }, + { + "epoch": 0.018211510862572375, + "grad_norm": 0.4663536484014579, + "learning_rate": 1.99994160193684e-06, + "loss": 0.644, + "step": 23 + }, + { + "epoch": 0.019003315682684218, + "grad_norm": 0.49060116408582094, + "learning_rate": 1.999932272174078e-06, + "loss": 0.655, + "step": 24 + }, + { + "epoch": 0.01979512050279606, + "grad_norm": 0.5480942676252971, + "learning_rate": 1.999922251350097e-06, + "loss": 0.6515, + "step": 25 + }, + { + "epoch": 0.020586925322907904, + "grad_norm": 0.5771916166813391, + "learning_rate": 1.999911539471823e-06, + "loss": 0.6415, + "step": 26 + }, + { + "epoch": 0.021378730143019744, + "grad_norm": 0.5839231264275982, + "learning_rate": 1.9999001365466587e-06, + "loss": 0.635, + "step": 27 + }, + { + "epoch": 0.022170534963131587, + "grad_norm": 0.5301968144588673, + "learning_rate": 1.999888042582485e-06, + "loss": 0.6375, + "step": 28 + }, + { + "epoch": 0.02296233978324343, + "grad_norm": 0.4734334925176316, + "learning_rate": 1.9998752575876592e-06, + "loss": 0.6184, + "step": 29 + }, + { + "epoch": 0.023754144603355273, + "grad_norm": 0.4249379146562315, + "learning_rate": 1.9998617815710186e-06, + "loss": 0.6452, + "step": 30 + }, + { + "epoch": 0.024545949423467116, + "grad_norm": 0.3662617306941407, + "learning_rate": 1.999847614541876e-06, + "loss": 0.6193, + "step": 31 + }, + { + "epoch": 0.025337754243578956, + "grad_norm": 0.33731127892211377, + "learning_rate": 1.999832756510022e-06, + "loss": 0.6286, + "step": 32 + }, + { + "epoch": 0.0261295590636908, + "grad_norm": 0.3663161838243929, + "learning_rate": 1.9998172074857257e-06, + "loss": 0.6547, + "step": 33 + }, + { + "epoch": 0.026921363883802642, + "grad_norm": 0.381291078224402, + "learning_rate": 1.999800967479732e-06, + "loss": 0.6297, + "step": 34 + }, + { + "epoch": 0.027713168703914486, + "grad_norm": 0.3839788122616414, + "learning_rate": 1.999784036503266e-06, + "loss": 0.6203, + "step": 35 + }, + { + "epoch": 0.02850497352402633, + "grad_norm": 0.38298696253878306, + "learning_rate": 1.999766414568028e-06, + "loss": 0.6167, + "step": 36 + }, + { + "epoch": 0.02929677834413817, + "grad_norm": 0.3607911449224398, + "learning_rate": 1.999748101686197e-06, + "loss": 0.6288, + "step": 37 + }, + { + "epoch": 0.03008858316425001, + "grad_norm": 0.3384587080481325, + "learning_rate": 1.999729097870429e-06, + "loss": 0.6373, + "step": 38 + }, + { + "epoch": 0.030880387984361855, + "grad_norm": 0.3437018408721899, + "learning_rate": 1.999709403133857e-06, + "loss": 0.6158, + "step": 39 + }, + { + "epoch": 0.0316721928044737, + "grad_norm": 0.35323126492777207, + "learning_rate": 1.9996890174900932e-06, + "loss": 0.616, + "step": 40 + }, + { + "epoch": 0.03246399762458554, + "grad_norm": 0.31554623480143845, + "learning_rate": 1.9996679409532266e-06, + "loss": 0.6282, + "step": 41 + }, + { + "epoch": 0.033255802444697384, + "grad_norm": 0.32553225221446214, + "learning_rate": 1.999646173537822e-06, + "loss": 0.6247, + "step": 42 + }, + { + "epoch": 0.034047607264809224, + "grad_norm": 0.2931471917183078, + "learning_rate": 1.9996237152589236e-06, + "loss": 0.598, + "step": 43 + }, + { + "epoch": 0.03483941208492107, + "grad_norm": 0.29857347585860283, + "learning_rate": 1.999600566132053e-06, + "loss": 0.6107, + "step": 44 + }, + { + "epoch": 0.03563121690503291, + "grad_norm": 0.2766259754867876, + "learning_rate": 1.9995767261732084e-06, + "loss": 0.6096, + "step": 45 + }, + { + "epoch": 0.03642302172514475, + "grad_norm": 0.2925641957500468, + "learning_rate": 1.9995521953988655e-06, + "loss": 0.6124, + "step": 46 + }, + { + "epoch": 0.037214826545256596, + "grad_norm": 0.27289093170244716, + "learning_rate": 1.9995269738259782e-06, + "loss": 0.6245, + "step": 47 + }, + { + "epoch": 0.038006631365368436, + "grad_norm": 0.2758276253364378, + "learning_rate": 1.9995010614719774e-06, + "loss": 0.6197, + "step": 48 + }, + { + "epoch": 0.03879843618548028, + "grad_norm": 0.264869674303575, + "learning_rate": 1.9994744583547704e-06, + "loss": 0.622, + "step": 49 + }, + { + "epoch": 0.03959024100559212, + "grad_norm": 0.26959516153393015, + "learning_rate": 1.999447164492744e-06, + "loss": 0.6089, + "step": 50 + }, + { + "epoch": 0.04038204582570396, + "grad_norm": 0.2585070187575266, + "learning_rate": 1.99941917990476e-06, + "loss": 0.6176, + "step": 51 + }, + { + "epoch": 0.04117385064581581, + "grad_norm": 0.2535853429181986, + "learning_rate": 1.9993905046101603e-06, + "loss": 0.5881, + "step": 52 + }, + { + "epoch": 0.04196565546592765, + "grad_norm": 0.25964496610236193, + "learning_rate": 1.999361138628761e-06, + "loss": 0.629, + "step": 53 + }, + { + "epoch": 0.04275746028603949, + "grad_norm": 0.3438764551057683, + "learning_rate": 1.9993310819808587e-06, + "loss": 0.6178, + "step": 54 + }, + { + "epoch": 0.043549265106151334, + "grad_norm": 0.24882642114908518, + "learning_rate": 1.9993003346872247e-06, + "loss": 0.6375, + "step": 55 + }, + { + "epoch": 0.044341069926263174, + "grad_norm": 0.2651871648532402, + "learning_rate": 1.9992688967691088e-06, + "loss": 0.624, + "step": 56 + }, + { + "epoch": 0.04513287474637502, + "grad_norm": 0.25284940502692427, + "learning_rate": 1.999236768248239e-06, + "loss": 0.6125, + "step": 57 + }, + { + "epoch": 0.04592467956648686, + "grad_norm": 0.26229841543170196, + "learning_rate": 1.999203949146818e-06, + "loss": 0.618, + "step": 58 + }, + { + "epoch": 0.0467164843865987, + "grad_norm": 0.24920327663759706, + "learning_rate": 1.9991704394875287e-06, + "loss": 0.6224, + "step": 59 + }, + { + "epoch": 0.04750828920671055, + "grad_norm": 0.24646823708084495, + "learning_rate": 1.999136239293529e-06, + "loss": 0.6083, + "step": 60 + }, + { + "epoch": 0.048300094026822386, + "grad_norm": 0.2560335398854992, + "learning_rate": 1.9991013485884557e-06, + "loss": 0.6122, + "step": 61 + }, + { + "epoch": 0.04909189884693423, + "grad_norm": 0.24690995915149744, + "learning_rate": 1.9990657673964212e-06, + "loss": 0.606, + "step": 62 + }, + { + "epoch": 0.04988370366704607, + "grad_norm": 0.2544634101312482, + "learning_rate": 1.9990294957420167e-06, + "loss": 0.6217, + "step": 63 + }, + { + "epoch": 0.05067550848715791, + "grad_norm": 0.25189634044211, + "learning_rate": 1.9989925336503093e-06, + "loss": 0.6084, + "step": 64 + }, + { + "epoch": 0.05146731330726976, + "grad_norm": 0.24802884462346636, + "learning_rate": 1.998954881146844e-06, + "loss": 0.6237, + "step": 65 + }, + { + "epoch": 0.0522591181273816, + "grad_norm": 0.2490473534466852, + "learning_rate": 1.998916538257643e-06, + "loss": 0.6092, + "step": 66 + }, + { + "epoch": 0.053050922947493445, + "grad_norm": 0.24744554617386624, + "learning_rate": 1.9988775050092048e-06, + "loss": 0.6209, + "step": 67 + }, + { + "epoch": 0.053842727767605285, + "grad_norm": 0.24988780685989925, + "learning_rate": 1.9988377814285063e-06, + "loss": 0.6279, + "step": 68 + }, + { + "epoch": 0.054634532587717125, + "grad_norm": 0.24054326057001302, + "learning_rate": 1.998797367543e-06, + "loss": 0.6181, + "step": 69 + }, + { + "epoch": 0.05542633740782897, + "grad_norm": 0.24339247767164976, + "learning_rate": 1.998756263380617e-06, + "loss": 0.6164, + "step": 70 + }, + { + "epoch": 0.05621814222794081, + "grad_norm": 0.25210836407538834, + "learning_rate": 1.998714468969764e-06, + "loss": 0.6249, + "step": 71 + }, + { + "epoch": 0.05700994704805266, + "grad_norm": 0.25071208596853567, + "learning_rate": 1.9986719843393265e-06, + "loss": 0.6338, + "step": 72 + }, + { + "epoch": 0.0578017518681645, + "grad_norm": 0.23776914423319057, + "learning_rate": 1.998628809518665e-06, + "loss": 0.6033, + "step": 73 + }, + { + "epoch": 0.05859355668827634, + "grad_norm": 0.2453910024467796, + "learning_rate": 1.998584944537618e-06, + "loss": 0.6073, + "step": 74 + }, + { + "epoch": 0.05938536150838818, + "grad_norm": 0.24394938336098929, + "learning_rate": 1.9985403894265017e-06, + "loss": 0.6223, + "step": 75 + }, + { + "epoch": 0.06017716632850002, + "grad_norm": 0.2505817471928499, + "learning_rate": 1.998495144216108e-06, + "loss": 0.6102, + "step": 76 + }, + { + "epoch": 0.06096897114861187, + "grad_norm": 0.2437926189291882, + "learning_rate": 1.9984492089377057e-06, + "loss": 0.6023, + "step": 77 + }, + { + "epoch": 0.06176077596872371, + "grad_norm": 0.24885762929340935, + "learning_rate": 1.998402583623042e-06, + "loss": 0.6075, + "step": 78 + }, + { + "epoch": 0.06255258078883555, + "grad_norm": 0.24551220308165528, + "learning_rate": 1.9983552683043395e-06, + "loss": 0.5987, + "step": 79 + }, + { + "epoch": 0.0633443856089474, + "grad_norm": 0.268359520348399, + "learning_rate": 1.998307263014298e-06, + "loss": 0.6105, + "step": 80 + }, + { + "epoch": 0.06413619042905924, + "grad_norm": 0.24925244015298842, + "learning_rate": 1.9982585677860954e-06, + "loss": 0.6138, + "step": 81 + }, + { + "epoch": 0.06492799524917107, + "grad_norm": 0.24162772926660567, + "learning_rate": 1.998209182653384e-06, + "loss": 0.5903, + "step": 82 + }, + { + "epoch": 0.06571980006928292, + "grad_norm": 0.24781516071224408, + "learning_rate": 1.998159107650295e-06, + "loss": 0.6212, + "step": 83 + }, + { + "epoch": 0.06651160488939477, + "grad_norm": 0.24192761327310594, + "learning_rate": 1.9981083428114355e-06, + "loss": 0.6137, + "step": 84 + }, + { + "epoch": 0.0673034097095066, + "grad_norm": 0.24717391550896387, + "learning_rate": 1.99805688817189e-06, + "loss": 0.6203, + "step": 85 + }, + { + "epoch": 0.06809521452961845, + "grad_norm": 0.24088249501234663, + "learning_rate": 1.998004743767218e-06, + "loss": 0.6208, + "step": 86 + }, + { + "epoch": 0.0688870193497303, + "grad_norm": 0.23821640997792096, + "learning_rate": 1.997951909633458e-06, + "loss": 0.6302, + "step": 87 + }, + { + "epoch": 0.06967882416984214, + "grad_norm": 0.24287262149822414, + "learning_rate": 1.9978983858071236e-06, + "loss": 0.6116, + "step": 88 + }, + { + "epoch": 0.07047062898995397, + "grad_norm": 0.24798287495108298, + "learning_rate": 1.9978441723252055e-06, + "loss": 0.6125, + "step": 89 + }, + { + "epoch": 0.07126243381006582, + "grad_norm": 0.2392397494962519, + "learning_rate": 1.9977892692251715e-06, + "loss": 0.591, + "step": 90 + }, + { + "epoch": 0.07205423863017767, + "grad_norm": 0.24837547105981, + "learning_rate": 1.997733676544965e-06, + "loss": 0.6278, + "step": 91 + }, + { + "epoch": 0.0728460434502895, + "grad_norm": 0.2409248642894013, + "learning_rate": 1.997677394323007e-06, + "loss": 0.6144, + "step": 92 + }, + { + "epoch": 0.07363784827040135, + "grad_norm": 0.24434129583738856, + "learning_rate": 1.997620422598195e-06, + "loss": 0.6261, + "step": 93 + }, + { + "epoch": 0.07442965309051319, + "grad_norm": 0.23785260817816306, + "learning_rate": 1.9975627614099018e-06, + "loss": 0.6137, + "step": 94 + }, + { + "epoch": 0.07522145791062503, + "grad_norm": 0.2582438980418932, + "learning_rate": 1.997504410797977e-06, + "loss": 0.6155, + "step": 95 + }, + { + "epoch": 0.07601326273073687, + "grad_norm": 0.23782100531057535, + "learning_rate": 1.997445370802749e-06, + "loss": 0.5945, + "step": 96 + }, + { + "epoch": 0.07680506755084872, + "grad_norm": 0.24354341473147686, + "learning_rate": 1.997385641465019e-06, + "loss": 0.605, + "step": 97 + }, + { + "epoch": 0.07759687237096057, + "grad_norm": 0.24085340933783522, + "learning_rate": 1.9973252228260678e-06, + "loss": 0.6023, + "step": 98 + }, + { + "epoch": 0.0783886771910724, + "grad_norm": 0.24648161069179864, + "learning_rate": 1.9972641149276506e-06, + "loss": 0.616, + "step": 99 + }, + { + "epoch": 0.07918048201118424, + "grad_norm": 0.2544677163439913, + "learning_rate": 1.997202317811999e-06, + "loss": 0.6067, + "step": 100 + }, + { + "epoch": 0.07997228683129609, + "grad_norm": 0.24490827872109347, + "learning_rate": 1.9971398315218223e-06, + "loss": 0.6237, + "step": 101 + }, + { + "epoch": 0.08076409165140792, + "grad_norm": 0.25703203869900015, + "learning_rate": 1.997076656100305e-06, + "loss": 0.6185, + "step": 102 + }, + { + "epoch": 0.08155589647151977, + "grad_norm": 0.23887272337607254, + "learning_rate": 1.9970127915911083e-06, + "loss": 0.6287, + "step": 103 + }, + { + "epoch": 0.08234770129163162, + "grad_norm": 0.2347393500247831, + "learning_rate": 1.996948238038369e-06, + "loss": 0.6212, + "step": 104 + }, + { + "epoch": 0.08313950611174345, + "grad_norm": 0.23400474153598683, + "learning_rate": 1.9968829954867015e-06, + "loss": 0.6063, + "step": 105 + }, + { + "epoch": 0.0839313109318553, + "grad_norm": 0.25444492337175667, + "learning_rate": 1.9968170639811943e-06, + "loss": 0.6015, + "step": 106 + }, + { + "epoch": 0.08472311575196714, + "grad_norm": 0.24767706489511598, + "learning_rate": 1.9967504435674142e-06, + "loss": 0.6151, + "step": 107 + }, + { + "epoch": 0.08551492057207898, + "grad_norm": 0.24857739706161022, + "learning_rate": 1.9966831342914025e-06, + "loss": 0.6068, + "step": 108 + }, + { + "epoch": 0.08630672539219082, + "grad_norm": 0.24013957818221426, + "learning_rate": 1.9966151361996774e-06, + "loss": 0.6221, + "step": 109 + }, + { + "epoch": 0.08709853021230267, + "grad_norm": 0.2552787663397809, + "learning_rate": 1.9965464493392326e-06, + "loss": 0.6185, + "step": 110 + }, + { + "epoch": 0.08789033503241452, + "grad_norm": 0.24393863343993738, + "learning_rate": 1.996477073757539e-06, + "loss": 0.6106, + "step": 111 + }, + { + "epoch": 0.08868213985252635, + "grad_norm": 0.24327566997291633, + "learning_rate": 1.9964070095025412e-06, + "loss": 0.6158, + "step": 112 + }, + { + "epoch": 0.0894739446726382, + "grad_norm": 0.2460212889025927, + "learning_rate": 1.9963362566226625e-06, + "loss": 0.5918, + "step": 113 + }, + { + "epoch": 0.09026574949275004, + "grad_norm": 0.24668386912004803, + "learning_rate": 1.9962648151668e-06, + "loss": 0.6068, + "step": 114 + }, + { + "epoch": 0.09105755431286187, + "grad_norm": 0.24064611635390737, + "learning_rate": 1.9961926851843284e-06, + "loss": 0.6109, + "step": 115 + }, + { + "epoch": 0.09184935913297372, + "grad_norm": 0.2484601429962744, + "learning_rate": 1.9961198667250963e-06, + "loss": 0.605, + "step": 116 + }, + { + "epoch": 0.09264116395308557, + "grad_norm": 0.24394465737688995, + "learning_rate": 1.9960463598394293e-06, + "loss": 0.6286, + "step": 117 + }, + { + "epoch": 0.0934329687731974, + "grad_norm": 0.24679080532649691, + "learning_rate": 1.995972164578129e-06, + "loss": 0.5859, + "step": 118 + }, + { + "epoch": 0.09422477359330925, + "grad_norm": 0.25371887380609054, + "learning_rate": 1.995897280992472e-06, + "loss": 0.6024, + "step": 119 + }, + { + "epoch": 0.0950165784134211, + "grad_norm": 0.24806492536949867, + "learning_rate": 1.995821709134211e-06, + "loss": 0.6174, + "step": 120 + }, + { + "epoch": 0.09580838323353294, + "grad_norm": 0.2565291335234416, + "learning_rate": 1.9957454490555745e-06, + "loss": 0.6117, + "step": 121 + }, + { + "epoch": 0.09660018805364477, + "grad_norm": 0.2510354896290779, + "learning_rate": 1.995668500809266e-06, + "loss": 0.6083, + "step": 122 + }, + { + "epoch": 0.09739199287375662, + "grad_norm": 0.25122352500446965, + "learning_rate": 1.995590864448466e-06, + "loss": 0.6171, + "step": 123 + }, + { + "epoch": 0.09818379769386847, + "grad_norm": 0.2454319191209234, + "learning_rate": 1.995512540026829e-06, + "loss": 0.6081, + "step": 124 + }, + { + "epoch": 0.0989756025139803, + "grad_norm": 0.2520995178709679, + "learning_rate": 1.995433527598485e-06, + "loss": 0.6001, + "step": 125 + }, + { + "epoch": 0.09976740733409215, + "grad_norm": 0.24895886123859298, + "learning_rate": 1.995353827218041e-06, + "loss": 0.6229, + "step": 126 + }, + { + "epoch": 0.10055921215420399, + "grad_norm": 0.2594511545591342, + "learning_rate": 1.9952734389405786e-06, + "loss": 0.6069, + "step": 127 + }, + { + "epoch": 0.10135101697431582, + "grad_norm": 0.24667835691941334, + "learning_rate": 1.995192362821654e-06, + "loss": 0.6275, + "step": 128 + }, + { + "epoch": 0.10214282179442767, + "grad_norm": 0.24318685595006123, + "learning_rate": 1.9951105989173006e-06, + "loss": 0.6118, + "step": 129 + }, + { + "epoch": 0.10293462661453952, + "grad_norm": 0.24353911392852584, + "learning_rate": 1.9950281472840255e-06, + "loss": 0.6099, + "step": 130 + }, + { + "epoch": 0.10372643143465136, + "grad_norm": 0.24328214159825529, + "learning_rate": 1.9949450079788114e-06, + "loss": 0.595, + "step": 131 + }, + { + "epoch": 0.1045182362547632, + "grad_norm": 0.233892898891033, + "learning_rate": 1.9948611810591172e-06, + "loss": 0.6148, + "step": 132 + }, + { + "epoch": 0.10531004107487504, + "grad_norm": 0.23766448029852974, + "learning_rate": 1.9947766665828758e-06, + "loss": 0.5807, + "step": 133 + }, + { + "epoch": 0.10610184589498689, + "grad_norm": 0.2379454146592536, + "learning_rate": 1.9946914646084963e-06, + "loss": 0.6267, + "step": 134 + }, + { + "epoch": 0.10689365071509872, + "grad_norm": 0.24945201019879407, + "learning_rate": 1.9946055751948624e-06, + "loss": 0.6259, + "step": 135 + }, + { + "epoch": 0.10768545553521057, + "grad_norm": 0.24891292482229235, + "learning_rate": 1.9945189984013325e-06, + "loss": 0.6128, + "step": 136 + }, + { + "epoch": 0.10847726035532242, + "grad_norm": 0.2525902603363248, + "learning_rate": 1.994431734287741e-06, + "loss": 0.5994, + "step": 137 + }, + { + "epoch": 0.10926906517543425, + "grad_norm": 0.25443424584665786, + "learning_rate": 1.994343782914396e-06, + "loss": 0.5965, + "step": 138 + }, + { + "epoch": 0.1100608699955461, + "grad_norm": 0.24778523468262087, + "learning_rate": 1.994255144342083e-06, + "loss": 0.6066, + "step": 139 + }, + { + "epoch": 0.11085267481565794, + "grad_norm": 0.24549555159772826, + "learning_rate": 1.9941658186320596e-06, + "loss": 0.6109, + "step": 140 + }, + { + "epoch": 0.11164447963576979, + "grad_norm": 0.2459614675472423, + "learning_rate": 1.9940758058460596e-06, + "loss": 0.6246, + "step": 141 + }, + { + "epoch": 0.11243628445588162, + "grad_norm": 0.25382226746913006, + "learning_rate": 1.993985106046292e-06, + "loss": 0.6281, + "step": 142 + }, + { + "epoch": 0.11322808927599347, + "grad_norm": 0.23892895275893525, + "learning_rate": 1.9938937192954394e-06, + "loss": 0.6151, + "step": 143 + }, + { + "epoch": 0.11401989409610531, + "grad_norm": 0.24606184796512073, + "learning_rate": 1.993801645656661e-06, + "loss": 0.6215, + "step": 144 + }, + { + "epoch": 0.11481169891621715, + "grad_norm": 0.2421058351251952, + "learning_rate": 1.993708885193589e-06, + "loss": 0.6148, + "step": 145 + }, + { + "epoch": 0.115603503736329, + "grad_norm": 0.24631536565411302, + "learning_rate": 1.9936154379703307e-06, + "loss": 0.6033, + "step": 146 + }, + { + "epoch": 0.11639530855644084, + "grad_norm": 0.24147837800339966, + "learning_rate": 1.9935213040514685e-06, + "loss": 0.6093, + "step": 147 + }, + { + "epoch": 0.11718711337655267, + "grad_norm": 0.24339817339043665, + "learning_rate": 1.9934264835020596e-06, + "loss": 0.6259, + "step": 148 + }, + { + "epoch": 0.11797891819666452, + "grad_norm": 0.23842551166815323, + "learning_rate": 1.9933309763876342e-06, + "loss": 0.6171, + "step": 149 + }, + { + "epoch": 0.11877072301677637, + "grad_norm": 0.24104605193929682, + "learning_rate": 1.9932347827741987e-06, + "loss": 0.6078, + "step": 150 + }, + { + "epoch": 0.11956252783688821, + "grad_norm": 0.24688252589023799, + "learning_rate": 1.9931379027282333e-06, + "loss": 0.6215, + "step": 151 + }, + { + "epoch": 0.12035433265700005, + "grad_norm": 0.25022709961910333, + "learning_rate": 1.9930403363166925e-06, + "loss": 0.6097, + "step": 152 + }, + { + "epoch": 0.12114613747711189, + "grad_norm": 0.2525497601786903, + "learning_rate": 1.992942083607005e-06, + "loss": 0.5999, + "step": 153 + }, + { + "epoch": 0.12193794229722374, + "grad_norm": 0.24582397817659582, + "learning_rate": 1.992843144667074e-06, + "loss": 0.5911, + "step": 154 + }, + { + "epoch": 0.12272974711733557, + "grad_norm": 0.24517082411379587, + "learning_rate": 1.9927435195652776e-06, + "loss": 0.6055, + "step": 155 + }, + { + "epoch": 0.12352155193744742, + "grad_norm": 0.24686986627179539, + "learning_rate": 1.9926432083704666e-06, + "loss": 0.6133, + "step": 156 + }, + { + "epoch": 0.12431335675755927, + "grad_norm": 0.24124754454063155, + "learning_rate": 1.9925422111519678e-06, + "loss": 0.6078, + "step": 157 + }, + { + "epoch": 0.1251051615776711, + "grad_norm": 0.25064281296316676, + "learning_rate": 1.992440527979581e-06, + "loss": 0.6087, + "step": 158 + }, + { + "epoch": 0.12589696639778294, + "grad_norm": 0.2422033815243161, + "learning_rate": 1.9923381589235794e-06, + "loss": 0.5948, + "step": 159 + }, + { + "epoch": 0.1266887712178948, + "grad_norm": 0.25651784949814377, + "learning_rate": 1.992235104054712e-06, + "loss": 0.6184, + "step": 160 + }, + { + "epoch": 0.12748057603800664, + "grad_norm": 0.24005721640899194, + "learning_rate": 1.9921313634442005e-06, + "loss": 0.6178, + "step": 161 + }, + { + "epoch": 0.12827238085811848, + "grad_norm": 0.24352522262928494, + "learning_rate": 1.9920269371637407e-06, + "loss": 0.6085, + "step": 162 + }, + { + "epoch": 0.1290641856782303, + "grad_norm": 0.24623314726006892, + "learning_rate": 1.991921825285503e-06, + "loss": 0.6081, + "step": 163 + }, + { + "epoch": 0.12985599049834215, + "grad_norm": 0.2553394676667384, + "learning_rate": 1.99181602788213e-06, + "loss": 0.6123, + "step": 164 + }, + { + "epoch": 0.130647795318454, + "grad_norm": 0.25484888153943747, + "learning_rate": 1.9917095450267405e-06, + "loss": 0.6053, + "step": 165 + }, + { + "epoch": 0.13143960013856584, + "grad_norm": 0.2451398623016501, + "learning_rate": 1.991602376792925e-06, + "loss": 0.5935, + "step": 166 + }, + { + "epoch": 0.1322314049586777, + "grad_norm": 0.23374157148454733, + "learning_rate": 1.9914945232547486e-06, + "loss": 0.5949, + "step": 167 + }, + { + "epoch": 0.13302320977878954, + "grad_norm": 0.2423950980430432, + "learning_rate": 1.9913859844867493e-06, + "loss": 0.6051, + "step": 168 + }, + { + "epoch": 0.13381501459890138, + "grad_norm": 0.2400913194454132, + "learning_rate": 1.9912767605639387e-06, + "loss": 0.6243, + "step": 169 + }, + { + "epoch": 0.1346068194190132, + "grad_norm": 0.24404491950360782, + "learning_rate": 1.9911668515618033e-06, + "loss": 0.5875, + "step": 170 + }, + { + "epoch": 0.13539862423912505, + "grad_norm": 0.24678318176788785, + "learning_rate": 1.991056257556302e-06, + "loss": 0.6096, + "step": 171 + }, + { + "epoch": 0.1361904290592369, + "grad_norm": 0.25735257644134474, + "learning_rate": 1.9909449786238666e-06, + "loss": 0.6201, + "step": 172 + }, + { + "epoch": 0.13698223387934874, + "grad_norm": 0.2519557865307485, + "learning_rate": 1.990833014841403e-06, + "loss": 0.6067, + "step": 173 + }, + { + "epoch": 0.1377740386994606, + "grad_norm": 0.23885770861285516, + "learning_rate": 1.9907203662862906e-06, + "loss": 0.6019, + "step": 174 + }, + { + "epoch": 0.13856584351957243, + "grad_norm": 0.24925712376363354, + "learning_rate": 1.990607033036382e-06, + "loss": 0.608, + "step": 175 + }, + { + "epoch": 0.13935764833968428, + "grad_norm": 0.24712204252168293, + "learning_rate": 1.990493015170002e-06, + "loss": 0.6057, + "step": 176 + }, + { + "epoch": 0.1401494531597961, + "grad_norm": 0.255062892374803, + "learning_rate": 1.990378312765949e-06, + "loss": 0.6024, + "step": 177 + }, + { + "epoch": 0.14094125797990795, + "grad_norm": 0.2412239226129293, + "learning_rate": 1.990262925903496e-06, + "loss": 0.592, + "step": 178 + }, + { + "epoch": 0.1417330628000198, + "grad_norm": 0.24100115776441883, + "learning_rate": 1.9901468546623867e-06, + "loss": 0.6161, + "step": 179 + }, + { + "epoch": 0.14252486762013164, + "grad_norm": 0.2379116890638869, + "learning_rate": 1.9900300991228397e-06, + "loss": 0.5902, + "step": 180 + }, + { + "epoch": 0.1433166724402435, + "grad_norm": 0.2557311003501454, + "learning_rate": 1.9899126593655448e-06, + "loss": 0.6023, + "step": 181 + }, + { + "epoch": 0.14410847726035533, + "grad_norm": 0.2455583575343663, + "learning_rate": 1.9897945354716662e-06, + "loss": 0.6094, + "step": 182 + }, + { + "epoch": 0.14490028208046715, + "grad_norm": 0.24370628898940228, + "learning_rate": 1.9896757275228403e-06, + "loss": 0.6078, + "step": 183 + }, + { + "epoch": 0.145692086900579, + "grad_norm": 0.23759579081298693, + "learning_rate": 1.989556235601176e-06, + "loss": 0.6039, + "step": 184 + }, + { + "epoch": 0.14648389172069085, + "grad_norm": 0.24391920970902445, + "learning_rate": 1.9894360597892543e-06, + "loss": 0.6078, + "step": 185 + }, + { + "epoch": 0.1472756965408027, + "grad_norm": 0.24845711826445283, + "learning_rate": 1.989315200170131e-06, + "loss": 0.6328, + "step": 186 + }, + { + "epoch": 0.14806750136091454, + "grad_norm": 0.2616752365478856, + "learning_rate": 1.989193656827333e-06, + "loss": 0.6046, + "step": 187 + }, + { + "epoch": 0.14885930618102639, + "grad_norm": 0.2446871338572912, + "learning_rate": 1.989071429844859e-06, + "loss": 0.6061, + "step": 188 + }, + { + "epoch": 0.14965111100113823, + "grad_norm": 0.24408589622192395, + "learning_rate": 1.988948519307182e-06, + "loss": 0.6045, + "step": 189 + }, + { + "epoch": 0.15044291582125005, + "grad_norm": 0.23962545305324873, + "learning_rate": 1.9888249252992457e-06, + "loss": 0.6145, + "step": 190 + }, + { + "epoch": 0.1512347206413619, + "grad_norm": 0.25855035028698975, + "learning_rate": 1.9887006479064676e-06, + "loss": 0.622, + "step": 191 + }, + { + "epoch": 0.15202652546147374, + "grad_norm": 0.25096403621981855, + "learning_rate": 1.9885756872147367e-06, + "loss": 0.599, + "step": 192 + }, + { + "epoch": 0.1528183302815856, + "grad_norm": 0.2456345168057193, + "learning_rate": 1.9884500433104133e-06, + "loss": 0.5971, + "step": 193 + }, + { + "epoch": 0.15361013510169744, + "grad_norm": 0.24697511302923733, + "learning_rate": 1.988323716280332e-06, + "loss": 0.6172, + "step": 194 + }, + { + "epoch": 0.15440193992180928, + "grad_norm": 0.2498250649439413, + "learning_rate": 1.9881967062117985e-06, + "loss": 0.6202, + "step": 195 + }, + { + "epoch": 0.15519374474192113, + "grad_norm": 0.2477427405714654, + "learning_rate": 1.9880690131925897e-06, + "loss": 0.6028, + "step": 196 + }, + { + "epoch": 0.15598554956203295, + "grad_norm": 0.25368523649798064, + "learning_rate": 1.987940637310956e-06, + "loss": 0.6013, + "step": 197 + }, + { + "epoch": 0.1567773543821448, + "grad_norm": 0.36000625045653284, + "learning_rate": 1.987811578655618e-06, + "loss": 0.6118, + "step": 198 + }, + { + "epoch": 0.15756915920225664, + "grad_norm": 0.24669657811244383, + "learning_rate": 1.9876818373157706e-06, + "loss": 0.6099, + "step": 199 + }, + { + "epoch": 0.1583609640223685, + "grad_norm": 0.24717856235092012, + "learning_rate": 1.987551413381078e-06, + "loss": 0.5989, + "step": 200 + }, + { + "epoch": 0.15915276884248034, + "grad_norm": 0.24166194958075368, + "learning_rate": 1.9874203069416774e-06, + "loss": 0.6095, + "step": 201 + }, + { + "epoch": 0.15994457366259218, + "grad_norm": 0.2432222315837849, + "learning_rate": 1.987288518088178e-06, + "loss": 0.5939, + "step": 202 + }, + { + "epoch": 0.160736378482704, + "grad_norm": 0.25616990344712176, + "learning_rate": 1.9871560469116597e-06, + "loss": 0.6319, + "step": 203 + }, + { + "epoch": 0.16152818330281585, + "grad_norm": 0.25124893412477034, + "learning_rate": 1.9870228935036743e-06, + "loss": 0.6104, + "step": 204 + }, + { + "epoch": 0.1623199881229277, + "grad_norm": 0.2533422007446896, + "learning_rate": 1.9868890579562456e-06, + "loss": 0.6187, + "step": 205 + }, + { + "epoch": 0.16311179294303954, + "grad_norm": 0.24291758536714064, + "learning_rate": 1.9867545403618678e-06, + "loss": 0.6145, + "step": 206 + }, + { + "epoch": 0.1639035977631514, + "grad_norm": 0.24321672867435246, + "learning_rate": 1.9866193408135083e-06, + "loss": 0.6037, + "step": 207 + }, + { + "epoch": 0.16469540258326323, + "grad_norm": 0.24239392661935416, + "learning_rate": 1.986483459404603e-06, + "loss": 0.6176, + "step": 208 + }, + { + "epoch": 0.16548720740337508, + "grad_norm": 0.2453605563740051, + "learning_rate": 1.9863468962290616e-06, + "loss": 0.6035, + "step": 209 + }, + { + "epoch": 0.1662790122234869, + "grad_norm": 0.25150165507645755, + "learning_rate": 1.986209651381264e-06, + "loss": 0.5926, + "step": 210 + }, + { + "epoch": 0.16707081704359875, + "grad_norm": 0.25363285848077716, + "learning_rate": 1.986071724956061e-06, + "loss": 0.6122, + "step": 211 + }, + { + "epoch": 0.1678626218637106, + "grad_norm": 0.24764125273102747, + "learning_rate": 1.9859331170487748e-06, + "loss": 0.605, + "step": 212 + }, + { + "epoch": 0.16865442668382244, + "grad_norm": 0.24485071327883146, + "learning_rate": 1.9857938277551983e-06, + "loss": 0.6112, + "step": 213 + }, + { + "epoch": 0.16944623150393429, + "grad_norm": 0.24319134264447712, + "learning_rate": 1.9856538571715954e-06, + "loss": 0.6087, + "step": 214 + }, + { + "epoch": 0.17023803632404613, + "grad_norm": 0.24594381812896973, + "learning_rate": 1.985513205394701e-06, + "loss": 0.5984, + "step": 215 + }, + { + "epoch": 0.17102984114415795, + "grad_norm": 0.24682994996910132, + "learning_rate": 1.9853718725217207e-06, + "loss": 0.607, + "step": 216 + }, + { + "epoch": 0.1718216459642698, + "grad_norm": 0.2523031853877198, + "learning_rate": 1.9852298586503305e-06, + "loss": 0.6154, + "step": 217 + }, + { + "epoch": 0.17261345078438164, + "grad_norm": 0.25940404563218156, + "learning_rate": 1.985087163878678e-06, + "loss": 0.6201, + "step": 218 + }, + { + "epoch": 0.1734052556044935, + "grad_norm": 0.24143005184833266, + "learning_rate": 1.9849437883053802e-06, + "loss": 0.6021, + "step": 219 + }, + { + "epoch": 0.17419706042460534, + "grad_norm": 0.24284774715761134, + "learning_rate": 1.984799732029525e-06, + "loss": 0.6012, + "step": 220 + }, + { + "epoch": 0.17498886524471718, + "grad_norm": 0.24534087775707256, + "learning_rate": 1.9846549951506712e-06, + "loss": 0.6207, + "step": 221 + }, + { + "epoch": 0.17578067006482903, + "grad_norm": 0.24717583062758106, + "learning_rate": 1.984509577768847e-06, + "loss": 0.6202, + "step": 222 + }, + { + "epoch": 0.17657247488494085, + "grad_norm": 0.24145918806305072, + "learning_rate": 1.984363479984552e-06, + "loss": 0.6084, + "step": 223 + }, + { + "epoch": 0.1773642797050527, + "grad_norm": 0.24360708384916321, + "learning_rate": 1.984216701898756e-06, + "loss": 0.6015, + "step": 224 + }, + { + "epoch": 0.17815608452516454, + "grad_norm": 0.25269848624865143, + "learning_rate": 1.9840692436128975e-06, + "loss": 0.595, + "step": 225 + }, + { + "epoch": 0.1789478893452764, + "grad_norm": 0.24271430989679857, + "learning_rate": 1.9839211052288873e-06, + "loss": 0.5995, + "step": 226 + }, + { + "epoch": 0.17973969416538824, + "grad_norm": 0.24759560592499152, + "learning_rate": 1.983772286849104e-06, + "loss": 0.5909, + "step": 227 + }, + { + "epoch": 0.18053149898550008, + "grad_norm": 0.25407305995874085, + "learning_rate": 1.9836227885763975e-06, + "loss": 0.5981, + "step": 228 + }, + { + "epoch": 0.18132330380561193, + "grad_norm": 0.24923103654822074, + "learning_rate": 1.9834726105140873e-06, + "loss": 0.5969, + "step": 229 + }, + { + "epoch": 0.18211510862572375, + "grad_norm": 0.2495011332612613, + "learning_rate": 1.9833217527659624e-06, + "loss": 0.5936, + "step": 230 + }, + { + "epoch": 0.1829069134458356, + "grad_norm": 0.25895094181331907, + "learning_rate": 1.9831702154362825e-06, + "loss": 0.5915, + "step": 231 + }, + { + "epoch": 0.18369871826594744, + "grad_norm": 0.25319893187249265, + "learning_rate": 1.9830179986297755e-06, + "loss": 0.5982, + "step": 232 + }, + { + "epoch": 0.1844905230860593, + "grad_norm": 0.24279819253802182, + "learning_rate": 1.98286510245164e-06, + "loss": 0.6067, + "step": 233 + }, + { + "epoch": 0.18528232790617113, + "grad_norm": 0.2619794566140669, + "learning_rate": 1.9827115270075438e-06, + "loss": 0.6116, + "step": 234 + }, + { + "epoch": 0.18607413272628298, + "grad_norm": 0.24762634464898894, + "learning_rate": 1.9825572724036242e-06, + "loss": 0.5835, + "step": 235 + }, + { + "epoch": 0.1868659375463948, + "grad_norm": 0.23136911235072175, + "learning_rate": 1.9824023387464877e-06, + "loss": 0.6147, + "step": 236 + }, + { + "epoch": 0.18765774236650665, + "grad_norm": 0.2635243334169113, + "learning_rate": 1.9822467261432097e-06, + "loss": 0.6159, + "step": 237 + }, + { + "epoch": 0.1884495471866185, + "grad_norm": 0.25635271967861756, + "learning_rate": 1.9820904347013364e-06, + "loss": 0.614, + "step": 238 + }, + { + "epoch": 0.18924135200673034, + "grad_norm": 0.24759710351741077, + "learning_rate": 1.9819334645288813e-06, + "loss": 0.604, + "step": 239 + }, + { + "epoch": 0.1900331568268422, + "grad_norm": 0.25418645288516356, + "learning_rate": 1.9817758157343274e-06, + "loss": 0.5995, + "step": 240 + }, + { + "epoch": 0.19082496164695403, + "grad_norm": 0.24625861500046178, + "learning_rate": 1.981617488426628e-06, + "loss": 0.6105, + "step": 241 + }, + { + "epoch": 0.19161676646706588, + "grad_norm": 0.2477603992651197, + "learning_rate": 1.981458482715204e-06, + "loss": 0.5975, + "step": 242 + }, + { + "epoch": 0.1924085712871777, + "grad_norm": 0.25403326190627773, + "learning_rate": 1.9812987987099454e-06, + "loss": 0.607, + "step": 243 + }, + { + "epoch": 0.19320037610728955, + "grad_norm": 0.254854496754908, + "learning_rate": 1.9811384365212108e-06, + "loss": 0.595, + "step": 244 + }, + { + "epoch": 0.1939921809274014, + "grad_norm": 0.24897773797625972, + "learning_rate": 1.9809773962598285e-06, + "loss": 0.6066, + "step": 245 + }, + { + "epoch": 0.19478398574751324, + "grad_norm": 0.257037218773845, + "learning_rate": 1.980815678037094e-06, + "loss": 0.6144, + "step": 246 + }, + { + "epoch": 0.19557579056762509, + "grad_norm": 0.24510867554760934, + "learning_rate": 1.980653281964773e-06, + "loss": 0.6091, + "step": 247 + }, + { + "epoch": 0.19636759538773693, + "grad_norm": 0.25553709307934286, + "learning_rate": 1.9804902081550976e-06, + "loss": 0.6204, + "step": 248 + }, + { + "epoch": 0.19715940020784878, + "grad_norm": 0.24653872920986297, + "learning_rate": 1.98032645672077e-06, + "loss": 0.6101, + "step": 249 + }, + { + "epoch": 0.1979512050279606, + "grad_norm": 0.2639555201466776, + "learning_rate": 1.9801620277749603e-06, + "loss": 0.613, + "step": 250 + }, + { + "epoch": 0.19874300984807244, + "grad_norm": 0.24866230382323729, + "learning_rate": 1.9799969214313065e-06, + "loss": 0.5997, + "step": 251 + }, + { + "epoch": 0.1995348146681843, + "grad_norm": 0.24273609462370518, + "learning_rate": 1.9798311378039146e-06, + "loss": 0.5914, + "step": 252 + }, + { + "epoch": 0.20032661948829614, + "grad_norm": 0.2506812812970362, + "learning_rate": 1.9796646770073594e-06, + "loss": 0.6215, + "step": 253 + }, + { + "epoch": 0.20111842430840798, + "grad_norm": 0.24602122825448242, + "learning_rate": 1.9794975391566832e-06, + "loss": 0.6155, + "step": 254 + }, + { + "epoch": 0.20191022912851983, + "grad_norm": 0.24750713225507665, + "learning_rate": 1.979329724367396e-06, + "loss": 0.6088, + "step": 255 + }, + { + "epoch": 0.20270203394863165, + "grad_norm": 0.24092722636200453, + "learning_rate": 1.9791612327554768e-06, + "loss": 0.6022, + "step": 256 + }, + { + "epoch": 0.2034938387687435, + "grad_norm": 0.2444047578034734, + "learning_rate": 1.9789920644373706e-06, + "loss": 0.611, + "step": 257 + }, + { + "epoch": 0.20428564358885534, + "grad_norm": 0.2508034294267427, + "learning_rate": 1.978822219529991e-06, + "loss": 0.6042, + "step": 258 + }, + { + "epoch": 0.2050774484089672, + "grad_norm": 0.2485192806241367, + "learning_rate": 1.97865169815072e-06, + "loss": 0.593, + "step": 259 + }, + { + "epoch": 0.20586925322907904, + "grad_norm": 0.25986543914571764, + "learning_rate": 1.9784805004174053e-06, + "loss": 0.6111, + "step": 260 + }, + { + "epoch": 0.20666105804919088, + "grad_norm": 0.24888504318885482, + "learning_rate": 1.978308626448364e-06, + "loss": 0.6137, + "step": 261 + }, + { + "epoch": 0.20745286286930273, + "grad_norm": 0.2509505465202432, + "learning_rate": 1.978136076362379e-06, + "loss": 0.5987, + "step": 262 + }, + { + "epoch": 0.20824466768941455, + "grad_norm": 0.23819610108431402, + "learning_rate": 1.977962850278701e-06, + "loss": 0.6001, + "step": 263 + }, + { + "epoch": 0.2090364725095264, + "grad_norm": 0.25056741041097214, + "learning_rate": 1.977788948317048e-06, + "loss": 0.6033, + "step": 264 + }, + { + "epoch": 0.20982827732963824, + "grad_norm": 0.24498731933250023, + "learning_rate": 1.977614370597605e-06, + "loss": 0.5956, + "step": 265 + }, + { + "epoch": 0.2106200821497501, + "grad_norm": 0.24759436752884384, + "learning_rate": 1.9774391172410243e-06, + "loss": 0.6031, + "step": 266 + }, + { + "epoch": 0.21141188696986193, + "grad_norm": 0.24667679841911644, + "learning_rate": 1.977263188368425e-06, + "loss": 0.5989, + "step": 267 + }, + { + "epoch": 0.21220369178997378, + "grad_norm": 0.24341767258742814, + "learning_rate": 1.977086584101393e-06, + "loss": 0.5969, + "step": 268 + }, + { + "epoch": 0.21299549661008563, + "grad_norm": 0.26466260911509654, + "learning_rate": 1.97690930456198e-06, + "loss": 0.6138, + "step": 269 + }, + { + "epoch": 0.21378730143019745, + "grad_norm": 0.2405167898745803, + "learning_rate": 1.9767313498727063e-06, + "loss": 0.6064, + "step": 270 + }, + { + "epoch": 0.2145791062503093, + "grad_norm": 0.24275673692261862, + "learning_rate": 1.9765527201565574e-06, + "loss": 0.5975, + "step": 271 + }, + { + "epoch": 0.21537091107042114, + "grad_norm": 1.1475853790832273, + "learning_rate": 1.976373415536986e-06, + "loss": 0.5973, + "step": 272 + }, + { + "epoch": 0.21616271589053299, + "grad_norm": 0.25371017212183866, + "learning_rate": 1.9761934361379107e-06, + "loss": 0.6184, + "step": 273 + }, + { + "epoch": 0.21695452071064483, + "grad_norm": 0.24533245392799563, + "learning_rate": 1.9760127820837163e-06, + "loss": 0.6082, + "step": 274 + }, + { + "epoch": 0.21774632553075668, + "grad_norm": 0.24420365359668406, + "learning_rate": 1.9758314534992555e-06, + "loss": 0.5992, + "step": 275 + }, + { + "epoch": 0.2185381303508685, + "grad_norm": 0.24787248749030424, + "learning_rate": 1.9756494505098446e-06, + "loss": 0.6163, + "step": 276 + }, + { + "epoch": 0.21932993517098034, + "grad_norm": 0.25578859269353377, + "learning_rate": 1.975466773241268e-06, + "loss": 0.6118, + "step": 277 + }, + { + "epoch": 0.2201217399910922, + "grad_norm": 0.2407539832006363, + "learning_rate": 1.9752834218197757e-06, + "loss": 0.6093, + "step": 278 + }, + { + "epoch": 0.22091354481120404, + "grad_norm": 0.24025331731692606, + "learning_rate": 1.975099396372083e-06, + "loss": 0.5972, + "step": 279 + }, + { + "epoch": 0.22170534963131588, + "grad_norm": 0.24577431048359374, + "learning_rate": 1.9749146970253705e-06, + "loss": 0.603, + "step": 280 + }, + { + "epoch": 0.22249715445142773, + "grad_norm": 0.25542977696523095, + "learning_rate": 1.974729323907287e-06, + "loss": 0.6088, + "step": 281 + }, + { + "epoch": 0.22328895927153958, + "grad_norm": 0.2406509357436267, + "learning_rate": 1.9745432771459447e-06, + "loss": 0.6077, + "step": 282 + }, + { + "epoch": 0.2240807640916514, + "grad_norm": 0.24706767269031316, + "learning_rate": 1.974356556869922e-06, + "loss": 0.5984, + "step": 283 + }, + { + "epoch": 0.22487256891176324, + "grad_norm": 0.24309976779774528, + "learning_rate": 1.9741691632082623e-06, + "loss": 0.5834, + "step": 284 + }, + { + "epoch": 0.2256643737318751, + "grad_norm": 0.24748833527613592, + "learning_rate": 1.9739810962904752e-06, + "loss": 0.5814, + "step": 285 + }, + { + "epoch": 0.22645617855198694, + "grad_norm": 0.2460942176494084, + "learning_rate": 1.9737923562465356e-06, + "loss": 0.6015, + "step": 286 + }, + { + "epoch": 0.22724798337209878, + "grad_norm": 0.25104869936231566, + "learning_rate": 1.973602943206883e-06, + "loss": 0.6084, + "step": 287 + }, + { + "epoch": 0.22803978819221063, + "grad_norm": 0.2476909073234987, + "learning_rate": 1.9734128573024217e-06, + "loss": 0.5882, + "step": 288 + }, + { + "epoch": 0.22883159301232248, + "grad_norm": 0.24021216899169395, + "learning_rate": 1.9732220986645227e-06, + "loss": 0.6065, + "step": 289 + }, + { + "epoch": 0.2296233978324343, + "grad_norm": 0.26552042901412143, + "learning_rate": 1.97303066742502e-06, + "loss": 0.6075, + "step": 290 + }, + { + "epoch": 0.23041520265254614, + "grad_norm": 0.2391744425572385, + "learning_rate": 1.9728385637162136e-06, + "loss": 0.5956, + "step": 291 + }, + { + "epoch": 0.231207007472658, + "grad_norm": 0.240755445252526, + "learning_rate": 1.972645787670868e-06, + "loss": 0.5951, + "step": 292 + }, + { + "epoch": 0.23199881229276983, + "grad_norm": 0.25114356374370583, + "learning_rate": 1.972452339422212e-06, + "loss": 0.5855, + "step": 293 + }, + { + "epoch": 0.23279061711288168, + "grad_norm": 0.2517140758585886, + "learning_rate": 1.9722582191039393e-06, + "loss": 0.6042, + "step": 294 + }, + { + "epoch": 0.23358242193299353, + "grad_norm": 0.24718597294358718, + "learning_rate": 1.972063426850208e-06, + "loss": 0.5769, + "step": 295 + }, + { + "epoch": 0.23437422675310535, + "grad_norm": 0.2482292753203727, + "learning_rate": 1.971867962795641e-06, + "loss": 0.5808, + "step": 296 + }, + { + "epoch": 0.2351660315732172, + "grad_norm": 0.24794499541491236, + "learning_rate": 1.971671827075324e-06, + "loss": 0.5968, + "step": 297 + }, + { + "epoch": 0.23595783639332904, + "grad_norm": 0.23903391096586601, + "learning_rate": 1.9714750198248096e-06, + "loss": 0.6063, + "step": 298 + }, + { + "epoch": 0.2367496412134409, + "grad_norm": 0.24515278358151316, + "learning_rate": 1.9712775411801116e-06, + "loss": 0.6026, + "step": 299 + }, + { + "epoch": 0.23754144603355273, + "grad_norm": 0.24847240615755817, + "learning_rate": 1.9710793912777094e-06, + "loss": 0.5978, + "step": 300 + }, + { + "epoch": 0.23833325085366458, + "grad_norm": 0.25146845692678066, + "learning_rate": 1.9708805702545458e-06, + "loss": 0.606, + "step": 301 + }, + { + "epoch": 0.23912505567377643, + "grad_norm": 0.2528429551784224, + "learning_rate": 1.970681078248028e-06, + "loss": 0.5836, + "step": 302 + }, + { + "epoch": 0.23991686049388825, + "grad_norm": 0.25244432858861543, + "learning_rate": 1.970480915396027e-06, + "loss": 0.6076, + "step": 303 + }, + { + "epoch": 0.2407086653140001, + "grad_norm": 0.2651349383906822, + "learning_rate": 1.9702800818368754e-06, + "loss": 0.5888, + "step": 304 + }, + { + "epoch": 0.24150047013411194, + "grad_norm": 0.23884501343673606, + "learning_rate": 1.9700785777093724e-06, + "loss": 0.6105, + "step": 305 + }, + { + "epoch": 0.24229227495422379, + "grad_norm": 0.2491501508439908, + "learning_rate": 1.969876403152778e-06, + "loss": 0.595, + "step": 306 + }, + { + "epoch": 0.24308407977433563, + "grad_norm": 0.25783710037749885, + "learning_rate": 1.9696735583068173e-06, + "loss": 0.6198, + "step": 307 + }, + { + "epoch": 0.24387588459444748, + "grad_norm": 0.24815004860558676, + "learning_rate": 1.9694700433116783e-06, + "loss": 0.5937, + "step": 308 + }, + { + "epoch": 0.2446676894145593, + "grad_norm": 0.24426378069410107, + "learning_rate": 1.969265858308011e-06, + "loss": 0.5973, + "step": 309 + }, + { + "epoch": 0.24545949423467114, + "grad_norm": 0.24771046129560645, + "learning_rate": 1.96906100343693e-06, + "loss": 0.5897, + "step": 310 + }, + { + "epoch": 0.246251299054783, + "grad_norm": 0.25749973668903764, + "learning_rate": 1.968855478840012e-06, + "loss": 0.6004, + "step": 311 + }, + { + "epoch": 0.24704310387489484, + "grad_norm": 0.248025320372043, + "learning_rate": 1.9686492846592963e-06, + "loss": 0.5783, + "step": 312 + }, + { + "epoch": 0.24783490869500668, + "grad_norm": 0.24984650039117945, + "learning_rate": 1.9684424210372857e-06, + "loss": 0.6122, + "step": 313 + }, + { + "epoch": 0.24862671351511853, + "grad_norm": 0.24415386192705926, + "learning_rate": 1.968234888116945e-06, + "loss": 0.5981, + "step": 314 + }, + { + "epoch": 0.24941851833523038, + "grad_norm": 0.2509073672891484, + "learning_rate": 1.9680266860417025e-06, + "loss": 0.5998, + "step": 315 + }, + { + "epoch": 0.2502103231553422, + "grad_norm": 0.2552178325232198, + "learning_rate": 1.9678178149554477e-06, + "loss": 0.5946, + "step": 316 + }, + { + "epoch": 0.25100212797545407, + "grad_norm": 0.24539374694206276, + "learning_rate": 1.9676082750025335e-06, + "loss": 0.6, + "step": 317 + }, + { + "epoch": 0.2517939327955659, + "grad_norm": 0.25067363359825495, + "learning_rate": 1.967398066327774e-06, + "loss": 0.5971, + "step": 318 + }, + { + "epoch": 0.2525857376156777, + "grad_norm": 0.2506825238389697, + "learning_rate": 1.9671871890764473e-06, + "loss": 0.6043, + "step": 319 + }, + { + "epoch": 0.2533775424357896, + "grad_norm": 0.2541507375488699, + "learning_rate": 1.966975643394291e-06, + "loss": 0.599, + "step": 320 + }, + { + "epoch": 0.2541693472559014, + "grad_norm": 0.23826752511006694, + "learning_rate": 1.966763429427507e-06, + "loss": 0.5895, + "step": 321 + }, + { + "epoch": 0.2549611520760133, + "grad_norm": 0.2567212307793121, + "learning_rate": 1.9665505473227575e-06, + "loss": 0.59, + "step": 322 + }, + { + "epoch": 0.2557529568961251, + "grad_norm": 0.25250948582083205, + "learning_rate": 1.966336997227167e-06, + "loss": 0.6027, + "step": 323 + }, + { + "epoch": 0.25654476171623697, + "grad_norm": 0.2644211046230142, + "learning_rate": 1.9661227792883222e-06, + "loss": 0.6136, + "step": 324 + }, + { + "epoch": 0.2573365665363488, + "grad_norm": 0.2525750708474708, + "learning_rate": 1.9659078936542705e-06, + "loss": 0.6057, + "step": 325 + }, + { + "epoch": 0.2581283713564606, + "grad_norm": 0.24039651813252683, + "learning_rate": 1.9656923404735213e-06, + "loss": 0.5997, + "step": 326 + }, + { + "epoch": 0.2589201761765725, + "grad_norm": 0.2556325514662131, + "learning_rate": 1.965476119895045e-06, + "loss": 0.6052, + "step": 327 + }, + { + "epoch": 0.2597119809966843, + "grad_norm": 0.2570466737402276, + "learning_rate": 1.965259232068273e-06, + "loss": 0.5967, + "step": 328 + }, + { + "epoch": 0.2605037858167962, + "grad_norm": 0.2548504469875246, + "learning_rate": 1.965041677143099e-06, + "loss": 0.5956, + "step": 329 + }, + { + "epoch": 0.261295590636908, + "grad_norm": 0.24551894747783568, + "learning_rate": 1.9648234552698764e-06, + "loss": 0.5855, + "step": 330 + }, + { + "epoch": 0.26208739545701987, + "grad_norm": 0.2511820658089693, + "learning_rate": 1.9646045665994203e-06, + "loss": 0.6027, + "step": 331 + }, + { + "epoch": 0.2628792002771317, + "grad_norm": 0.25625179410182997, + "learning_rate": 1.964385011283006e-06, + "loss": 0.5822, + "step": 332 + }, + { + "epoch": 0.2636710050972435, + "grad_norm": 0.2492724738434624, + "learning_rate": 1.9641647894723706e-06, + "loss": 0.5962, + "step": 333 + }, + { + "epoch": 0.2644628099173554, + "grad_norm": 0.24639712006197856, + "learning_rate": 1.9639439013197107e-06, + "loss": 0.5964, + "step": 334 + }, + { + "epoch": 0.2652546147374672, + "grad_norm": 0.24284570346349335, + "learning_rate": 1.963722346977684e-06, + "loss": 0.6077, + "step": 335 + }, + { + "epoch": 0.2660464195575791, + "grad_norm": 0.23927956699044922, + "learning_rate": 1.9635001265994085e-06, + "loss": 0.6148, + "step": 336 + }, + { + "epoch": 0.2668382243776909, + "grad_norm": 0.2390886419500776, + "learning_rate": 1.9632772403384624e-06, + "loss": 0.6072, + "step": 337 + }, + { + "epoch": 0.26763002919780277, + "grad_norm": 0.24263488626702964, + "learning_rate": 1.9630536883488844e-06, + "loss": 0.604, + "step": 338 + }, + { + "epoch": 0.2684218340179146, + "grad_norm": 0.24884755916427828, + "learning_rate": 1.9628294707851733e-06, + "loss": 0.5971, + "step": 339 + }, + { + "epoch": 0.2692136388380264, + "grad_norm": 0.2566830533858907, + "learning_rate": 1.9626045878022866e-06, + "loss": 0.6017, + "step": 340 + }, + { + "epoch": 0.2700054436581383, + "grad_norm": 0.2410117164423693, + "learning_rate": 1.962379039555644e-06, + "loss": 0.6015, + "step": 341 + }, + { + "epoch": 0.2707972484782501, + "grad_norm": 0.24423703801214214, + "learning_rate": 1.9621528262011227e-06, + "loss": 0.6044, + "step": 342 + }, + { + "epoch": 0.27158905329836197, + "grad_norm": 0.2515836234811855, + "learning_rate": 1.9619259478950614e-06, + "loss": 0.5946, + "step": 343 + }, + { + "epoch": 0.2723808581184738, + "grad_norm": 0.2597038071689357, + "learning_rate": 1.961698404794257e-06, + "loss": 0.5985, + "step": 344 + }, + { + "epoch": 0.27317266293858566, + "grad_norm": 0.24035427328118955, + "learning_rate": 1.9614701970559664e-06, + "loss": 0.5903, + "step": 345 + }, + { + "epoch": 0.2739644677586975, + "grad_norm": 0.24663368446774298, + "learning_rate": 1.961241324837906e-06, + "loss": 0.6162, + "step": 346 + }, + { + "epoch": 0.2747562725788093, + "grad_norm": 0.26954452255428774, + "learning_rate": 1.9610117882982512e-06, + "loss": 0.5952, + "step": 347 + }, + { + "epoch": 0.2755480773989212, + "grad_norm": 0.2439930239105369, + "learning_rate": 1.960781587595636e-06, + "loss": 0.5944, + "step": 348 + }, + { + "epoch": 0.276339882219033, + "grad_norm": 0.24771750596417796, + "learning_rate": 1.9605507228891547e-06, + "loss": 0.5801, + "step": 349 + }, + { + "epoch": 0.27713168703914487, + "grad_norm": 0.26550547373108035, + "learning_rate": 1.9603191943383597e-06, + "loss": 0.5961, + "step": 350 + }, + { + "epoch": 0.2779234918592567, + "grad_norm": 0.24431239688491097, + "learning_rate": 1.9600870021032615e-06, + "loss": 0.5907, + "step": 351 + }, + { + "epoch": 0.27871529667936856, + "grad_norm": 0.2535682794416749, + "learning_rate": 1.9598541463443307e-06, + "loss": 0.603, + "step": 352 + }, + { + "epoch": 0.2795071014994804, + "grad_norm": 0.2448211841047002, + "learning_rate": 1.959620627222496e-06, + "loss": 0.5928, + "step": 353 + }, + { + "epoch": 0.2802989063195922, + "grad_norm": 0.25103757340926774, + "learning_rate": 1.9593864448991435e-06, + "loss": 0.6032, + "step": 354 + }, + { + "epoch": 0.2810907111397041, + "grad_norm": 0.23940689444983226, + "learning_rate": 1.959151599536119e-06, + "loss": 0.5938, + "step": 355 + }, + { + "epoch": 0.2818825159598159, + "grad_norm": 0.2607433431505887, + "learning_rate": 1.958916091295726e-06, + "loss": 0.6062, + "step": 356 + }, + { + "epoch": 0.28267432077992777, + "grad_norm": 0.25475404406877755, + "learning_rate": 1.958679920340726e-06, + "loss": 0.5947, + "step": 357 + }, + { + "epoch": 0.2834661256000396, + "grad_norm": 0.24802973037438725, + "learning_rate": 1.958443086834339e-06, + "loss": 0.5991, + "step": 358 + }, + { + "epoch": 0.2842579304201514, + "grad_norm": 0.2431769776754948, + "learning_rate": 1.958205590940242e-06, + "loss": 0.6063, + "step": 359 + }, + { + "epoch": 0.2850497352402633, + "grad_norm": 0.2639127813318562, + "learning_rate": 1.957967432822571e-06, + "loss": 0.6086, + "step": 360 + }, + { + "epoch": 0.2858415400603751, + "grad_norm": 0.26584545205362775, + "learning_rate": 1.9577286126459176e-06, + "loss": 0.6097, + "step": 361 + }, + { + "epoch": 0.286633344880487, + "grad_norm": 0.24055194183303985, + "learning_rate": 1.957489130575334e-06, + "loss": 0.5833, + "step": 362 + }, + { + "epoch": 0.2874251497005988, + "grad_norm": 0.25429600055401314, + "learning_rate": 1.9572489867763274e-06, + "loss": 0.6006, + "step": 363 + }, + { + "epoch": 0.28821695452071067, + "grad_norm": 0.2537061934212363, + "learning_rate": 1.957008181414863e-06, + "loss": 0.6085, + "step": 364 + }, + { + "epoch": 0.2890087593408225, + "grad_norm": 0.23596507193683017, + "learning_rate": 1.956766714657364e-06, + "loss": 0.5909, + "step": 365 + }, + { + "epoch": 0.2898005641609343, + "grad_norm": 0.25278423998075816, + "learning_rate": 1.956524586670709e-06, + "loss": 0.5941, + "step": 366 + }, + { + "epoch": 0.2905923689810462, + "grad_norm": 0.2594776908735593, + "learning_rate": 1.9562817976222356e-06, + "loss": 0.5861, + "step": 367 + }, + { + "epoch": 0.291384173801158, + "grad_norm": 0.23709390308086759, + "learning_rate": 1.956038347679736e-06, + "loss": 0.5982, + "step": 368 + }, + { + "epoch": 0.29217597862126987, + "grad_norm": 0.24788413500193693, + "learning_rate": 1.9557942370114618e-06, + "loss": 0.6196, + "step": 369 + }, + { + "epoch": 0.2929677834413817, + "grad_norm": 0.2456033048336526, + "learning_rate": 1.955549465786119e-06, + "loss": 0.5918, + "step": 370 + }, + { + "epoch": 0.29375958826149356, + "grad_norm": 0.24947676521440168, + "learning_rate": 1.9553040341728716e-06, + "loss": 0.5946, + "step": 371 + }, + { + "epoch": 0.2945513930816054, + "grad_norm": 0.2698339661266052, + "learning_rate": 1.9550579423413384e-06, + "loss": 0.5761, + "step": 372 + }, + { + "epoch": 0.2953431979017172, + "grad_norm": 0.25230214410700375, + "learning_rate": 1.954811190461596e-06, + "loss": 0.5821, + "step": 373 + }, + { + "epoch": 0.2961350027218291, + "grad_norm": 0.3105801685091088, + "learning_rate": 1.954563778704177e-06, + "loss": 0.6046, + "step": 374 + }, + { + "epoch": 0.2969268075419409, + "grad_norm": 0.23384241659400207, + "learning_rate": 1.954315707240069e-06, + "loss": 0.5907, + "step": 375 + }, + { + "epoch": 0.29771861236205277, + "grad_norm": 0.24705464372439737, + "learning_rate": 1.9540669762407163e-06, + "loss": 0.5871, + "step": 376 + }, + { + "epoch": 0.2985104171821646, + "grad_norm": 0.2595162897846651, + "learning_rate": 1.953817585878019e-06, + "loss": 0.5971, + "step": 377 + }, + { + "epoch": 0.29930222200227646, + "grad_norm": 0.26096526999231834, + "learning_rate": 1.9535675363243334e-06, + "loss": 0.594, + "step": 378 + }, + { + "epoch": 0.3000940268223883, + "grad_norm": 0.2579837657000217, + "learning_rate": 1.9533168277524695e-06, + "loss": 0.5993, + "step": 379 + }, + { + "epoch": 0.3008858316425001, + "grad_norm": 0.260944446862816, + "learning_rate": 1.953065460335695e-06, + "loss": 0.6004, + "step": 380 + }, + { + "epoch": 0.301677636462612, + "grad_norm": 0.2539201800274544, + "learning_rate": 1.9528134342477313e-06, + "loss": 0.6043, + "step": 381 + }, + { + "epoch": 0.3024694412827238, + "grad_norm": 0.2594416904882131, + "learning_rate": 1.952560749662756e-06, + "loss": 0.593, + "step": 382 + }, + { + "epoch": 0.30326124610283567, + "grad_norm": 0.24637556242607472, + "learning_rate": 1.9523074067554017e-06, + "loss": 0.6056, + "step": 383 + }, + { + "epoch": 0.3040530509229475, + "grad_norm": 0.2613736787318001, + "learning_rate": 1.9520534057007548e-06, + "loss": 0.5885, + "step": 384 + }, + { + "epoch": 0.30484485574305936, + "grad_norm": 0.2576006861385841, + "learning_rate": 1.9517987466743585e-06, + "loss": 0.6058, + "step": 385 + }, + { + "epoch": 0.3056366605631712, + "grad_norm": 0.2559209310303427, + "learning_rate": 1.9515434298522093e-06, + "loss": 0.5992, + "step": 386 + }, + { + "epoch": 0.306428465383283, + "grad_norm": 0.26632881562278343, + "learning_rate": 1.9512874554107587e-06, + "loss": 0.5859, + "step": 387 + }, + { + "epoch": 0.3072202702033949, + "grad_norm": 0.25448466650597745, + "learning_rate": 1.9510308235269122e-06, + "loss": 0.5952, + "step": 388 + }, + { + "epoch": 0.3080120750235067, + "grad_norm": 0.24686184135640277, + "learning_rate": 1.9507735343780308e-06, + "loss": 0.6017, + "step": 389 + }, + { + "epoch": 0.30880387984361857, + "grad_norm": 0.25400298532675814, + "learning_rate": 1.9505155881419293e-06, + "loss": 0.5845, + "step": 390 + }, + { + "epoch": 0.3095956846637304, + "grad_norm": 0.24829689603575703, + "learning_rate": 1.9502569849968756e-06, + "loss": 0.6061, + "step": 391 + }, + { + "epoch": 0.31038748948384226, + "grad_norm": 0.2423789155144904, + "learning_rate": 1.949997725121593e-06, + "loss": 0.5973, + "step": 392 + }, + { + "epoch": 0.3111792943039541, + "grad_norm": 0.2502562090896562, + "learning_rate": 1.949737808695258e-06, + "loss": 0.5886, + "step": 393 + }, + { + "epoch": 0.3119710991240659, + "grad_norm": 0.25580619317505604, + "learning_rate": 1.949477235897501e-06, + "loss": 0.6038, + "step": 394 + }, + { + "epoch": 0.3127629039441778, + "grad_norm": 0.25269772488002135, + "learning_rate": 1.9492160069084057e-06, + "loss": 0.5898, + "step": 395 + }, + { + "epoch": 0.3135547087642896, + "grad_norm": 0.2557818555265261, + "learning_rate": 1.948954121908509e-06, + "loss": 0.5885, + "step": 396 + }, + { + "epoch": 0.31434651358440147, + "grad_norm": 0.2450807783588401, + "learning_rate": 1.9486915810788032e-06, + "loss": 0.6002, + "step": 397 + }, + { + "epoch": 0.3151383184045133, + "grad_norm": 0.2510392036495738, + "learning_rate": 1.9484283846007313e-06, + "loss": 0.5976, + "step": 398 + }, + { + "epoch": 0.3159301232246251, + "grad_norm": 0.25561092633708016, + "learning_rate": 1.948164532656191e-06, + "loss": 0.5776, + "step": 399 + }, + { + "epoch": 0.316721928044737, + "grad_norm": 0.2596884998651969, + "learning_rate": 1.9479000254275314e-06, + "loss": 0.599, + "step": 400 + }, + { + "epoch": 0.3175137328648488, + "grad_norm": 0.26498978664991624, + "learning_rate": 1.947634863097557e-06, + "loss": 0.5967, + "step": 401 + }, + { + "epoch": 0.31830553768496067, + "grad_norm": 0.2501572200213723, + "learning_rate": 1.947369045849523e-06, + "loss": 0.5943, + "step": 402 + }, + { + "epoch": 0.3190973425050725, + "grad_norm": 0.2577630865177056, + "learning_rate": 1.947102573867138e-06, + "loss": 0.5887, + "step": 403 + }, + { + "epoch": 0.31988914732518436, + "grad_norm": 0.24174145169513117, + "learning_rate": 1.9468354473345624e-06, + "loss": 0.6044, + "step": 404 + }, + { + "epoch": 0.3206809521452962, + "grad_norm": 0.24591986565078588, + "learning_rate": 1.9465676664364104e-06, + "loss": 0.6086, + "step": 405 + }, + { + "epoch": 0.321472756965408, + "grad_norm": 0.26307380086163157, + "learning_rate": 1.9462992313577462e-06, + "loss": 0.6025, + "step": 406 + }, + { + "epoch": 0.3222645617855199, + "grad_norm": 0.24700348114115595, + "learning_rate": 1.946030142284089e-06, + "loss": 0.5896, + "step": 407 + }, + { + "epoch": 0.3230563666056317, + "grad_norm": 0.2558304440489774, + "learning_rate": 1.9457603994014073e-06, + "loss": 0.5995, + "step": 408 + }, + { + "epoch": 0.32384817142574357, + "grad_norm": 0.26976961195565224, + "learning_rate": 1.9454900028961234e-06, + "loss": 0.5806, + "step": 409 + }, + { + "epoch": 0.3246399762458554, + "grad_norm": 0.24682486178015295, + "learning_rate": 1.9452189529551094e-06, + "loss": 0.6029, + "step": 410 + }, + { + "epoch": 0.32543178106596726, + "grad_norm": 0.2511422633097611, + "learning_rate": 1.944947249765691e-06, + "loss": 0.5858, + "step": 411 + }, + { + "epoch": 0.3262235858860791, + "grad_norm": 0.26370240792835703, + "learning_rate": 1.944674893515644e-06, + "loss": 0.5996, + "step": 412 + }, + { + "epoch": 0.3270153907061909, + "grad_norm": 0.2523583273695204, + "learning_rate": 1.9444018843931965e-06, + "loss": 0.6015, + "step": 413 + }, + { + "epoch": 0.3278071955263028, + "grad_norm": 0.24129610898224038, + "learning_rate": 1.9441282225870267e-06, + "loss": 0.5917, + "step": 414 + }, + { + "epoch": 0.3285990003464146, + "grad_norm": 0.2616328657607042, + "learning_rate": 1.9438539082862648e-06, + "loss": 0.5934, + "step": 415 + }, + { + "epoch": 0.32939080516652647, + "grad_norm": 0.24557528223674166, + "learning_rate": 1.9435789416804915e-06, + "loss": 0.5987, + "step": 416 + }, + { + "epoch": 0.3301826099866383, + "grad_norm": 0.2514189246130001, + "learning_rate": 1.9433033229597384e-06, + "loss": 0.5961, + "step": 417 + }, + { + "epoch": 0.33097441480675016, + "grad_norm": 0.2466652126527317, + "learning_rate": 1.9430270523144882e-06, + "loss": 0.5804, + "step": 418 + }, + { + "epoch": 0.331766219626862, + "grad_norm": 0.24752095595178564, + "learning_rate": 1.942750129935673e-06, + "loss": 0.5935, + "step": 419 + }, + { + "epoch": 0.3325580244469738, + "grad_norm": 0.24996783596689934, + "learning_rate": 1.942472556014677e-06, + "loss": 0.5994, + "step": 420 + }, + { + "epoch": 0.3333498292670857, + "grad_norm": 0.25223965184297614, + "learning_rate": 1.942194330743333e-06, + "loss": 0.5898, + "step": 421 + }, + { + "epoch": 0.3341416340871975, + "grad_norm": 0.2534843988928022, + "learning_rate": 1.9419154543139247e-06, + "loss": 0.6157, + "step": 422 + }, + { + "epoch": 0.33493343890730937, + "grad_norm": 0.24706484156818997, + "learning_rate": 1.9416359269191864e-06, + "loss": 0.6089, + "step": 423 + }, + { + "epoch": 0.3357252437274212, + "grad_norm": 0.24757094331330678, + "learning_rate": 1.9413557487523014e-06, + "loss": 0.6049, + "step": 424 + }, + { + "epoch": 0.33651704854753306, + "grad_norm": 0.24142843028914535, + "learning_rate": 1.9410749200069033e-06, + "loss": 0.578, + "step": 425 + }, + { + "epoch": 0.3373088533676449, + "grad_norm": 0.2510069841485761, + "learning_rate": 1.940793440877075e-06, + "loss": 0.5879, + "step": 426 + }, + { + "epoch": 0.3381006581877567, + "grad_norm": 0.2609053814825094, + "learning_rate": 1.940511311557349e-06, + "loss": 0.5848, + "step": 427 + }, + { + "epoch": 0.33889246300786857, + "grad_norm": 0.2518737245867738, + "learning_rate": 1.940228532242707e-06, + "loss": 0.5962, + "step": 428 + }, + { + "epoch": 0.3396842678279804, + "grad_norm": 0.25221504281541435, + "learning_rate": 1.93994510312858e-06, + "loss": 0.6024, + "step": 429 + }, + { + "epoch": 0.34047607264809227, + "grad_norm": 0.24951406008176466, + "learning_rate": 1.9396610244108486e-06, + "loss": 0.5985, + "step": 430 + }, + { + "epoch": 0.3412678774682041, + "grad_norm": 0.2453685216063324, + "learning_rate": 1.939376296285841e-06, + "loss": 0.6025, + "step": 431 + }, + { + "epoch": 0.3420596822883159, + "grad_norm": 0.24921034068058728, + "learning_rate": 1.939090918950336e-06, + "loss": 0.5982, + "step": 432 + }, + { + "epoch": 0.3428514871084278, + "grad_norm": 0.24095478985143048, + "learning_rate": 1.9388048926015593e-06, + "loss": 0.6006, + "step": 433 + }, + { + "epoch": 0.3436432919285396, + "grad_norm": 0.23977530214075207, + "learning_rate": 1.9385182174371872e-06, + "loss": 0.5993, + "step": 434 + }, + { + "epoch": 0.34443509674865147, + "grad_norm": 0.241451042853671, + "learning_rate": 1.938230893655342e-06, + "loss": 0.5947, + "step": 435 + }, + { + "epoch": 0.3452269015687633, + "grad_norm": 0.2514189307194158, + "learning_rate": 1.937942921454596e-06, + "loss": 0.5853, + "step": 436 + }, + { + "epoch": 0.34601870638887516, + "grad_norm": 0.24048387845244765, + "learning_rate": 1.9376543010339692e-06, + "loss": 0.5988, + "step": 437 + }, + { + "epoch": 0.346810511208987, + "grad_norm": 0.2536295455265749, + "learning_rate": 1.93736503259293e-06, + "loss": 0.587, + "step": 438 + }, + { + "epoch": 0.3476023160290988, + "grad_norm": 0.261232912394264, + "learning_rate": 1.9370751163313927e-06, + "loss": 0.5965, + "step": 439 + }, + { + "epoch": 0.3483941208492107, + "grad_norm": 0.24959356062955415, + "learning_rate": 1.9367845524497217e-06, + "loss": 0.6025, + "step": 440 + }, + { + "epoch": 0.3491859256693225, + "grad_norm": 0.24606995041096932, + "learning_rate": 1.9364933411487283e-06, + "loss": 0.5878, + "step": 441 + }, + { + "epoch": 0.34997773048943437, + "grad_norm": 0.2525242803889403, + "learning_rate": 1.93620148262967e-06, + "loss": 0.6033, + "step": 442 + }, + { + "epoch": 0.3507695353095462, + "grad_norm": 0.24699322431888046, + "learning_rate": 1.9359089770942534e-06, + "loss": 0.6022, + "step": 443 + }, + { + "epoch": 0.35156134012965806, + "grad_norm": 0.25703417564848346, + "learning_rate": 1.935615824744631e-06, + "loss": 0.6028, + "step": 444 + }, + { + "epoch": 0.3523531449497699, + "grad_norm": 0.25967336842246613, + "learning_rate": 1.935322025783403e-06, + "loss": 0.6214, + "step": 445 + }, + { + "epoch": 0.3531449497698817, + "grad_norm": 0.2540070899335221, + "learning_rate": 1.9350275804136166e-06, + "loss": 0.5946, + "step": 446 + }, + { + "epoch": 0.3539367545899936, + "grad_norm": 0.24440179466401316, + "learning_rate": 1.934732488838764e-06, + "loss": 0.5949, + "step": 447 + }, + { + "epoch": 0.3547285594101054, + "grad_norm": 0.23931278271932108, + "learning_rate": 1.934436751262787e-06, + "loss": 0.5845, + "step": 448 + }, + { + "epoch": 0.35552036423021727, + "grad_norm": 0.24706305509590024, + "learning_rate": 1.934140367890071e-06, + "loss": 0.5828, + "step": 449 + }, + { + "epoch": 0.3563121690503291, + "grad_norm": 0.24703464436340708, + "learning_rate": 1.9338433389254495e-06, + "loss": 0.5947, + "step": 450 + }, + { + "epoch": 0.35710397387044096, + "grad_norm": 0.26567239272076876, + "learning_rate": 1.9335456645742014e-06, + "loss": 0.5995, + "step": 451 + }, + { + "epoch": 0.3578957786905528, + "grad_norm": 0.2623514720491113, + "learning_rate": 1.9332473450420523e-06, + "loss": 0.5831, + "step": 452 + }, + { + "epoch": 0.3586875835106646, + "grad_norm": 0.2518150603558824, + "learning_rate": 1.9329483805351723e-06, + "loss": 0.5921, + "step": 453 + }, + { + "epoch": 0.3594793883307765, + "grad_norm": 0.24844155181430966, + "learning_rate": 1.932648771260179e-06, + "loss": 0.5992, + "step": 454 + }, + { + "epoch": 0.3602711931508883, + "grad_norm": 0.24764470623088883, + "learning_rate": 1.932348517424134e-06, + "loss": 0.6017, + "step": 455 + }, + { + "epoch": 0.36106299797100017, + "grad_norm": 0.2444623913737678, + "learning_rate": 1.932047619234546e-06, + "loss": 0.6114, + "step": 456 + }, + { + "epoch": 0.361854802791112, + "grad_norm": 0.24878794738315252, + "learning_rate": 1.9317460768993676e-06, + "loss": 0.5979, + "step": 457 + }, + { + "epoch": 0.36264660761122386, + "grad_norm": 0.25626001974283374, + "learning_rate": 1.9314438906269975e-06, + "loss": 0.607, + "step": 458 + }, + { + "epoch": 0.3634384124313357, + "grad_norm": 0.24385129423397395, + "learning_rate": 1.931141060626279e-06, + "loss": 0.5852, + "step": 459 + }, + { + "epoch": 0.3642302172514475, + "grad_norm": 0.2561568562068568, + "learning_rate": 1.9308375871065003e-06, + "loss": 0.6122, + "step": 460 + }, + { + "epoch": 0.36502202207155937, + "grad_norm": 0.2511846260355575, + "learning_rate": 1.9305334702773936e-06, + "loss": 0.5911, + "step": 461 + }, + { + "epoch": 0.3658138268916712, + "grad_norm": 0.23904173222463654, + "learning_rate": 1.9302287103491376e-06, + "loss": 0.58, + "step": 462 + }, + { + "epoch": 0.36660563171178306, + "grad_norm": 0.24310034339402387, + "learning_rate": 1.9299233075323537e-06, + "loss": 0.574, + "step": 463 + }, + { + "epoch": 0.3673974365318949, + "grad_norm": 0.23495726837588596, + "learning_rate": 1.9296172620381087e-06, + "loss": 0.5905, + "step": 464 + }, + { + "epoch": 0.36818924135200676, + "grad_norm": 0.2546785897112177, + "learning_rate": 1.9293105740779126e-06, + "loss": 0.6015, + "step": 465 + }, + { + "epoch": 0.3689810461721186, + "grad_norm": 0.252056722545849, + "learning_rate": 1.92900324386372e-06, + "loss": 0.5817, + "step": 466 + }, + { + "epoch": 0.3697728509922304, + "grad_norm": 0.24993882029205636, + "learning_rate": 1.9286952716079297e-06, + "loss": 0.5917, + "step": 467 + }, + { + "epoch": 0.37056465581234227, + "grad_norm": 0.2563838810351198, + "learning_rate": 1.928386657523383e-06, + "loss": 0.6038, + "step": 468 + }, + { + "epoch": 0.3713564606324541, + "grad_norm": 0.244787640148442, + "learning_rate": 1.928077401823367e-06, + "loss": 0.5781, + "step": 469 + }, + { + "epoch": 0.37214826545256596, + "grad_norm": 0.25997369540927734, + "learning_rate": 1.9277675047216092e-06, + "loss": 0.602, + "step": 470 + }, + { + "epoch": 0.3729400702726778, + "grad_norm": 0.2577866099198746, + "learning_rate": 1.927456966432283e-06, + "loss": 0.5906, + "step": 471 + }, + { + "epoch": 0.3737318750927896, + "grad_norm": 0.25343861301131776, + "learning_rate": 1.9271457871700032e-06, + "loss": 0.5928, + "step": 472 + }, + { + "epoch": 0.3745236799129015, + "grad_norm": 0.24326992190799326, + "learning_rate": 1.9268339671498287e-06, + "loss": 0.5983, + "step": 473 + }, + { + "epoch": 0.3753154847330133, + "grad_norm": 0.2565132395383672, + "learning_rate": 1.926521506587261e-06, + "loss": 0.6021, + "step": 474 + }, + { + "epoch": 0.37610728955312517, + "grad_norm": 0.2592413096288339, + "learning_rate": 1.9262084056982437e-06, + "loss": 0.5897, + "step": 475 + }, + { + "epoch": 0.376899094373237, + "grad_norm": 0.25826628366629745, + "learning_rate": 1.9258946646991636e-06, + "loss": 0.596, + "step": 476 + }, + { + "epoch": 0.37769089919334886, + "grad_norm": 0.2479127952507506, + "learning_rate": 1.9255802838068497e-06, + "loss": 0.6046, + "step": 477 + }, + { + "epoch": 0.3784827040134607, + "grad_norm": 0.24028609433471643, + "learning_rate": 1.9252652632385727e-06, + "loss": 0.6079, + "step": 478 + }, + { + "epoch": 0.3792745088335725, + "grad_norm": 0.24444875484814269, + "learning_rate": 1.9249496032120463e-06, + "loss": 0.584, + "step": 479 + }, + { + "epoch": 0.3800663136536844, + "grad_norm": 0.2435843573159472, + "learning_rate": 1.924633303945425e-06, + "loss": 0.5912, + "step": 480 + }, + { + "epoch": 0.3808581184737962, + "grad_norm": 0.2580854272202583, + "learning_rate": 1.9243163656573072e-06, + "loss": 0.6155, + "step": 481 + }, + { + "epoch": 0.38164992329390807, + "grad_norm": 0.24377352220871984, + "learning_rate": 1.92399878856673e-06, + "loss": 0.5815, + "step": 482 + }, + { + "epoch": 0.3824417281140199, + "grad_norm": 0.2611149627044241, + "learning_rate": 1.923680572893174e-06, + "loss": 0.5874, + "step": 483 + }, + { + "epoch": 0.38323353293413176, + "grad_norm": 0.26532404200982734, + "learning_rate": 1.9233617188565607e-06, + "loss": 0.5857, + "step": 484 + }, + { + "epoch": 0.3840253377542436, + "grad_norm": 0.26893467080462036, + "learning_rate": 1.9230422266772525e-06, + "loss": 0.5952, + "step": 485 + }, + { + "epoch": 0.3848171425743554, + "grad_norm": 0.2571456802966826, + "learning_rate": 1.9227220965760535e-06, + "loss": 0.6135, + "step": 486 + }, + { + "epoch": 0.38560894739446727, + "grad_norm": 0.25166009156873304, + "learning_rate": 1.9224013287742078e-06, + "loss": 0.6047, + "step": 487 + }, + { + "epoch": 0.3864007522145791, + "grad_norm": 0.2702394849108666, + "learning_rate": 1.9220799234934e-06, + "loss": 0.5908, + "step": 488 + }, + { + "epoch": 0.38719255703469097, + "grad_norm": 0.25136505111479945, + "learning_rate": 1.921757880955757e-06, + "loss": 0.6039, + "step": 489 + }, + { + "epoch": 0.3879843618548028, + "grad_norm": 0.2534880486433305, + "learning_rate": 1.9214352013838445e-06, + "loss": 0.5984, + "step": 490 + }, + { + "epoch": 0.38877616667491466, + "grad_norm": 0.25108927918306556, + "learning_rate": 1.9211118850006687e-06, + "loss": 0.5988, + "step": 491 + }, + { + "epoch": 0.3895679714950265, + "grad_norm": 0.25633840231196486, + "learning_rate": 1.9207879320296764e-06, + "loss": 0.5882, + "step": 492 + }, + { + "epoch": 0.3903597763151383, + "grad_norm": 0.25275777210177885, + "learning_rate": 1.920463342694754e-06, + "loss": 0.5824, + "step": 493 + }, + { + "epoch": 0.39115158113525017, + "grad_norm": 0.2577018302858132, + "learning_rate": 1.9201381172202283e-06, + "loss": 0.6049, + "step": 494 + }, + { + "epoch": 0.391943385955362, + "grad_norm": 0.25717885246394784, + "learning_rate": 1.9198122558308643e-06, + "loss": 0.5977, + "step": 495 + }, + { + "epoch": 0.39273519077547386, + "grad_norm": 0.24231408485397066, + "learning_rate": 1.919485758751868e-06, + "loss": 0.5867, + "step": 496 + }, + { + "epoch": 0.3935269955955857, + "grad_norm": 0.2503292332824136, + "learning_rate": 1.919158626208884e-06, + "loss": 0.603, + "step": 497 + }, + { + "epoch": 0.39431880041569756, + "grad_norm": 0.2583634281080707, + "learning_rate": 1.9188308584279965e-06, + "loss": 0.6029, + "step": 498 + }, + { + "epoch": 0.3951106052358094, + "grad_norm": 0.25626058933065626, + "learning_rate": 1.9185024556357283e-06, + "loss": 0.5797, + "step": 499 + }, + { + "epoch": 0.3959024100559212, + "grad_norm": 0.2552322552629375, + "learning_rate": 1.9181734180590408e-06, + "loss": 0.5891, + "step": 500 + }, + { + "epoch": 0.39669421487603307, + "grad_norm": 0.2571482255500502, + "learning_rate": 1.917843745925335e-06, + "loss": 0.5907, + "step": 501 + }, + { + "epoch": 0.3974860196961449, + "grad_norm": 0.2558347636656439, + "learning_rate": 1.91751343946245e-06, + "loss": 0.606, + "step": 502 + }, + { + "epoch": 0.39827782451625676, + "grad_norm": 0.24970008982422848, + "learning_rate": 1.917182498898663e-06, + "loss": 0.5805, + "step": 503 + }, + { + "epoch": 0.3990696293363686, + "grad_norm": 0.24874262897937013, + "learning_rate": 1.9168509244626897e-06, + "loss": 0.6, + "step": 504 + }, + { + "epoch": 0.39986143415648046, + "grad_norm": 0.24498750008115716, + "learning_rate": 1.916518716383684e-06, + "loss": 0.5853, + "step": 505 + }, + { + "epoch": 0.4006532389765923, + "grad_norm": 0.26467126876440167, + "learning_rate": 1.916185874891237e-06, + "loss": 0.6043, + "step": 506 + }, + { + "epoch": 0.4014450437967041, + "grad_norm": 0.25168731029496005, + "learning_rate": 1.9158524002153787e-06, + "loss": 0.5976, + "step": 507 + }, + { + "epoch": 0.40223684861681597, + "grad_norm": 0.24247677785798802, + "learning_rate": 1.9155182925865762e-06, + "loss": 0.5986, + "step": 508 + }, + { + "epoch": 0.4030286534369278, + "grad_norm": 0.24947876029329646, + "learning_rate": 1.915183552235734e-06, + "loss": 0.5836, + "step": 509 + }, + { + "epoch": 0.40382045825703966, + "grad_norm": 0.24712297571012012, + "learning_rate": 1.9148481793941933e-06, + "loss": 0.5876, + "step": 510 + }, + { + "epoch": 0.4046122630771515, + "grad_norm": 0.25468129328604866, + "learning_rate": 1.914512174293733e-06, + "loss": 0.5981, + "step": 511 + }, + { + "epoch": 0.4054040678972633, + "grad_norm": 0.24966577324530015, + "learning_rate": 1.9141755371665698e-06, + "loss": 0.5908, + "step": 512 + }, + { + "epoch": 0.4061958727173752, + "grad_norm": 0.23920241476206486, + "learning_rate": 1.9138382682453555e-06, + "loss": 0.6034, + "step": 513 + }, + { + "epoch": 0.406987677537487, + "grad_norm": 0.24572990955319945, + "learning_rate": 1.9135003677631794e-06, + "loss": 0.6045, + "step": 514 + }, + { + "epoch": 0.40777948235759887, + "grad_norm": 0.25333970267239314, + "learning_rate": 1.9131618359535676e-06, + "loss": 0.5994, + "step": 515 + }, + { + "epoch": 0.4085712871777107, + "grad_norm": 0.2452416194903933, + "learning_rate": 1.9128226730504816e-06, + "loss": 0.6049, + "step": 516 + }, + { + "epoch": 0.40936309199782256, + "grad_norm": 0.2554514592884786, + "learning_rate": 1.9124828792883204e-06, + "loss": 0.6018, + "step": 517 + }, + { + "epoch": 0.4101548968179344, + "grad_norm": 0.25668467435797254, + "learning_rate": 1.9121424549019176e-06, + "loss": 0.5767, + "step": 518 + }, + { + "epoch": 0.4109467016380462, + "grad_norm": 0.2440800558700137, + "learning_rate": 1.911801400126544e-06, + "loss": 0.5874, + "step": 519 + }, + { + "epoch": 0.41173850645815807, + "grad_norm": 0.24443693562509336, + "learning_rate": 1.9114597151979037e-06, + "loss": 0.6054, + "step": 520 + }, + { + "epoch": 0.4125303112782699, + "grad_norm": 0.2533333192166624, + "learning_rate": 1.9111174003521394e-06, + "loss": 0.6006, + "step": 521 + }, + { + "epoch": 0.41332211609838176, + "grad_norm": 0.24246229964500904, + "learning_rate": 1.910774455825827e-06, + "loss": 0.6008, + "step": 522 + }, + { + "epoch": 0.4141139209184936, + "grad_norm": 0.23914994890194086, + "learning_rate": 1.910430881855978e-06, + "loss": 0.5796, + "step": 523 + }, + { + "epoch": 0.41490572573860546, + "grad_norm": 0.2521848372112751, + "learning_rate": 1.9100866786800403e-06, + "loss": 0.5807, + "step": 524 + }, + { + "epoch": 0.4156975305587173, + "grad_norm": 0.2490663970974661, + "learning_rate": 1.909741846535894e-06, + "loss": 0.586, + "step": 525 + }, + { + "epoch": 0.4164893353788291, + "grad_norm": 0.2510211645568367, + "learning_rate": 1.909396385661856e-06, + "loss": 0.5848, + "step": 526 + }, + { + "epoch": 0.41728114019894097, + "grad_norm": 0.2520941601934756, + "learning_rate": 1.9090502962966773e-06, + "loss": 0.5878, + "step": 527 + }, + { + "epoch": 0.4180729450190528, + "grad_norm": 0.2549101828097431, + "learning_rate": 1.908703578679543e-06, + "loss": 0.5949, + "step": 528 + }, + { + "epoch": 0.41886474983916466, + "grad_norm": 0.24544990353717744, + "learning_rate": 1.908356233050072e-06, + "loss": 0.6007, + "step": 529 + }, + { + "epoch": 0.4196565546592765, + "grad_norm": 0.2445800594456821, + "learning_rate": 1.908008259648318e-06, + "loss": 0.5903, + "step": 530 + }, + { + "epoch": 0.42044835947938836, + "grad_norm": 0.25257051310738954, + "learning_rate": 1.907659658714768e-06, + "loss": 0.5976, + "step": 531 + }, + { + "epoch": 0.4212401642995002, + "grad_norm": 0.25430432707918715, + "learning_rate": 1.9073104304903426e-06, + "loss": 0.5932, + "step": 532 + }, + { + "epoch": 0.422031969119612, + "grad_norm": 0.2676141397255204, + "learning_rate": 1.9069605752163974e-06, + "loss": 0.5919, + "step": 533 + }, + { + "epoch": 0.42282377393972387, + "grad_norm": 0.24913566877200974, + "learning_rate": 1.906610093134719e-06, + "loss": 0.5922, + "step": 534 + }, + { + "epoch": 0.4236155787598357, + "grad_norm": 0.25142050866297777, + "learning_rate": 1.906258984487529e-06, + "loss": 0.5981, + "step": 535 + }, + { + "epoch": 0.42440738357994756, + "grad_norm": 0.2626434372822137, + "learning_rate": 1.9059072495174808e-06, + "loss": 0.6167, + "step": 536 + }, + { + "epoch": 0.4251991884000594, + "grad_norm": 0.26204574666443564, + "learning_rate": 1.9055548884676622e-06, + "loss": 0.5898, + "step": 537 + }, + { + "epoch": 0.42599099322017125, + "grad_norm": 0.24349345230704145, + "learning_rate": 1.905201901581592e-06, + "loss": 0.5989, + "step": 538 + }, + { + "epoch": 0.4267827980402831, + "grad_norm": 0.25895765874169857, + "learning_rate": 1.9048482891032225e-06, + "loss": 0.5865, + "step": 539 + }, + { + "epoch": 0.4275746028603949, + "grad_norm": 0.2543447511075663, + "learning_rate": 1.9044940512769383e-06, + "loss": 0.6044, + "step": 540 + }, + { + "epoch": 0.42836640768050677, + "grad_norm": 0.25660293201048157, + "learning_rate": 1.9041391883475556e-06, + "loss": 0.5971, + "step": 541 + }, + { + "epoch": 0.4291582125006186, + "grad_norm": 0.25695030395594404, + "learning_rate": 1.9037837005603231e-06, + "loss": 0.5771, + "step": 542 + }, + { + "epoch": 0.42995001732073046, + "grad_norm": 0.25993304117454247, + "learning_rate": 1.9034275881609218e-06, + "loss": 0.6093, + "step": 543 + }, + { + "epoch": 0.4307418221408423, + "grad_norm": 0.24704428549680987, + "learning_rate": 1.9030708513954635e-06, + "loss": 0.6085, + "step": 544 + }, + { + "epoch": 0.4315336269609541, + "grad_norm": 0.2542518306972636, + "learning_rate": 1.9027134905104911e-06, + "loss": 0.6047, + "step": 545 + }, + { + "epoch": 0.43232543178106597, + "grad_norm": 0.25406990466760127, + "learning_rate": 1.9023555057529808e-06, + "loss": 0.6003, + "step": 546 + }, + { + "epoch": 0.4331172366011778, + "grad_norm": 0.25453026870759954, + "learning_rate": 1.9019968973703383e-06, + "loss": 0.5934, + "step": 547 + }, + { + "epoch": 0.43390904142128967, + "grad_norm": 0.2685469556823121, + "learning_rate": 1.9016376656104002e-06, + "loss": 0.5842, + "step": 548 + }, + { + "epoch": 0.4347008462414015, + "grad_norm": 0.2524811149335431, + "learning_rate": 1.9012778107214352e-06, + "loss": 0.5886, + "step": 549 + }, + { + "epoch": 0.43549265106151336, + "grad_norm": 0.2622934631117297, + "learning_rate": 1.9009173329521415e-06, + "loss": 0.606, + "step": 550 + }, + { + "epoch": 0.4362844558816252, + "grad_norm": 0.2379389352813738, + "learning_rate": 1.9005562325516481e-06, + "loss": 0.5915, + "step": 551 + }, + { + "epoch": 0.437076260701737, + "grad_norm": 0.25381718346731963, + "learning_rate": 1.900194509769515e-06, + "loss": 0.5928, + "step": 552 + }, + { + "epoch": 0.43786806552184887, + "grad_norm": 0.24768690289303336, + "learning_rate": 1.8998321648557307e-06, + "loss": 0.5966, + "step": 553 + }, + { + "epoch": 0.4386598703419607, + "grad_norm": 0.24692537647588123, + "learning_rate": 1.8994691980607156e-06, + "loss": 0.57, + "step": 554 + }, + { + "epoch": 0.43945167516207256, + "grad_norm": 0.2425032291152309, + "learning_rate": 1.8991056096353186e-06, + "loss": 0.5738, + "step": 555 + }, + { + "epoch": 0.4402434799821844, + "grad_norm": 0.24564209012352464, + "learning_rate": 1.8987413998308185e-06, + "loss": 0.596, + "step": 556 + }, + { + "epoch": 0.44103528480229626, + "grad_norm": 0.24818474606241464, + "learning_rate": 1.8983765688989235e-06, + "loss": 0.5855, + "step": 557 + }, + { + "epoch": 0.4418270896224081, + "grad_norm": 0.2544777868752887, + "learning_rate": 1.8980111170917716e-06, + "loss": 0.5802, + "step": 558 + }, + { + "epoch": 0.4426188944425199, + "grad_norm": 0.2518961660630504, + "learning_rate": 1.8976450446619293e-06, + "loss": 0.5973, + "step": 559 + }, + { + "epoch": 0.44341069926263177, + "grad_norm": 0.24486041155981994, + "learning_rate": 1.8972783518623921e-06, + "loss": 0.6097, + "step": 560 + }, + { + "epoch": 0.4442025040827436, + "grad_norm": 0.2477100340771778, + "learning_rate": 1.8969110389465845e-06, + "loss": 0.5931, + "step": 561 + }, + { + "epoch": 0.44499430890285546, + "grad_norm": 0.24337367141849547, + "learning_rate": 1.8965431061683593e-06, + "loss": 0.5801, + "step": 562 + }, + { + "epoch": 0.4457861137229673, + "grad_norm": 0.24453218088999523, + "learning_rate": 1.8961745537819978e-06, + "loss": 0.5827, + "step": 563 + }, + { + "epoch": 0.44657791854307916, + "grad_norm": 0.24861572385826658, + "learning_rate": 1.8958053820422093e-06, + "loss": 0.5917, + "step": 564 + }, + { + "epoch": 0.447369723363191, + "grad_norm": 0.24765696666423848, + "learning_rate": 1.8954355912041317e-06, + "loss": 0.5738, + "step": 565 + }, + { + "epoch": 0.4481615281833028, + "grad_norm": 0.2465393453299667, + "learning_rate": 1.8950651815233303e-06, + "loss": 0.5938, + "step": 566 + }, + { + "epoch": 0.44895333300341467, + "grad_norm": 0.2491034515317017, + "learning_rate": 1.8946941532557981e-06, + "loss": 0.5891, + "step": 567 + }, + { + "epoch": 0.4497451378235265, + "grad_norm": 0.24458817821467171, + "learning_rate": 1.894322506657956e-06, + "loss": 0.6008, + "step": 568 + }, + { + "epoch": 0.45053694264363836, + "grad_norm": 0.24138603479841458, + "learning_rate": 1.8939502419866517e-06, + "loss": 0.5957, + "step": 569 + }, + { + "epoch": 0.4513287474637502, + "grad_norm": 0.2632618023896708, + "learning_rate": 1.8935773594991602e-06, + "loss": 0.5903, + "step": 570 + }, + { + "epoch": 0.45212055228386205, + "grad_norm": 0.2624748591331253, + "learning_rate": 1.8932038594531839e-06, + "loss": 0.5968, + "step": 571 + }, + { + "epoch": 0.4529123571039739, + "grad_norm": 0.26217399555794163, + "learning_rate": 1.8928297421068513e-06, + "loss": 0.5795, + "step": 572 + }, + { + "epoch": 0.4537041619240857, + "grad_norm": 0.25490627556170353, + "learning_rate": 1.8924550077187183e-06, + "loss": 0.5939, + "step": 573 + }, + { + "epoch": 0.45449596674419757, + "grad_norm": 0.2507838695486079, + "learning_rate": 1.8920796565477666e-06, + "loss": 0.6096, + "step": 574 + }, + { + "epoch": 0.4552877715643094, + "grad_norm": 0.24142344523476766, + "learning_rate": 1.8917036888534042e-06, + "loss": 0.5996, + "step": 575 + }, + { + "epoch": 0.45607957638442126, + "grad_norm": 0.25846384854319515, + "learning_rate": 1.891327104895466e-06, + "loss": 0.5852, + "step": 576 + }, + { + "epoch": 0.4568713812045331, + "grad_norm": 0.24743738676208468, + "learning_rate": 1.8909499049342116e-06, + "loss": 0.5844, + "step": 577 + }, + { + "epoch": 0.45766318602464495, + "grad_norm": 0.2513197485371807, + "learning_rate": 1.8905720892303271e-06, + "loss": 0.5948, + "step": 578 + }, + { + "epoch": 0.45845499084475677, + "grad_norm": 0.24926525473931793, + "learning_rate": 1.8901936580449245e-06, + "loss": 0.5819, + "step": 579 + }, + { + "epoch": 0.4592467956648686, + "grad_norm": 0.2552388048276722, + "learning_rate": 1.8898146116395398e-06, + "loss": 0.5974, + "step": 580 + }, + { + "epoch": 0.46003860048498046, + "grad_norm": 0.25263820247307917, + "learning_rate": 1.8894349502761355e-06, + "loss": 0.5762, + "step": 581 + }, + { + "epoch": 0.4608304053050923, + "grad_norm": 0.2553733533701803, + "learning_rate": 1.8890546742170983e-06, + "loss": 0.6036, + "step": 582 + }, + { + "epoch": 0.46162221012520416, + "grad_norm": 0.2547596600415412, + "learning_rate": 1.8886737837252405e-06, + "loss": 0.5796, + "step": 583 + }, + { + "epoch": 0.462414014945316, + "grad_norm": 0.2513121780285386, + "learning_rate": 1.8882922790637981e-06, + "loss": 0.6148, + "step": 584 + }, + { + "epoch": 0.4632058197654278, + "grad_norm": 0.26739457310607195, + "learning_rate": 1.8879101604964322e-06, + "loss": 0.5939, + "step": 585 + }, + { + "epoch": 0.46399762458553967, + "grad_norm": 0.25951718112872185, + "learning_rate": 1.8875274282872281e-06, + "loss": 0.6, + "step": 586 + }, + { + "epoch": 0.4647894294056515, + "grad_norm": 0.25631245577556405, + "learning_rate": 1.887144082700695e-06, + "loss": 0.5859, + "step": 587 + }, + { + "epoch": 0.46558123422576336, + "grad_norm": 0.2564914517999274, + "learning_rate": 1.8867601240017663e-06, + "loss": 0.5853, + "step": 588 + }, + { + "epoch": 0.4663730390458752, + "grad_norm": 0.24819504652120772, + "learning_rate": 1.8863755524557987e-06, + "loss": 0.5941, + "step": 589 + }, + { + "epoch": 0.46716484386598706, + "grad_norm": 0.2511821237330362, + "learning_rate": 1.885990368328573e-06, + "loss": 0.6003, + "step": 590 + }, + { + "epoch": 0.4679566486860989, + "grad_norm": 0.2470050771408181, + "learning_rate": 1.8856045718862922e-06, + "loss": 0.6006, + "step": 591 + }, + { + "epoch": 0.4687484535062107, + "grad_norm": 0.2432408248373072, + "learning_rate": 1.8852181633955846e-06, + "loss": 0.5868, + "step": 592 + }, + { + "epoch": 0.46954025832632257, + "grad_norm": 0.2525747260022962, + "learning_rate": 1.8848311431234992e-06, + "loss": 0.5801, + "step": 593 + }, + { + "epoch": 0.4703320631464344, + "grad_norm": 0.261657075936512, + "learning_rate": 1.8844435113375093e-06, + "loss": 0.5759, + "step": 594 + }, + { + "epoch": 0.47112386796654626, + "grad_norm": 0.2642731371394818, + "learning_rate": 1.8840552683055102e-06, + "loss": 0.5749, + "step": 595 + }, + { + "epoch": 0.4719156727866581, + "grad_norm": 0.2670554515440983, + "learning_rate": 1.8836664142958197e-06, + "loss": 0.5734, + "step": 596 + }, + { + "epoch": 0.47270747760676995, + "grad_norm": 0.2550154162106024, + "learning_rate": 1.8832769495771782e-06, + "loss": 0.5713, + "step": 597 + }, + { + "epoch": 0.4734992824268818, + "grad_norm": 0.24938960319128795, + "learning_rate": 1.882886874418747e-06, + "loss": 0.6046, + "step": 598 + }, + { + "epoch": 0.4742910872469936, + "grad_norm": 0.2473340062743241, + "learning_rate": 1.8824961890901116e-06, + "loss": 0.5819, + "step": 599 + }, + { + "epoch": 0.47508289206710547, + "grad_norm": 0.2503894664133491, + "learning_rate": 1.8821048938612769e-06, + "loss": 0.5971, + "step": 600 + }, + { + "epoch": 0.4758746968872173, + "grad_norm": 0.2593563375135493, + "learning_rate": 1.8817129890026702e-06, + "loss": 0.6008, + "step": 601 + }, + { + "epoch": 0.47666650170732916, + "grad_norm": 0.2518731330101114, + "learning_rate": 1.8813204747851403e-06, + "loss": 0.5929, + "step": 602 + }, + { + "epoch": 0.477458306527441, + "grad_norm": 0.2534468089092246, + "learning_rate": 1.8809273514799569e-06, + "loss": 0.572, + "step": 603 + }, + { + "epoch": 0.47825011134755285, + "grad_norm": 0.2550315788078505, + "learning_rate": 1.8805336193588103e-06, + "loss": 0.6024, + "step": 604 + }, + { + "epoch": 0.47904191616766467, + "grad_norm": 0.25608653703733064, + "learning_rate": 1.8801392786938125e-06, + "loss": 0.5975, + "step": 605 + }, + { + "epoch": 0.4798337209877765, + "grad_norm": 0.26515830248195815, + "learning_rate": 1.8797443297574952e-06, + "loss": 0.5978, + "step": 606 + }, + { + "epoch": 0.48062552580788837, + "grad_norm": 0.2598031081750377, + "learning_rate": 1.8793487728228104e-06, + "loss": 0.5876, + "step": 607 + }, + { + "epoch": 0.4814173306280002, + "grad_norm": 0.2579622703822964, + "learning_rate": 1.8789526081631313e-06, + "loss": 0.5927, + "step": 608 + }, + { + "epoch": 0.48220913544811206, + "grad_norm": 0.24929178968775514, + "learning_rate": 1.8785558360522505e-06, + "loss": 0.5795, + "step": 609 + }, + { + "epoch": 0.4830009402682239, + "grad_norm": 0.24609387295872273, + "learning_rate": 1.87815845676438e-06, + "loss": 0.5887, + "step": 610 + }, + { + "epoch": 0.48379274508833575, + "grad_norm": 0.2599644480370626, + "learning_rate": 1.877760470574152e-06, + "loss": 0.5918, + "step": 611 + }, + { + "epoch": 0.48458454990844757, + "grad_norm": 0.25322423338767264, + "learning_rate": 1.8773618777566176e-06, + "loss": 0.6005, + "step": 612 + }, + { + "epoch": 0.4853763547285594, + "grad_norm": 0.24765658323912063, + "learning_rate": 1.8769626785872483e-06, + "loss": 0.5802, + "step": 613 + }, + { + "epoch": 0.48616815954867126, + "grad_norm": 0.25864308269449704, + "learning_rate": 1.876562873341933e-06, + "loss": 0.5964, + "step": 614 + }, + { + "epoch": 0.4869599643687831, + "grad_norm": 0.2406859239191986, + "learning_rate": 1.8761624622969807e-06, + "loss": 0.5824, + "step": 615 + }, + { + "epoch": 0.48775176918889496, + "grad_norm": 0.25436975707855747, + "learning_rate": 1.8757614457291187e-06, + "loss": 0.5832, + "step": 616 + }, + { + "epoch": 0.4885435740090068, + "grad_norm": 0.26589957568490175, + "learning_rate": 1.8753598239154926e-06, + "loss": 0.5902, + "step": 617 + }, + { + "epoch": 0.4893353788291186, + "grad_norm": 0.2823144704981466, + "learning_rate": 1.8749575971336672e-06, + "loss": 0.6006, + "step": 618 + }, + { + "epoch": 0.49012718364923047, + "grad_norm": 0.266502255384463, + "learning_rate": 1.874554765661623e-06, + "loss": 0.5925, + "step": 619 + }, + { + "epoch": 0.4909189884693423, + "grad_norm": 0.25868494586041696, + "learning_rate": 1.8741513297777619e-06, + "loss": 0.5848, + "step": 620 + }, + { + "epoch": 0.49171079328945416, + "grad_norm": 0.24420772635871477, + "learning_rate": 1.8737472897609008e-06, + "loss": 0.5858, + "step": 621 + }, + { + "epoch": 0.492502598109566, + "grad_norm": 0.25686435545350483, + "learning_rate": 1.8733426458902748e-06, + "loss": 0.5829, + "step": 622 + }, + { + "epoch": 0.49329440292967786, + "grad_norm": 0.2555585620025713, + "learning_rate": 1.872937398445537e-06, + "loss": 0.5967, + "step": 623 + }, + { + "epoch": 0.4940862077497897, + "grad_norm": 0.24698406765043235, + "learning_rate": 1.872531547706757e-06, + "loss": 0.5834, + "step": 624 + }, + { + "epoch": 0.4948780125699015, + "grad_norm": 0.2588839733704809, + "learning_rate": 1.8721250939544212e-06, + "loss": 0.6028, + "step": 625 + }, + { + "epoch": 0.49566981739001337, + "grad_norm": 0.2577950792786317, + "learning_rate": 1.8717180374694336e-06, + "loss": 0.5899, + "step": 626 + }, + { + "epoch": 0.4964616222101252, + "grad_norm": 0.2437095818796854, + "learning_rate": 1.871310378533114e-06, + "loss": 0.5823, + "step": 627 + }, + { + "epoch": 0.49725342703023706, + "grad_norm": 0.24544352768150463, + "learning_rate": 1.8709021174271982e-06, + "loss": 0.5886, + "step": 628 + }, + { + "epoch": 0.4980452318503489, + "grad_norm": 0.2607396835887682, + "learning_rate": 1.8704932544338395e-06, + "loss": 0.5958, + "step": 629 + }, + { + "epoch": 0.49883703667046075, + "grad_norm": 0.24314494711697102, + "learning_rate": 1.8700837898356063e-06, + "loss": 0.5957, + "step": 630 + }, + { + "epoch": 0.4996288414905726, + "grad_norm": 0.2548555939972503, + "learning_rate": 1.8696737239154823e-06, + "loss": 0.5828, + "step": 631 + }, + { + "epoch": 0.5004206463106844, + "grad_norm": 0.2507961409164316, + "learning_rate": 1.8692630569568682e-06, + "loss": 0.5878, + "step": 632 + }, + { + "epoch": 0.5012124511307963, + "grad_norm": 0.24630360548273397, + "learning_rate": 1.8688517892435782e-06, + "loss": 0.5811, + "step": 633 + }, + { + "epoch": 0.5020042559509081, + "grad_norm": 0.2583300839160151, + "learning_rate": 1.8684399210598438e-06, + "loss": 0.595, + "step": 634 + }, + { + "epoch": 0.5027960607710199, + "grad_norm": 0.24514303969134071, + "learning_rate": 1.8680274526903094e-06, + "loss": 0.5882, + "step": 635 + }, + { + "epoch": 0.5035878655911318, + "grad_norm": 0.25767312521601865, + "learning_rate": 1.867614384420036e-06, + "loss": 0.5838, + "step": 636 + }, + { + "epoch": 0.5043796704112437, + "grad_norm": 0.2637330975882953, + "learning_rate": 1.867200716534498e-06, + "loss": 0.605, + "step": 637 + }, + { + "epoch": 0.5051714752313554, + "grad_norm": 0.25797054438517375, + "learning_rate": 1.8667864493195848e-06, + "loss": 0.5871, + "step": 638 + }, + { + "epoch": 0.5059632800514673, + "grad_norm": 0.2544771462188513, + "learning_rate": 1.8663715830615996e-06, + "loss": 0.5956, + "step": 639 + }, + { + "epoch": 0.5067550848715792, + "grad_norm": 0.26136970915324076, + "learning_rate": 1.8659561180472598e-06, + "loss": 0.5904, + "step": 640 + }, + { + "epoch": 0.507546889691691, + "grad_norm": 0.2644633648378998, + "learning_rate": 1.8655400545636968e-06, + "loss": 0.5931, + "step": 641 + }, + { + "epoch": 0.5083386945118028, + "grad_norm": 0.25217107553398144, + "learning_rate": 1.8651233928984551e-06, + "loss": 0.5818, + "step": 642 + }, + { + "epoch": 0.5091304993319147, + "grad_norm": 0.2612370653410413, + "learning_rate": 1.8647061333394932e-06, + "loss": 0.5878, + "step": 643 + }, + { + "epoch": 0.5099223041520266, + "grad_norm": 0.25112176662694013, + "learning_rate": 1.8642882761751826e-06, + "loss": 0.5792, + "step": 644 + }, + { + "epoch": 0.5107141089721383, + "grad_norm": 0.2553203153129554, + "learning_rate": 1.8638698216943074e-06, + "loss": 0.5855, + "step": 645 + }, + { + "epoch": 0.5115059137922502, + "grad_norm": 0.257605510384718, + "learning_rate": 1.8634507701860651e-06, + "loss": 0.5838, + "step": 646 + }, + { + "epoch": 0.5122977186123621, + "grad_norm": 0.24739133493766602, + "learning_rate": 1.8630311219400655e-06, + "loss": 0.5918, + "step": 647 + }, + { + "epoch": 0.5130895234324739, + "grad_norm": 0.25614233537193654, + "learning_rate": 1.8626108772463307e-06, + "loss": 0.5998, + "step": 648 + }, + { + "epoch": 0.5138813282525857, + "grad_norm": 0.2593901016358386, + "learning_rate": 1.8621900363952957e-06, + "loss": 0.5771, + "step": 649 + }, + { + "epoch": 0.5146731330726976, + "grad_norm": 0.2580921998074504, + "learning_rate": 1.8617685996778066e-06, + "loss": 0.5942, + "step": 650 + }, + { + "epoch": 0.5154649378928094, + "grad_norm": 0.24518934638594403, + "learning_rate": 1.8613465673851214e-06, + "loss": 0.5781, + "step": 651 + }, + { + "epoch": 0.5162567427129212, + "grad_norm": 0.2684980696453002, + "learning_rate": 1.8609239398089105e-06, + "loss": 0.5944, + "step": 652 + }, + { + "epoch": 0.5170485475330331, + "grad_norm": 0.2467354031644924, + "learning_rate": 1.8605007172412553e-06, + "loss": 0.5981, + "step": 653 + }, + { + "epoch": 0.517840352353145, + "grad_norm": 0.2551470866172334, + "learning_rate": 1.8600768999746477e-06, + "loss": 0.5777, + "step": 654 + }, + { + "epoch": 0.5186321571732568, + "grad_norm": 0.2547867933652486, + "learning_rate": 1.8596524883019918e-06, + "loss": 0.6042, + "step": 655 + }, + { + "epoch": 0.5194239619933686, + "grad_norm": 0.258259134300438, + "learning_rate": 1.8592274825166015e-06, + "loss": 0.5829, + "step": 656 + }, + { + "epoch": 0.5202157668134805, + "grad_norm": 0.24675317898231072, + "learning_rate": 1.8588018829122021e-06, + "loss": 0.5841, + "step": 657 + }, + { + "epoch": 0.5210075716335923, + "grad_norm": 0.26642682255162203, + "learning_rate": 1.8583756897829285e-06, + "loss": 0.5868, + "step": 658 + }, + { + "epoch": 0.5217993764537041, + "grad_norm": 0.2680952507866509, + "learning_rate": 1.8579489034233264e-06, + "loss": 0.5781, + "step": 659 + }, + { + "epoch": 0.522591181273816, + "grad_norm": 0.2692078025041211, + "learning_rate": 1.8575215241283515e-06, + "loss": 0.6057, + "step": 660 + }, + { + "epoch": 0.5233829860939279, + "grad_norm": 0.25931733304226906, + "learning_rate": 1.8570935521933686e-06, + "loss": 0.5826, + "step": 661 + }, + { + "epoch": 0.5241747909140397, + "grad_norm": 0.2541083319258514, + "learning_rate": 1.856664987914153e-06, + "loss": 0.5786, + "step": 662 + }, + { + "epoch": 0.5249665957341515, + "grad_norm": 0.2661944097834986, + "learning_rate": 1.8562358315868886e-06, + "loss": 0.5887, + "step": 663 + }, + { + "epoch": 0.5257584005542634, + "grad_norm": 0.276640962139248, + "learning_rate": 1.8558060835081688e-06, + "loss": 0.5866, + "step": 664 + }, + { + "epoch": 0.5265502053743752, + "grad_norm": 0.2592799780279227, + "learning_rate": 1.8553757439749962e-06, + "loss": 0.5855, + "step": 665 + }, + { + "epoch": 0.527342010194487, + "grad_norm": 0.24091040206785624, + "learning_rate": 1.8549448132847814e-06, + "loss": 0.5862, + "step": 666 + }, + { + "epoch": 0.5281338150145989, + "grad_norm": 0.24197516449912457, + "learning_rate": 1.8545132917353445e-06, + "loss": 0.6064, + "step": 667 + }, + { + "epoch": 0.5289256198347108, + "grad_norm": 0.253739866990384, + "learning_rate": 1.8540811796249134e-06, + "loss": 0.5791, + "step": 668 + }, + { + "epoch": 0.5297174246548226, + "grad_norm": 0.24467570017073437, + "learning_rate": 1.8536484772521243e-06, + "loss": 0.5718, + "step": 669 + }, + { + "epoch": 0.5305092294749344, + "grad_norm": 0.24056093726754427, + "learning_rate": 1.8532151849160212e-06, + "loss": 0.5768, + "step": 670 + }, + { + "epoch": 0.5313010342950463, + "grad_norm": 0.24983386225648294, + "learning_rate": 1.852781302916056e-06, + "loss": 0.5943, + "step": 671 + }, + { + "epoch": 0.5320928391151581, + "grad_norm": 0.2594596210177295, + "learning_rate": 1.8523468315520878e-06, + "loss": 0.5929, + "step": 672 + }, + { + "epoch": 0.5328846439352699, + "grad_norm": 0.2477292597466106, + "learning_rate": 1.8519117711243836e-06, + "loss": 0.5986, + "step": 673 + }, + { + "epoch": 0.5336764487553818, + "grad_norm": 0.23765886344255474, + "learning_rate": 1.8514761219336167e-06, + "loss": 0.5707, + "step": 674 + }, + { + "epoch": 0.5344682535754937, + "grad_norm": 0.24901878692343485, + "learning_rate": 1.851039884280868e-06, + "loss": 0.5996, + "step": 675 + }, + { + "epoch": 0.5352600583956055, + "grad_norm": 0.24421886070902452, + "learning_rate": 1.8506030584676253e-06, + "loss": 0.5848, + "step": 676 + }, + { + "epoch": 0.5360518632157173, + "grad_norm": 0.24280405428062857, + "learning_rate": 1.8501656447957817e-06, + "loss": 0.5844, + "step": 677 + }, + { + "epoch": 0.5368436680358292, + "grad_norm": 0.25268297516887805, + "learning_rate": 1.8497276435676375e-06, + "loss": 0.5879, + "step": 678 + }, + { + "epoch": 0.537635472855941, + "grad_norm": 0.25323598508216055, + "learning_rate": 1.8492890550858991e-06, + "loss": 0.5749, + "step": 679 + }, + { + "epoch": 0.5384272776760528, + "grad_norm": 0.2469965783028501, + "learning_rate": 1.8488498796536782e-06, + "loss": 0.6041, + "step": 680 + }, + { + "epoch": 0.5392190824961647, + "grad_norm": 0.2456833497636686, + "learning_rate": 1.8484101175744926e-06, + "loss": 0.5829, + "step": 681 + }, + { + "epoch": 0.5400108873162766, + "grad_norm": 0.23916202458714156, + "learning_rate": 1.8479697691522657e-06, + "loss": 0.5711, + "step": 682 + }, + { + "epoch": 0.5408026921363884, + "grad_norm": 0.25289719726361376, + "learning_rate": 1.8475288346913253e-06, + "loss": 0.5968, + "step": 683 + }, + { + "epoch": 0.5415944969565002, + "grad_norm": 0.26543836193385023, + "learning_rate": 1.847087314496405e-06, + "loss": 0.5929, + "step": 684 + }, + { + "epoch": 0.5423863017766121, + "grad_norm": 0.2746509392353278, + "learning_rate": 1.8466452088726431e-06, + "loss": 0.5916, + "step": 685 + }, + { + "epoch": 0.5431781065967239, + "grad_norm": 0.2486144683867524, + "learning_rate": 1.8462025181255822e-06, + "loss": 0.5935, + "step": 686 + }, + { + "epoch": 0.5439699114168357, + "grad_norm": 0.2572507467885107, + "learning_rate": 1.8457592425611694e-06, + "loss": 0.5856, + "step": 687 + }, + { + "epoch": 0.5447617162369476, + "grad_norm": 0.25691373702239123, + "learning_rate": 1.845315382485756e-06, + "loss": 0.5806, + "step": 688 + }, + { + "epoch": 0.5455535210570595, + "grad_norm": 0.2697588241292833, + "learning_rate": 1.8448709382060975e-06, + "loss": 0.5822, + "step": 689 + }, + { + "epoch": 0.5463453258771713, + "grad_norm": 0.25767372746150385, + "learning_rate": 1.8444259100293523e-06, + "loss": 0.5921, + "step": 690 + }, + { + "epoch": 0.5471371306972831, + "grad_norm": 0.24439659052624355, + "learning_rate": 1.8439802982630837e-06, + "loss": 0.6003, + "step": 691 + }, + { + "epoch": 0.547928935517395, + "grad_norm": 0.24572923347720313, + "learning_rate": 1.8435341032152566e-06, + "loss": 0.5978, + "step": 692 + }, + { + "epoch": 0.5487207403375068, + "grad_norm": 0.26320713917576716, + "learning_rate": 1.8430873251942414e-06, + "loss": 0.5826, + "step": 693 + }, + { + "epoch": 0.5495125451576186, + "grad_norm": 0.26980082421070706, + "learning_rate": 1.842639964508809e-06, + "loss": 0.5841, + "step": 694 + }, + { + "epoch": 0.5503043499777305, + "grad_norm": 0.24912616528732986, + "learning_rate": 1.8421920214681336e-06, + "loss": 0.5676, + "step": 695 + }, + { + "epoch": 0.5510961547978424, + "grad_norm": 0.25120583657924267, + "learning_rate": 1.841743496381793e-06, + "loss": 0.5812, + "step": 696 + }, + { + "epoch": 0.5518879596179542, + "grad_norm": 0.249207112222779, + "learning_rate": 1.8412943895597665e-06, + "loss": 0.585, + "step": 697 + }, + { + "epoch": 0.552679764438066, + "grad_norm": 0.2678058120982978, + "learning_rate": 1.8408447013124352e-06, + "loss": 0.5886, + "step": 698 + }, + { + "epoch": 0.5534715692581779, + "grad_norm": 0.2552786391349403, + "learning_rate": 1.8403944319505817e-06, + "loss": 0.5767, + "step": 699 + }, + { + "epoch": 0.5542633740782897, + "grad_norm": 0.2465657211337802, + "learning_rate": 1.8399435817853916e-06, + "loss": 0.5787, + "step": 700 + }, + { + "epoch": 0.5550551788984015, + "grad_norm": 0.2499739623405065, + "learning_rate": 1.839492151128451e-06, + "loss": 0.5882, + "step": 701 + }, + { + "epoch": 0.5558469837185134, + "grad_norm": 0.2599297446070384, + "learning_rate": 1.8390401402917467e-06, + "loss": 0.5834, + "step": 702 + }, + { + "epoch": 0.5566387885386253, + "grad_norm": 0.25255278534208114, + "learning_rate": 1.8385875495876678e-06, + "loss": 0.5988, + "step": 703 + }, + { + "epoch": 0.5574305933587371, + "grad_norm": 0.26171628996005464, + "learning_rate": 1.8381343793290025e-06, + "loss": 0.5885, + "step": 704 + }, + { + "epoch": 0.5582223981788489, + "grad_norm": 0.251767699867137, + "learning_rate": 1.8376806298289407e-06, + "loss": 0.5851, + "step": 705 + }, + { + "epoch": 0.5590142029989608, + "grad_norm": 0.22989634696864383, + "learning_rate": 1.8372263014010726e-06, + "loss": 0.5972, + "step": 706 + }, + { + "epoch": 0.5598060078190726, + "grad_norm": 0.2516276753539971, + "learning_rate": 1.8367713943593882e-06, + "loss": 0.5768, + "step": 707 + }, + { + "epoch": 0.5605978126391844, + "grad_norm": 0.2507232449803271, + "learning_rate": 1.836315909018277e-06, + "loss": 0.6022, + "step": 708 + }, + { + "epoch": 0.5613896174592963, + "grad_norm": 0.24432084272529186, + "learning_rate": 1.8358598456925288e-06, + "loss": 0.5714, + "step": 709 + }, + { + "epoch": 0.5621814222794081, + "grad_norm": 0.23384148212599462, + "learning_rate": 1.8354032046973328e-06, + "loss": 0.5861, + "step": 710 + }, + { + "epoch": 0.5629732270995199, + "grad_norm": 0.250622420279018, + "learning_rate": 1.8349459863482772e-06, + "loss": 0.5801, + "step": 711 + }, + { + "epoch": 0.5637650319196318, + "grad_norm": 0.24902384594374952, + "learning_rate": 1.8344881909613493e-06, + "loss": 0.5961, + "step": 712 + }, + { + "epoch": 0.5645568367397437, + "grad_norm": 0.24569769352466586, + "learning_rate": 1.8340298188529347e-06, + "loss": 0.5862, + "step": 713 + }, + { + "epoch": 0.5653486415598555, + "grad_norm": 0.24281376574785227, + "learning_rate": 1.8335708703398185e-06, + "loss": 0.5748, + "step": 714 + }, + { + "epoch": 0.5661404463799673, + "grad_norm": 0.23713171840827416, + "learning_rate": 1.8331113457391837e-06, + "loss": 0.5933, + "step": 715 + }, + { + "epoch": 0.5669322512000792, + "grad_norm": 0.24520199763855147, + "learning_rate": 1.832651245368611e-06, + "loss": 0.5958, + "step": 716 + }, + { + "epoch": 0.567724056020191, + "grad_norm": 0.23584122838719365, + "learning_rate": 1.8321905695460802e-06, + "loss": 0.5837, + "step": 717 + }, + { + "epoch": 0.5685158608403028, + "grad_norm": 0.2548975334264314, + "learning_rate": 1.8317293185899676e-06, + "loss": 0.5888, + "step": 718 + }, + { + "epoch": 0.5693076656604147, + "grad_norm": 0.24645152919598493, + "learning_rate": 1.8312674928190472e-06, + "loss": 0.5916, + "step": 719 + }, + { + "epoch": 0.5700994704805266, + "grad_norm": 0.2508435342752548, + "learning_rate": 1.8308050925524912e-06, + "loss": 0.6117, + "step": 720 + }, + { + "epoch": 0.5708912753006384, + "grad_norm": 0.2541834980979583, + "learning_rate": 1.8303421181098676e-06, + "loss": 0.5857, + "step": 721 + }, + { + "epoch": 0.5716830801207502, + "grad_norm": 0.24482232197513573, + "learning_rate": 1.8298785698111415e-06, + "loss": 0.6008, + "step": 722 + }, + { + "epoch": 0.5724748849408621, + "grad_norm": 0.257618849929514, + "learning_rate": 1.8294144479766756e-06, + "loss": 0.5841, + "step": 723 + }, + { + "epoch": 0.573266689760974, + "grad_norm": 0.23694378769102648, + "learning_rate": 1.8289497529272277e-06, + "loss": 0.5856, + "step": 724 + }, + { + "epoch": 0.5740584945810857, + "grad_norm": 0.2559523294309274, + "learning_rate": 1.8284844849839526e-06, + "loss": 0.5939, + "step": 725 + }, + { + "epoch": 0.5748502994011976, + "grad_norm": 0.2430474922964609, + "learning_rate": 1.8280186444684005e-06, + "loss": 0.5735, + "step": 726 + }, + { + "epoch": 0.5756421042213095, + "grad_norm": 0.2421389562029136, + "learning_rate": 1.8275522317025174e-06, + "loss": 0.5863, + "step": 727 + }, + { + "epoch": 0.5764339090414213, + "grad_norm": 0.24719382586531582, + "learning_rate": 1.8270852470086452e-06, + "loss": 0.5793, + "step": 728 + }, + { + "epoch": 0.5772257138615331, + "grad_norm": 0.2613922322447402, + "learning_rate": 1.8266176907095206e-06, + "loss": 0.583, + "step": 729 + }, + { + "epoch": 0.578017518681645, + "grad_norm": 0.24985652742114342, + "learning_rate": 1.8261495631282756e-06, + "loss": 0.5867, + "step": 730 + }, + { + "epoch": 0.5788093235017568, + "grad_norm": 0.2509190981231187, + "learning_rate": 1.8256808645884365e-06, + "loss": 0.5842, + "step": 731 + }, + { + "epoch": 0.5796011283218686, + "grad_norm": 0.25873515370083255, + "learning_rate": 1.8252115954139255e-06, + "loss": 0.5866, + "step": 732 + }, + { + "epoch": 0.5803929331419805, + "grad_norm": 0.24999759706019414, + "learning_rate": 1.8247417559290575e-06, + "loss": 0.6023, + "step": 733 + }, + { + "epoch": 0.5811847379620924, + "grad_norm": 0.24960925570319883, + "learning_rate": 1.8242713464585426e-06, + "loss": 0.5838, + "step": 734 + }, + { + "epoch": 0.5819765427822042, + "grad_norm": 0.24836711610313497, + "learning_rate": 1.8238003673274847e-06, + "loss": 0.5827, + "step": 735 + }, + { + "epoch": 0.582768347602316, + "grad_norm": 0.24643437314791616, + "learning_rate": 1.8233288188613814e-06, + "loss": 0.5913, + "step": 736 + }, + { + "epoch": 0.5835601524224279, + "grad_norm": 0.26687565546039715, + "learning_rate": 1.8228567013861228e-06, + "loss": 0.5839, + "step": 737 + }, + { + "epoch": 0.5843519572425397, + "grad_norm": 0.24424129443944192, + "learning_rate": 1.8223840152279942e-06, + "loss": 0.5771, + "step": 738 + }, + { + "epoch": 0.5851437620626515, + "grad_norm": 0.2471018097029849, + "learning_rate": 1.821910760713672e-06, + "loss": 0.5777, + "step": 739 + }, + { + "epoch": 0.5859355668827634, + "grad_norm": 0.2547865860289802, + "learning_rate": 1.821436938170226e-06, + "loss": 0.6005, + "step": 740 + }, + { + "epoch": 0.5867273717028753, + "grad_norm": 0.25810692590791984, + "learning_rate": 1.82096254792512e-06, + "loss": 0.5918, + "step": 741 + }, + { + "epoch": 0.5875191765229871, + "grad_norm": 0.2534896323977627, + "learning_rate": 1.8204875903062078e-06, + "loss": 0.5781, + "step": 742 + }, + { + "epoch": 0.5883109813430989, + "grad_norm": 0.250310436243135, + "learning_rate": 1.8200120656417365e-06, + "loss": 0.5936, + "step": 743 + }, + { + "epoch": 0.5891027861632108, + "grad_norm": 0.25846177780365504, + "learning_rate": 1.8195359742603455e-06, + "loss": 0.5808, + "step": 744 + }, + { + "epoch": 0.5898945909833226, + "grad_norm": 0.2434882436517003, + "learning_rate": 1.8190593164910651e-06, + "loss": 0.5859, + "step": 745 + }, + { + "epoch": 0.5906863958034344, + "grad_norm": 0.27177810679363984, + "learning_rate": 1.8185820926633173e-06, + "loss": 0.6042, + "step": 746 + }, + { + "epoch": 0.5914782006235463, + "grad_norm": 0.2586591937858233, + "learning_rate": 1.8181043031069155e-06, + "loss": 0.5857, + "step": 747 + }, + { + "epoch": 0.5922700054436582, + "grad_norm": 0.2573533284689728, + "learning_rate": 1.8176259481520635e-06, + "loss": 0.5956, + "step": 748 + }, + { + "epoch": 0.59306181026377, + "grad_norm": 0.24718324290916885, + "learning_rate": 1.8171470281293564e-06, + "loss": 0.584, + "step": 749 + }, + { + "epoch": 0.5938536150838818, + "grad_norm": 0.2696495436808201, + "learning_rate": 1.81666754336978e-06, + "loss": 0.6009, + "step": 750 + }, + { + "epoch": 0.5946454199039937, + "grad_norm": 0.24839540781894592, + "learning_rate": 1.8161874942047097e-06, + "loss": 0.583, + "step": 751 + }, + { + "epoch": 0.5954372247241055, + "grad_norm": 0.251998162865897, + "learning_rate": 1.8157068809659117e-06, + "loss": 0.6036, + "step": 752 + }, + { + "epoch": 0.5962290295442173, + "grad_norm": 0.27137743828456923, + "learning_rate": 1.8152257039855412e-06, + "loss": 0.5889, + "step": 753 + }, + { + "epoch": 0.5970208343643292, + "grad_norm": 0.26559405382573104, + "learning_rate": 1.8147439635961436e-06, + "loss": 0.5994, + "step": 754 + }, + { + "epoch": 0.597812639184441, + "grad_norm": 0.2582769092884584, + "learning_rate": 1.814261660130654e-06, + "loss": 0.5912, + "step": 755 + }, + { + "epoch": 0.5986044440045529, + "grad_norm": 0.28215847993428234, + "learning_rate": 1.8137787939223953e-06, + "loss": 0.5902, + "step": 756 + }, + { + "epoch": 0.5993962488246647, + "grad_norm": 0.25873844774249477, + "learning_rate": 1.8132953653050812e-06, + "loss": 0.5822, + "step": 757 + }, + { + "epoch": 0.6001880536447766, + "grad_norm": 0.25218363810499717, + "learning_rate": 1.8128113746128127e-06, + "loss": 0.6108, + "step": 758 + }, + { + "epoch": 0.6009798584648884, + "grad_norm": 0.2643151580574007, + "learning_rate": 1.8123268221800793e-06, + "loss": 0.5815, + "step": 759 + }, + { + "epoch": 0.6017716632850002, + "grad_norm": 0.251824401021597, + "learning_rate": 1.8118417083417596e-06, + "loss": 0.5758, + "step": 760 + }, + { + "epoch": 0.6025634681051121, + "grad_norm": 0.24613995055877097, + "learning_rate": 1.8113560334331193e-06, + "loss": 0.5776, + "step": 761 + }, + { + "epoch": 0.603355272925224, + "grad_norm": 0.2601752874991997, + "learning_rate": 1.8108697977898126e-06, + "loss": 0.5848, + "step": 762 + }, + { + "epoch": 0.6041470777453358, + "grad_norm": 0.27174380073397775, + "learning_rate": 1.8103830017478805e-06, + "loss": 0.5731, + "step": 763 + }, + { + "epoch": 0.6049388825654476, + "grad_norm": 0.2430994748190506, + "learning_rate": 1.8098956456437518e-06, + "loss": 0.593, + "step": 764 + }, + { + "epoch": 0.6057306873855595, + "grad_norm": 0.2647416968126511, + "learning_rate": 1.8094077298142422e-06, + "loss": 0.5857, + "step": 765 + }, + { + "epoch": 0.6065224922056713, + "grad_norm": 0.27182892276562726, + "learning_rate": 1.8089192545965542e-06, + "loss": 0.5986, + "step": 766 + }, + { + "epoch": 0.6073142970257831, + "grad_norm": 0.25334048747254595, + "learning_rate": 1.808430220328277e-06, + "loss": 0.5791, + "step": 767 + }, + { + "epoch": 0.608106101845895, + "grad_norm": 0.259321595603846, + "learning_rate": 1.807940627347386e-06, + "loss": 0.5987, + "step": 768 + }, + { + "epoch": 0.6088979066660068, + "grad_norm": 0.2524989237650266, + "learning_rate": 1.807450475992243e-06, + "loss": 0.5911, + "step": 769 + }, + { + "epoch": 0.6096897114861187, + "grad_norm": 0.2535110040211958, + "learning_rate": 1.8069597666015956e-06, + "loss": 0.5786, + "step": 770 + }, + { + "epoch": 0.6104815163062305, + "grad_norm": 0.24658292893806905, + "learning_rate": 1.806468499514577e-06, + "loss": 0.5727, + "step": 771 + }, + { + "epoch": 0.6112733211263424, + "grad_norm": 0.2501300035527694, + "learning_rate": 1.8059766750707054e-06, + "loss": 0.5856, + "step": 772 + }, + { + "epoch": 0.6120651259464542, + "grad_norm": 0.24278539486327075, + "learning_rate": 1.8054842936098851e-06, + "loss": 0.5946, + "step": 773 + }, + { + "epoch": 0.612856930766566, + "grad_norm": 0.24808345795590186, + "learning_rate": 1.8049913554724046e-06, + "loss": 0.5828, + "step": 774 + }, + { + "epoch": 0.6136487355866779, + "grad_norm": 0.24297341717574392, + "learning_rate": 1.8044978609989374e-06, + "loss": 0.5817, + "step": 775 + }, + { + "epoch": 0.6144405404067897, + "grad_norm": 0.2573455405039158, + "learning_rate": 1.8040038105305415e-06, + "loss": 0.5883, + "step": 776 + }, + { + "epoch": 0.6152323452269016, + "grad_norm": 0.25158511627175334, + "learning_rate": 1.8035092044086594e-06, + "loss": 0.5672, + "step": 777 + }, + { + "epoch": 0.6160241500470134, + "grad_norm": 0.2522281238300653, + "learning_rate": 1.8030140429751173e-06, + "loss": 0.5715, + "step": 778 + }, + { + "epoch": 0.6168159548671253, + "grad_norm": 0.24727560215888592, + "learning_rate": 1.8025183265721251e-06, + "loss": 0.585, + "step": 779 + }, + { + "epoch": 0.6176077596872371, + "grad_norm": 0.2541287471761876, + "learning_rate": 1.8020220555422763e-06, + "loss": 0.5886, + "step": 780 + }, + { + "epoch": 0.6183995645073489, + "grad_norm": 0.2533308021305354, + "learning_rate": 1.801525230228548e-06, + "loss": 0.6008, + "step": 781 + }, + { + "epoch": 0.6191913693274608, + "grad_norm": 0.23744897496086267, + "learning_rate": 1.8010278509743e-06, + "loss": 0.5711, + "step": 782 + }, + { + "epoch": 0.6199831741475726, + "grad_norm": 0.25297067693147807, + "learning_rate": 1.8005299181232751e-06, + "loss": 0.5805, + "step": 783 + }, + { + "epoch": 0.6207749789676845, + "grad_norm": 0.24932173741694674, + "learning_rate": 1.800031432019599e-06, + "loss": 0.5863, + "step": 784 + }, + { + "epoch": 0.6215667837877963, + "grad_norm": 0.24535893078651158, + "learning_rate": 1.7995323930077791e-06, + "loss": 0.5872, + "step": 785 + }, + { + "epoch": 0.6223585886079082, + "grad_norm": 0.2383218746738785, + "learning_rate": 1.7990328014327054e-06, + "loss": 0.5742, + "step": 786 + }, + { + "epoch": 0.62315039342802, + "grad_norm": 0.25634064286313396, + "learning_rate": 1.7985326576396496e-06, + "loss": 0.6043, + "step": 787 + }, + { + "epoch": 0.6239421982481318, + "grad_norm": 0.24731036437872886, + "learning_rate": 1.798031961974265e-06, + "loss": 0.5789, + "step": 788 + }, + { + "epoch": 0.6247340030682437, + "grad_norm": 0.25243435536386216, + "learning_rate": 1.7975307147825864e-06, + "loss": 0.5845, + "step": 789 + }, + { + "epoch": 0.6255258078883555, + "grad_norm": 0.2511772367483909, + "learning_rate": 1.7970289164110301e-06, + "loss": 0.5776, + "step": 790 + }, + { + "epoch": 0.6263176127084673, + "grad_norm": 0.2532844706828268, + "learning_rate": 1.7965265672063927e-06, + "loss": 0.5824, + "step": 791 + }, + { + "epoch": 0.6271094175285792, + "grad_norm": 0.2658218019833223, + "learning_rate": 1.7960236675158518e-06, + "loss": 0.5956, + "step": 792 + }, + { + "epoch": 0.6279012223486911, + "grad_norm": 0.25412952245730175, + "learning_rate": 1.7955202176869655e-06, + "loss": 0.593, + "step": 793 + }, + { + "epoch": 0.6286930271688029, + "grad_norm": 0.2561659207216323, + "learning_rate": 1.7950162180676717e-06, + "loss": 0.5893, + "step": 794 + }, + { + "epoch": 0.6294848319889147, + "grad_norm": 0.2517474971714552, + "learning_rate": 1.794511669006289e-06, + "loss": 0.6108, + "step": 795 + }, + { + "epoch": 0.6302766368090266, + "grad_norm": 0.2528102682703397, + "learning_rate": 1.7940065708515148e-06, + "loss": 0.5992, + "step": 796 + }, + { + "epoch": 0.6310684416291384, + "grad_norm": 0.25992360575752016, + "learning_rate": 1.7935009239524273e-06, + "loss": 0.5795, + "step": 797 + }, + { + "epoch": 0.6318602464492502, + "grad_norm": 0.2554063427667969, + "learning_rate": 1.7929947286584823e-06, + "loss": 0.5985, + "step": 798 + }, + { + "epoch": 0.6326520512693621, + "grad_norm": 0.24422735834302362, + "learning_rate": 1.792487985319516e-06, + "loss": 0.5832, + "step": 799 + }, + { + "epoch": 0.633443856089474, + "grad_norm": 0.24622861998843143, + "learning_rate": 1.7919806942857426e-06, + "loss": 0.5742, + "step": 800 + }, + { + "epoch": 0.6342356609095858, + "grad_norm": 0.24029933869830092, + "learning_rate": 1.7914728559077546e-06, + "loss": 0.5787, + "step": 801 + }, + { + "epoch": 0.6350274657296976, + "grad_norm": 0.2429316083100275, + "learning_rate": 1.790964470536524e-06, + "loss": 0.592, + "step": 802 + }, + { + "epoch": 0.6358192705498095, + "grad_norm": 0.24302877995526823, + "learning_rate": 1.7904555385233994e-06, + "loss": 0.5866, + "step": 803 + }, + { + "epoch": 0.6366110753699213, + "grad_norm": 0.24746514901937777, + "learning_rate": 1.7899460602201077e-06, + "loss": 0.5839, + "step": 804 + }, + { + "epoch": 0.6374028801900331, + "grad_norm": 0.2420302965684456, + "learning_rate": 1.789436035978754e-06, + "loss": 0.5971, + "step": 805 + }, + { + "epoch": 0.638194685010145, + "grad_norm": 0.24385045963601545, + "learning_rate": 1.7889254661518196e-06, + "loss": 0.5815, + "step": 806 + }, + { + "epoch": 0.6389864898302569, + "grad_norm": 0.23828991606762473, + "learning_rate": 1.7884143510921637e-06, + "loss": 0.5953, + "step": 807 + }, + { + "epoch": 0.6397782946503687, + "grad_norm": 0.24420936560825504, + "learning_rate": 1.7879026911530222e-06, + "loss": 0.5926, + "step": 808 + }, + { + "epoch": 0.6405700994704805, + "grad_norm": 0.2714421774156976, + "learning_rate": 1.787390486688007e-06, + "loss": 0.5844, + "step": 809 + }, + { + "epoch": 0.6413619042905924, + "grad_norm": 0.24412097745674205, + "learning_rate": 1.786877738051107e-06, + "loss": 0.5844, + "step": 810 + }, + { + "epoch": 0.6421537091107042, + "grad_norm": 0.25422701784506546, + "learning_rate": 1.7863644455966865e-06, + "loss": 0.5693, + "step": 811 + }, + { + "epoch": 0.642945513930816, + "grad_norm": 0.2453540573413632, + "learning_rate": 1.7858506096794864e-06, + "loss": 0.5951, + "step": 812 + }, + { + "epoch": 0.6437373187509279, + "grad_norm": 0.2636005996907456, + "learning_rate": 1.7853362306546229e-06, + "loss": 0.5916, + "step": 813 + }, + { + "epoch": 0.6445291235710398, + "grad_norm": 0.24997894283566946, + "learning_rate": 1.7848213088775874e-06, + "loss": 0.5794, + "step": 814 + }, + { + "epoch": 0.6453209283911516, + "grad_norm": 0.24609967370733982, + "learning_rate": 1.784305844704246e-06, + "loss": 0.5842, + "step": 815 + }, + { + "epoch": 0.6461127332112634, + "grad_norm": 0.25314990024112644, + "learning_rate": 1.7837898384908407e-06, + "loss": 0.5754, + "step": 816 + }, + { + "epoch": 0.6469045380313753, + "grad_norm": 0.2528191893842514, + "learning_rate": 1.7832732905939876e-06, + "loss": 0.5924, + "step": 817 + }, + { + "epoch": 0.6476963428514871, + "grad_norm": 0.24435530651812093, + "learning_rate": 1.7827562013706766e-06, + "loss": 0.5826, + "step": 818 + }, + { + "epoch": 0.6484881476715989, + "grad_norm": 0.2572416831231824, + "learning_rate": 1.7822385711782726e-06, + "loss": 0.608, + "step": 819 + }, + { + "epoch": 0.6492799524917108, + "grad_norm": 0.25617732603538695, + "learning_rate": 1.7817204003745134e-06, + "loss": 0.5969, + "step": 820 + }, + { + "epoch": 0.6500717573118227, + "grad_norm": 0.2521166470503421, + "learning_rate": 1.7812016893175119e-06, + "loss": 0.5871, + "step": 821 + }, + { + "epoch": 0.6508635621319345, + "grad_norm": 0.25056053985682086, + "learning_rate": 1.780682438365753e-06, + "loss": 0.5848, + "step": 822 + }, + { + "epoch": 0.6516553669520463, + "grad_norm": 0.25287431062010535, + "learning_rate": 1.780162647878095e-06, + "loss": 0.5828, + "step": 823 + }, + { + "epoch": 0.6524471717721582, + "grad_norm": 0.25007192663814964, + "learning_rate": 1.7796423182137694e-06, + "loss": 0.5854, + "step": 824 + }, + { + "epoch": 0.65323897659227, + "grad_norm": 0.25043387039495113, + "learning_rate": 1.7791214497323805e-06, + "loss": 0.5937, + "step": 825 + }, + { + "epoch": 0.6540307814123818, + "grad_norm": 0.25615998922421296, + "learning_rate": 1.7786000427939047e-06, + "loss": 0.5741, + "step": 826 + }, + { + "epoch": 0.6548225862324937, + "grad_norm": 0.24930777273834764, + "learning_rate": 1.77807809775869e-06, + "loss": 0.5908, + "step": 827 + }, + { + "epoch": 0.6556143910526055, + "grad_norm": 0.2585417804755247, + "learning_rate": 1.777555614987457e-06, + "loss": 0.5883, + "step": 828 + }, + { + "epoch": 0.6564061958727174, + "grad_norm": 0.24450521436137, + "learning_rate": 1.777032594841298e-06, + "loss": 0.5678, + "step": 829 + }, + { + "epoch": 0.6571980006928292, + "grad_norm": 0.24862190261035272, + "learning_rate": 1.7765090376816766e-06, + "loss": 0.592, + "step": 830 + }, + { + "epoch": 0.6579898055129411, + "grad_norm": 0.25432285465044857, + "learning_rate": 1.7759849438704264e-06, + "loss": 0.5703, + "step": 831 + }, + { + "epoch": 0.6587816103330529, + "grad_norm": 0.25194906787878907, + "learning_rate": 1.775460313769754e-06, + "loss": 0.5886, + "step": 832 + }, + { + "epoch": 0.6595734151531647, + "grad_norm": 0.24467044323457524, + "learning_rate": 1.7749351477422345e-06, + "loss": 0.594, + "step": 833 + }, + { + "epoch": 0.6603652199732766, + "grad_norm": 0.2357389926591831, + "learning_rate": 1.774409446150815e-06, + "loss": 0.5786, + "step": 834 + }, + { + "epoch": 0.6611570247933884, + "grad_norm": 0.25621856938464344, + "learning_rate": 1.773883209358812e-06, + "loss": 0.5668, + "step": 835 + }, + { + "epoch": 0.6619488296135003, + "grad_norm": 0.2538284329877011, + "learning_rate": 1.7733564377299115e-06, + "loss": 0.5804, + "step": 836 + }, + { + "epoch": 0.6627406344336121, + "grad_norm": 0.248575447549464, + "learning_rate": 1.7728291316281703e-06, + "loss": 0.5987, + "step": 837 + }, + { + "epoch": 0.663532439253724, + "grad_norm": 0.2559812446732376, + "learning_rate": 1.7723012914180134e-06, + "loss": 0.5894, + "step": 838 + }, + { + "epoch": 0.6643242440738358, + "grad_norm": 0.26293105509088166, + "learning_rate": 1.7717729174642355e-06, + "loss": 0.5879, + "step": 839 + }, + { + "epoch": 0.6651160488939476, + "grad_norm": 0.23970650462828844, + "learning_rate": 1.7712440101320004e-06, + "loss": 0.5985, + "step": 840 + }, + { + "epoch": 0.6659078537140595, + "grad_norm": 0.25677479955425603, + "learning_rate": 1.7707145697868392e-06, + "loss": 0.6022, + "step": 841 + }, + { + "epoch": 0.6666996585341713, + "grad_norm": 0.24918667579670953, + "learning_rate": 1.7701845967946537e-06, + "loss": 0.5706, + "step": 842 + }, + { + "epoch": 0.6674914633542832, + "grad_norm": 0.2565401820917414, + "learning_rate": 1.769654091521712e-06, + "loss": 0.5734, + "step": 843 + }, + { + "epoch": 0.668283268174395, + "grad_norm": 0.26433306600275414, + "learning_rate": 1.7691230543346501e-06, + "loss": 0.5932, + "step": 844 + }, + { + "epoch": 0.6690750729945069, + "grad_norm": 0.2569470821796716, + "learning_rate": 1.7685914856004725e-06, + "loss": 0.5822, + "step": 845 + }, + { + "epoch": 0.6698668778146187, + "grad_norm": 0.2544063865454498, + "learning_rate": 1.7680593856865503e-06, + "loss": 0.5874, + "step": 846 + }, + { + "epoch": 0.6706586826347305, + "grad_norm": 0.2542509968843097, + "learning_rate": 1.7675267549606225e-06, + "loss": 0.584, + "step": 847 + }, + { + "epoch": 0.6714504874548424, + "grad_norm": 0.24318698066947034, + "learning_rate": 1.7669935937907941e-06, + "loss": 0.5906, + "step": 848 + }, + { + "epoch": 0.6722422922749542, + "grad_norm": 0.24661612792719936, + "learning_rate": 1.766459902545537e-06, + "loss": 0.5813, + "step": 849 + }, + { + "epoch": 0.6730340970950661, + "grad_norm": 0.2492753457810347, + "learning_rate": 1.76592568159369e-06, + "loss": 0.5801, + "step": 850 + }, + { + "epoch": 0.6738259019151779, + "grad_norm": 0.24830446834246703, + "learning_rate": 1.765390931304457e-06, + "loss": 0.5915, + "step": 851 + }, + { + "epoch": 0.6746177067352898, + "grad_norm": 0.25431438924953426, + "learning_rate": 1.7648556520474083e-06, + "loss": 0.5836, + "step": 852 + }, + { + "epoch": 0.6754095115554016, + "grad_norm": 0.2588824070511301, + "learning_rate": 1.7643198441924797e-06, + "loss": 0.5879, + "step": 853 + }, + { + "epoch": 0.6762013163755134, + "grad_norm": 0.25038875476912814, + "learning_rate": 1.7637835081099729e-06, + "loss": 0.5725, + "step": 854 + }, + { + "epoch": 0.6769931211956253, + "grad_norm": 0.2514821846393738, + "learning_rate": 1.763246644170553e-06, + "loss": 0.5868, + "step": 855 + }, + { + "epoch": 0.6777849260157371, + "grad_norm": 0.2851916766730875, + "learning_rate": 1.7627092527452515e-06, + "loss": 0.5976, + "step": 856 + }, + { + "epoch": 0.678576730835849, + "grad_norm": 0.24179081935232716, + "learning_rate": 1.762171334205464e-06, + "loss": 0.5793, + "step": 857 + }, + { + "epoch": 0.6793685356559608, + "grad_norm": 0.25292715648491026, + "learning_rate": 1.7616328889229505e-06, + "loss": 0.5827, + "step": 858 + }, + { + "epoch": 0.6801603404760727, + "grad_norm": 0.23964732502085143, + "learning_rate": 1.7610939172698344e-06, + "loss": 0.5846, + "step": 859 + }, + { + "epoch": 0.6809521452961845, + "grad_norm": 0.24641216304573305, + "learning_rate": 1.7605544196186037e-06, + "loss": 0.6043, + "step": 860 + }, + { + "epoch": 0.6817439501162963, + "grad_norm": 0.2591671468302859, + "learning_rate": 1.7600143963421094e-06, + "loss": 0.5837, + "step": 861 + }, + { + "epoch": 0.6825357549364082, + "grad_norm": 0.24314631653055893, + "learning_rate": 1.7594738478135658e-06, + "loss": 0.5926, + "step": 862 + }, + { + "epoch": 0.68332755975652, + "grad_norm": 0.24738598633453043, + "learning_rate": 1.7589327744065507e-06, + "loss": 0.5869, + "step": 863 + }, + { + "epoch": 0.6841193645766318, + "grad_norm": 0.25688559624231405, + "learning_rate": 1.7583911764950039e-06, + "loss": 0.5937, + "step": 864 + }, + { + "epoch": 0.6849111693967437, + "grad_norm": 0.24885371925651226, + "learning_rate": 1.7578490544532283e-06, + "loss": 0.5856, + "step": 865 + }, + { + "epoch": 0.6857029742168556, + "grad_norm": 0.24690074827520492, + "learning_rate": 1.7573064086558884e-06, + "loss": 0.574, + "step": 866 + }, + { + "epoch": 0.6864947790369674, + "grad_norm": 0.24760162187909207, + "learning_rate": 1.7567632394780118e-06, + "loss": 0.5665, + "step": 867 + }, + { + "epoch": 0.6872865838570792, + "grad_norm": 0.24577712965038068, + "learning_rate": 1.7562195472949863e-06, + "loss": 0.584, + "step": 868 + }, + { + "epoch": 0.6880783886771911, + "grad_norm": 0.24707196829506817, + "learning_rate": 1.7556753324825624e-06, + "loss": 0.5728, + "step": 869 + }, + { + "epoch": 0.6888701934973029, + "grad_norm": 0.2427499693077296, + "learning_rate": 1.7551305954168512e-06, + "loss": 0.5907, + "step": 870 + }, + { + "epoch": 0.6896619983174147, + "grad_norm": 0.2557832991364167, + "learning_rate": 1.7545853364743248e-06, + "loss": 0.5817, + "step": 871 + }, + { + "epoch": 0.6904538031375266, + "grad_norm": 0.2480115469674397, + "learning_rate": 1.7540395560318162e-06, + "loss": 0.5878, + "step": 872 + }, + { + "epoch": 0.6912456079576385, + "grad_norm": 0.24717915980270053, + "learning_rate": 1.7534932544665183e-06, + "loss": 0.5734, + "step": 873 + }, + { + "epoch": 0.6920374127777503, + "grad_norm": 0.2524836513942343, + "learning_rate": 1.7529464321559848e-06, + "loss": 0.5948, + "step": 874 + }, + { + "epoch": 0.6928292175978621, + "grad_norm": 0.245275353092912, + "learning_rate": 1.752399089478129e-06, + "loss": 0.5744, + "step": 875 + }, + { + "epoch": 0.693621022417974, + "grad_norm": 0.24062703187758544, + "learning_rate": 1.7518512268112238e-06, + "loss": 0.5784, + "step": 876 + }, + { + "epoch": 0.6944128272380858, + "grad_norm": 0.24530600631442345, + "learning_rate": 1.7513028445339014e-06, + "loss": 0.5782, + "step": 877 + }, + { + "epoch": 0.6952046320581976, + "grad_norm": 0.24669451967407843, + "learning_rate": 1.750753943025153e-06, + "loss": 0.575, + "step": 878 + }, + { + "epoch": 0.6959964368783095, + "grad_norm": 0.2447861480839371, + "learning_rate": 1.750204522664329e-06, + "loss": 0.5886, + "step": 879 + }, + { + "epoch": 0.6967882416984214, + "grad_norm": 0.24324802163539397, + "learning_rate": 1.7496545838311386e-06, + "loss": 0.5978, + "step": 880 + }, + { + "epoch": 0.6975800465185332, + "grad_norm": 0.25376150676455245, + "learning_rate": 1.7491041269056485e-06, + "loss": 0.5718, + "step": 881 + }, + { + "epoch": 0.698371851338645, + "grad_norm": 0.24267518899970847, + "learning_rate": 1.7485531522682839e-06, + "loss": 0.5711, + "step": 882 + }, + { + "epoch": 0.6991636561587569, + "grad_norm": 0.24977132670485755, + "learning_rate": 1.748001660299828e-06, + "loss": 0.5941, + "step": 883 + }, + { + "epoch": 0.6999554609788687, + "grad_norm": 0.2459853658010188, + "learning_rate": 1.7474496513814213e-06, + "loss": 0.5925, + "step": 884 + }, + { + "epoch": 0.7007472657989805, + "grad_norm": 0.26409388113089094, + "learning_rate": 1.7468971258945615e-06, + "loss": 0.5922, + "step": 885 + }, + { + "epoch": 0.7015390706190924, + "grad_norm": 0.24702804605528103, + "learning_rate": 1.7463440842211035e-06, + "loss": 0.573, + "step": 886 + }, + { + "epoch": 0.7023308754392042, + "grad_norm": 0.2541245452020192, + "learning_rate": 1.7457905267432585e-06, + "loss": 0.5839, + "step": 887 + }, + { + "epoch": 0.7031226802593161, + "grad_norm": 0.25110485274127725, + "learning_rate": 1.7452364538435949e-06, + "loss": 0.5743, + "step": 888 + }, + { + "epoch": 0.7039144850794279, + "grad_norm": 0.25186565532483407, + "learning_rate": 1.7446818659050369e-06, + "loss": 0.5948, + "step": 889 + }, + { + "epoch": 0.7047062898995398, + "grad_norm": 0.2429131508592695, + "learning_rate": 1.7441267633108642e-06, + "loss": 0.5891, + "step": 890 + }, + { + "epoch": 0.7054980947196516, + "grad_norm": 0.23545565600107377, + "learning_rate": 1.7435711464447133e-06, + "loss": 0.5697, + "step": 891 + }, + { + "epoch": 0.7062898995397634, + "grad_norm": 0.24937194476011745, + "learning_rate": 1.7430150156905752e-06, + "loss": 0.5797, + "step": 892 + }, + { + "epoch": 0.7070817043598753, + "grad_norm": 0.244376238101983, + "learning_rate": 1.7424583714327965e-06, + "loss": 0.5884, + "step": 893 + }, + { + "epoch": 0.7078735091799871, + "grad_norm": 0.23916929044981503, + "learning_rate": 1.7419012140560782e-06, + "loss": 0.5729, + "step": 894 + }, + { + "epoch": 0.708665314000099, + "grad_norm": 0.25363642242342044, + "learning_rate": 1.7413435439454762e-06, + "loss": 0.5714, + "step": 895 + }, + { + "epoch": 0.7094571188202108, + "grad_norm": 0.24211294200683903, + "learning_rate": 1.7407853614864013e-06, + "loss": 0.5658, + "step": 896 + }, + { + "epoch": 0.7102489236403227, + "grad_norm": 0.24860427991387388, + "learning_rate": 1.7402266670646175e-06, + "loss": 0.5775, + "step": 897 + }, + { + "epoch": 0.7110407284604345, + "grad_norm": 0.2561766068193548, + "learning_rate": 1.7396674610662431e-06, + "loss": 0.5866, + "step": 898 + }, + { + "epoch": 0.7118325332805463, + "grad_norm": 0.25062849494456546, + "learning_rate": 1.7391077438777498e-06, + "loss": 0.5808, + "step": 899 + }, + { + "epoch": 0.7126243381006582, + "grad_norm": 0.25441600351502675, + "learning_rate": 1.7385475158859624e-06, + "loss": 0.6, + "step": 900 + }, + { + "epoch": 0.71341614292077, + "grad_norm": 0.2464011228232308, + "learning_rate": 1.7379867774780594e-06, + "loss": 0.6006, + "step": 901 + }, + { + "epoch": 0.7142079477408819, + "grad_norm": 0.2544724679474216, + "learning_rate": 1.7374255290415714e-06, + "loss": 0.5975, + "step": 902 + }, + { + "epoch": 0.7149997525609937, + "grad_norm": 0.2519462028169189, + "learning_rate": 1.7368637709643817e-06, + "loss": 0.5717, + "step": 903 + }, + { + "epoch": 0.7157915573811056, + "grad_norm": 0.24456334735486743, + "learning_rate": 1.7363015036347259e-06, + "loss": 0.5829, + "step": 904 + }, + { + "epoch": 0.7165833622012174, + "grad_norm": 0.249011649931778, + "learning_rate": 1.7357387274411912e-06, + "loss": 0.5876, + "step": 905 + }, + { + "epoch": 0.7173751670213292, + "grad_norm": 0.24076372543115893, + "learning_rate": 1.7351754427727174e-06, + "loss": 0.5847, + "step": 906 + }, + { + "epoch": 0.7181669718414411, + "grad_norm": 0.25032599232475183, + "learning_rate": 1.734611650018594e-06, + "loss": 0.5949, + "step": 907 + }, + { + "epoch": 0.718958776661553, + "grad_norm": 0.2485987378016031, + "learning_rate": 1.7340473495684637e-06, + "loss": 0.5679, + "step": 908 + }, + { + "epoch": 0.7197505814816648, + "grad_norm": 0.24943278117786208, + "learning_rate": 1.7334825418123187e-06, + "loss": 0.5966, + "step": 909 + }, + { + "epoch": 0.7205423863017766, + "grad_norm": 0.24813196439993077, + "learning_rate": 1.7329172271405019e-06, + "loss": 0.5852, + "step": 910 + }, + { + "epoch": 0.7213341911218885, + "grad_norm": 0.2535279623656571, + "learning_rate": 1.7323514059437075e-06, + "loss": 0.5879, + "step": 911 + }, + { + "epoch": 0.7221259959420003, + "grad_norm": 0.2520772188455546, + "learning_rate": 1.7317850786129782e-06, + "loss": 0.5838, + "step": 912 + }, + { + "epoch": 0.7229178007621121, + "grad_norm": 0.2525009987208763, + "learning_rate": 1.731218245539708e-06, + "loss": 0.5857, + "step": 913 + }, + { + "epoch": 0.723709605582224, + "grad_norm": 0.2433018153161353, + "learning_rate": 1.7306509071156394e-06, + "loss": 0.5914, + "step": 914 + }, + { + "epoch": 0.7245014104023358, + "grad_norm": 0.24811807449526793, + "learning_rate": 1.7300830637328648e-06, + "loss": 0.5832, + "step": 915 + }, + { + "epoch": 0.7252932152224477, + "grad_norm": 0.2469823253728138, + "learning_rate": 1.7295147157838253e-06, + "loss": 0.5834, + "step": 916 + }, + { + "epoch": 0.7260850200425595, + "grad_norm": 0.24970511504740275, + "learning_rate": 1.7289458636613105e-06, + "loss": 0.5753, + "step": 917 + }, + { + "epoch": 0.7268768248626714, + "grad_norm": 0.2430854348807761, + "learning_rate": 1.7283765077584592e-06, + "loss": 0.5761, + "step": 918 + }, + { + "epoch": 0.7276686296827832, + "grad_norm": 0.24594292469061335, + "learning_rate": 1.7278066484687571e-06, + "loss": 0.5891, + "step": 919 + }, + { + "epoch": 0.728460434502895, + "grad_norm": 0.23539605756545798, + "learning_rate": 1.7272362861860388e-06, + "loss": 0.5778, + "step": 920 + }, + { + "epoch": 0.7292522393230069, + "grad_norm": 0.25176104671877025, + "learning_rate": 1.7266654213044865e-06, + "loss": 0.5723, + "step": 921 + }, + { + "epoch": 0.7300440441431187, + "grad_norm": 0.2450854097972758, + "learning_rate": 1.7260940542186293e-06, + "loss": 0.5822, + "step": 922 + }, + { + "epoch": 0.7308358489632306, + "grad_norm": 0.25608007063845983, + "learning_rate": 1.7255221853233438e-06, + "loss": 0.5839, + "step": 923 + }, + { + "epoch": 0.7316276537833424, + "grad_norm": 0.24542022770248775, + "learning_rate": 1.7249498150138528e-06, + "loss": 0.5733, + "step": 924 + }, + { + "epoch": 0.7324194586034543, + "grad_norm": 0.24927466094881492, + "learning_rate": 1.7243769436857262e-06, + "loss": 0.5847, + "step": 925 + }, + { + "epoch": 0.7332112634235661, + "grad_norm": 0.26812137197973235, + "learning_rate": 1.7238035717348801e-06, + "loss": 0.582, + "step": 926 + }, + { + "epoch": 0.7340030682436779, + "grad_norm": 0.2463608580688781, + "learning_rate": 1.7232296995575765e-06, + "loss": 0.5867, + "step": 927 + }, + { + "epoch": 0.7347948730637898, + "grad_norm": 0.2688322608735481, + "learning_rate": 1.7226553275504229e-06, + "loss": 0.5806, + "step": 928 + }, + { + "epoch": 0.7355866778839016, + "grad_norm": 0.26945234351830955, + "learning_rate": 1.722080456110372e-06, + "loss": 0.5846, + "step": 929 + }, + { + "epoch": 0.7363784827040135, + "grad_norm": 0.25408035335687396, + "learning_rate": 1.721505085634723e-06, + "loss": 0.5755, + "step": 930 + }, + { + "epoch": 0.7371702875241253, + "grad_norm": 0.2465434921741765, + "learning_rate": 1.7209292165211182e-06, + "loss": 0.5798, + "step": 931 + }, + { + "epoch": 0.7379620923442372, + "grad_norm": 0.2540837663212656, + "learning_rate": 1.720352849167546e-06, + "loss": 0.5802, + "step": 932 + }, + { + "epoch": 0.738753897164349, + "grad_norm": 0.25238195730729085, + "learning_rate": 1.7197759839723375e-06, + "loss": 0.5718, + "step": 933 + }, + { + "epoch": 0.7395457019844608, + "grad_norm": 0.25437203628781707, + "learning_rate": 1.7191986213341703e-06, + "loss": 0.5668, + "step": 934 + }, + { + "epoch": 0.7403375068045727, + "grad_norm": 0.25335680248875186, + "learning_rate": 1.7186207616520635e-06, + "loss": 0.5697, + "step": 935 + }, + { + "epoch": 0.7411293116246845, + "grad_norm": 0.2470089828067232, + "learning_rate": 1.7180424053253808e-06, + "loss": 0.5788, + "step": 936 + }, + { + "epoch": 0.7419211164447963, + "grad_norm": 0.25981652910641084, + "learning_rate": 1.7174635527538284e-06, + "loss": 0.5666, + "step": 937 + }, + { + "epoch": 0.7427129212649082, + "grad_norm": 0.25609013882045345, + "learning_rate": 1.7168842043374566e-06, + "loss": 0.5809, + "step": 938 + }, + { + "epoch": 0.74350472608502, + "grad_norm": 0.25837506251027265, + "learning_rate": 1.7163043604766573e-06, + "loss": 0.5711, + "step": 939 + }, + { + "epoch": 0.7442965309051319, + "grad_norm": 0.2555723939908134, + "learning_rate": 1.715724021572166e-06, + "loss": 0.5863, + "step": 940 + }, + { + "epoch": 0.7450883357252437, + "grad_norm": 0.24707881931050837, + "learning_rate": 1.7151431880250585e-06, + "loss": 0.5863, + "step": 941 + }, + { + "epoch": 0.7458801405453556, + "grad_norm": 0.24351053097539035, + "learning_rate": 1.714561860236754e-06, + "loss": 0.599, + "step": 942 + }, + { + "epoch": 0.7466719453654674, + "grad_norm": 0.24475544145118233, + "learning_rate": 1.7139800386090131e-06, + "loss": 0.5885, + "step": 943 + }, + { + "epoch": 0.7474637501855792, + "grad_norm": 0.25716157533418266, + "learning_rate": 1.7133977235439371e-06, + "loss": 0.5594, + "step": 944 + }, + { + "epoch": 0.7482555550056911, + "grad_norm": 0.25286004003751, + "learning_rate": 1.7128149154439684e-06, + "loss": 0.5801, + "step": 945 + }, + { + "epoch": 0.749047359825803, + "grad_norm": 0.24534973996948886, + "learning_rate": 1.712231614711891e-06, + "loss": 0.5963, + "step": 946 + }, + { + "epoch": 0.7498391646459148, + "grad_norm": 0.2478474592213585, + "learning_rate": 1.711647821750828e-06, + "loss": 0.582, + "step": 947 + }, + { + "epoch": 0.7506309694660266, + "grad_norm": 0.2516958656147911, + "learning_rate": 1.711063536964244e-06, + "loss": 0.5758, + "step": 948 + }, + { + "epoch": 0.7514227742861385, + "grad_norm": 0.25361160405447486, + "learning_rate": 1.7104787607559424e-06, + "loss": 0.5843, + "step": 949 + }, + { + "epoch": 0.7522145791062503, + "grad_norm": 0.2536184915636103, + "learning_rate": 1.709893493530067e-06, + "loss": 0.5957, + "step": 950 + }, + { + "epoch": 0.7530063839263621, + "grad_norm": 0.2532291449126456, + "learning_rate": 1.7093077356911008e-06, + "loss": 0.5847, + "step": 951 + }, + { + "epoch": 0.753798188746474, + "grad_norm": 0.24017158006580785, + "learning_rate": 1.7087214876438653e-06, + "loss": 0.594, + "step": 952 + }, + { + "epoch": 0.7545899935665858, + "grad_norm": 0.24723979397219245, + "learning_rate": 1.7081347497935216e-06, + "loss": 0.5778, + "step": 953 + }, + { + "epoch": 0.7553817983866977, + "grad_norm": 0.2564595111532207, + "learning_rate": 1.7075475225455688e-06, + "loss": 0.5888, + "step": 954 + }, + { + "epoch": 0.7561736032068095, + "grad_norm": 0.24437381393671384, + "learning_rate": 1.7069598063058448e-06, + "loss": 0.5761, + "step": 955 + }, + { + "epoch": 0.7569654080269214, + "grad_norm": 0.2521756274505913, + "learning_rate": 1.7063716014805242e-06, + "loss": 0.5735, + "step": 956 + }, + { + "epoch": 0.7577572128470332, + "grad_norm": 0.24695700919885905, + "learning_rate": 1.7057829084761207e-06, + "loss": 0.6006, + "step": 957 + }, + { + "epoch": 0.758549017667145, + "grad_norm": 0.24688421725514956, + "learning_rate": 1.7051937276994845e-06, + "loss": 0.5892, + "step": 958 + }, + { + "epoch": 0.7593408224872569, + "grad_norm": 0.24928610762379938, + "learning_rate": 1.7046040595578033e-06, + "loss": 0.5894, + "step": 959 + }, + { + "epoch": 0.7601326273073687, + "grad_norm": 0.2521753453756982, + "learning_rate": 1.7040139044586016e-06, + "loss": 0.5855, + "step": 960 + }, + { + "epoch": 0.7609244321274806, + "grad_norm": 0.24392734940364796, + "learning_rate": 1.7034232628097395e-06, + "loss": 0.5789, + "step": 961 + }, + { + "epoch": 0.7617162369475924, + "grad_norm": 0.25157346435761563, + "learning_rate": 1.7028321350194155e-06, + "loss": 0.5783, + "step": 962 + }, + { + "epoch": 0.7625080417677043, + "grad_norm": 0.24407210040846353, + "learning_rate": 1.7022405214961617e-06, + "loss": 0.5908, + "step": 963 + }, + { + "epoch": 0.7632998465878161, + "grad_norm": 0.24231258628359728, + "learning_rate": 1.7016484226488474e-06, + "loss": 0.5752, + "step": 964 + }, + { + "epoch": 0.7640916514079279, + "grad_norm": 0.24266590901680207, + "learning_rate": 1.7010558388866766e-06, + "loss": 0.589, + "step": 965 + }, + { + "epoch": 0.7648834562280398, + "grad_norm": 0.24311944620382953, + "learning_rate": 1.7004627706191895e-06, + "loss": 0.6077, + "step": 966 + }, + { + "epoch": 0.7656752610481516, + "grad_norm": 0.24549077766115834, + "learning_rate": 1.6998692182562592e-06, + "loss": 0.5778, + "step": 967 + }, + { + "epoch": 0.7664670658682635, + "grad_norm": 0.24241932841306488, + "learning_rate": 1.6992751822080952e-06, + "loss": 0.5909, + "step": 968 + }, + { + "epoch": 0.7672588706883753, + "grad_norm": 0.25902837727029465, + "learning_rate": 1.6986806628852404e-06, + "loss": 0.5709, + "step": 969 + }, + { + "epoch": 0.7680506755084872, + "grad_norm": 0.2489981742220158, + "learning_rate": 1.6980856606985717e-06, + "loss": 0.6042, + "step": 970 + }, + { + "epoch": 0.768842480328599, + "grad_norm": 0.25941636084634623, + "learning_rate": 1.6974901760593006e-06, + "loss": 0.5758, + "step": 971 + }, + { + "epoch": 0.7696342851487108, + "grad_norm": 0.25092704109760794, + "learning_rate": 1.6968942093789704e-06, + "loss": 0.5987, + "step": 972 + }, + { + "epoch": 0.7704260899688227, + "grad_norm": 0.2604196556366055, + "learning_rate": 1.6962977610694588e-06, + "loss": 0.5834, + "step": 973 + }, + { + "epoch": 0.7712178947889345, + "grad_norm": 0.2583034640293634, + "learning_rate": 1.6957008315429761e-06, + "loss": 0.5732, + "step": 974 + }, + { + "epoch": 0.7720096996090464, + "grad_norm": 0.24564918682645784, + "learning_rate": 1.6951034212120652e-06, + "loss": 0.5776, + "step": 975 + }, + { + "epoch": 0.7728015044291582, + "grad_norm": 0.24530129884926857, + "learning_rate": 1.694505530489601e-06, + "loss": 0.581, + "step": 976 + }, + { + "epoch": 0.7735933092492701, + "grad_norm": 0.25839773288616386, + "learning_rate": 1.6939071597887903e-06, + "loss": 0.5649, + "step": 977 + }, + { + "epoch": 0.7743851140693819, + "grad_norm": 0.2633398810124407, + "learning_rate": 1.6933083095231726e-06, + "loss": 0.5903, + "step": 978 + }, + { + "epoch": 0.7751769188894937, + "grad_norm": 0.25183824076871514, + "learning_rate": 1.6927089801066176e-06, + "loss": 0.5742, + "step": 979 + }, + { + "epoch": 0.7759687237096056, + "grad_norm": 0.24269781478025124, + "learning_rate": 1.6921091719533267e-06, + "loss": 0.5772, + "step": 980 + }, + { + "epoch": 0.7767605285297174, + "grad_norm": 0.2418600370580417, + "learning_rate": 1.6915088854778324e-06, + "loss": 0.5715, + "step": 981 + }, + { + "epoch": 0.7775523333498293, + "grad_norm": 0.2572191934436083, + "learning_rate": 1.690908121094997e-06, + "loss": 0.575, + "step": 982 + }, + { + "epoch": 0.7783441381699411, + "grad_norm": 0.25724074356800636, + "learning_rate": 1.690306879220014e-06, + "loss": 0.5847, + "step": 983 + }, + { + "epoch": 0.779135942990053, + "grad_norm": 0.2459839841353052, + "learning_rate": 1.6897051602684067e-06, + "loss": 0.5895, + "step": 984 + }, + { + "epoch": 0.7799277478101648, + "grad_norm": 0.2523185194394151, + "learning_rate": 1.689102964656027e-06, + "loss": 0.5763, + "step": 985 + }, + { + "epoch": 0.7807195526302766, + "grad_norm": 0.2514690273057696, + "learning_rate": 1.688500292799058e-06, + "loss": 0.5891, + "step": 986 + }, + { + "epoch": 0.7815113574503885, + "grad_norm": 0.2554984712579457, + "learning_rate": 1.6878971451140112e-06, + "loss": 0.5871, + "step": 987 + }, + { + "epoch": 0.7823031622705003, + "grad_norm": 0.24460510413890155, + "learning_rate": 1.687293522017726e-06, + "loss": 0.5719, + "step": 988 + }, + { + "epoch": 0.7830949670906122, + "grad_norm": 0.25006028531547964, + "learning_rate": 1.6866894239273716e-06, + "loss": 0.5879, + "step": 989 + }, + { + "epoch": 0.783886771910724, + "grad_norm": 0.2720793329328961, + "learning_rate": 1.6860848512604449e-06, + "loss": 0.5805, + "step": 990 + }, + { + "epoch": 0.7846785767308359, + "grad_norm": 0.26031706750706746, + "learning_rate": 1.6854798044347708e-06, + "loss": 0.6049, + "step": 991 + }, + { + "epoch": 0.7854703815509477, + "grad_norm": 0.25280789896335876, + "learning_rate": 1.6848742838685026e-06, + "loss": 0.5813, + "step": 992 + }, + { + "epoch": 0.7862621863710595, + "grad_norm": 0.2686687521746721, + "learning_rate": 1.6842682899801198e-06, + "loss": 0.5984, + "step": 993 + }, + { + "epoch": 0.7870539911911714, + "grad_norm": 0.2696539128983892, + "learning_rate": 1.6836618231884299e-06, + "loss": 0.5746, + "step": 994 + }, + { + "epoch": 0.7878457960112832, + "grad_norm": 0.24873630460097296, + "learning_rate": 1.6830548839125672e-06, + "loss": 0.5792, + "step": 995 + }, + { + "epoch": 0.7886376008313951, + "grad_norm": 0.25588691360488625, + "learning_rate": 1.6824474725719916e-06, + "loss": 0.5781, + "step": 996 + }, + { + "epoch": 0.7894294056515069, + "grad_norm": 0.2593738561169873, + "learning_rate": 1.6818395895864908e-06, + "loss": 0.592, + "step": 997 + }, + { + "epoch": 0.7902212104716188, + "grad_norm": 0.2503435439903613, + "learning_rate": 1.681231235376177e-06, + "loss": 0.5796, + "step": 998 + }, + { + "epoch": 0.7910130152917306, + "grad_norm": 0.24341615985355178, + "learning_rate": 1.6806224103614887e-06, + "loss": 0.598, + "step": 999 + }, + { + "epoch": 0.7918048201118424, + "grad_norm": 0.23918323278794373, + "learning_rate": 1.6800131149631907e-06, + "loss": 0.5857, + "step": 1000 + }, + { + "epoch": 0.7925966249319543, + "grad_norm": 0.27123704462934584, + "learning_rate": 1.6794033496023704e-06, + "loss": 0.5856, + "step": 1001 + }, + { + "epoch": 0.7933884297520661, + "grad_norm": 0.24976229528373015, + "learning_rate": 1.6787931147004426e-06, + "loss": 0.5697, + "step": 1002 + }, + { + "epoch": 0.794180234572178, + "grad_norm": 0.25081951371265254, + "learning_rate": 1.6781824106791453e-06, + "loss": 0.6036, + "step": 1003 + }, + { + "epoch": 0.7949720393922898, + "grad_norm": 0.2543707728394622, + "learning_rate": 1.6775712379605409e-06, + "loss": 0.5757, + "step": 1004 + }, + { + "epoch": 0.7957638442124016, + "grad_norm": 0.2517746465053035, + "learning_rate": 1.6769595969670158e-06, + "loss": 0.5631, + "step": 1005 + }, + { + "epoch": 0.7965556490325135, + "grad_norm": 0.2505079887028291, + "learning_rate": 1.67634748812128e-06, + "loss": 0.5649, + "step": 1006 + }, + { + "epoch": 0.7973474538526253, + "grad_norm": 0.25100526219511066, + "learning_rate": 1.6757349118463666e-06, + "loss": 0.586, + "step": 1007 + }, + { + "epoch": 0.7981392586727372, + "grad_norm": 0.25008384189162813, + "learning_rate": 1.6751218685656325e-06, + "loss": 0.587, + "step": 1008 + }, + { + "epoch": 0.798931063492849, + "grad_norm": 0.2571108017946599, + "learning_rate": 1.6745083587027562e-06, + "loss": 0.5875, + "step": 1009 + }, + { + "epoch": 0.7997228683129609, + "grad_norm": 0.24575800730786365, + "learning_rate": 1.6738943826817397e-06, + "loss": 0.5772, + "step": 1010 + }, + { + "epoch": 0.8005146731330727, + "grad_norm": 0.25158102693738865, + "learning_rate": 1.6732799409269069e-06, + "loss": 0.5866, + "step": 1011 + }, + { + "epoch": 0.8013064779531845, + "grad_norm": 0.2583811964469298, + "learning_rate": 1.672665033862903e-06, + "loss": 0.5719, + "step": 1012 + }, + { + "epoch": 0.8020982827732964, + "grad_norm": 0.24370943351611393, + "learning_rate": 1.6720496619146955e-06, + "loss": 0.5778, + "step": 1013 + }, + { + "epoch": 0.8028900875934082, + "grad_norm": 0.25906184718775943, + "learning_rate": 1.6714338255075728e-06, + "loss": 0.5775, + "step": 1014 + }, + { + "epoch": 0.8036818924135201, + "grad_norm": 0.2511585348986055, + "learning_rate": 1.6708175250671444e-06, + "loss": 0.5716, + "step": 1015 + }, + { + "epoch": 0.8044736972336319, + "grad_norm": 0.24707065260669275, + "learning_rate": 1.6702007610193403e-06, + "loss": 0.5888, + "step": 1016 + }, + { + "epoch": 0.8052655020537437, + "grad_norm": 0.24987917740310078, + "learning_rate": 1.6695835337904117e-06, + "loss": 0.5717, + "step": 1017 + }, + { + "epoch": 0.8060573068738556, + "grad_norm": 0.25196440898953126, + "learning_rate": 1.6689658438069284e-06, + "loss": 0.5666, + "step": 1018 + }, + { + "epoch": 0.8068491116939674, + "grad_norm": 0.26111092994260254, + "learning_rate": 1.6683476914957812e-06, + "loss": 0.5692, + "step": 1019 + }, + { + "epoch": 0.8076409165140793, + "grad_norm": 0.24716955694239662, + "learning_rate": 1.6677290772841807e-06, + "loss": 0.5736, + "step": 1020 + }, + { + "epoch": 0.8084327213341911, + "grad_norm": 0.2519859731877666, + "learning_rate": 1.6671100015996556e-06, + "loss": 0.5752, + "step": 1021 + }, + { + "epoch": 0.809224526154303, + "grad_norm": 0.2403464065951902, + "learning_rate": 1.666490464870054e-06, + "loss": 0.5955, + "step": 1022 + }, + { + "epoch": 0.8100163309744148, + "grad_norm": 0.2567687570720557, + "learning_rate": 1.6658704675235434e-06, + "loss": 0.6026, + "step": 1023 + }, + { + "epoch": 0.8108081357945266, + "grad_norm": 0.23933427768870075, + "learning_rate": 1.665250009988608e-06, + "loss": 0.5652, + "step": 1024 + }, + { + "epoch": 0.8115999406146385, + "grad_norm": 0.2442575208595484, + "learning_rate": 1.6646290926940512e-06, + "loss": 0.5802, + "step": 1025 + }, + { + "epoch": 0.8123917454347503, + "grad_norm": 0.25096321452317977, + "learning_rate": 1.6640077160689945e-06, + "loss": 0.5776, + "step": 1026 + }, + { + "epoch": 0.8131835502548622, + "grad_norm": 0.2533874645488365, + "learning_rate": 1.6633858805428758e-06, + "loss": 0.5835, + "step": 1027 + }, + { + "epoch": 0.813975355074974, + "grad_norm": 0.2543786497379953, + "learning_rate": 1.6627635865454507e-06, + "loss": 0.5869, + "step": 1028 + }, + { + "epoch": 0.8147671598950859, + "grad_norm": 0.2546574175602153, + "learning_rate": 1.662140834506792e-06, + "loss": 0.5873, + "step": 1029 + }, + { + "epoch": 0.8155589647151977, + "grad_norm": 0.24039403366732648, + "learning_rate": 1.661517624857288e-06, + "loss": 0.5739, + "step": 1030 + }, + { + "epoch": 0.8163507695353095, + "grad_norm": 0.24657766248695567, + "learning_rate": 1.660893958027644e-06, + "loss": 0.5831, + "step": 1031 + }, + { + "epoch": 0.8171425743554214, + "grad_norm": 0.24389759783969212, + "learning_rate": 1.6602698344488816e-06, + "loss": 0.5872, + "step": 1032 + }, + { + "epoch": 0.8179343791755332, + "grad_norm": 0.2554998501325067, + "learning_rate": 1.6596452545523368e-06, + "loss": 0.568, + "step": 1033 + }, + { + "epoch": 0.8187261839956451, + "grad_norm": 0.24487900079848257, + "learning_rate": 1.6590202187696626e-06, + "loss": 0.5778, + "step": 1034 + }, + { + "epoch": 0.8195179888157569, + "grad_norm": 0.2409374318162725, + "learning_rate": 1.6583947275328262e-06, + "loss": 0.5726, + "step": 1035 + }, + { + "epoch": 0.8203097936358688, + "grad_norm": 0.2555336201677589, + "learning_rate": 1.6577687812741092e-06, + "loss": 0.5763, + "step": 1036 + }, + { + "epoch": 0.8211015984559806, + "grad_norm": 0.24788955965702197, + "learning_rate": 1.657142380426108e-06, + "loss": 0.5738, + "step": 1037 + }, + { + "epoch": 0.8218934032760924, + "grad_norm": 0.2434532019250245, + "learning_rate": 1.6565155254217335e-06, + "loss": 0.5849, + "step": 1038 + }, + { + "epoch": 0.8226852080962043, + "grad_norm": 0.25339375614835435, + "learning_rate": 1.6558882166942105e-06, + "loss": 0.6043, + "step": 1039 + }, + { + "epoch": 0.8234770129163161, + "grad_norm": 0.25543803777050145, + "learning_rate": 1.6552604546770766e-06, + "loss": 0.5861, + "step": 1040 + }, + { + "epoch": 0.824268817736428, + "grad_norm": 0.2651661036586476, + "learning_rate": 1.6546322398041832e-06, + "loss": 0.5689, + "step": 1041 + }, + { + "epoch": 0.8250606225565398, + "grad_norm": 0.24732454750675104, + "learning_rate": 1.654003572509695e-06, + "loss": 0.5632, + "step": 1042 + }, + { + "epoch": 0.8258524273766517, + "grad_norm": 0.2542908482879605, + "learning_rate": 1.653374453228089e-06, + "loss": 0.5957, + "step": 1043 + }, + { + "epoch": 0.8266442321967635, + "grad_norm": 0.25243605180228523, + "learning_rate": 1.6527448823941543e-06, + "loss": 0.5893, + "step": 1044 + }, + { + "epoch": 0.8274360370168753, + "grad_norm": 0.26166682310011685, + "learning_rate": 1.6521148604429925e-06, + "loss": 0.5986, + "step": 1045 + }, + { + "epoch": 0.8282278418369872, + "grad_norm": 0.24835269137254626, + "learning_rate": 1.6514843878100163e-06, + "loss": 0.5743, + "step": 1046 + }, + { + "epoch": 0.829019646657099, + "grad_norm": 0.25029496480012176, + "learning_rate": 1.6508534649309516e-06, + "loss": 0.5858, + "step": 1047 + }, + { + "epoch": 0.8298114514772109, + "grad_norm": 0.2484934813624596, + "learning_rate": 1.650222092241833e-06, + "loss": 0.5779, + "step": 1048 + }, + { + "epoch": 0.8306032562973227, + "grad_norm": 0.24747045389662214, + "learning_rate": 1.649590270179008e-06, + "loss": 0.5825, + "step": 1049 + }, + { + "epoch": 0.8313950611174346, + "grad_norm": 0.25678312654227714, + "learning_rate": 1.6489579991791337e-06, + "loss": 0.5779, + "step": 1050 + }, + { + "epoch": 0.8321868659375464, + "grad_norm": 0.25001373774852653, + "learning_rate": 1.6483252796791778e-06, + "loss": 0.5884, + "step": 1051 + }, + { + "epoch": 0.8329786707576582, + "grad_norm": 0.24649088580005798, + "learning_rate": 1.6476921121164176e-06, + "loss": 0.5987, + "step": 1052 + }, + { + "epoch": 0.8337704755777701, + "grad_norm": 0.2537535387527769, + "learning_rate": 1.6470584969284405e-06, + "loss": 0.5851, + "step": 1053 + }, + { + "epoch": 0.8345622803978819, + "grad_norm": 0.24145915608683344, + "learning_rate": 1.6464244345531423e-06, + "loss": 0.5722, + "step": 1054 + }, + { + "epoch": 0.8353540852179938, + "grad_norm": 0.24705155210352187, + "learning_rate": 1.64578992542873e-06, + "loss": 0.5831, + "step": 1055 + }, + { + "epoch": 0.8361458900381056, + "grad_norm": 0.24564210050744636, + "learning_rate": 1.6451549699937166e-06, + "loss": 0.5719, + "step": 1056 + }, + { + "epoch": 0.8369376948582175, + "grad_norm": 0.23904533252059798, + "learning_rate": 1.6445195686869259e-06, + "loss": 0.5941, + "step": 1057 + }, + { + "epoch": 0.8377294996783293, + "grad_norm": 0.2518695754783361, + "learning_rate": 1.6438837219474876e-06, + "loss": 0.5857, + "step": 1058 + }, + { + "epoch": 0.8385213044984411, + "grad_norm": 0.24304954610057472, + "learning_rate": 1.643247430214842e-06, + "loss": 0.5688, + "step": 1059 + }, + { + "epoch": 0.839313109318553, + "grad_norm": 0.2502788325538013, + "learning_rate": 1.6426106939287345e-06, + "loss": 0.5763, + "step": 1060 + }, + { + "epoch": 0.8401049141386648, + "grad_norm": 0.23595313937091714, + "learning_rate": 1.6419735135292188e-06, + "loss": 0.5737, + "step": 1061 + }, + { + "epoch": 0.8408967189587767, + "grad_norm": 0.23719473568301383, + "learning_rate": 1.6413358894566553e-06, + "loss": 0.5767, + "step": 1062 + }, + { + "epoch": 0.8416885237788885, + "grad_norm": 0.24272651721099175, + "learning_rate": 1.6406978221517116e-06, + "loss": 0.5761, + "step": 1063 + }, + { + "epoch": 0.8424803285990003, + "grad_norm": 0.2642905302556802, + "learning_rate": 1.6400593120553606e-06, + "loss": 0.5588, + "step": 1064 + }, + { + "epoch": 0.8432721334191122, + "grad_norm": 0.2488237526188394, + "learning_rate": 1.639420359608882e-06, + "loss": 0.566, + "step": 1065 + }, + { + "epoch": 0.844063938239224, + "grad_norm": 0.25131458781350147, + "learning_rate": 1.6387809652538608e-06, + "loss": 0.5748, + "step": 1066 + }, + { + "epoch": 0.8448557430593359, + "grad_norm": 0.24645204360337702, + "learning_rate": 1.6381411294321876e-06, + "loss": 0.5829, + "step": 1067 + }, + { + "epoch": 0.8456475478794477, + "grad_norm": 0.2517558246624942, + "learning_rate": 1.6375008525860584e-06, + "loss": 0.5697, + "step": 1068 + }, + { + "epoch": 0.8464393526995596, + "grad_norm": 0.24583594853286805, + "learning_rate": 1.6368601351579732e-06, + "loss": 0.5699, + "step": 1069 + }, + { + "epoch": 0.8472311575196714, + "grad_norm": 0.2501707400525852, + "learning_rate": 1.6362189775907373e-06, + "loss": 0.5828, + "step": 1070 + }, + { + "epoch": 0.8480229623397832, + "grad_norm": 0.255049508563797, + "learning_rate": 1.6355773803274597e-06, + "loss": 0.5893, + "step": 1071 + }, + { + "epoch": 0.8488147671598951, + "grad_norm": 0.2539879381739073, + "learning_rate": 1.6349353438115535e-06, + "loss": 0.5725, + "step": 1072 + }, + { + "epoch": 0.8496065719800069, + "grad_norm": 0.24145487424539905, + "learning_rate": 1.6342928684867351e-06, + "loss": 0.5838, + "step": 1073 + }, + { + "epoch": 0.8503983768001188, + "grad_norm": 0.2520150705783432, + "learning_rate": 1.6336499547970246e-06, + "loss": 0.5687, + "step": 1074 + }, + { + "epoch": 0.8511901816202306, + "grad_norm": 0.2475644312970042, + "learning_rate": 1.6330066031867445e-06, + "loss": 0.5794, + "step": 1075 + }, + { + "epoch": 0.8519819864403425, + "grad_norm": 0.24259026078041718, + "learning_rate": 1.6323628141005206e-06, + "loss": 0.5689, + "step": 1076 + }, + { + "epoch": 0.8527737912604543, + "grad_norm": 0.2539525583876839, + "learning_rate": 1.6317185879832805e-06, + "loss": 0.5739, + "step": 1077 + }, + { + "epoch": 0.8535655960805661, + "grad_norm": 0.24574991580450073, + "learning_rate": 1.631073925280254e-06, + "loss": 0.58, + "step": 1078 + }, + { + "epoch": 0.854357400900678, + "grad_norm": 0.26035074540088715, + "learning_rate": 1.6304288264369727e-06, + "loss": 0.5709, + "step": 1079 + }, + { + "epoch": 0.8551492057207898, + "grad_norm": 0.25118729860231354, + "learning_rate": 1.6297832918992698e-06, + "loss": 0.577, + "step": 1080 + }, + { + "epoch": 0.8559410105409017, + "grad_norm": 0.24088051605213703, + "learning_rate": 1.6291373221132793e-06, + "loss": 0.5702, + "step": 1081 + }, + { + "epoch": 0.8567328153610135, + "grad_norm": 0.2686187728277244, + "learning_rate": 1.628490917525436e-06, + "loss": 0.5916, + "step": 1082 + }, + { + "epoch": 0.8575246201811254, + "grad_norm": 0.23663725614238204, + "learning_rate": 1.6278440785824752e-06, + "loss": 0.572, + "step": 1083 + }, + { + "epoch": 0.8583164250012372, + "grad_norm": 0.25765616886797094, + "learning_rate": 1.6271968057314329e-06, + "loss": 0.5909, + "step": 1084 + }, + { + "epoch": 0.859108229821349, + "grad_norm": 0.24534663703048665, + "learning_rate": 1.6265490994196442e-06, + "loss": 0.5707, + "step": 1085 + }, + { + "epoch": 0.8599000346414609, + "grad_norm": 0.265161136643667, + "learning_rate": 1.6259009600947443e-06, + "loss": 0.572, + "step": 1086 + }, + { + "epoch": 0.8606918394615727, + "grad_norm": 0.24368310689432676, + "learning_rate": 1.625252388204667e-06, + "loss": 0.5872, + "step": 1087 + }, + { + "epoch": 0.8614836442816846, + "grad_norm": 0.2459058837532321, + "learning_rate": 1.6246033841976465e-06, + "loss": 0.5725, + "step": 1088 + }, + { + "epoch": 0.8622754491017964, + "grad_norm": 0.25441194174474324, + "learning_rate": 1.6239539485222139e-06, + "loss": 0.5888, + "step": 1089 + }, + { + "epoch": 0.8630672539219082, + "grad_norm": 0.24136222032791688, + "learning_rate": 1.6233040816271996e-06, + "loss": 0.5779, + "step": 1090 + }, + { + "epoch": 0.8638590587420201, + "grad_norm": 0.24333904193511258, + "learning_rate": 1.6226537839617321e-06, + "loss": 0.5834, + "step": 1091 + }, + { + "epoch": 0.8646508635621319, + "grad_norm": 0.24651480245565502, + "learning_rate": 1.6220030559752367e-06, + "loss": 0.5941, + "step": 1092 + }, + { + "epoch": 0.8654426683822438, + "grad_norm": 0.24766500609673905, + "learning_rate": 1.6213518981174374e-06, + "loss": 0.5679, + "step": 1093 + }, + { + "epoch": 0.8662344732023556, + "grad_norm": 0.25470747691542456, + "learning_rate": 1.6207003108383543e-06, + "loss": 0.5996, + "step": 1094 + }, + { + "epoch": 0.8670262780224675, + "grad_norm": 0.2460007287413227, + "learning_rate": 1.6200482945883046e-06, + "loss": 0.5808, + "step": 1095 + }, + { + "epoch": 0.8678180828425793, + "grad_norm": 0.25337461555203933, + "learning_rate": 1.6193958498179022e-06, + "loss": 0.5672, + "step": 1096 + }, + { + "epoch": 0.8686098876626911, + "grad_norm": 0.25552804611255225, + "learning_rate": 1.6187429769780568e-06, + "loss": 0.5723, + "step": 1097 + }, + { + "epoch": 0.869401692482803, + "grad_norm": 0.2420545397711388, + "learning_rate": 1.6180896765199737e-06, + "loss": 0.5778, + "step": 1098 + }, + { + "epoch": 0.8701934973029148, + "grad_norm": 0.25071874604846495, + "learning_rate": 1.6174359488951545e-06, + "loss": 0.5847, + "step": 1099 + }, + { + "epoch": 0.8709853021230267, + "grad_norm": 0.26163100705727776, + "learning_rate": 1.6167817945553956e-06, + "loss": 0.5891, + "step": 1100 + }, + { + "epoch": 0.8717771069431385, + "grad_norm": 0.2553614991918954, + "learning_rate": 1.6161272139527882e-06, + "loss": 0.5691, + "step": 1101 + }, + { + "epoch": 0.8725689117632504, + "grad_norm": 0.24966352488537705, + "learning_rate": 1.6154722075397183e-06, + "loss": 0.5848, + "step": 1102 + }, + { + "epoch": 0.8733607165833622, + "grad_norm": 0.24514512458122928, + "learning_rate": 1.6148167757688657e-06, + "loss": 0.577, + "step": 1103 + }, + { + "epoch": 0.874152521403474, + "grad_norm": 0.2637944713446023, + "learning_rate": 1.6141609190932051e-06, + "loss": 0.5941, + "step": 1104 + }, + { + "epoch": 0.8749443262235859, + "grad_norm": 0.2456325755441037, + "learning_rate": 1.6135046379660037e-06, + "loss": 0.588, + "step": 1105 + }, + { + "epoch": 0.8757361310436977, + "grad_norm": 0.23929723192140198, + "learning_rate": 1.6128479328408236e-06, + "loss": 0.5803, + "step": 1106 + }, + { + "epoch": 0.8765279358638096, + "grad_norm": 0.25769065276824593, + "learning_rate": 1.6121908041715181e-06, + "loss": 0.5828, + "step": 1107 + }, + { + "epoch": 0.8773197406839214, + "grad_norm": 0.26935441628478257, + "learning_rate": 1.6115332524122345e-06, + "loss": 0.5796, + "step": 1108 + }, + { + "epoch": 0.8781115455040333, + "grad_norm": 0.24643537762947543, + "learning_rate": 1.610875278017412e-06, + "loss": 0.5792, + "step": 1109 + }, + { + "epoch": 0.8789033503241451, + "grad_norm": 0.25353326502965123, + "learning_rate": 1.610216881441782e-06, + "loss": 0.5905, + "step": 1110 + }, + { + "epoch": 0.8796951551442569, + "grad_norm": 0.24281644334466226, + "learning_rate": 1.6095580631403676e-06, + "loss": 0.5937, + "step": 1111 + }, + { + "epoch": 0.8804869599643688, + "grad_norm": 0.25565001965436085, + "learning_rate": 1.6088988235684834e-06, + "loss": 0.5859, + "step": 1112 + }, + { + "epoch": 0.8812787647844806, + "grad_norm": 0.2541655484308428, + "learning_rate": 1.6082391631817354e-06, + "loss": 0.5829, + "step": 1113 + }, + { + "epoch": 0.8820705696045925, + "grad_norm": 0.24937428478641344, + "learning_rate": 1.60757908243602e-06, + "loss": 0.5668, + "step": 1114 + }, + { + "epoch": 0.8828623744247043, + "grad_norm": 0.24928470722270202, + "learning_rate": 1.6069185817875242e-06, + "loss": 0.5815, + "step": 1115 + }, + { + "epoch": 0.8836541792448162, + "grad_norm": 0.2560545020452616, + "learning_rate": 1.6062576616927256e-06, + "loss": 0.5887, + "step": 1116 + }, + { + "epoch": 0.884445984064928, + "grad_norm": 0.27127526912359223, + "learning_rate": 1.6055963226083909e-06, + "loss": 0.5969, + "step": 1117 + }, + { + "epoch": 0.8852377888850398, + "grad_norm": 0.2514824688319713, + "learning_rate": 1.6049345649915775e-06, + "loss": 0.5842, + "step": 1118 + }, + { + "epoch": 0.8860295937051517, + "grad_norm": 0.24375986127866897, + "learning_rate": 1.6042723892996309e-06, + "loss": 0.5777, + "step": 1119 + }, + { + "epoch": 0.8868213985252635, + "grad_norm": 0.2546590226484071, + "learning_rate": 1.6036097959901862e-06, + "loss": 0.5874, + "step": 1120 + }, + { + "epoch": 0.8876132033453754, + "grad_norm": 0.25490643966330057, + "learning_rate": 1.602946785521167e-06, + "loss": 0.5829, + "step": 1121 + }, + { + "epoch": 0.8884050081654872, + "grad_norm": 0.2490222536319817, + "learning_rate": 1.6022833583507848e-06, + "loss": 0.5798, + "step": 1122 + }, + { + "epoch": 0.889196812985599, + "grad_norm": 0.25536422958159516, + "learning_rate": 1.60161951493754e-06, + "loss": 0.5954, + "step": 1123 + }, + { + "epoch": 0.8899886178057109, + "grad_norm": 0.24968603426729527, + "learning_rate": 1.6009552557402198e-06, + "loss": 0.5752, + "step": 1124 + }, + { + "epoch": 0.8907804226258227, + "grad_norm": 0.24393303439491235, + "learning_rate": 1.6002905812178988e-06, + "loss": 0.5743, + "step": 1125 + }, + { + "epoch": 0.8915722274459346, + "grad_norm": 0.24528004931946223, + "learning_rate": 1.59962549182994e-06, + "loss": 0.5691, + "step": 1126 + }, + { + "epoch": 0.8923640322660464, + "grad_norm": 0.24646740204639261, + "learning_rate": 1.5989599880359906e-06, + "loss": 0.5688, + "step": 1127 + }, + { + "epoch": 0.8931558370861583, + "grad_norm": 0.2529708580577012, + "learning_rate": 1.5982940702959864e-06, + "loss": 0.5744, + "step": 1128 + }, + { + "epoch": 0.8939476419062701, + "grad_norm": 0.24649907291895468, + "learning_rate": 1.597627739070148e-06, + "loss": 0.5754, + "step": 1129 + }, + { + "epoch": 0.894739446726382, + "grad_norm": 0.25002828092540585, + "learning_rate": 1.596960994818983e-06, + "loss": 0.5862, + "step": 1130 + }, + { + "epoch": 0.8955312515464938, + "grad_norm": 0.24390990267195928, + "learning_rate": 1.5962938380032833e-06, + "loss": 0.5766, + "step": 1131 + }, + { + "epoch": 0.8963230563666056, + "grad_norm": 0.2736736092042765, + "learning_rate": 1.595626269084126e-06, + "loss": 0.598, + "step": 1132 + }, + { + "epoch": 0.8971148611867175, + "grad_norm": 0.25173453453576516, + "learning_rate": 1.5949582885228741e-06, + "loss": 0.5731, + "step": 1133 + }, + { + "epoch": 0.8979066660068293, + "grad_norm": 0.24310431339486674, + "learning_rate": 1.5942898967811735e-06, + "loss": 0.5721, + "step": 1134 + }, + { + "epoch": 0.8986984708269412, + "grad_norm": 0.2508954998227375, + "learning_rate": 1.5936210943209556e-06, + "loss": 0.5712, + "step": 1135 + }, + { + "epoch": 0.899490275647053, + "grad_norm": 0.25916566992709084, + "learning_rate": 1.592951881604435e-06, + "loss": 0.5665, + "step": 1136 + }, + { + "epoch": 0.9002820804671648, + "grad_norm": 0.2609148285652845, + "learning_rate": 1.5922822590941101e-06, + "loss": 0.5934, + "step": 1137 + }, + { + "epoch": 0.9010738852872767, + "grad_norm": 0.24396936865607036, + "learning_rate": 1.5916122272527622e-06, + "loss": 0.5819, + "step": 1138 + }, + { + "epoch": 0.9018656901073885, + "grad_norm": 0.24518618613401058, + "learning_rate": 1.590941786543456e-06, + "loss": 0.5727, + "step": 1139 + }, + { + "epoch": 0.9026574949275004, + "grad_norm": 0.270738638719993, + "learning_rate": 1.5902709374295378e-06, + "loss": 0.5752, + "step": 1140 + }, + { + "epoch": 0.9034492997476122, + "grad_norm": 0.2538745924611469, + "learning_rate": 1.5895996803746377e-06, + "loss": 0.5749, + "step": 1141 + }, + { + "epoch": 0.9042411045677241, + "grad_norm": 0.24372233219327333, + "learning_rate": 1.588928015842666e-06, + "loss": 0.568, + "step": 1142 + }, + { + "epoch": 0.9050329093878359, + "grad_norm": 0.25103700311305904, + "learning_rate": 1.5882559442978161e-06, + "loss": 0.5814, + "step": 1143 + }, + { + "epoch": 0.9058247142079477, + "grad_norm": 0.25214324010928046, + "learning_rate": 1.587583466204562e-06, + "loss": 0.5752, + "step": 1144 + }, + { + "epoch": 0.9066165190280596, + "grad_norm": 0.26722987577286583, + "learning_rate": 1.586910582027658e-06, + "loss": 0.5807, + "step": 1145 + }, + { + "epoch": 0.9074083238481714, + "grad_norm": 0.2428861218982861, + "learning_rate": 1.5862372922321403e-06, + "loss": 0.5805, + "step": 1146 + }, + { + "epoch": 0.9082001286682833, + "grad_norm": 0.2373379401322153, + "learning_rate": 1.585563597283325e-06, + "loss": 0.5759, + "step": 1147 + }, + { + "epoch": 0.9089919334883951, + "grad_norm": 0.23509882651000052, + "learning_rate": 1.5848894976468079e-06, + "loss": 0.5962, + "step": 1148 + }, + { + "epoch": 0.909783738308507, + "grad_norm": 0.2534606776590353, + "learning_rate": 1.5842149937884645e-06, + "loss": 0.5717, + "step": 1149 + }, + { + "epoch": 0.9105755431286188, + "grad_norm": 0.24492279368136252, + "learning_rate": 1.5835400861744504e-06, + "loss": 0.5799, + "step": 1150 + }, + { + "epoch": 0.9113673479487306, + "grad_norm": 0.23969145535922307, + "learning_rate": 1.582864775271199e-06, + "loss": 0.5955, + "step": 1151 + }, + { + "epoch": 0.9121591527688425, + "grad_norm": 0.24042106482444472, + "learning_rate": 1.5821890615454235e-06, + "loss": 0.5906, + "step": 1152 + }, + { + "epoch": 0.9129509575889543, + "grad_norm": 0.25129518800116574, + "learning_rate": 1.5815129454641144e-06, + "loss": 0.5743, + "step": 1153 + }, + { + "epoch": 0.9137427624090662, + "grad_norm": 0.2545510658300164, + "learning_rate": 1.580836427494542e-06, + "loss": 0.5925, + "step": 1154 + }, + { + "epoch": 0.914534567229178, + "grad_norm": 0.24624218829669162, + "learning_rate": 1.5801595081042524e-06, + "loss": 0.5945, + "step": 1155 + }, + { + "epoch": 0.9153263720492899, + "grad_norm": 0.24503633144908887, + "learning_rate": 1.579482187761071e-06, + "loss": 0.5595, + "step": 1156 + }, + { + "epoch": 0.9161181768694017, + "grad_norm": 0.23811483557631802, + "learning_rate": 1.5788044669330982e-06, + "loss": 0.5615, + "step": 1157 + }, + { + "epoch": 0.9169099816895135, + "grad_norm": 0.23977190697285644, + "learning_rate": 1.5781263460887134e-06, + "loss": 0.5685, + "step": 1158 + }, + { + "epoch": 0.9177017865096254, + "grad_norm": 0.24954718549245242, + "learning_rate": 1.57744782569657e-06, + "loss": 0.5949, + "step": 1159 + }, + { + "epoch": 0.9184935913297372, + "grad_norm": 0.2568842831723131, + "learning_rate": 1.5767689062256002e-06, + "loss": 0.5929, + "step": 1160 + }, + { + "epoch": 0.919285396149849, + "grad_norm": 0.2544427897096277, + "learning_rate": 1.57608958814501e-06, + "loss": 0.5801, + "step": 1161 + }, + { + "epoch": 0.9200772009699609, + "grad_norm": 0.24426385349699706, + "learning_rate": 1.575409871924282e-06, + "loss": 0.5755, + "step": 1162 + }, + { + "epoch": 0.9208690057900727, + "grad_norm": 0.2365754597960453, + "learning_rate": 1.574729758033173e-06, + "loss": 0.5632, + "step": 1163 + }, + { + "epoch": 0.9216608106101846, + "grad_norm": 0.2505054501781746, + "learning_rate": 1.5740492469417158e-06, + "loss": 0.5689, + "step": 1164 + }, + { + "epoch": 0.9224526154302964, + "grad_norm": 0.2479162053654987, + "learning_rate": 1.5733683391202167e-06, + "loss": 0.573, + "step": 1165 + }, + { + "epoch": 0.9232444202504083, + "grad_norm": 0.24524728490335904, + "learning_rate": 1.572687035039256e-06, + "loss": 0.5929, + "step": 1166 + }, + { + "epoch": 0.9240362250705201, + "grad_norm": 0.25587349357699457, + "learning_rate": 1.5720053351696896e-06, + "loss": 0.5815, + "step": 1167 + }, + { + "epoch": 0.924828029890632, + "grad_norm": 0.24760385439967084, + "learning_rate": 1.5713232399826453e-06, + "loss": 0.5889, + "step": 1168 + }, + { + "epoch": 0.9256198347107438, + "grad_norm": 0.24903741397827486, + "learning_rate": 1.5706407499495239e-06, + "loss": 0.567, + "step": 1169 + }, + { + "epoch": 0.9264116395308556, + "grad_norm": 0.26127318105688313, + "learning_rate": 1.5699578655420007e-06, + "loss": 0.5804, + "step": 1170 + }, + { + "epoch": 0.9272034443509675, + "grad_norm": 0.252238856846934, + "learning_rate": 1.5692745872320216e-06, + "loss": 0.5815, + "step": 1171 + }, + { + "epoch": 0.9279952491710793, + "grad_norm": 0.24192869754410945, + "learning_rate": 1.5685909154918067e-06, + "loss": 0.566, + "step": 1172 + }, + { + "epoch": 0.9287870539911912, + "grad_norm": 0.24442472614445832, + "learning_rate": 1.5679068507938466e-06, + "loss": 0.5688, + "step": 1173 + }, + { + "epoch": 0.929578858811303, + "grad_norm": 0.2521094423336885, + "learning_rate": 1.567222393610904e-06, + "loss": 0.5765, + "step": 1174 + }, + { + "epoch": 0.9303706636314149, + "grad_norm": 0.24684251870455634, + "learning_rate": 1.5665375444160126e-06, + "loss": 0.5873, + "step": 1175 + }, + { + "epoch": 0.9311624684515267, + "grad_norm": 0.24192921881193505, + "learning_rate": 1.565852303682477e-06, + "loss": 0.5773, + "step": 1176 + }, + { + "epoch": 0.9319542732716385, + "grad_norm": 0.24349858877928449, + "learning_rate": 1.565166671883873e-06, + "loss": 0.584, + "step": 1177 + }, + { + "epoch": 0.9327460780917504, + "grad_norm": 0.24894847440005802, + "learning_rate": 1.5644806494940462e-06, + "loss": 0.5741, + "step": 1178 + }, + { + "epoch": 0.9335378829118622, + "grad_norm": 0.23837509286108713, + "learning_rate": 1.5637942369871117e-06, + "loss": 0.5917, + "step": 1179 + }, + { + "epoch": 0.9343296877319741, + "grad_norm": 0.2568809865381604, + "learning_rate": 1.5631074348374553e-06, + "loss": 0.5794, + "step": 1180 + }, + { + "epoch": 0.9351214925520859, + "grad_norm": 0.24918522428776926, + "learning_rate": 1.562420243519731e-06, + "loss": 0.5838, + "step": 1181 + }, + { + "epoch": 0.9359132973721977, + "grad_norm": 0.24540553438114424, + "learning_rate": 1.5617326635088625e-06, + "loss": 0.5789, + "step": 1182 + }, + { + "epoch": 0.9367051021923096, + "grad_norm": 0.24774553423368362, + "learning_rate": 1.5610446952800422e-06, + "loss": 0.5848, + "step": 1183 + }, + { + "epoch": 0.9374969070124214, + "grad_norm": 0.2660905672231104, + "learning_rate": 1.5603563393087296e-06, + "loss": 0.5681, + "step": 1184 + }, + { + "epoch": 0.9382887118325333, + "grad_norm": 0.2427522788591016, + "learning_rate": 1.5596675960706534e-06, + "loss": 0.5858, + "step": 1185 + }, + { + "epoch": 0.9390805166526451, + "grad_norm": 0.24652400083416762, + "learning_rate": 1.5589784660418102e-06, + "loss": 0.5695, + "step": 1186 + }, + { + "epoch": 0.939872321472757, + "grad_norm": 0.24013995413355602, + "learning_rate": 1.5582889496984621e-06, + "loss": 0.5857, + "step": 1187 + }, + { + "epoch": 0.9406641262928688, + "grad_norm": 0.24921304508425615, + "learning_rate": 1.5575990475171405e-06, + "loss": 0.5793, + "step": 1188 + }, + { + "epoch": 0.9414559311129806, + "grad_norm": 0.24070654325224555, + "learning_rate": 1.556908759974642e-06, + "loss": 0.5845, + "step": 1189 + }, + { + "epoch": 0.9422477359330925, + "grad_norm": 0.25443445419963834, + "learning_rate": 1.5562180875480296e-06, + "loss": 0.5685, + "step": 1190 + }, + { + "epoch": 0.9430395407532043, + "grad_norm": 0.24770278707419008, + "learning_rate": 1.5555270307146326e-06, + "loss": 0.589, + "step": 1191 + }, + { + "epoch": 0.9438313455733162, + "grad_norm": 0.2525597333456365, + "learning_rate": 1.554835589952046e-06, + "loss": 0.5891, + "step": 1192 + }, + { + "epoch": 0.944623150393428, + "grad_norm": 0.24177694953763318, + "learning_rate": 1.5541437657381303e-06, + "loss": 0.5869, + "step": 1193 + }, + { + "epoch": 0.9454149552135399, + "grad_norm": 0.24453407397790725, + "learning_rate": 1.5534515585510105e-06, + "loss": 0.5855, + "step": 1194 + }, + { + "epoch": 0.9462067600336517, + "grad_norm": 0.24980528629373336, + "learning_rate": 1.5527589688690763e-06, + "loss": 0.5943, + "step": 1195 + }, + { + "epoch": 0.9469985648537635, + "grad_norm": 0.23728415171069342, + "learning_rate": 1.5520659971709829e-06, + "loss": 0.5894, + "step": 1196 + }, + { + "epoch": 0.9477903696738754, + "grad_norm": 0.24119160485242816, + "learning_rate": 1.5513726439356475e-06, + "loss": 0.5889, + "step": 1197 + }, + { + "epoch": 0.9485821744939872, + "grad_norm": 0.25828470554024263, + "learning_rate": 1.5506789096422526e-06, + "loss": 0.5784, + "step": 1198 + }, + { + "epoch": 0.9493739793140991, + "grad_norm": 0.2387583307346421, + "learning_rate": 1.5499847947702436e-06, + "loss": 0.5715, + "step": 1199 + }, + { + "epoch": 0.9501657841342109, + "grad_norm": 0.2577285083898989, + "learning_rate": 1.549290299799329e-06, + "loss": 0.5792, + "step": 1200 + }, + { + "epoch": 0.9509575889543228, + "grad_norm": 0.24901630093178415, + "learning_rate": 1.5485954252094792e-06, + "loss": 0.5697, + "step": 1201 + }, + { + "epoch": 0.9517493937744346, + "grad_norm": 0.25010870398297697, + "learning_rate": 1.5479001714809283e-06, + "loss": 0.5774, + "step": 1202 + }, + { + "epoch": 0.9525411985945464, + "grad_norm": 0.24680871093997186, + "learning_rate": 1.5472045390941713e-06, + "loss": 0.5932, + "step": 1203 + }, + { + "epoch": 0.9533330034146583, + "grad_norm": 0.24422855655402023, + "learning_rate": 1.5465085285299658e-06, + "loss": 0.5849, + "step": 1204 + }, + { + "epoch": 0.9541248082347701, + "grad_norm": 0.24598397229101523, + "learning_rate": 1.5458121402693297e-06, + "loss": 0.5598, + "step": 1205 + }, + { + "epoch": 0.954916613054882, + "grad_norm": 0.2599607802640301, + "learning_rate": 1.5451153747935427e-06, + "loss": 0.5818, + "step": 1206 + }, + { + "epoch": 0.9557084178749938, + "grad_norm": 0.27695634670719327, + "learning_rate": 1.5444182325841453e-06, + "loss": 0.5892, + "step": 1207 + }, + { + "epoch": 0.9565002226951057, + "grad_norm": 0.25211054985453246, + "learning_rate": 1.5437207141229378e-06, + "loss": 0.5919, + "step": 1208 + }, + { + "epoch": 0.9572920275152175, + "grad_norm": 0.2451060531835357, + "learning_rate": 1.543022819891981e-06, + "loss": 0.5928, + "step": 1209 + }, + { + "epoch": 0.9580838323353293, + "grad_norm": 0.24547033201409746, + "learning_rate": 1.5423245503735948e-06, + "loss": 0.5825, + "step": 1210 + }, + { + "epoch": 0.9588756371554412, + "grad_norm": 0.24596145085571985, + "learning_rate": 1.5416259060503594e-06, + "loss": 0.5941, + "step": 1211 + }, + { + "epoch": 0.959667441975553, + "grad_norm": 0.2597094087310838, + "learning_rate": 1.5409268874051134e-06, + "loss": 0.574, + "step": 1212 + }, + { + "epoch": 0.9604592467956649, + "grad_norm": 0.2514603318556578, + "learning_rate": 1.5402274949209538e-06, + "loss": 0.5627, + "step": 1213 + }, + { + "epoch": 0.9612510516157767, + "grad_norm": 0.2545090445589627, + "learning_rate": 1.5395277290812365e-06, + "loss": 0.5861, + "step": 1214 + }, + { + "epoch": 0.9620428564358886, + "grad_norm": 0.23975577653391356, + "learning_rate": 1.5388275903695758e-06, + "loss": 0.5882, + "step": 1215 + }, + { + "epoch": 0.9628346612560004, + "grad_norm": 0.24911852486492334, + "learning_rate": 1.5381270792698425e-06, + "loss": 0.5762, + "step": 1216 + }, + { + "epoch": 0.9636264660761122, + "grad_norm": 0.24117914928164164, + "learning_rate": 1.537426196266166e-06, + "loss": 0.5884, + "step": 1217 + }, + { + "epoch": 0.9644182708962241, + "grad_norm": 0.24333257333117847, + "learning_rate": 1.5367249418429316e-06, + "loss": 0.5795, + "step": 1218 + }, + { + "epoch": 0.9652100757163359, + "grad_norm": 0.25614483117781256, + "learning_rate": 1.536023316484782e-06, + "loss": 0.5468, + "step": 1219 + }, + { + "epoch": 0.9660018805364478, + "grad_norm": 0.24673039533023608, + "learning_rate": 1.5353213206766165e-06, + "loss": 0.5846, + "step": 1220 + }, + { + "epoch": 0.9667936853565596, + "grad_norm": 0.2583575821473352, + "learning_rate": 1.53461895490359e-06, + "loss": 0.5806, + "step": 1221 + }, + { + "epoch": 0.9675854901766715, + "grad_norm": 0.24876753094990142, + "learning_rate": 1.5339162196511126e-06, + "loss": 0.5815, + "step": 1222 + }, + { + "epoch": 0.9683772949967833, + "grad_norm": 0.2498128758337844, + "learning_rate": 1.5332131154048509e-06, + "loss": 0.5705, + "step": 1223 + }, + { + "epoch": 0.9691690998168951, + "grad_norm": 0.2459633172393119, + "learning_rate": 1.5325096426507252e-06, + "loss": 0.5677, + "step": 1224 + }, + { + "epoch": 0.969960904637007, + "grad_norm": 0.2509699664179796, + "learning_rate": 1.531805801874912e-06, + "loss": 0.5712, + "step": 1225 + }, + { + "epoch": 0.9707527094571188, + "grad_norm": 0.2586042819186474, + "learning_rate": 1.5311015935638408e-06, + "loss": 0.5677, + "step": 1226 + }, + { + "epoch": 0.9715445142772307, + "grad_norm": 0.24904022429580894, + "learning_rate": 1.5303970182041953e-06, + "loss": 0.5728, + "step": 1227 + }, + { + "epoch": 0.9723363190973425, + "grad_norm": 0.24197372346486745, + "learning_rate": 1.5296920762829139e-06, + "loss": 0.5804, + "step": 1228 + }, + { + "epoch": 0.9731281239174544, + "grad_norm": 0.24753782636573635, + "learning_rate": 1.5289867682871871e-06, + "loss": 0.5758, + "step": 1229 + }, + { + "epoch": 0.9739199287375662, + "grad_norm": 0.25113388858593033, + "learning_rate": 1.5282810947044594e-06, + "loss": 0.5649, + "step": 1230 + }, + { + "epoch": 0.974711733557678, + "grad_norm": 0.23987306987017853, + "learning_rate": 1.527575056022427e-06, + "loss": 0.5809, + "step": 1231 + }, + { + "epoch": 0.9755035383777899, + "grad_norm": 0.2525822493786336, + "learning_rate": 1.526868652729039e-06, + "loss": 0.5865, + "step": 1232 + }, + { + "epoch": 0.9762953431979017, + "grad_norm": 0.2486476903505399, + "learning_rate": 1.5261618853124967e-06, + "loss": 0.5996, + "step": 1233 + }, + { + "epoch": 0.9770871480180136, + "grad_norm": 0.25031091187174087, + "learning_rate": 1.525454754261252e-06, + "loss": 0.5946, + "step": 1234 + }, + { + "epoch": 0.9778789528381254, + "grad_norm": 0.24463685058744283, + "learning_rate": 1.52474726006401e-06, + "loss": 0.5817, + "step": 1235 + }, + { + "epoch": 0.9786707576582372, + "grad_norm": 0.24512456350133258, + "learning_rate": 1.5240394032097248e-06, + "loss": 0.5878, + "step": 1236 + }, + { + "epoch": 0.9794625624783491, + "grad_norm": 0.243488137783844, + "learning_rate": 1.5233311841876018e-06, + "loss": 0.5839, + "step": 1237 + }, + { + "epoch": 0.9802543672984609, + "grad_norm": 0.24976775565439793, + "learning_rate": 1.5226226034870973e-06, + "loss": 0.569, + "step": 1238 + }, + { + "epoch": 0.9810461721185728, + "grad_norm": 0.24828869737831172, + "learning_rate": 1.5219136615979168e-06, + "loss": 0.5788, + "step": 1239 + }, + { + "epoch": 0.9818379769386846, + "grad_norm": 0.248520412702361, + "learning_rate": 1.5212043590100163e-06, + "loss": 0.6001, + "step": 1240 + }, + { + "epoch": 0.9826297817587965, + "grad_norm": 0.25457793621661734, + "learning_rate": 1.5204946962136002e-06, + "loss": 0.5705, + "step": 1241 + }, + { + "epoch": 0.9834215865789083, + "grad_norm": 0.24422652650754226, + "learning_rate": 1.5197846736991218e-06, + "loss": 0.5827, + "step": 1242 + }, + { + "epoch": 0.9842133913990201, + "grad_norm": 0.2538600213012752, + "learning_rate": 1.5190742919572834e-06, + "loss": 0.5856, + "step": 1243 + }, + { + "epoch": 0.985005196219132, + "grad_norm": 0.2438152381132691, + "learning_rate": 1.5183635514790363e-06, + "loss": 0.5815, + "step": 1244 + }, + { + "epoch": 0.9857970010392438, + "grad_norm": 0.24446696920789407, + "learning_rate": 1.517652452755578e-06, + "loss": 0.5613, + "step": 1245 + }, + { + "epoch": 0.9865888058593557, + "grad_norm": 0.25778425733766247, + "learning_rate": 1.5169409962783552e-06, + "loss": 0.5796, + "step": 1246 + }, + { + "epoch": 0.9873806106794675, + "grad_norm": 0.25735735869613113, + "learning_rate": 1.5162291825390608e-06, + "loss": 0.5792, + "step": 1247 + }, + { + "epoch": 0.9881724154995793, + "grad_norm": 0.25348670485786373, + "learning_rate": 1.5155170120296347e-06, + "loss": 0.5825, + "step": 1248 + }, + { + "epoch": 0.9889642203196912, + "grad_norm": 0.24604259316927563, + "learning_rate": 1.5148044852422647e-06, + "loss": 0.5826, + "step": 1249 + }, + { + "epoch": 0.989756025139803, + "grad_norm": 0.23488252274168747, + "learning_rate": 1.5140916026693829e-06, + "loss": 0.5632, + "step": 1250 + }, + { + "epoch": 0.9905478299599149, + "grad_norm": 0.24395727151980548, + "learning_rate": 1.5133783648036685e-06, + "loss": 0.5911, + "step": 1251 + }, + { + "epoch": 0.9913396347800267, + "grad_norm": 0.2773273461281478, + "learning_rate": 1.5126647721380455e-06, + "loss": 0.5774, + "step": 1252 + }, + { + "epoch": 0.9921314396001386, + "grad_norm": 0.24715168847541846, + "learning_rate": 1.5119508251656842e-06, + "loss": 0.5644, + "step": 1253 + }, + { + "epoch": 0.9929232444202504, + "grad_norm": 0.2445962547720453, + "learning_rate": 1.5112365243799986e-06, + "loss": 0.5777, + "step": 1254 + }, + { + "epoch": 0.9937150492403622, + "grad_norm": 0.2428786228336218, + "learning_rate": 1.5105218702746478e-06, + "loss": 0.5826, + "step": 1255 + }, + { + "epoch": 0.9945068540604741, + "grad_norm": 0.2515089428556661, + "learning_rate": 1.5098068633435351e-06, + "loss": 0.568, + "step": 1256 + }, + { + "epoch": 0.9952986588805859, + "grad_norm": 0.25692707472958654, + "learning_rate": 1.5090915040808073e-06, + "loss": 0.5793, + "step": 1257 + }, + { + "epoch": 0.9960904637006978, + "grad_norm": 0.25172227371334727, + "learning_rate": 1.5083757929808551e-06, + "loss": 0.5879, + "step": 1258 + }, + { + "epoch": 0.9968822685208096, + "grad_norm": 0.24433804978221943, + "learning_rate": 1.5076597305383125e-06, + "loss": 0.5704, + "step": 1259 + }, + { + "epoch": 0.9976740733409215, + "grad_norm": 0.2538319779590854, + "learning_rate": 1.5069433172480555e-06, + "loss": 0.5904, + "step": 1260 + }, + { + "epoch": 0.9984658781610333, + "grad_norm": 0.2416778852725012, + "learning_rate": 1.5062265536052029e-06, + "loss": 0.5644, + "step": 1261 + }, + { + "epoch": 0.9992576829811451, + "grad_norm": 0.2460468272195175, + "learning_rate": 1.5055094401051163e-06, + "loss": 0.5752, + "step": 1262 + }, + { + "epoch": 1.000049487801257, + "grad_norm": 0.2443164680146125, + "learning_rate": 1.5047919772433985e-06, + "loss": 0.5692, + "step": 1263 + }, + { + "epoch": 1.0008412926213688, + "grad_norm": 0.2493012691970272, + "learning_rate": 1.5040741655158942e-06, + "loss": 0.5739, + "step": 1264 + }, + { + "epoch": 1.0016330974414807, + "grad_norm": 0.25776351156930466, + "learning_rate": 1.5033560054186878e-06, + "loss": 0.5812, + "step": 1265 + }, + { + "epoch": 1.0016330974414807, + "eval_loss": 0.6005386114120483, + "eval_runtime": 2126.3335, + "eval_samples_per_second": 12.825, + "eval_steps_per_second": 1.603, + "step": 1265 + }, + { + "epoch": 1.000346294647274, + "grad_norm": 0.24794191536714286, + "learning_rate": 1.5026374974481063e-06, + "loss": 0.5743, + "step": 1266 + }, + { + "epoch": 1.001137825269615, + "grad_norm": 0.2535145399680773, + "learning_rate": 1.5019186421007165e-06, + "loss": 0.579, + "step": 1267 + }, + { + "epoch": 1.001929355891956, + "grad_norm": 0.24982740232306555, + "learning_rate": 1.5011994398733245e-06, + "loss": 0.5713, + "step": 1268 + }, + { + "epoch": 1.002720886514297, + "grad_norm": 0.2395927596073369, + "learning_rate": 1.5004798912629767e-06, + "loss": 0.5617, + "step": 1269 + }, + { + "epoch": 1.003512417136638, + "grad_norm": 0.26152042185423313, + "learning_rate": 1.4997599967669597e-06, + "loss": 0.5728, + "step": 1270 + }, + { + "epoch": 1.004303947758979, + "grad_norm": 0.2710493799893223, + "learning_rate": 1.4990397568827974e-06, + "loss": 0.5673, + "step": 1271 + }, + { + "epoch": 1.00509547838132, + "grad_norm": 0.2597138637459402, + "learning_rate": 1.4983191721082544e-06, + "loss": 0.5675, + "step": 1272 + }, + { + "epoch": 1.005887009003661, + "grad_norm": 0.2546589756506511, + "learning_rate": 1.497598242941332e-06, + "loss": 0.5655, + "step": 1273 + }, + { + "epoch": 1.0066785396260018, + "grad_norm": 0.2510955685288097, + "learning_rate": 1.4968769698802697e-06, + "loss": 0.5755, + "step": 1274 + }, + { + "epoch": 1.0074700702483428, + "grad_norm": 0.24678269709753914, + "learning_rate": 1.4961553534235458e-06, + "loss": 0.5598, + "step": 1275 + }, + { + "epoch": 1.0082616008706837, + "grad_norm": 0.25559076819263843, + "learning_rate": 1.495433394069875e-06, + "loss": 0.5668, + "step": 1276 + }, + { + "epoch": 1.0090531314930247, + "grad_norm": 0.26026624192884135, + "learning_rate": 1.4947110923182096e-06, + "loss": 0.5542, + "step": 1277 + }, + { + "epoch": 1.0098446621153656, + "grad_norm": 0.24564726543394408, + "learning_rate": 1.4939884486677375e-06, + "loss": 0.5702, + "step": 1278 + }, + { + "epoch": 1.0106361927377066, + "grad_norm": 0.25567765139893456, + "learning_rate": 1.4932654636178836e-06, + "loss": 0.5673, + "step": 1279 + }, + { + "epoch": 1.0114277233600475, + "grad_norm": 0.25527953402860915, + "learning_rate": 1.4925421376683092e-06, + "loss": 0.5682, + "step": 1280 + }, + { + "epoch": 1.0122192539823884, + "grad_norm": 0.2498700299454256, + "learning_rate": 1.49181847131891e-06, + "loss": 0.5644, + "step": 1281 + }, + { + "epoch": 1.0130107846047294, + "grad_norm": 0.256882496818525, + "learning_rate": 1.4910944650698183e-06, + "loss": 0.5819, + "step": 1282 + }, + { + "epoch": 1.0138023152270703, + "grad_norm": 0.25510467285097205, + "learning_rate": 1.4903701194214002e-06, + "loss": 0.5472, + "step": 1283 + }, + { + "epoch": 1.0145938458494113, + "grad_norm": 0.26764921372549993, + "learning_rate": 1.4896454348742565e-06, + "loss": 0.5611, + "step": 1284 + }, + { + "epoch": 1.0153853764717522, + "grad_norm": 0.25076367991337567, + "learning_rate": 1.4889204119292234e-06, + "loss": 0.5452, + "step": 1285 + }, + { + "epoch": 1.0161769070940931, + "grad_norm": 0.24422920897401315, + "learning_rate": 1.488195051087369e-06, + "loss": 0.5489, + "step": 1286 + }, + { + "epoch": 1.016968437716434, + "grad_norm": 0.24688556667418038, + "learning_rate": 1.487469352849997e-06, + "loss": 0.5602, + "step": 1287 + }, + { + "epoch": 1.017759968338775, + "grad_norm": 0.2452056807184004, + "learning_rate": 1.486743317718643e-06, + "loss": 0.5603, + "step": 1288 + }, + { + "epoch": 1.018551498961116, + "grad_norm": 0.2591709342340121, + "learning_rate": 1.486016946195075e-06, + "loss": 0.5654, + "step": 1289 + }, + { + "epoch": 1.019343029583457, + "grad_norm": 0.25914934971045006, + "learning_rate": 1.4852902387812949e-06, + "loss": 0.5519, + "step": 1290 + }, + { + "epoch": 1.020134560205798, + "grad_norm": 0.25066275706999996, + "learning_rate": 1.484563195979536e-06, + "loss": 0.5712, + "step": 1291 + }, + { + "epoch": 1.020926090828139, + "grad_norm": 0.23897396839102916, + "learning_rate": 1.4838358182922632e-06, + "loss": 0.5553, + "step": 1292 + }, + { + "epoch": 1.02171762145048, + "grad_norm": 0.24223046692686187, + "learning_rate": 1.4831081062221734e-06, + "loss": 0.5585, + "step": 1293 + }, + { + "epoch": 1.022509152072821, + "grad_norm": 0.25774822941778874, + "learning_rate": 1.4823800602721935e-06, + "loss": 0.5618, + "step": 1294 + }, + { + "epoch": 1.0233006826951618, + "grad_norm": 0.24912409745324013, + "learning_rate": 1.4816516809454825e-06, + "loss": 0.5708, + "step": 1295 + }, + { + "epoch": 1.0240922133175028, + "grad_norm": 0.25126066328858004, + "learning_rate": 1.4809229687454292e-06, + "loss": 0.5619, + "step": 1296 + }, + { + "epoch": 1.0248837439398437, + "grad_norm": 0.25175714301525737, + "learning_rate": 1.4801939241756522e-06, + "loss": 0.5439, + "step": 1297 + }, + { + "epoch": 1.0256752745621847, + "grad_norm": 0.2534938847816727, + "learning_rate": 1.4794645477400002e-06, + "loss": 0.5757, + "step": 1298 + }, + { + "epoch": 1.0264668051845256, + "grad_norm": 0.2654739919991637, + "learning_rate": 1.4787348399425511e-06, + "loss": 0.55, + "step": 1299 + }, + { + "epoch": 1.0272583358068665, + "grad_norm": 0.24991454038211489, + "learning_rate": 1.4780048012876118e-06, + "loss": 0.5585, + "step": 1300 + }, + { + "epoch": 1.0280498664292075, + "grad_norm": 0.24786821821932004, + "learning_rate": 1.4772744322797185e-06, + "loss": 0.5632, + "step": 1301 + }, + { + "epoch": 1.0288413970515484, + "grad_norm": 0.2536266778374565, + "learning_rate": 1.4765437334236337e-06, + "loss": 0.5616, + "step": 1302 + }, + { + "epoch": 1.0296329276738894, + "grad_norm": 0.269318489855077, + "learning_rate": 1.4758127052243502e-06, + "loss": 0.5687, + "step": 1303 + }, + { + "epoch": 1.0304244582962303, + "grad_norm": 0.25582220993180177, + "learning_rate": 1.475081348187087e-06, + "loss": 0.5773, + "step": 1304 + }, + { + "epoch": 1.0312159889185712, + "grad_norm": 0.24828004800561954, + "learning_rate": 1.4743496628172915e-06, + "loss": 0.5585, + "step": 1305 + }, + { + "epoch": 1.0320075195409122, + "grad_norm": 0.24955982603745422, + "learning_rate": 1.4736176496206368e-06, + "loss": 0.5446, + "step": 1306 + }, + { + "epoch": 1.0327990501632531, + "grad_norm": 0.2467057872121939, + "learning_rate": 1.472885309103023e-06, + "loss": 0.5578, + "step": 1307 + }, + { + "epoch": 1.033590580785594, + "grad_norm": 0.24803505195333375, + "learning_rate": 1.472152641770577e-06, + "loss": 0.561, + "step": 1308 + }, + { + "epoch": 1.034382111407935, + "grad_norm": 0.244150143950206, + "learning_rate": 1.4714196481296508e-06, + "loss": 0.5523, + "step": 1309 + }, + { + "epoch": 1.035173642030276, + "grad_norm": 0.24392729840689864, + "learning_rate": 1.470686328686822e-06, + "loss": 0.5553, + "step": 1310 + }, + { + "epoch": 1.035965172652617, + "grad_norm": 0.2398461843564468, + "learning_rate": 1.4699526839488936e-06, + "loss": 0.5692, + "step": 1311 + }, + { + "epoch": 1.0367567032749578, + "grad_norm": 0.24964130220548783, + "learning_rate": 1.4692187144228939e-06, + "loss": 0.581, + "step": 1312 + }, + { + "epoch": 1.037548233897299, + "grad_norm": 0.25270711459088857, + "learning_rate": 1.4684844206160744e-06, + "loss": 0.5731, + "step": 1313 + }, + { + "epoch": 1.03833976451964, + "grad_norm": 0.25311734915864864, + "learning_rate": 1.467749803035912e-06, + "loss": 0.5559, + "step": 1314 + }, + { + "epoch": 1.0391312951419809, + "grad_norm": 0.24895949119922856, + "learning_rate": 1.4670148621901064e-06, + "loss": 0.5646, + "step": 1315 + }, + { + "epoch": 1.0399228257643218, + "grad_norm": 0.24692807710604875, + "learning_rate": 1.4662795985865813e-06, + "loss": 0.5633, + "step": 1316 + }, + { + "epoch": 1.0407143563866628, + "grad_norm": 0.24666659458126228, + "learning_rate": 1.4655440127334835e-06, + "loss": 0.5561, + "step": 1317 + }, + { + "epoch": 1.0415058870090037, + "grad_norm": 0.24781497169803066, + "learning_rate": 1.4648081051391817e-06, + "loss": 0.5509, + "step": 1318 + }, + { + "epoch": 1.0422974176313446, + "grad_norm": 0.24987595540755106, + "learning_rate": 1.4640718763122682e-06, + "loss": 0.56, + "step": 1319 + }, + { + "epoch": 1.0430889482536856, + "grad_norm": 0.24412348946717424, + "learning_rate": 1.4633353267615562e-06, + "loss": 0.5465, + "step": 1320 + }, + { + "epoch": 1.0438804788760265, + "grad_norm": 0.25007361354164936, + "learning_rate": 1.4625984569960812e-06, + "loss": 0.5588, + "step": 1321 + }, + { + "epoch": 1.0446720094983675, + "grad_norm": 0.2603620223234983, + "learning_rate": 1.4618612675250996e-06, + "loss": 0.5546, + "step": 1322 + }, + { + "epoch": 1.0454635401207084, + "grad_norm": 0.24236235522821056, + "learning_rate": 1.4611237588580893e-06, + "loss": 0.5666, + "step": 1323 + }, + { + "epoch": 1.0462550707430494, + "grad_norm": 0.24539893131607105, + "learning_rate": 1.4603859315047483e-06, + "loss": 0.5574, + "step": 1324 + }, + { + "epoch": 1.0470466013653903, + "grad_norm": 0.25026973097816974, + "learning_rate": 1.4596477859749953e-06, + "loss": 0.5619, + "step": 1325 + }, + { + "epoch": 1.0478381319877312, + "grad_norm": 0.2512554046209749, + "learning_rate": 1.4589093227789678e-06, + "loss": 0.5648, + "step": 1326 + }, + { + "epoch": 1.0486296626100722, + "grad_norm": 0.2502456623416568, + "learning_rate": 1.4581705424270244e-06, + "loss": 0.5556, + "step": 1327 + }, + { + "epoch": 1.0494211932324131, + "grad_norm": 0.25955130771015766, + "learning_rate": 1.4574314454297416e-06, + "loss": 0.5632, + "step": 1328 + }, + { + "epoch": 1.050212723854754, + "grad_norm": 0.24964711378746987, + "learning_rate": 1.4566920322979157e-06, + "loss": 0.5525, + "step": 1329 + }, + { + "epoch": 1.051004254477095, + "grad_norm": 0.2465198285458817, + "learning_rate": 1.4559523035425612e-06, + "loss": 0.5706, + "step": 1330 + }, + { + "epoch": 1.051795785099436, + "grad_norm": 0.25385504922573, + "learning_rate": 1.4552122596749097e-06, + "loss": 0.5634, + "step": 1331 + }, + { + "epoch": 1.0525873157217769, + "grad_norm": 0.2458409041004127, + "learning_rate": 1.4544719012064122e-06, + "loss": 0.5642, + "step": 1332 + }, + { + "epoch": 1.0533788463441178, + "grad_norm": 0.2560080115195914, + "learning_rate": 1.4537312286487361e-06, + "loss": 0.5701, + "step": 1333 + }, + { + "epoch": 1.054170376966459, + "grad_norm": 0.2478834879649551, + "learning_rate": 1.4529902425137666e-06, + "loss": 0.5704, + "step": 1334 + }, + { + "epoch": 1.0549619075888, + "grad_norm": 0.2517473335379542, + "learning_rate": 1.4522489433136049e-06, + "loss": 0.5647, + "step": 1335 + }, + { + "epoch": 1.0557534382111409, + "grad_norm": 0.26244292716307693, + "learning_rate": 1.4515073315605681e-06, + "loss": 0.5702, + "step": 1336 + }, + { + "epoch": 1.0565449688334818, + "grad_norm": 0.24955021623416915, + "learning_rate": 1.4507654077671911e-06, + "loss": 0.576, + "step": 1337 + }, + { + "epoch": 1.0573364994558228, + "grad_norm": 0.2434535878102341, + "learning_rate": 1.4500231724462232e-06, + "loss": 0.5675, + "step": 1338 + }, + { + "epoch": 1.0581280300781637, + "grad_norm": 0.24865477438057995, + "learning_rate": 1.4492806261106285e-06, + "loss": 0.5631, + "step": 1339 + }, + { + "epoch": 1.0589195607005046, + "grad_norm": 0.2516834885516957, + "learning_rate": 1.4485377692735875e-06, + "loss": 0.5415, + "step": 1340 + }, + { + "epoch": 1.0597110913228456, + "grad_norm": 0.24427606981869535, + "learning_rate": 1.447794602448494e-06, + "loss": 0.5651, + "step": 1341 + }, + { + "epoch": 1.0605026219451865, + "grad_norm": 0.24943473935695282, + "learning_rate": 1.4470511261489567e-06, + "loss": 0.5621, + "step": 1342 + }, + { + "epoch": 1.0612941525675275, + "grad_norm": 0.2507908265549793, + "learning_rate": 1.4463073408887982e-06, + "loss": 0.5531, + "step": 1343 + }, + { + "epoch": 1.0620856831898684, + "grad_norm": 0.23597709877327022, + "learning_rate": 1.4455632471820544e-06, + "loss": 0.5548, + "step": 1344 + }, + { + "epoch": 1.0628772138122093, + "grad_norm": 0.24294964099822539, + "learning_rate": 1.4448188455429744e-06, + "loss": 0.5611, + "step": 1345 + }, + { + "epoch": 1.0636687444345503, + "grad_norm": 0.24044756054247945, + "learning_rate": 1.4440741364860204e-06, + "loss": 0.5705, + "step": 1346 + }, + { + "epoch": 1.0644602750568912, + "grad_norm": 0.2488697572231574, + "learning_rate": 1.443329120525866e-06, + "loss": 0.5679, + "step": 1347 + }, + { + "epoch": 1.0652518056792322, + "grad_norm": 0.2475552616581592, + "learning_rate": 1.4425837981773988e-06, + "loss": 0.5673, + "step": 1348 + }, + { + "epoch": 1.066043336301573, + "grad_norm": 0.24975796330795333, + "learning_rate": 1.4418381699557161e-06, + "loss": 0.5775, + "step": 1349 + }, + { + "epoch": 1.066834866923914, + "grad_norm": 0.24276956911990535, + "learning_rate": 1.441092236376128e-06, + "loss": 0.5477, + "step": 1350 + }, + { + "epoch": 1.067626397546255, + "grad_norm": 0.2554956875493257, + "learning_rate": 1.4403459979541552e-06, + "loss": 0.5527, + "step": 1351 + }, + { + "epoch": 1.068417928168596, + "grad_norm": 0.25159070985489457, + "learning_rate": 1.4395994552055291e-06, + "loss": 0.5715, + "step": 1352 + }, + { + "epoch": 1.0692094587909369, + "grad_norm": 0.2494368117656817, + "learning_rate": 1.4388526086461912e-06, + "loss": 0.5755, + "step": 1353 + }, + { + "epoch": 1.0700009894132778, + "grad_norm": 0.26034209301072064, + "learning_rate": 1.4381054587922934e-06, + "loss": 0.5603, + "step": 1354 + }, + { + "epoch": 1.070792520035619, + "grad_norm": 0.26023920582587345, + "learning_rate": 1.4373580061601967e-06, + "loss": 0.5651, + "step": 1355 + }, + { + "epoch": 1.07158405065796, + "grad_norm": 0.24420554855163062, + "learning_rate": 1.4366102512664717e-06, + "loss": 0.5681, + "step": 1356 + }, + { + "epoch": 1.0723755812803009, + "grad_norm": 0.2415399016964875, + "learning_rate": 1.4358621946278979e-06, + "loss": 0.5558, + "step": 1357 + }, + { + "epoch": 1.0731671119026418, + "grad_norm": 0.25368243576055927, + "learning_rate": 1.435113836761463e-06, + "loss": 0.5599, + "step": 1358 + }, + { + "epoch": 1.0739586425249827, + "grad_norm": 0.2544067704434178, + "learning_rate": 1.4343651781843638e-06, + "loss": 0.5682, + "step": 1359 + }, + { + "epoch": 1.0747501731473237, + "grad_norm": 0.25308737067053566, + "learning_rate": 1.4336162194140032e-06, + "loss": 0.5765, + "step": 1360 + }, + { + "epoch": 1.0755417037696646, + "grad_norm": 0.2506401510851039, + "learning_rate": 1.4328669609679936e-06, + "loss": 0.559, + "step": 1361 + }, + { + "epoch": 1.0763332343920056, + "grad_norm": 0.25875349828322386, + "learning_rate": 1.4321174033641527e-06, + "loss": 0.5655, + "step": 1362 + }, + { + "epoch": 1.0771247650143465, + "grad_norm": 0.25059238334641404, + "learning_rate": 1.4313675471205064e-06, + "loss": 0.5656, + "step": 1363 + }, + { + "epoch": 1.0779162956366874, + "grad_norm": 0.25768255084006086, + "learning_rate": 1.4306173927552863e-06, + "loss": 0.5601, + "step": 1364 + }, + { + "epoch": 1.0787078262590284, + "grad_norm": 0.23875002879243465, + "learning_rate": 1.4298669407869294e-06, + "loss": 0.5557, + "step": 1365 + }, + { + "epoch": 1.0794993568813693, + "grad_norm": 0.2445795213472089, + "learning_rate": 1.4291161917340802e-06, + "loss": 0.5722, + "step": 1366 + }, + { + "epoch": 1.0802908875037103, + "grad_norm": 0.2510424953327719, + "learning_rate": 1.4283651461155865e-06, + "loss": 0.5607, + "step": 1367 + }, + { + "epoch": 1.0810824181260512, + "grad_norm": 0.25830090042117565, + "learning_rate": 1.427613804450502e-06, + "loss": 0.573, + "step": 1368 + }, + { + "epoch": 1.0818739487483922, + "grad_norm": 0.24261289539474715, + "learning_rate": 1.426862167258086e-06, + "loss": 0.5583, + "step": 1369 + }, + { + "epoch": 1.082665479370733, + "grad_norm": 0.24311075923898381, + "learning_rate": 1.4261102350577996e-06, + "loss": 0.5458, + "step": 1370 + }, + { + "epoch": 1.083457009993074, + "grad_norm": 0.25599732005442116, + "learning_rate": 1.4253580083693103e-06, + "loss": 0.5567, + "step": 1371 + }, + { + "epoch": 1.084248540615415, + "grad_norm": 0.26359055736961196, + "learning_rate": 1.424605487712487e-06, + "loss": 0.5521, + "step": 1372 + }, + { + "epoch": 1.085040071237756, + "grad_norm": 0.24429222575076712, + "learning_rate": 1.4238526736074037e-06, + "loss": 0.5547, + "step": 1373 + }, + { + "epoch": 1.0858316018600969, + "grad_norm": 0.2592387090737342, + "learning_rate": 1.4230995665743354e-06, + "loss": 0.5639, + "step": 1374 + }, + { + "epoch": 1.0866231324824378, + "grad_norm": 0.2576967328448918, + "learning_rate": 1.4223461671337611e-06, + "loss": 0.5633, + "step": 1375 + }, + { + "epoch": 1.087414663104779, + "grad_norm": 0.2539279510513475, + "learning_rate": 1.4215924758063612e-06, + "loss": 0.5693, + "step": 1376 + }, + { + "epoch": 1.08820619372712, + "grad_norm": 0.250940301496672, + "learning_rate": 1.4208384931130173e-06, + "loss": 0.5561, + "step": 1377 + }, + { + "epoch": 1.0889977243494608, + "grad_norm": 0.24976103373673933, + "learning_rate": 1.420084219574813e-06, + "loss": 0.5705, + "step": 1378 + }, + { + "epoch": 1.0897892549718018, + "grad_norm": 0.24263277731436725, + "learning_rate": 1.4193296557130332e-06, + "loss": 0.566, + "step": 1379 + }, + { + "epoch": 1.0905807855941427, + "grad_norm": 0.2633927958881685, + "learning_rate": 1.4185748020491628e-06, + "loss": 0.5788, + "step": 1380 + }, + { + "epoch": 1.0913723162164837, + "grad_norm": 0.24815938962315232, + "learning_rate": 1.4178196591048873e-06, + "loss": 0.5557, + "step": 1381 + }, + { + "epoch": 1.0921638468388246, + "grad_norm": 0.25115346718353765, + "learning_rate": 1.4170642274020921e-06, + "loss": 0.5644, + "step": 1382 + }, + { + "epoch": 1.0929553774611656, + "grad_norm": 0.25936916512413793, + "learning_rate": 1.4163085074628615e-06, + "loss": 0.5705, + "step": 1383 + }, + { + "epoch": 1.0937469080835065, + "grad_norm": 0.2540161130865951, + "learning_rate": 1.4155524998094804e-06, + "loss": 0.5703, + "step": 1384 + }, + { + "epoch": 1.0945384387058474, + "grad_norm": 0.2473168264796292, + "learning_rate": 1.4147962049644315e-06, + "loss": 0.5663, + "step": 1385 + }, + { + "epoch": 1.0953299693281884, + "grad_norm": 0.2478505866906445, + "learning_rate": 1.4140396234503964e-06, + "loss": 0.5641, + "step": 1386 + }, + { + "epoch": 1.0961214999505293, + "grad_norm": 0.24769702910603478, + "learning_rate": 1.4132827557902541e-06, + "loss": 0.5505, + "step": 1387 + }, + { + "epoch": 1.0969130305728703, + "grad_norm": 0.26318448683729706, + "learning_rate": 1.4125256025070824e-06, + "loss": 0.5709, + "step": 1388 + }, + { + "epoch": 1.0977045611952112, + "grad_norm": 0.24246624551299162, + "learning_rate": 1.4117681641241559e-06, + "loss": 0.5518, + "step": 1389 + }, + { + "epoch": 1.0984960918175521, + "grad_norm": 0.26244592210781836, + "learning_rate": 1.4110104411649463e-06, + "loss": 0.5692, + "step": 1390 + }, + { + "epoch": 1.099287622439893, + "grad_norm": 0.25937353050620887, + "learning_rate": 1.4102524341531223e-06, + "loss": 0.5543, + "step": 1391 + }, + { + "epoch": 1.100079153062234, + "grad_norm": 0.2606143038056361, + "learning_rate": 1.4094941436125482e-06, + "loss": 0.5683, + "step": 1392 + }, + { + "epoch": 1.100870683684575, + "grad_norm": 0.2589377474035992, + "learning_rate": 1.408735570067285e-06, + "loss": 0.5755, + "step": 1393 + }, + { + "epoch": 1.101662214306916, + "grad_norm": 0.2560676432428256, + "learning_rate": 1.4079767140415886e-06, + "loss": 0.5607, + "step": 1394 + }, + { + "epoch": 1.1024537449292569, + "grad_norm": 0.2544295215042865, + "learning_rate": 1.4072175760599107e-06, + "loss": 0.5658, + "step": 1395 + }, + { + "epoch": 1.1032452755515978, + "grad_norm": 0.25692156591295223, + "learning_rate": 1.406458156646898e-06, + "loss": 0.5491, + "step": 1396 + }, + { + "epoch": 1.104036806173939, + "grad_norm": 0.2615088469762251, + "learning_rate": 1.4056984563273906e-06, + "loss": 0.5663, + "step": 1397 + }, + { + "epoch": 1.10482833679628, + "grad_norm": 0.40982086382454475, + "learning_rate": 1.4049384756264243e-06, + "loss": 0.5676, + "step": 1398 + }, + { + "epoch": 1.1056198674186208, + "grad_norm": 0.2557534167110459, + "learning_rate": 1.4041782150692273e-06, + "loss": 0.5677, + "step": 1399 + }, + { + "epoch": 1.1064113980409618, + "grad_norm": 0.24867310110850344, + "learning_rate": 1.4034176751812219e-06, + "loss": 0.5698, + "step": 1400 + }, + { + "epoch": 1.1072029286633027, + "grad_norm": 0.24449685446842004, + "learning_rate": 1.4026568564880235e-06, + "loss": 0.5603, + "step": 1401 + }, + { + "epoch": 1.1079944592856437, + "grad_norm": 0.2629831409137662, + "learning_rate": 1.4018957595154395e-06, + "loss": 0.551, + "step": 1402 + }, + { + "epoch": 1.1087859899079846, + "grad_norm": 0.26123133768605006, + "learning_rate": 1.401134384789471e-06, + "loss": 0.5852, + "step": 1403 + }, + { + "epoch": 1.1095775205303255, + "grad_norm": 0.2440411786605679, + "learning_rate": 1.4003727328363092e-06, + "loss": 0.5623, + "step": 1404 + }, + { + "epoch": 1.1103690511526665, + "grad_norm": 0.23987090708226047, + "learning_rate": 1.3996108041823386e-06, + "loss": 0.5579, + "step": 1405 + }, + { + "epoch": 1.1111605817750074, + "grad_norm": 0.24206684985462035, + "learning_rate": 1.3988485993541342e-06, + "loss": 0.557, + "step": 1406 + }, + { + "epoch": 1.1119521123973484, + "grad_norm": 0.25278949872560796, + "learning_rate": 1.3980861188784613e-06, + "loss": 0.5633, + "step": 1407 + }, + { + "epoch": 1.1127436430196893, + "grad_norm": 0.2542737514370265, + "learning_rate": 1.3973233632822766e-06, + "loss": 0.5628, + "step": 1408 + }, + { + "epoch": 1.1135351736420303, + "grad_norm": 0.24384929625513102, + "learning_rate": 1.3965603330927267e-06, + "loss": 0.5773, + "step": 1409 + }, + { + "epoch": 1.1143267042643712, + "grad_norm": 0.24890884561690166, + "learning_rate": 1.3957970288371481e-06, + "loss": 0.5596, + "step": 1410 + }, + { + "epoch": 1.1151182348867121, + "grad_norm": 0.24306152813754728, + "learning_rate": 1.3950334510430662e-06, + "loss": 0.5689, + "step": 1411 + }, + { + "epoch": 1.115909765509053, + "grad_norm": 0.24893814741240725, + "learning_rate": 1.3942696002381954e-06, + "loss": 0.5643, + "step": 1412 + }, + { + "epoch": 1.116701296131394, + "grad_norm": 0.25140564614699834, + "learning_rate": 1.3935054769504392e-06, + "loss": 0.5675, + "step": 1413 + }, + { + "epoch": 1.117492826753735, + "grad_norm": 0.24362217835135425, + "learning_rate": 1.3927410817078907e-06, + "loss": 0.5595, + "step": 1414 + }, + { + "epoch": 1.118284357376076, + "grad_norm": 0.24781969989964167, + "learning_rate": 1.3919764150388276e-06, + "loss": 0.5696, + "step": 1415 + }, + { + "epoch": 1.1190758879984168, + "grad_norm": 0.24760728428101522, + "learning_rate": 1.3912114774717183e-06, + "loss": 0.5499, + "step": 1416 + }, + { + "epoch": 1.1198674186207578, + "grad_norm": 0.23910844633467185, + "learning_rate": 1.3904462695352171e-06, + "loss": 0.5443, + "step": 1417 + }, + { + "epoch": 1.120658949243099, + "grad_norm": 0.24489015265636474, + "learning_rate": 1.389680791758165e-06, + "loss": 0.5572, + "step": 1418 + }, + { + "epoch": 1.1214504798654399, + "grad_norm": 0.2460657292272078, + "learning_rate": 1.3889150446695907e-06, + "loss": 0.576, + "step": 1419 + }, + { + "epoch": 1.1222420104877808, + "grad_norm": 0.25033130711864643, + "learning_rate": 1.3881490287987075e-06, + "loss": 0.5544, + "step": 1420 + }, + { + "epoch": 1.1230335411101218, + "grad_norm": 0.24827949995915674, + "learning_rate": 1.387382744674915e-06, + "loss": 0.572, + "step": 1421 + }, + { + "epoch": 1.1238250717324627, + "grad_norm": 0.2378346073817711, + "learning_rate": 1.3866161928277986e-06, + "loss": 0.5682, + "step": 1422 + }, + { + "epoch": 1.1246166023548037, + "grad_norm": 0.2571384344357915, + "learning_rate": 1.3858493737871284e-06, + "loss": 0.5617, + "step": 1423 + }, + { + "epoch": 1.1254081329771446, + "grad_norm": 0.2384405621085864, + "learning_rate": 1.3850822880828593e-06, + "loss": 0.5564, + "step": 1424 + }, + { + "epoch": 1.1261996635994855, + "grad_norm": 0.24597836673469964, + "learning_rate": 1.38431493624513e-06, + "loss": 0.5633, + "step": 1425 + }, + { + "epoch": 1.1269911942218265, + "grad_norm": 0.24829013323231672, + "learning_rate": 1.3835473188042636e-06, + "loss": 0.5738, + "step": 1426 + }, + { + "epoch": 1.1277827248441674, + "grad_norm": 0.24780693733556894, + "learning_rate": 1.3827794362907672e-06, + "loss": 0.5496, + "step": 1427 + }, + { + "epoch": 1.1285742554665084, + "grad_norm": 0.2496677391898154, + "learning_rate": 1.3820112892353301e-06, + "loss": 0.5473, + "step": 1428 + }, + { + "epoch": 1.1293657860888493, + "grad_norm": 0.24347934169416524, + "learning_rate": 1.381242878168825e-06, + "loss": 0.5591, + "step": 1429 + }, + { + "epoch": 1.1301573167111902, + "grad_norm": 0.2949242019925856, + "learning_rate": 1.3804742036223071e-06, + "loss": 0.563, + "step": 1430 + }, + { + "epoch": 1.1309488473335312, + "grad_norm": 0.25587647799333957, + "learning_rate": 1.3797052661270132e-06, + "loss": 0.5595, + "step": 1431 + }, + { + "epoch": 1.1317403779558721, + "grad_norm": 0.2492426640162163, + "learning_rate": 1.3789360662143622e-06, + "loss": 0.5708, + "step": 1432 + }, + { + "epoch": 1.132531908578213, + "grad_norm": 0.24691132782837508, + "learning_rate": 1.378166604415955e-06, + "loss": 0.5669, + "step": 1433 + }, + { + "epoch": 1.133323439200554, + "grad_norm": 0.25270399928726467, + "learning_rate": 1.377396881263572e-06, + "loss": 0.5624, + "step": 1434 + }, + { + "epoch": 1.134114969822895, + "grad_norm": 0.24548804339917815, + "learning_rate": 1.3766268972891757e-06, + "loss": 0.5639, + "step": 1435 + }, + { + "epoch": 1.1349065004452359, + "grad_norm": 0.23617628723323206, + "learning_rate": 1.3758566530249074e-06, + "loss": 0.5604, + "step": 1436 + }, + { + "epoch": 1.1356980310675768, + "grad_norm": 0.25460525737316764, + "learning_rate": 1.3750861490030898e-06, + "loss": 0.5637, + "step": 1437 + }, + { + "epoch": 1.1364895616899178, + "grad_norm": 0.25667284385399564, + "learning_rate": 1.3743153857562244e-06, + "loss": 0.5674, + "step": 1438 + }, + { + "epoch": 1.137281092312259, + "grad_norm": 0.2435879950474742, + "learning_rate": 1.3735443638169914e-06, + "loss": 0.5698, + "step": 1439 + }, + { + "epoch": 1.1380726229345999, + "grad_norm": 0.2469201860892531, + "learning_rate": 1.3727730837182508e-06, + "loss": 0.562, + "step": 1440 + }, + { + "epoch": 1.1388641535569408, + "grad_norm": 0.2504993856121079, + "learning_rate": 1.3720015459930396e-06, + "loss": 0.575, + "step": 1441 + }, + { + "epoch": 1.1396556841792818, + "grad_norm": 0.23936229592820304, + "learning_rate": 1.3712297511745749e-06, + "loss": 0.5773, + "step": 1442 + }, + { + "epoch": 1.1404472148016227, + "grad_norm": 0.2519788767852563, + "learning_rate": 1.3704576997962495e-06, + "loss": 0.5725, + "step": 1443 + }, + { + "epoch": 1.1412387454239636, + "grad_norm": 0.2380806327883765, + "learning_rate": 1.3696853923916343e-06, + "loss": 0.558, + "step": 1444 + }, + { + "epoch": 1.1420302760463046, + "grad_norm": 0.25879478539242484, + "learning_rate": 1.3689128294944777e-06, + "loss": 0.5612, + "step": 1445 + }, + { + "epoch": 1.1428218066686455, + "grad_norm": 0.2525430153085939, + "learning_rate": 1.3681400116387033e-06, + "loss": 0.5626, + "step": 1446 + }, + { + "epoch": 1.1436133372909865, + "grad_norm": 0.2450210620431473, + "learning_rate": 1.3673669393584123e-06, + "loss": 0.5649, + "step": 1447 + }, + { + "epoch": 1.1444048679133274, + "grad_norm": 0.24000612889609255, + "learning_rate": 1.3665936131878814e-06, + "loss": 0.5488, + "step": 1448 + }, + { + "epoch": 1.1451963985356683, + "grad_norm": 0.25832369735084254, + "learning_rate": 1.3658200336615618e-06, + "loss": 0.5698, + "step": 1449 + }, + { + "epoch": 1.1459879291580093, + "grad_norm": 0.24163981156196715, + "learning_rate": 1.365046201314081e-06, + "loss": 0.5596, + "step": 1450 + }, + { + "epoch": 1.1467794597803502, + "grad_norm": 0.24601436851831515, + "learning_rate": 1.3642721166802406e-06, + "loss": 0.5575, + "step": 1451 + }, + { + "epoch": 1.1475709904026912, + "grad_norm": 0.25582215178071455, + "learning_rate": 1.3634977802950166e-06, + "loss": 0.5672, + "step": 1452 + }, + { + "epoch": 1.1483625210250321, + "grad_norm": 0.25421730792632513, + "learning_rate": 1.3627231926935596e-06, + "loss": 0.5867, + "step": 1453 + }, + { + "epoch": 1.149154051647373, + "grad_norm": 0.24605176555963193, + "learning_rate": 1.3619483544111925e-06, + "loss": 0.5779, + "step": 1454 + }, + { + "epoch": 1.149945582269714, + "grad_norm": 0.2339753161272484, + "learning_rate": 1.3611732659834128e-06, + "loss": 0.5494, + "step": 1455 + }, + { + "epoch": 1.150737112892055, + "grad_norm": 0.24505913123384715, + "learning_rate": 1.3603979279458903e-06, + "loss": 0.5484, + "step": 1456 + }, + { + "epoch": 1.1515286435143959, + "grad_norm": 0.24447597123717954, + "learning_rate": 1.3596223408344674e-06, + "loss": 0.5601, + "step": 1457 + }, + { + "epoch": 1.1523201741367368, + "grad_norm": 0.25432198557858027, + "learning_rate": 1.3588465051851583e-06, + "loss": 0.5635, + "step": 1458 + }, + { + "epoch": 1.1531117047590778, + "grad_norm": 0.23893493265840152, + "learning_rate": 1.3580704215341493e-06, + "loss": 0.5589, + "step": 1459 + }, + { + "epoch": 1.153903235381419, + "grad_norm": 0.24689412322769985, + "learning_rate": 1.3572940904177978e-06, + "loss": 0.5642, + "step": 1460 + }, + { + "epoch": 1.1546947660037599, + "grad_norm": 0.24518858861845222, + "learning_rate": 1.3565175123726334e-06, + "loss": 0.5741, + "step": 1461 + }, + { + "epoch": 1.1554862966261008, + "grad_norm": 0.27557275432437467, + "learning_rate": 1.3557406879353544e-06, + "loss": 0.5558, + "step": 1462 + }, + { + "epoch": 1.1562778272484417, + "grad_norm": 0.24806836507237567, + "learning_rate": 1.3549636176428309e-06, + "loss": 0.5459, + "step": 1463 + }, + { + "epoch": 1.1570693578707827, + "grad_norm": 0.24573547223625097, + "learning_rate": 1.3541863020321025e-06, + "loss": 0.5619, + "step": 1464 + }, + { + "epoch": 1.1578608884931236, + "grad_norm": 0.23943167718446362, + "learning_rate": 1.3534087416403777e-06, + "loss": 0.5597, + "step": 1465 + }, + { + "epoch": 1.1586524191154646, + "grad_norm": 0.24002190583674485, + "learning_rate": 1.3526309370050357e-06, + "loss": 0.5616, + "step": 1466 + }, + { + "epoch": 1.1594439497378055, + "grad_norm": 0.2516513879614623, + "learning_rate": 1.3518528886636223e-06, + "loss": 0.5689, + "step": 1467 + }, + { + "epoch": 1.1602354803601465, + "grad_norm": 0.24065964510116664, + "learning_rate": 1.3510745971538537e-06, + "loss": 0.5502, + "step": 1468 + }, + { + "epoch": 1.1610270109824874, + "grad_norm": 0.25407611161783095, + "learning_rate": 1.3502960630136135e-06, + "loss": 0.5848, + "step": 1469 + }, + { + "epoch": 1.1618185416048283, + "grad_norm": 0.2522333241246855, + "learning_rate": 1.3495172867809525e-06, + "loss": 0.5619, + "step": 1470 + }, + { + "epoch": 1.1626100722271693, + "grad_norm": 0.24130571124051212, + "learning_rate": 1.3487382689940895e-06, + "loss": 0.5702, + "step": 1471 + }, + { + "epoch": 1.1634016028495102, + "grad_norm": 0.24551806796107772, + "learning_rate": 1.3479590101914102e-06, + "loss": 0.56, + "step": 1472 + }, + { + "epoch": 1.1641931334718512, + "grad_norm": 0.24466831966052552, + "learning_rate": 1.3471795109114657e-06, + "loss": 0.56, + "step": 1473 + }, + { + "epoch": 1.164984664094192, + "grad_norm": 0.24683947507642787, + "learning_rate": 1.346399771692975e-06, + "loss": 0.5764, + "step": 1474 + }, + { + "epoch": 1.165776194716533, + "grad_norm": 0.2562690119188769, + "learning_rate": 1.3456197930748218e-06, + "loss": 0.5612, + "step": 1475 + }, + { + "epoch": 1.166567725338874, + "grad_norm": 0.25396326381412243, + "learning_rate": 1.3448395755960557e-06, + "loss": 0.5618, + "step": 1476 + }, + { + "epoch": 1.167359255961215, + "grad_norm": 0.24145350778239516, + "learning_rate": 1.3440591197958914e-06, + "loss": 0.5624, + "step": 1477 + }, + { + "epoch": 1.1681507865835559, + "grad_norm": 0.2570848035221008, + "learning_rate": 1.343278426213708e-06, + "loss": 0.566, + "step": 1478 + }, + { + "epoch": 1.1689423172058968, + "grad_norm": 0.25821666285717104, + "learning_rate": 1.3424974953890486e-06, + "loss": 0.5481, + "step": 1479 + }, + { + "epoch": 1.1697338478282377, + "grad_norm": 0.2540322071803146, + "learning_rate": 1.3417163278616219e-06, + "loss": 0.5614, + "step": 1480 + }, + { + "epoch": 1.170525378450579, + "grad_norm": 0.24994903320454964, + "learning_rate": 1.340934924171298e-06, + "loss": 0.5683, + "step": 1481 + }, + { + "epoch": 1.1713169090729199, + "grad_norm": 0.24481055255582856, + "learning_rate": 1.3401532848581118e-06, + "loss": 0.5522, + "step": 1482 + }, + { + "epoch": 1.1721084396952608, + "grad_norm": 0.24942373276188304, + "learning_rate": 1.3393714104622605e-06, + "loss": 0.5594, + "step": 1483 + }, + { + "epoch": 1.1728999703176017, + "grad_norm": 0.26081266748278387, + "learning_rate": 1.3385893015241034e-06, + "loss": 0.5656, + "step": 1484 + }, + { + "epoch": 1.1736915009399427, + "grad_norm": 0.26034434454258165, + "learning_rate": 1.3378069585841628e-06, + "loss": 0.5638, + "step": 1485 + }, + { + "epoch": 1.1744830315622836, + "grad_norm": 0.2513850183241021, + "learning_rate": 1.3370243821831215e-06, + "loss": 0.5566, + "step": 1486 + }, + { + "epoch": 1.1752745621846246, + "grad_norm": 0.24043759547874047, + "learning_rate": 1.336241572861825e-06, + "loss": 0.5731, + "step": 1487 + }, + { + "epoch": 1.1760660928069655, + "grad_norm": 0.2580029426084782, + "learning_rate": 1.3354585311612787e-06, + "loss": 0.5627, + "step": 1488 + }, + { + "epoch": 1.1768576234293064, + "grad_norm": 0.27269605348413717, + "learning_rate": 1.3346752576226495e-06, + "loss": 0.5471, + "step": 1489 + }, + { + "epoch": 1.1776491540516474, + "grad_norm": 0.25760244919508485, + "learning_rate": 1.3338917527872633e-06, + "loss": 0.5577, + "step": 1490 + }, + { + "epoch": 1.1784406846739883, + "grad_norm": 0.24204885480871255, + "learning_rate": 1.3331080171966069e-06, + "loss": 0.5507, + "step": 1491 + }, + { + "epoch": 1.1792322152963293, + "grad_norm": 0.24490566217327128, + "learning_rate": 1.3323240513923266e-06, + "loss": 0.559, + "step": 1492 + }, + { + "epoch": 1.1800237459186702, + "grad_norm": 0.24394288222382954, + "learning_rate": 1.3315398559162268e-06, + "loss": 0.5681, + "step": 1493 + }, + { + "epoch": 1.1808152765410111, + "grad_norm": 0.2487697419681786, + "learning_rate": 1.330755431310272e-06, + "loss": 0.5627, + "step": 1494 + }, + { + "epoch": 1.181606807163352, + "grad_norm": 0.2577056813523912, + "learning_rate": 1.3299707781165838e-06, + "loss": 0.5682, + "step": 1495 + }, + { + "epoch": 1.182398337785693, + "grad_norm": 0.252776565485377, + "learning_rate": 1.3291858968774425e-06, + "loss": 0.5504, + "step": 1496 + }, + { + "epoch": 1.183189868408034, + "grad_norm": 0.23213210854724767, + "learning_rate": 1.3284007881352857e-06, + "loss": 0.5657, + "step": 1497 + }, + { + "epoch": 1.183981399030375, + "grad_norm": 0.24680774056848112, + "learning_rate": 1.3276154524327088e-06, + "loss": 0.5723, + "step": 1498 + }, + { + "epoch": 1.1847729296527159, + "grad_norm": 0.25150523401632685, + "learning_rate": 1.3268298903124632e-06, + "loss": 0.5589, + "step": 1499 + }, + { + "epoch": 1.1855644602750568, + "grad_norm": 0.251403052409526, + "learning_rate": 1.3260441023174573e-06, + "loss": 0.5676, + "step": 1500 + }, + { + "epoch": 1.1863559908973977, + "grad_norm": 0.25303547261663323, + "learning_rate": 1.3252580889907552e-06, + "loss": 0.571, + "step": 1501 + }, + { + "epoch": 1.187147521519739, + "grad_norm": 0.24525609893856698, + "learning_rate": 1.3244718508755775e-06, + "loss": 0.5723, + "step": 1502 + }, + { + "epoch": 1.1879390521420798, + "grad_norm": 0.24875562634094203, + "learning_rate": 1.3236853885152993e-06, + "loss": 0.5567, + "step": 1503 + }, + { + "epoch": 1.1887305827644208, + "grad_norm": 0.2463317871919473, + "learning_rate": 1.3228987024534515e-06, + "loss": 0.5715, + "step": 1504 + }, + { + "epoch": 1.1895221133867617, + "grad_norm": 0.24545871745197517, + "learning_rate": 1.3221117932337187e-06, + "loss": 0.5717, + "step": 1505 + }, + { + "epoch": 1.1903136440091027, + "grad_norm": 0.2473448767957027, + "learning_rate": 1.3213246613999403e-06, + "loss": 0.5675, + "step": 1506 + }, + { + "epoch": 1.1911051746314436, + "grad_norm": 0.24629434812573725, + "learning_rate": 1.3205373074961095e-06, + "loss": 0.559, + "step": 1507 + }, + { + "epoch": 1.1918967052537845, + "grad_norm": 0.24434319715655584, + "learning_rate": 1.319749732066373e-06, + "loss": 0.5592, + "step": 1508 + }, + { + "epoch": 1.1926882358761255, + "grad_norm": 0.24964508059499005, + "learning_rate": 1.3189619356550301e-06, + "loss": 0.5745, + "step": 1509 + }, + { + "epoch": 1.1934797664984664, + "grad_norm": 0.25254233963159106, + "learning_rate": 1.3181739188065337e-06, + "loss": 0.5565, + "step": 1510 + }, + { + "epoch": 1.1942712971208074, + "grad_norm": 0.24142885431486732, + "learning_rate": 1.3173856820654884e-06, + "loss": 0.5544, + "step": 1511 + }, + { + "epoch": 1.1950628277431483, + "grad_norm": 0.24365351975783034, + "learning_rate": 1.3165972259766504e-06, + "loss": 0.5678, + "step": 1512 + }, + { + "epoch": 1.1958543583654893, + "grad_norm": 0.24247554455887, + "learning_rate": 1.3158085510849292e-06, + "loss": 0.5498, + "step": 1513 + }, + { + "epoch": 1.1966458889878302, + "grad_norm": 0.24241098462339952, + "learning_rate": 1.3150196579353833e-06, + "loss": 0.5648, + "step": 1514 + }, + { + "epoch": 1.1974374196101711, + "grad_norm": 0.24249053802488338, + "learning_rate": 1.3142305470732235e-06, + "loss": 0.5608, + "step": 1515 + }, + { + "epoch": 1.198228950232512, + "grad_norm": 0.24206188838756354, + "learning_rate": 1.3134412190438108e-06, + "loss": 0.5813, + "step": 1516 + }, + { + "epoch": 1.199020480854853, + "grad_norm": 0.2514494077696141, + "learning_rate": 1.3126516743926557e-06, + "loss": 0.555, + "step": 1517 + }, + { + "epoch": 1.199812011477194, + "grad_norm": 0.24250342755119866, + "learning_rate": 1.3118619136654194e-06, + "loss": 0.5824, + "step": 1518 + }, + { + "epoch": 1.200603542099535, + "grad_norm": 0.23787699251894545, + "learning_rate": 1.3110719374079114e-06, + "loss": 0.5562, + "step": 1519 + }, + { + "epoch": 1.2013950727218758, + "grad_norm": 0.2479687422231374, + "learning_rate": 1.3102817461660907e-06, + "loss": 0.56, + "step": 1520 + }, + { + "epoch": 1.2021866033442168, + "grad_norm": 0.23767241740347347, + "learning_rate": 1.309491340486065e-06, + "loss": 0.5574, + "step": 1521 + }, + { + "epoch": 1.2029781339665577, + "grad_norm": 0.2446681705477864, + "learning_rate": 1.30870072091409e-06, + "loss": 0.5622, + "step": 1522 + }, + { + "epoch": 1.203769664588899, + "grad_norm": 0.24475978456393724, + "learning_rate": 1.3079098879965692e-06, + "loss": 0.5478, + "step": 1523 + }, + { + "epoch": 1.2045611952112396, + "grad_norm": 0.24473406227114267, + "learning_rate": 1.3071188422800533e-06, + "loss": 0.5373, + "step": 1524 + }, + { + "epoch": 1.2053527258335808, + "grad_norm": 0.2429482384203616, + "learning_rate": 1.3063275843112408e-06, + "loss": 0.5667, + "step": 1525 + }, + { + "epoch": 1.2061442564559217, + "grad_norm": 0.2569384178130532, + "learning_rate": 1.305536114636976e-06, + "loss": 0.5667, + "step": 1526 + }, + { + "epoch": 1.2069357870782627, + "grad_norm": 0.25473603378901205, + "learning_rate": 1.3047444338042503e-06, + "loss": 0.5591, + "step": 1527 + }, + { + "epoch": 1.2077273177006036, + "grad_norm": 0.25697854502479367, + "learning_rate": 1.3039525423602005e-06, + "loss": 0.5775, + "step": 1528 + }, + { + "epoch": 1.2085188483229445, + "grad_norm": 0.25238019491301633, + "learning_rate": 1.3031604408521091e-06, + "loss": 0.5844, + "step": 1529 + }, + { + "epoch": 1.2093103789452855, + "grad_norm": 0.25229905848431244, + "learning_rate": 1.302368129827404e-06, + "loss": 0.5854, + "step": 1530 + }, + { + "epoch": 1.2101019095676264, + "grad_norm": 0.2664347693257625, + "learning_rate": 1.301575609833657e-06, + "loss": 0.5562, + "step": 1531 + }, + { + "epoch": 1.2108934401899674, + "grad_norm": 0.2496922180924536, + "learning_rate": 1.3007828814185865e-06, + "loss": 0.5684, + "step": 1532 + }, + { + "epoch": 1.2116849708123083, + "grad_norm": 0.2570095455485703, + "learning_rate": 1.299989945130052e-06, + "loss": 0.5587, + "step": 1533 + }, + { + "epoch": 1.2124765014346492, + "grad_norm": 0.2504829012924918, + "learning_rate": 1.299196801516059e-06, + "loss": 0.5739, + "step": 1534 + }, + { + "epoch": 1.2132680320569902, + "grad_norm": 0.25609771518114627, + "learning_rate": 1.2984034511247549e-06, + "loss": 0.5503, + "step": 1535 + }, + { + "epoch": 1.2140595626793311, + "grad_norm": 0.2588638522992403, + "learning_rate": 1.297609894504431e-06, + "loss": 0.5642, + "step": 1536 + }, + { + "epoch": 1.214851093301672, + "grad_norm": 0.24021387096343427, + "learning_rate": 1.2968161322035203e-06, + "loss": 0.5589, + "step": 1537 + }, + { + "epoch": 1.215642623924013, + "grad_norm": 0.2466955191888222, + "learning_rate": 1.2960221647705983e-06, + "loss": 0.5628, + "step": 1538 + }, + { + "epoch": 1.216434154546354, + "grad_norm": 0.24947350639256644, + "learning_rate": 1.2952279927543824e-06, + "loss": 0.5776, + "step": 1539 + }, + { + "epoch": 1.217225685168695, + "grad_norm": 0.2560276558387007, + "learning_rate": 1.294433616703731e-06, + "loss": 0.575, + "step": 1540 + }, + { + "epoch": 1.2180172157910358, + "grad_norm": 0.2460633195241213, + "learning_rate": 1.2936390371676444e-06, + "loss": 0.5613, + "step": 1541 + }, + { + "epoch": 1.2188087464133768, + "grad_norm": 0.24500623054045792, + "learning_rate": 1.2928442546952624e-06, + "loss": 0.5732, + "step": 1542 + }, + { + "epoch": 1.2196002770357177, + "grad_norm": 0.2519740921886011, + "learning_rate": 1.2920492698358654e-06, + "loss": 0.5606, + "step": 1543 + }, + { + "epoch": 1.2203918076580589, + "grad_norm": 0.25154306115101094, + "learning_rate": 1.291254083138874e-06, + "loss": 0.5673, + "step": 1544 + }, + { + "epoch": 1.2211833382803996, + "grad_norm": 0.23946440421747092, + "learning_rate": 1.2904586951538482e-06, + "loss": 0.5735, + "step": 1545 + }, + { + "epoch": 1.2219748689027408, + "grad_norm": 0.2444312352710892, + "learning_rate": 1.2896631064304875e-06, + "loss": 0.5624, + "step": 1546 + }, + { + "epoch": 1.2227663995250817, + "grad_norm": 0.2417617325038712, + "learning_rate": 1.2888673175186285e-06, + "loss": 0.5539, + "step": 1547 + }, + { + "epoch": 1.2235579301474226, + "grad_norm": 0.2433204443864132, + "learning_rate": 1.2880713289682482e-06, + "loss": 0.5792, + "step": 1548 + }, + { + "epoch": 1.2243494607697636, + "grad_norm": 0.2412703302687474, + "learning_rate": 1.2872751413294603e-06, + "loss": 0.5412, + "step": 1549 + }, + { + "epoch": 1.2251409913921045, + "grad_norm": 0.244643351278545, + "learning_rate": 1.2864787551525165e-06, + "loss": 0.5694, + "step": 1550 + }, + { + "epoch": 1.2259325220144455, + "grad_norm": 0.23973020727942237, + "learning_rate": 1.285682170987806e-06, + "loss": 0.5735, + "step": 1551 + }, + { + "epoch": 1.2267240526367864, + "grad_norm": 0.24234936017315478, + "learning_rate": 1.2848853893858538e-06, + "loss": 0.5652, + "step": 1552 + }, + { + "epoch": 1.2275155832591274, + "grad_norm": 0.2586282557349905, + "learning_rate": 1.284088410897323e-06, + "loss": 0.5605, + "step": 1553 + }, + { + "epoch": 1.2283071138814683, + "grad_norm": 0.24505684808942604, + "learning_rate": 1.283291236073011e-06, + "loss": 0.5547, + "step": 1554 + }, + { + "epoch": 1.2290986445038092, + "grad_norm": 0.251500866370209, + "learning_rate": 1.2824938654638523e-06, + "loss": 0.5574, + "step": 1555 + }, + { + "epoch": 1.2298901751261502, + "grad_norm": 0.24257617276229285, + "learning_rate": 1.2816962996209158e-06, + "loss": 0.5623, + "step": 1556 + }, + { + "epoch": 1.2306817057484911, + "grad_norm": 0.24439097721753178, + "learning_rate": 1.2808985390954058e-06, + "loss": 0.5528, + "step": 1557 + }, + { + "epoch": 1.231473236370832, + "grad_norm": 0.24002520337491023, + "learning_rate": 1.2801005844386608e-06, + "loss": 0.5457, + "step": 1558 + }, + { + "epoch": 1.232264766993173, + "grad_norm": 0.2489675951780969, + "learning_rate": 1.2793024362021539e-06, + "loss": 0.5625, + "step": 1559 + }, + { + "epoch": 1.233056297615514, + "grad_norm": 0.24263360465866113, + "learning_rate": 1.2785040949374913e-06, + "loss": 0.574, + "step": 1560 + }, + { + "epoch": 1.2338478282378549, + "grad_norm": 0.23885755915714316, + "learning_rate": 1.2777055611964143e-06, + "loss": 0.552, + "step": 1561 + }, + { + "epoch": 1.2346393588601958, + "grad_norm": 0.24079241769982115, + "learning_rate": 1.2769068355307944e-06, + "loss": 0.5677, + "step": 1562 + }, + { + "epoch": 1.2354308894825368, + "grad_norm": 0.2609744072790651, + "learning_rate": 1.2761079184926383e-06, + "loss": 0.5477, + "step": 1563 + }, + { + "epoch": 1.2362224201048777, + "grad_norm": 0.24786241690689384, + "learning_rate": 1.2753088106340834e-06, + "loss": 0.5567, + "step": 1564 + }, + { + "epoch": 1.2370139507272189, + "grad_norm": 0.24911141713974588, + "learning_rate": 1.2745095125074e-06, + "loss": 0.5405, + "step": 1565 + }, + { + "epoch": 1.2378054813495596, + "grad_norm": 0.24570528505748546, + "learning_rate": 1.2737100246649894e-06, + "loss": 0.5622, + "step": 1566 + }, + { + "epoch": 1.2385970119719008, + "grad_norm": 0.23695499893006872, + "learning_rate": 1.2729103476593839e-06, + "loss": 0.5591, + "step": 1567 + }, + { + "epoch": 1.2393885425942417, + "grad_norm": 0.25451569776294236, + "learning_rate": 1.272110482043247e-06, + "loss": 0.5478, + "step": 1568 + }, + { + "epoch": 1.2401800732165826, + "grad_norm": 0.26976294471550605, + "learning_rate": 1.2713104283693717e-06, + "loss": 0.5625, + "step": 1569 + }, + { + "epoch": 1.2409716038389236, + "grad_norm": 0.25424281949570265, + "learning_rate": 1.2705101871906819e-06, + "loss": 0.5721, + "step": 1570 + }, + { + "epoch": 1.2417631344612645, + "grad_norm": 0.3145157934672575, + "learning_rate": 1.2697097590602309e-06, + "loss": 0.5653, + "step": 1571 + }, + { + "epoch": 1.2425546650836055, + "grad_norm": 0.26214677001673914, + "learning_rate": 1.2689091445312002e-06, + "loss": 0.5602, + "step": 1572 + }, + { + "epoch": 1.2433461957059464, + "grad_norm": 0.248967012073164, + "learning_rate": 1.2681083441569016e-06, + "loss": 0.5699, + "step": 1573 + }, + { + "epoch": 1.2441377263282873, + "grad_norm": 0.24808362195885564, + "learning_rate": 1.2673073584907748e-06, + "loss": 0.5566, + "step": 1574 + }, + { + "epoch": 1.2449292569506283, + "grad_norm": 0.24101514028608262, + "learning_rate": 1.266506188086387e-06, + "loss": 0.5426, + "step": 1575 + }, + { + "epoch": 1.2457207875729692, + "grad_norm": 0.2483643234417837, + "learning_rate": 1.265704833497434e-06, + "loss": 0.5553, + "step": 1576 + }, + { + "epoch": 1.2465123181953102, + "grad_norm": 0.25406125539863356, + "learning_rate": 1.2649032952777374e-06, + "loss": 0.5452, + "step": 1577 + }, + { + "epoch": 1.247303848817651, + "grad_norm": 0.25646760949952596, + "learning_rate": 1.2641015739812474e-06, + "loss": 0.5733, + "step": 1578 + }, + { + "epoch": 1.248095379439992, + "grad_norm": 0.2501605676457125, + "learning_rate": 1.2632996701620405e-06, + "loss": 0.565, + "step": 1579 + }, + { + "epoch": 1.248886910062333, + "grad_norm": 0.25303082598696736, + "learning_rate": 1.262497584374318e-06, + "loss": 0.5587, + "step": 1580 + }, + { + "epoch": 1.249678440684674, + "grad_norm": 0.24583501946750871, + "learning_rate": 1.2616953171724078e-06, + "loss": 0.5726, + "step": 1581 + }, + { + "epoch": 1.2504699713070149, + "grad_norm": 0.2597718060699548, + "learning_rate": 1.260892869110764e-06, + "loss": 0.563, + "step": 1582 + }, + { + "epoch": 1.2512615019293558, + "grad_norm": 0.23711734212394223, + "learning_rate": 1.2600902407439643e-06, + "loss": 0.5581, + "step": 1583 + }, + { + "epoch": 1.252053032551697, + "grad_norm": 0.2379141319323817, + "learning_rate": 1.259287432626712e-06, + "loss": 0.5453, + "step": 1584 + }, + { + "epoch": 1.2528445631740377, + "grad_norm": 0.24675536545058904, + "learning_rate": 1.2584844453138334e-06, + "loss": 0.5534, + "step": 1585 + }, + { + "epoch": 1.2536360937963789, + "grad_norm": 0.2416337478880738, + "learning_rate": 1.2576812793602804e-06, + "loss": 0.5531, + "step": 1586 + }, + { + "epoch": 1.2544276244187196, + "grad_norm": 0.25820029418212614, + "learning_rate": 1.2568779353211272e-06, + "loss": 0.5551, + "step": 1587 + }, + { + "epoch": 1.2552191550410607, + "grad_norm": 0.2493044952822321, + "learning_rate": 1.256074413751571e-06, + "loss": 0.5646, + "step": 1588 + }, + { + "epoch": 1.2560106856634017, + "grad_norm": 0.2494315770295232, + "learning_rate": 1.2552707152069325e-06, + "loss": 0.5644, + "step": 1589 + }, + { + "epoch": 1.2568022162857426, + "grad_norm": 0.2430525866343491, + "learning_rate": 1.2544668402426538e-06, + "loss": 0.5618, + "step": 1590 + }, + { + "epoch": 1.2575937469080836, + "grad_norm": 0.24358367177137008, + "learning_rate": 1.253662789414299e-06, + "loss": 0.5691, + "step": 1591 + }, + { + "epoch": 1.2583852775304245, + "grad_norm": 0.2535198189922417, + "learning_rate": 1.252858563277555e-06, + "loss": 0.5737, + "step": 1592 + }, + { + "epoch": 1.2591768081527654, + "grad_norm": 0.24582225593517482, + "learning_rate": 1.2520541623882287e-06, + "loss": 0.5499, + "step": 1593 + }, + { + "epoch": 1.2599683387751064, + "grad_norm": 0.24472109624030428, + "learning_rate": 1.2512495873022474e-06, + "loss": 0.5543, + "step": 1594 + }, + { + "epoch": 1.2607598693974473, + "grad_norm": 0.25137341214156705, + "learning_rate": 1.2504448385756598e-06, + "loss": 0.5464, + "step": 1595 + }, + { + "epoch": 1.2615514000197883, + "grad_norm": 0.2627554199798165, + "learning_rate": 1.2496399167646342e-06, + "loss": 0.5566, + "step": 1596 + }, + { + "epoch": 1.2623429306421292, + "grad_norm": 0.24264954666822985, + "learning_rate": 1.2488348224254588e-06, + "loss": 0.5537, + "step": 1597 + }, + { + "epoch": 1.2631344612644702, + "grad_norm": 0.23472343110165828, + "learning_rate": 1.2480295561145405e-06, + "loss": 0.5536, + "step": 1598 + }, + { + "epoch": 1.263925991886811, + "grad_norm": 0.23799331683738528, + "learning_rate": 1.247224118388405e-06, + "loss": 0.5651, + "step": 1599 + }, + { + "epoch": 1.264717522509152, + "grad_norm": 0.24011674675311853, + "learning_rate": 1.246418509803697e-06, + "loss": 0.5637, + "step": 1600 + }, + { + "epoch": 1.265509053131493, + "grad_norm": 0.2417891185532008, + "learning_rate": 1.2456127309171793e-06, + "loss": 0.5638, + "step": 1601 + }, + { + "epoch": 1.266300583753834, + "grad_norm": 0.24520499447330632, + "learning_rate": 1.244806782285732e-06, + "loss": 0.5686, + "step": 1602 + }, + { + "epoch": 1.2670921143761749, + "grad_norm": 0.24736570319096188, + "learning_rate": 1.2440006644663528e-06, + "loss": 0.5573, + "step": 1603 + }, + { + "epoch": 1.2678836449985158, + "grad_norm": 0.2394906420948201, + "learning_rate": 1.2431943780161556e-06, + "loss": 0.5683, + "step": 1604 + }, + { + "epoch": 1.268675175620857, + "grad_norm": 0.24619462045414997, + "learning_rate": 1.2423879234923723e-06, + "loss": 0.5663, + "step": 1605 + }, + { + "epoch": 1.2694667062431977, + "grad_norm": 0.24828261671946197, + "learning_rate": 1.2415813014523496e-06, + "loss": 0.566, + "step": 1606 + }, + { + "epoch": 1.2702582368655388, + "grad_norm": 0.23611278428927193, + "learning_rate": 1.2407745124535505e-06, + "loss": 0.5645, + "step": 1607 + }, + { + "epoch": 1.2710497674878796, + "grad_norm": 0.24202287638992181, + "learning_rate": 1.2399675570535532e-06, + "loss": 0.58, + "step": 1608 + }, + { + "epoch": 1.2718412981102207, + "grad_norm": 0.23897979132953442, + "learning_rate": 1.2391604358100514e-06, + "loss": 0.5557, + "step": 1609 + }, + { + "epoch": 1.2726328287325617, + "grad_norm": 3.69549056371507, + "learning_rate": 1.2383531492808529e-06, + "loss": 0.5745, + "step": 1610 + }, + { + "epoch": 1.2734243593549026, + "grad_norm": 0.24544340196952252, + "learning_rate": 1.23754569802388e-06, + "loss": 0.5661, + "step": 1611 + }, + { + "epoch": 1.2742158899772436, + "grad_norm": 0.25838718505166175, + "learning_rate": 1.2367380825971686e-06, + "loss": 0.5431, + "step": 1612 + }, + { + "epoch": 1.2750074205995845, + "grad_norm": 0.25681360600743736, + "learning_rate": 1.2359303035588683e-06, + "loss": 0.5499, + "step": 1613 + }, + { + "epoch": 1.2757989512219254, + "grad_norm": 0.2360565094592681, + "learning_rate": 1.2351223614672417e-06, + "loss": 0.5618, + "step": 1614 + }, + { + "epoch": 1.2765904818442664, + "grad_norm": 0.23704001453136386, + "learning_rate": 1.2343142568806638e-06, + "loss": 0.557, + "step": 1615 + }, + { + "epoch": 1.2773820124666073, + "grad_norm": 0.2550329749989943, + "learning_rate": 1.2335059903576227e-06, + "loss": 0.5657, + "step": 1616 + }, + { + "epoch": 1.2781735430889483, + "grad_norm": 0.24638931620607757, + "learning_rate": 1.2326975624567173e-06, + "loss": 0.5601, + "step": 1617 + }, + { + "epoch": 1.2789650737112892, + "grad_norm": 0.2451404275754642, + "learning_rate": 1.231888973736659e-06, + "loss": 0.5636, + "step": 1618 + }, + { + "epoch": 1.2797566043336301, + "grad_norm": 0.2490864823150462, + "learning_rate": 1.2310802247562696e-06, + "loss": 0.5668, + "step": 1619 + }, + { + "epoch": 1.280548134955971, + "grad_norm": 0.2494801103850093, + "learning_rate": 1.2302713160744822e-06, + "loss": 0.5563, + "step": 1620 + }, + { + "epoch": 1.281339665578312, + "grad_norm": 0.2455574765759398, + "learning_rate": 1.2294622482503403e-06, + "loss": 0.5471, + "step": 1621 + }, + { + "epoch": 1.282131196200653, + "grad_norm": 0.24880822483734186, + "learning_rate": 1.2286530218429972e-06, + "loss": 0.5616, + "step": 1622 + }, + { + "epoch": 1.282922726822994, + "grad_norm": 0.2392897415582679, + "learning_rate": 1.2278436374117151e-06, + "loss": 0.5693, + "step": 1623 + }, + { + "epoch": 1.2837142574453348, + "grad_norm": 0.24940610953675144, + "learning_rate": 1.2270340955158668e-06, + "loss": 0.5542, + "step": 1624 + }, + { + "epoch": 1.2845057880676758, + "grad_norm": 0.2500966139602465, + "learning_rate": 1.2262243967149326e-06, + "loss": 0.5415, + "step": 1625 + }, + { + "epoch": 1.285297318690017, + "grad_norm": 0.24248185785508394, + "learning_rate": 1.2254145415685025e-06, + "loss": 0.5603, + "step": 1626 + }, + { + "epoch": 1.2860888493123577, + "grad_norm": 0.24258695020659132, + "learning_rate": 1.2246045306362736e-06, + "loss": 0.5651, + "step": 1627 + }, + { + "epoch": 1.2868803799346988, + "grad_norm": 0.254653360005497, + "learning_rate": 1.2237943644780507e-06, + "loss": 0.5442, + "step": 1628 + }, + { + "epoch": 1.2876719105570396, + "grad_norm": 0.2493302726077809, + "learning_rate": 1.2229840436537464e-06, + "loss": 0.5724, + "step": 1629 + }, + { + "epoch": 1.2884634411793807, + "grad_norm": 0.2402514109423032, + "learning_rate": 1.2221735687233797e-06, + "loss": 0.5564, + "step": 1630 + }, + { + "epoch": 1.2892549718017217, + "grad_norm": 0.2471478455884797, + "learning_rate": 1.2213629402470766e-06, + "loss": 0.5627, + "step": 1631 + }, + { + "epoch": 1.2900465024240626, + "grad_norm": 0.23997860829436274, + "learning_rate": 1.2205521587850692e-06, + "loss": 0.5465, + "step": 1632 + }, + { + "epoch": 1.2908380330464035, + "grad_norm": 0.24985884693129512, + "learning_rate": 1.2197412248976942e-06, + "loss": 0.5575, + "step": 1633 + }, + { + "epoch": 1.2916295636687445, + "grad_norm": 0.24793115694862064, + "learning_rate": 1.2189301391453954e-06, + "loss": 0.5561, + "step": 1634 + }, + { + "epoch": 1.2924210942910854, + "grad_norm": 0.24439507970308486, + "learning_rate": 1.21811890208872e-06, + "loss": 0.5728, + "step": 1635 + }, + { + "epoch": 1.2932126249134264, + "grad_norm": 0.24945868420510264, + "learning_rate": 1.217307514288321e-06, + "loss": 0.5568, + "step": 1636 + }, + { + "epoch": 1.2940041555357673, + "grad_norm": 0.24873126082475502, + "learning_rate": 1.2164959763049547e-06, + "loss": 0.5465, + "step": 1637 + }, + { + "epoch": 1.2947956861581082, + "grad_norm": 0.2456054378861574, + "learning_rate": 1.2156842886994815e-06, + "loss": 0.5637, + "step": 1638 + }, + { + "epoch": 1.2955872167804492, + "grad_norm": 0.240049154102141, + "learning_rate": 1.2148724520328655e-06, + "loss": 0.554, + "step": 1639 + }, + { + "epoch": 1.2963787474027901, + "grad_norm": 0.25353406818399277, + "learning_rate": 1.214060466866173e-06, + "loss": 0.5584, + "step": 1640 + }, + { + "epoch": 1.297170278025131, + "grad_norm": 0.2365662911935809, + "learning_rate": 1.2132483337605744e-06, + "loss": 0.5671, + "step": 1641 + }, + { + "epoch": 1.297961808647472, + "grad_norm": 0.2463877003258589, + "learning_rate": 1.2124360532773402e-06, + "loss": 0.5588, + "step": 1642 + }, + { + "epoch": 1.298753339269813, + "grad_norm": 0.2459247729337839, + "learning_rate": 1.211623625977845e-06, + "loss": 0.5651, + "step": 1643 + }, + { + "epoch": 1.299544869892154, + "grad_norm": 0.2460048320809122, + "learning_rate": 1.210811052423563e-06, + "loss": 0.5464, + "step": 1644 + }, + { + "epoch": 1.3003364005144948, + "grad_norm": 0.25144426047242724, + "learning_rate": 1.2099983331760713e-06, + "loss": 0.5591, + "step": 1645 + }, + { + "epoch": 1.3011279311368358, + "grad_norm": 0.24129146619153063, + "learning_rate": 1.2091854687970455e-06, + "loss": 0.5486, + "step": 1646 + }, + { + "epoch": 1.301919461759177, + "grad_norm": 0.2452087727819343, + "learning_rate": 1.2083724598482636e-06, + "loss": 0.5606, + "step": 1647 + }, + { + "epoch": 1.3027109923815177, + "grad_norm": 0.2558418227604141, + "learning_rate": 1.2075593068916022e-06, + "loss": 0.567, + "step": 1648 + }, + { + "epoch": 1.3035025230038588, + "grad_norm": 0.24655924383451752, + "learning_rate": 1.2067460104890377e-06, + "loss": 0.5533, + "step": 1649 + }, + { + "epoch": 1.3042940536261995, + "grad_norm": 0.2444611861271756, + "learning_rate": 1.2059325712026468e-06, + "loss": 0.5631, + "step": 1650 + }, + { + "epoch": 1.3050855842485407, + "grad_norm": 0.24142955295091997, + "learning_rate": 1.2051189895946024e-06, + "loss": 0.5577, + "step": 1651 + }, + { + "epoch": 1.3058771148708817, + "grad_norm": 0.25144848411228726, + "learning_rate": 1.2043052662271781e-06, + "loss": 0.5451, + "step": 1652 + }, + { + "epoch": 1.3066686454932226, + "grad_norm": 0.24858016163654087, + "learning_rate": 1.2034914016627447e-06, + "loss": 0.5624, + "step": 1653 + }, + { + "epoch": 1.3074601761155635, + "grad_norm": 0.25024588430180106, + "learning_rate": 1.2026773964637703e-06, + "loss": 0.5632, + "step": 1654 + }, + { + "epoch": 1.3082517067379045, + "grad_norm": 0.23830548334228782, + "learning_rate": 1.2018632511928208e-06, + "loss": 0.548, + "step": 1655 + }, + { + "epoch": 1.3090432373602454, + "grad_norm": 0.2538608293049789, + "learning_rate": 1.201048966412558e-06, + "loss": 0.5675, + "step": 1656 + }, + { + "epoch": 1.3098347679825864, + "grad_norm": 0.2516917880013272, + "learning_rate": 1.200234542685741e-06, + "loss": 0.58, + "step": 1657 + }, + { + "epoch": 1.3106262986049273, + "grad_norm": 0.24721531887082182, + "learning_rate": 1.1994199805752243e-06, + "loss": 0.5444, + "step": 1658 + }, + { + "epoch": 1.3114178292272682, + "grad_norm": 0.2520269105815416, + "learning_rate": 1.1986052806439589e-06, + "loss": 0.5606, + "step": 1659 + }, + { + "epoch": 1.3122093598496092, + "grad_norm": 0.2350653945276768, + "learning_rate": 1.1977904434549898e-06, + "loss": 0.5516, + "step": 1660 + }, + { + "epoch": 1.3130008904719501, + "grad_norm": 0.24388750167583753, + "learning_rate": 1.1969754695714575e-06, + "loss": 0.5591, + "step": 1661 + }, + { + "epoch": 1.313792421094291, + "grad_norm": 0.252058942375097, + "learning_rate": 1.1961603595565974e-06, + "loss": 0.5503, + "step": 1662 + }, + { + "epoch": 1.314583951716632, + "grad_norm": 0.24874839822478895, + "learning_rate": 1.1953451139737388e-06, + "loss": 0.5686, + "step": 1663 + }, + { + "epoch": 1.315375482338973, + "grad_norm": 0.23075194020607312, + "learning_rate": 1.194529733386304e-06, + "loss": 0.579, + "step": 1664 + }, + { + "epoch": 1.3161670129613139, + "grad_norm": 0.2552051894720154, + "learning_rate": 1.193714218357809e-06, + "loss": 0.5486, + "step": 1665 + }, + { + "epoch": 1.3169585435836548, + "grad_norm": 0.25481829889713686, + "learning_rate": 1.1928985694518627e-06, + "loss": 0.5487, + "step": 1666 + }, + { + "epoch": 1.3177500742059958, + "grad_norm": 0.24478122947030165, + "learning_rate": 1.192082787232167e-06, + "loss": 0.5586, + "step": 1667 + }, + { + "epoch": 1.318541604828337, + "grad_norm": 0.24904610342558647, + "learning_rate": 1.1912668722625157e-06, + "loss": 0.5588, + "step": 1668 + }, + { + "epoch": 1.3193331354506777, + "grad_norm": 0.2529525708987727, + "learning_rate": 1.1904508251067938e-06, + "loss": 0.5873, + "step": 1669 + }, + { + "epoch": 1.3201246660730188, + "grad_norm": 0.2453123415283199, + "learning_rate": 1.1896346463289778e-06, + "loss": 0.5562, + "step": 1670 + }, + { + "epoch": 1.3209161966953595, + "grad_norm": 0.2416389082493275, + "learning_rate": 1.188818336493136e-06, + "loss": 0.5439, + "step": 1671 + }, + { + "epoch": 1.3217077273177007, + "grad_norm": 0.23635897181983642, + "learning_rate": 1.1880018961634261e-06, + "loss": 0.5428, + "step": 1672 + }, + { + "epoch": 1.3224992579400414, + "grad_norm": 0.25313133701478413, + "learning_rate": 1.1871853259040972e-06, + "loss": 0.55, + "step": 1673 + }, + { + "epoch": 1.3232907885623826, + "grad_norm": 0.2415427081507558, + "learning_rate": 1.186368626279487e-06, + "loss": 0.5496, + "step": 1674 + }, + { + "epoch": 1.3240823191847235, + "grad_norm": 0.24239180033556626, + "learning_rate": 1.1855517978540234e-06, + "loss": 0.5612, + "step": 1675 + }, + { + "epoch": 1.3248738498070645, + "grad_norm": 0.24195732545796123, + "learning_rate": 1.1847348411922235e-06, + "loss": 0.5473, + "step": 1676 + }, + { + "epoch": 1.3256653804294054, + "grad_norm": 0.26194573198725696, + "learning_rate": 1.1839177568586918e-06, + "loss": 0.5614, + "step": 1677 + }, + { + "epoch": 1.3264569110517463, + "grad_norm": 0.24902746243757803, + "learning_rate": 1.1831005454181223e-06, + "loss": 0.5561, + "step": 1678 + }, + { + "epoch": 1.3272484416740873, + "grad_norm": 0.24489567020624506, + "learning_rate": 1.1822832074352962e-06, + "loss": 0.5507, + "step": 1679 + }, + { + "epoch": 1.3280399722964282, + "grad_norm": 0.25648305921788134, + "learning_rate": 1.1814657434750825e-06, + "loss": 0.5772, + "step": 1680 + }, + { + "epoch": 1.3288315029187692, + "grad_norm": 0.24089216814733655, + "learning_rate": 1.1806481541024371e-06, + "loss": 0.5544, + "step": 1681 + }, + { + "epoch": 1.32962303354111, + "grad_norm": 0.24541187842548162, + "learning_rate": 1.1798304398824024e-06, + "loss": 0.5487, + "step": 1682 + }, + { + "epoch": 1.330414564163451, + "grad_norm": 0.24598070389513868, + "learning_rate": 1.1790126013801074e-06, + "loss": 0.5645, + "step": 1683 + }, + { + "epoch": 1.331206094785792, + "grad_norm": 0.24629092938401864, + "learning_rate": 1.1781946391607672e-06, + "loss": 0.5656, + "step": 1684 + }, + { + "epoch": 1.331997625408133, + "grad_norm": 0.24923038130366343, + "learning_rate": 1.177376553789681e-06, + "loss": 0.5606, + "step": 1685 + }, + { + "epoch": 1.3327891560304739, + "grad_norm": 0.24311655903151758, + "learning_rate": 1.1765583458322354e-06, + "loss": 0.5614, + "step": 1686 + }, + { + "epoch": 1.3335806866528148, + "grad_norm": 0.24387615776609642, + "learning_rate": 1.1757400158538997e-06, + "loss": 0.5603, + "step": 1687 + }, + { + "epoch": 1.3343722172751558, + "grad_norm": 0.25462270378559737, + "learning_rate": 1.1749215644202286e-06, + "loss": 0.5749, + "step": 1688 + }, + { + "epoch": 1.335163747897497, + "grad_norm": 0.24532158932525416, + "learning_rate": 1.1741029920968605e-06, + "loss": 0.5583, + "step": 1689 + }, + { + "epoch": 1.3359552785198376, + "grad_norm": 0.24808814626487177, + "learning_rate": 1.1732842994495172e-06, + "loss": 0.5477, + "step": 1690 + }, + { + "epoch": 1.3367468091421788, + "grad_norm": 0.26179824952528297, + "learning_rate": 1.1724654870440036e-06, + "loss": 0.5657, + "step": 1691 + }, + { + "epoch": 1.3375383397645195, + "grad_norm": 0.23685666923300686, + "learning_rate": 1.171646555446208e-06, + "loss": 0.549, + "step": 1692 + }, + { + "epoch": 1.3383298703868607, + "grad_norm": 0.2514157285787257, + "learning_rate": 1.1708275052221e-06, + "loss": 0.558, + "step": 1693 + }, + { + "epoch": 1.3391214010092014, + "grad_norm": 0.2435334830438849, + "learning_rate": 1.1700083369377326e-06, + "loss": 0.56, + "step": 1694 + }, + { + "epoch": 1.3399129316315426, + "grad_norm": 0.24647470455772713, + "learning_rate": 1.1691890511592383e-06, + "loss": 0.5529, + "step": 1695 + }, + { + "epoch": 1.3407044622538835, + "grad_norm": 0.257918680442798, + "learning_rate": 1.1683696484528335e-06, + "loss": 0.5645, + "step": 1696 + }, + { + "epoch": 1.3414959928762245, + "grad_norm": 0.24001912836512246, + "learning_rate": 1.167550129384813e-06, + "loss": 0.569, + "step": 1697 + }, + { + "epoch": 1.3422875234985654, + "grad_norm": 0.25045209939003, + "learning_rate": 1.1667304945215534e-06, + "loss": 0.5629, + "step": 1698 + }, + { + "epoch": 1.3430790541209063, + "grad_norm": 0.24377854341294275, + "learning_rate": 1.1659107444295106e-06, + "loss": 0.5714, + "step": 1699 + }, + { + "epoch": 1.3438705847432473, + "grad_norm": 0.25114587450270015, + "learning_rate": 1.1650908796752213e-06, + "loss": 0.5643, + "step": 1700 + }, + { + "epoch": 1.3446621153655882, + "grad_norm": 0.23726228094488144, + "learning_rate": 1.1642709008252995e-06, + "loss": 0.5572, + "step": 1701 + }, + { + "epoch": 1.3454536459879292, + "grad_norm": 0.2570557148889931, + "learning_rate": 1.1634508084464402e-06, + "loss": 0.5536, + "step": 1702 + }, + { + "epoch": 1.34624517661027, + "grad_norm": 0.24462330596739756, + "learning_rate": 1.162630603105415e-06, + "loss": 0.5702, + "step": 1703 + }, + { + "epoch": 1.347036707232611, + "grad_norm": 0.24375145390909486, + "learning_rate": 1.1618102853690747e-06, + "loss": 0.5644, + "step": 1704 + }, + { + "epoch": 1.347828237854952, + "grad_norm": 0.24861484106318527, + "learning_rate": 1.1609898558043477e-06, + "loss": 0.5558, + "step": 1705 + }, + { + "epoch": 1.348619768477293, + "grad_norm": 0.24589755830104046, + "learning_rate": 1.1601693149782392e-06, + "loss": 0.5548, + "step": 1706 + }, + { + "epoch": 1.3494112990996339, + "grad_norm": 0.2405777155845157, + "learning_rate": 1.1593486634578316e-06, + "loss": 0.5659, + "step": 1707 + }, + { + "epoch": 1.3502028297219748, + "grad_norm": 0.2525715780086026, + "learning_rate": 1.1585279018102837e-06, + "loss": 0.5526, + "step": 1708 + }, + { + "epoch": 1.3509943603443157, + "grad_norm": 0.2482565319576305, + "learning_rate": 1.1577070306028302e-06, + "loss": 0.5443, + "step": 1709 + }, + { + "epoch": 1.351785890966657, + "grad_norm": 0.2447384221840171, + "learning_rate": 1.1568860504027827e-06, + "loss": 0.5533, + "step": 1710 + }, + { + "epoch": 1.3525774215889976, + "grad_norm": 0.24922245484056646, + "learning_rate": 1.1560649617775262e-06, + "loss": 0.5671, + "step": 1711 + }, + { + "epoch": 1.3533689522113388, + "grad_norm": 0.24148496427549768, + "learning_rate": 1.1552437652945223e-06, + "loss": 0.5564, + "step": 1712 + }, + { + "epoch": 1.3541604828336795, + "grad_norm": 0.2406317622445464, + "learning_rate": 1.154422461521306e-06, + "loss": 0.5392, + "step": 1713 + }, + { + "epoch": 1.3549520134560207, + "grad_norm": 0.244910794346466, + "learning_rate": 1.1536010510254872e-06, + "loss": 0.5642, + "step": 1714 + }, + { + "epoch": 1.3557435440783614, + "grad_norm": 0.24531774867589026, + "learning_rate": 1.1527795343747493e-06, + "loss": 0.5609, + "step": 1715 + }, + { + "epoch": 1.3565350747007026, + "grad_norm": 0.24205552864277713, + "learning_rate": 1.151957912136849e-06, + "loss": 0.5609, + "step": 1716 + }, + { + "epoch": 1.3573266053230435, + "grad_norm": 0.245561835662967, + "learning_rate": 1.151136184879616e-06, + "loss": 0.5704, + "step": 1717 + }, + { + "epoch": 1.3581181359453844, + "grad_norm": 0.23983199229942967, + "learning_rate": 1.1503143531709527e-06, + "loss": 0.5676, + "step": 1718 + }, + { + "epoch": 1.3589096665677254, + "grad_norm": 0.24191950280705846, + "learning_rate": 1.1494924175788332e-06, + "loss": 0.5805, + "step": 1719 + }, + { + "epoch": 1.3597011971900663, + "grad_norm": 0.24827783815317256, + "learning_rate": 1.1486703786713043e-06, + "loss": 0.5605, + "step": 1720 + }, + { + "epoch": 1.3604927278124073, + "grad_norm": 0.24507837042657588, + "learning_rate": 1.1478482370164837e-06, + "loss": 0.5518, + "step": 1721 + }, + { + "epoch": 1.3612842584347482, + "grad_norm": 0.24947413634372814, + "learning_rate": 1.1470259931825595e-06, + "loss": 0.5432, + "step": 1722 + }, + { + "epoch": 1.3620757890570891, + "grad_norm": 0.2426407520934941, + "learning_rate": 1.1462036477377917e-06, + "loss": 0.558, + "step": 1723 + }, + { + "epoch": 1.36286731967943, + "grad_norm": 0.2412205789291043, + "learning_rate": 1.1453812012505091e-06, + "loss": 0.566, + "step": 1724 + }, + { + "epoch": 1.363658850301771, + "grad_norm": 0.24108633129742646, + "learning_rate": 1.144558654289112e-06, + "loss": 0.5559, + "step": 1725 + }, + { + "epoch": 1.364450380924112, + "grad_norm": 0.23714632161758006, + "learning_rate": 1.1437360074220683e-06, + "loss": 0.5488, + "step": 1726 + }, + { + "epoch": 1.365241911546453, + "grad_norm": 0.24382604336059036, + "learning_rate": 1.1429132612179164e-06, + "loss": 0.5615, + "step": 1727 + }, + { + "epoch": 1.3660334421687939, + "grad_norm": 0.23894135962205929, + "learning_rate": 1.1420904162452626e-06, + "loss": 0.5655, + "step": 1728 + }, + { + "epoch": 1.3668249727911348, + "grad_norm": 0.24385031573903024, + "learning_rate": 1.1412674730727818e-06, + "loss": 0.5459, + "step": 1729 + }, + { + "epoch": 1.3676165034134757, + "grad_norm": 0.24695290459092584, + "learning_rate": 1.1404444322692167e-06, + "loss": 0.5648, + "step": 1730 + }, + { + "epoch": 1.3684080340358167, + "grad_norm": 0.24493741494489846, + "learning_rate": 1.139621294403377e-06, + "loss": 0.5676, + "step": 1731 + }, + { + "epoch": 1.3691995646581576, + "grad_norm": 0.24886579934473138, + "learning_rate": 1.1387980600441403e-06, + "loss": 0.5616, + "step": 1732 + }, + { + "epoch": 1.3699910952804988, + "grad_norm": 0.2475984162720158, + "learning_rate": 1.1379747297604502e-06, + "loss": 0.5557, + "step": 1733 + }, + { + "epoch": 1.3707826259028395, + "grad_norm": 0.24074846356777077, + "learning_rate": 1.1371513041213168e-06, + "loss": 0.5468, + "step": 1734 + }, + { + "epoch": 1.3715741565251807, + "grad_norm": 0.2481390305891257, + "learning_rate": 1.1363277836958167e-06, + "loss": 0.5719, + "step": 1735 + }, + { + "epoch": 1.3723656871475214, + "grad_norm": 0.24410534282618412, + "learning_rate": 1.1355041690530911e-06, + "loss": 0.5489, + "step": 1736 + }, + { + "epoch": 1.3731572177698625, + "grad_norm": 0.251570922054409, + "learning_rate": 1.1346804607623463e-06, + "loss": 0.5585, + "step": 1737 + }, + { + "epoch": 1.3739487483922035, + "grad_norm": 0.24773556027804694, + "learning_rate": 1.1338566593928544e-06, + "loss": 0.5665, + "step": 1738 + }, + { + "epoch": 1.3747402790145444, + "grad_norm": 0.2523851567742354, + "learning_rate": 1.1330327655139512e-06, + "loss": 0.5585, + "step": 1739 + }, + { + "epoch": 1.3755318096368854, + "grad_norm": 0.24722026729991606, + "learning_rate": 1.1322087796950358e-06, + "loss": 0.5665, + "step": 1740 + }, + { + "epoch": 1.3763233402592263, + "grad_norm": 0.24359214618999236, + "learning_rate": 1.1313847025055716e-06, + "loss": 0.539, + "step": 1741 + }, + { + "epoch": 1.3771148708815673, + "grad_norm": 0.24823250980538344, + "learning_rate": 1.1305605345150848e-06, + "loss": 0.5679, + "step": 1742 + }, + { + "epoch": 1.3779064015039082, + "grad_norm": 0.24950308412089048, + "learning_rate": 1.1297362762931647e-06, + "loss": 0.5513, + "step": 1743 + }, + { + "epoch": 1.3786979321262491, + "grad_norm": 0.24984903403329928, + "learning_rate": 1.1289119284094633e-06, + "loss": 0.5708, + "step": 1744 + }, + { + "epoch": 1.37948946274859, + "grad_norm": 0.2540393327158127, + "learning_rate": 1.128087491433693e-06, + "loss": 0.5502, + "step": 1745 + }, + { + "epoch": 1.380280993370931, + "grad_norm": 0.2397092671429131, + "learning_rate": 1.1272629659356296e-06, + "loss": 0.5514, + "step": 1746 + }, + { + "epoch": 1.381072523993272, + "grad_norm": 0.2545424160596748, + "learning_rate": 1.126438352485109e-06, + "loss": 0.5594, + "step": 1747 + }, + { + "epoch": 1.381864054615613, + "grad_norm": 0.25389933261603476, + "learning_rate": 1.1256136516520283e-06, + "loss": 0.553, + "step": 1748 + }, + { + "epoch": 1.3826555852379538, + "grad_norm": 0.25595186731390124, + "learning_rate": 1.124788864006345e-06, + "loss": 0.5545, + "step": 1749 + }, + { + "epoch": 1.3834471158602948, + "grad_norm": 0.2538215322222843, + "learning_rate": 1.1239639901180757e-06, + "loss": 0.552, + "step": 1750 + }, + { + "epoch": 1.3842386464826357, + "grad_norm": 0.2434561664825095, + "learning_rate": 1.1231390305572981e-06, + "loss": 0.5571, + "step": 1751 + }, + { + "epoch": 1.3850301771049767, + "grad_norm": 0.2515826753187997, + "learning_rate": 1.1223139858941481e-06, + "loss": 0.5621, + "step": 1752 + }, + { + "epoch": 1.3858217077273176, + "grad_norm": 0.24304199261022497, + "learning_rate": 1.121488856698821e-06, + "loss": 0.5636, + "step": 1753 + }, + { + "epoch": 1.3866132383496588, + "grad_norm": 0.249221543401245, + "learning_rate": 1.1206636435415696e-06, + "loss": 0.5621, + "step": 1754 + }, + { + "epoch": 1.3874047689719995, + "grad_norm": 0.25381314826168094, + "learning_rate": 1.1198383469927061e-06, + "loss": 0.5834, + "step": 1755 + }, + { + "epoch": 1.3881962995943407, + "grad_norm": 0.2461090617556699, + "learning_rate": 1.119012967622599e-06, + "loss": 0.5696, + "step": 1756 + }, + { + "epoch": 1.3889878302166814, + "grad_norm": 0.2522213466470216, + "learning_rate": 1.118187506001675e-06, + "loss": 0.5501, + "step": 1757 + }, + { + "epoch": 1.3897793608390225, + "grad_norm": 0.24781293816561925, + "learning_rate": 1.1173619627004168e-06, + "loss": 0.5707, + "step": 1758 + }, + { + "epoch": 1.3905708914613635, + "grad_norm": 0.24020257431457925, + "learning_rate": 1.1165363382893642e-06, + "loss": 0.5676, + "step": 1759 + }, + { + "epoch": 1.3913624220837044, + "grad_norm": 0.2501915603787139, + "learning_rate": 1.1157106333391131e-06, + "loss": 0.5584, + "step": 1760 + }, + { + "epoch": 1.3921539527060454, + "grad_norm": 0.2473703246046045, + "learning_rate": 1.1148848484203143e-06, + "loss": 0.5645, + "step": 1761 + }, + { + "epoch": 1.3929454833283863, + "grad_norm": 0.23890245427011528, + "learning_rate": 1.1140589841036749e-06, + "loss": 0.5673, + "step": 1762 + }, + { + "epoch": 1.3937370139507272, + "grad_norm": 0.23776869694051625, + "learning_rate": 1.1132330409599562e-06, + "loss": 0.574, + "step": 1763 + }, + { + "epoch": 1.3945285445730682, + "grad_norm": 0.23519164494572908, + "learning_rate": 1.1124070195599738e-06, + "loss": 0.5553, + "step": 1764 + }, + { + "epoch": 1.3953200751954091, + "grad_norm": 0.23840089959621127, + "learning_rate": 1.1115809204745982e-06, + "loss": 0.5665, + "step": 1765 + }, + { + "epoch": 1.39611160581775, + "grad_norm": 0.25152689581922855, + "learning_rate": 1.1107547442747529e-06, + "loss": 0.5474, + "step": 1766 + }, + { + "epoch": 1.396903136440091, + "grad_norm": 0.2470266862599932, + "learning_rate": 1.1099284915314152e-06, + "loss": 0.5752, + "step": 1767 + }, + { + "epoch": 1.397694667062432, + "grad_norm": 0.2412530565253777, + "learning_rate": 1.1091021628156143e-06, + "loss": 0.5498, + "step": 1768 + }, + { + "epoch": 1.398486197684773, + "grad_norm": 0.23721111685798252, + "learning_rate": 1.108275758698433e-06, + "loss": 0.5464, + "step": 1769 + }, + { + "epoch": 1.3992777283071138, + "grad_norm": 0.24351562526834195, + "learning_rate": 1.1074492797510062e-06, + "loss": 0.5564, + "step": 1770 + }, + { + "epoch": 1.4000692589294548, + "grad_norm": 0.25588642956275404, + "learning_rate": 1.1066227265445196e-06, + "loss": 0.5474, + "step": 1771 + }, + { + "epoch": 1.4008607895517957, + "grad_norm": 0.24325585961125706, + "learning_rate": 1.1057960996502109e-06, + "loss": 0.5559, + "step": 1772 + }, + { + "epoch": 1.4016523201741367, + "grad_norm": 0.24380364989827122, + "learning_rate": 1.1049693996393687e-06, + "loss": 0.5648, + "step": 1773 + }, + { + "epoch": 1.4024438507964776, + "grad_norm": 0.23597633461039252, + "learning_rate": 1.1041426270833318e-06, + "loss": 0.5673, + "step": 1774 + }, + { + "epoch": 1.4032353814188188, + "grad_norm": 0.24278524172378807, + "learning_rate": 1.1033157825534896e-06, + "loss": 0.551, + "step": 1775 + }, + { + "epoch": 1.4040269120411595, + "grad_norm": 0.25289497514364, + "learning_rate": 1.1024888666212808e-06, + "loss": 0.5515, + "step": 1776 + }, + { + "epoch": 1.4048184426635006, + "grad_norm": 0.2532208606043583, + "learning_rate": 1.1016618798581937e-06, + "loss": 0.5692, + "step": 1777 + }, + { + "epoch": 1.4056099732858414, + "grad_norm": 0.25692567593300203, + "learning_rate": 1.1008348228357657e-06, + "loss": 0.5424, + "step": 1778 + }, + { + "epoch": 1.4064015039081825, + "grad_norm": 0.23890317619691112, + "learning_rate": 1.1000076961255822e-06, + "loss": 0.559, + "step": 1779 + }, + { + "epoch": 1.4071930345305235, + "grad_norm": 0.2485982325211479, + "learning_rate": 1.099180500299277e-06, + "loss": 0.559, + "step": 1780 + }, + { + "epoch": 1.4079845651528644, + "grad_norm": 0.25210770277351785, + "learning_rate": 1.0983532359285324e-06, + "loss": 0.5713, + "step": 1781 + }, + { + "epoch": 1.4087760957752054, + "grad_norm": 0.2553458928622144, + "learning_rate": 1.097525903585077e-06, + "loss": 0.5756, + "step": 1782 + }, + { + "epoch": 1.4095676263975463, + "grad_norm": 0.2511017029175708, + "learning_rate": 1.0966985038406868e-06, + "loss": 0.5518, + "step": 1783 + }, + { + "epoch": 1.4103591570198872, + "grad_norm": 0.2475404708066992, + "learning_rate": 1.0958710372671845e-06, + "loss": 0.5637, + "step": 1784 + }, + { + "epoch": 1.4111506876422282, + "grad_norm": 0.23797302343362914, + "learning_rate": 1.0950435044364388e-06, + "loss": 0.5518, + "step": 1785 + }, + { + "epoch": 1.4119422182645691, + "grad_norm": 0.24656859756288255, + "learning_rate": 1.0942159059203644e-06, + "loss": 0.5431, + "step": 1786 + }, + { + "epoch": 1.41273374888691, + "grad_norm": 0.24107739978828868, + "learning_rate": 1.0933882422909216e-06, + "loss": 0.5567, + "step": 1787 + }, + { + "epoch": 1.413525279509251, + "grad_norm": 0.2356718550731877, + "learning_rate": 1.0925605141201144e-06, + "loss": 0.556, + "step": 1788 + }, + { + "epoch": 1.414316810131592, + "grad_norm": 0.24175815505353185, + "learning_rate": 1.0917327219799935e-06, + "loss": 0.5672, + "step": 1789 + }, + { + "epoch": 1.4151083407539329, + "grad_norm": 0.259273210045425, + "learning_rate": 1.0909048664426519e-06, + "loss": 0.5474, + "step": 1790 + }, + { + "epoch": 1.4158998713762738, + "grad_norm": 0.2452236914724908, + "learning_rate": 1.0900769480802278e-06, + "loss": 0.5657, + "step": 1791 + }, + { + "epoch": 1.4166914019986148, + "grad_norm": 0.24380905191208496, + "learning_rate": 1.0892489674649016e-06, + "loss": 0.5725, + "step": 1792 + }, + { + "epoch": 1.4174829326209557, + "grad_norm": 0.2509559233325704, + "learning_rate": 1.0884209251688978e-06, + "loss": 0.5376, + "step": 1793 + }, + { + "epoch": 1.4182744632432966, + "grad_norm": 0.24322450352928576, + "learning_rate": 1.0875928217644829e-06, + "loss": 0.5681, + "step": 1794 + }, + { + "epoch": 1.4190659938656376, + "grad_norm": 0.2504916663222412, + "learning_rate": 1.086764657823966e-06, + "loss": 0.5512, + "step": 1795 + }, + { + "epoch": 1.4198575244879788, + "grad_norm": 0.267150023425375, + "learning_rate": 1.0859364339196978e-06, + "loss": 0.5625, + "step": 1796 + }, + { + "epoch": 1.4206490551103195, + "grad_norm": 0.24717884320440905, + "learning_rate": 1.0851081506240703e-06, + "loss": 0.5569, + "step": 1797 + }, + { + "epoch": 1.4214405857326606, + "grad_norm": 0.2446443226487652, + "learning_rate": 1.0842798085095165e-06, + "loss": 0.5647, + "step": 1798 + }, + { + "epoch": 1.4222321163550014, + "grad_norm": 0.25692581921323493, + "learning_rate": 1.0834514081485111e-06, + "loss": 0.5653, + "step": 1799 + }, + { + "epoch": 1.4230236469773425, + "grad_norm": 0.2439742451677067, + "learning_rate": 1.0826229501135675e-06, + "loss": 0.5434, + "step": 1800 + }, + { + "epoch": 1.4238151775996835, + "grad_norm": 0.25058440196747644, + "learning_rate": 1.0817944349772398e-06, + "loss": 0.5554, + "step": 1801 + }, + { + "epoch": 1.4246067082220244, + "grad_norm": 0.25052414894956365, + "learning_rate": 1.0809658633121222e-06, + "loss": 0.5813, + "step": 1802 + }, + { + "epoch": 1.4253982388443653, + "grad_norm": 0.2551531754057182, + "learning_rate": 1.0801372356908461e-06, + "loss": 0.5689, + "step": 1803 + }, + { + "epoch": 1.4261897694667063, + "grad_norm": 0.24395819104907066, + "learning_rate": 1.0793085526860832e-06, + "loss": 0.55, + "step": 1804 + }, + { + "epoch": 1.4269813000890472, + "grad_norm": 0.253654833757268, + "learning_rate": 1.0784798148705432e-06, + "loss": 0.5646, + "step": 1805 + }, + { + "epoch": 1.4277728307113882, + "grad_norm": 0.2472594700686716, + "learning_rate": 1.0776510228169733e-06, + "loss": 0.5604, + "step": 1806 + }, + { + "epoch": 1.428564361333729, + "grad_norm": 0.2570127725543254, + "learning_rate": 1.0768221770981583e-06, + "loss": 0.5665, + "step": 1807 + }, + { + "epoch": 1.42935589195607, + "grad_norm": 0.25219949596877267, + "learning_rate": 1.0759932782869198e-06, + "loss": 0.5668, + "step": 1808 + }, + { + "epoch": 1.430147422578411, + "grad_norm": 0.24888935758981642, + "learning_rate": 1.075164326956117e-06, + "loss": 0.565, + "step": 1809 + }, + { + "epoch": 1.430938953200752, + "grad_norm": 0.2508456731183149, + "learning_rate": 1.0743353236786449e-06, + "loss": 0.5429, + "step": 1810 + }, + { + "epoch": 1.4317304838230929, + "grad_norm": 0.25010152903566374, + "learning_rate": 1.0735062690274337e-06, + "loss": 0.5517, + "step": 1811 + }, + { + "epoch": 1.4325220144454338, + "grad_norm": 0.2623661368348324, + "learning_rate": 1.0726771635754502e-06, + "loss": 0.5554, + "step": 1812 + }, + { + "epoch": 1.4333135450677748, + "grad_norm": 0.26566701270731186, + "learning_rate": 1.0718480078956952e-06, + "loss": 0.5676, + "step": 1813 + }, + { + "epoch": 1.4341050756901157, + "grad_norm": 0.2582041514202188, + "learning_rate": 1.0710188025612055e-06, + "loss": 0.5643, + "step": 1814 + }, + { + "epoch": 1.4348966063124566, + "grad_norm": 0.24095795169398565, + "learning_rate": 1.0701895481450516e-06, + "loss": 0.577, + "step": 1815 + }, + { + "epoch": 1.4356881369347976, + "grad_norm": 0.24552980144547748, + "learning_rate": 1.0693602452203374e-06, + "loss": 0.5434, + "step": 1816 + }, + { + "epoch": 1.4364796675571387, + "grad_norm": 0.24802734115308014, + "learning_rate": 1.0685308943602013e-06, + "loss": 0.5495, + "step": 1817 + }, + { + "epoch": 1.4372711981794795, + "grad_norm": 0.24063232282451458, + "learning_rate": 1.0677014961378136e-06, + "loss": 0.5643, + "step": 1818 + }, + { + "epoch": 1.4380627288018206, + "grad_norm": 0.26674826901817056, + "learning_rate": 1.0668720511263786e-06, + "loss": 0.5563, + "step": 1819 + }, + { + "epoch": 1.4388542594241613, + "grad_norm": 0.2519268868939359, + "learning_rate": 1.0660425598991333e-06, + "loss": 0.5708, + "step": 1820 + }, + { + "epoch": 1.4396457900465025, + "grad_norm": 0.25537139429719885, + "learning_rate": 1.0652130230293437e-06, + "loss": 0.5528, + "step": 1821 + }, + { + "epoch": 1.4404373206688434, + "grad_norm": 0.24245967377929403, + "learning_rate": 1.0643834410903108e-06, + "loss": 0.5683, + "step": 1822 + }, + { + "epoch": 1.4412288512911844, + "grad_norm": 0.24700653920398574, + "learning_rate": 1.063553814655365e-06, + "loss": 0.5645, + "step": 1823 + }, + { + "epoch": 1.4420203819135253, + "grad_norm": 0.25569340348431774, + "learning_rate": 1.062724144297868e-06, + "loss": 0.5684, + "step": 1824 + }, + { + "epoch": 1.4428119125358663, + "grad_norm": 0.2442426148762711, + "learning_rate": 1.0618944305912117e-06, + "loss": 0.5611, + "step": 1825 + }, + { + "epoch": 1.4436034431582072, + "grad_norm": 0.24668468930856702, + "learning_rate": 1.0610646741088171e-06, + "loss": 0.5542, + "step": 1826 + }, + { + "epoch": 1.4443949737805482, + "grad_norm": 0.24180925399348951, + "learning_rate": 1.060234875424136e-06, + "loss": 0.5496, + "step": 1827 + }, + { + "epoch": 1.445186504402889, + "grad_norm": 0.24548292791451223, + "learning_rate": 1.0594050351106495e-06, + "loss": 0.5429, + "step": 1828 + }, + { + "epoch": 1.44597803502523, + "grad_norm": 0.2457166387439888, + "learning_rate": 1.0585751537418665e-06, + "loss": 0.5571, + "step": 1829 + }, + { + "epoch": 1.446769565647571, + "grad_norm": 0.2517923689801228, + "learning_rate": 1.0577452318913244e-06, + "loss": 0.5599, + "step": 1830 + }, + { + "epoch": 1.447561096269912, + "grad_norm": 0.2434257429077564, + "learning_rate": 1.0569152701325891e-06, + "loss": 0.5687, + "step": 1831 + }, + { + "epoch": 1.4483526268922529, + "grad_norm": 0.2536063863905166, + "learning_rate": 1.0560852690392538e-06, + "loss": 0.569, + "step": 1832 + }, + { + "epoch": 1.4491441575145938, + "grad_norm": 0.24997586186843135, + "learning_rate": 1.055255229184939e-06, + "loss": 0.5678, + "step": 1833 + }, + { + "epoch": 1.4499356881369347, + "grad_norm": 0.2394202668192573, + "learning_rate": 1.054425151143292e-06, + "loss": 0.5684, + "step": 1834 + }, + { + "epoch": 1.4507272187592757, + "grad_norm": 0.24622681685601036, + "learning_rate": 1.053595035487986e-06, + "loss": 0.56, + "step": 1835 + }, + { + "epoch": 1.4515187493816166, + "grad_norm": 0.24766022547943686, + "learning_rate": 1.052764882792721e-06, + "loss": 0.5507, + "step": 1836 + }, + { + "epoch": 1.4523102800039576, + "grad_norm": 0.24766891264630292, + "learning_rate": 1.0519346936312217e-06, + "loss": 0.5534, + "step": 1837 + }, + { + "epoch": 1.4531018106262987, + "grad_norm": 0.24371750012654173, + "learning_rate": 1.051104468577239e-06, + "loss": 0.5603, + "step": 1838 + }, + { + "epoch": 1.4538933412486394, + "grad_norm": 0.24773989442809743, + "learning_rate": 1.0502742082045478e-06, + "loss": 0.5482, + "step": 1839 + }, + { + "epoch": 1.4546848718709806, + "grad_norm": 0.24593443044170213, + "learning_rate": 1.049443913086948e-06, + "loss": 0.5613, + "step": 1840 + }, + { + "epoch": 1.4554764024933213, + "grad_norm": 0.2520294939428411, + "learning_rate": 1.048613583798263e-06, + "loss": 0.55, + "step": 1841 + }, + { + "epoch": 1.4562679331156625, + "grad_norm": 0.2568090128922785, + "learning_rate": 1.0477832209123399e-06, + "loss": 0.562, + "step": 1842 + }, + { + "epoch": 1.4570594637380034, + "grad_norm": 0.24861420329563133, + "learning_rate": 1.0469528250030495e-06, + "loss": 0.5554, + "step": 1843 + }, + { + "epoch": 1.4578509943603444, + "grad_norm": 0.2397636208883377, + "learning_rate": 1.046122396644285e-06, + "loss": 0.5527, + "step": 1844 + }, + { + "epoch": 1.4586425249826853, + "grad_norm": 0.25211289237402956, + "learning_rate": 1.0452919364099616e-06, + "loss": 0.5578, + "step": 1845 + }, + { + "epoch": 1.4594340556050263, + "grad_norm": 0.2622014777508393, + "learning_rate": 1.0444614448740177e-06, + "loss": 0.5712, + "step": 1846 + }, + { + "epoch": 1.4602255862273672, + "grad_norm": 0.24645617140580847, + "learning_rate": 1.0436309226104124e-06, + "loss": 0.5594, + "step": 1847 + }, + { + "epoch": 1.4610171168497081, + "grad_norm": 0.2459939996889011, + "learning_rate": 1.0428003701931263e-06, + "loss": 0.5311, + "step": 1848 + }, + { + "epoch": 1.461808647472049, + "grad_norm": 0.24216005432750698, + "learning_rate": 1.0419697881961606e-06, + "loss": 0.5449, + "step": 1849 + }, + { + "epoch": 1.46260017809439, + "grad_norm": 0.24265662602239907, + "learning_rate": 1.0411391771935377e-06, + "loss": 0.5483, + "step": 1850 + }, + { + "epoch": 1.463391708716731, + "grad_norm": 0.24575939677213582, + "learning_rate": 1.0403085377592991e-06, + "loss": 0.5509, + "step": 1851 + }, + { + "epoch": 1.464183239339072, + "grad_norm": 0.23687738850752296, + "learning_rate": 1.0394778704675066e-06, + "loss": 0.5555, + "step": 1852 + }, + { + "epoch": 1.4649747699614128, + "grad_norm": 0.2499892511365357, + "learning_rate": 1.038647175892241e-06, + "loss": 0.5551, + "step": 1853 + }, + { + "epoch": 1.4657663005837538, + "grad_norm": 0.24192706787427268, + "learning_rate": 1.0378164546076022e-06, + "loss": 0.5658, + "step": 1854 + }, + { + "epoch": 1.4665578312060947, + "grad_norm": 0.23184326927057436, + "learning_rate": 1.0369857071877077e-06, + "loss": 0.5574, + "step": 1855 + }, + { + "epoch": 1.4673493618284357, + "grad_norm": 0.2687981320629577, + "learning_rate": 1.0361549342066942e-06, + "loss": 0.5525, + "step": 1856 + }, + { + "epoch": 1.4681408924507766, + "grad_norm": 0.24080649133852142, + "learning_rate": 1.0353241362387156e-06, + "loss": 0.5472, + "step": 1857 + }, + { + "epoch": 1.4689324230731176, + "grad_norm": 0.24991388863098019, + "learning_rate": 1.034493313857943e-06, + "loss": 0.5484, + "step": 1858 + }, + { + "epoch": 1.4697239536954587, + "grad_norm": 0.2645689623760348, + "learning_rate": 1.0336624676385644e-06, + "loss": 0.5531, + "step": 1859 + }, + { + "epoch": 1.4705154843177994, + "grad_norm": 0.26137992808052546, + "learning_rate": 1.032831598154784e-06, + "loss": 0.5653, + "step": 1860 + }, + { + "epoch": 1.4713070149401406, + "grad_norm": 0.25109005763019854, + "learning_rate": 1.0320007059808226e-06, + "loss": 0.5462, + "step": 1861 + }, + { + "epoch": 1.4720985455624813, + "grad_norm": 0.2438954035418305, + "learning_rate": 1.0311697916909165e-06, + "loss": 0.5605, + "step": 1862 + }, + { + "epoch": 1.4728900761848225, + "grad_norm": 0.24324977710114248, + "learning_rate": 1.0303388558593176e-06, + "loss": 0.5494, + "step": 1863 + }, + { + "epoch": 1.4736816068071634, + "grad_norm": 0.24616857673174009, + "learning_rate": 1.0295078990602914e-06, + "loss": 0.555, + "step": 1864 + }, + { + "epoch": 1.4744731374295044, + "grad_norm": 0.24526355098388605, + "learning_rate": 1.0286769218681194e-06, + "loss": 0.5538, + "step": 1865 + }, + { + "epoch": 1.4752646680518453, + "grad_norm": 0.2522719853942459, + "learning_rate": 1.0278459248570968e-06, + "loss": 0.5632, + "step": 1866 + }, + { + "epoch": 1.4760561986741862, + "grad_norm": 0.24668814131099567, + "learning_rate": 1.027014908601532e-06, + "loss": 0.557, + "step": 1867 + }, + { + "epoch": 1.4768477292965272, + "grad_norm": 0.24958807685657602, + "learning_rate": 1.0261838736757469e-06, + "loss": 0.5638, + "step": 1868 + }, + { + "epoch": 1.4776392599188681, + "grad_norm": 0.2416674442464471, + "learning_rate": 1.0253528206540764e-06, + "loss": 0.5408, + "step": 1869 + }, + { + "epoch": 1.478430790541209, + "grad_norm": 0.24183631113589926, + "learning_rate": 1.0245217501108686e-06, + "loss": 0.5465, + "step": 1870 + }, + { + "epoch": 1.47922232116355, + "grad_norm": 0.24684608161971405, + "learning_rate": 1.0236906626204818e-06, + "loss": 0.5679, + "step": 1871 + }, + { + "epoch": 1.480013851785891, + "grad_norm": 0.2577969021987222, + "learning_rate": 1.0228595587572886e-06, + "loss": 0.5726, + "step": 1872 + }, + { + "epoch": 1.480805382408232, + "grad_norm": 0.24719018121256667, + "learning_rate": 1.0220284390956703e-06, + "loss": 0.5541, + "step": 1873 + }, + { + "epoch": 1.4815969130305728, + "grad_norm": 0.24327660265334178, + "learning_rate": 1.0211973042100212e-06, + "loss": 0.5593, + "step": 1874 + }, + { + "epoch": 1.4823884436529138, + "grad_norm": 0.24716476661970774, + "learning_rate": 1.020366154674745e-06, + "loss": 0.5582, + "step": 1875 + }, + { + "epoch": 1.4831799742752547, + "grad_norm": 0.24889099861675523, + "learning_rate": 1.019534991064256e-06, + "loss": 0.5617, + "step": 1876 + }, + { + "epoch": 1.4839715048975957, + "grad_norm": 0.24104354323552155, + "learning_rate": 1.0187038139529775e-06, + "loss": 0.54, + "step": 1877 + }, + { + "epoch": 1.4847630355199366, + "grad_norm": 0.25631507617820665, + "learning_rate": 1.0178726239153435e-06, + "loss": 0.5551, + "step": 1878 + }, + { + "epoch": 1.4855545661422775, + "grad_norm": 0.24265480781006588, + "learning_rate": 1.0170414215257955e-06, + "loss": 0.5538, + "step": 1879 + }, + { + "epoch": 1.4863460967646187, + "grad_norm": 0.2571920866849079, + "learning_rate": 1.016210207358784e-06, + "loss": 0.5477, + "step": 1880 + }, + { + "epoch": 1.4871376273869594, + "grad_norm": 0.25687643816628813, + "learning_rate": 1.0153789819887688e-06, + "loss": 0.5546, + "step": 1881 + }, + { + "epoch": 1.4879291580093006, + "grad_norm": 0.24634232950761223, + "learning_rate": 1.014547745990215e-06, + "loss": 0.5716, + "step": 1882 + }, + { + "epoch": 1.4887206886316413, + "grad_norm": 0.24512103236922167, + "learning_rate": 1.0137164999375977e-06, + "loss": 0.5591, + "step": 1883 + }, + { + "epoch": 1.4895122192539825, + "grad_norm": 0.24374321098015242, + "learning_rate": 1.0128852444053969e-06, + "loss": 0.5535, + "step": 1884 + }, + { + "epoch": 1.4903037498763234, + "grad_norm": 0.2524403358422412, + "learning_rate": 1.0120539799681003e-06, + "loss": 0.557, + "step": 1885 + }, + { + "epoch": 1.4910952804986644, + "grad_norm": 0.2517792905086232, + "learning_rate": 1.0112227072002016e-06, + "loss": 0.5492, + "step": 1886 + }, + { + "epoch": 1.4918868111210053, + "grad_norm": 0.24995874361292328, + "learning_rate": 1.0103914266761997e-06, + "loss": 0.5465, + "step": 1887 + }, + { + "epoch": 1.4926783417433462, + "grad_norm": 0.2415788919243182, + "learning_rate": 1.0095601389705995e-06, + "loss": 0.5537, + "step": 1888 + }, + { + "epoch": 1.4934698723656872, + "grad_norm": 0.252420574246598, + "learning_rate": 1.0087288446579104e-06, + "loss": 0.5525, + "step": 1889 + }, + { + "epoch": 1.4942614029880281, + "grad_norm": 0.26350997377549623, + "learning_rate": 1.0078975443126467e-06, + "loss": 0.5566, + "step": 1890 + }, + { + "epoch": 1.495052933610369, + "grad_norm": 0.25324136736305286, + "learning_rate": 1.0070662385093268e-06, + "loss": 0.5621, + "step": 1891 + }, + { + "epoch": 1.49584446423271, + "grad_norm": 0.253270804896081, + "learning_rate": 1.006234927822473e-06, + "loss": 0.5667, + "step": 1892 + }, + { + "epoch": 1.496635994855051, + "grad_norm": 0.24124282896723453, + "learning_rate": 1.0054036128266103e-06, + "loss": 0.5607, + "step": 1893 + }, + { + "epoch": 1.4974275254773919, + "grad_norm": 0.2448640921733728, + "learning_rate": 1.0045722940962674e-06, + "loss": 0.5442, + "step": 1894 + }, + { + "epoch": 1.4982190560997328, + "grad_norm": 0.2640041793280316, + "learning_rate": 1.0037409722059753e-06, + "loss": 0.5509, + "step": 1895 + }, + { + "epoch": 1.4990105867220738, + "grad_norm": 0.25178257836527557, + "learning_rate": 1.0029096477302675e-06, + "loss": 0.5663, + "step": 1896 + }, + { + "epoch": 1.4998021173444147, + "grad_norm": 0.2522885663597851, + "learning_rate": 1.0020783212436786e-06, + "loss": 0.5513, + "step": 1897 + }, + { + "epoch": 1.5005936479667557, + "grad_norm": 0.23903412879321914, + "learning_rate": 1.001246993320745e-06, + "loss": 0.5472, + "step": 1898 + }, + { + "epoch": 1.5013851785890968, + "grad_norm": 0.24148865550325693, + "learning_rate": 1.0004156645360046e-06, + "loss": 0.5478, + "step": 1899 + }, + { + "epoch": 1.5021767092114375, + "grad_norm": 0.24617269278643503, + "learning_rate": 9.995843354639953e-07, + "loss": 0.5697, + "step": 1900 + }, + { + "epoch": 1.5029682398337787, + "grad_norm": 0.2512853954967072, + "learning_rate": 9.98753006679255e-07, + "loss": 0.5468, + "step": 1901 + }, + { + "epoch": 1.5037597704561194, + "grad_norm": 0.24242798075534466, + "learning_rate": 9.979216787563215e-07, + "loss": 0.5474, + "step": 1902 + }, + { + "epoch": 1.5045513010784606, + "grad_norm": 0.2500089048504842, + "learning_rate": 9.970903522697327e-07, + "loss": 0.5641, + "step": 1903 + }, + { + "epoch": 1.5053428317008013, + "grad_norm": 0.2506717772095021, + "learning_rate": 9.962590277940248e-07, + "loss": 0.5689, + "step": 1904 + }, + { + "epoch": 1.5061343623231425, + "grad_norm": 0.23509731460848576, + "learning_rate": 9.954277059037328e-07, + "loss": 0.5665, + "step": 1905 + }, + { + "epoch": 1.5069258929454832, + "grad_norm": 0.23577031562634193, + "learning_rate": 9.9459638717339e-07, + "loss": 0.5592, + "step": 1906 + }, + { + "epoch": 1.5077174235678243, + "grad_norm": 0.23301075318780953, + "learning_rate": 9.937650721775271e-07, + "loss": 0.5453, + "step": 1907 + }, + { + "epoch": 1.5085089541901653, + "grad_norm": 0.23510510224196848, + "learning_rate": 9.92933761490673e-07, + "loss": 0.5653, + "step": 1908 + }, + { + "epoch": 1.5093004848125062, + "grad_norm": 0.2508185976569224, + "learning_rate": 9.921024556873534e-07, + "loss": 0.5653, + "step": 1909 + }, + { + "epoch": 1.5100920154348472, + "grad_norm": 0.24532833686758868, + "learning_rate": 9.9127115534209e-07, + "loss": 0.5659, + "step": 1910 + }, + { + "epoch": 1.510883546057188, + "grad_norm": 0.24982293913960318, + "learning_rate": 9.904398610294009e-07, + "loss": 0.5475, + "step": 1911 + }, + { + "epoch": 1.511675076679529, + "grad_norm": 0.2461122246563086, + "learning_rate": 9.896085733238005e-07, + "loss": 0.5663, + "step": 1912 + }, + { + "epoch": 1.51246660730187, + "grad_norm": 0.24342830334949983, + "learning_rate": 9.887772927997985e-07, + "loss": 0.5636, + "step": 1913 + }, + { + "epoch": 1.513258137924211, + "grad_norm": 0.26595073547637565, + "learning_rate": 9.879460200318996e-07, + "loss": 0.5505, + "step": 1914 + }, + { + "epoch": 1.5140496685465519, + "grad_norm": 0.2547639076075029, + "learning_rate": 9.87114755594603e-07, + "loss": 0.5672, + "step": 1915 + }, + { + "epoch": 1.5148411991688928, + "grad_norm": 0.24435738914065724, + "learning_rate": 9.862835000624027e-07, + "loss": 0.5614, + "step": 1916 + }, + { + "epoch": 1.5156327297912338, + "grad_norm": 0.24217728482583128, + "learning_rate": 9.854522540097849e-07, + "loss": 0.5654, + "step": 1917 + }, + { + "epoch": 1.5164242604135747, + "grad_norm": 0.2438592351187668, + "learning_rate": 9.846210180112313e-07, + "loss": 0.554, + "step": 1918 + }, + { + "epoch": 1.5172157910359156, + "grad_norm": 0.2543337565157083, + "learning_rate": 9.837897926412158e-07, + "loss": 0.5678, + "step": 1919 + }, + { + "epoch": 1.5180073216582568, + "grad_norm": 0.2475247777369915, + "learning_rate": 9.829585784742044e-07, + "loss": 0.5557, + "step": 1920 + }, + { + "epoch": 1.5187988522805975, + "grad_norm": 0.24541528673676888, + "learning_rate": 9.821273760846566e-07, + "loss": 0.5692, + "step": 1921 + }, + { + "epoch": 1.5195903829029387, + "grad_norm": 0.2440537926695465, + "learning_rate": 9.812961860470224e-07, + "loss": 0.5506, + "step": 1922 + }, + { + "epoch": 1.5203819135252794, + "grad_norm": 0.242479518117656, + "learning_rate": 9.80465008935744e-07, + "loss": 0.5486, + "step": 1923 + }, + { + "epoch": 1.5211734441476206, + "grad_norm": 0.2391267667552566, + "learning_rate": 9.79633845325255e-07, + "loss": 0.5679, + "step": 1924 + }, + { + "epoch": 1.5219649747699613, + "grad_norm": 0.25073091567616873, + "learning_rate": 9.788026957899787e-07, + "loss": 0.5559, + "step": 1925 + }, + { + "epoch": 1.5227565053923025, + "grad_norm": 0.251673253025521, + "learning_rate": 9.779715609043298e-07, + "loss": 0.564, + "step": 1926 + }, + { + "epoch": 1.5235480360146432, + "grad_norm": 0.23796244303345004, + "learning_rate": 9.771404412427118e-07, + "loss": 0.5499, + "step": 1927 + }, + { + "epoch": 1.5243395666369843, + "grad_norm": 0.2451054292089041, + "learning_rate": 9.763093373795182e-07, + "loss": 0.5538, + "step": 1928 + }, + { + "epoch": 1.5251310972593253, + "grad_norm": 0.2624505164949271, + "learning_rate": 9.754782498891315e-07, + "loss": 0.5539, + "step": 1929 + }, + { + "epoch": 1.5259226278816662, + "grad_norm": 0.23533235614968642, + "learning_rate": 9.746471793459233e-07, + "loss": 0.575, + "step": 1930 + }, + { + "epoch": 1.5267141585040072, + "grad_norm": 0.2475762834224649, + "learning_rate": 9.738161263242533e-07, + "loss": 0.5417, + "step": 1931 + }, + { + "epoch": 1.527505689126348, + "grad_norm": 0.24674239111653284, + "learning_rate": 9.72985091398468e-07, + "loss": 0.552, + "step": 1932 + }, + { + "epoch": 1.528297219748689, + "grad_norm": 0.25891616435498827, + "learning_rate": 9.721540751429033e-07, + "loss": 0.5555, + "step": 1933 + }, + { + "epoch": 1.52908875037103, + "grad_norm": 0.2416627505402866, + "learning_rate": 9.713230781318805e-07, + "loss": 0.5576, + "step": 1934 + }, + { + "epoch": 1.529880280993371, + "grad_norm": 0.2382625660179014, + "learning_rate": 9.704921009397085e-07, + "loss": 0.5602, + "step": 1935 + }, + { + "epoch": 1.5306718116157119, + "grad_norm": 0.25361790791674116, + "learning_rate": 9.696611441406823e-07, + "loss": 0.5692, + "step": 1936 + }, + { + "epoch": 1.5314633422380528, + "grad_norm": 0.2590692844289554, + "learning_rate": 9.688302083090834e-07, + "loss": 0.5623, + "step": 1937 + }, + { + "epoch": 1.5322548728603937, + "grad_norm": 0.24290857121052734, + "learning_rate": 9.679992940191775e-07, + "loss": 0.5589, + "step": 1938 + }, + { + "epoch": 1.5330464034827347, + "grad_norm": 0.2590396521661701, + "learning_rate": 9.67168401845216e-07, + "loss": 0.5614, + "step": 1939 + }, + { + "epoch": 1.5338379341050756, + "grad_norm": 0.24220575644838827, + "learning_rate": 9.663375323614355e-07, + "loss": 0.5563, + "step": 1940 + }, + { + "epoch": 1.5346294647274168, + "grad_norm": 0.2431427963659207, + "learning_rate": 9.655066861420568e-07, + "loss": 0.5686, + "step": 1941 + }, + { + "epoch": 1.5354209953497575, + "grad_norm": 0.2445220192707097, + "learning_rate": 9.646758637612845e-07, + "loss": 0.5563, + "step": 1942 + }, + { + "epoch": 1.5362125259720987, + "grad_norm": 0.24382547746742983, + "learning_rate": 9.638450657933057e-07, + "loss": 0.5733, + "step": 1943 + }, + { + "epoch": 1.5370040565944394, + "grad_norm": 0.2499805349979736, + "learning_rate": 9.630142928122922e-07, + "loss": 0.5623, + "step": 1944 + }, + { + "epoch": 1.5377955872167806, + "grad_norm": 0.25483671995828583, + "learning_rate": 9.62183545392398e-07, + "loss": 0.5698, + "step": 1945 + }, + { + "epoch": 1.5385871178391213, + "grad_norm": 0.24186776789464984, + "learning_rate": 9.613528241077589e-07, + "loss": 0.5491, + "step": 1946 + }, + { + "epoch": 1.5393786484614624, + "grad_norm": 0.24206995994822683, + "learning_rate": 9.605221295324933e-07, + "loss": 0.5571, + "step": 1947 + }, + { + "epoch": 1.5401701790838032, + "grad_norm": 0.24485203216287849, + "learning_rate": 9.59691462240701e-07, + "loss": 0.5635, + "step": 1948 + }, + { + "epoch": 1.5409617097061443, + "grad_norm": 0.2496955131917218, + "learning_rate": 9.588608228064624e-07, + "loss": 0.5516, + "step": 1949 + }, + { + "epoch": 1.541753240328485, + "grad_norm": 0.24139459751072465, + "learning_rate": 9.580302118038393e-07, + "loss": 0.5441, + "step": 1950 + }, + { + "epoch": 1.5425447709508262, + "grad_norm": 0.24458136696925753, + "learning_rate": 9.571996298068738e-07, + "loss": 0.5548, + "step": 1951 + }, + { + "epoch": 1.5433363015731671, + "grad_norm": 0.2397884266348811, + "learning_rate": 9.563690773895877e-07, + "loss": 0.5526, + "step": 1952 + }, + { + "epoch": 1.544127832195508, + "grad_norm": 0.24533557300048986, + "learning_rate": 9.555385551259826e-07, + "loss": 0.5611, + "step": 1953 + }, + { + "epoch": 1.544919362817849, + "grad_norm": 0.24802243370126517, + "learning_rate": 9.547080635900384e-07, + "loss": 0.5549, + "step": 1954 + }, + { + "epoch": 1.54571089344019, + "grad_norm": 0.24638276392236952, + "learning_rate": 9.538776033557151e-07, + "loss": 0.5422, + "step": 1955 + }, + { + "epoch": 1.546502424062531, + "grad_norm": 0.23797566068655734, + "learning_rate": 9.530471749969505e-07, + "loss": 0.5658, + "step": 1956 + }, + { + "epoch": 1.5472939546848719, + "grad_norm": 0.2498023988712014, + "learning_rate": 9.522167790876599e-07, + "loss": 0.5593, + "step": 1957 + }, + { + "epoch": 1.5480854853072128, + "grad_norm": 0.2347849795627273, + "learning_rate": 9.513864162017372e-07, + "loss": 0.5608, + "step": 1958 + }, + { + "epoch": 1.5488770159295537, + "grad_norm": 0.24766480533354307, + "learning_rate": 9.50556086913052e-07, + "loss": 0.5539, + "step": 1959 + }, + { + "epoch": 1.5496685465518947, + "grad_norm": 0.25857272748452553, + "learning_rate": 9.49725791795452e-07, + "loss": 0.542, + "step": 1960 + }, + { + "epoch": 1.5504600771742356, + "grad_norm": 0.23965192361455062, + "learning_rate": 9.48895531422761e-07, + "loss": 0.558, + "step": 1961 + }, + { + "epoch": 1.5512516077965768, + "grad_norm": 0.2392492882182199, + "learning_rate": 9.480653063687781e-07, + "loss": 0.5469, + "step": 1962 + }, + { + "epoch": 1.5520431384189175, + "grad_norm": 0.24234911620503033, + "learning_rate": 9.472351172072794e-07, + "loss": 0.5508, + "step": 1963 + }, + { + "epoch": 1.5528346690412587, + "grad_norm": 0.2489087947909424, + "learning_rate": 9.464049645120141e-07, + "loss": 0.5683, + "step": 1964 + }, + { + "epoch": 1.5536261996635994, + "grad_norm": 0.24187863610564125, + "learning_rate": 9.45574848856708e-07, + "loss": 0.5583, + "step": 1965 + }, + { + "epoch": 1.5544177302859405, + "grad_norm": 0.25331174039850096, + "learning_rate": 9.447447708150609e-07, + "loss": 0.5668, + "step": 1966 + }, + { + "epoch": 1.5552092609082813, + "grad_norm": 0.2504989290198329, + "learning_rate": 9.439147309607459e-07, + "loss": 0.5684, + "step": 1967 + }, + { + "epoch": 1.5560007915306224, + "grad_norm": 0.24419936683861262, + "learning_rate": 9.430847298674111e-07, + "loss": 0.5584, + "step": 1968 + }, + { + "epoch": 1.5567923221529631, + "grad_norm": 0.2585052556632517, + "learning_rate": 9.422547681086757e-07, + "loss": 0.5518, + "step": 1969 + }, + { + "epoch": 1.5575838527753043, + "grad_norm": 0.24257383662044224, + "learning_rate": 9.414248462581334e-07, + "loss": 0.5767, + "step": 1970 + }, + { + "epoch": 1.558375383397645, + "grad_norm": 0.24407245554214702, + "learning_rate": 9.405949648893504e-07, + "loss": 0.5561, + "step": 1971 + }, + { + "epoch": 1.5591669140199862, + "grad_norm": 0.24918179240562197, + "learning_rate": 9.397651245758636e-07, + "loss": 0.5554, + "step": 1972 + }, + { + "epoch": 1.5599584446423271, + "grad_norm": 0.24634678811139119, + "learning_rate": 9.389353258911832e-07, + "loss": 0.5564, + "step": 1973 + }, + { + "epoch": 1.560749975264668, + "grad_norm": 0.23634915706130913, + "learning_rate": 9.381055694087887e-07, + "loss": 0.5716, + "step": 1974 + }, + { + "epoch": 1.561541505887009, + "grad_norm": 0.240530519493639, + "learning_rate": 9.372758557021319e-07, + "loss": 0.5362, + "step": 1975 + }, + { + "epoch": 1.56233303650935, + "grad_norm": 0.24612319684632025, + "learning_rate": 9.364461853446349e-07, + "loss": 0.5675, + "step": 1976 + }, + { + "epoch": 1.563124567131691, + "grad_norm": 0.24726122032951803, + "learning_rate": 9.35616558909689e-07, + "loss": 0.5646, + "step": 1977 + }, + { + "epoch": 1.5639160977540318, + "grad_norm": 0.2431712736071995, + "learning_rate": 9.347869769706561e-07, + "loss": 0.5471, + "step": 1978 + }, + { + "epoch": 1.5647076283763728, + "grad_norm": 0.24200958775563516, + "learning_rate": 9.339574401008672e-07, + "loss": 0.5732, + "step": 1979 + }, + { + "epoch": 1.5654991589987137, + "grad_norm": 0.24121456560440047, + "learning_rate": 9.331279488736213e-07, + "loss": 0.5536, + "step": 1980 + }, + { + "epoch": 1.5662906896210547, + "grad_norm": 0.24310098306754682, + "learning_rate": 9.322985038621863e-07, + "loss": 0.5526, + "step": 1981 + }, + { + "epoch": 1.5670822202433956, + "grad_norm": 0.23658339428483585, + "learning_rate": 9.314691056397989e-07, + "loss": 0.5496, + "step": 1982 + }, + { + "epoch": 1.5678737508657368, + "grad_norm": 0.2435279041919117, + "learning_rate": 9.306397547796624e-07, + "loss": 0.5628, + "step": 1983 + }, + { + "epoch": 1.5686652814880775, + "grad_norm": 0.23426171320870792, + "learning_rate": 9.298104518549486e-07, + "loss": 0.561, + "step": 1984 + }, + { + "epoch": 1.5694568121104187, + "grad_norm": 0.24702648257739543, + "learning_rate": 9.289811974387945e-07, + "loss": 0.5456, + "step": 1985 + }, + { + "epoch": 1.5702483427327594, + "grad_norm": 0.24863404130824132, + "learning_rate": 9.281519921043048e-07, + "loss": 0.5587, + "step": 1986 + }, + { + "epoch": 1.5710398733551005, + "grad_norm": 0.2332512958343926, + "learning_rate": 9.273228364245499e-07, + "loss": 0.5547, + "step": 1987 + }, + { + "epoch": 1.5718314039774413, + "grad_norm": 0.24031475053794954, + "learning_rate": 9.264937309725662e-07, + "loss": 0.5749, + "step": 1988 + }, + { + "epoch": 1.5726229345997824, + "grad_norm": 0.24036406010731085, + "learning_rate": 9.256646763213553e-07, + "loss": 0.5568, + "step": 1989 + }, + { + "epoch": 1.5734144652221231, + "grad_norm": 0.25120888713194434, + "learning_rate": 9.24835673043883e-07, + "loss": 0.5475, + "step": 1990 + }, + { + "epoch": 1.5742059958444643, + "grad_norm": 0.24591089376286998, + "learning_rate": 9.240067217130801e-07, + "loss": 0.5597, + "step": 1991 + }, + { + "epoch": 1.574997526466805, + "grad_norm": 0.23881311139117428, + "learning_rate": 9.231778229018416e-07, + "loss": 0.5462, + "step": 1992 + }, + { + "epoch": 1.5757890570891462, + "grad_norm": 0.24112698391348775, + "learning_rate": 9.223489771830265e-07, + "loss": 0.5473, + "step": 1993 + }, + { + "epoch": 1.5765805877114871, + "grad_norm": 0.2429405631746065, + "learning_rate": 9.215201851294569e-07, + "loss": 0.5654, + "step": 1994 + }, + { + "epoch": 1.577372118333828, + "grad_norm": 0.2646333136373766, + "learning_rate": 9.206914473139169e-07, + "loss": 0.5557, + "step": 1995 + }, + { + "epoch": 1.578163648956169, + "grad_norm": 0.2486794216251206, + "learning_rate": 9.198627643091541e-07, + "loss": 0.5505, + "step": 1996 + }, + { + "epoch": 1.57895517957851, + "grad_norm": 0.23557275143197978, + "learning_rate": 9.19034136687878e-07, + "loss": 0.5642, + "step": 1997 + }, + { + "epoch": 1.579746710200851, + "grad_norm": 0.24290837948404265, + "learning_rate": 9.1820556502276e-07, + "loss": 0.5527, + "step": 1998 + }, + { + "epoch": 1.5805382408231918, + "grad_norm": 0.2445531298782679, + "learning_rate": 9.173770498864324e-07, + "loss": 0.5406, + "step": 1999 + }, + { + "epoch": 1.5813297714455328, + "grad_norm": 0.24521079534272908, + "learning_rate": 9.165485918514891e-07, + "loss": 0.5719, + "step": 2000 + }, + { + "epoch": 1.5821213020678737, + "grad_norm": 0.24382747432485855, + "learning_rate": 9.157201914904834e-07, + "loss": 0.5643, + "step": 2001 + }, + { + "epoch": 1.5829128326902147, + "grad_norm": 0.24316398481862123, + "learning_rate": 9.148918493759297e-07, + "loss": 0.5561, + "step": 2002 + }, + { + "epoch": 1.5837043633125556, + "grad_norm": 0.23884960368469524, + "learning_rate": 9.140635660803024e-07, + "loss": 0.5526, + "step": 2003 + }, + { + "epoch": 1.5844958939348968, + "grad_norm": 0.23978880256137527, + "learning_rate": 9.132353421760339e-07, + "loss": 0.5375, + "step": 2004 + }, + { + "epoch": 1.5852874245572375, + "grad_norm": 0.2412768962551659, + "learning_rate": 9.124071782355171e-07, + "loss": 0.5518, + "step": 2005 + }, + { + "epoch": 1.5860789551795786, + "grad_norm": 0.2439193302045052, + "learning_rate": 9.115790748311022e-07, + "loss": 0.5583, + "step": 2006 + }, + { + "epoch": 1.5868704858019194, + "grad_norm": 0.24777383189207242, + "learning_rate": 9.107510325350983e-07, + "loss": 0.5474, + "step": 2007 + }, + { + "epoch": 1.5876620164242605, + "grad_norm": 0.25061556066450913, + "learning_rate": 9.099230519197724e-07, + "loss": 0.568, + "step": 2008 + }, + { + "epoch": 1.5884535470466012, + "grad_norm": 0.23769449043634142, + "learning_rate": 9.09095133557348e-07, + "loss": 0.5518, + "step": 2009 + }, + { + "epoch": 1.5892450776689424, + "grad_norm": 0.2437545233145305, + "learning_rate": 9.082672780200068e-07, + "loss": 0.5682, + "step": 2010 + }, + { + "epoch": 1.5900366082912831, + "grad_norm": 0.24597414418264885, + "learning_rate": 9.074394858798856e-07, + "loss": 0.558, + "step": 2011 + }, + { + "epoch": 1.5908281389136243, + "grad_norm": 0.2505269187142412, + "learning_rate": 9.066117577090786e-07, + "loss": 0.5542, + "step": 2012 + }, + { + "epoch": 1.591619669535965, + "grad_norm": 0.2438859735554627, + "learning_rate": 9.057840940796356e-07, + "loss": 0.5527, + "step": 2013 + }, + { + "epoch": 1.5924112001583062, + "grad_norm": 0.24398672777416325, + "learning_rate": 9.049564955635612e-07, + "loss": 0.5516, + "step": 2014 + }, + { + "epoch": 1.5932027307806471, + "grad_norm": 0.23802712142850696, + "learning_rate": 9.041289627328158e-07, + "loss": 0.5522, + "step": 2015 + }, + { + "epoch": 1.593994261402988, + "grad_norm": 0.23947980168767818, + "learning_rate": 9.033014961593134e-07, + "loss": 0.5511, + "step": 2016 + }, + { + "epoch": 1.594785792025329, + "grad_norm": 0.24309875949637275, + "learning_rate": 9.024740964149231e-07, + "loss": 0.5485, + "step": 2017 + }, + { + "epoch": 1.59557732264767, + "grad_norm": 0.24157386327966096, + "learning_rate": 9.016467640714677e-07, + "loss": 0.5507, + "step": 2018 + }, + { + "epoch": 1.5963688532700109, + "grad_norm": 0.2445120784282942, + "learning_rate": 9.008194997007227e-07, + "loss": 0.5445, + "step": 2019 + }, + { + "epoch": 1.5971603838923518, + "grad_norm": 0.23954075205120254, + "learning_rate": 8.999923038744177e-07, + "loss": 0.55, + "step": 2020 + }, + { + "epoch": 1.5979519145146928, + "grad_norm": 0.23531685414910658, + "learning_rate": 8.991651771642345e-07, + "loss": 0.5547, + "step": 2021 + }, + { + "epoch": 1.5987434451370337, + "grad_norm": 0.2518174676964054, + "learning_rate": 8.983381201418062e-07, + "loss": 0.5564, + "step": 2022 + }, + { + "epoch": 1.5995349757593746, + "grad_norm": 0.24207774498953183, + "learning_rate": 8.975111333787192e-07, + "loss": 0.553, + "step": 2023 + }, + { + "epoch": 1.6003265063817156, + "grad_norm": 0.24858406061862517, + "learning_rate": 8.966842174465103e-07, + "loss": 0.5536, + "step": 2024 + }, + { + "epoch": 1.6011180370040567, + "grad_norm": 0.23619562290454044, + "learning_rate": 8.95857372916668e-07, + "loss": 0.546, + "step": 2025 + }, + { + "epoch": 1.6019095676263975, + "grad_norm": 0.2400375915791126, + "learning_rate": 8.950306003606314e-07, + "loss": 0.5724, + "step": 2026 + }, + { + "epoch": 1.6027010982487386, + "grad_norm": 0.24226127560950453, + "learning_rate": 8.942039003497892e-07, + "loss": 0.5531, + "step": 2027 + }, + { + "epoch": 1.6034926288710794, + "grad_norm": 0.24417083063861214, + "learning_rate": 8.933772734554806e-07, + "loss": 0.5589, + "step": 2028 + }, + { + "epoch": 1.6042841594934205, + "grad_norm": 0.24056487563446027, + "learning_rate": 8.925507202489938e-07, + "loss": 0.5691, + "step": 2029 + }, + { + "epoch": 1.6050756901157612, + "grad_norm": 0.24375248747069628, + "learning_rate": 8.917242413015666e-07, + "loss": 0.536, + "step": 2030 + }, + { + "epoch": 1.6058672207381024, + "grad_norm": 0.24527152468161173, + "learning_rate": 8.908978371843858e-07, + "loss": 0.5534, + "step": 2031 + }, + { + "epoch": 1.6066587513604431, + "grad_norm": 0.2387293578057444, + "learning_rate": 8.90071508468585e-07, + "loss": 0.5635, + "step": 2032 + }, + { + "epoch": 1.6074502819827843, + "grad_norm": 0.2402138297428413, + "learning_rate": 8.892452557252471e-07, + "loss": 0.5698, + "step": 2033 + }, + { + "epoch": 1.608241812605125, + "grad_norm": 0.2519986495145601, + "learning_rate": 8.884190795254017e-07, + "loss": 0.5664, + "step": 2034 + }, + { + "epoch": 1.6090333432274662, + "grad_norm": 0.24916973598540385, + "learning_rate": 8.875929804400259e-07, + "loss": 0.5667, + "step": 2035 + }, + { + "epoch": 1.609824873849807, + "grad_norm": 0.23952696403139964, + "learning_rate": 8.86766959040044e-07, + "loss": 0.5477, + "step": 2036 + }, + { + "epoch": 1.610616404472148, + "grad_norm": 0.25479237829538365, + "learning_rate": 8.859410158963254e-07, + "loss": 0.5617, + "step": 2037 + }, + { + "epoch": 1.611407935094489, + "grad_norm": 0.24759472080214803, + "learning_rate": 8.851151515796859e-07, + "loss": 0.5547, + "step": 2038 + }, + { + "epoch": 1.61219946571683, + "grad_norm": 0.2472420808791739, + "learning_rate": 8.842893666608871e-07, + "loss": 0.547, + "step": 2039 + }, + { + "epoch": 1.6129909963391709, + "grad_norm": 0.25737531625494264, + "learning_rate": 8.834636617106356e-07, + "loss": 0.5485, + "step": 2040 + }, + { + "epoch": 1.6137825269615118, + "grad_norm": 0.2427318858871427, + "learning_rate": 8.826380372995832e-07, + "loss": 0.5449, + "step": 2041 + }, + { + "epoch": 1.6145740575838528, + "grad_norm": 0.24872616426240185, + "learning_rate": 8.818124939983254e-07, + "loss": 0.5502, + "step": 2042 + }, + { + "epoch": 1.6153655882061937, + "grad_norm": 0.23712940097713245, + "learning_rate": 8.80987032377401e-07, + "loss": 0.5593, + "step": 2043 + }, + { + "epoch": 1.6161571188285346, + "grad_norm": 0.24970016687100757, + "learning_rate": 8.801616530072939e-07, + "loss": 0.5492, + "step": 2044 + }, + { + "epoch": 1.6169486494508756, + "grad_norm": 0.2408996082081383, + "learning_rate": 8.7933635645843e-07, + "loss": 0.5664, + "step": 2045 + }, + { + "epoch": 1.6177401800732167, + "grad_norm": 0.24399569773186586, + "learning_rate": 8.785111433011789e-07, + "loss": 0.5461, + "step": 2046 + }, + { + "epoch": 1.6185317106955575, + "grad_norm": 0.24474799760899071, + "learning_rate": 8.77686014105852e-07, + "loss": 0.5484, + "step": 2047 + }, + { + "epoch": 1.6193232413178986, + "grad_norm": 0.2546632175346895, + "learning_rate": 8.76860969442702e-07, + "loss": 0.5576, + "step": 2048 + }, + { + "epoch": 1.6201147719402393, + "grad_norm": 0.2409988974971737, + "learning_rate": 8.760360098819243e-07, + "loss": 0.5665, + "step": 2049 + }, + { + "epoch": 1.6209063025625805, + "grad_norm": 0.2513223866448745, + "learning_rate": 8.75211135993655e-07, + "loss": 0.5668, + "step": 2050 + }, + { + "epoch": 1.6216978331849212, + "grad_norm": 0.24429719225143925, + "learning_rate": 8.743863483479716e-07, + "loss": 0.5684, + "step": 2051 + }, + { + "epoch": 1.6224893638072624, + "grad_norm": 0.23998508846294653, + "learning_rate": 8.73561647514891e-07, + "loss": 0.5568, + "step": 2052 + }, + { + "epoch": 1.623280894429603, + "grad_norm": 0.24080160339323878, + "learning_rate": 8.727370340643704e-07, + "loss": 0.5601, + "step": 2053 + }, + { + "epoch": 1.6240724250519443, + "grad_norm": 0.24644207357074463, + "learning_rate": 8.719125085663069e-07, + "loss": 0.5485, + "step": 2054 + }, + { + "epoch": 1.624863955674285, + "grad_norm": 0.25001783187191484, + "learning_rate": 8.710880715905369e-07, + "loss": 0.561, + "step": 2055 + }, + { + "epoch": 1.6256554862966262, + "grad_norm": 0.25594656785194075, + "learning_rate": 8.702637237068351e-07, + "loss": 0.5545, + "step": 2056 + }, + { + "epoch": 1.626447016918967, + "grad_norm": 0.24342006382441958, + "learning_rate": 8.694394654849155e-07, + "loss": 0.5522, + "step": 2057 + }, + { + "epoch": 1.627238547541308, + "grad_norm": 0.23963624568506198, + "learning_rate": 8.686152974944287e-07, + "loss": 0.558, + "step": 2058 + }, + { + "epoch": 1.628030078163649, + "grad_norm": 0.24318954257524006, + "learning_rate": 8.677912203049645e-07, + "loss": 0.557, + "step": 2059 + }, + { + "epoch": 1.62882160878599, + "grad_norm": 0.24052317362646064, + "learning_rate": 8.669672344860491e-07, + "loss": 0.556, + "step": 2060 + }, + { + "epoch": 1.6296131394083309, + "grad_norm": 0.25711502671895936, + "learning_rate": 8.661433406071453e-07, + "loss": 0.5693, + "step": 2061 + }, + { + "epoch": 1.6304046700306718, + "grad_norm": 0.24102907986820435, + "learning_rate": 8.653195392376538e-07, + "loss": 0.558, + "step": 2062 + }, + { + "epoch": 1.6311962006530127, + "grad_norm": 0.2373550834520336, + "learning_rate": 8.644958309469091e-07, + "loss": 0.5395, + "step": 2063 + }, + { + "epoch": 1.6319877312753537, + "grad_norm": 0.2570678590573862, + "learning_rate": 8.636722163041833e-07, + "loss": 0.5391, + "step": 2064 + }, + { + "epoch": 1.6327792618976946, + "grad_norm": 0.2445396534415899, + "learning_rate": 8.628486958786831e-07, + "loss": 0.5457, + "step": 2065 + }, + { + "epoch": 1.6335707925200356, + "grad_norm": 0.24447082953743518, + "learning_rate": 8.620252702395499e-07, + "loss": 0.5519, + "step": 2066 + }, + { + "epoch": 1.6343623231423767, + "grad_norm": 0.2455708603720528, + "learning_rate": 8.612019399558597e-07, + "loss": 0.5618, + "step": 2067 + }, + { + "epoch": 1.6351538537647174, + "grad_norm": 0.2428458574211925, + "learning_rate": 8.603787055966232e-07, + "loss": 0.5711, + "step": 2068 + }, + { + "epoch": 1.6359453843870586, + "grad_norm": 0.2428632953712524, + "learning_rate": 8.595555677307835e-07, + "loss": 0.5469, + "step": 2069 + }, + { + "epoch": 1.6367369150093993, + "grad_norm": 0.25187082947017325, + "learning_rate": 8.587325269272183e-07, + "loss": 0.5558, + "step": 2070 + }, + { + "epoch": 1.6375284456317405, + "grad_norm": 0.23378164006817775, + "learning_rate": 8.579095837547373e-07, + "loss": 0.5487, + "step": 2071 + }, + { + "epoch": 1.6383199762540812, + "grad_norm": 0.23723769921773163, + "learning_rate": 8.570867387820835e-07, + "loss": 0.5571, + "step": 2072 + }, + { + "epoch": 1.6391115068764224, + "grad_norm": 0.23914851721873825, + "learning_rate": 8.562639925779318e-07, + "loss": 0.5451, + "step": 2073 + }, + { + "epoch": 1.639903037498763, + "grad_norm": 0.24889143215976497, + "learning_rate": 8.554413457108881e-07, + "loss": 0.5422, + "step": 2074 + }, + { + "epoch": 1.6406945681211043, + "grad_norm": 0.23847255202101136, + "learning_rate": 8.546187987494908e-07, + "loss": 0.5659, + "step": 2075 + }, + { + "epoch": 1.641486098743445, + "grad_norm": 0.24034831626255304, + "learning_rate": 8.537963522622084e-07, + "loss": 0.5581, + "step": 2076 + }, + { + "epoch": 1.6422776293657861, + "grad_norm": 0.24280446433904804, + "learning_rate": 8.529740068174402e-07, + "loss": 0.5598, + "step": 2077 + }, + { + "epoch": 1.643069159988127, + "grad_norm": 0.24181242354981677, + "learning_rate": 8.521517629835165e-07, + "loss": 0.5477, + "step": 2078 + }, + { + "epoch": 1.643860690610468, + "grad_norm": 0.24745841762119297, + "learning_rate": 8.513296213286956e-07, + "loss": 0.5708, + "step": 2079 + }, + { + "epoch": 1.644652221232809, + "grad_norm": 0.23458264483259214, + "learning_rate": 8.505075824211668e-07, + "loss": 0.5372, + "step": 2080 + }, + { + "epoch": 1.64544375185515, + "grad_norm": 0.2474157511212525, + "learning_rate": 8.496856468290474e-07, + "loss": 0.5667, + "step": 2081 + }, + { + "epoch": 1.6462352824774908, + "grad_norm": 0.23564463504954694, + "learning_rate": 8.488638151203839e-07, + "loss": 0.5609, + "step": 2082 + }, + { + "epoch": 1.6470268130998318, + "grad_norm": 0.2503948137425057, + "learning_rate": 8.480420878631511e-07, + "loss": 0.5577, + "step": 2083 + }, + { + "epoch": 1.6478183437221727, + "grad_norm": 0.24463697719998737, + "learning_rate": 8.472204656252506e-07, + "loss": 0.564, + "step": 2084 + }, + { + "epoch": 1.6486098743445137, + "grad_norm": 0.24132887295057562, + "learning_rate": 8.463989489745129e-07, + "loss": 0.5471, + "step": 2085 + }, + { + "epoch": 1.6494014049668546, + "grad_norm": 0.23539199733933527, + "learning_rate": 8.455775384786939e-07, + "loss": 0.5628, + "step": 2086 + }, + { + "epoch": 1.6501929355891956, + "grad_norm": 0.24810760237934773, + "learning_rate": 8.447562347054776e-07, + "loss": 0.5551, + "step": 2087 + }, + { + "epoch": 1.6509844662115367, + "grad_norm": 0.2428508266769543, + "learning_rate": 8.439350382224737e-07, + "loss": 0.5633, + "step": 2088 + }, + { + "epoch": 1.6517759968338774, + "grad_norm": 0.24540948486221204, + "learning_rate": 8.431139495972176e-07, + "loss": 0.561, + "step": 2089 + }, + { + "epoch": 1.6525675274562186, + "grad_norm": 0.25065727786285197, + "learning_rate": 8.422929693971698e-07, + "loss": 0.5455, + "step": 2090 + }, + { + "epoch": 1.6533590580785593, + "grad_norm": 0.23792356960861794, + "learning_rate": 8.414720981897164e-07, + "loss": 0.5583, + "step": 2091 + }, + { + "epoch": 1.6541505887009005, + "grad_norm": 0.24225723128309418, + "learning_rate": 8.406513365421683e-07, + "loss": 0.5663, + "step": 2092 + }, + { + "epoch": 1.6549421193232412, + "grad_norm": 0.24501700400121662, + "learning_rate": 8.398306850217608e-07, + "loss": 0.5467, + "step": 2093 + }, + { + "epoch": 1.6557336499455824, + "grad_norm": 0.24501439601092997, + "learning_rate": 8.390101441956525e-07, + "loss": 0.5626, + "step": 2094 + }, + { + "epoch": 1.656525180567923, + "grad_norm": 0.24856531462758444, + "learning_rate": 8.381897146309253e-07, + "loss": 0.5629, + "step": 2095 + }, + { + "epoch": 1.6573167111902642, + "grad_norm": 0.23883102164757536, + "learning_rate": 8.373693968945849e-07, + "loss": 0.5398, + "step": 2096 + }, + { + "epoch": 1.658108241812605, + "grad_norm": 0.2445122932551097, + "learning_rate": 8.365491915535597e-07, + "loss": 0.547, + "step": 2097 + }, + { + "epoch": 1.6588997724349461, + "grad_norm": 0.25961728718405314, + "learning_rate": 8.357290991747002e-07, + "loss": 0.5549, + "step": 2098 + }, + { + "epoch": 1.659691303057287, + "grad_norm": 0.2375815462137297, + "learning_rate": 8.34909120324779e-07, + "loss": 0.5574, + "step": 2099 + }, + { + "epoch": 1.660482833679628, + "grad_norm": 0.24519708664453363, + "learning_rate": 8.340892555704893e-07, + "loss": 0.5568, + "step": 2100 + }, + { + "epoch": 1.661274364301969, + "grad_norm": 0.2438913296131192, + "learning_rate": 8.332695054784466e-07, + "loss": 0.5609, + "step": 2101 + }, + { + "epoch": 1.66206589492431, + "grad_norm": 0.23898030997530736, + "learning_rate": 8.324498706151868e-07, + "loss": 0.5559, + "step": 2102 + }, + { + "epoch": 1.6628574255466508, + "grad_norm": 0.24166461498282207, + "learning_rate": 8.316303515471665e-07, + "loss": 0.5595, + "step": 2103 + }, + { + "epoch": 1.6636489561689918, + "grad_norm": 0.2446555279103593, + "learning_rate": 8.308109488407618e-07, + "loss": 0.5632, + "step": 2104 + }, + { + "epoch": 1.6644404867913327, + "grad_norm": 0.2567484699955933, + "learning_rate": 8.299916630622678e-07, + "loss": 0.5565, + "step": 2105 + }, + { + "epoch": 1.6652320174136737, + "grad_norm": 0.23880975061449405, + "learning_rate": 8.291724947778998e-07, + "loss": 0.5487, + "step": 2106 + }, + { + "epoch": 1.6660235480360146, + "grad_norm": 0.25089707885771734, + "learning_rate": 8.283534445537921e-07, + "loss": 0.5632, + "step": 2107 + }, + { + "epoch": 1.6668150786583555, + "grad_norm": 0.24072108293658043, + "learning_rate": 8.275345129559962e-07, + "loss": 0.5523, + "step": 2108 + }, + { + "epoch": 1.6676066092806967, + "grad_norm": 0.2475226641466154, + "learning_rate": 8.267157005504825e-07, + "loss": 0.5569, + "step": 2109 + }, + { + "epoch": 1.6683981399030374, + "grad_norm": 0.24007485038291715, + "learning_rate": 8.258970079031395e-07, + "loss": 0.5552, + "step": 2110 + }, + { + "epoch": 1.6691896705253786, + "grad_norm": 0.23882560906148753, + "learning_rate": 8.250784355797714e-07, + "loss": 0.5673, + "step": 2111 + }, + { + "epoch": 1.6699812011477193, + "grad_norm": 0.2517824349186474, + "learning_rate": 8.242599841461005e-07, + "loss": 0.5574, + "step": 2112 + }, + { + "epoch": 1.6707727317700605, + "grad_norm": 0.24295826491829725, + "learning_rate": 8.234416541677647e-07, + "loss": 0.5523, + "step": 2113 + }, + { + "epoch": 1.6715642623924012, + "grad_norm": 0.24388404869935035, + "learning_rate": 8.226234462103188e-07, + "loss": 0.5777, + "step": 2114 + }, + { + "epoch": 1.6723557930147424, + "grad_norm": 0.23786869322677054, + "learning_rate": 8.218053608392332e-07, + "loss": 0.5562, + "step": 2115 + }, + { + "epoch": 1.673147323637083, + "grad_norm": 0.2531283298486136, + "learning_rate": 8.209873986198926e-07, + "loss": 0.5528, + "step": 2116 + }, + { + "epoch": 1.6739388542594242, + "grad_norm": 0.23718460284078302, + "learning_rate": 8.201695601175977e-07, + "loss": 0.5672, + "step": 2117 + }, + { + "epoch": 1.674730384881765, + "grad_norm": 0.23993927480694083, + "learning_rate": 8.19351845897563e-07, + "loss": 0.5699, + "step": 2118 + }, + { + "epoch": 1.6755219155041061, + "grad_norm": 0.23794714997443736, + "learning_rate": 8.185342565249174e-07, + "loss": 0.5582, + "step": 2119 + }, + { + "epoch": 1.676313446126447, + "grad_norm": 0.23950708340571722, + "learning_rate": 8.177167925647039e-07, + "loss": 0.556, + "step": 2120 + }, + { + "epoch": 1.677104976748788, + "grad_norm": 0.24068667787476933, + "learning_rate": 8.168994545818779e-07, + "loss": 0.5464, + "step": 2121 + }, + { + "epoch": 1.677896507371129, + "grad_norm": 0.2507718932955019, + "learning_rate": 8.160822431413084e-07, + "loss": 0.5501, + "step": 2122 + }, + { + "epoch": 1.6786880379934699, + "grad_norm": 0.24007692263093897, + "learning_rate": 8.152651588077765e-07, + "loss": 0.5517, + "step": 2123 + }, + { + "epoch": 1.6794795686158108, + "grad_norm": 0.2515251510367636, + "learning_rate": 8.144482021459762e-07, + "loss": 0.5518, + "step": 2124 + }, + { + "epoch": 1.6802710992381518, + "grad_norm": 0.25013459716253933, + "learning_rate": 8.13631373720513e-07, + "loss": 0.5598, + "step": 2125 + }, + { + "epoch": 1.6810626298604927, + "grad_norm": 0.23832331109270669, + "learning_rate": 8.128146740959028e-07, + "loss": 0.5549, + "step": 2126 + }, + { + "epoch": 1.6818541604828336, + "grad_norm": 0.24691594384946502, + "learning_rate": 8.119981038365738e-07, + "loss": 0.5635, + "step": 2127 + }, + { + "epoch": 1.6826456911051746, + "grad_norm": 0.24515478763570772, + "learning_rate": 8.11181663506864e-07, + "loss": 0.5448, + "step": 2128 + }, + { + "epoch": 1.6834372217275155, + "grad_norm": 0.24830744464257612, + "learning_rate": 8.103653536710219e-07, + "loss": 0.5619, + "step": 2129 + }, + { + "epoch": 1.6842287523498567, + "grad_norm": 0.24240100747542648, + "learning_rate": 8.095491748932063e-07, + "loss": 0.5568, + "step": 2130 + }, + { + "epoch": 1.6850202829721974, + "grad_norm": 0.23911385549167222, + "learning_rate": 8.087331277374844e-07, + "loss": 0.5642, + "step": 2131 + }, + { + "epoch": 1.6858118135945386, + "grad_norm": 0.23455278963650725, + "learning_rate": 8.079172127678329e-07, + "loss": 0.5471, + "step": 2132 + }, + { + "epoch": 1.6866033442168793, + "grad_norm": 0.2380396825467588, + "learning_rate": 8.071014305481373e-07, + "loss": 0.5496, + "step": 2133 + }, + { + "epoch": 1.6873948748392205, + "grad_norm": 0.24561620775271423, + "learning_rate": 8.06285781642191e-07, + "loss": 0.5628, + "step": 2134 + }, + { + "epoch": 1.6881864054615612, + "grad_norm": 0.23719363419038508, + "learning_rate": 8.054702666136962e-07, + "loss": 0.5424, + "step": 2135 + }, + { + "epoch": 1.6889779360839023, + "grad_norm": 0.24429327343347726, + "learning_rate": 8.046548860262613e-07, + "loss": 0.5617, + "step": 2136 + }, + { + "epoch": 1.689769466706243, + "grad_norm": 0.23427956646463197, + "learning_rate": 8.038396404434024e-07, + "loss": 0.5662, + "step": 2137 + }, + { + "epoch": 1.6905609973285842, + "grad_norm": 0.24412993380524262, + "learning_rate": 8.030245304285424e-07, + "loss": 0.5381, + "step": 2138 + }, + { + "epoch": 1.691352527950925, + "grad_norm": 0.2442892324940378, + "learning_rate": 8.022095565450102e-07, + "loss": 0.5581, + "step": 2139 + }, + { + "epoch": 1.692144058573266, + "grad_norm": 0.24154065834023128, + "learning_rate": 8.013947193560412e-07, + "loss": 0.5585, + "step": 2140 + }, + { + "epoch": 1.692935589195607, + "grad_norm": 0.23448738096386273, + "learning_rate": 8.005800194247759e-07, + "loss": 0.5538, + "step": 2141 + }, + { + "epoch": 1.693727119817948, + "grad_norm": 0.24132536336211838, + "learning_rate": 7.997654573142592e-07, + "loss": 0.556, + "step": 2142 + }, + { + "epoch": 1.694518650440289, + "grad_norm": 0.24420785160970682, + "learning_rate": 7.989510335874419e-07, + "loss": 0.5581, + "step": 2143 + }, + { + "epoch": 1.6953101810626299, + "grad_norm": 0.24196611171201077, + "learning_rate": 7.98136748807179e-07, + "loss": 0.5595, + "step": 2144 + }, + { + "epoch": 1.6961017116849708, + "grad_norm": 0.23855984750560175, + "learning_rate": 7.973226035362295e-07, + "loss": 0.5598, + "step": 2145 + }, + { + "epoch": 1.6968932423073118, + "grad_norm": 0.24608009031288522, + "learning_rate": 7.965085983372555e-07, + "loss": 0.5561, + "step": 2146 + }, + { + "epoch": 1.6976847729296527, + "grad_norm": 0.24465724540214806, + "learning_rate": 7.956947337728221e-07, + "loss": 0.5571, + "step": 2147 + }, + { + "epoch": 1.6984763035519936, + "grad_norm": 0.2392334027180945, + "learning_rate": 7.948810104053976e-07, + "loss": 0.5595, + "step": 2148 + }, + { + "epoch": 1.6992678341743346, + "grad_norm": 0.24978335197145127, + "learning_rate": 7.940674287973535e-07, + "loss": 0.567, + "step": 2149 + }, + { + "epoch": 1.7000593647966755, + "grad_norm": 0.23399604183084974, + "learning_rate": 7.932539895109621e-07, + "loss": 0.541, + "step": 2150 + }, + { + "epoch": 1.7008508954190167, + "grad_norm": 0.24858612960019044, + "learning_rate": 7.924406931083978e-07, + "loss": 0.5496, + "step": 2151 + }, + { + "epoch": 1.7016424260413574, + "grad_norm": 0.25648218351749696, + "learning_rate": 7.916275401517368e-07, + "loss": 0.5654, + "step": 2152 + }, + { + "epoch": 1.7024339566636986, + "grad_norm": 0.23677924634192826, + "learning_rate": 7.908145312029546e-07, + "loss": 0.5546, + "step": 2153 + }, + { + "epoch": 1.7032254872860393, + "grad_norm": 0.24112980505116388, + "learning_rate": 7.900016668239289e-07, + "loss": 0.5431, + "step": 2154 + }, + { + "epoch": 1.7040170179083804, + "grad_norm": 0.2409520761415017, + "learning_rate": 7.891889475764369e-07, + "loss": 0.5485, + "step": 2155 + }, + { + "epoch": 1.7048085485307212, + "grad_norm": 0.24313335964792243, + "learning_rate": 7.88376374022155e-07, + "loss": 0.5582, + "step": 2156 + }, + { + "epoch": 1.7056000791530623, + "grad_norm": 0.24219718663063755, + "learning_rate": 7.875639467226599e-07, + "loss": 0.5618, + "step": 2157 + }, + { + "epoch": 1.706391609775403, + "grad_norm": 0.23917136316741616, + "learning_rate": 7.867516662394258e-07, + "loss": 0.5447, + "step": 2158 + }, + { + "epoch": 1.7071831403977442, + "grad_norm": 0.2446518091624776, + "learning_rate": 7.85939533133827e-07, + "loss": 0.5489, + "step": 2159 + }, + { + "epoch": 1.707974671020085, + "grad_norm": 0.2462233916332169, + "learning_rate": 7.851275479671346e-07, + "loss": 0.5643, + "step": 2160 + }, + { + "epoch": 1.708766201642426, + "grad_norm": 0.25948166380731974, + "learning_rate": 7.843157113005184e-07, + "loss": 0.555, + "step": 2161 + }, + { + "epoch": 1.709557732264767, + "grad_norm": 0.23802898169647893, + "learning_rate": 7.835040236950455e-07, + "loss": 0.5491, + "step": 2162 + }, + { + "epoch": 1.710349262887108, + "grad_norm": 0.2449931462774046, + "learning_rate": 7.82692485711679e-07, + "loss": 0.5539, + "step": 2163 + }, + { + "epoch": 1.711140793509449, + "grad_norm": 0.2370948135147026, + "learning_rate": 7.8188109791128e-07, + "loss": 0.5529, + "step": 2164 + }, + { + "epoch": 1.7119323241317899, + "grad_norm": 0.24817303915339273, + "learning_rate": 7.810698608546046e-07, + "loss": 0.5572, + "step": 2165 + }, + { + "epoch": 1.7127238547541308, + "grad_norm": 0.23626202654201317, + "learning_rate": 7.802587751023055e-07, + "loss": 0.5571, + "step": 2166 + }, + { + "epoch": 1.7135153853764717, + "grad_norm": 0.23842992920460468, + "learning_rate": 7.794478412149311e-07, + "loss": 0.5556, + "step": 2167 + }, + { + "epoch": 1.7143069159988127, + "grad_norm": 0.24572744490954124, + "learning_rate": 7.786370597529233e-07, + "loss": 0.5509, + "step": 2168 + }, + { + "epoch": 1.7150984466211536, + "grad_norm": 0.23783734579789367, + "learning_rate": 7.778264312766203e-07, + "loss": 0.5642, + "step": 2169 + }, + { + "epoch": 1.7158899772434946, + "grad_norm": 0.24430720648740292, + "learning_rate": 7.770159563462537e-07, + "loss": 0.5657, + "step": 2170 + }, + { + "epoch": 1.7166815078658355, + "grad_norm": 0.23201775490562962, + "learning_rate": 7.762056355219493e-07, + "loss": 0.5535, + "step": 2171 + }, + { + "epoch": 1.7174730384881767, + "grad_norm": 0.23706798034542195, + "learning_rate": 7.753954693637262e-07, + "loss": 0.5505, + "step": 2172 + }, + { + "epoch": 1.7182645691105174, + "grad_norm": 0.23849100010630137, + "learning_rate": 7.745854584314975e-07, + "loss": 0.5614, + "step": 2173 + }, + { + "epoch": 1.7190560997328586, + "grad_norm": 0.24251874043226324, + "learning_rate": 7.737756032850674e-07, + "loss": 0.562, + "step": 2174 + }, + { + "epoch": 1.7198476303551993, + "grad_norm": 0.23995004484046845, + "learning_rate": 7.729659044841333e-07, + "loss": 0.5527, + "step": 2175 + }, + { + "epoch": 1.7206391609775404, + "grad_norm": 0.24590505624862533, + "learning_rate": 7.721563625882848e-07, + "loss": 0.5582, + "step": 2176 + }, + { + "epoch": 1.7214306915998812, + "grad_norm": 0.24601574831899525, + "learning_rate": 7.713469781570029e-07, + "loss": 0.5554, + "step": 2177 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.24380639161955578, + "learning_rate": 7.705377517496596e-07, + "loss": 0.554, + "step": 2178 + }, + { + "epoch": 1.723013752844563, + "grad_norm": 0.2407684830091183, + "learning_rate": 7.697286839255178e-07, + "loss": 0.5533, + "step": 2179 + }, + { + "epoch": 1.7238052834669042, + "grad_norm": 0.23927455389202604, + "learning_rate": 7.689197752437305e-07, + "loss": 0.5408, + "step": 2180 + }, + { + "epoch": 1.724596814089245, + "grad_norm": 0.23757612892397442, + "learning_rate": 7.68111026263341e-07, + "loss": 0.5464, + "step": 2181 + }, + { + "epoch": 1.725388344711586, + "grad_norm": 0.24157743935413045, + "learning_rate": 7.673024375432827e-07, + "loss": 0.5578, + "step": 2182 + }, + { + "epoch": 1.726179875333927, + "grad_norm": 0.2340409671932611, + "learning_rate": 7.664940096423776e-07, + "loss": 0.5564, + "step": 2183 + }, + { + "epoch": 1.726971405956268, + "grad_norm": 0.23546781514033982, + "learning_rate": 7.656857431193363e-07, + "loss": 0.5582, + "step": 2184 + }, + { + "epoch": 1.727762936578609, + "grad_norm": 0.24013399422650472, + "learning_rate": 7.648776385327585e-07, + "loss": 0.5644, + "step": 2185 + }, + { + "epoch": 1.7285544672009499, + "grad_norm": 0.24190172048969363, + "learning_rate": 7.640696964411315e-07, + "loss": 0.5537, + "step": 2186 + }, + { + "epoch": 1.7293459978232908, + "grad_norm": 0.24171309768875512, + "learning_rate": 7.632619174028313e-07, + "loss": 0.5354, + "step": 2187 + }, + { + "epoch": 1.7301375284456317, + "grad_norm": 0.24035832391791973, + "learning_rate": 7.624543019761201e-07, + "loss": 0.5545, + "step": 2188 + }, + { + "epoch": 1.7309290590679727, + "grad_norm": 0.2406686377007751, + "learning_rate": 7.616468507191471e-07, + "loss": 0.5572, + "step": 2189 + }, + { + "epoch": 1.7317205896903136, + "grad_norm": 0.2427060129926691, + "learning_rate": 7.608395641899486e-07, + "loss": 0.559, + "step": 2190 + }, + { + "epoch": 1.7325121203126546, + "grad_norm": 0.23709023805563265, + "learning_rate": 7.600324429464466e-07, + "loss": 0.555, + "step": 2191 + }, + { + "epoch": 1.7333036509349955, + "grad_norm": 0.24132054975928716, + "learning_rate": 7.592254875464496e-07, + "loss": 0.5638, + "step": 2192 + }, + { + "epoch": 1.7340951815573367, + "grad_norm": 0.24230958726738466, + "learning_rate": 7.584186985476504e-07, + "loss": 0.5615, + "step": 2193 + }, + { + "epoch": 1.7348867121796774, + "grad_norm": 0.23703804551595645, + "learning_rate": 7.576120765076279e-07, + "loss": 0.5619, + "step": 2194 + }, + { + "epoch": 1.7356782428020185, + "grad_norm": 0.24522677182745897, + "learning_rate": 7.568056219838444e-07, + "loss": 0.5613, + "step": 2195 + }, + { + "epoch": 1.7364697734243593, + "grad_norm": 0.24551047082297947, + "learning_rate": 7.559993355336473e-07, + "loss": 0.5777, + "step": 2196 + }, + { + "epoch": 1.7372613040467004, + "grad_norm": 0.24554824554869178, + "learning_rate": 7.55193217714268e-07, + "loss": 0.557, + "step": 2197 + }, + { + "epoch": 1.7380528346690411, + "grad_norm": 0.24293205701393464, + "learning_rate": 7.543872690828206e-07, + "loss": 0.5303, + "step": 2198 + }, + { + "epoch": 1.7388443652913823, + "grad_norm": 0.24302354125452177, + "learning_rate": 7.535814901963031e-07, + "loss": 0.5454, + "step": 2199 + }, + { + "epoch": 1.739635895913723, + "grad_norm": 0.2431861330386916, + "learning_rate": 7.527758816115953e-07, + "loss": 0.5453, + "step": 2200 + }, + { + "epoch": 1.7404274265360642, + "grad_norm": 0.23923352331044176, + "learning_rate": 7.519704438854598e-07, + "loss": 0.5636, + "step": 2201 + }, + { + "epoch": 1.741218957158405, + "grad_norm": 0.23972811842788172, + "learning_rate": 7.511651775745413e-07, + "loss": 0.5326, + "step": 2202 + }, + { + "epoch": 1.742010487780746, + "grad_norm": 0.24985436390755797, + "learning_rate": 7.503600832353656e-07, + "loss": 0.5644, + "step": 2203 + }, + { + "epoch": 1.742802018403087, + "grad_norm": 0.2424396509739881, + "learning_rate": 7.495551614243404e-07, + "loss": 0.5453, + "step": 2204 + }, + { + "epoch": 1.743593549025428, + "grad_norm": 0.24083600712817166, + "learning_rate": 7.487504126977528e-07, + "loss": 0.5458, + "step": 2205 + }, + { + "epoch": 1.744385079647769, + "grad_norm": 0.23736625232490574, + "learning_rate": 7.479458376117715e-07, + "loss": 0.5559, + "step": 2206 + }, + { + "epoch": 1.7451766102701098, + "grad_norm": 0.2496008231762276, + "learning_rate": 7.471414367224449e-07, + "loss": 0.5501, + "step": 2207 + }, + { + "epoch": 1.7459681408924508, + "grad_norm": 0.2416875135646901, + "learning_rate": 7.463372105857008e-07, + "loss": 0.5551, + "step": 2208 + }, + { + "epoch": 1.7467596715147917, + "grad_norm": 0.23910205162063805, + "learning_rate": 7.455331597573467e-07, + "loss": 0.5546, + "step": 2209 + }, + { + "epoch": 1.7475512021371327, + "grad_norm": 0.23642334161783982, + "learning_rate": 7.447292847930677e-07, + "loss": 0.5406, + "step": 2210 + }, + { + "epoch": 1.7483427327594736, + "grad_norm": 0.2402782176963215, + "learning_rate": 7.439255862484288e-07, + "loss": 0.5556, + "step": 2211 + }, + { + "epoch": 1.7491342633818145, + "grad_norm": 0.24319025847672862, + "learning_rate": 7.431220646788728e-07, + "loss": 0.5604, + "step": 2212 + }, + { + "epoch": 1.7499257940041555, + "grad_norm": 0.24183606139928246, + "learning_rate": 7.423187206397194e-07, + "loss": 0.561, + "step": 2213 + }, + { + "epoch": 1.7507173246264967, + "grad_norm": 0.23647144330481903, + "learning_rate": 7.415155546861667e-07, + "loss": 0.5508, + "step": 2214 + }, + { + "epoch": 1.7515088552488374, + "grad_norm": 0.24047185581119493, + "learning_rate": 7.407125673732884e-07, + "loss": 0.5695, + "step": 2215 + }, + { + "epoch": 1.7523003858711785, + "grad_norm": 0.24779995107088315, + "learning_rate": 7.399097592560359e-07, + "loss": 0.5609, + "step": 2216 + }, + { + "epoch": 1.7530919164935193, + "grad_norm": 0.23624790745492177, + "learning_rate": 7.39107130889236e-07, + "loss": 0.5554, + "step": 2217 + }, + { + "epoch": 1.7538834471158604, + "grad_norm": 0.24288340429453822, + "learning_rate": 7.38304682827592e-07, + "loss": 0.5552, + "step": 2218 + }, + { + "epoch": 1.7546749777382011, + "grad_norm": 0.23878211179399783, + "learning_rate": 7.37502415625682e-07, + "loss": 0.5525, + "step": 2219 + }, + { + "epoch": 1.7554665083605423, + "grad_norm": 0.24104183274827198, + "learning_rate": 7.367003298379597e-07, + "loss": 0.549, + "step": 2220 + }, + { + "epoch": 1.756258038982883, + "grad_norm": 0.23501424265923035, + "learning_rate": 7.358984260187526e-07, + "loss": 0.5657, + "step": 2221 + }, + { + "epoch": 1.7570495696052242, + "grad_norm": 0.2372833396939188, + "learning_rate": 7.350967047222627e-07, + "loss": 0.5619, + "step": 2222 + }, + { + "epoch": 1.757841100227565, + "grad_norm": 0.23553083697812366, + "learning_rate": 7.342951665025663e-07, + "loss": 0.5366, + "step": 2223 + }, + { + "epoch": 1.758632630849906, + "grad_norm": 0.23548253077937908, + "learning_rate": 7.334938119136127e-07, + "loss": 0.5563, + "step": 2224 + }, + { + "epoch": 1.759424161472247, + "grad_norm": 0.23773384962064845, + "learning_rate": 7.326926415092252e-07, + "loss": 0.55, + "step": 2225 + }, + { + "epoch": 1.760215692094588, + "grad_norm": 0.24050851822944924, + "learning_rate": 7.318916558430984e-07, + "loss": 0.569, + "step": 2226 + }, + { + "epoch": 1.7610072227169289, + "grad_norm": 0.23953819099930582, + "learning_rate": 7.310908554687998e-07, + "loss": 0.5495, + "step": 2227 + }, + { + "epoch": 1.7617987533392698, + "grad_norm": 0.24258422612890876, + "learning_rate": 7.302902409397693e-07, + "loss": 0.5422, + "step": 2228 + }, + { + "epoch": 1.7625902839616108, + "grad_norm": 0.23696866660215898, + "learning_rate": 7.294898128093179e-07, + "loss": 0.5662, + "step": 2229 + }, + { + "epoch": 1.7633818145839517, + "grad_norm": 0.2361512263061421, + "learning_rate": 7.286895716306284e-07, + "loss": 0.5499, + "step": 2230 + }, + { + "epoch": 1.7641733452062927, + "grad_norm": 0.24095374873912595, + "learning_rate": 7.278895179567533e-07, + "loss": 0.5498, + "step": 2231 + }, + { + "epoch": 1.7649648758286336, + "grad_norm": 0.23322150218966986, + "learning_rate": 7.27089652340616e-07, + "loss": 0.5526, + "step": 2232 + }, + { + "epoch": 1.7657564064509745, + "grad_norm": 0.24547347021758917, + "learning_rate": 7.262899753350104e-07, + "loss": 0.5477, + "step": 2233 + }, + { + "epoch": 1.7665479370733155, + "grad_norm": 0.2428990539423586, + "learning_rate": 7.254904874925999e-07, + "loss": 0.5612, + "step": 2234 + }, + { + "epoch": 1.7673394676956566, + "grad_norm": 0.24596385189491438, + "learning_rate": 7.246911893659167e-07, + "loss": 0.5489, + "step": 2235 + }, + { + "epoch": 1.7681309983179974, + "grad_norm": 0.24544528922836092, + "learning_rate": 7.238920815073621e-07, + "loss": 0.5585, + "step": 2236 + }, + { + "epoch": 1.7689225289403385, + "grad_norm": 0.2380119745734608, + "learning_rate": 7.230931644692057e-07, + "loss": 0.5519, + "step": 2237 + }, + { + "epoch": 1.7697140595626792, + "grad_norm": 0.23862840909204722, + "learning_rate": 7.222944388035859e-07, + "loss": 0.57, + "step": 2238 + }, + { + "epoch": 1.7705055901850204, + "grad_norm": 0.23283778262407775, + "learning_rate": 7.214959050625085e-07, + "loss": 0.5456, + "step": 2239 + }, + { + "epoch": 1.7712971208073611, + "grad_norm": 0.2374660754544533, + "learning_rate": 7.20697563797846e-07, + "loss": 0.5622, + "step": 2240 + }, + { + "epoch": 1.7720886514297023, + "grad_norm": 0.23978966065378568, + "learning_rate": 7.198994155613395e-07, + "loss": 0.5546, + "step": 2241 + }, + { + "epoch": 1.772880182052043, + "grad_norm": 0.2514758149552116, + "learning_rate": 7.191014609045945e-07, + "loss": 0.5608, + "step": 2242 + }, + { + "epoch": 1.7736717126743842, + "grad_norm": 0.239362411396253, + "learning_rate": 7.183037003790842e-07, + "loss": 0.5596, + "step": 2243 + }, + { + "epoch": 1.774463243296725, + "grad_norm": 0.23869763089069795, + "learning_rate": 7.175061345361479e-07, + "loss": 0.5588, + "step": 2244 + }, + { + "epoch": 1.775254773919066, + "grad_norm": 0.22948716001022224, + "learning_rate": 7.167087639269889e-07, + "loss": 0.5463, + "step": 2245 + }, + { + "epoch": 1.776046304541407, + "grad_norm": 0.23597151449855, + "learning_rate": 7.159115891026773e-07, + "loss": 0.5551, + "step": 2246 + }, + { + "epoch": 1.776837835163748, + "grad_norm": 0.23445464954183326, + "learning_rate": 7.151146106141461e-07, + "loss": 0.5483, + "step": 2247 + }, + { + "epoch": 1.7776293657860889, + "grad_norm": 0.24079164518017426, + "learning_rate": 7.143178290121941e-07, + "loss": 0.5798, + "step": 2248 + }, + { + "epoch": 1.7784208964084298, + "grad_norm": 0.24548032825300617, + "learning_rate": 7.135212448474836e-07, + "loss": 0.5673, + "step": 2249 + }, + { + "epoch": 1.7792124270307708, + "grad_norm": 0.24462915858719766, + "learning_rate": 7.127248586705397e-07, + "loss": 0.564, + "step": 2250 + }, + { + "epoch": 1.7800039576531117, + "grad_norm": 0.25313748158719196, + "learning_rate": 7.11928671031752e-07, + "loss": 0.5662, + "step": 2251 + }, + { + "epoch": 1.7807954882754526, + "grad_norm": 0.2509929007772327, + "learning_rate": 7.111326824813716e-07, + "loss": 0.5525, + "step": 2252 + }, + { + "epoch": 1.7815870188977936, + "grad_norm": 0.2392518481507003, + "learning_rate": 7.103368935695127e-07, + "loss": 0.5615, + "step": 2253 + }, + { + "epoch": 1.7823785495201345, + "grad_norm": 0.23491375049248236, + "learning_rate": 7.095413048461516e-07, + "loss": 0.5434, + "step": 2254 + }, + { + "epoch": 1.7831700801424755, + "grad_norm": 0.24024078798241236, + "learning_rate": 7.087459168611259e-07, + "loss": 0.5448, + "step": 2255 + }, + { + "epoch": 1.7839616107648166, + "grad_norm": 0.2439628732463763, + "learning_rate": 7.079507301641348e-07, + "loss": 0.5556, + "step": 2256 + }, + { + "epoch": 1.7847531413871573, + "grad_norm": 0.23730279363087756, + "learning_rate": 7.071557453047378e-07, + "loss": 0.5615, + "step": 2257 + }, + { + "epoch": 1.7855446720094985, + "grad_norm": 0.23959308056939885, + "learning_rate": 7.063609628323555e-07, + "loss": 0.5593, + "step": 2258 + }, + { + "epoch": 1.7863362026318392, + "grad_norm": 0.23250071223661176, + "learning_rate": 7.055663832962688e-07, + "loss": 0.5662, + "step": 2259 + }, + { + "epoch": 1.7871277332541804, + "grad_norm": 0.2376050572540389, + "learning_rate": 7.047720072456175e-07, + "loss": 0.5508, + "step": 2260 + }, + { + "epoch": 1.7879192638765211, + "grad_norm": 0.23641651237842087, + "learning_rate": 7.039778352294015e-07, + "loss": 0.5733, + "step": 2261 + }, + { + "epoch": 1.7887107944988623, + "grad_norm": 0.23982577133592545, + "learning_rate": 7.031838677964798e-07, + "loss": 0.5654, + "step": 2262 + }, + { + "epoch": 1.789502325121203, + "grad_norm": 0.2521637985444799, + "learning_rate": 7.023901054955691e-07, + "loss": 0.5689, + "step": 2263 + }, + { + "epoch": 1.7902938557435442, + "grad_norm": 0.24211730729014924, + "learning_rate": 7.015965488752451e-07, + "loss": 0.558, + "step": 2264 + }, + { + "epoch": 1.7910853863658849, + "grad_norm": 0.2496910755279375, + "learning_rate": 7.00803198483941e-07, + "loss": 0.5686, + "step": 2265 + }, + { + "epoch": 1.791876916988226, + "grad_norm": 0.23018708943321897, + "learning_rate": 7.000100548699476e-07, + "loss": 0.547, + "step": 2266 + }, + { + "epoch": 1.792668447610567, + "grad_norm": 0.2311351296522197, + "learning_rate": 6.992171185814137e-07, + "loss": 0.5619, + "step": 2267 + }, + { + "epoch": 1.793459978232908, + "grad_norm": 0.24818603262501357, + "learning_rate": 6.984243901663429e-07, + "loss": 0.5645, + "step": 2268 + }, + { + "epoch": 1.7942515088552489, + "grad_norm": 0.23417529603719672, + "learning_rate": 6.976318701725963e-07, + "loss": 0.5504, + "step": 2269 + }, + { + "epoch": 1.7950430394775898, + "grad_norm": 0.2513033097750515, + "learning_rate": 6.96839559147891e-07, + "loss": 0.5433, + "step": 2270 + }, + { + "epoch": 1.7958345700999307, + "grad_norm": 0.24566331048313653, + "learning_rate": 6.960474576397994e-07, + "loss": 0.5519, + "step": 2271 + }, + { + "epoch": 1.7966261007222717, + "grad_norm": 0.23738942204593105, + "learning_rate": 6.9525556619575e-07, + "loss": 0.5337, + "step": 2272 + }, + { + "epoch": 1.7974176313446126, + "grad_norm": 0.2366066672752667, + "learning_rate": 6.944638853630243e-07, + "loss": 0.558, + "step": 2273 + }, + { + "epoch": 1.7982091619669536, + "grad_norm": 0.2510111639222807, + "learning_rate": 6.936724156887595e-07, + "loss": 0.5508, + "step": 2274 + }, + { + "epoch": 1.7990006925892945, + "grad_norm": 0.23754972873567884, + "learning_rate": 6.928811577199467e-07, + "loss": 0.5647, + "step": 2275 + }, + { + "epoch": 1.7997922232116355, + "grad_norm": 0.238934512310531, + "learning_rate": 6.920901120034308e-07, + "loss": 0.5667, + "step": 2276 + }, + { + "epoch": 1.8005837538339764, + "grad_norm": 0.24434541535275991, + "learning_rate": 6.912992790859101e-07, + "loss": 0.5359, + "step": 2277 + }, + { + "epoch": 1.8013752844563173, + "grad_norm": 0.23673331978972978, + "learning_rate": 6.905086595139351e-07, + "loss": 0.5524, + "step": 2278 + }, + { + "epoch": 1.8021668150786585, + "grad_norm": 0.23799036271887128, + "learning_rate": 6.897182538339093e-07, + "loss": 0.5653, + "step": 2279 + }, + { + "epoch": 1.8029583457009992, + "grad_norm": 0.2430485402993927, + "learning_rate": 6.889280625920886e-07, + "loss": 0.5488, + "step": 2280 + }, + { + "epoch": 1.8037498763233404, + "grad_norm": 0.24479649312428692, + "learning_rate": 6.881380863345805e-07, + "loss": 0.5624, + "step": 2281 + }, + { + "epoch": 1.804541406945681, + "grad_norm": 0.24286408865509948, + "learning_rate": 6.87348325607344e-07, + "loss": 0.5424, + "step": 2282 + }, + { + "epoch": 1.8053329375680223, + "grad_norm": 0.2409483758178677, + "learning_rate": 6.865587809561893e-07, + "loss": 0.556, + "step": 2283 + }, + { + "epoch": 1.806124468190363, + "grad_norm": 0.24298280116705154, + "learning_rate": 6.857694529267763e-07, + "loss": 0.548, + "step": 2284 + }, + { + "epoch": 1.8069159988127041, + "grad_norm": 0.23905238009781168, + "learning_rate": 6.849803420646165e-07, + "loss": 0.5525, + "step": 2285 + }, + { + "epoch": 1.8077075294350449, + "grad_norm": 0.23604023844260322, + "learning_rate": 6.841914489150708e-07, + "loss": 0.5501, + "step": 2286 + }, + { + "epoch": 1.808499060057386, + "grad_norm": 0.23906787494105905, + "learning_rate": 6.834027740233492e-07, + "loss": 0.5565, + "step": 2287 + }, + { + "epoch": 1.809290590679727, + "grad_norm": 0.24029615943963029, + "learning_rate": 6.826143179345118e-07, + "loss": 0.5571, + "step": 2288 + }, + { + "epoch": 1.810082121302068, + "grad_norm": 0.24357915428979005, + "learning_rate": 6.818260811934664e-07, + "loss": 0.557, + "step": 2289 + }, + { + "epoch": 1.8108736519244089, + "grad_norm": 0.2498477135697297, + "learning_rate": 6.810380643449698e-07, + "loss": 0.5488, + "step": 2290 + }, + { + "epoch": 1.8116651825467498, + "grad_norm": 0.2459033357611074, + "learning_rate": 6.80250267933627e-07, + "loss": 0.5605, + "step": 2291 + }, + { + "epoch": 1.8124567131690907, + "grad_norm": 0.23632460590930843, + "learning_rate": 6.794626925038905e-07, + "loss": 0.5761, + "step": 2292 + }, + { + "epoch": 1.8132482437914317, + "grad_norm": 0.24900934919267886, + "learning_rate": 6.786753386000599e-07, + "loss": 0.5698, + "step": 2293 + }, + { + "epoch": 1.8140397744137726, + "grad_norm": 0.24210728450429736, + "learning_rate": 6.778882067662815e-07, + "loss": 0.5535, + "step": 2294 + }, + { + "epoch": 1.8148313050361136, + "grad_norm": 0.24209896764555133, + "learning_rate": 6.771012975465484e-07, + "loss": 0.557, + "step": 2295 + }, + { + "epoch": 1.8156228356584545, + "grad_norm": 0.23664483800454386, + "learning_rate": 6.763146114847006e-07, + "loss": 0.5587, + "step": 2296 + }, + { + "epoch": 1.8164143662807954, + "grad_norm": 0.23539531716987616, + "learning_rate": 6.755281491244224e-07, + "loss": 0.5552, + "step": 2297 + }, + { + "epoch": 1.8172058969031364, + "grad_norm": 0.23970394453645905, + "learning_rate": 6.747419110092449e-07, + "loss": 0.5496, + "step": 2298 + }, + { + "epoch": 1.8179974275254773, + "grad_norm": 0.23951359745776754, + "learning_rate": 6.73955897682543e-07, + "loss": 0.5459, + "step": 2299 + }, + { + "epoch": 1.8187889581478185, + "grad_norm": 0.23611766505624682, + "learning_rate": 6.731701096875368e-07, + "loss": 0.5638, + "step": 2300 + }, + { + "epoch": 1.8195804887701592, + "grad_norm": 0.24096794686089035, + "learning_rate": 6.723845475672914e-07, + "loss": 0.5517, + "step": 2301 + }, + { + "epoch": 1.8203720193925004, + "grad_norm": 0.2387576102864395, + "learning_rate": 6.715992118647142e-07, + "loss": 0.5554, + "step": 2302 + }, + { + "epoch": 1.821163550014841, + "grad_norm": 0.23919209056115712, + "learning_rate": 6.708141031225574e-07, + "loss": 0.5702, + "step": 2303 + }, + { + "epoch": 1.8219550806371823, + "grad_norm": 0.23916623288866273, + "learning_rate": 6.700292218834164e-07, + "loss": 0.5597, + "step": 2304 + }, + { + "epoch": 1.822746611259523, + "grad_norm": 0.23514975978391578, + "learning_rate": 6.692445686897281e-07, + "loss": 0.5449, + "step": 2305 + }, + { + "epoch": 1.8235381418818641, + "grad_norm": 0.235941676409706, + "learning_rate": 6.684601440837731e-07, + "loss": 0.5506, + "step": 2306 + }, + { + "epoch": 1.8243296725042049, + "grad_norm": 0.24204959008901106, + "learning_rate": 6.676759486076735e-07, + "loss": 0.5673, + "step": 2307 + }, + { + "epoch": 1.825121203126546, + "grad_norm": 0.2409770722540558, + "learning_rate": 6.668919828033929e-07, + "loss": 0.5469, + "step": 2308 + }, + { + "epoch": 1.825912733748887, + "grad_norm": 0.23392382638544185, + "learning_rate": 6.661082472127368e-07, + "loss": 0.5621, + "step": 2309 + }, + { + "epoch": 1.826704264371228, + "grad_norm": 0.23542651782289323, + "learning_rate": 6.653247423773507e-07, + "loss": 0.5548, + "step": 2310 + }, + { + "epoch": 1.8274957949935688, + "grad_norm": 0.23736117326812825, + "learning_rate": 6.645414688387212e-07, + "loss": 0.5618, + "step": 2311 + }, + { + "epoch": 1.8282873256159098, + "grad_norm": 0.24155604449826412, + "learning_rate": 6.637584271381749e-07, + "loss": 0.5447, + "step": 2312 + }, + { + "epoch": 1.8290788562382507, + "grad_norm": 0.23430895391792395, + "learning_rate": 6.629756178168783e-07, + "loss": 0.5708, + "step": 2313 + }, + { + "epoch": 1.8298703868605917, + "grad_norm": 0.24636770889127307, + "learning_rate": 6.621930414158374e-07, + "loss": 0.5645, + "step": 2314 + }, + { + "epoch": 1.8306619174829326, + "grad_norm": 0.23744470174473464, + "learning_rate": 6.614106984758965e-07, + "loss": 0.5563, + "step": 2315 + }, + { + "epoch": 1.8314534481052736, + "grad_norm": 0.2356690086717818, + "learning_rate": 6.606285895377397e-07, + "loss": 0.564, + "step": 2316 + }, + { + "epoch": 1.8322449787276145, + "grad_norm": 0.23488291816006648, + "learning_rate": 6.59846715141888e-07, + "loss": 0.5422, + "step": 2317 + }, + { + "epoch": 1.8330365093499554, + "grad_norm": 0.24926387103850173, + "learning_rate": 6.590650758287018e-07, + "loss": 0.5474, + "step": 2318 + }, + { + "epoch": 1.8338280399722964, + "grad_norm": 0.2341450028715121, + "learning_rate": 6.582836721383784e-07, + "loss": 0.5407, + "step": 2319 + }, + { + "epoch": 1.8346195705946373, + "grad_norm": 0.24123193482149877, + "learning_rate": 6.575025046109515e-07, + "loss": 0.5595, + "step": 2320 + }, + { + "epoch": 1.8354111012169785, + "grad_norm": 0.2389911607109484, + "learning_rate": 6.567215737862923e-07, + "loss": 0.5613, + "step": 2321 + }, + { + "epoch": 1.8362026318393192, + "grad_norm": 0.23176945932886384, + "learning_rate": 6.559408802041087e-07, + "loss": 0.55, + "step": 2322 + }, + { + "epoch": 1.8369941624616604, + "grad_norm": 0.24386516463106725, + "learning_rate": 6.551604244039441e-07, + "loss": 0.5684, + "step": 2323 + }, + { + "epoch": 1.837785693084001, + "grad_norm": 0.24491123056474223, + "learning_rate": 6.54380206925178e-07, + "loss": 0.5697, + "step": 2324 + }, + { + "epoch": 1.8385772237063422, + "grad_norm": 0.23959194903187608, + "learning_rate": 6.536002283070253e-07, + "loss": 0.5514, + "step": 2325 + }, + { + "epoch": 1.839368754328683, + "grad_norm": 0.23343505688377264, + "learning_rate": 6.528204890885342e-07, + "loss": 0.5588, + "step": 2326 + }, + { + "epoch": 1.8401602849510241, + "grad_norm": 0.24011311991343826, + "learning_rate": 6.520409898085899e-07, + "loss": 0.5658, + "step": 2327 + }, + { + "epoch": 1.8409518155733648, + "grad_norm": 0.23997235651902463, + "learning_rate": 6.512617310059101e-07, + "loss": 0.5457, + "step": 2328 + }, + { + "epoch": 1.841743346195706, + "grad_norm": 0.23664962533257064, + "learning_rate": 6.504827132190471e-07, + "loss": 0.5754, + "step": 2329 + }, + { + "epoch": 1.842534876818047, + "grad_norm": 0.2408846776707785, + "learning_rate": 6.497039369863866e-07, + "loss": 0.5515, + "step": 2330 + }, + { + "epoch": 1.843326407440388, + "grad_norm": 0.24200784743574702, + "learning_rate": 6.489254028461462e-07, + "loss": 0.5587, + "step": 2331 + }, + { + "epoch": 1.8441179380627288, + "grad_norm": 0.23869103910857442, + "learning_rate": 6.481471113363776e-07, + "loss": 0.5485, + "step": 2332 + }, + { + "epoch": 1.8449094686850698, + "grad_norm": 0.23916108197027652, + "learning_rate": 6.473690629949644e-07, + "loss": 0.5581, + "step": 2333 + }, + { + "epoch": 1.8457009993074107, + "grad_norm": 0.24265705116236377, + "learning_rate": 6.465912583596221e-07, + "loss": 0.5509, + "step": 2334 + }, + { + "epoch": 1.8464925299297517, + "grad_norm": 0.23998194252671326, + "learning_rate": 6.458136979678978e-07, + "loss": 0.554, + "step": 2335 + }, + { + "epoch": 1.8472840605520926, + "grad_norm": 0.24258965204353683, + "learning_rate": 6.450363823571691e-07, + "loss": 0.5508, + "step": 2336 + }, + { + "epoch": 1.8480755911744335, + "grad_norm": 0.24387263782873647, + "learning_rate": 6.442593120646456e-07, + "loss": 0.5532, + "step": 2337 + }, + { + "epoch": 1.8488671217967745, + "grad_norm": 0.24070305923056098, + "learning_rate": 6.434824876273668e-07, + "loss": 0.5654, + "step": 2338 + }, + { + "epoch": 1.8496586524191154, + "grad_norm": 0.23472031609301547, + "learning_rate": 6.427059095822019e-07, + "loss": 0.5594, + "step": 2339 + }, + { + "epoch": 1.8504501830414564, + "grad_norm": 0.23574981654708868, + "learning_rate": 6.419295784658511e-07, + "loss": 0.5433, + "step": 2340 + }, + { + "epoch": 1.8512417136637973, + "grad_norm": 0.2455516349980044, + "learning_rate": 6.411534948148421e-07, + "loss": 0.5616, + "step": 2341 + }, + { + "epoch": 1.8520332442861385, + "grad_norm": 0.23866347416769718, + "learning_rate": 6.403776591655328e-07, + "loss": 0.5549, + "step": 2342 + }, + { + "epoch": 1.8528247749084792, + "grad_norm": 0.23097699749081696, + "learning_rate": 6.396020720541097e-07, + "loss": 0.5491, + "step": 2343 + }, + { + "epoch": 1.8536163055308204, + "grad_norm": 0.23200056818734432, + "learning_rate": 6.38826734016587e-07, + "loss": 0.5645, + "step": 2344 + }, + { + "epoch": 1.854407836153161, + "grad_norm": 0.2417124508988092, + "learning_rate": 6.380516455888072e-07, + "loss": 0.5527, + "step": 2345 + }, + { + "epoch": 1.8551993667755022, + "grad_norm": 0.24046045058609652, + "learning_rate": 6.372768073064406e-07, + "loss": 0.5409, + "step": 2346 + }, + { + "epoch": 1.855990897397843, + "grad_norm": 0.23470534804207058, + "learning_rate": 6.365022197049832e-07, + "loss": 0.5492, + "step": 2347 + }, + { + "epoch": 1.8567824280201841, + "grad_norm": 0.2390242999424857, + "learning_rate": 6.357278833197594e-07, + "loss": 0.5525, + "step": 2348 + }, + { + "epoch": 1.8575739586425248, + "grad_norm": 0.2388123870630192, + "learning_rate": 6.349537986859189e-07, + "loss": 0.5519, + "step": 2349 + }, + { + "epoch": 1.858365489264866, + "grad_norm": 0.2354214384007597, + "learning_rate": 6.341799663384379e-07, + "loss": 0.5506, + "step": 2350 + }, + { + "epoch": 1.859157019887207, + "grad_norm": 0.24107434793571558, + "learning_rate": 6.334063868121188e-07, + "loss": 0.5609, + "step": 2351 + }, + { + "epoch": 1.8599485505095479, + "grad_norm": 0.24345399329835843, + "learning_rate": 6.326330606415875e-07, + "loss": 0.5561, + "step": 2352 + }, + { + "epoch": 1.8607400811318888, + "grad_norm": 0.23938860381305677, + "learning_rate": 6.318599883612968e-07, + "loss": 0.5533, + "step": 2353 + }, + { + "epoch": 1.8615316117542298, + "grad_norm": 0.24017682844728316, + "learning_rate": 6.310871705055224e-07, + "loss": 0.5547, + "step": 2354 + }, + { + "epoch": 1.8623231423765707, + "grad_norm": 0.2432670533361399, + "learning_rate": 6.303146076083654e-07, + "loss": 0.5762, + "step": 2355 + }, + { + "epoch": 1.8631146729989116, + "grad_norm": 0.2363381041473353, + "learning_rate": 6.295423002037507e-07, + "loss": 0.5681, + "step": 2356 + }, + { + "epoch": 1.8639062036212526, + "grad_norm": 0.24283339079057742, + "learning_rate": 6.287702488254252e-07, + "loss": 0.5604, + "step": 2357 + }, + { + "epoch": 1.8646977342435935, + "grad_norm": 0.2325040714927763, + "learning_rate": 6.279984540069602e-07, + "loss": 0.5595, + "step": 2358 + }, + { + "epoch": 1.8654892648659345, + "grad_norm": 0.24415758673149396, + "learning_rate": 6.272269162817494e-07, + "loss": 0.5716, + "step": 2359 + }, + { + "epoch": 1.8662807954882754, + "grad_norm": 0.2428025755069738, + "learning_rate": 6.264556361830084e-07, + "loss": 0.5538, + "step": 2360 + }, + { + "epoch": 1.8670723261106164, + "grad_norm": 0.23313158379518548, + "learning_rate": 6.256846142437757e-07, + "loss": 0.5596, + "step": 2361 + }, + { + "epoch": 1.8678638567329573, + "grad_norm": 0.23042869370617095, + "learning_rate": 6.249138509969101e-07, + "loss": 0.5574, + "step": 2362 + }, + { + "epoch": 1.8686553873552985, + "grad_norm": 0.24431359866881122, + "learning_rate": 6.241433469750926e-07, + "loss": 0.5639, + "step": 2363 + }, + { + "epoch": 1.8694469179776392, + "grad_norm": 0.23373404423656063, + "learning_rate": 6.233731027108244e-07, + "loss": 0.5535, + "step": 2364 + }, + { + "epoch": 1.8702384485999803, + "grad_norm": 0.25129592335429896, + "learning_rate": 6.226031187364278e-07, + "loss": 0.5632, + "step": 2365 + }, + { + "epoch": 1.871029979222321, + "grad_norm": 0.2305640766257186, + "learning_rate": 6.218333955840451e-07, + "loss": 0.553, + "step": 2366 + }, + { + "epoch": 1.8718215098446622, + "grad_norm": 0.23897738138262625, + "learning_rate": 6.210639337856376e-07, + "loss": 0.5408, + "step": 2367 + }, + { + "epoch": 1.872613040467003, + "grad_norm": 0.2380331124139679, + "learning_rate": 6.20294733872987e-07, + "loss": 0.5518, + "step": 2368 + }, + { + "epoch": 1.873404571089344, + "grad_norm": 0.2406179368363775, + "learning_rate": 6.19525796377693e-07, + "loss": 0.5719, + "step": 2369 + }, + { + "epoch": 1.8741961017116848, + "grad_norm": 0.24157444867831274, + "learning_rate": 6.187571218311748e-07, + "loss": 0.5535, + "step": 2370 + }, + { + "epoch": 1.874987632334026, + "grad_norm": 0.2409576246223222, + "learning_rate": 6.179887107646698e-07, + "loss": 0.5587, + "step": 2371 + }, + { + "epoch": 1.8757791629563667, + "grad_norm": 0.24402319965739602, + "learning_rate": 6.172205637092329e-07, + "loss": 0.5508, + "step": 2372 + }, + { + "epoch": 1.8765706935787079, + "grad_norm": 0.2365119379994907, + "learning_rate": 6.164526811957363e-07, + "loss": 0.5598, + "step": 2373 + }, + { + "epoch": 1.8773622242010488, + "grad_norm": 0.2526132954101458, + "learning_rate": 6.156850637548701e-07, + "loss": 0.5718, + "step": 2374 + }, + { + "epoch": 1.8781537548233898, + "grad_norm": 0.24510438664346818, + "learning_rate": 6.149177119171407e-07, + "loss": 0.5497, + "step": 2375 + }, + { + "epoch": 1.8789452854457307, + "grad_norm": 0.24376795264398032, + "learning_rate": 6.141506262128715e-07, + "loss": 0.554, + "step": 2376 + }, + { + "epoch": 1.8797368160680716, + "grad_norm": 0.23899904359061272, + "learning_rate": 6.133838071722017e-07, + "loss": 0.5432, + "step": 2377 + }, + { + "epoch": 1.8805283466904126, + "grad_norm": 0.24760673460820454, + "learning_rate": 6.126172553250852e-07, + "loss": 0.5603, + "step": 2378 + }, + { + "epoch": 1.8813198773127535, + "grad_norm": 0.24439397727668621, + "learning_rate": 6.118509712012927e-07, + "loss": 0.5667, + "step": 2379 + }, + { + "epoch": 1.8821114079350945, + "grad_norm": 0.23570472393437591, + "learning_rate": 6.110849553304091e-07, + "loss": 0.5487, + "step": 2380 + }, + { + "epoch": 1.8829029385574354, + "grad_norm": 0.2396300432209492, + "learning_rate": 6.103192082418347e-07, + "loss": 0.5571, + "step": 2381 + }, + { + "epoch": 1.8836944691797763, + "grad_norm": 0.2359323187904901, + "learning_rate": 6.095537304647832e-07, + "loss": 0.5584, + "step": 2382 + }, + { + "epoch": 1.8844859998021173, + "grad_norm": 0.24259348781786255, + "learning_rate": 6.087885225282818e-07, + "loss": 0.5483, + "step": 2383 + }, + { + "epoch": 1.8852775304244584, + "grad_norm": 0.23938494203387234, + "learning_rate": 6.080235849611726e-07, + "loss": 0.5554, + "step": 2384 + }, + { + "epoch": 1.8860690610467992, + "grad_norm": 0.23551924646792072, + "learning_rate": 6.072589182921095e-07, + "loss": 0.5498, + "step": 2385 + }, + { + "epoch": 1.8868605916691403, + "grad_norm": 0.2370825388647084, + "learning_rate": 6.064945230495604e-07, + "loss": 0.5759, + "step": 2386 + }, + { + "epoch": 1.887652122291481, + "grad_norm": 0.236241511559059, + "learning_rate": 6.057303997618049e-07, + "loss": 0.5632, + "step": 2387 + }, + { + "epoch": 1.8884436529138222, + "grad_norm": 0.23687697173096994, + "learning_rate": 6.049665489569342e-07, + "loss": 0.5522, + "step": 2388 + }, + { + "epoch": 1.889235183536163, + "grad_norm": 0.2561281312211803, + "learning_rate": 6.04202971162852e-07, + "loss": 0.5401, + "step": 2389 + }, + { + "epoch": 1.890026714158504, + "grad_norm": 0.23887935233284158, + "learning_rate": 6.034396669072732e-07, + "loss": 0.553, + "step": 2390 + }, + { + "epoch": 1.8908182447808448, + "grad_norm": 0.24033422577314426, + "learning_rate": 6.026766367177232e-07, + "loss": 0.5634, + "step": 2391 + }, + { + "epoch": 1.891609775403186, + "grad_norm": 0.23369547767029367, + "learning_rate": 6.019138811215385e-07, + "loss": 0.537, + "step": 2392 + }, + { + "epoch": 1.8924013060255267, + "grad_norm": 0.23490532270272227, + "learning_rate": 6.01151400645866e-07, + "loss": 0.5571, + "step": 2393 + }, + { + "epoch": 1.8931928366478679, + "grad_norm": 0.2312910104944147, + "learning_rate": 6.003891958176612e-07, + "loss": 0.5607, + "step": 2394 + }, + { + "epoch": 1.8939843672702088, + "grad_norm": 0.23983094007084702, + "learning_rate": 5.996272671636909e-07, + "loss": 0.5445, + "step": 2395 + }, + { + "epoch": 1.8947758978925497, + "grad_norm": 0.24533562974078352, + "learning_rate": 5.988656152105292e-07, + "loss": 0.5649, + "step": 2396 + }, + { + "epoch": 1.8955674285148907, + "grad_norm": 0.24138758840329555, + "learning_rate": 5.981042404845602e-07, + "loss": 0.547, + "step": 2397 + }, + { + "epoch": 1.8963589591372316, + "grad_norm": 0.25398257402202423, + "learning_rate": 5.973431435119768e-07, + "loss": 0.5688, + "step": 2398 + }, + { + "epoch": 1.8971504897595726, + "grad_norm": 0.23325624017989163, + "learning_rate": 5.965823248187782e-07, + "loss": 0.5582, + "step": 2399 + }, + { + "epoch": 1.8979420203819135, + "grad_norm": 0.2405526192243757, + "learning_rate": 5.95821784930773e-07, + "loss": 0.5285, + "step": 2400 + }, + { + "epoch": 1.8987335510042544, + "grad_norm": 0.2433360587435491, + "learning_rate": 5.950615243735757e-07, + "loss": 0.557, + "step": 2401 + }, + { + "epoch": 1.8995250816265954, + "grad_norm": 0.26265562627560823, + "learning_rate": 5.943015436726091e-07, + "loss": 0.5465, + "step": 2402 + }, + { + "epoch": 1.9003166122489363, + "grad_norm": 0.23714179154726767, + "learning_rate": 5.935418433531022e-07, + "loss": 0.5653, + "step": 2403 + }, + { + "epoch": 1.9011081428712773, + "grad_norm": 0.2379637823598576, + "learning_rate": 5.927824239400892e-07, + "loss": 0.5566, + "step": 2404 + }, + { + "epoch": 1.9018996734936184, + "grad_norm": 0.24168197952379378, + "learning_rate": 5.920232859584115e-07, + "loss": 0.559, + "step": 2405 + }, + { + "epoch": 1.9026912041159592, + "grad_norm": 0.2364198883706187, + "learning_rate": 5.912644299327151e-07, + "loss": 0.5751, + "step": 2406 + }, + { + "epoch": 1.9034827347383003, + "grad_norm": 0.2461630194769159, + "learning_rate": 5.905058563874517e-07, + "loss": 0.544, + "step": 2407 + }, + { + "epoch": 1.904274265360641, + "grad_norm": 0.23906810091061048, + "learning_rate": 5.897475658468778e-07, + "loss": 0.5563, + "step": 2408 + }, + { + "epoch": 1.9050657959829822, + "grad_norm": 0.23987543128055613, + "learning_rate": 5.889895588350535e-07, + "loss": 0.5491, + "step": 2409 + }, + { + "epoch": 1.905857326605323, + "grad_norm": 0.2355722118955971, + "learning_rate": 5.88231835875844e-07, + "loss": 0.5519, + "step": 2410 + }, + { + "epoch": 1.906648857227664, + "grad_norm": 0.2358550161853384, + "learning_rate": 5.874743974929175e-07, + "loss": 0.5517, + "step": 2411 + }, + { + "epoch": 1.9074403878500048, + "grad_norm": 0.24080033229834438, + "learning_rate": 5.867172442097457e-07, + "loss": 0.5496, + "step": 2412 + }, + { + "epoch": 1.908231918472346, + "grad_norm": 0.22818664797067068, + "learning_rate": 5.859603765496036e-07, + "loss": 0.544, + "step": 2413 + }, + { + "epoch": 1.9090234490946867, + "grad_norm": 0.24635159524151135, + "learning_rate": 5.852037950355684e-07, + "loss": 0.5536, + "step": 2414 + }, + { + "epoch": 1.9098149797170278, + "grad_norm": 0.24616316436901883, + "learning_rate": 5.844475001905196e-07, + "loss": 0.5595, + "step": 2415 + }, + { + "epoch": 1.9106065103393688, + "grad_norm": 0.2368376079471897, + "learning_rate": 5.836914925371384e-07, + "loss": 0.5354, + "step": 2416 + }, + { + "epoch": 1.9113980409617097, + "grad_norm": 0.237265488635935, + "learning_rate": 5.829357725979083e-07, + "loss": 0.5644, + "step": 2417 + }, + { + "epoch": 1.9121895715840507, + "grad_norm": 0.2367113836381169, + "learning_rate": 5.821803408951125e-07, + "loss": 0.5449, + "step": 2418 + }, + { + "epoch": 1.9129811022063916, + "grad_norm": 0.23783758483957596, + "learning_rate": 5.814251979508373e-07, + "loss": 0.548, + "step": 2419 + }, + { + "epoch": 1.9137726328287326, + "grad_norm": 0.23645640597685252, + "learning_rate": 5.806703442869667e-07, + "loss": 0.5488, + "step": 2420 + }, + { + "epoch": 1.9145641634510735, + "grad_norm": 0.24353731461583766, + "learning_rate": 5.799157804251868e-07, + "loss": 0.5484, + "step": 2421 + }, + { + "epoch": 1.9153556940734144, + "grad_norm": 0.235479765701421, + "learning_rate": 5.791615068869828e-07, + "loss": 0.5651, + "step": 2422 + }, + { + "epoch": 1.9161472246957554, + "grad_norm": 0.24182147611689045, + "learning_rate": 5.784075241936387e-07, + "loss": 0.5543, + "step": 2423 + }, + { + "epoch": 1.9169387553180963, + "grad_norm": 0.234808676925544, + "learning_rate": 5.77653832866239e-07, + "loss": 0.5561, + "step": 2424 + }, + { + "epoch": 1.9177302859404373, + "grad_norm": 0.23768948880263918, + "learning_rate": 5.769004334256645e-07, + "loss": 0.5645, + "step": 2425 + }, + { + "epoch": 1.9185218165627784, + "grad_norm": 0.22840772589416633, + "learning_rate": 5.761473263925964e-07, + "loss": 0.5409, + "step": 2426 + }, + { + "epoch": 1.9193133471851191, + "grad_norm": 0.23969343364944234, + "learning_rate": 5.75394512287513e-07, + "loss": 0.5564, + "step": 2427 + }, + { + "epoch": 1.9201048778074603, + "grad_norm": 0.24547323203686924, + "learning_rate": 5.746419916306898e-07, + "loss": 0.5451, + "step": 2428 + }, + { + "epoch": 1.920896408429801, + "grad_norm": 0.23758892838939458, + "learning_rate": 5.738897649422005e-07, + "loss": 0.5494, + "step": 2429 + }, + { + "epoch": 1.9216879390521422, + "grad_norm": 0.250850511325333, + "learning_rate": 5.731378327419141e-07, + "loss": 0.5557, + "step": 2430 + }, + { + "epoch": 1.922479469674483, + "grad_norm": 0.2474183927050043, + "learning_rate": 5.723861955494977e-07, + "loss": 0.5575, + "step": 2431 + }, + { + "epoch": 1.923271000296824, + "grad_norm": 0.24081355385277323, + "learning_rate": 5.716348538844136e-07, + "loss": 0.5499, + "step": 2432 + }, + { + "epoch": 1.9240625309191648, + "grad_norm": 0.23944059010205088, + "learning_rate": 5.708838082659198e-07, + "loss": 0.5524, + "step": 2433 + }, + { + "epoch": 1.924854061541506, + "grad_norm": 0.24110743457635514, + "learning_rate": 5.701330592130704e-07, + "loss": 0.5526, + "step": 2434 + }, + { + "epoch": 1.9256455921638467, + "grad_norm": 0.24173994727975137, + "learning_rate": 5.693826072447139e-07, + "loss": 0.557, + "step": 2435 + }, + { + "epoch": 1.9264371227861878, + "grad_norm": 0.2464393142787827, + "learning_rate": 5.686324528794937e-07, + "loss": 0.5385, + "step": 2436 + }, + { + "epoch": 1.9272286534085288, + "grad_norm": 0.24029954615659102, + "learning_rate": 5.678825966358474e-07, + "loss": 0.5513, + "step": 2437 + }, + { + "epoch": 1.9280201840308697, + "grad_norm": 0.2348592985551055, + "learning_rate": 5.671330390320063e-07, + "loss": 0.5387, + "step": 2438 + }, + { + "epoch": 1.9288117146532107, + "grad_norm": 0.2427186720103737, + "learning_rate": 5.663837805859966e-07, + "loss": 0.5583, + "step": 2439 + }, + { + "epoch": 1.9296032452755516, + "grad_norm": 0.2377050876736877, + "learning_rate": 5.656348218156363e-07, + "loss": 0.5505, + "step": 2440 + }, + { + "epoch": 1.9303947758978925, + "grad_norm": 0.24523301092284647, + "learning_rate": 5.648861632385368e-07, + "loss": 0.5317, + "step": 2441 + }, + { + "epoch": 1.9311863065202335, + "grad_norm": 0.23440624414493083, + "learning_rate": 5.641378053721022e-07, + "loss": 0.5537, + "step": 2442 + }, + { + "epoch": 1.9319778371425744, + "grad_norm": 0.24180475598605752, + "learning_rate": 5.633897487335279e-07, + "loss": 0.5429, + "step": 2443 + }, + { + "epoch": 1.9327693677649154, + "grad_norm": 0.23849571115961757, + "learning_rate": 5.626419938398032e-07, + "loss": 0.5468, + "step": 2444 + }, + { + "epoch": 1.9335608983872563, + "grad_norm": 0.2460470566941156, + "learning_rate": 5.618945412077065e-07, + "loss": 0.5614, + "step": 2445 + }, + { + "epoch": 1.9343524290095973, + "grad_norm": 0.23681777891851846, + "learning_rate": 5.611473913538087e-07, + "loss": 0.5575, + "step": 2446 + }, + { + "epoch": 1.9351439596319384, + "grad_norm": 0.23023517497313897, + "learning_rate": 5.604005447944711e-07, + "loss": 0.5458, + "step": 2447 + }, + { + "epoch": 1.9359354902542791, + "grad_norm": 0.23989326935200456, + "learning_rate": 5.596540020458445e-07, + "loss": 0.5606, + "step": 2448 + }, + { + "epoch": 1.9367270208766203, + "grad_norm": 0.24559666397466526, + "learning_rate": 5.589077636238719e-07, + "loss": 0.5495, + "step": 2449 + }, + { + "epoch": 1.937518551498961, + "grad_norm": 0.2315148502891609, + "learning_rate": 5.58161830044284e-07, + "loss": 0.5523, + "step": 2450 + }, + { + "epoch": 1.9383100821213022, + "grad_norm": 0.24074998530427816, + "learning_rate": 5.574162018226015e-07, + "loss": 0.5455, + "step": 2451 + }, + { + "epoch": 1.939101612743643, + "grad_norm": 0.23730949762800163, + "learning_rate": 5.566708794741341e-07, + "loss": 0.5483, + "step": 2452 + }, + { + "epoch": 1.939893143365984, + "grad_norm": 0.24244224941050907, + "learning_rate": 5.559258635139796e-07, + "loss": 0.5654, + "step": 2453 + }, + { + "epoch": 1.9406846739883248, + "grad_norm": 0.24326649471398717, + "learning_rate": 5.551811544570255e-07, + "loss": 0.558, + "step": 2454 + }, + { + "epoch": 1.941476204610666, + "grad_norm": 0.25468287478151763, + "learning_rate": 5.544367528179456e-07, + "loss": 0.5591, + "step": 2455 + }, + { + "epoch": 1.9422677352330067, + "grad_norm": 0.23107902824037338, + "learning_rate": 5.536926591112018e-07, + "loss": 0.5448, + "step": 2456 + }, + { + "epoch": 1.9430592658553478, + "grad_norm": 0.24051650444495715, + "learning_rate": 5.529488738510434e-07, + "loss": 0.5637, + "step": 2457 + }, + { + "epoch": 1.9438507964776888, + "grad_norm": 0.23882938825699424, + "learning_rate": 5.52205397551506e-07, + "loss": 0.5531, + "step": 2458 + }, + { + "epoch": 1.9446423271000297, + "grad_norm": 0.23670699492810005, + "learning_rate": 5.514622307264125e-07, + "loss": 0.5665, + "step": 2459 + }, + { + "epoch": 1.9454338577223707, + "grad_norm": 0.23841907768128848, + "learning_rate": 5.507193738893714e-07, + "loss": 0.5495, + "step": 2460 + }, + { + "epoch": 1.9462253883447116, + "grad_norm": 0.23961571643571905, + "learning_rate": 5.49976827553777e-07, + "loss": 0.5494, + "step": 2461 + }, + { + "epoch": 1.9470169189670525, + "grad_norm": 0.24335249171704146, + "learning_rate": 5.492345922328089e-07, + "loss": 0.5661, + "step": 2462 + }, + { + "epoch": 1.9478084495893935, + "grad_norm": 0.2314967581467103, + "learning_rate": 5.484926684394316e-07, + "loss": 0.5583, + "step": 2463 + }, + { + "epoch": 1.9485999802117344, + "grad_norm": 0.239064802795224, + "learning_rate": 5.477510566863953e-07, + "loss": 0.541, + "step": 2464 + }, + { + "epoch": 1.9493915108340754, + "grad_norm": 0.2284958524975405, + "learning_rate": 5.470097574862334e-07, + "loss": 0.5603, + "step": 2465 + }, + { + "epoch": 1.9501830414564163, + "grad_norm": 0.23631318786547262, + "learning_rate": 5.462687713512637e-07, + "loss": 0.5551, + "step": 2466 + }, + { + "epoch": 1.9509745720787572, + "grad_norm": 0.2367226366344625, + "learning_rate": 5.455280987935879e-07, + "loss": 0.5578, + "step": 2467 + }, + { + "epoch": 1.9517661027010984, + "grad_norm": 0.2383950714885435, + "learning_rate": 5.447877403250901e-07, + "loss": 0.567, + "step": 2468 + }, + { + "epoch": 1.9525576333234391, + "grad_norm": 0.2350701691920177, + "learning_rate": 5.44047696457439e-07, + "loss": 0.54, + "step": 2469 + }, + { + "epoch": 1.9533491639457803, + "grad_norm": 0.23771857910109087, + "learning_rate": 5.433079677020842e-07, + "loss": 0.554, + "step": 2470 + }, + { + "epoch": 1.954140694568121, + "grad_norm": 0.23727350879604792, + "learning_rate": 5.425685545702585e-07, + "loss": 0.5467, + "step": 2471 + }, + { + "epoch": 1.9549322251904622, + "grad_norm": 0.22989087330747757, + "learning_rate": 5.418294575729758e-07, + "loss": 0.5455, + "step": 2472 + }, + { + "epoch": 1.9557237558128029, + "grad_norm": 0.23442124535998407, + "learning_rate": 5.410906772210325e-07, + "loss": 0.5519, + "step": 2473 + }, + { + "epoch": 1.956515286435144, + "grad_norm": 0.23784311728729576, + "learning_rate": 5.403522140250049e-07, + "loss": 0.5597, + "step": 2474 + }, + { + "epoch": 1.9573068170574848, + "grad_norm": 0.2397409728072015, + "learning_rate": 5.396140684952516e-07, + "loss": 0.5431, + "step": 2475 + }, + { + "epoch": 1.958098347679826, + "grad_norm": 0.2398035543797548, + "learning_rate": 5.388762411419104e-07, + "loss": 0.5614, + "step": 2476 + }, + { + "epoch": 1.9588898783021667, + "grad_norm": 0.23994136890298104, + "learning_rate": 5.381387324749005e-07, + "loss": 0.5554, + "step": 2477 + }, + { + "epoch": 1.9596814089245078, + "grad_norm": 0.24169417187721992, + "learning_rate": 5.374015430039191e-07, + "loss": 0.5536, + "step": 2478 + }, + { + "epoch": 1.9604729395468488, + "grad_norm": 0.23709731857020674, + "learning_rate": 5.366646732384437e-07, + "loss": 0.557, + "step": 2479 + }, + { + "epoch": 1.9612644701691897, + "grad_norm": 0.2402655953454577, + "learning_rate": 5.35928123687732e-07, + "loss": 0.551, + "step": 2480 + }, + { + "epoch": 1.9620560007915306, + "grad_norm": 0.24224574444439703, + "learning_rate": 5.351918948608181e-07, + "loss": 0.5646, + "step": 2481 + }, + { + "epoch": 1.9628475314138716, + "grad_norm": 0.2457113684861732, + "learning_rate": 5.344559872665168e-07, + "loss": 0.542, + "step": 2482 + }, + { + "epoch": 1.9636390620362125, + "grad_norm": 0.24704185219831643, + "learning_rate": 5.337204014134189e-07, + "loss": 0.5589, + "step": 2483 + }, + { + "epoch": 1.9644305926585535, + "grad_norm": 0.24402006383958874, + "learning_rate": 5.329851378098935e-07, + "loss": 0.557, + "step": 2484 + }, + { + "epoch": 1.9652221232808944, + "grad_norm": 0.23717589771406625, + "learning_rate": 5.322501969640882e-07, + "loss": 0.5657, + "step": 2485 + }, + { + "epoch": 1.9660136539032353, + "grad_norm": 0.24098898733719273, + "learning_rate": 5.315155793839252e-07, + "loss": 0.5568, + "step": 2486 + }, + { + "epoch": 1.9668051845255763, + "grad_norm": 0.23162980850798595, + "learning_rate": 5.307812855771064e-07, + "loss": 0.5632, + "step": 2487 + }, + { + "epoch": 1.9675967151479172, + "grad_norm": 0.2451998753327035, + "learning_rate": 5.300473160511065e-07, + "loss": 0.5519, + "step": 2488 + }, + { + "epoch": 1.9683882457702584, + "grad_norm": 0.24608389236797332, + "learning_rate": 5.29313671313178e-07, + "loss": 0.5385, + "step": 2489 + }, + { + "epoch": 1.9691797763925991, + "grad_norm": 0.23457900464866493, + "learning_rate": 5.285803518703493e-07, + "loss": 0.5576, + "step": 2490 + }, + { + "epoch": 1.9699713070149403, + "grad_norm": 0.2371699065569416, + "learning_rate": 5.27847358229423e-07, + "loss": 0.557, + "step": 2491 + }, + { + "epoch": 1.970762837637281, + "grad_norm": 0.24009406599812902, + "learning_rate": 5.27114690896977e-07, + "loss": 0.5525, + "step": 2492 + }, + { + "epoch": 1.9715543682596222, + "grad_norm": 0.23655243716817018, + "learning_rate": 5.263823503793634e-07, + "loss": 0.5609, + "step": 2493 + }, + { + "epoch": 1.9723458988819629, + "grad_norm": 0.24445936204627025, + "learning_rate": 5.256503371827085e-07, + "loss": 0.5435, + "step": 2494 + }, + { + "epoch": 1.973137429504304, + "grad_norm": 0.24478990006565868, + "learning_rate": 5.249186518129128e-07, + "loss": 0.5612, + "step": 2495 + }, + { + "epoch": 1.9739289601266448, + "grad_norm": 0.23624292044273565, + "learning_rate": 5.241872947756501e-07, + "loss": 0.5525, + "step": 2496 + }, + { + "epoch": 1.974720490748986, + "grad_norm": 0.23910063314875665, + "learning_rate": 5.234562665763662e-07, + "loss": 0.5534, + "step": 2497 + }, + { + "epoch": 1.9755120213713266, + "grad_norm": 0.23513898688014817, + "learning_rate": 5.227255677202821e-07, + "loss": 0.5469, + "step": 2498 + }, + { + "epoch": 1.9763035519936678, + "grad_norm": 0.24291059633629505, + "learning_rate": 5.219951987123878e-07, + "loss": 0.5585, + "step": 2499 + }, + { + "epoch": 1.9770950826160087, + "grad_norm": 0.2387987313855466, + "learning_rate": 5.212651600574487e-07, + "loss": 0.5384, + "step": 2500 + }, + { + "epoch": 1.9778866132383497, + "grad_norm": 0.2450052462307983, + "learning_rate": 5.205354522599998e-07, + "loss": 0.5673, + "step": 2501 + }, + { + "epoch": 1.9786781438606906, + "grad_norm": 0.23726334743327526, + "learning_rate": 5.198060758243475e-07, + "loss": 0.5511, + "step": 2502 + }, + { + "epoch": 1.9794696744830316, + "grad_norm": 0.241254532507191, + "learning_rate": 5.190770312545711e-07, + "loss": 0.5596, + "step": 2503 + }, + { + "epoch": 1.9802612051053725, + "grad_norm": 0.2536493764522025, + "learning_rate": 5.183483190545175e-07, + "loss": 0.5662, + "step": 2504 + }, + { + "epoch": 1.9810527357277135, + "grad_norm": 0.2359397363385922, + "learning_rate": 5.176199397278066e-07, + "loss": 0.5412, + "step": 2505 + }, + { + "epoch": 1.9818442663500544, + "grad_norm": 0.2353383907282585, + "learning_rate": 5.16891893777827e-07, + "loss": 0.5562, + "step": 2506 + }, + { + "epoch": 1.9826357969723953, + "grad_norm": 0.2420766802841231, + "learning_rate": 5.161641817077366e-07, + "loss": 0.5502, + "step": 2507 + }, + { + "epoch": 1.9834273275947363, + "grad_norm": 0.3428472738074613, + "learning_rate": 5.154368040204642e-07, + "loss": 0.5485, + "step": 2508 + }, + { + "epoch": 1.9842188582170772, + "grad_norm": 0.2383569482950343, + "learning_rate": 5.14709761218705e-07, + "loss": 0.567, + "step": 2509 + }, + { + "epoch": 1.9850103888394184, + "grad_norm": 0.2412249723452846, + "learning_rate": 5.13983053804925e-07, + "loss": 0.5726, + "step": 2510 + }, + { + "epoch": 1.985801919461759, + "grad_norm": 0.24101319591698722, + "learning_rate": 5.132566822813574e-07, + "loss": 0.5551, + "step": 2511 + }, + { + "epoch": 1.9865934500841003, + "grad_norm": 0.23809317461331223, + "learning_rate": 5.125306471500027e-07, + "loss": 0.5561, + "step": 2512 + }, + { + "epoch": 1.987384980706441, + "grad_norm": 0.24225892795271026, + "learning_rate": 5.118049489126311e-07, + "loss": 0.5441, + "step": 2513 + }, + { + "epoch": 1.9881765113287821, + "grad_norm": 0.2320756177212838, + "learning_rate": 5.110795880707766e-07, + "loss": 0.5407, + "step": 2514 + }, + { + "epoch": 1.9889680419511229, + "grad_norm": 0.23590815149266026, + "learning_rate": 5.103545651257434e-07, + "loss": 0.5581, + "step": 2515 + }, + { + "epoch": 1.989759572573464, + "grad_norm": 0.2310993563834982, + "learning_rate": 5.096298805786001e-07, + "loss": 0.5519, + "step": 2516 + }, + { + "epoch": 1.9905511031958047, + "grad_norm": 0.2465016184253646, + "learning_rate": 5.089055349301816e-07, + "loss": 0.5543, + "step": 2517 + }, + { + "epoch": 1.991342633818146, + "grad_norm": 0.24490968259524826, + "learning_rate": 5.081815286810902e-07, + "loss": 0.5473, + "step": 2518 + }, + { + "epoch": 1.9921341644404866, + "grad_norm": 0.2394550568566711, + "learning_rate": 5.074578623316907e-07, + "loss": 0.5717, + "step": 2519 + }, + { + "epoch": 1.9929256950628278, + "grad_norm": 0.2351647758535822, + "learning_rate": 5.067345363821162e-07, + "loss": 0.554, + "step": 2520 + }, + { + "epoch": 1.9937172256851687, + "grad_norm": 0.24227970138606264, + "learning_rate": 5.060115513322627e-07, + "loss": 0.5676, + "step": 2521 + }, + { + "epoch": 1.9945087563075097, + "grad_norm": 0.24532665453467145, + "learning_rate": 5.052889076817903e-07, + "loss": 0.5405, + "step": 2522 + }, + { + "epoch": 1.9953002869298506, + "grad_norm": 0.23808811316520417, + "learning_rate": 5.045666059301246e-07, + "loss": 0.5599, + "step": 2523 + }, + { + "epoch": 1.9960918175521916, + "grad_norm": 0.24540727646746247, + "learning_rate": 5.038446465764541e-07, + "loss": 0.5624, + "step": 2524 + }, + { + "epoch": 1.9968833481745325, + "grad_norm": 0.24379096659303265, + "learning_rate": 5.031230301197303e-07, + "loss": 0.5636, + "step": 2525 + }, + { + "epoch": 1.9976748787968734, + "grad_norm": 0.23696774982991786, + "learning_rate": 5.024017570586684e-07, + "loss": 0.5613, + "step": 2526 + }, + { + "epoch": 1.9984664094192144, + "grad_norm": 0.2438080404125615, + "learning_rate": 5.016808278917455e-07, + "loss": 0.5457, + "step": 2527 + }, + { + "epoch": 1.9992579400415553, + "grad_norm": 0.24294862123670521, + "learning_rate": 5.009602431172022e-07, + "loss": 0.5512, + "step": 2528 + }, + { + "epoch": 2.000049470663896, + "grad_norm": 0.23540041623911523, + "learning_rate": 5.002400032330403e-07, + "loss": 0.5471, + "step": 2529 + }, + { + "epoch": 2.000841001286237, + "grad_norm": 0.24136383377777998, + "learning_rate": 4.995201087370232e-07, + "loss": 0.5639, + "step": 2530 + }, + { + "epoch": 2.0016325319085784, + "grad_norm": 0.24016731049277348, + "learning_rate": 4.988005601266758e-07, + "loss": 0.5538, + "step": 2531 + }, + { + "epoch": 2.0016325319085784, + "eval_loss": 0.5885822772979736, + "eval_runtime": 2118.5562, + "eval_samples_per_second": 12.872, + "eval_steps_per_second": 1.609, + "step": 2531 + }, + { + "epoch": 2.0007420232500617, + "grad_norm": 0.24773872144799403, + "learning_rate": 4.980813578992835e-07, + "loss": 0.5381, + "step": 2532 + }, + { + "epoch": 2.0015335147167943, + "grad_norm": 0.2385073683372732, + "learning_rate": 4.973625025518934e-07, + "loss": 0.5313, + "step": 2533 + }, + { + "epoch": 2.002325006183527, + "grad_norm": 0.24022890170412228, + "learning_rate": 4.966439945813121e-07, + "loss": 0.5483, + "step": 2534 + }, + { + "epoch": 2.0031164976502596, + "grad_norm": 0.24169024205808562, + "learning_rate": 4.959258344841061e-07, + "loss": 0.5524, + "step": 2535 + }, + { + "epoch": 2.003907989116992, + "grad_norm": 0.23275002001052666, + "learning_rate": 4.952080227566014e-07, + "loss": 0.5295, + "step": 2536 + }, + { + "epoch": 2.004699480583725, + "grad_norm": 0.23000403348109522, + "learning_rate": 4.944905598948834e-07, + "loss": 0.5476, + "step": 2537 + }, + { + "epoch": 2.0054909720504575, + "grad_norm": 0.24300537805341524, + "learning_rate": 4.93773446394797e-07, + "loss": 0.5485, + "step": 2538 + }, + { + "epoch": 2.00628246351719, + "grad_norm": 0.24273370758876583, + "learning_rate": 4.930566827519447e-07, + "loss": 0.539, + "step": 2539 + }, + { + "epoch": 2.0070739549839227, + "grad_norm": 0.24281565983217607, + "learning_rate": 4.923402694616877e-07, + "loss": 0.5393, + "step": 2540 + }, + { + "epoch": 2.0078654464506553, + "grad_norm": 0.24177680184458014, + "learning_rate": 4.91624207019145e-07, + "loss": 0.5435, + "step": 2541 + }, + { + "epoch": 2.008656937917388, + "grad_norm": 0.24674405255012505, + "learning_rate": 4.909084959191925e-07, + "loss": 0.5552, + "step": 2542 + }, + { + "epoch": 2.0094484293841206, + "grad_norm": 0.23832192168061545, + "learning_rate": 4.901931366564649e-07, + "loss": 0.541, + "step": 2543 + }, + { + "epoch": 2.0102399208508532, + "grad_norm": 0.23311165779172743, + "learning_rate": 4.894781297253522e-07, + "loss": 0.5371, + "step": 2544 + }, + { + "epoch": 2.011031412317586, + "grad_norm": 0.23750875001267496, + "learning_rate": 4.887634756200015e-07, + "loss": 0.549, + "step": 2545 + }, + { + "epoch": 2.0118229037843185, + "grad_norm": 0.23927372447424175, + "learning_rate": 4.880491748343159e-07, + "loss": 0.5311, + "step": 2546 + }, + { + "epoch": 2.012614395251051, + "grad_norm": 0.23826738272169978, + "learning_rate": 4.873352278619542e-07, + "loss": 0.5356, + "step": 2547 + }, + { + "epoch": 2.0134058867177838, + "grad_norm": 0.2430163007596369, + "learning_rate": 4.866216351963316e-07, + "loss": 0.5495, + "step": 2548 + }, + { + "epoch": 2.0141973781845164, + "grad_norm": 0.23496655183307583, + "learning_rate": 4.859083973306171e-07, + "loss": 0.5374, + "step": 2549 + }, + { + "epoch": 2.014988869651249, + "grad_norm": 0.23169821982515804, + "learning_rate": 4.851955147577354e-07, + "loss": 0.5462, + "step": 2550 + }, + { + "epoch": 2.0157803611179816, + "grad_norm": 0.23525321337936167, + "learning_rate": 4.844829879703653e-07, + "loss": 0.5204, + "step": 2551 + }, + { + "epoch": 2.0165718525847143, + "grad_norm": 0.2412893577983348, + "learning_rate": 4.837708174609393e-07, + "loss": 0.5543, + "step": 2552 + }, + { + "epoch": 2.017363344051447, + "grad_norm": 0.24020913441771027, + "learning_rate": 4.830590037216449e-07, + "loss": 0.5492, + "step": 2553 + }, + { + "epoch": 2.0181548355181795, + "grad_norm": 0.2388973935767555, + "learning_rate": 4.823475472444221e-07, + "loss": 0.5522, + "step": 2554 + }, + { + "epoch": 2.018946326984912, + "grad_norm": 0.23177782154784332, + "learning_rate": 4.81636448520964e-07, + "loss": 0.5365, + "step": 2555 + }, + { + "epoch": 2.019737818451645, + "grad_norm": 0.23893233132437353, + "learning_rate": 4.809257080427166e-07, + "loss": 0.5405, + "step": 2556 + }, + { + "epoch": 2.0205293099183774, + "grad_norm": 0.24141766708765985, + "learning_rate": 4.802153263008782e-07, + "loss": 0.5479, + "step": 2557 + }, + { + "epoch": 2.02132080138511, + "grad_norm": 0.2333453756743298, + "learning_rate": 4.795053037863999e-07, + "loss": 0.5502, + "step": 2558 + }, + { + "epoch": 2.0221122928518427, + "grad_norm": 0.2383671275575243, + "learning_rate": 4.787956409899836e-07, + "loss": 0.5439, + "step": 2559 + }, + { + "epoch": 2.0229037843185753, + "grad_norm": 0.2361967368521129, + "learning_rate": 4.78086338402083e-07, + "loss": 0.5508, + "step": 2560 + }, + { + "epoch": 2.023695275785308, + "grad_norm": 0.23710023443369116, + "learning_rate": 4.773773965129028e-07, + "loss": 0.5514, + "step": 2561 + }, + { + "epoch": 2.0244867672520406, + "grad_norm": 0.24349644792421318, + "learning_rate": 4.76668815812398e-07, + "loss": 0.5489, + "step": 2562 + }, + { + "epoch": 2.025278258718773, + "grad_norm": 0.23121129785638533, + "learning_rate": 4.7596059679027536e-07, + "loss": 0.5465, + "step": 2563 + }, + { + "epoch": 2.026069750185506, + "grad_norm": 0.23738825335000968, + "learning_rate": 4.7525273993599015e-07, + "loss": 0.5532, + "step": 2564 + }, + { + "epoch": 2.0268612416522385, + "grad_norm": 0.2372106152865567, + "learning_rate": 4.7454524573874744e-07, + "loss": 0.544, + "step": 2565 + }, + { + "epoch": 2.027652733118971, + "grad_norm": 0.2367507110443533, + "learning_rate": 4.7383811468750356e-07, + "loss": 0.5398, + "step": 2566 + }, + { + "epoch": 2.0284442245857037, + "grad_norm": 0.23602386237918596, + "learning_rate": 4.7313134727096083e-07, + "loss": 0.5348, + "step": 2567 + }, + { + "epoch": 2.0292357160524364, + "grad_norm": 0.2420129410819989, + "learning_rate": 4.7242494397757303e-07, + "loss": 0.5344, + "step": 2568 + }, + { + "epoch": 2.030027207519169, + "grad_norm": 0.2380690253218786, + "learning_rate": 4.717189052955407e-07, + "loss": 0.5395, + "step": 2569 + }, + { + "epoch": 2.0308186989859016, + "grad_norm": 0.2409138666986156, + "learning_rate": 4.710132317128125e-07, + "loss": 0.5538, + "step": 2570 + }, + { + "epoch": 2.0316101904526342, + "grad_norm": 0.24516111762394113, + "learning_rate": 4.7030792371708625e-07, + "loss": 0.5539, + "step": 2571 + }, + { + "epoch": 2.032401681919367, + "grad_norm": 0.24333049370617602, + "learning_rate": 4.696029817958045e-07, + "loss": 0.5523, + "step": 2572 + }, + { + "epoch": 2.0331931733860995, + "grad_norm": 0.23243381498396162, + "learning_rate": 4.688984064361593e-07, + "loss": 0.5478, + "step": 2573 + }, + { + "epoch": 2.033984664852832, + "grad_norm": 0.23501751026576856, + "learning_rate": 4.6819419812508807e-07, + "loss": 0.5401, + "step": 2574 + }, + { + "epoch": 2.0347761563195648, + "grad_norm": 0.2370957909666972, + "learning_rate": 4.6749035734927434e-07, + "loss": 0.5365, + "step": 2575 + }, + { + "epoch": 2.0355676477862974, + "grad_norm": 0.2350543528277413, + "learning_rate": 4.6678688459514937e-07, + "loss": 0.5292, + "step": 2576 + }, + { + "epoch": 2.03635913925303, + "grad_norm": 0.23770123833645423, + "learning_rate": 4.6608378034888763e-07, + "loss": 0.5568, + "step": 2577 + }, + { + "epoch": 2.0371506307197627, + "grad_norm": 0.24308606133493194, + "learning_rate": 4.6538104509641007e-07, + "loss": 0.5452, + "step": 2578 + }, + { + "epoch": 2.0379421221864953, + "grad_norm": 0.2363203380658553, + "learning_rate": 4.6467867932338344e-07, + "loss": 0.5296, + "step": 2579 + }, + { + "epoch": 2.038733613653228, + "grad_norm": 0.24286814987173197, + "learning_rate": 4.639766835152177e-07, + "loss": 0.5401, + "step": 2580 + }, + { + "epoch": 2.0395251051199605, + "grad_norm": 0.23125599966751756, + "learning_rate": 4.632750581570687e-07, + "loss": 0.5508, + "step": 2581 + }, + { + "epoch": 2.040316596586693, + "grad_norm": 0.25038102575664556, + "learning_rate": 4.625738037338345e-07, + "loss": 0.5452, + "step": 2582 + }, + { + "epoch": 2.041108088053426, + "grad_norm": 0.2356423019517063, + "learning_rate": 4.618729207301575e-07, + "loss": 0.5423, + "step": 2583 + }, + { + "epoch": 2.0418995795201584, + "grad_norm": 0.23772180389882316, + "learning_rate": 4.611724096304244e-07, + "loss": 0.5395, + "step": 2584 + }, + { + "epoch": 2.042691070986891, + "grad_norm": 0.23939242487627102, + "learning_rate": 4.6047227091876305e-07, + "loss": 0.5283, + "step": 2585 + }, + { + "epoch": 2.0434825624536237, + "grad_norm": 0.24759837515967656, + "learning_rate": 4.597725050790461e-07, + "loss": 0.5594, + "step": 2586 + }, + { + "epoch": 2.0442740539203563, + "grad_norm": 0.238897498076543, + "learning_rate": 4.59073112594887e-07, + "loss": 0.544, + "step": 2587 + }, + { + "epoch": 2.045065545387089, + "grad_norm": 0.2385577619041336, + "learning_rate": 4.583740939496404e-07, + "loss": 0.5599, + "step": 2588 + }, + { + "epoch": 2.0458570368538216, + "grad_norm": 0.23923905909510285, + "learning_rate": 4.576754496264051e-07, + "loss": 0.548, + "step": 2589 + }, + { + "epoch": 2.046648528320554, + "grad_norm": 0.23462535030007492, + "learning_rate": 4.5697718010801877e-07, + "loss": 0.5519, + "step": 2590 + }, + { + "epoch": 2.047440019787287, + "grad_norm": 0.24857974456978588, + "learning_rate": 4.5627928587706197e-07, + "loss": 0.5494, + "step": 2591 + }, + { + "epoch": 2.0482315112540195, + "grad_norm": 0.23635724494456387, + "learning_rate": 4.5558176741585497e-07, + "loss": 0.55, + "step": 2592 + }, + { + "epoch": 2.049023002720752, + "grad_norm": 0.23616867778197356, + "learning_rate": 4.548846252064572e-07, + "loss": 0.551, + "step": 2593 + }, + { + "epoch": 2.0498144941874847, + "grad_norm": 0.2383207284099584, + "learning_rate": 4.541878597306704e-07, + "loss": 0.5335, + "step": 2594 + }, + { + "epoch": 2.0506059856542174, + "grad_norm": 0.23933865356005052, + "learning_rate": 4.534914714700345e-07, + "loss": 0.5545, + "step": 2595 + }, + { + "epoch": 2.05139747712095, + "grad_norm": 0.2475085228852165, + "learning_rate": 4.5279546090582855e-07, + "loss": 0.5459, + "step": 2596 + }, + { + "epoch": 2.0521889685876826, + "grad_norm": 0.23124372369872748, + "learning_rate": 4.5209982851907205e-07, + "loss": 0.5432, + "step": 2597 + }, + { + "epoch": 2.052980460054415, + "grad_norm": 0.23836302808378285, + "learning_rate": 4.5140457479052076e-07, + "loss": 0.5439, + "step": 2598 + }, + { + "epoch": 2.0537719515211474, + "grad_norm": 0.24070375492131732, + "learning_rate": 4.507097002006712e-07, + "loss": 0.5298, + "step": 2599 + }, + { + "epoch": 2.05456344298788, + "grad_norm": 0.24882750453944374, + "learning_rate": 4.5001520522975655e-07, + "loss": 0.5477, + "step": 2600 + }, + { + "epoch": 2.0553549344546127, + "grad_norm": 0.23998037775805112, + "learning_rate": 4.493210903577472e-07, + "loss": 0.5445, + "step": 2601 + }, + { + "epoch": 2.0561464259213453, + "grad_norm": 0.22944328523082502, + "learning_rate": 4.4862735606435286e-07, + "loss": 0.5391, + "step": 2602 + }, + { + "epoch": 2.056937917388078, + "grad_norm": 0.2375348653351975, + "learning_rate": 4.479340028290174e-07, + "loss": 0.5415, + "step": 2603 + }, + { + "epoch": 2.0577294088548106, + "grad_norm": 0.24575707157167756, + "learning_rate": 4.472410311309236e-07, + "loss": 0.5375, + "step": 2604 + }, + { + "epoch": 2.0585209003215432, + "grad_norm": 0.23996252446730126, + "learning_rate": 4.465484414489897e-07, + "loss": 0.5558, + "step": 2605 + }, + { + "epoch": 2.059312391788276, + "grad_norm": 0.22860285437539668, + "learning_rate": 4.458562342618696e-07, + "loss": 0.5521, + "step": 2606 + }, + { + "epoch": 2.0601038832550085, + "grad_norm": 0.23868020440907983, + "learning_rate": 4.4516441004795393e-07, + "loss": 0.5372, + "step": 2607 + }, + { + "epoch": 2.060895374721741, + "grad_norm": 0.24299037579234517, + "learning_rate": 4.444729692853675e-07, + "loss": 0.5387, + "step": 2608 + }, + { + "epoch": 2.0616868661884737, + "grad_norm": 0.23582683768296428, + "learning_rate": 4.437819124519706e-07, + "loss": 0.5398, + "step": 2609 + }, + { + "epoch": 2.0624783576552064, + "grad_norm": 0.2388253141147197, + "learning_rate": 4.4309124002535824e-07, + "loss": 0.554, + "step": 2610 + }, + { + "epoch": 2.063269849121939, + "grad_norm": 0.23809082107721263, + "learning_rate": 4.4240095248285923e-07, + "loss": 0.5452, + "step": 2611 + }, + { + "epoch": 2.0640613405886716, + "grad_norm": 0.24457771276029272, + "learning_rate": 4.4171105030153764e-07, + "loss": 0.5442, + "step": 2612 + }, + { + "epoch": 2.0648528320554043, + "grad_norm": 0.2327143037804538, + "learning_rate": 4.4102153395819e-07, + "loss": 0.5448, + "step": 2613 + }, + { + "epoch": 2.065644323522137, + "grad_norm": 0.23749102179802173, + "learning_rate": 4.4033240392934656e-07, + "loss": 0.5432, + "step": 2614 + }, + { + "epoch": 2.0664358149888695, + "grad_norm": 0.23936681376959112, + "learning_rate": 4.3964366069127055e-07, + "loss": 0.5578, + "step": 2615 + }, + { + "epoch": 2.067227306455602, + "grad_norm": 0.24895254802281508, + "learning_rate": 4.389553047199578e-07, + "loss": 0.5441, + "step": 2616 + }, + { + "epoch": 2.068018797922335, + "grad_norm": 0.24822475647937434, + "learning_rate": 4.382673364911372e-07, + "loss": 0.5389, + "step": 2617 + }, + { + "epoch": 2.0688102893890674, + "grad_norm": 0.23421098129477877, + "learning_rate": 4.375797564802689e-07, + "loss": 0.5378, + "step": 2618 + }, + { + "epoch": 2.0696017808558, + "grad_norm": 0.23370889073902257, + "learning_rate": 4.368925651625448e-07, + "loss": 0.5446, + "step": 2619 + }, + { + "epoch": 2.0703932723225327, + "grad_norm": 0.2420375741406236, + "learning_rate": 4.3620576301288834e-07, + "loss": 0.5523, + "step": 2620 + }, + { + "epoch": 2.0711847637892653, + "grad_norm": 0.23700570041043656, + "learning_rate": 4.355193505059537e-07, + "loss": 0.5402, + "step": 2621 + }, + { + "epoch": 2.071976255255998, + "grad_norm": 0.2401536630396451, + "learning_rate": 4.348333281161268e-07, + "loss": 0.5491, + "step": 2622 + }, + { + "epoch": 2.0727677467227306, + "grad_norm": 0.24256794326623432, + "learning_rate": 4.341476963175229e-07, + "loss": 0.5427, + "step": 2623 + }, + { + "epoch": 2.073559238189463, + "grad_norm": 0.24137579698066972, + "learning_rate": 4.3346245558398753e-07, + "loss": 0.5473, + "step": 2624 + }, + { + "epoch": 2.074350729656196, + "grad_norm": 0.2414256494195632, + "learning_rate": 4.327776063890962e-07, + "loss": 0.5502, + "step": 2625 + }, + { + "epoch": 2.0751422211229285, + "grad_norm": 0.2350478866333628, + "learning_rate": 4.3209314920615314e-07, + "loss": 0.5479, + "step": 2626 + }, + { + "epoch": 2.075933712589661, + "grad_norm": 0.24060422874115028, + "learning_rate": 4.314090845081931e-07, + "loss": 0.5416, + "step": 2627 + }, + { + "epoch": 2.0767252040563937, + "grad_norm": 0.24005254014681046, + "learning_rate": 4.307254127679782e-07, + "loss": 0.5334, + "step": 2628 + }, + { + "epoch": 2.0775166955231263, + "grad_norm": 0.23726947495812292, + "learning_rate": 4.300421344579996e-07, + "loss": 0.5365, + "step": 2629 + }, + { + "epoch": 2.078308186989859, + "grad_norm": 0.23920009310993925, + "learning_rate": 4.2935925005047624e-07, + "loss": 0.535, + "step": 2630 + }, + { + "epoch": 2.0790996784565916, + "grad_norm": 0.23120415369048664, + "learning_rate": 4.2867676001735486e-07, + "loss": 0.537, + "step": 2631 + }, + { + "epoch": 2.0798911699233242, + "grad_norm": 0.23700420227631172, + "learning_rate": 4.2799466483031033e-07, + "loss": 0.5442, + "step": 2632 + }, + { + "epoch": 2.080682661390057, + "grad_norm": 0.23965011423463708, + "learning_rate": 4.273129649607439e-07, + "loss": 0.5435, + "step": 2633 + }, + { + "epoch": 2.0814741528567895, + "grad_norm": 0.2346525230558761, + "learning_rate": 4.266316608797836e-07, + "loss": 0.5268, + "step": 2634 + }, + { + "epoch": 2.082265644323522, + "grad_norm": 0.23502062739655638, + "learning_rate": 4.2595075305828445e-07, + "loss": 0.5304, + "step": 2635 + }, + { + "epoch": 2.0830571357902548, + "grad_norm": 0.23455472138946823, + "learning_rate": 4.252702419668267e-07, + "loss": 0.5307, + "step": 2636 + }, + { + "epoch": 2.0838486272569874, + "grad_norm": 0.23408216408686333, + "learning_rate": 4.245901280757179e-07, + "loss": 0.5501, + "step": 2637 + }, + { + "epoch": 2.08464011872372, + "grad_norm": 0.23716650040576878, + "learning_rate": 4.2391041185498987e-07, + "loss": 0.5493, + "step": 2638 + }, + { + "epoch": 2.0854316101904526, + "grad_norm": 0.2376582721488692, + "learning_rate": 4.232310937743998e-07, + "loss": 0.5548, + "step": 2639 + }, + { + "epoch": 2.0862231016571853, + "grad_norm": 0.23617707987290446, + "learning_rate": 4.2255217430343005e-07, + "loss": 0.5238, + "step": 2640 + }, + { + "epoch": 2.087014593123918, + "grad_norm": 0.24201311469735495, + "learning_rate": 4.218736539112868e-07, + "loss": 0.5417, + "step": 2641 + }, + { + "epoch": 2.0878060845906505, + "grad_norm": 0.2377382925311404, + "learning_rate": 4.211955330669017e-07, + "loss": 0.5458, + "step": 2642 + }, + { + "epoch": 2.088597576057383, + "grad_norm": 0.23487792135662103, + "learning_rate": 4.205178122389291e-07, + "loss": 0.5457, + "step": 2643 + }, + { + "epoch": 2.089389067524116, + "grad_norm": 0.2407008574725615, + "learning_rate": 4.198404918957474e-07, + "loss": 0.5407, + "step": 2644 + }, + { + "epoch": 2.0901805589908484, + "grad_norm": 0.24245992583135814, + "learning_rate": 4.191635725054582e-07, + "loss": 0.5448, + "step": 2645 + }, + { + "epoch": 2.090972050457581, + "grad_norm": 0.23423440438327897, + "learning_rate": 4.184870545358854e-07, + "loss": 0.5296, + "step": 2646 + }, + { + "epoch": 2.0917635419243137, + "grad_norm": 0.23723200022314908, + "learning_rate": 4.178109384545767e-07, + "loss": 0.5424, + "step": 2647 + }, + { + "epoch": 2.0925550333910463, + "grad_norm": 0.24589976472930544, + "learning_rate": 4.1713522472880115e-07, + "loss": 0.544, + "step": 2648 + }, + { + "epoch": 2.093346524857779, + "grad_norm": 0.23780461189688573, + "learning_rate": 4.1645991382554945e-07, + "loss": 0.528, + "step": 2649 + }, + { + "epoch": 2.0941380163245116, + "grad_norm": 0.23329018650218644, + "learning_rate": 4.1578500621153556e-07, + "loss": 0.5597, + "step": 2650 + }, + { + "epoch": 2.094929507791244, + "grad_norm": 0.24396786237580703, + "learning_rate": 4.15110502353192e-07, + "loss": 0.5427, + "step": 2651 + }, + { + "epoch": 2.095720999257977, + "grad_norm": 0.2390954572392998, + "learning_rate": 4.144364027166749e-07, + "loss": 0.5389, + "step": 2652 + }, + { + "epoch": 2.0965124907247095, + "grad_norm": 0.2309555686291755, + "learning_rate": 4.137627077678596e-07, + "loss": 0.5372, + "step": 2653 + }, + { + "epoch": 2.097303982191442, + "grad_norm": 0.24066560664586562, + "learning_rate": 4.1308941797234174e-07, + "loss": 0.5585, + "step": 2654 + }, + { + "epoch": 2.0980954736581747, + "grad_norm": 0.23979703252462348, + "learning_rate": 4.124165337954384e-07, + "loss": 0.543, + "step": 2655 + }, + { + "epoch": 2.0988869651249074, + "grad_norm": 0.2462660341072815, + "learning_rate": 4.1174405570218375e-07, + "loss": 0.5466, + "step": 2656 + }, + { + "epoch": 2.09967845659164, + "grad_norm": 0.23253145245822232, + "learning_rate": 4.110719841573339e-07, + "loss": 0.5253, + "step": 2657 + }, + { + "epoch": 2.1004699480583726, + "grad_norm": 0.2399868131554026, + "learning_rate": 4.104003196253625e-07, + "loss": 0.5537, + "step": 2658 + }, + { + "epoch": 2.1012614395251052, + "grad_norm": 0.2323392761111308, + "learning_rate": 4.097290625704619e-07, + "loss": 0.5472, + "step": 2659 + }, + { + "epoch": 2.102052930991838, + "grad_norm": 0.24553950723811055, + "learning_rate": 4.0905821345654437e-07, + "loss": 0.5445, + "step": 2660 + }, + { + "epoch": 2.1028444224585705, + "grad_norm": 0.2428412102787307, + "learning_rate": 4.083877727472378e-07, + "loss": 0.5346, + "step": 2661 + }, + { + "epoch": 2.103635913925303, + "grad_norm": 0.235044009071209, + "learning_rate": 4.0771774090588994e-07, + "loss": 0.5267, + "step": 2662 + }, + { + "epoch": 2.1044274053920358, + "grad_norm": 0.23369441567291457, + "learning_rate": 4.0704811839556506e-07, + "loss": 0.5317, + "step": 2663 + }, + { + "epoch": 2.1052188968587684, + "grad_norm": 0.23642228760111267, + "learning_rate": 4.063789056790442e-07, + "loss": 0.5437, + "step": 2664 + }, + { + "epoch": 2.106010388325501, + "grad_norm": 0.24511624494914874, + "learning_rate": 4.0571010321882683e-07, + "loss": 0.5569, + "step": 2665 + }, + { + "epoch": 2.1068018797922337, + "grad_norm": 0.23807134925894802, + "learning_rate": 4.050417114771261e-07, + "loss": 0.5445, + "step": 2666 + }, + { + "epoch": 2.1075933712589663, + "grad_norm": 0.2450748062872641, + "learning_rate": 4.04373730915874e-07, + "loss": 0.5403, + "step": 2667 + }, + { + "epoch": 2.108384862725699, + "grad_norm": 0.22752898002700206, + "learning_rate": 4.037061619967169e-07, + "loss": 0.5344, + "step": 2668 + }, + { + "epoch": 2.1091763541924315, + "grad_norm": 0.235979027692674, + "learning_rate": 4.0303900518101685e-07, + "loss": 0.5514, + "step": 2669 + }, + { + "epoch": 2.109967845659164, + "grad_norm": 0.2358535907753104, + "learning_rate": 4.023722609298521e-07, + "loss": 0.5616, + "step": 2670 + }, + { + "epoch": 2.110759337125897, + "grad_norm": 0.23800463564472196, + "learning_rate": 4.0170592970401373e-07, + "loss": 0.5383, + "step": 2671 + }, + { + "epoch": 2.1115508285926294, + "grad_norm": 0.24083426114622639, + "learning_rate": 4.010400119640095e-07, + "loss": 0.5347, + "step": 2672 + }, + { + "epoch": 2.112342320059362, + "grad_norm": 0.24370066558921152, + "learning_rate": 4.0037450817006037e-07, + "loss": 0.5503, + "step": 2673 + }, + { + "epoch": 2.1131338115260947, + "grad_norm": 0.23224467494246734, + "learning_rate": 3.9970941878210076e-07, + "loss": 0.5505, + "step": 2674 + }, + { + "epoch": 2.1139253029928273, + "grad_norm": 0.23644803338116782, + "learning_rate": 3.990447442597801e-07, + "loss": 0.5529, + "step": 2675 + }, + { + "epoch": 2.11471679445956, + "grad_norm": 0.2391182606163028, + "learning_rate": 3.9838048506246e-07, + "loss": 0.5438, + "step": 2676 + }, + { + "epoch": 2.115508285926292, + "grad_norm": 0.235304197389743, + "learning_rate": 3.9771664164921516e-07, + "loss": 0.5535, + "step": 2677 + }, + { + "epoch": 2.1162997773930248, + "grad_norm": 0.23629465426666046, + "learning_rate": 3.970532144788332e-07, + "loss": 0.5467, + "step": 2678 + }, + { + "epoch": 2.1170912688597574, + "grad_norm": 0.23914376324070952, + "learning_rate": 3.963902040098137e-07, + "loss": 0.5376, + "step": 2679 + }, + { + "epoch": 2.11788276032649, + "grad_norm": 0.2416501276838314, + "learning_rate": 3.957276107003691e-07, + "loss": 0.5537, + "step": 2680 + }, + { + "epoch": 2.1186742517932227, + "grad_norm": 0.23513994997497853, + "learning_rate": 3.950654350084225e-07, + "loss": 0.5361, + "step": 2681 + }, + { + "epoch": 2.1194657432599553, + "grad_norm": 0.23945735733077111, + "learning_rate": 3.9440367739160894e-07, + "loss": 0.5435, + "step": 2682 + }, + { + "epoch": 2.120257234726688, + "grad_norm": 0.24057992870637823, + "learning_rate": 3.9374233830727453e-07, + "loss": 0.5642, + "step": 2683 + }, + { + "epoch": 2.1210487261934206, + "grad_norm": 0.23542654349540829, + "learning_rate": 3.9308141821247555e-07, + "loss": 0.5329, + "step": 2684 + }, + { + "epoch": 2.121840217660153, + "grad_norm": 0.23207027638096936, + "learning_rate": 3.924209175639799e-07, + "loss": 0.5372, + "step": 2685 + }, + { + "epoch": 2.122631709126886, + "grad_norm": 0.241740908022761, + "learning_rate": 3.9176083681826487e-07, + "loss": 0.5293, + "step": 2686 + }, + { + "epoch": 2.1234232005936184, + "grad_norm": 0.23303805496442506, + "learning_rate": 3.911011764315165e-07, + "loss": 0.5365, + "step": 2687 + }, + { + "epoch": 2.124214692060351, + "grad_norm": 0.23518516070727688, + "learning_rate": 3.9044193685963254e-07, + "loss": 0.5359, + "step": 2688 + }, + { + "epoch": 2.1250061835270837, + "grad_norm": 0.2312937285854721, + "learning_rate": 3.8978311855821787e-07, + "loss": 0.5273, + "step": 2689 + }, + { + "epoch": 2.1257976749938163, + "grad_norm": 0.240795395078905, + "learning_rate": 3.891247219825879e-07, + "loss": 0.545, + "step": 2690 + }, + { + "epoch": 2.126589166460549, + "grad_norm": 0.22992709791566163, + "learning_rate": 3.884667475877659e-07, + "loss": 0.5427, + "step": 2691 + }, + { + "epoch": 2.1273806579272816, + "grad_norm": 0.23547723324550332, + "learning_rate": 3.8780919582848194e-07, + "loss": 0.5518, + "step": 2692 + }, + { + "epoch": 2.1281721493940142, + "grad_norm": 0.2376023112368025, + "learning_rate": 3.871520671591766e-07, + "loss": 0.544, + "step": 2693 + }, + { + "epoch": 2.128963640860747, + "grad_norm": 0.2485137223948424, + "learning_rate": 3.864953620339959e-07, + "loss": 0.5314, + "step": 2694 + }, + { + "epoch": 2.1297551323274795, + "grad_norm": 0.23854155144796496, + "learning_rate": 3.8583908090679495e-07, + "loss": 0.553, + "step": 2695 + }, + { + "epoch": 2.130546623794212, + "grad_norm": 0.24172009255298904, + "learning_rate": 3.8518322423113423e-07, + "loss": 0.5381, + "step": 2696 + }, + { + "epoch": 2.1313381152609447, + "grad_norm": 0.24317528384140413, + "learning_rate": 3.84527792460282e-07, + "loss": 0.5581, + "step": 2697 + }, + { + "epoch": 2.1321296067276774, + "grad_norm": 0.23843831512216013, + "learning_rate": 3.83872786047212e-07, + "loss": 0.5414, + "step": 2698 + }, + { + "epoch": 2.13292109819441, + "grad_norm": 0.2375907806760504, + "learning_rate": 3.8321820544460427e-07, + "loss": 0.5536, + "step": 2699 + }, + { + "epoch": 2.1337125896611426, + "grad_norm": 0.23910046786798353, + "learning_rate": 3.8256405110484535e-07, + "loss": 0.5492, + "step": 2700 + }, + { + "epoch": 2.1345040811278753, + "grad_norm": 0.2314723668263214, + "learning_rate": 3.819103234800263e-07, + "loss": 0.5225, + "step": 2701 + }, + { + "epoch": 2.135295572594608, + "grad_norm": 0.23372676483647006, + "learning_rate": 3.812570230219433e-07, + "loss": 0.5447, + "step": 2702 + }, + { + "epoch": 2.1360870640613405, + "grad_norm": 0.24784857348868877, + "learning_rate": 3.806041501820979e-07, + "loss": 0.5445, + "step": 2703 + }, + { + "epoch": 2.136878555528073, + "grad_norm": 0.23839361925335847, + "learning_rate": 3.799517054116954e-07, + "loss": 0.5341, + "step": 2704 + }, + { + "epoch": 2.137670046994806, + "grad_norm": 0.2376321030641214, + "learning_rate": 3.7929968916164554e-07, + "loss": 0.5401, + "step": 2705 + }, + { + "epoch": 2.1384615384615384, + "grad_norm": 0.23541406069684748, + "learning_rate": 3.7864810188256255e-07, + "loss": 0.542, + "step": 2706 + }, + { + "epoch": 2.139253029928271, + "grad_norm": 0.23830045324424498, + "learning_rate": 3.779969440247632e-07, + "loss": 0.5509, + "step": 2707 + }, + { + "epoch": 2.1400445213950037, + "grad_norm": 0.23907942240071353, + "learning_rate": 3.773462160382681e-07, + "loss": 0.5471, + "step": 2708 + }, + { + "epoch": 2.1408360128617363, + "grad_norm": 0.2396199043443259, + "learning_rate": 3.766959183728006e-07, + "loss": 0.5434, + "step": 2709 + }, + { + "epoch": 2.141627504328469, + "grad_norm": 0.24243347753188646, + "learning_rate": 3.76046051477786e-07, + "loss": 0.5396, + "step": 2710 + }, + { + "epoch": 2.1424189957952016, + "grad_norm": 0.23490660986564252, + "learning_rate": 3.753966158023535e-07, + "loss": 0.5494, + "step": 2711 + }, + { + "epoch": 2.143210487261934, + "grad_norm": 0.23643994727318168, + "learning_rate": 3.7474761179533287e-07, + "loss": 0.5364, + "step": 2712 + }, + { + "epoch": 2.144001978728667, + "grad_norm": 0.23813687206078374, + "learning_rate": 3.7409903990525593e-07, + "loss": 0.5655, + "step": 2713 + }, + { + "epoch": 2.1447934701953995, + "grad_norm": 0.23658533828618422, + "learning_rate": 3.73450900580356e-07, + "loss": 0.5499, + "step": 2714 + }, + { + "epoch": 2.145584961662132, + "grad_norm": 0.2378604628576131, + "learning_rate": 3.72803194268567e-07, + "loss": 0.5421, + "step": 2715 + }, + { + "epoch": 2.1463764531288647, + "grad_norm": 0.23632934850343384, + "learning_rate": 3.721559214175246e-07, + "loss": 0.5422, + "step": 2716 + }, + { + "epoch": 2.1471679445955973, + "grad_norm": 0.23413369926924474, + "learning_rate": 3.715090824745637e-07, + "loss": 0.5408, + "step": 2717 + }, + { + "epoch": 2.14795943606233, + "grad_norm": 0.23640766332836996, + "learning_rate": 3.708626778867208e-07, + "loss": 0.5502, + "step": 2718 + }, + { + "epoch": 2.1487509275290626, + "grad_norm": 0.23533575904719933, + "learning_rate": 3.7021670810073023e-07, + "loss": 0.5363, + "step": 2719 + }, + { + "epoch": 2.1495424189957952, + "grad_norm": 0.24218923848920135, + "learning_rate": 3.6957117356302715e-07, + "loss": 0.5403, + "step": 2720 + }, + { + "epoch": 2.150333910462528, + "grad_norm": 0.236820793190202, + "learning_rate": 3.6892607471974603e-07, + "loss": 0.551, + "step": 2721 + }, + { + "epoch": 2.1511254019292605, + "grad_norm": 0.2410691061573119, + "learning_rate": 3.682814120167197e-07, + "loss": 0.5397, + "step": 2722 + }, + { + "epoch": 2.151916893395993, + "grad_norm": 0.23352531474515983, + "learning_rate": 3.6763718589947967e-07, + "loss": 0.5391, + "step": 2723 + }, + { + "epoch": 2.1527083848627258, + "grad_norm": 0.2365230560264958, + "learning_rate": 3.6699339681325567e-07, + "loss": 0.5392, + "step": 2724 + }, + { + "epoch": 2.1534998763294584, + "grad_norm": 0.2360532844037301, + "learning_rate": 3.663500452029755e-07, + "loss": 0.554, + "step": 2725 + }, + { + "epoch": 2.154291367796191, + "grad_norm": 0.2420408481557582, + "learning_rate": 3.6570713151326494e-07, + "loss": 0.5411, + "step": 2726 + }, + { + "epoch": 2.1550828592629236, + "grad_norm": 0.23247463401551066, + "learning_rate": 3.6506465618844664e-07, + "loss": 0.5322, + "step": 2727 + }, + { + "epoch": 2.1558743507296563, + "grad_norm": 0.23236867983238696, + "learning_rate": 3.644226196725404e-07, + "loss": 0.5312, + "step": 2728 + }, + { + "epoch": 2.156665842196389, + "grad_norm": 0.24358870002051564, + "learning_rate": 3.637810224092629e-07, + "loss": 0.5464, + "step": 2729 + }, + { + "epoch": 2.1574573336631215, + "grad_norm": 0.2364477616915208, + "learning_rate": 3.6313986484202666e-07, + "loss": 0.5411, + "step": 2730 + }, + { + "epoch": 2.158248825129854, + "grad_norm": 0.23768339433671795, + "learning_rate": 3.624991474139416e-07, + "loss": 0.5406, + "step": 2731 + }, + { + "epoch": 2.159040316596587, + "grad_norm": 0.24196534113116164, + "learning_rate": 3.618588705678124e-07, + "loss": 0.5455, + "step": 2732 + }, + { + "epoch": 2.1598318080633194, + "grad_norm": 0.2352209662500066, + "learning_rate": 3.6121903474613935e-07, + "loss": 0.528, + "step": 2733 + }, + { + "epoch": 2.160623299530052, + "grad_norm": 0.23640826060048364, + "learning_rate": 3.6057964039111834e-07, + "loss": 0.5527, + "step": 2734 + }, + { + "epoch": 2.1614147909967847, + "grad_norm": 0.23550542411389946, + "learning_rate": 3.599406879446394e-07, + "loss": 0.5534, + "step": 2735 + }, + { + "epoch": 2.1622062824635173, + "grad_norm": 0.23195874916767886, + "learning_rate": 3.593021778482884e-07, + "loss": 0.5356, + "step": 2736 + }, + { + "epoch": 2.16299777393025, + "grad_norm": 0.23604071060154344, + "learning_rate": 3.586641105433447e-07, + "loss": 0.5375, + "step": 2737 + }, + { + "epoch": 2.1637892653969826, + "grad_norm": 0.23808981850245756, + "learning_rate": 3.5802648647078107e-07, + "loss": 0.5546, + "step": 2738 + }, + { + "epoch": 2.164580756863715, + "grad_norm": 0.23328256503214606, + "learning_rate": 3.573893060712657e-07, + "loss": 0.5396, + "step": 2739 + }, + { + "epoch": 2.165372248330448, + "grad_norm": 0.23538990673898114, + "learning_rate": 3.567525697851579e-07, + "loss": 0.5349, + "step": 2740 + }, + { + "epoch": 2.1661637397971805, + "grad_norm": 0.24033583361609867, + "learning_rate": 3.561162780525122e-07, + "loss": 0.5375, + "step": 2741 + }, + { + "epoch": 2.166955231263913, + "grad_norm": 0.23752082108251496, + "learning_rate": 3.5548043131307437e-07, + "loss": 0.546, + "step": 2742 + }, + { + "epoch": 2.1677467227306457, + "grad_norm": 0.2420787082419534, + "learning_rate": 3.548450300062832e-07, + "loss": 0.5408, + "step": 2743 + }, + { + "epoch": 2.1685382141973784, + "grad_norm": 0.239040103059141, + "learning_rate": 3.542100745712704e-07, + "loss": 0.5351, + "step": 2744 + }, + { + "epoch": 2.169329705664111, + "grad_norm": 0.23789310410737252, + "learning_rate": 3.535755654468574e-07, + "loss": 0.5364, + "step": 2745 + }, + { + "epoch": 2.1701211971308436, + "grad_norm": 0.2509356328833936, + "learning_rate": 3.5294150307155965e-07, + "loss": 0.5467, + "step": 2746 + }, + { + "epoch": 2.1709126885975762, + "grad_norm": 0.23855493455386934, + "learning_rate": 3.5230788788358244e-07, + "loss": 0.5541, + "step": 2747 + }, + { + "epoch": 2.171704180064309, + "grad_norm": 0.2320380484076634, + "learning_rate": 3.5167472032082193e-07, + "loss": 0.5419, + "step": 2748 + }, + { + "epoch": 2.1724956715310415, + "grad_norm": 0.24097966011385014, + "learning_rate": 3.510420008208663e-07, + "loss": 0.5472, + "step": 2749 + }, + { + "epoch": 2.173287162997774, + "grad_norm": 0.24582658971565477, + "learning_rate": 3.504097298209918e-07, + "loss": 0.5444, + "step": 2750 + }, + { + "epoch": 2.1740786544645063, + "grad_norm": 0.2410878215392785, + "learning_rate": 3.4977790775816694e-07, + "loss": 0.5412, + "step": 2751 + }, + { + "epoch": 2.174870145931239, + "grad_norm": 0.23521436046476052, + "learning_rate": 3.491465350690487e-07, + "loss": 0.5375, + "step": 2752 + }, + { + "epoch": 2.1756616373979716, + "grad_norm": 0.23996796346983834, + "learning_rate": 3.4851561218998337e-07, + "loss": 0.5446, + "step": 2753 + }, + { + "epoch": 2.176453128864704, + "grad_norm": 0.23994080060671666, + "learning_rate": 3.4788513955700794e-07, + "loss": 0.5511, + "step": 2754 + }, + { + "epoch": 2.177244620331437, + "grad_norm": 0.24092313468444498, + "learning_rate": 3.472551176058458e-07, + "loss": 0.5527, + "step": 2755 + }, + { + "epoch": 2.1780361117981695, + "grad_norm": 0.24460037463040793, + "learning_rate": 3.4662554677191115e-07, + "loss": 0.5436, + "step": 2756 + }, + { + "epoch": 2.178827603264902, + "grad_norm": 0.2352021106165701, + "learning_rate": 3.45996427490305e-07, + "loss": 0.5514, + "step": 2757 + }, + { + "epoch": 2.1796190947316347, + "grad_norm": 0.23613075232978054, + "learning_rate": 3.453677601958166e-07, + "loss": 0.5373, + "step": 2758 + }, + { + "epoch": 2.1804105861983674, + "grad_norm": 0.23833584609528025, + "learning_rate": 3.4473954532292336e-07, + "loss": 0.5386, + "step": 2759 + }, + { + "epoch": 2.1812020776651, + "grad_norm": 0.24048857569727802, + "learning_rate": 3.441117833057896e-07, + "loss": 0.5381, + "step": 2760 + }, + { + "epoch": 2.1819935691318326, + "grad_norm": 0.2314595284658549, + "learning_rate": 3.4348447457826646e-07, + "loss": 0.5464, + "step": 2761 + }, + { + "epoch": 2.1827850605985653, + "grad_norm": 0.23498305258167132, + "learning_rate": 3.428576195738921e-07, + "loss": 0.5318, + "step": 2762 + }, + { + "epoch": 2.183576552065298, + "grad_norm": 0.23942660920743883, + "learning_rate": 3.4223121872589076e-07, + "loss": 0.5429, + "step": 2763 + }, + { + "epoch": 2.1843680435320305, + "grad_norm": 0.23629933727881583, + "learning_rate": 3.416052724671737e-07, + "loss": 0.5389, + "step": 2764 + }, + { + "epoch": 2.185159534998763, + "grad_norm": 0.2484889529273199, + "learning_rate": 3.409797812303371e-07, + "loss": 0.5379, + "step": 2765 + }, + { + "epoch": 2.1859510264654958, + "grad_norm": 0.2342655173341307, + "learning_rate": 3.4035474544766306e-07, + "loss": 0.5437, + "step": 2766 + }, + { + "epoch": 2.1867425179322284, + "grad_norm": 0.24166378020389223, + "learning_rate": 3.3973016555111865e-07, + "loss": 0.5357, + "step": 2767 + }, + { + "epoch": 2.187534009398961, + "grad_norm": 0.2332169509558839, + "learning_rate": 3.391060419723558e-07, + "loss": 0.5469, + "step": 2768 + }, + { + "epoch": 2.1883255008656937, + "grad_norm": 0.24274548354086564, + "learning_rate": 3.3848237514271196e-07, + "loss": 0.5408, + "step": 2769 + }, + { + "epoch": 2.1891169923324263, + "grad_norm": 0.23325643021174935, + "learning_rate": 3.3785916549320804e-07, + "loss": 0.5429, + "step": 2770 + }, + { + "epoch": 2.189908483799159, + "grad_norm": 0.23626208169532442, + "learning_rate": 3.372364134545491e-07, + "loss": 0.5428, + "step": 2771 + }, + { + "epoch": 2.1906999752658916, + "grad_norm": 0.23943777925930157, + "learning_rate": 3.3661411945712415e-07, + "loss": 0.5461, + "step": 2772 + }, + { + "epoch": 2.191491466732624, + "grad_norm": 0.23883505497909377, + "learning_rate": 3.359922839310053e-07, + "loss": 0.5426, + "step": 2773 + }, + { + "epoch": 2.192282958199357, + "grad_norm": 0.23593992088353088, + "learning_rate": 3.3537090730594864e-07, + "loss": 0.5539, + "step": 2774 + }, + { + "epoch": 2.1930744496660894, + "grad_norm": 0.24296391543886683, + "learning_rate": 3.3474999001139214e-07, + "loss": 0.5502, + "step": 2775 + }, + { + "epoch": 2.193865941132822, + "grad_norm": 0.23568483856630298, + "learning_rate": 3.341295324764569e-07, + "loss": 0.5388, + "step": 2776 + }, + { + "epoch": 2.1946574325995547, + "grad_norm": 0.23613563619882116, + "learning_rate": 3.3350953512994606e-07, + "loss": 0.5534, + "step": 2777 + }, + { + "epoch": 2.1954489240662873, + "grad_norm": 0.24378112398366505, + "learning_rate": 3.3288999840034425e-07, + "loss": 0.548, + "step": 2778 + }, + { + "epoch": 2.19624041553302, + "grad_norm": 0.23251859526158788, + "learning_rate": 3.3227092271581924e-07, + "loss": 0.5424, + "step": 2779 + }, + { + "epoch": 2.1970319069997526, + "grad_norm": 0.23370396151069003, + "learning_rate": 3.3165230850421864e-07, + "loss": 0.5496, + "step": 2780 + }, + { + "epoch": 2.1978233984664852, + "grad_norm": 0.24493607278999252, + "learning_rate": 3.3103415619307175e-07, + "loss": 0.5425, + "step": 2781 + }, + { + "epoch": 2.198614889933218, + "grad_norm": 0.2441568731723912, + "learning_rate": 3.304164662095886e-07, + "loss": 0.5433, + "step": 2782 + }, + { + "epoch": 2.1994063813999505, + "grad_norm": 0.23975135718916102, + "learning_rate": 3.297992389806594e-07, + "loss": 0.5467, + "step": 2783 + }, + { + "epoch": 2.200197872866683, + "grad_norm": 0.23788564669911225, + "learning_rate": 3.2918247493285556e-07, + "loss": 0.551, + "step": 2784 + }, + { + "epoch": 2.2009893643334157, + "grad_norm": 0.23649951902652033, + "learning_rate": 3.285661744924272e-07, + "loss": 0.5573, + "step": 2785 + }, + { + "epoch": 2.2017808558001484, + "grad_norm": 0.2345774311166057, + "learning_rate": 3.2795033808530445e-07, + "loss": 0.5507, + "step": 2786 + }, + { + "epoch": 2.202572347266881, + "grad_norm": 0.23692663247155593, + "learning_rate": 3.273349661370971e-07, + "loss": 0.5511, + "step": 2787 + }, + { + "epoch": 2.2033638387336136, + "grad_norm": 0.23441563157854112, + "learning_rate": 3.2672005907309297e-07, + "loss": 0.5446, + "step": 2788 + }, + { + "epoch": 2.2041553302003463, + "grad_norm": 0.23310473967990403, + "learning_rate": 3.261056173182601e-07, + "loss": 0.5402, + "step": 2789 + }, + { + "epoch": 2.204946821667079, + "grad_norm": 0.24161903549455568, + "learning_rate": 3.254916412972437e-07, + "loss": 0.5405, + "step": 2790 + }, + { + "epoch": 2.2057383131338115, + "grad_norm": 0.24051136701477105, + "learning_rate": 3.248781314343677e-07, + "loss": 0.5501, + "step": 2791 + }, + { + "epoch": 2.206529804600544, + "grad_norm": 0.2348785076805987, + "learning_rate": 3.242650881536335e-07, + "loss": 0.5424, + "step": 2792 + }, + { + "epoch": 2.207321296067277, + "grad_norm": 0.23919692370677437, + "learning_rate": 3.2365251187872e-07, + "loss": 0.5327, + "step": 2793 + }, + { + "epoch": 2.2081127875340094, + "grad_norm": 0.23552891269201812, + "learning_rate": 3.230404030329842e-07, + "loss": 0.545, + "step": 2794 + }, + { + "epoch": 2.208904279000742, + "grad_norm": 0.24049913539703127, + "learning_rate": 3.224287620394591e-07, + "loss": 0.5412, + "step": 2795 + }, + { + "epoch": 2.2096957704674747, + "grad_norm": 0.2350548822292516, + "learning_rate": 3.218175893208548e-07, + "loss": 0.5505, + "step": 2796 + }, + { + "epoch": 2.2104872619342073, + "grad_norm": 0.2340448502858718, + "learning_rate": 3.212068852995574e-07, + "loss": 0.5516, + "step": 2797 + }, + { + "epoch": 2.21127875340094, + "grad_norm": 0.23892522416827608, + "learning_rate": 3.205966503976294e-07, + "loss": 0.5501, + "step": 2798 + }, + { + "epoch": 2.2120702448676726, + "grad_norm": 0.23321367696906653, + "learning_rate": 3.199868850368095e-07, + "loss": 0.5481, + "step": 2799 + }, + { + "epoch": 2.212861736334405, + "grad_norm": 0.24740200464824078, + "learning_rate": 3.1937758963851105e-07, + "loss": 0.5539, + "step": 2800 + }, + { + "epoch": 2.213653227801138, + "grad_norm": 0.2316305960967838, + "learning_rate": 3.187687646238227e-07, + "loss": 0.5377, + "step": 2801 + }, + { + "epoch": 2.2144447192678705, + "grad_norm": 0.24081111597547666, + "learning_rate": 3.181604104135094e-07, + "loss": 0.5358, + "step": 2802 + }, + { + "epoch": 2.215236210734603, + "grad_norm": 0.23645305344607873, + "learning_rate": 3.1755252742800817e-07, + "loss": 0.5387, + "step": 2803 + }, + { + "epoch": 2.2160277022013357, + "grad_norm": 0.23275317040884852, + "learning_rate": 3.169451160874329e-07, + "loss": 0.5323, + "step": 2804 + }, + { + "epoch": 2.2168191936680683, + "grad_norm": 0.2334082778496196, + "learning_rate": 3.1633817681157005e-07, + "loss": 0.5423, + "step": 2805 + }, + { + "epoch": 2.217610685134801, + "grad_norm": 0.2427650789454405, + "learning_rate": 3.1573171001987996e-07, + "loss": 0.5258, + "step": 2806 + }, + { + "epoch": 2.2184021766015336, + "grad_norm": 0.23771869244918228, + "learning_rate": 3.1512571613149764e-07, + "loss": 0.5377, + "step": 2807 + }, + { + "epoch": 2.2191936680682662, + "grad_norm": 0.2415593130470467, + "learning_rate": 3.145201955652289e-07, + "loss": 0.534, + "step": 2808 + }, + { + "epoch": 2.219985159534999, + "grad_norm": 0.2340140824836716, + "learning_rate": 3.139151487395552e-07, + "loss": 0.5502, + "step": 2809 + }, + { + "epoch": 2.2207766510017315, + "grad_norm": 0.2401592208945918, + "learning_rate": 3.1331057607262854e-07, + "loss": 0.5316, + "step": 2810 + }, + { + "epoch": 2.221568142468464, + "grad_norm": 0.24120828857298485, + "learning_rate": 3.127064779822738e-07, + "loss": 0.5503, + "step": 2811 + }, + { + "epoch": 2.2223596339351968, + "grad_norm": 0.24117083049827034, + "learning_rate": 3.1210285488598897e-07, + "loss": 0.5647, + "step": 2812 + }, + { + "epoch": 2.2231511254019294, + "grad_norm": 0.23528875437046673, + "learning_rate": 3.114997072009419e-07, + "loss": 0.544, + "step": 2813 + }, + { + "epoch": 2.223942616868662, + "grad_norm": 0.23486315716716744, + "learning_rate": 3.108970353439727e-07, + "loss": 0.5389, + "step": 2814 + }, + { + "epoch": 2.2247341083353946, + "grad_norm": 0.23108068584911928, + "learning_rate": 3.102948397315934e-07, + "loss": 0.5139, + "step": 2815 + }, + { + "epoch": 2.2255255998021273, + "grad_norm": 0.23643343155253604, + "learning_rate": 3.0969312077998564e-07, + "loss": 0.5353, + "step": 2816 + }, + { + "epoch": 2.22631709126886, + "grad_norm": 0.24404072862273804, + "learning_rate": 3.090918789050031e-07, + "loss": 0.5417, + "step": 2817 + }, + { + "epoch": 2.2271085827355925, + "grad_norm": 0.234829036271383, + "learning_rate": 3.084911145221679e-07, + "loss": 0.5307, + "step": 2818 + }, + { + "epoch": 2.227900074202325, + "grad_norm": 0.23909490101015032, + "learning_rate": 3.0789082804667323e-07, + "loss": 0.5587, + "step": 2819 + }, + { + "epoch": 2.228691565669058, + "grad_norm": 0.24775122596074298, + "learning_rate": 3.0729101989338247e-07, + "loss": 0.5468, + "step": 2820 + }, + { + "epoch": 2.2294830571357904, + "grad_norm": 0.2342429049625052, + "learning_rate": 3.066916904768271e-07, + "loss": 0.5328, + "step": 2821 + }, + { + "epoch": 2.230274548602523, + "grad_norm": 0.24281846191245626, + "learning_rate": 3.060928402112096e-07, + "loss": 0.5346, + "step": 2822 + }, + { + "epoch": 2.2310660400692557, + "grad_norm": 0.2444610729786211, + "learning_rate": 3.0549446951039927e-07, + "loss": 0.5505, + "step": 2823 + }, + { + "epoch": 2.2318575315359883, + "grad_norm": 0.24526065759415705, + "learning_rate": 3.0489657878793473e-07, + "loss": 0.5463, + "step": 2824 + }, + { + "epoch": 2.232649023002721, + "grad_norm": 0.23808868335388847, + "learning_rate": 3.0429916845702387e-07, + "loss": 0.5461, + "step": 2825 + }, + { + "epoch": 2.2334405144694536, + "grad_norm": 0.24672306196776342, + "learning_rate": 3.0370223893054094e-07, + "loss": 0.5595, + "step": 2826 + }, + { + "epoch": 2.234232005936186, + "grad_norm": 0.2380471364714382, + "learning_rate": 3.0310579062102946e-07, + "loss": 0.5377, + "step": 2827 + }, + { + "epoch": 2.235023497402919, + "grad_norm": 0.23080436472819818, + "learning_rate": 3.025098239406998e-07, + "loss": 0.5486, + "step": 2828 + }, + { + "epoch": 2.2358149888696515, + "grad_norm": 0.23672293323577456, + "learning_rate": 3.0191433930142805e-07, + "loss": 0.5439, + "step": 2829 + }, + { + "epoch": 2.236606480336384, + "grad_norm": 0.2441709080931778, + "learning_rate": 3.0131933711475965e-07, + "loss": 0.5559, + "step": 2830 + }, + { + "epoch": 2.2373979718031167, + "grad_norm": 0.240840113760501, + "learning_rate": 3.007248177919048e-07, + "loss": 0.5478, + "step": 2831 + }, + { + "epoch": 2.2381894632698494, + "grad_norm": 0.24233695817539602, + "learning_rate": 3.001307817437406e-07, + "loss": 0.5394, + "step": 2832 + }, + { + "epoch": 2.2389809547365815, + "grad_norm": 0.24063609595208607, + "learning_rate": 2.995372293808108e-07, + "loss": 0.5471, + "step": 2833 + }, + { + "epoch": 2.239772446203314, + "grad_norm": 0.23924643250796646, + "learning_rate": 2.989441611133231e-07, + "loss": 0.5333, + "step": 2834 + }, + { + "epoch": 2.240563937670047, + "grad_norm": 0.24813417318421102, + "learning_rate": 2.9835157735115257e-07, + "loss": 0.5533, + "step": 2835 + }, + { + "epoch": 2.2413554291367794, + "grad_norm": 0.24649175721522854, + "learning_rate": 2.9775947850383844e-07, + "loss": 0.5354, + "step": 2836 + }, + { + "epoch": 2.242146920603512, + "grad_norm": 0.23923480918210283, + "learning_rate": 2.9716786498058446e-07, + "loss": 0.5319, + "step": 2837 + }, + { + "epoch": 2.2429384120702447, + "grad_norm": 0.23767840866812015, + "learning_rate": 2.9657673719026054e-07, + "loss": 0.5542, + "step": 2838 + }, + { + "epoch": 2.2437299035369773, + "grad_norm": 0.22920949091170925, + "learning_rate": 2.9598609554139854e-07, + "loss": 0.5314, + "step": 2839 + }, + { + "epoch": 2.24452139500371, + "grad_norm": 0.24646364189934578, + "learning_rate": 2.9539594044219673e-07, + "loss": 0.5409, + "step": 2840 + }, + { + "epoch": 2.2453128864704426, + "grad_norm": 0.23608561878992967, + "learning_rate": 2.948062723005155e-07, + "loss": 0.5242, + "step": 2841 + }, + { + "epoch": 2.246104377937175, + "grad_norm": 0.23586637749198103, + "learning_rate": 2.9421709152387906e-07, + "loss": 0.5418, + "step": 2842 + }, + { + "epoch": 2.246895869403908, + "grad_norm": 0.23898935280822234, + "learning_rate": 2.9362839851947596e-07, + "loss": 0.5384, + "step": 2843 + }, + { + "epoch": 2.2476873608706405, + "grad_norm": 0.2413763631419046, + "learning_rate": 2.9304019369415525e-07, + "loss": 0.5437, + "step": 2844 + }, + { + "epoch": 2.248478852337373, + "grad_norm": 0.23564479828315718, + "learning_rate": 2.9245247745443105e-07, + "loss": 0.5321, + "step": 2845 + }, + { + "epoch": 2.2492703438041057, + "grad_norm": 0.23321194155400574, + "learning_rate": 2.918652502064785e-07, + "loss": 0.539, + "step": 2846 + }, + { + "epoch": 2.2500618352708384, + "grad_norm": 0.2254629116591103, + "learning_rate": 2.9127851235613453e-07, + "loss": 0.5282, + "step": 2847 + }, + { + "epoch": 2.250853326737571, + "grad_norm": 0.23478463008165587, + "learning_rate": 2.906922643088993e-07, + "loss": 0.5645, + "step": 2848 + }, + { + "epoch": 2.2516448182043036, + "grad_norm": 0.23452103089076926, + "learning_rate": 2.9010650646993295e-07, + "loss": 0.5372, + "step": 2849 + }, + { + "epoch": 2.2524363096710363, + "grad_norm": 0.22881393447554058, + "learning_rate": 2.895212392440577e-07, + "loss": 0.5331, + "step": 2850 + }, + { + "epoch": 2.253227801137769, + "grad_norm": 0.24671940165475068, + "learning_rate": 2.8893646303575625e-07, + "loss": 0.5399, + "step": 2851 + }, + { + "epoch": 2.2540192926045015, + "grad_norm": 0.244503937172185, + "learning_rate": 2.883521782491718e-07, + "loss": 0.5384, + "step": 2852 + }, + { + "epoch": 2.254810784071234, + "grad_norm": 0.24163619148587315, + "learning_rate": 2.87768385288109e-07, + "loss": 0.5646, + "step": 2853 + }, + { + "epoch": 2.2556022755379668, + "grad_norm": 0.2411564234197795, + "learning_rate": 2.871850845560314e-07, + "loss": 0.5508, + "step": 2854 + }, + { + "epoch": 2.2563937670046994, + "grad_norm": 0.23508813064354636, + "learning_rate": 2.8660227645606296e-07, + "loss": 0.549, + "step": 2855 + }, + { + "epoch": 2.257185258471432, + "grad_norm": 0.23477381984369078, + "learning_rate": 2.8601996139098695e-07, + "loss": 0.5426, + "step": 2856 + }, + { + "epoch": 2.2579767499381647, + "grad_norm": 0.2370525453618061, + "learning_rate": 2.854381397632457e-07, + "loss": 0.5391, + "step": 2857 + }, + { + "epoch": 2.2587682414048973, + "grad_norm": 0.2390607000164692, + "learning_rate": 2.848568119749415e-07, + "loss": 0.5426, + "step": 2858 + }, + { + "epoch": 2.25955973287163, + "grad_norm": 0.23094266808799557, + "learning_rate": 2.8427597842783414e-07, + "loss": 0.5394, + "step": 2859 + }, + { + "epoch": 2.2603512243383626, + "grad_norm": 0.24087702331429744, + "learning_rate": 2.836956395233425e-07, + "loss": 0.5395, + "step": 2860 + }, + { + "epoch": 2.261142715805095, + "grad_norm": 0.23751207190954038, + "learning_rate": 2.831157956625434e-07, + "loss": 0.5483, + "step": 2861 + }, + { + "epoch": 2.261934207271828, + "grad_norm": 0.247604081371722, + "learning_rate": 2.825364472461714e-07, + "loss": 0.5416, + "step": 2862 + }, + { + "epoch": 2.2627256987385604, + "grad_norm": 0.23789480125692128, + "learning_rate": 2.819575946746192e-07, + "loss": 0.5409, + "step": 2863 + }, + { + "epoch": 2.263517190205293, + "grad_norm": 0.2349177918274698, + "learning_rate": 2.813792383479364e-07, + "loss": 0.547, + "step": 2864 + }, + { + "epoch": 2.2643086816720257, + "grad_norm": 0.24211911719136156, + "learning_rate": 2.8080137866582967e-07, + "loss": 0.5437, + "step": 2865 + }, + { + "epoch": 2.2651001731387583, + "grad_norm": 0.2313582053135601, + "learning_rate": 2.802240160276623e-07, + "loss": 0.5324, + "step": 2866 + }, + { + "epoch": 2.265891664605491, + "grad_norm": 0.23156413476438797, + "learning_rate": 2.7964715083245406e-07, + "loss": 0.5443, + "step": 2867 + }, + { + "epoch": 2.2666831560722236, + "grad_norm": 0.23733214690280127, + "learning_rate": 2.790707834788818e-07, + "loss": 0.5418, + "step": 2868 + }, + { + "epoch": 2.2674746475389562, + "grad_norm": 0.23342416904972457, + "learning_rate": 2.784949143652772e-07, + "loss": 0.5376, + "step": 2869 + }, + { + "epoch": 2.268266139005689, + "grad_norm": 0.24012437478052984, + "learning_rate": 2.7791954388962804e-07, + "loss": 0.5341, + "step": 2870 + }, + { + "epoch": 2.2690576304724215, + "grad_norm": 0.2416102948062892, + "learning_rate": 2.773446724495774e-07, + "loss": 0.5421, + "step": 2871 + }, + { + "epoch": 2.269849121939154, + "grad_norm": 0.24175842485162605, + "learning_rate": 2.7677030044242345e-07, + "loss": 0.5449, + "step": 2872 + }, + { + "epoch": 2.2706406134058867, + "grad_norm": 0.23826793457919446, + "learning_rate": 2.761964282651197e-07, + "loss": 0.5625, + "step": 2873 + }, + { + "epoch": 2.2714321048726194, + "grad_norm": 0.24175799585516589, + "learning_rate": 2.7562305631427373e-07, + "loss": 0.5524, + "step": 2874 + }, + { + "epoch": 2.272223596339352, + "grad_norm": 0.23639796460897217, + "learning_rate": 2.750501849861473e-07, + "loss": 0.542, + "step": 2875 + }, + { + "epoch": 2.2730150878060846, + "grad_norm": 0.23684692153515702, + "learning_rate": 2.744778146766564e-07, + "loss": 0.5319, + "step": 2876 + }, + { + "epoch": 2.2738065792728173, + "grad_norm": 0.2353212441198773, + "learning_rate": 2.739059457813705e-07, + "loss": 0.5374, + "step": 2877 + }, + { + "epoch": 2.27459807073955, + "grad_norm": 0.2419176199222997, + "learning_rate": 2.733345786955135e-07, + "loss": 0.5427, + "step": 2878 + }, + { + "epoch": 2.2753895622062825, + "grad_norm": 0.24536013058010053, + "learning_rate": 2.727637138139612e-07, + "loss": 0.5374, + "step": 2879 + }, + { + "epoch": 2.276181053673015, + "grad_norm": 0.23024852786858077, + "learning_rate": 2.721933515312431e-07, + "loss": 0.5296, + "step": 2880 + }, + { + "epoch": 2.276972545139748, + "grad_norm": 0.23470183671325456, + "learning_rate": 2.7162349224154114e-07, + "loss": 0.5441, + "step": 2881 + }, + { + "epoch": 2.2777640366064804, + "grad_norm": 0.2322393667970669, + "learning_rate": 2.7105413633868936e-07, + "loss": 0.5427, + "step": 2882 + }, + { + "epoch": 2.278555528073213, + "grad_norm": 0.24380547084349297, + "learning_rate": 2.704852842161747e-07, + "loss": 0.5531, + "step": 2883 + }, + { + "epoch": 2.2793470195399457, + "grad_norm": 0.23162637501137084, + "learning_rate": 2.699169362671352e-07, + "loss": 0.5505, + "step": 2884 + }, + { + "epoch": 2.2801385110066783, + "grad_norm": 0.23554080571533584, + "learning_rate": 2.693490928843607e-07, + "loss": 0.5411, + "step": 2885 + }, + { + "epoch": 2.280930002473411, + "grad_norm": 0.24389813206815725, + "learning_rate": 2.6878175446029217e-07, + "loss": 0.5522, + "step": 2886 + }, + { + "epoch": 2.2817214939401436, + "grad_norm": 0.23710487701566618, + "learning_rate": 2.6821492138702164e-07, + "loss": 0.5404, + "step": 2887 + }, + { + "epoch": 2.282512985406876, + "grad_norm": 0.23897379699795512, + "learning_rate": 2.6764859405629257e-07, + "loss": 0.5405, + "step": 2888 + }, + { + "epoch": 2.283304476873609, + "grad_norm": 0.23482860235947972, + "learning_rate": 2.6708277285949786e-07, + "loss": 0.5499, + "step": 2889 + }, + { + "epoch": 2.2840959683403415, + "grad_norm": 0.24132942821579026, + "learning_rate": 2.6651745818768103e-07, + "loss": 0.5501, + "step": 2890 + }, + { + "epoch": 2.284887459807074, + "grad_norm": 0.2352002575063765, + "learning_rate": 2.6595265043153623e-07, + "loss": 0.5451, + "step": 2891 + }, + { + "epoch": 2.2856789512738067, + "grad_norm": 0.24083178361958704, + "learning_rate": 2.653883499814057e-07, + "loss": 0.5477, + "step": 2892 + }, + { + "epoch": 2.2864704427405393, + "grad_norm": 0.23212187081542618, + "learning_rate": 2.648245572272827e-07, + "loss": 0.5447, + "step": 2893 + }, + { + "epoch": 2.287261934207272, + "grad_norm": 0.2337076125479155, + "learning_rate": 2.642612725588087e-07, + "loss": 0.5304, + "step": 2894 + }, + { + "epoch": 2.2880534256740046, + "grad_norm": 0.2413573851502974, + "learning_rate": 2.6369849636527386e-07, + "loss": 0.5313, + "step": 2895 + }, + { + "epoch": 2.2888449171407372, + "grad_norm": 0.2413442351417876, + "learning_rate": 2.6313622903561836e-07, + "loss": 0.5392, + "step": 2896 + }, + { + "epoch": 2.28963640860747, + "grad_norm": 0.23452248538934567, + "learning_rate": 2.6257447095842854e-07, + "loss": 0.5429, + "step": 2897 + }, + { + "epoch": 2.2904279000742025, + "grad_norm": 0.23223040055805008, + "learning_rate": 2.620132225219406e-07, + "loss": 0.5476, + "step": 2898 + }, + { + "epoch": 2.291219391540935, + "grad_norm": 0.2329488894717753, + "learning_rate": 2.6145248411403765e-07, + "loss": 0.5429, + "step": 2899 + }, + { + "epoch": 2.2920108830076678, + "grad_norm": 0.23360015005402276, + "learning_rate": 2.608922561222502e-07, + "loss": 0.5373, + "step": 2900 + }, + { + "epoch": 2.2928023744744004, + "grad_norm": 0.2362549233317612, + "learning_rate": 2.6033253893375715e-07, + "loss": 0.5352, + "step": 2901 + }, + { + "epoch": 2.293593865941133, + "grad_norm": 0.24488868241053144, + "learning_rate": 2.597733329353824e-07, + "loss": 0.5322, + "step": 2902 + }, + { + "epoch": 2.294385357407865, + "grad_norm": 0.24004854502643702, + "learning_rate": 2.592146385135986e-07, + "loss": 0.5383, + "step": 2903 + }, + { + "epoch": 2.295176848874598, + "grad_norm": 0.23926218240031688, + "learning_rate": 2.586564560545237e-07, + "loss": 0.5496, + "step": 2904 + }, + { + "epoch": 2.2959683403413305, + "grad_norm": 0.2319858869114076, + "learning_rate": 2.5809878594392166e-07, + "loss": 0.5455, + "step": 2905 + }, + { + "epoch": 2.296759831808063, + "grad_norm": 0.23041785337081805, + "learning_rate": 2.5754162856720375e-07, + "loss": 0.5387, + "step": 2906 + }, + { + "epoch": 2.2975513232747957, + "grad_norm": 0.23762253769348288, + "learning_rate": 2.569849843094246e-07, + "loss": 0.5543, + "step": 2907 + }, + { + "epoch": 2.2983428147415284, + "grad_norm": 0.2337980834477202, + "learning_rate": 2.5642885355528663e-07, + "loss": 0.5443, + "step": 2908 + }, + { + "epoch": 2.299134306208261, + "grad_norm": 0.23676957670959684, + "learning_rate": 2.5587323668913575e-07, + "loss": 0.536, + "step": 2909 + }, + { + "epoch": 2.2999257976749936, + "grad_norm": 0.23441760003192555, + "learning_rate": 2.553181340949632e-07, + "loss": 0.5497, + "step": 2910 + }, + { + "epoch": 2.3007172891417262, + "grad_norm": 0.23508857955133236, + "learning_rate": 2.5476354615640504e-07, + "loss": 0.5435, + "step": 2911 + }, + { + "epoch": 2.301508780608459, + "grad_norm": 0.2422690010031859, + "learning_rate": 2.5420947325674145e-07, + "loss": 0.544, + "step": 2912 + }, + { + "epoch": 2.3023002720751915, + "grad_norm": 0.23926052851832066, + "learning_rate": 2.536559157788967e-07, + "loss": 0.5512, + "step": 2913 + }, + { + "epoch": 2.303091763541924, + "grad_norm": 0.23921418589979995, + "learning_rate": 2.531028741054386e-07, + "loss": 0.5562, + "step": 2914 + }, + { + "epoch": 2.3038832550086568, + "grad_norm": 0.23848422835146474, + "learning_rate": 2.525503486185785e-07, + "loss": 0.5439, + "step": 2915 + }, + { + "epoch": 2.3046747464753894, + "grad_norm": 0.23711473131955782, + "learning_rate": 2.519983397001717e-07, + "loss": 0.5576, + "step": 2916 + }, + { + "epoch": 2.305466237942122, + "grad_norm": 0.23309687717908328, + "learning_rate": 2.5144684773171623e-07, + "loss": 0.5317, + "step": 2917 + }, + { + "epoch": 2.3062577294088547, + "grad_norm": 0.23776376999681284, + "learning_rate": 2.5089587309435144e-07, + "loss": 0.5415, + "step": 2918 + }, + { + "epoch": 2.3070492208755873, + "grad_norm": 0.2379389945672291, + "learning_rate": 2.503454161688614e-07, + "loss": 0.5323, + "step": 2919 + }, + { + "epoch": 2.30784071234232, + "grad_norm": 0.23274745607177677, + "learning_rate": 2.497954773356705e-07, + "loss": 0.5417, + "step": 2920 + }, + { + "epoch": 2.3086322038090525, + "grad_norm": 0.23283432148544148, + "learning_rate": 2.492460569748468e-07, + "loss": 0.5382, + "step": 2921 + }, + { + "epoch": 2.309423695275785, + "grad_norm": 0.23672261901751507, + "learning_rate": 2.4869715546609894e-07, + "loss": 0.5272, + "step": 2922 + }, + { + "epoch": 2.310215186742518, + "grad_norm": 0.24404734310473322, + "learning_rate": 2.481487731887761e-07, + "loss": 0.5353, + "step": 2923 + }, + { + "epoch": 2.3110066782092504, + "grad_norm": 0.23489632952562617, + "learning_rate": 2.476009105218709e-07, + "loss": 0.5453, + "step": 2924 + }, + { + "epoch": 2.311798169675983, + "grad_norm": 0.23290588005746646, + "learning_rate": 2.4705356784401486e-07, + "loss": 0.5379, + "step": 2925 + }, + { + "epoch": 2.3125896611427157, + "grad_norm": 0.23709934150914688, + "learning_rate": 2.465067455334815e-07, + "loss": 0.5374, + "step": 2926 + }, + { + "epoch": 2.3133811526094483, + "grad_norm": 0.24157094934977066, + "learning_rate": 2.4596044396818415e-07, + "loss": 0.5346, + "step": 2927 + }, + { + "epoch": 2.314172644076181, + "grad_norm": 0.23607873462862783, + "learning_rate": 2.454146635256752e-07, + "loss": 0.527, + "step": 2928 + }, + { + "epoch": 2.3149641355429136, + "grad_norm": 0.2327484448089389, + "learning_rate": 2.448694045831489e-07, + "loss": 0.5506, + "step": 2929 + }, + { + "epoch": 2.315755627009646, + "grad_norm": 0.2292591355480856, + "learning_rate": 2.443246675174374e-07, + "loss": 0.5402, + "step": 2930 + }, + { + "epoch": 2.316547118476379, + "grad_norm": 0.23424073187304081, + "learning_rate": 2.4378045270501356e-07, + "loss": 0.5206, + "step": 2931 + }, + { + "epoch": 2.3173386099431115, + "grad_norm": 0.23523815302769308, + "learning_rate": 2.432367605219883e-07, + "loss": 0.5415, + "step": 2932 + }, + { + "epoch": 2.318130101409844, + "grad_norm": 0.24322651728949735, + "learning_rate": 2.426935913441115e-07, + "loss": 0.5556, + "step": 2933 + }, + { + "epoch": 2.3189215928765767, + "grad_norm": 0.2308249704959735, + "learning_rate": 2.4215094554677186e-07, + "loss": 0.5383, + "step": 2934 + }, + { + "epoch": 2.3197130843433094, + "grad_norm": 0.23709984308039514, + "learning_rate": 2.4160882350499625e-07, + "loss": 0.5517, + "step": 2935 + }, + { + "epoch": 2.320504575810042, + "grad_norm": 0.23740547802018616, + "learning_rate": 2.4106722559344914e-07, + "loss": 0.54, + "step": 2936 + }, + { + "epoch": 2.3212960672767746, + "grad_norm": 0.25173178577131733, + "learning_rate": 2.4052615218643403e-07, + "loss": 0.5498, + "step": 2937 + }, + { + "epoch": 2.3220875587435073, + "grad_norm": 0.24070201028750376, + "learning_rate": 2.3998560365789055e-07, + "loss": 0.5484, + "step": 2938 + }, + { + "epoch": 2.32287905021024, + "grad_norm": 0.23765755990070234, + "learning_rate": 2.394455803813963e-07, + "loss": 0.5491, + "step": 2939 + }, + { + "epoch": 2.3236705416769725, + "grad_norm": 0.2389945952429259, + "learning_rate": 2.389060827301657e-07, + "loss": 0.539, + "step": 2940 + }, + { + "epoch": 2.324462033143705, + "grad_norm": 0.23250324232305528, + "learning_rate": 2.3836711107704944e-07, + "loss": 0.5471, + "step": 2941 + }, + { + "epoch": 2.3252535246104378, + "grad_norm": 0.23999911548685604, + "learning_rate": 2.3782866579453586e-07, + "loss": 0.5543, + "step": 2942 + }, + { + "epoch": 2.3260450160771704, + "grad_norm": 0.23675045453576574, + "learning_rate": 2.372907472547485e-07, + "loss": 0.5576, + "step": 2943 + }, + { + "epoch": 2.326836507543903, + "grad_norm": 0.24175296270765648, + "learning_rate": 2.367533558294472e-07, + "loss": 0.5439, + "step": 2944 + }, + { + "epoch": 2.3276279990106357, + "grad_norm": 0.23572156866038801, + "learning_rate": 2.3621649189002745e-07, + "loss": 0.5458, + "step": 2945 + }, + { + "epoch": 2.3284194904773683, + "grad_norm": 0.2400512235425696, + "learning_rate": 2.356801558075201e-07, + "loss": 0.5466, + "step": 2946 + }, + { + "epoch": 2.329210981944101, + "grad_norm": 0.23140180592120257, + "learning_rate": 2.3514434795259164e-07, + "loss": 0.5476, + "step": 2947 + }, + { + "epoch": 2.3300024734108336, + "grad_norm": 0.23740514054863193, + "learning_rate": 2.3460906869554308e-07, + "loss": 0.5385, + "step": 2948 + }, + { + "epoch": 2.330793964877566, + "grad_norm": 0.2353411242490673, + "learning_rate": 2.3407431840631008e-07, + "loss": 0.5297, + "step": 2949 + }, + { + "epoch": 2.331585456344299, + "grad_norm": 0.2361315966652804, + "learning_rate": 2.3354009745446303e-07, + "loss": 0.5393, + "step": 2950 + }, + { + "epoch": 2.3323769478110314, + "grad_norm": 0.23801386911590616, + "learning_rate": 2.3300640620920587e-07, + "loss": 0.538, + "step": 2951 + }, + { + "epoch": 2.333168439277764, + "grad_norm": 0.2378470424093242, + "learning_rate": 2.3247324503937748e-07, + "loss": 0.5377, + "step": 2952 + }, + { + "epoch": 2.3339599307444967, + "grad_norm": 0.23316348768934472, + "learning_rate": 2.3194061431344968e-07, + "loss": 0.5225, + "step": 2953 + }, + { + "epoch": 2.3347514222112293, + "grad_norm": 0.24397046916608647, + "learning_rate": 2.314085143995277e-07, + "loss": 0.5639, + "step": 2954 + }, + { + "epoch": 2.335542913677962, + "grad_norm": 0.23741322935364065, + "learning_rate": 2.308769456653501e-07, + "loss": 0.5544, + "step": 2955 + }, + { + "epoch": 2.3363344051446946, + "grad_norm": 0.23296794692410755, + "learning_rate": 2.3034590847828806e-07, + "loss": 0.5369, + "step": 2956 + }, + { + "epoch": 2.3371258966114272, + "grad_norm": 0.24641419323860192, + "learning_rate": 2.298154032053461e-07, + "loss": 0.5392, + "step": 2957 + }, + { + "epoch": 2.33791738807816, + "grad_norm": 0.2341220817496533, + "learning_rate": 2.292854302131606e-07, + "loss": 0.5255, + "step": 2958 + }, + { + "epoch": 2.3387088795448925, + "grad_norm": 0.2417259848899059, + "learning_rate": 2.2875598986799992e-07, + "loss": 0.5533, + "step": 2959 + }, + { + "epoch": 2.339500371011625, + "grad_norm": 0.2384720082016968, + "learning_rate": 2.2822708253576462e-07, + "loss": 0.5403, + "step": 2960 + }, + { + "epoch": 2.3402918624783577, + "grad_norm": 0.24021821063661916, + "learning_rate": 2.2769870858198647e-07, + "loss": 0.5351, + "step": 2961 + }, + { + "epoch": 2.3410833539450904, + "grad_norm": 0.23341851615304104, + "learning_rate": 2.2717086837182964e-07, + "loss": 0.5465, + "step": 2962 + }, + { + "epoch": 2.341874845411823, + "grad_norm": 0.2282609483014954, + "learning_rate": 2.266435622700884e-07, + "loss": 0.5386, + "step": 2963 + }, + { + "epoch": 2.3426663368785556, + "grad_norm": 0.2381507009889597, + "learning_rate": 2.2611679064118816e-07, + "loss": 0.5413, + "step": 2964 + }, + { + "epoch": 2.3434578283452883, + "grad_norm": 0.23369932358400436, + "learning_rate": 2.2559055384918512e-07, + "loss": 0.5423, + "step": 2965 + }, + { + "epoch": 2.344249319812021, + "grad_norm": 0.2372626435269223, + "learning_rate": 2.250648522577654e-07, + "loss": 0.5346, + "step": 2966 + }, + { + "epoch": 2.3450408112787535, + "grad_norm": 0.23575100203991795, + "learning_rate": 2.245396862302461e-07, + "loss": 0.553, + "step": 2967 + }, + { + "epoch": 2.345832302745486, + "grad_norm": 0.24193031770458479, + "learning_rate": 2.240150561295735e-07, + "loss": 0.541, + "step": 2968 + }, + { + "epoch": 2.346623794212219, + "grad_norm": 0.23871335460532514, + "learning_rate": 2.2349096231832366e-07, + "loss": 0.5381, + "step": 2969 + }, + { + "epoch": 2.3474152856789514, + "grad_norm": 0.23360995485358593, + "learning_rate": 2.2296740515870205e-07, + "loss": 0.5405, + "step": 2970 + }, + { + "epoch": 2.348206777145684, + "grad_norm": 0.24073556478269748, + "learning_rate": 2.224443850125428e-07, + "loss": 0.5475, + "step": 2971 + }, + { + "epoch": 2.3489982686124167, + "grad_norm": 0.2316980279828743, + "learning_rate": 2.2192190224131003e-07, + "loss": 0.5437, + "step": 2972 + }, + { + "epoch": 2.3497897600791493, + "grad_norm": 0.23504108629980844, + "learning_rate": 2.213999572060955e-07, + "loss": 0.5437, + "step": 2973 + }, + { + "epoch": 2.350581251545882, + "grad_norm": 0.23090311838451977, + "learning_rate": 2.208785502676195e-07, + "loss": 0.5346, + "step": 2974 + }, + { + "epoch": 2.3513727430126146, + "grad_norm": 0.23252338648942963, + "learning_rate": 2.203576817862306e-07, + "loss": 0.5399, + "step": 2975 + }, + { + "epoch": 2.352164234479347, + "grad_norm": 0.23479332311143924, + "learning_rate": 2.198373521219049e-07, + "loss": 0.5358, + "step": 2976 + }, + { + "epoch": 2.35295572594608, + "grad_norm": 0.23545280272525002, + "learning_rate": 2.1931756163424708e-07, + "loss": 0.5535, + "step": 2977 + }, + { + "epoch": 2.3537472174128125, + "grad_norm": 0.23562554623559326, + "learning_rate": 2.187983106824881e-07, + "loss": 0.5399, + "step": 2978 + }, + { + "epoch": 2.354538708879545, + "grad_norm": 0.2385058108290646, + "learning_rate": 2.1827959962548626e-07, + "loss": 0.5387, + "step": 2979 + }, + { + "epoch": 2.3553302003462777, + "grad_norm": 0.2444467679434618, + "learning_rate": 2.1776142882172766e-07, + "loss": 0.544, + "step": 2980 + }, + { + "epoch": 2.3561216918130103, + "grad_norm": 0.23492791468464386, + "learning_rate": 2.172437986293233e-07, + "loss": 0.5404, + "step": 2981 + }, + { + "epoch": 2.356913183279743, + "grad_norm": 0.23734604714372437, + "learning_rate": 2.1672670940601244e-07, + "loss": 0.5659, + "step": 2982 + }, + { + "epoch": 2.3577046747464756, + "grad_norm": 0.23559863206262585, + "learning_rate": 2.1621016150915916e-07, + "loss": 0.5253, + "step": 2983 + }, + { + "epoch": 2.3584961662132082, + "grad_norm": 0.23137078846940448, + "learning_rate": 2.1569415529575363e-07, + "loss": 0.5188, + "step": 2984 + }, + { + "epoch": 2.359287657679941, + "grad_norm": 0.23127965442800444, + "learning_rate": 2.1517869112241282e-07, + "loss": 0.5367, + "step": 2985 + }, + { + "epoch": 2.3600791491466735, + "grad_norm": 0.24431856585504763, + "learning_rate": 2.14663769345377e-07, + "loss": 0.5358, + "step": 2986 + }, + { + "epoch": 2.360870640613406, + "grad_norm": 0.23671112881562123, + "learning_rate": 2.1414939032051338e-07, + "loss": 0.5415, + "step": 2987 + }, + { + "epoch": 2.3616621320801388, + "grad_norm": 0.23467375875259117, + "learning_rate": 2.1363555440331349e-07, + "loss": 0.5583, + "step": 2988 + }, + { + "epoch": 2.3624536235468714, + "grad_norm": 0.23551544465173518, + "learning_rate": 2.1312226194889294e-07, + "loss": 0.5297, + "step": 2989 + }, + { + "epoch": 2.3632451150136036, + "grad_norm": 0.24252246719560705, + "learning_rate": 2.1260951331199318e-07, + "loss": 0.5438, + "step": 2990 + }, + { + "epoch": 2.364036606480336, + "grad_norm": 0.23527427658269348, + "learning_rate": 2.1209730884697773e-07, + "loss": 0.5585, + "step": 2991 + }, + { + "epoch": 2.364828097947069, + "grad_norm": 0.2369771155174878, + "learning_rate": 2.1158564890783614e-07, + "loss": 0.5461, + "step": 2992 + }, + { + "epoch": 2.3656195894138015, + "grad_norm": 0.22913161426046072, + "learning_rate": 2.1107453384818041e-07, + "loss": 0.5471, + "step": 2993 + }, + { + "epoch": 2.366411080880534, + "grad_norm": 0.241289509398457, + "learning_rate": 2.1056396402124598e-07, + "loss": 0.5555, + "step": 2994 + }, + { + "epoch": 2.3672025723472667, + "grad_norm": 0.23499416053446745, + "learning_rate": 2.100539397798925e-07, + "loss": 0.5315, + "step": 2995 + }, + { + "epoch": 2.3679940638139994, + "grad_norm": 0.23796863417235548, + "learning_rate": 2.0954446147660076e-07, + "loss": 0.5353, + "step": 2996 + }, + { + "epoch": 2.368785555280732, + "grad_norm": 0.23104230521797295, + "learning_rate": 2.0903552946347613e-07, + "loss": 0.5213, + "step": 2997 + }, + { + "epoch": 2.3695770467474646, + "grad_norm": 0.23531598896105188, + "learning_rate": 2.0852714409224537e-07, + "loss": 0.5365, + "step": 2998 + }, + { + "epoch": 2.3703685382141972, + "grad_norm": 0.23130820172264557, + "learning_rate": 2.080193057142574e-07, + "loss": 0.5447, + "step": 2999 + }, + { + "epoch": 2.37116002968093, + "grad_norm": 0.24572529630980294, + "learning_rate": 2.0751201468048386e-07, + "loss": 0.554, + "step": 3000 + }, + { + "epoch": 2.3719515211476625, + "grad_norm": 0.23708559641086727, + "learning_rate": 2.0700527134151757e-07, + "loss": 0.5413, + "step": 3001 + }, + { + "epoch": 2.372743012614395, + "grad_norm": 0.23449673402384064, + "learning_rate": 2.0649907604757267e-07, + "loss": 0.542, + "step": 3002 + }, + { + "epoch": 2.3735345040811278, + "grad_norm": 0.23661757635590114, + "learning_rate": 2.0599342914848507e-07, + "loss": 0.5368, + "step": 3003 + }, + { + "epoch": 2.3743259955478604, + "grad_norm": 0.23694048392448555, + "learning_rate": 2.054883309937109e-07, + "loss": 0.5395, + "step": 3004 + }, + { + "epoch": 2.375117487014593, + "grad_norm": 0.24350189562956667, + "learning_rate": 2.0498378193232824e-07, + "loss": 0.5491, + "step": 3005 + }, + { + "epoch": 2.3759089784813257, + "grad_norm": 0.24335964799360535, + "learning_rate": 2.0447978231303465e-07, + "loss": 0.5536, + "step": 3006 + }, + { + "epoch": 2.3767004699480583, + "grad_norm": 0.23622368180971987, + "learning_rate": 2.0397633248414824e-07, + "loss": 0.5394, + "step": 3007 + }, + { + "epoch": 2.377491961414791, + "grad_norm": 0.23085108037031152, + "learning_rate": 2.034734327936074e-07, + "loss": 0.5408, + "step": 3008 + }, + { + "epoch": 2.3782834528815235, + "grad_norm": 0.2338776831000615, + "learning_rate": 2.0297108358896976e-07, + "loss": 0.5464, + "step": 3009 + }, + { + "epoch": 2.379074944348256, + "grad_norm": 0.23398572930753478, + "learning_rate": 2.0246928521741336e-07, + "loss": 0.5311, + "step": 3010 + }, + { + "epoch": 2.379866435814989, + "grad_norm": 0.23550836865287855, + "learning_rate": 2.0196803802573504e-07, + "loss": 0.5302, + "step": 3011 + }, + { + "epoch": 2.3806579272817214, + "grad_norm": 0.2401035646778562, + "learning_rate": 2.0146734236035056e-07, + "loss": 0.5411, + "step": 3012 + }, + { + "epoch": 2.381449418748454, + "grad_norm": 0.2428198648386074, + "learning_rate": 2.009671985672947e-07, + "loss": 0.5578, + "step": 3013 + }, + { + "epoch": 2.3822409102151867, + "grad_norm": 0.23172459856296815, + "learning_rate": 2.0046760699222077e-07, + "loss": 0.5395, + "step": 3014 + }, + { + "epoch": 2.3830324016819193, + "grad_norm": 0.23267258196412796, + "learning_rate": 1.9996856798040096e-07, + "loss": 0.5285, + "step": 3015 + }, + { + "epoch": 2.383823893148652, + "grad_norm": 0.23379681011650655, + "learning_rate": 1.9947008187672476e-07, + "loss": 0.5385, + "step": 3016 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 0.23882978808989294, + "learning_rate": 1.9897214902570004e-07, + "loss": 0.5454, + "step": 3017 + }, + { + "epoch": 2.385406876082117, + "grad_norm": 0.23279583356351305, + "learning_rate": 1.9847476977145206e-07, + "loss": 0.5553, + "step": 3018 + }, + { + "epoch": 2.38619836754885, + "grad_norm": 0.2358515290040839, + "learning_rate": 1.9797794445772354e-07, + "loss": 0.5465, + "step": 3019 + }, + { + "epoch": 2.3869898590155825, + "grad_norm": 0.23214625087536142, + "learning_rate": 1.974816734278748e-07, + "loss": 0.5341, + "step": 3020 + }, + { + "epoch": 2.387781350482315, + "grad_norm": 0.23957482651366258, + "learning_rate": 1.9698595702488264e-07, + "loss": 0.5382, + "step": 3021 + }, + { + "epoch": 2.3885728419490477, + "grad_norm": 0.2375334170362343, + "learning_rate": 1.9649079559134052e-07, + "loss": 0.5431, + "step": 3022 + }, + { + "epoch": 2.3893643334157804, + "grad_norm": 0.23426344141299393, + "learning_rate": 1.959961894694585e-07, + "loss": 0.5465, + "step": 3023 + }, + { + "epoch": 2.390155824882513, + "grad_norm": 0.2427116950044707, + "learning_rate": 1.955021390010625e-07, + "loss": 0.5422, + "step": 3024 + }, + { + "epoch": 2.3909473163492456, + "grad_norm": 0.23268379552551366, + "learning_rate": 1.9500864452759546e-07, + "loss": 0.5367, + "step": 3025 + }, + { + "epoch": 2.3917388078159783, + "grad_norm": 0.2396047627331565, + "learning_rate": 1.9451570639011506e-07, + "loss": 0.5332, + "step": 3026 + }, + { + "epoch": 2.392530299282711, + "grad_norm": 0.23972633958727738, + "learning_rate": 1.9402332492929462e-07, + "loss": 0.5378, + "step": 3027 + }, + { + "epoch": 2.3933217907494435, + "grad_norm": 0.24085801974620788, + "learning_rate": 1.9353150048542323e-07, + "loss": 0.5467, + "step": 3028 + }, + { + "epoch": 2.394113282216176, + "grad_norm": 0.23765836270601126, + "learning_rate": 1.9304023339840413e-07, + "loss": 0.5373, + "step": 3029 + }, + { + "epoch": 2.3949047736829088, + "grad_norm": 0.23759056797531627, + "learning_rate": 1.925495240077567e-07, + "loss": 0.5429, + "step": 3030 + }, + { + "epoch": 2.3956962651496414, + "grad_norm": 0.24133582058090594, + "learning_rate": 1.9205937265261386e-07, + "loss": 0.5277, + "step": 3031 + }, + { + "epoch": 2.396487756616374, + "grad_norm": 0.23810254393805264, + "learning_rate": 1.9156977967172305e-07, + "loss": 0.5452, + "step": 3032 + }, + { + "epoch": 2.3972792480831067, + "grad_norm": 0.23935644607567932, + "learning_rate": 1.9108074540344589e-07, + "loss": 0.5533, + "step": 3033 + }, + { + "epoch": 2.3980707395498393, + "grad_norm": 0.23402378461251894, + "learning_rate": 1.9059227018575773e-07, + "loss": 0.536, + "step": 3034 + }, + { + "epoch": 2.398862231016572, + "grad_norm": 0.234941047429903, + "learning_rate": 1.9010435435624817e-07, + "loss": 0.5312, + "step": 3035 + }, + { + "epoch": 2.3996537224833046, + "grad_norm": 0.23274382332399768, + "learning_rate": 1.896169982521194e-07, + "loss": 0.543, + "step": 3036 + }, + { + "epoch": 2.400445213950037, + "grad_norm": 0.2346041612978699, + "learning_rate": 1.8913020221018738e-07, + "loss": 0.5319, + "step": 3037 + }, + { + "epoch": 2.40123670541677, + "grad_norm": 0.23119817591818537, + "learning_rate": 1.8864396656688065e-07, + "loss": 0.5347, + "step": 3038 + }, + { + "epoch": 2.4020281968835024, + "grad_norm": 0.23559666560723652, + "learning_rate": 1.8815829165824027e-07, + "loss": 0.557, + "step": 3039 + }, + { + "epoch": 2.402819688350235, + "grad_norm": 0.23902309595885962, + "learning_rate": 1.8767317781992053e-07, + "loss": 0.5455, + "step": 3040 + }, + { + "epoch": 2.4036111798169677, + "grad_norm": 0.23874849537561182, + "learning_rate": 1.8718862538718727e-07, + "loss": 0.561, + "step": 3041 + }, + { + "epoch": 2.4044026712837003, + "grad_norm": 0.23487207124785586, + "learning_rate": 1.8670463469491848e-07, + "loss": 0.5493, + "step": 3042 + }, + { + "epoch": 2.405194162750433, + "grad_norm": 0.2348907171177612, + "learning_rate": 1.8622120607760462e-07, + "loss": 0.5445, + "step": 3043 + }, + { + "epoch": 2.4059856542171656, + "grad_norm": 0.23511639513535781, + "learning_rate": 1.857383398693463e-07, + "loss": 0.5361, + "step": 3044 + }, + { + "epoch": 2.4067771456838982, + "grad_norm": 0.23172284220984307, + "learning_rate": 1.8525603640385623e-07, + "loss": 0.5398, + "step": 3045 + }, + { + "epoch": 2.407568637150631, + "grad_norm": 0.24145893268253132, + "learning_rate": 1.8477429601445883e-07, + "loss": 0.555, + "step": 3046 + }, + { + "epoch": 2.4083601286173635, + "grad_norm": 0.23923201446021647, + "learning_rate": 1.8429311903408818e-07, + "loss": 0.5425, + "step": 3047 + }, + { + "epoch": 2.409151620084096, + "grad_norm": 0.23142822151905634, + "learning_rate": 1.8381250579529028e-07, + "loss": 0.5246, + "step": 3048 + }, + { + "epoch": 2.4099431115508287, + "grad_norm": 0.23318458482271598, + "learning_rate": 1.8333245663022012e-07, + "loss": 0.5334, + "step": 3049 + }, + { + "epoch": 2.4107346030175614, + "grad_norm": 0.234765056105339, + "learning_rate": 1.828529718706434e-07, + "loss": 0.5484, + "step": 3050 + }, + { + "epoch": 2.411526094484294, + "grad_norm": 0.24291943608056135, + "learning_rate": 1.8237405184793653e-07, + "loss": 0.5258, + "step": 3051 + }, + { + "epoch": 2.4123175859510266, + "grad_norm": 0.23763549523903982, + "learning_rate": 1.8189569689308447e-07, + "loss": 0.5503, + "step": 3052 + }, + { + "epoch": 2.4131090774177593, + "grad_norm": 0.2345372918505549, + "learning_rate": 1.8141790733668284e-07, + "loss": 0.5476, + "step": 3053 + }, + { + "epoch": 2.413900568884492, + "grad_norm": 0.2328438476755046, + "learning_rate": 1.8094068350893498e-07, + "loss": 0.5449, + "step": 3054 + }, + { + "epoch": 2.4146920603512245, + "grad_norm": 0.2397623560412376, + "learning_rate": 1.8046402573965435e-07, + "loss": 0.5486, + "step": 3055 + }, + { + "epoch": 2.415483551817957, + "grad_norm": 0.2350165835587842, + "learning_rate": 1.7998793435826331e-07, + "loss": 0.5375, + "step": 3056 + }, + { + "epoch": 2.4162750432846893, + "grad_norm": 0.2271690396423099, + "learning_rate": 1.79512409693792e-07, + "loss": 0.5225, + "step": 3057 + }, + { + "epoch": 2.417066534751422, + "grad_norm": 0.2363044776651791, + "learning_rate": 1.7903745207488008e-07, + "loss": 0.5519, + "step": 3058 + }, + { + "epoch": 2.4178580262181546, + "grad_norm": 0.2389641908921406, + "learning_rate": 1.7856306182977377e-07, + "loss": 0.5484, + "step": 3059 + }, + { + "epoch": 2.4186495176848872, + "grad_norm": 0.23733756879821363, + "learning_rate": 1.7808923928632802e-07, + "loss": 0.5407, + "step": 3060 + }, + { + "epoch": 2.41944100915162, + "grad_norm": 0.2367910945018087, + "learning_rate": 1.776159847720059e-07, + "loss": 0.5529, + "step": 3061 + }, + { + "epoch": 2.4202325006183525, + "grad_norm": 0.2404032568053986, + "learning_rate": 1.771432986138771e-07, + "loss": 0.5474, + "step": 3062 + }, + { + "epoch": 2.421023992085085, + "grad_norm": 0.2361832628890172, + "learning_rate": 1.7667118113861856e-07, + "loss": 0.5515, + "step": 3063 + }, + { + "epoch": 2.4218154835518177, + "grad_norm": 0.23336322102034698, + "learning_rate": 1.7619963267251535e-07, + "loss": 0.5362, + "step": 3064 + }, + { + "epoch": 2.4226069750185504, + "grad_norm": 0.23227217709511253, + "learning_rate": 1.7572865354145727e-07, + "loss": 0.5285, + "step": 3065 + }, + { + "epoch": 2.423398466485283, + "grad_norm": 0.23915673343967842, + "learning_rate": 1.7525824407094248e-07, + "loss": 0.5461, + "step": 3066 + }, + { + "epoch": 2.4241899579520156, + "grad_norm": 0.22756092226139843, + "learning_rate": 1.747884045860746e-07, + "loss": 0.5242, + "step": 3067 + }, + { + "epoch": 2.4249814494187483, + "grad_norm": 0.23655072304336305, + "learning_rate": 1.7431913541156317e-07, + "loss": 0.5375, + "step": 3068 + }, + { + "epoch": 2.425772940885481, + "grad_norm": 0.2408872647317191, + "learning_rate": 1.7385043687172462e-07, + "loss": 0.5299, + "step": 3069 + }, + { + "epoch": 2.4265644323522135, + "grad_norm": 0.2350455613550991, + "learning_rate": 1.7338230929047935e-07, + "loss": 0.5458, + "step": 3070 + }, + { + "epoch": 2.427355923818946, + "grad_norm": 0.23106468062471258, + "learning_rate": 1.7291475299135483e-07, + "loss": 0.5506, + "step": 3071 + }, + { + "epoch": 2.428147415285679, + "grad_norm": 0.2347809663190683, + "learning_rate": 1.7244776829748265e-07, + "loss": 0.5441, + "step": 3072 + }, + { + "epoch": 2.4289389067524114, + "grad_norm": 0.2359511824980337, + "learning_rate": 1.7198135553159941e-07, + "loss": 0.535, + "step": 3073 + }, + { + "epoch": 2.429730398219144, + "grad_norm": 0.23348670547692796, + "learning_rate": 1.7151551501604754e-07, + "loss": 0.5357, + "step": 3074 + }, + { + "epoch": 2.4305218896858767, + "grad_norm": 0.23205260864080945, + "learning_rate": 1.7105024707277216e-07, + "loss": 0.532, + "step": 3075 + }, + { + "epoch": 2.4313133811526093, + "grad_norm": 0.23874489046367486, + "learning_rate": 1.7058555202332435e-07, + "loss": 0.5517, + "step": 3076 + }, + { + "epoch": 2.432104872619342, + "grad_norm": 0.23587560295683307, + "learning_rate": 1.7012143018885837e-07, + "loss": 0.5446, + "step": 3077 + }, + { + "epoch": 2.4328963640860746, + "grad_norm": 0.23924043326356567, + "learning_rate": 1.696578818901324e-07, + "loss": 0.5496, + "step": 3078 + }, + { + "epoch": 2.433687855552807, + "grad_norm": 0.22826423171367932, + "learning_rate": 1.6919490744750898e-07, + "loss": 0.5417, + "step": 3079 + }, + { + "epoch": 2.43447934701954, + "grad_norm": 0.23315932588856364, + "learning_rate": 1.6873250718095257e-07, + "loss": 0.5374, + "step": 3080 + }, + { + "epoch": 2.4352708384862725, + "grad_norm": 0.24658934266044152, + "learning_rate": 1.6827068141003242e-07, + "loss": 0.5349, + "step": 3081 + }, + { + "epoch": 2.436062329953005, + "grad_norm": 0.24013862399555566, + "learning_rate": 1.678094304539198e-07, + "loss": 0.5452, + "step": 3082 + }, + { + "epoch": 2.4368538214197377, + "grad_norm": 0.23589336828081162, + "learning_rate": 1.6734875463138864e-07, + "loss": 0.5296, + "step": 3083 + }, + { + "epoch": 2.4376453128864704, + "grad_norm": 0.23847558779221928, + "learning_rate": 1.668886542608162e-07, + "loss": 0.5397, + "step": 3084 + }, + { + "epoch": 2.438436804353203, + "grad_norm": 0.2385144819390216, + "learning_rate": 1.664291296601814e-07, + "loss": 0.5465, + "step": 3085 + }, + { + "epoch": 2.4392282958199356, + "grad_norm": 0.2374882046971228, + "learning_rate": 1.659701811470653e-07, + "loss": 0.5437, + "step": 3086 + }, + { + "epoch": 2.4400197872866682, + "grad_norm": 0.23157400279180596, + "learning_rate": 1.655118090386509e-07, + "loss": 0.5132, + "step": 3087 + }, + { + "epoch": 2.440811278753401, + "grad_norm": 0.24130126154312947, + "learning_rate": 1.6505401365172267e-07, + "loss": 0.5638, + "step": 3088 + }, + { + "epoch": 2.4416027702201335, + "grad_norm": 0.23707093723590836, + "learning_rate": 1.6459679530266703e-07, + "loss": 0.5619, + "step": 3089 + }, + { + "epoch": 2.442394261686866, + "grad_norm": 0.23990481490120463, + "learning_rate": 1.6414015430747107e-07, + "loss": 0.5401, + "step": 3090 + }, + { + "epoch": 2.4431857531535988, + "grad_norm": 0.23655098262787114, + "learning_rate": 1.6368409098172297e-07, + "loss": 0.5426, + "step": 3091 + }, + { + "epoch": 2.4439772446203314, + "grad_norm": 0.23289934639423135, + "learning_rate": 1.6322860564061203e-07, + "loss": 0.541, + "step": 3092 + }, + { + "epoch": 2.444768736087064, + "grad_norm": 0.24182482596804827, + "learning_rate": 1.627736985989272e-07, + "loss": 0.5579, + "step": 3093 + }, + { + "epoch": 2.4455602275537967, + "grad_norm": 0.24072551065716197, + "learning_rate": 1.6231937017105923e-07, + "loss": 0.5397, + "step": 3094 + }, + { + "epoch": 2.4463517190205293, + "grad_norm": 0.24121211759958325, + "learning_rate": 1.6186562067099764e-07, + "loss": 0.5433, + "step": 3095 + }, + { + "epoch": 2.447143210487262, + "grad_norm": 0.23902272198133934, + "learning_rate": 1.6141245041233242e-07, + "loss": 0.5397, + "step": 3096 + }, + { + "epoch": 2.4479347019539945, + "grad_norm": 0.23477298657527262, + "learning_rate": 1.6095985970825321e-07, + "loss": 0.5447, + "step": 3097 + }, + { + "epoch": 2.448726193420727, + "grad_norm": 0.23495547741671985, + "learning_rate": 1.605078488715489e-07, + "loss": 0.5361, + "step": 3098 + }, + { + "epoch": 2.44951768488746, + "grad_norm": 0.23688597208873882, + "learning_rate": 1.6005641821460813e-07, + "loss": 0.5295, + "step": 3099 + }, + { + "epoch": 2.4503091763541924, + "grad_norm": 0.23306646738205733, + "learning_rate": 1.5960556804941815e-07, + "loss": 0.5443, + "step": 3100 + }, + { + "epoch": 2.451100667820925, + "grad_norm": 0.2333315996373385, + "learning_rate": 1.5915529868756507e-07, + "loss": 0.5453, + "step": 3101 + }, + { + "epoch": 2.4518921592876577, + "grad_norm": 0.23531650793015646, + "learning_rate": 1.5870561044023355e-07, + "loss": 0.5406, + "step": 3102 + }, + { + "epoch": 2.4526836507543903, + "grad_norm": 0.23207235034413456, + "learning_rate": 1.5825650361820674e-07, + "loss": 0.5603, + "step": 3103 + }, + { + "epoch": 2.453475142221123, + "grad_norm": 0.23593052853948004, + "learning_rate": 1.5780797853186623e-07, + "loss": 0.5483, + "step": 3104 + }, + { + "epoch": 2.4542666336878556, + "grad_norm": 0.23842629564774298, + "learning_rate": 1.5736003549119125e-07, + "loss": 0.5443, + "step": 3105 + }, + { + "epoch": 2.455058125154588, + "grad_norm": 0.22824167790126995, + "learning_rate": 1.5691267480575865e-07, + "loss": 0.5334, + "step": 3106 + }, + { + "epoch": 2.455849616621321, + "grad_norm": 0.23843678125334108, + "learning_rate": 1.564658967847432e-07, + "loss": 0.5324, + "step": 3107 + }, + { + "epoch": 2.4566411080880535, + "grad_norm": 0.23499125374752075, + "learning_rate": 1.5601970173691635e-07, + "loss": 0.5448, + "step": 3108 + }, + { + "epoch": 2.457432599554786, + "grad_norm": 0.22915750706814803, + "learning_rate": 1.5557408997064768e-07, + "loss": 0.5362, + "step": 3109 + }, + { + "epoch": 2.4582240910215187, + "grad_norm": 0.2353437999296168, + "learning_rate": 1.5512906179390272e-07, + "loss": 0.5346, + "step": 3110 + }, + { + "epoch": 2.4590155824882514, + "grad_norm": 0.23919039171583964, + "learning_rate": 1.54684617514244e-07, + "loss": 0.5435, + "step": 3111 + }, + { + "epoch": 2.459807073954984, + "grad_norm": 0.23043079808369574, + "learning_rate": 1.542407574388307e-07, + "loss": 0.5459, + "step": 3112 + }, + { + "epoch": 2.4605985654217166, + "grad_norm": 0.23705034936994757, + "learning_rate": 1.5379748187441766e-07, + "loss": 0.5497, + "step": 3113 + }, + { + "epoch": 2.4613900568884493, + "grad_norm": 0.23271866752152964, + "learning_rate": 1.5335479112735683e-07, + "loss": 0.5361, + "step": 3114 + }, + { + "epoch": 2.462181548355182, + "grad_norm": 0.24397729770321486, + "learning_rate": 1.5291268550359493e-07, + "loss": 0.5549, + "step": 3115 + }, + { + "epoch": 2.4629730398219145, + "grad_norm": 0.23664546071419895, + "learning_rate": 1.5247116530867477e-07, + "loss": 0.5461, + "step": 3116 + }, + { + "epoch": 2.463764531288647, + "grad_norm": 0.23449734137667236, + "learning_rate": 1.5203023084773448e-07, + "loss": 0.5532, + "step": 3117 + }, + { + "epoch": 2.4645560227553798, + "grad_norm": 0.22847149227790406, + "learning_rate": 1.5158988242550718e-07, + "loss": 0.5367, + "step": 3118 + }, + { + "epoch": 2.4653475142221124, + "grad_norm": 0.23977652202005234, + "learning_rate": 1.511501203463218e-07, + "loss": 0.5444, + "step": 3119 + }, + { + "epoch": 2.466139005688845, + "grad_norm": 0.237980361612846, + "learning_rate": 1.5071094491410098e-07, + "loss": 0.5435, + "step": 3120 + }, + { + "epoch": 2.4669304971555777, + "grad_norm": 0.23745647639017792, + "learning_rate": 1.5027235643236257e-07, + "loss": 0.5543, + "step": 3121 + }, + { + "epoch": 2.4677219886223103, + "grad_norm": 0.23550908797276845, + "learning_rate": 1.4983435520421849e-07, + "loss": 0.556, + "step": 3122 + }, + { + "epoch": 2.468513480089043, + "grad_norm": 0.23840297172092187, + "learning_rate": 1.493969415323747e-07, + "loss": 0.5395, + "step": 3123 + }, + { + "epoch": 2.4693049715557756, + "grad_norm": 0.23904245391064374, + "learning_rate": 1.4896011571913169e-07, + "loss": 0.5482, + "step": 3124 + }, + { + "epoch": 2.470096463022508, + "grad_norm": 0.23456396224995432, + "learning_rate": 1.485238780663832e-07, + "loss": 0.5415, + "step": 3125 + }, + { + "epoch": 2.470887954489241, + "grad_norm": 0.23216838017885, + "learning_rate": 1.4808822887561645e-07, + "loss": 0.5297, + "step": 3126 + }, + { + "epoch": 2.4716794459559734, + "grad_norm": 0.23579546284320718, + "learning_rate": 1.476531684479122e-07, + "loss": 0.5356, + "step": 3127 + }, + { + "epoch": 2.472470937422706, + "grad_norm": 0.23692145416780397, + "learning_rate": 1.4721869708394396e-07, + "loss": 0.5412, + "step": 3128 + }, + { + "epoch": 2.4732624288894387, + "grad_norm": 0.23638279385837965, + "learning_rate": 1.4678481508397868e-07, + "loss": 0.5545, + "step": 3129 + }, + { + "epoch": 2.4740539203561713, + "grad_norm": 0.23120172321373875, + "learning_rate": 1.4635152274787564e-07, + "loss": 0.5474, + "step": 3130 + }, + { + "epoch": 2.474845411822904, + "grad_norm": 0.22828148544503526, + "learning_rate": 1.4591882037508629e-07, + "loss": 0.541, + "step": 3131 + }, + { + "epoch": 2.4756369032896366, + "grad_norm": 0.24079017997885094, + "learning_rate": 1.4548670826465548e-07, + "loss": 0.5342, + "step": 3132 + }, + { + "epoch": 2.4764283947563692, + "grad_norm": 0.23558816283982148, + "learning_rate": 1.4505518671521854e-07, + "loss": 0.5485, + "step": 3133 + }, + { + "epoch": 2.477219886223102, + "grad_norm": 0.24578326210122056, + "learning_rate": 1.4462425602500395e-07, + "loss": 0.5331, + "step": 3134 + }, + { + "epoch": 2.4780113776898345, + "grad_norm": 0.2345514476597932, + "learning_rate": 1.441939164918312e-07, + "loss": 0.544, + "step": 3135 + }, + { + "epoch": 2.478802869156567, + "grad_norm": 0.24189182491688455, + "learning_rate": 1.4376416841311122e-07, + "loss": 0.5596, + "step": 3136 + }, + { + "epoch": 2.4795943606232997, + "grad_norm": 0.23566027035426818, + "learning_rate": 1.4333501208584708e-07, + "loss": 0.5493, + "step": 3137 + }, + { + "epoch": 2.4803858520900324, + "grad_norm": 0.23381446085837193, + "learning_rate": 1.429064478066312e-07, + "loss": 0.5291, + "step": 3138 + }, + { + "epoch": 2.481177343556765, + "grad_norm": 0.22876724413413854, + "learning_rate": 1.424784758716485e-07, + "loss": 0.541, + "step": 3139 + }, + { + "epoch": 2.4819688350234976, + "grad_norm": 0.23719487554609764, + "learning_rate": 1.4205109657667346e-07, + "loss": 0.5432, + "step": 3140 + }, + { + "epoch": 2.4827603264902303, + "grad_norm": 0.23126560447353134, + "learning_rate": 1.4162431021707134e-07, + "loss": 0.5397, + "step": 3141 + }, + { + "epoch": 2.483551817956963, + "grad_norm": 0.23569293373808486, + "learning_rate": 1.4119811708779816e-07, + "loss": 0.5503, + "step": 3142 + }, + { + "epoch": 2.4843433094236955, + "grad_norm": 0.24010869254677691, + "learning_rate": 1.4077251748339845e-07, + "loss": 0.5485, + "step": 3143 + }, + { + "epoch": 2.4851348008904277, + "grad_norm": 0.23668252067028273, + "learning_rate": 1.4034751169800828e-07, + "loss": 0.548, + "step": 3144 + }, + { + "epoch": 2.4859262923571603, + "grad_norm": 0.23061861151519614, + "learning_rate": 1.3992310002535235e-07, + "loss": 0.5334, + "step": 3145 + }, + { + "epoch": 2.486717783823893, + "grad_norm": 0.23424778790966844, + "learning_rate": 1.3949928275874468e-07, + "loss": 0.5522, + "step": 3146 + }, + { + "epoch": 2.4875092752906256, + "grad_norm": 0.23764531062904293, + "learning_rate": 1.3907606019108952e-07, + "loss": 0.5382, + "step": 3147 + }, + { + "epoch": 2.4883007667573582, + "grad_norm": 0.2344565009397954, + "learning_rate": 1.386534326148787e-07, + "loss": 0.544, + "step": 3148 + }, + { + "epoch": 2.489092258224091, + "grad_norm": 0.24085363881369712, + "learning_rate": 1.382314003221935e-07, + "loss": 0.5498, + "step": 3149 + }, + { + "epoch": 2.4898837496908235, + "grad_norm": 0.24149543811050822, + "learning_rate": 1.3780996360470432e-07, + "loss": 0.5442, + "step": 3150 + }, + { + "epoch": 2.490675241157556, + "grad_norm": 0.23687706561240804, + "learning_rate": 1.3738912275366898e-07, + "loss": 0.5445, + "step": 3151 + }, + { + "epoch": 2.4914667326242887, + "grad_norm": 0.24028087206167023, + "learning_rate": 1.369688780599344e-07, + "loss": 0.553, + "step": 3152 + }, + { + "epoch": 2.4922582240910214, + "grad_norm": 0.2362314693025883, + "learning_rate": 1.3654922981393503e-07, + "loss": 0.5555, + "step": 3153 + }, + { + "epoch": 2.493049715557754, + "grad_norm": 0.24010830855718476, + "learning_rate": 1.3613017830569252e-07, + "loss": 0.5493, + "step": 3154 + }, + { + "epoch": 2.4938412070244866, + "grad_norm": 0.23996396449725285, + "learning_rate": 1.3571172382481744e-07, + "loss": 0.5532, + "step": 3155 + }, + { + "epoch": 2.4946326984912193, + "grad_norm": 0.2565057060388274, + "learning_rate": 1.3529386666050656e-07, + "loss": 0.5539, + "step": 3156 + }, + { + "epoch": 2.495424189957952, + "grad_norm": 0.2280272817404365, + "learning_rate": 1.348766071015447e-07, + "loss": 0.5386, + "step": 3157 + }, + { + "epoch": 2.4962156814246845, + "grad_norm": 0.2332522637920383, + "learning_rate": 1.3445994543630346e-07, + "loss": 0.5246, + "step": 3158 + }, + { + "epoch": 2.497007172891417, + "grad_norm": 0.2321020310190722, + "learning_rate": 1.340438819527402e-07, + "loss": 0.5472, + "step": 3159 + }, + { + "epoch": 2.49779866435815, + "grad_norm": 0.23551347986342572, + "learning_rate": 1.3362841693840044e-07, + "loss": 0.548, + "step": 3160 + }, + { + "epoch": 2.4985901558248824, + "grad_norm": 0.23739641225781086, + "learning_rate": 1.3321355068041505e-07, + "loss": 0.555, + "step": 3161 + }, + { + "epoch": 2.499381647291615, + "grad_norm": 0.2383612647706636, + "learning_rate": 1.3279928346550185e-07, + "loss": 0.5559, + "step": 3162 + }, + { + "epoch": 2.5001731387583477, + "grad_norm": 0.23159310295039023, + "learning_rate": 1.3238561557996408e-07, + "loss": 0.5431, + "step": 3163 + }, + { + "epoch": 2.5009646302250803, + "grad_norm": 0.2364148670331119, + "learning_rate": 1.3197254730969042e-07, + "loss": 0.5365, + "step": 3164 + }, + { + "epoch": 2.501756121691813, + "grad_norm": 0.23841468904139962, + "learning_rate": 1.315600789401563e-07, + "loss": 0.5346, + "step": 3165 + }, + { + "epoch": 2.5025476131585456, + "grad_norm": 0.23440476605397462, + "learning_rate": 1.311482107564218e-07, + "loss": 0.5566, + "step": 3166 + }, + { + "epoch": 2.503339104625278, + "grad_norm": 0.2367995177563655, + "learning_rate": 1.3073694304313186e-07, + "loss": 0.553, + "step": 3167 + }, + { + "epoch": 2.504130596092011, + "grad_norm": 0.23088595959065358, + "learning_rate": 1.3032627608451774e-07, + "loss": 0.5359, + "step": 3168 + }, + { + "epoch": 2.5049220875587435, + "grad_norm": 0.23754198283978084, + "learning_rate": 1.299162101643938e-07, + "loss": 0.5498, + "step": 3169 + }, + { + "epoch": 2.505713579025476, + "grad_norm": 0.23484073685539716, + "learning_rate": 1.2950674556616047e-07, + "loss": 0.5263, + "step": 3170 + }, + { + "epoch": 2.5065050704922087, + "grad_norm": 0.23690510100612922, + "learning_rate": 1.2909788257280185e-07, + "loss": 0.5466, + "step": 3171 + }, + { + "epoch": 2.5072965619589413, + "grad_norm": 0.24288340940723652, + "learning_rate": 1.2868962146688612e-07, + "loss": 0.5522, + "step": 3172 + }, + { + "epoch": 2.508088053425674, + "grad_norm": 0.22880200656756094, + "learning_rate": 1.2828196253056633e-07, + "loss": 0.5294, + "step": 3173 + }, + { + "epoch": 2.5088795448924066, + "grad_norm": 0.23376621779238194, + "learning_rate": 1.278749060455787e-07, + "loss": 0.5447, + "step": 3174 + }, + { + "epoch": 2.5096710363591392, + "grad_norm": 0.23597945277757415, + "learning_rate": 1.274684522932431e-07, + "loss": 0.5561, + "step": 3175 + }, + { + "epoch": 2.510462527825872, + "grad_norm": 0.23957405682153782, + "learning_rate": 1.2706260155446302e-07, + "loss": 0.5466, + "step": 3176 + }, + { + "epoch": 2.5112540192926045, + "grad_norm": 0.2595199288251904, + "learning_rate": 1.26657354109725e-07, + "loss": 0.5421, + "step": 3177 + }, + { + "epoch": 2.512045510759337, + "grad_norm": 0.23222753790037834, + "learning_rate": 1.2625271023909924e-07, + "loss": 0.5526, + "step": 3178 + }, + { + "epoch": 2.5128370022260698, + "grad_norm": 0.23166962662010931, + "learning_rate": 1.25848670222238e-07, + "loss": 0.5343, + "step": 3179 + }, + { + "epoch": 2.5136284936928024, + "grad_norm": 0.23028563286774928, + "learning_rate": 1.2544523433837672e-07, + "loss": 0.5422, + "step": 3180 + }, + { + "epoch": 2.514419985159535, + "grad_norm": 0.2348245119892915, + "learning_rate": 1.2504240286633317e-07, + "loss": 0.5405, + "step": 3181 + }, + { + "epoch": 2.5152114766262677, + "grad_norm": 0.2334086471897704, + "learning_rate": 1.2464017608450706e-07, + "loss": 0.5252, + "step": 3182 + }, + { + "epoch": 2.5160029680930003, + "grad_norm": 0.23597440381205284, + "learning_rate": 1.2423855427088114e-07, + "loss": 0.5424, + "step": 3183 + }, + { + "epoch": 2.516794459559733, + "grad_norm": 0.24481159177007725, + "learning_rate": 1.238375377030192e-07, + "loss": 0.5425, + "step": 3184 + }, + { + "epoch": 2.5175859510264655, + "grad_norm": 0.23343778404294818, + "learning_rate": 1.2343712665806705e-07, + "loss": 0.5462, + "step": 3185 + }, + { + "epoch": 2.518377442493198, + "grad_norm": 0.23356755763259637, + "learning_rate": 1.2303732141275192e-07, + "loss": 0.5608, + "step": 3186 + }, + { + "epoch": 2.519168933959931, + "grad_norm": 0.23451727738070482, + "learning_rate": 1.2263812224338222e-07, + "loss": 0.5444, + "step": 3187 + }, + { + "epoch": 2.5199604254266634, + "grad_norm": 0.24347780073046107, + "learning_rate": 1.2223952942584804e-07, + "loss": 0.5452, + "step": 3188 + }, + { + "epoch": 2.520751916893396, + "grad_norm": 0.23632368926850109, + "learning_rate": 1.2184154323562003e-07, + "loss": 0.5468, + "step": 3189 + }, + { + "epoch": 2.5215434083601287, + "grad_norm": 0.2320789472342085, + "learning_rate": 1.2144416394774958e-07, + "loss": 0.5404, + "step": 3190 + }, + { + "epoch": 2.5223348998268613, + "grad_norm": 0.23496457576160137, + "learning_rate": 1.2104739183686863e-07, + "loss": 0.5344, + "step": 3191 + }, + { + "epoch": 2.523126391293594, + "grad_norm": 0.23491603881659645, + "learning_rate": 1.2065122717718946e-07, + "loss": 0.5444, + "step": 3192 + }, + { + "epoch": 2.5239178827603266, + "grad_norm": 0.23610267089430673, + "learning_rate": 1.2025567024250504e-07, + "loss": 0.545, + "step": 3193 + }, + { + "epoch": 2.524709374227059, + "grad_norm": 0.2373998158761749, + "learning_rate": 1.1986072130618762e-07, + "loss": 0.5372, + "step": 3194 + }, + { + "epoch": 2.525500865693792, + "grad_norm": 0.23146349394195206, + "learning_rate": 1.1946638064118975e-07, + "loss": 0.5409, + "step": 3195 + }, + { + "epoch": 2.5262923571605245, + "grad_norm": 0.23361231660690018, + "learning_rate": 1.190726485200434e-07, + "loss": 0.5412, + "step": 3196 + }, + { + "epoch": 2.527083848627257, + "grad_norm": 0.23289859428625342, + "learning_rate": 1.1867952521485958e-07, + "loss": 0.5352, + "step": 3197 + }, + { + "epoch": 2.5278753400939897, + "grad_norm": 0.23007999272443694, + "learning_rate": 1.1828701099732973e-07, + "loss": 0.547, + "step": 3198 + }, + { + "epoch": 2.5286668315607224, + "grad_norm": 0.23535159755948495, + "learning_rate": 1.1789510613872311e-07, + "loss": 0.5238, + "step": 3199 + }, + { + "epoch": 2.529458323027455, + "grad_norm": 0.2341115865553397, + "learning_rate": 1.1750381090988837e-07, + "loss": 0.5449, + "step": 3200 + }, + { + "epoch": 2.5302498144941876, + "grad_norm": 0.2315684136615418, + "learning_rate": 1.1711312558125275e-07, + "loss": 0.5455, + "step": 3201 + }, + { + "epoch": 2.5310413059609203, + "grad_norm": 0.2342899619112105, + "learning_rate": 1.1672305042282193e-07, + "loss": 0.5417, + "step": 3202 + }, + { + "epoch": 2.531832797427653, + "grad_norm": 0.23359039775215643, + "learning_rate": 1.1633358570418028e-07, + "loss": 0.5529, + "step": 3203 + }, + { + "epoch": 2.5326242888943855, + "grad_norm": 0.23345481895639528, + "learning_rate": 1.1594473169448982e-07, + "loss": 0.5338, + "step": 3204 + }, + { + "epoch": 2.533415780361118, + "grad_norm": 0.23947237723853748, + "learning_rate": 1.1555648866249068e-07, + "loss": 0.553, + "step": 3205 + }, + { + "epoch": 2.5342072718278508, + "grad_norm": 0.23313844462433372, + "learning_rate": 1.151688568765008e-07, + "loss": 0.5368, + "step": 3206 + }, + { + "epoch": 2.534998763294583, + "grad_norm": 0.23168108763650186, + "learning_rate": 1.1478183660441532e-07, + "loss": 0.549, + "step": 3207 + }, + { + "epoch": 2.5357902547613156, + "grad_norm": 0.23784493111942692, + "learning_rate": 1.143954281137075e-07, + "loss": 0.535, + "step": 3208 + }, + { + "epoch": 2.536581746228048, + "grad_norm": 0.2374851402734962, + "learning_rate": 1.1400963167142719e-07, + "loss": 0.5491, + "step": 3209 + }, + { + "epoch": 2.537373237694781, + "grad_norm": 0.2317170391103134, + "learning_rate": 1.1362444754420131e-07, + "loss": 0.5384, + "step": 3210 + }, + { + "epoch": 2.5381647291615135, + "grad_norm": 0.23267068568888466, + "learning_rate": 1.132398759982337e-07, + "loss": 0.5313, + "step": 3211 + }, + { + "epoch": 2.538956220628246, + "grad_norm": 0.23703211103344654, + "learning_rate": 1.1285591729930477e-07, + "loss": 0.546, + "step": 3212 + }, + { + "epoch": 2.5397477120949787, + "grad_norm": 0.2321030061925981, + "learning_rate": 1.124725717127718e-07, + "loss": 0.5463, + "step": 3213 + }, + { + "epoch": 2.5405392035617114, + "grad_norm": 0.23448127548658504, + "learning_rate": 1.1208983950356776e-07, + "loss": 0.5394, + "step": 3214 + }, + { + "epoch": 2.541330695028444, + "grad_norm": 0.22892239445385304, + "learning_rate": 1.1170772093620184e-07, + "loss": 0.5384, + "step": 3215 + }, + { + "epoch": 2.5421221864951766, + "grad_norm": 0.23527838040468413, + "learning_rate": 1.1132621627475969e-07, + "loss": 0.5435, + "step": 3216 + }, + { + "epoch": 2.5429136779619093, + "grad_norm": 0.2357248356865842, + "learning_rate": 1.1094532578290161e-07, + "loss": 0.5417, + "step": 3217 + }, + { + "epoch": 2.543705169428642, + "grad_norm": 0.23333939799332037, + "learning_rate": 1.1056504972386449e-07, + "loss": 0.5267, + "step": 3218 + }, + { + "epoch": 2.5444966608953745, + "grad_norm": 0.2355635099505826, + "learning_rate": 1.1018538836046021e-07, + "loss": 0.5296, + "step": 3219 + }, + { + "epoch": 2.545288152362107, + "grad_norm": 0.23679869926609523, + "learning_rate": 1.0980634195507543e-07, + "loss": 0.5419, + "step": 3220 + }, + { + "epoch": 2.54607964382884, + "grad_norm": 0.22770204545009404, + "learning_rate": 1.0942791076967272e-07, + "loss": 0.5326, + "step": 3221 + }, + { + "epoch": 2.5468711352955724, + "grad_norm": 0.24015483577927543, + "learning_rate": 1.0905009506578822e-07, + "loss": 0.5336, + "step": 3222 + }, + { + "epoch": 2.547662626762305, + "grad_norm": 0.24046943015257521, + "learning_rate": 1.0867289510453403e-07, + "loss": 0.5595, + "step": 3223 + }, + { + "epoch": 2.5484541182290377, + "grad_norm": 0.23767828535833865, + "learning_rate": 1.0829631114659576e-07, + "loss": 0.5229, + "step": 3224 + }, + { + "epoch": 2.5492456096957703, + "grad_norm": 0.2404685202276093, + "learning_rate": 1.0792034345223333e-07, + "loss": 0.5632, + "step": 3225 + }, + { + "epoch": 2.550037101162503, + "grad_norm": 0.23068258897467078, + "learning_rate": 1.0754499228128189e-07, + "loss": 0.532, + "step": 3226 + }, + { + "epoch": 2.5508285926292356, + "grad_norm": 0.23602688581386108, + "learning_rate": 1.071702578931486e-07, + "loss": 0.5358, + "step": 3227 + }, + { + "epoch": 2.551620084095968, + "grad_norm": 0.22894322587648852, + "learning_rate": 1.0679614054681618e-07, + "loss": 0.5329, + "step": 3228 + }, + { + "epoch": 2.552411575562701, + "grad_norm": 0.2312081834433616, + "learning_rate": 1.0642264050083983e-07, + "loss": 0.546, + "step": 3229 + }, + { + "epoch": 2.5532030670294334, + "grad_norm": 0.2419104611354076, + "learning_rate": 1.0604975801334826e-07, + "loss": 0.5473, + "step": 3230 + }, + { + "epoch": 2.553994558496166, + "grad_norm": 0.23702332247088082, + "learning_rate": 1.0567749334204412e-07, + "loss": 0.5442, + "step": 3231 + }, + { + "epoch": 2.5547860499628987, + "grad_norm": 0.23270703236793058, + "learning_rate": 1.0530584674420173e-07, + "loss": 0.5456, + "step": 3232 + }, + { + "epoch": 2.5555775414296313, + "grad_norm": 0.22961756966436309, + "learning_rate": 1.0493481847666963e-07, + "loss": 0.5352, + "step": 3233 + }, + { + "epoch": 2.556369032896364, + "grad_norm": 0.28629052688498435, + "learning_rate": 1.0456440879586825e-07, + "loss": 0.5511, + "step": 3234 + }, + { + "epoch": 2.5571605243630966, + "grad_norm": 0.24356184836417788, + "learning_rate": 1.041946179577905e-07, + "loss": 0.5294, + "step": 3235 + }, + { + "epoch": 2.5579520158298292, + "grad_norm": 0.23696380822952862, + "learning_rate": 1.0382544621800216e-07, + "loss": 0.5577, + "step": 3236 + }, + { + "epoch": 2.558743507296562, + "grad_norm": 0.23649415572248325, + "learning_rate": 1.0345689383164068e-07, + "loss": 0.5571, + "step": 3237 + }, + { + "epoch": 2.5595349987632945, + "grad_norm": 0.2425831872092614, + "learning_rate": 1.0308896105341547e-07, + "loss": 0.5494, + "step": 3238 + }, + { + "epoch": 2.560326490230027, + "grad_norm": 0.23098879668762667, + "learning_rate": 1.0272164813760787e-07, + "loss": 0.5318, + "step": 3239 + }, + { + "epoch": 2.5611179816967597, + "grad_norm": 0.23372087151188703, + "learning_rate": 1.0235495533807048e-07, + "loss": 0.5287, + "step": 3240 + }, + { + "epoch": 2.5619094731634924, + "grad_norm": 0.23265948131008293, + "learning_rate": 1.0198888290822827e-07, + "loss": 0.5396, + "step": 3241 + }, + { + "epoch": 2.562700964630225, + "grad_norm": 0.23426685852800547, + "learning_rate": 1.0162343110107641e-07, + "loss": 0.5274, + "step": 3242 + }, + { + "epoch": 2.5634924560969576, + "grad_norm": 0.2271260522432803, + "learning_rate": 1.0125860016918163e-07, + "loss": 0.5324, + "step": 3243 + }, + { + "epoch": 2.5642839475636903, + "grad_norm": 0.23571815367832072, + "learning_rate": 1.008943903646815e-07, + "loss": 0.5447, + "step": 3244 + }, + { + "epoch": 2.565075439030423, + "grad_norm": 0.23668867736100763, + "learning_rate": 1.0053080193928432e-07, + "loss": 0.5511, + "step": 3245 + }, + { + "epoch": 2.5658669304971555, + "grad_norm": 0.2295744892562428, + "learning_rate": 1.0016783514426919e-07, + "loss": 0.5336, + "step": 3246 + }, + { + "epoch": 2.566658421963888, + "grad_norm": 0.23245559143610908, + "learning_rate": 9.980549023048512e-08, + "loss": 0.5434, + "step": 3247 + }, + { + "epoch": 2.567449913430621, + "grad_norm": 0.23430477667816743, + "learning_rate": 9.944376744835181e-08, + "loss": 0.528, + "step": 3248 + }, + { + "epoch": 2.5682414048973534, + "grad_norm": 0.23550847832675992, + "learning_rate": 9.90826670478585e-08, + "loss": 0.5451, + "step": 3249 + }, + { + "epoch": 2.569032896364086, + "grad_norm": 0.23578221932952387, + "learning_rate": 9.872218927856468e-08, + "loss": 0.5546, + "step": 3250 + }, + { + "epoch": 2.5698243878308187, + "grad_norm": 0.2355131785530226, + "learning_rate": 9.836233438959962e-08, + "loss": 0.5599, + "step": 3251 + }, + { + "epoch": 2.5706158792975513, + "grad_norm": 0.23085690150964858, + "learning_rate": 9.800310262966183e-08, + "loss": 0.5392, + "step": 3252 + }, + { + "epoch": 2.571407370764284, + "grad_norm": 0.23610506349929572, + "learning_rate": 9.764449424701915e-08, + "loss": 0.5524, + "step": 3253 + }, + { + "epoch": 2.5721988622310166, + "grad_norm": 0.2320838939245817, + "learning_rate": 9.728650948950878e-08, + "loss": 0.5533, + "step": 3254 + }, + { + "epoch": 2.572990353697749, + "grad_norm": 0.2371740138803932, + "learning_rate": 9.69291486045366e-08, + "loss": 0.5498, + "step": 3255 + }, + { + "epoch": 2.573781845164482, + "grad_norm": 0.2345519697901397, + "learning_rate": 9.657241183907805e-08, + "loss": 0.5384, + "step": 3256 + }, + { + "epoch": 2.5745733366312145, + "grad_norm": 0.22977283745907118, + "learning_rate": 9.621629943967669e-08, + "loss": 0.536, + "step": 3257 + }, + { + "epoch": 2.575364828097947, + "grad_norm": 0.2317062114794596, + "learning_rate": 9.586081165244442e-08, + "loss": 0.5463, + "step": 3258 + }, + { + "epoch": 2.5761563195646797, + "grad_norm": 0.2361978413822048, + "learning_rate": 9.550594872306183e-08, + "loss": 0.55, + "step": 3259 + }, + { + "epoch": 2.5769478110314123, + "grad_norm": 0.22798913560472214, + "learning_rate": 9.515171089677731e-08, + "loss": 0.5372, + "step": 3260 + }, + { + "epoch": 2.577739302498145, + "grad_norm": 0.23256596597325183, + "learning_rate": 9.47980984184078e-08, + "loss": 0.5409, + "step": 3261 + }, + { + "epoch": 2.5785307939648776, + "grad_norm": 0.23189532545734856, + "learning_rate": 9.44451115323377e-08, + "loss": 0.5446, + "step": 3262 + }, + { + "epoch": 2.5793222854316102, + "grad_norm": 0.2336721072308425, + "learning_rate": 9.409275048251896e-08, + "loss": 0.5427, + "step": 3263 + }, + { + "epoch": 2.580113776898343, + "grad_norm": 0.24228150907741597, + "learning_rate": 9.374101551247115e-08, + "loss": 0.5629, + "step": 3264 + }, + { + "epoch": 2.5809052683650755, + "grad_norm": 0.2332936061305752, + "learning_rate": 9.338990686528092e-08, + "loss": 0.5439, + "step": 3265 + }, + { + "epoch": 2.581696759831808, + "grad_norm": 0.2603182451765577, + "learning_rate": 9.30394247836026e-08, + "loss": 0.5356, + "step": 3266 + }, + { + "epoch": 2.5824882512985408, + "grad_norm": 0.23445377307724014, + "learning_rate": 9.268956950965712e-08, + "loss": 0.5566, + "step": 3267 + }, + { + "epoch": 2.5832797427652734, + "grad_norm": 0.23170666610167254, + "learning_rate": 9.234034128523227e-08, + "loss": 0.5456, + "step": 3268 + }, + { + "epoch": 2.584071234232006, + "grad_norm": 0.23973468636497786, + "learning_rate": 9.199174035168222e-08, + "loss": 0.5471, + "step": 3269 + }, + { + "epoch": 2.5848627256987387, + "grad_norm": 0.23884964300759012, + "learning_rate": 9.164376694992804e-08, + "loss": 0.5288, + "step": 3270 + }, + { + "epoch": 2.5856542171654713, + "grad_norm": 0.23616746385245097, + "learning_rate": 9.129642132045712e-08, + "loss": 0.5297, + "step": 3271 + }, + { + "epoch": 2.586445708632204, + "grad_norm": 0.23211053247086316, + "learning_rate": 9.094970370332256e-08, + "loss": 0.5422, + "step": 3272 + }, + { + "epoch": 2.5872372000989365, + "grad_norm": 0.23237141188613064, + "learning_rate": 9.060361433814378e-08, + "loss": 0.5376, + "step": 3273 + }, + { + "epoch": 2.588028691565669, + "grad_norm": 0.23244278897303372, + "learning_rate": 9.025815346410604e-08, + "loss": 0.5499, + "step": 3274 + }, + { + "epoch": 2.588820183032402, + "grad_norm": 0.2405070173127924, + "learning_rate": 8.991332131995987e-08, + "loss": 0.55, + "step": 3275 + }, + { + "epoch": 2.5896116744991344, + "grad_norm": 0.22891260878735048, + "learning_rate": 8.956911814402157e-08, + "loss": 0.5409, + "step": 3276 + }, + { + "epoch": 2.590403165965867, + "grad_norm": 0.23211457918407827, + "learning_rate": 8.922554417417294e-08, + "loss": 0.5439, + "step": 3277 + }, + { + "epoch": 2.5911946574325997, + "grad_norm": 0.23800191370437923, + "learning_rate": 8.88825996478606e-08, + "loss": 0.5369, + "step": 3278 + }, + { + "epoch": 2.5919861488993323, + "grad_norm": 0.2422727156327912, + "learning_rate": 8.85402848020963e-08, + "loss": 0.5432, + "step": 3279 + }, + { + "epoch": 2.592777640366065, + "grad_norm": 0.24076231174928742, + "learning_rate": 8.819859987345645e-08, + "loss": 0.5611, + "step": 3280 + }, + { + "epoch": 2.5935691318327976, + "grad_norm": 0.23758532741580235, + "learning_rate": 8.785754509808207e-08, + "loss": 0.5401, + "step": 3281 + }, + { + "epoch": 2.59436062329953, + "grad_norm": 0.2349556462570344, + "learning_rate": 8.751712071167939e-08, + "loss": 0.5432, + "step": 3282 + }, + { + "epoch": 2.595152114766263, + "grad_norm": 0.22976396515524597, + "learning_rate": 8.717732694951796e-08, + "loss": 0.5421, + "step": 3283 + }, + { + "epoch": 2.5959436062329955, + "grad_norm": 0.23680472103924952, + "learning_rate": 8.683816404643252e-08, + "loss": 0.5355, + "step": 3284 + }, + { + "epoch": 2.596735097699728, + "grad_norm": 0.24204305535113652, + "learning_rate": 8.649963223682066e-08, + "loss": 0.5404, + "step": 3285 + }, + { + "epoch": 2.5975265891664607, + "grad_norm": 0.23402848897488596, + "learning_rate": 8.616173175464458e-08, + "loss": 0.5391, + "step": 3286 + }, + { + "epoch": 2.5983180806331934, + "grad_norm": 0.23380288641913877, + "learning_rate": 8.582446283343026e-08, + "loss": 0.5356, + "step": 3287 + }, + { + "epoch": 2.599109572099926, + "grad_norm": 0.23782795634634146, + "learning_rate": 8.548782570626666e-08, + "loss": 0.5286, + "step": 3288 + }, + { + "epoch": 2.5999010635666586, + "grad_norm": 0.23406536154328317, + "learning_rate": 8.515182060580683e-08, + "loss": 0.533, + "step": 3289 + }, + { + "epoch": 2.6006925550333913, + "grad_norm": 0.23267787452687938, + "learning_rate": 8.481644776426633e-08, + "loss": 0.5418, + "step": 3290 + }, + { + "epoch": 2.601484046500124, + "grad_norm": 0.23271510406162305, + "learning_rate": 8.448170741342364e-08, + "loss": 0.5467, + "step": 3291 + }, + { + "epoch": 2.6022755379668565, + "grad_norm": 0.23772719690013952, + "learning_rate": 8.414759978462116e-08, + "loss": 0.538, + "step": 3292 + }, + { + "epoch": 2.603067029433589, + "grad_norm": 0.2364391397005506, + "learning_rate": 8.381412510876307e-08, + "loss": 0.5479, + "step": 3293 + }, + { + "epoch": 2.6038585209003218, + "grad_norm": 0.23015522710192796, + "learning_rate": 8.348128361631634e-08, + "loss": 0.5447, + "step": 3294 + }, + { + "epoch": 2.6046500123670544, + "grad_norm": 0.2370660271484739, + "learning_rate": 8.314907553731054e-08, + "loss": 0.5463, + "step": 3295 + }, + { + "epoch": 2.605441503833787, + "grad_norm": 0.23698433146290473, + "learning_rate": 8.281750110133701e-08, + "loss": 0.5367, + "step": 3296 + }, + { + "epoch": 2.6062329953005197, + "grad_norm": 0.23910359816647553, + "learning_rate": 8.248656053754999e-08, + "loss": 0.5407, + "step": 3297 + }, + { + "epoch": 2.6070244867672523, + "grad_norm": 0.23728545288367459, + "learning_rate": 8.215625407466486e-08, + "loss": 0.5453, + "step": 3298 + }, + { + "epoch": 2.607815978233985, + "grad_norm": 0.23480932634169566, + "learning_rate": 8.182658194095914e-08, + "loss": 0.5515, + "step": 3299 + }, + { + "epoch": 2.6086074697007176, + "grad_norm": 0.23802606396141185, + "learning_rate": 8.14975443642718e-08, + "loss": 0.5509, + "step": 3300 + }, + { + "epoch": 2.60939896116745, + "grad_norm": 0.22972271494621538, + "learning_rate": 8.116914157200339e-08, + "loss": 0.5362, + "step": 3301 + }, + { + "epoch": 2.610190452634183, + "grad_norm": 0.23486271426552657, + "learning_rate": 8.084137379111577e-08, + "loss": 0.5218, + "step": 3302 + }, + { + "epoch": 2.610981944100915, + "grad_norm": 0.2306382742158135, + "learning_rate": 8.051424124813199e-08, + "loss": 0.5499, + "step": 3303 + }, + { + "epoch": 2.6117734355676476, + "grad_norm": 0.2343152822354507, + "learning_rate": 8.018774416913565e-08, + "loss": 0.5483, + "step": 3304 + }, + { + "epoch": 2.6125649270343803, + "grad_norm": 0.23138535389201664, + "learning_rate": 7.986188277977202e-08, + "loss": 0.5304, + "step": 3305 + }, + { + "epoch": 2.613356418501113, + "grad_norm": 0.23748623310209613, + "learning_rate": 7.953665730524584e-08, + "loss": 0.5455, + "step": 3306 + }, + { + "epoch": 2.6141479099678455, + "grad_norm": 0.23805936946734416, + "learning_rate": 7.921206797032354e-08, + "loss": 0.5558, + "step": 3307 + }, + { + "epoch": 2.614939401434578, + "grad_norm": 0.23203940724739996, + "learning_rate": 7.888811499933135e-08, + "loss": 0.5291, + "step": 3308 + }, + { + "epoch": 2.6157308929013108, + "grad_norm": 0.24196687723997037, + "learning_rate": 7.856479861615551e-08, + "loss": 0.5568, + "step": 3309 + }, + { + "epoch": 2.6165223843680434, + "grad_norm": 0.22756235172460287, + "learning_rate": 7.824211904424305e-08, + "loss": 0.5415, + "step": 3310 + }, + { + "epoch": 2.617313875834776, + "grad_norm": 0.2345438116625913, + "learning_rate": 7.792007650659981e-08, + "loss": 0.5402, + "step": 3311 + }, + { + "epoch": 2.6181053673015087, + "grad_norm": 0.2321566853556492, + "learning_rate": 7.759867122579245e-08, + "loss": 0.5306, + "step": 3312 + }, + { + "epoch": 2.6188968587682413, + "grad_norm": 0.2303537722632866, + "learning_rate": 7.727790342394657e-08, + "loss": 0.5417, + "step": 3313 + }, + { + "epoch": 2.619688350234974, + "grad_norm": 0.23668844423079852, + "learning_rate": 7.695777332274722e-08, + "loss": 0.5382, + "step": 3314 + }, + { + "epoch": 2.6204798417017066, + "grad_norm": 0.2414251156724988, + "learning_rate": 7.663828114343939e-08, + "loss": 0.5461, + "step": 3315 + }, + { + "epoch": 2.621271333168439, + "grad_norm": 0.2288192327340346, + "learning_rate": 7.631942710682593e-08, + "loss": 0.5508, + "step": 3316 + }, + { + "epoch": 2.622062824635172, + "grad_norm": 0.2309913863417435, + "learning_rate": 7.60012114332701e-08, + "loss": 0.5526, + "step": 3317 + }, + { + "epoch": 2.6228543161019044, + "grad_norm": 0.23524931378287434, + "learning_rate": 7.568363434269299e-08, + "loss": 0.5471, + "step": 3318 + }, + { + "epoch": 2.623645807568637, + "grad_norm": 0.23661072595053065, + "learning_rate": 7.536669605457446e-08, + "loss": 0.5576, + "step": 3319 + }, + { + "epoch": 2.6244372990353697, + "grad_norm": 0.24557479693371542, + "learning_rate": 7.505039678795377e-08, + "loss": 0.5527, + "step": 3320 + }, + { + "epoch": 2.6252287905021023, + "grad_norm": 0.23182584962680358, + "learning_rate": 7.473473676142716e-08, + "loss": 0.5423, + "step": 3321 + }, + { + "epoch": 2.626020281968835, + "grad_norm": 0.2320309639633608, + "learning_rate": 7.441971619315024e-08, + "loss": 0.5417, + "step": 3322 + }, + { + "epoch": 2.6268117734355676, + "grad_norm": 0.22920057909618372, + "learning_rate": 7.410533530083618e-08, + "loss": 0.5313, + "step": 3323 + }, + { + "epoch": 2.6276032649023002, + "grad_norm": 0.24020858834656983, + "learning_rate": 7.379159430175596e-08, + "loss": 0.5501, + "step": 3324 + }, + { + "epoch": 2.628394756369033, + "grad_norm": 0.2339868081826666, + "learning_rate": 7.34784934127387e-08, + "loss": 0.5492, + "step": 3325 + }, + { + "epoch": 2.6291862478357655, + "grad_norm": 0.23996926487304107, + "learning_rate": 7.316603285017098e-08, + "loss": 0.5329, + "step": 3326 + }, + { + "epoch": 2.629977739302498, + "grad_norm": 0.23393305474716156, + "learning_rate": 7.28542128299967e-08, + "loss": 0.5494, + "step": 3327 + }, + { + "epoch": 2.6307692307692307, + "grad_norm": 0.22963241060417952, + "learning_rate": 7.254303356771707e-08, + "loss": 0.5406, + "step": 3328 + }, + { + "epoch": 2.6315607222359634, + "grad_norm": 0.23886740051813526, + "learning_rate": 7.223249527839059e-08, + "loss": 0.5446, + "step": 3329 + }, + { + "epoch": 2.632352213702696, + "grad_norm": 0.22744165041862927, + "learning_rate": 7.192259817663304e-08, + "loss": 0.545, + "step": 3330 + }, + { + "epoch": 2.6331437051694286, + "grad_norm": 0.23670022225669177, + "learning_rate": 7.161334247661655e-08, + "loss": 0.5391, + "step": 3331 + }, + { + "epoch": 2.6339351966361613, + "grad_norm": 0.23971268561491108, + "learning_rate": 7.130472839207025e-08, + "loss": 0.5492, + "step": 3332 + }, + { + "epoch": 2.634726688102894, + "grad_norm": 0.2383035981779266, + "learning_rate": 7.099675613627998e-08, + "loss": 0.5399, + "step": 3333 + }, + { + "epoch": 2.6355181795696265, + "grad_norm": 0.23579330856816882, + "learning_rate": 7.068942592208737e-08, + "loss": 0.5416, + "step": 3334 + }, + { + "epoch": 2.636309671036359, + "grad_norm": 0.23575770594660464, + "learning_rate": 7.038273796189142e-08, + "loss": 0.5385, + "step": 3335 + }, + { + "epoch": 2.637101162503092, + "grad_norm": 0.23548350110621555, + "learning_rate": 7.007669246764625e-08, + "loss": 0.5537, + "step": 3336 + }, + { + "epoch": 2.6378926539698244, + "grad_norm": 0.23844529878142853, + "learning_rate": 6.977128965086243e-08, + "loss": 0.5469, + "step": 3337 + }, + { + "epoch": 2.638684145436557, + "grad_norm": 0.22686586098540332, + "learning_rate": 6.946652972260636e-08, + "loss": 0.54, + "step": 3338 + }, + { + "epoch": 2.6394756369032897, + "grad_norm": 0.23694325529283686, + "learning_rate": 6.916241289349988e-08, + "loss": 0.5447, + "step": 3339 + }, + { + "epoch": 2.6402671283700223, + "grad_norm": 0.2348179002723714, + "learning_rate": 6.885893937372089e-08, + "loss": 0.5433, + "step": 3340 + }, + { + "epoch": 2.641058619836755, + "grad_norm": 0.22958201656947394, + "learning_rate": 6.855610937300216e-08, + "loss": 0.5482, + "step": 3341 + }, + { + "epoch": 2.6418501113034876, + "grad_norm": 0.24174400808676, + "learning_rate": 6.825392310063205e-08, + "loss": 0.5617, + "step": 3342 + }, + { + "epoch": 2.64264160277022, + "grad_norm": 0.23418458670941242, + "learning_rate": 6.795238076545384e-08, + "loss": 0.54, + "step": 3343 + }, + { + "epoch": 2.643433094236953, + "grad_norm": 0.22909574160163376, + "learning_rate": 6.765148257586573e-08, + "loss": 0.5269, + "step": 3344 + }, + { + "epoch": 2.6442245857036855, + "grad_norm": 0.24226787531773467, + "learning_rate": 6.735122873982113e-08, + "loss": 0.5481, + "step": 3345 + }, + { + "epoch": 2.645016077170418, + "grad_norm": 0.2379068130862316, + "learning_rate": 6.705161946482773e-08, + "loss": 0.5585, + "step": 3346 + }, + { + "epoch": 2.6458075686371507, + "grad_norm": 0.23731897910040356, + "learning_rate": 6.67526549579479e-08, + "loss": 0.5412, + "step": 3347 + }, + { + "epoch": 2.6465990601038833, + "grad_norm": 0.23372179787258096, + "learning_rate": 6.645433542579848e-08, + "loss": 0.539, + "step": 3348 + }, + { + "epoch": 2.647390551570616, + "grad_norm": 0.23682260207578243, + "learning_rate": 6.615666107455031e-08, + "loss": 0.5603, + "step": 3349 + }, + { + "epoch": 2.6481820430373486, + "grad_norm": 0.23206983260338943, + "learning_rate": 6.585963210992885e-08, + "loss": 0.538, + "step": 3350 + }, + { + "epoch": 2.6489735345040812, + "grad_norm": 0.23993646439239033, + "learning_rate": 6.556324873721297e-08, + "loss": 0.5514, + "step": 3351 + }, + { + "epoch": 2.649765025970814, + "grad_norm": 0.236293576505746, + "learning_rate": 6.526751116123574e-08, + "loss": 0.5468, + "step": 3352 + }, + { + "epoch": 2.6505565174375465, + "grad_norm": 0.23182344719612016, + "learning_rate": 6.497241958638367e-08, + "loss": 0.5404, + "step": 3353 + }, + { + "epoch": 2.651348008904279, + "grad_norm": 0.2301902403497955, + "learning_rate": 6.46779742165967e-08, + "loss": 0.5458, + "step": 3354 + }, + { + "epoch": 2.6521395003710118, + "grad_norm": 0.2343777032733534, + "learning_rate": 6.438417525536876e-08, + "loss": 0.5409, + "step": 3355 + }, + { + "epoch": 2.6529309918377444, + "grad_norm": 0.23665232079532303, + "learning_rate": 6.409102290574653e-08, + "loss": 0.5459, + "step": 3356 + }, + { + "epoch": 2.653722483304477, + "grad_norm": 0.2376784514350585, + "learning_rate": 6.379851737032993e-08, + "loss": 0.5381, + "step": 3357 + }, + { + "epoch": 2.6545139747712097, + "grad_norm": 0.23866739464754536, + "learning_rate": 6.350665885127193e-08, + "loss": 0.5529, + "step": 3358 + }, + { + "epoch": 2.6553054662379423, + "grad_norm": 0.2293476005110921, + "learning_rate": 6.321544755027819e-08, + "loss": 0.5401, + "step": 3359 + }, + { + "epoch": 2.656096957704675, + "grad_norm": 0.23295232015536255, + "learning_rate": 6.292488366860726e-08, + "loss": 0.54, + "step": 3360 + }, + { + "epoch": 2.656888449171407, + "grad_norm": 0.23398681265126528, + "learning_rate": 6.26349674070703e-08, + "loss": 0.5423, + "step": 3361 + }, + { + "epoch": 2.6576799406381397, + "grad_norm": 0.2425264702879415, + "learning_rate": 6.234569896603049e-08, + "loss": 0.5454, + "step": 3362 + }, + { + "epoch": 2.6584714321048724, + "grad_norm": 0.2312996214940595, + "learning_rate": 6.205707854540376e-08, + "loss": 0.5454, + "step": 3363 + }, + { + "epoch": 2.659262923571605, + "grad_norm": 0.2277981269472317, + "learning_rate": 6.176910634465781e-08, + "loss": 0.5387, + "step": 3364 + }, + { + "epoch": 2.6600544150383376, + "grad_norm": 0.23853364569520374, + "learning_rate": 6.148178256281289e-08, + "loss": 0.5528, + "step": 3365 + }, + { + "epoch": 2.6608459065050702, + "grad_norm": 0.22907449802683474, + "learning_rate": 6.119510739844046e-08, + "loss": 0.5474, + "step": 3366 + }, + { + "epoch": 2.661637397971803, + "grad_norm": 0.2299247984715103, + "learning_rate": 6.090908104966397e-08, + "loss": 0.5411, + "step": 3367 + }, + { + "epoch": 2.6624288894385355, + "grad_norm": 0.227551049767968, + "learning_rate": 6.062370371415903e-08, + "loss": 0.5402, + "step": 3368 + }, + { + "epoch": 2.663220380905268, + "grad_norm": 0.23378723243047844, + "learning_rate": 6.033897558915158e-08, + "loss": 0.5485, + "step": 3369 + }, + { + "epoch": 2.6640118723720008, + "grad_norm": 0.2394556610967616, + "learning_rate": 6.005489687141996e-08, + "loss": 0.5486, + "step": 3370 + }, + { + "epoch": 2.6648033638387334, + "grad_norm": 0.23009070758179592, + "learning_rate": 5.977146775729302e-08, + "loss": 0.5354, + "step": 3371 + }, + { + "epoch": 2.665594855305466, + "grad_norm": 0.2363466914378266, + "learning_rate": 5.948868844265076e-08, + "loss": 0.5447, + "step": 3372 + }, + { + "epoch": 2.6663863467721987, + "grad_norm": 0.23746064750217005, + "learning_rate": 5.9206559122924826e-08, + "loss": 0.5326, + "step": 3373 + }, + { + "epoch": 2.6671778382389313, + "grad_norm": 0.2366549271451058, + "learning_rate": 5.892507999309637e-08, + "loss": 0.5402, + "step": 3374 + }, + { + "epoch": 2.667969329705664, + "grad_norm": 0.2394632783060834, + "learning_rate": 5.864425124769823e-08, + "loss": 0.5428, + "step": 3375 + }, + { + "epoch": 2.6687608211723965, + "grad_norm": 0.2412885040753179, + "learning_rate": 5.836407308081337e-08, + "loss": 0.5401, + "step": 3376 + }, + { + "epoch": 2.669552312639129, + "grad_norm": 0.23311997952084815, + "learning_rate": 5.808454568607501e-08, + "loss": 0.5518, + "step": 3377 + }, + { + "epoch": 2.670343804105862, + "grad_norm": 0.23394786556968425, + "learning_rate": 5.780566925666719e-08, + "loss": 0.527, + "step": 3378 + }, + { + "epoch": 2.6711352955725944, + "grad_norm": 0.2316218940832262, + "learning_rate": 5.752744398532317e-08, + "loss": 0.5427, + "step": 3379 + }, + { + "epoch": 2.671926787039327, + "grad_norm": 0.23739609076706067, + "learning_rate": 5.724987006432691e-08, + "loss": 0.5413, + "step": 3380 + }, + { + "epoch": 2.6727182785060597, + "grad_norm": 0.23654663494691486, + "learning_rate": 5.697294768551197e-08, + "loss": 0.543, + "step": 3381 + }, + { + "epoch": 2.6735097699727923, + "grad_norm": 0.24486486134295715, + "learning_rate": 5.6696677040261376e-08, + "loss": 0.5555, + "step": 3382 + }, + { + "epoch": 2.674301261439525, + "grad_norm": 0.22913015899333763, + "learning_rate": 5.6421058319508495e-08, + "loss": 0.5444, + "step": 3383 + }, + { + "epoch": 2.6750927529062576, + "grad_norm": 0.23127123534788113, + "learning_rate": 5.61460917137353e-08, + "loss": 0.5307, + "step": 3384 + }, + { + "epoch": 2.67588424437299, + "grad_norm": 0.23662149668399493, + "learning_rate": 5.587177741297322e-08, + "loss": 0.5403, + "step": 3385 + }, + { + "epoch": 2.676675735839723, + "grad_norm": 0.22941545147855613, + "learning_rate": 5.559811560680361e-08, + "loss": 0.538, + "step": 3386 + }, + { + "epoch": 2.6774672273064555, + "grad_norm": 0.23557226516061405, + "learning_rate": 5.5325106484355735e-08, + "loss": 0.549, + "step": 3387 + }, + { + "epoch": 2.678258718773188, + "grad_norm": 0.23903798783195193, + "learning_rate": 5.5052750234308884e-08, + "loss": 0.5384, + "step": 3388 + }, + { + "epoch": 2.6790502102399207, + "grad_norm": 0.23390759042618223, + "learning_rate": 5.478104704489061e-08, + "loss": 0.55, + "step": 3389 + }, + { + "epoch": 2.6798417017066534, + "grad_norm": 0.24319314506092046, + "learning_rate": 5.4509997103876824e-08, + "loss": 0.5478, + "step": 3390 + }, + { + "epoch": 2.680633193173386, + "grad_norm": 0.2324985845713776, + "learning_rate": 5.423960059859256e-08, + "loss": 0.5503, + "step": 3391 + }, + { + "epoch": 2.6814246846401186, + "grad_norm": 0.23451652089274386, + "learning_rate": 5.396985771591078e-08, + "loss": 0.5471, + "step": 3392 + }, + { + "epoch": 2.6822161761068513, + "grad_norm": 0.24989306663252006, + "learning_rate": 5.3700768642253366e-08, + "loss": 0.5356, + "step": 3393 + }, + { + "epoch": 2.683007667573584, + "grad_norm": 0.24397093985512394, + "learning_rate": 5.343233356358989e-08, + "loss": 0.5412, + "step": 3394 + }, + { + "epoch": 2.6837991590403165, + "grad_norm": 0.229773232873424, + "learning_rate": 5.3164552665437404e-08, + "loss": 0.5456, + "step": 3395 + }, + { + "epoch": 2.684590650507049, + "grad_norm": 0.23722507770809703, + "learning_rate": 5.289742613286208e-08, + "loss": 0.5481, + "step": 3396 + }, + { + "epoch": 2.6853821419737818, + "grad_norm": 0.2373573136152371, + "learning_rate": 5.2630954150476914e-08, + "loss": 0.5358, + "step": 3397 + }, + { + "epoch": 2.6861736334405144, + "grad_norm": 0.23428134873211282, + "learning_rate": 5.236513690244282e-08, + "loss": 0.553, + "step": 3398 + }, + { + "epoch": 2.686965124907247, + "grad_norm": 0.23310966726833876, + "learning_rate": 5.209997457246851e-08, + "loss": 0.5243, + "step": 3399 + }, + { + "epoch": 2.6877566163739797, + "grad_norm": 0.23758896816669747, + "learning_rate": 5.183546734380928e-08, + "loss": 0.5468, + "step": 3400 + }, + { + "epoch": 2.6885481078407123, + "grad_norm": 0.23864284541641398, + "learning_rate": 5.15716153992688e-08, + "loss": 0.5437, + "step": 3401 + }, + { + "epoch": 2.689339599307445, + "grad_norm": 0.2277969446365102, + "learning_rate": 5.130841892119686e-08, + "loss": 0.5457, + "step": 3402 + }, + { + "epoch": 2.6901310907741776, + "grad_norm": 0.22992842908824299, + "learning_rate": 5.1045878091490636e-08, + "loss": 0.5465, + "step": 3403 + }, + { + "epoch": 2.69092258224091, + "grad_norm": 0.22899571568499077, + "learning_rate": 5.0783993091594643e-08, + "loss": 0.537, + "step": 3404 + }, + { + "epoch": 2.691714073707643, + "grad_norm": 0.2426667299516774, + "learning_rate": 5.052276410249912e-08, + "loss": 0.5419, + "step": 3405 + }, + { + "epoch": 2.6925055651743754, + "grad_norm": 0.23553616708092487, + "learning_rate": 5.026219130474185e-08, + "loss": 0.5412, + "step": 3406 + }, + { + "epoch": 2.693297056641108, + "grad_norm": 0.23398828031835345, + "learning_rate": 5.0002274878406804e-08, + "loss": 0.5396, + "step": 3407 + }, + { + "epoch": 2.6940885481078407, + "grad_norm": 0.23580587669446443, + "learning_rate": 4.974301500312417e-08, + "loss": 0.5334, + "step": 3408 + }, + { + "epoch": 2.6948800395745733, + "grad_norm": 0.24035198235528163, + "learning_rate": 4.948441185807062e-08, + "loss": 0.5468, + "step": 3409 + }, + { + "epoch": 2.695671531041306, + "grad_norm": 0.23806444946182978, + "learning_rate": 4.9226465621968974e-08, + "loss": 0.55, + "step": 3410 + }, + { + "epoch": 2.6964630225080386, + "grad_norm": 0.23194104144441952, + "learning_rate": 4.8969176473087737e-08, + "loss": 0.5496, + "step": 3411 + }, + { + "epoch": 2.6972545139747712, + "grad_norm": 0.23867628766648052, + "learning_rate": 4.871254458924157e-08, + "loss": 0.5381, + "step": 3412 + }, + { + "epoch": 2.698046005441504, + "grad_norm": 0.23339696658980788, + "learning_rate": 4.8456570147790696e-08, + "loss": 0.5436, + "step": 3413 + }, + { + "epoch": 2.6988374969082365, + "grad_norm": 0.2320492748563772, + "learning_rate": 4.82012533256414e-08, + "loss": 0.552, + "step": 3414 + }, + { + "epoch": 2.699628988374969, + "grad_norm": 0.2325736734263487, + "learning_rate": 4.794659429924508e-08, + "loss": 0.5392, + "step": 3415 + }, + { + "epoch": 2.7004204798417017, + "grad_norm": 0.22975884339493674, + "learning_rate": 4.769259324459851e-08, + "loss": 0.5343, + "step": 3416 + }, + { + "epoch": 2.7012119713084344, + "grad_norm": 0.2298298087474154, + "learning_rate": 4.743925033724405e-08, + "loss": 0.5543, + "step": 3417 + }, + { + "epoch": 2.702003462775167, + "grad_norm": 0.2344361780445073, + "learning_rate": 4.718656575226865e-08, + "loss": 0.5499, + "step": 3418 + }, + { + "epoch": 2.7027949542418996, + "grad_norm": 0.24206674411947315, + "learning_rate": 4.693453966430505e-08, + "loss": 0.5465, + "step": 3419 + }, + { + "epoch": 2.7035864457086323, + "grad_norm": 0.22954596310049344, + "learning_rate": 4.668317224753049e-08, + "loss": 0.5249, + "step": 3420 + }, + { + "epoch": 2.704377937175365, + "grad_norm": 0.23727187656397647, + "learning_rate": 4.643246367566678e-08, + "loss": 0.5427, + "step": 3421 + }, + { + "epoch": 2.7051694286420975, + "grad_norm": 0.2363763923476531, + "learning_rate": 4.618241412198076e-08, + "loss": 0.5473, + "step": 3422 + }, + { + "epoch": 2.70596092010883, + "grad_norm": 0.2401557462835347, + "learning_rate": 4.593302375928343e-08, + "loss": 0.5621, + "step": 3423 + }, + { + "epoch": 2.706752411575563, + "grad_norm": 0.23243396201237482, + "learning_rate": 4.568429275993091e-08, + "loss": 0.5389, + "step": 3424 + }, + { + "epoch": 2.7075439030422954, + "grad_norm": 0.23562065303751875, + "learning_rate": 4.543622129582303e-08, + "loss": 0.5527, + "step": 3425 + }, + { + "epoch": 2.708335394509028, + "grad_norm": 0.2419214235285186, + "learning_rate": 4.518880953840376e-08, + "loss": 0.5419, + "step": 3426 + }, + { + "epoch": 2.7091268859757607, + "grad_norm": 0.231034387406549, + "learning_rate": 4.4942057658661655e-08, + "loss": 0.5427, + "step": 3427 + }, + { + "epoch": 2.7099183774424933, + "grad_norm": 0.23365120173504692, + "learning_rate": 4.469596582712854e-08, + "loss": 0.5377, + "step": 3428 + }, + { + "epoch": 2.710709868909226, + "grad_norm": 0.23276674721623392, + "learning_rate": 4.4450534213880785e-08, + "loss": 0.5447, + "step": 3429 + }, + { + "epoch": 2.7115013603759586, + "grad_norm": 0.23521469344731916, + "learning_rate": 4.4205762988538175e-08, + "loss": 0.536, + "step": 3430 + }, + { + "epoch": 2.712292851842691, + "grad_norm": 0.23243297975161775, + "learning_rate": 4.3961652320263834e-08, + "loss": 0.5462, + "step": 3431 + }, + { + "epoch": 2.713084343309424, + "grad_norm": 0.23899991736523044, + "learning_rate": 4.371820237776469e-08, + "loss": 0.5569, + "step": 3432 + }, + { + "epoch": 2.7138758347761565, + "grad_norm": 0.2350694293606461, + "learning_rate": 4.347541332929095e-08, + "loss": 0.5585, + "step": 3433 + }, + { + "epoch": 2.714667326242889, + "grad_norm": 0.2398373071592293, + "learning_rate": 4.323328534263615e-08, + "loss": 0.5427, + "step": 3434 + }, + { + "epoch": 2.7154588177096217, + "grad_norm": 0.22978786890343167, + "learning_rate": 4.2991818585136786e-08, + "loss": 0.537, + "step": 3435 + }, + { + "epoch": 2.7162503091763543, + "grad_norm": 0.23521253337923248, + "learning_rate": 4.275101322367258e-08, + "loss": 0.54, + "step": 3436 + }, + { + "epoch": 2.717041800643087, + "grad_norm": 0.23354481297301627, + "learning_rate": 4.2510869424665974e-08, + "loss": 0.5434, + "step": 3437 + }, + { + "epoch": 2.7178332921098196, + "grad_norm": 0.22686230379711145, + "learning_rate": 4.227138735408209e-08, + "loss": 0.533, + "step": 3438 + }, + { + "epoch": 2.7186247835765522, + "grad_norm": 0.2309054443517431, + "learning_rate": 4.203256717742931e-08, + "loss": 0.533, + "step": 3439 + }, + { + "epoch": 2.719416275043285, + "grad_norm": 0.24156887530710502, + "learning_rate": 4.1794409059757954e-08, + "loss": 0.5467, + "step": 3440 + }, + { + "epoch": 2.7202077665100175, + "grad_norm": 0.23660085736384973, + "learning_rate": 4.155691316566101e-08, + "loss": 0.54, + "step": 3441 + }, + { + "epoch": 2.72099925797675, + "grad_norm": 0.24514687244327169, + "learning_rate": 4.1320079659273864e-08, + "loss": 0.5469, + "step": 3442 + }, + { + "epoch": 2.7217907494434828, + "grad_norm": 0.23847074510715047, + "learning_rate": 4.1083908704273805e-08, + "loss": 0.5561, + "step": 3443 + }, + { + "epoch": 2.7225822409102154, + "grad_norm": 0.2358355223625499, + "learning_rate": 4.084840046388083e-08, + "loss": 0.5403, + "step": 3444 + }, + { + "epoch": 2.723373732376948, + "grad_norm": 0.23573668088691632, + "learning_rate": 4.06135551008564e-08, + "loss": 0.5549, + "step": 3445 + }, + { + "epoch": 2.7241652238436806, + "grad_norm": 0.23490689572976287, + "learning_rate": 4.037937277750403e-08, + "loss": 0.5453, + "step": 3446 + }, + { + "epoch": 2.7249567153104133, + "grad_norm": 0.23070750162727222, + "learning_rate": 4.014585365566914e-08, + "loss": 0.5508, + "step": 3447 + }, + { + "epoch": 2.725748206777146, + "grad_norm": 0.23341170525102728, + "learning_rate": 3.99129978967383e-08, + "loss": 0.5528, + "step": 3448 + }, + { + "epoch": 2.7265396982438785, + "grad_norm": 0.23470833326746446, + "learning_rate": 3.9680805661640335e-08, + "loss": 0.5148, + "step": 3449 + }, + { + "epoch": 2.727331189710611, + "grad_norm": 0.2311784537193434, + "learning_rate": 3.944927711084511e-08, + "loss": 0.5426, + "step": 3450 + }, + { + "epoch": 2.728122681177344, + "grad_norm": 0.2356104303311427, + "learning_rate": 3.9218412404363854e-08, + "loss": 0.5282, + "step": 3451 + }, + { + "epoch": 2.7289141726440764, + "grad_norm": 0.23252944127723466, + "learning_rate": 3.898821170174904e-08, + "loss": 0.537, + "step": 3452 + }, + { + "epoch": 2.729705664110809, + "grad_norm": 0.24246467763086027, + "learning_rate": 3.875867516209397e-08, + "loss": 0.5523, + "step": 3453 + }, + { + "epoch": 2.7304971555775417, + "grad_norm": 0.23288654085502317, + "learning_rate": 3.852980294403352e-08, + "loss": 0.5458, + "step": 3454 + }, + { + "epoch": 2.7312886470442743, + "grad_norm": 0.23268975477803808, + "learning_rate": 3.830159520574294e-08, + "loss": 0.5381, + "step": 3455 + }, + { + "epoch": 2.732080138511007, + "grad_norm": 0.23980609129077946, + "learning_rate": 3.807405210493841e-08, + "loss": 0.5461, + "step": 3456 + }, + { + "epoch": 2.732871629977739, + "grad_norm": 0.23630067177042066, + "learning_rate": 3.7847173798877033e-08, + "loss": 0.5478, + "step": 3457 + }, + { + "epoch": 2.7336631214444718, + "grad_norm": 0.23318535717061248, + "learning_rate": 3.762096044435592e-08, + "loss": 0.5508, + "step": 3458 + }, + { + "epoch": 2.7344546129112044, + "grad_norm": 0.23267507197446813, + "learning_rate": 3.739541219771314e-08, + "loss": 0.5473, + "step": 3459 + }, + { + "epoch": 2.735246104377937, + "grad_norm": 0.2399675008164859, + "learning_rate": 3.7170529214826774e-08, + "loss": 0.548, + "step": 3460 + }, + { + "epoch": 2.7360375958446697, + "grad_norm": 0.22669170257218765, + "learning_rate": 3.694631165111528e-08, + "loss": 0.5336, + "step": 3461 + }, + { + "epoch": 2.7368290873114023, + "grad_norm": 0.23240953576710244, + "learning_rate": 3.6722759661537374e-08, + "loss": 0.5335, + "step": 3462 + }, + { + "epoch": 2.737620578778135, + "grad_norm": 0.23444655359985708, + "learning_rate": 3.6499873400591353e-08, + "loss": 0.5403, + "step": 3463 + }, + { + "epoch": 2.7384120702448675, + "grad_norm": 0.23233076560046437, + "learning_rate": 3.627765302231589e-08, + "loss": 0.5384, + "step": 3464 + }, + { + "epoch": 2.7392035617116, + "grad_norm": 0.2347489961324914, + "learning_rate": 3.605609868028925e-08, + "loss": 0.5277, + "step": 3465 + }, + { + "epoch": 2.739995053178333, + "grad_norm": 0.2299054594238578, + "learning_rate": 3.583521052762928e-08, + "loss": 0.5346, + "step": 3466 + }, + { + "epoch": 2.7407865446450654, + "grad_norm": 0.23102598850353004, + "learning_rate": 3.5614988716993975e-08, + "loss": 0.5383, + "step": 3467 + }, + { + "epoch": 2.741578036111798, + "grad_norm": 0.23021780179591364, + "learning_rate": 3.5395433400579823e-08, + "loss": 0.5416, + "step": 3468 + }, + { + "epoch": 2.7423695275785307, + "grad_norm": 0.2333466076786601, + "learning_rate": 3.5176544730123546e-08, + "loss": 0.5482, + "step": 3469 + }, + { + "epoch": 2.7431610190452633, + "grad_norm": 0.2296607821578593, + "learning_rate": 3.4958322856900925e-08, + "loss": 0.5385, + "step": 3470 + }, + { + "epoch": 2.743952510511996, + "grad_norm": 0.22970187576139794, + "learning_rate": 3.4740767931726754e-08, + "loss": 0.5465, + "step": 3471 + }, + { + "epoch": 2.7447440019787286, + "grad_norm": 0.23140483990915722, + "learning_rate": 3.4523880104955105e-08, + "loss": 0.5371, + "step": 3472 + }, + { + "epoch": 2.745535493445461, + "grad_norm": 0.24235975648798602, + "learning_rate": 3.430765952647863e-08, + "loss": 0.5448, + "step": 3473 + }, + { + "epoch": 2.746326984912194, + "grad_norm": 0.23555635868935854, + "learning_rate": 3.409210634572934e-08, + "loss": 0.5394, + "step": 3474 + }, + { + "epoch": 2.7471184763789265, + "grad_norm": 0.23966553106887273, + "learning_rate": 3.387722071167765e-08, + "loss": 0.5424, + "step": 3475 + }, + { + "epoch": 2.747909967845659, + "grad_norm": 0.24839035325363107, + "learning_rate": 3.366300277283285e-08, + "loss": 0.5536, + "step": 3476 + }, + { + "epoch": 2.7487014593123917, + "grad_norm": 0.2286134790665496, + "learning_rate": 3.344945267724264e-08, + "loss": 0.5379, + "step": 3477 + }, + { + "epoch": 2.7494929507791244, + "grad_norm": 0.2333887450627868, + "learning_rate": 3.323657057249318e-08, + "loss": 0.5345, + "step": 3478 + }, + { + "epoch": 2.750284442245857, + "grad_norm": 0.2304774829009927, + "learning_rate": 3.302435660570901e-08, + "loss": 0.529, + "step": 3479 + }, + { + "epoch": 2.7510759337125896, + "grad_norm": 0.2364774765020302, + "learning_rate": 3.281281092355293e-08, + "loss": 0.5644, + "step": 3480 + }, + { + "epoch": 2.7518674251793223, + "grad_norm": 0.23448978825284153, + "learning_rate": 3.2601933672225634e-08, + "loss": 0.5481, + "step": 3481 + }, + { + "epoch": 2.752658916646055, + "grad_norm": 0.23242910389211896, + "learning_rate": 3.239172499746645e-08, + "loss": 0.5507, + "step": 3482 + }, + { + "epoch": 2.7534504081127875, + "grad_norm": 0.2327606911525051, + "learning_rate": 3.2182185044552166e-08, + "loss": 0.5408, + "step": 3483 + }, + { + "epoch": 2.75424189957952, + "grad_norm": 0.2391192422876664, + "learning_rate": 3.1973313958297496e-08, + "loss": 0.5402, + "step": 3484 + }, + { + "epoch": 2.7550333910462528, + "grad_norm": 0.2604482409561624, + "learning_rate": 3.176511188305475e-08, + "loss": 0.5471, + "step": 3485 + }, + { + "epoch": 2.7558248825129854, + "grad_norm": 0.23540208883453675, + "learning_rate": 3.155757896271427e-08, + "loss": 0.5567, + "step": 3486 + }, + { + "epoch": 2.756616373979718, + "grad_norm": 0.23058161967027863, + "learning_rate": 3.1350715340703656e-08, + "loss": 0.5371, + "step": 3487 + }, + { + "epoch": 2.7574078654464507, + "grad_norm": 0.242007068792223, + "learning_rate": 3.1144521159988114e-08, + "loss": 0.5471, + "step": 3488 + }, + { + "epoch": 2.7581993569131833, + "grad_norm": 0.2354541265211657, + "learning_rate": 3.0938996563069865e-08, + "loss": 0.5578, + "step": 3489 + }, + { + "epoch": 2.758990848379916, + "grad_norm": 0.23229659189411686, + "learning_rate": 3.0734141691988845e-08, + "loss": 0.5353, + "step": 3490 + }, + { + "epoch": 2.7597823398466486, + "grad_norm": 0.2291565180226149, + "learning_rate": 3.0529956688321587e-08, + "loss": 0.5517, + "step": 3491 + }, + { + "epoch": 2.760573831313381, + "grad_norm": 0.23328504089921456, + "learning_rate": 3.032644169318244e-08, + "loss": 0.5435, + "step": 3492 + }, + { + "epoch": 2.761365322780114, + "grad_norm": 0.2377236397176347, + "learning_rate": 3.012359684722199e-08, + "loss": 0.5384, + "step": 3493 + }, + { + "epoch": 2.7621568142468464, + "grad_norm": 0.23127161748830127, + "learning_rate": 2.9921422290627773e-08, + "loss": 0.5551, + "step": 3494 + }, + { + "epoch": 2.762948305713579, + "grad_norm": 0.23847532948448244, + "learning_rate": 2.971991816312458e-08, + "loss": 0.5488, + "step": 3495 + }, + { + "epoch": 2.7637397971803117, + "grad_norm": 0.22552587630689253, + "learning_rate": 2.9519084603973344e-08, + "loss": 0.5414, + "step": 3496 + }, + { + "epoch": 2.7645312886470443, + "grad_norm": 0.23767561808758256, + "learning_rate": 2.9318921751971813e-08, + "loss": 0.5354, + "step": 3497 + }, + { + "epoch": 2.765322780113777, + "grad_norm": 0.23387258295826305, + "learning_rate": 2.9119429745454115e-08, + "loss": 0.5425, + "step": 3498 + }, + { + "epoch": 2.7661142715805096, + "grad_norm": 0.23093274846171483, + "learning_rate": 2.892060872229074e-08, + "loss": 0.5475, + "step": 3499 + }, + { + "epoch": 2.7669057630472422, + "grad_norm": 0.23556135428259767, + "learning_rate": 2.872245881988855e-08, + "loss": 0.548, + "step": 3500 + }, + { + "epoch": 2.767697254513975, + "grad_norm": 0.23018517015742726, + "learning_rate": 2.852498017519045e-08, + "loss": 0.5444, + "step": 3501 + }, + { + "epoch": 2.7684887459807075, + "grad_norm": 0.23840627780612114, + "learning_rate": 2.8328172924675596e-08, + "loss": 0.5336, + "step": 3502 + }, + { + "epoch": 2.76928023744744, + "grad_norm": 0.23468436703558332, + "learning_rate": 2.8132037204359082e-08, + "loss": 0.5435, + "step": 3503 + }, + { + "epoch": 2.7700717289141727, + "grad_norm": 0.23643967159386806, + "learning_rate": 2.793657314979192e-08, + "loss": 0.5495, + "step": 3504 + }, + { + "epoch": 2.7708632203809054, + "grad_norm": 0.23623426572835918, + "learning_rate": 2.7741780896060717e-08, + "loss": 0.5319, + "step": 3505 + }, + { + "epoch": 2.771654711847638, + "grad_norm": 0.23190503062035855, + "learning_rate": 2.7547660577788123e-08, + "loss": 0.5307, + "step": 3506 + }, + { + "epoch": 2.7724462033143706, + "grad_norm": 0.23768799013678277, + "learning_rate": 2.7354212329131932e-08, + "loss": 0.541, + "step": 3507 + }, + { + "epoch": 2.7732376947811033, + "grad_norm": 0.23000439972524941, + "learning_rate": 2.716143628378631e-08, + "loss": 0.5617, + "step": 3508 + }, + { + "epoch": 2.774029186247836, + "grad_norm": 0.23595121150464654, + "learning_rate": 2.696933257497991e-08, + "loss": 0.5572, + "step": 3509 + }, + { + "epoch": 2.7748206777145685, + "grad_norm": 0.23560222839548514, + "learning_rate": 2.6777901335477303e-08, + "loss": 0.5574, + "step": 3510 + }, + { + "epoch": 2.775612169181301, + "grad_norm": 0.23403609430669742, + "learning_rate": 2.6587142697578114e-08, + "loss": 0.5513, + "step": 3511 + }, + { + "epoch": 2.776403660648034, + "grad_norm": 0.22425397228693494, + "learning_rate": 2.6397056793117212e-08, + "loss": 0.5346, + "step": 3512 + }, + { + "epoch": 2.7771951521147664, + "grad_norm": 0.2376508180433101, + "learning_rate": 2.6207643753464405e-08, + "loss": 0.5391, + "step": 3513 + }, + { + "epoch": 2.777986643581499, + "grad_norm": 0.23130205544977794, + "learning_rate": 2.6018903709524753e-08, + "loss": 0.5409, + "step": 3514 + }, + { + "epoch": 2.7787781350482312, + "grad_norm": 0.23409277352689428, + "learning_rate": 2.5830836791737808e-08, + "loss": 0.5289, + "step": 3515 + }, + { + "epoch": 2.779569626514964, + "grad_norm": 0.23395289069440428, + "learning_rate": 2.564344313007827e-08, + "loss": 0.5559, + "step": 3516 + }, + { + "epoch": 2.7803611179816965, + "grad_norm": 0.22712398663529615, + "learning_rate": 2.5456722854055203e-08, + "loss": 0.5468, + "step": 3517 + }, + { + "epoch": 2.781152609448429, + "grad_norm": 0.23634599671534784, + "learning_rate": 2.5270676092712717e-08, + "loss": 0.5349, + "step": 3518 + }, + { + "epoch": 2.7819441009151618, + "grad_norm": 0.2312066239681208, + "learning_rate": 2.5085302974629074e-08, + "loss": 0.5543, + "step": 3519 + }, + { + "epoch": 2.7827355923818944, + "grad_norm": 0.24353576828579954, + "learning_rate": 2.4900603627917238e-08, + "loss": 0.5347, + "step": 3520 + }, + { + "epoch": 2.783527083848627, + "grad_norm": 0.23467857738704168, + "learning_rate": 2.4716578180224434e-08, + "loss": 0.5488, + "step": 3521 + }, + { + "epoch": 2.7843185753153596, + "grad_norm": 0.23502749131964953, + "learning_rate": 2.453322675873182e-08, + "loss": 0.5477, + "step": 3522 + }, + { + "epoch": 2.7851100667820923, + "grad_norm": 0.23564631541599618, + "learning_rate": 2.435054949015547e-08, + "loss": 0.5348, + "step": 3523 + }, + { + "epoch": 2.785901558248825, + "grad_norm": 0.23526267646175178, + "learning_rate": 2.4168546500744845e-08, + "loss": 0.5409, + "step": 3524 + }, + { + "epoch": 2.7866930497155575, + "grad_norm": 0.23494317308400237, + "learning_rate": 2.3987217916283662e-08, + "loss": 0.5331, + "step": 3525 + }, + { + "epoch": 2.78748454118229, + "grad_norm": 0.2542467653741271, + "learning_rate": 2.3806563862089678e-08, + "loss": 0.5422, + "step": 3526 + }, + { + "epoch": 2.788276032649023, + "grad_norm": 0.23253837434076613, + "learning_rate": 2.3626584463014242e-08, + "loss": 0.5475, + "step": 3527 + }, + { + "epoch": 2.7890675241157554, + "grad_norm": 0.23967383421535285, + "learning_rate": 2.3447279843442637e-08, + "loss": 0.5588, + "step": 3528 + }, + { + "epoch": 2.789859015582488, + "grad_norm": 0.23653332543355077, + "learning_rate": 2.326865012729373e-08, + "loss": 0.5461, + "step": 3529 + }, + { + "epoch": 2.7906505070492207, + "grad_norm": 0.23499861737507716, + "learning_rate": 2.3090695438020004e-08, + "loss": 0.5351, + "step": 3530 + }, + { + "epoch": 2.7914419985159533, + "grad_norm": 0.2305686362074388, + "learning_rate": 2.2913415898607293e-08, + "loss": 0.5366, + "step": 3531 + }, + { + "epoch": 2.792233489982686, + "grad_norm": 0.2297406533202431, + "learning_rate": 2.2736811631574814e-08, + "loss": 0.544, + "step": 3532 + }, + { + "epoch": 2.7930249814494186, + "grad_norm": 0.23216831218152165, + "learning_rate": 2.2560882758975387e-08, + "loss": 0.5571, + "step": 3533 + }, + { + "epoch": 2.793816472916151, + "grad_norm": 0.2345578836500067, + "learning_rate": 2.2385629402394745e-08, + "loss": 0.5452, + "step": 3534 + }, + { + "epoch": 2.794607964382884, + "grad_norm": 0.23321595494458838, + "learning_rate": 2.221105168295201e-08, + "loss": 0.5452, + "step": 3535 + }, + { + "epoch": 2.7953994558496165, + "grad_norm": 0.23098645497815196, + "learning_rate": 2.2037149721299107e-08, + "loss": 0.5294, + "step": 3536 + }, + { + "epoch": 2.796190947316349, + "grad_norm": 0.2314132114243511, + "learning_rate": 2.1863923637621017e-08, + "loss": 0.547, + "step": 3537 + }, + { + "epoch": 2.7969824387830817, + "grad_norm": 0.22801882532163015, + "learning_rate": 2.169137355163586e-08, + "loss": 0.5371, + "step": 3538 + }, + { + "epoch": 2.7977739302498144, + "grad_norm": 0.233720369303306, + "learning_rate": 2.151949958259436e-08, + "loss": 0.5471, + "step": 3539 + }, + { + "epoch": 2.798565421716547, + "grad_norm": 0.23423377714684682, + "learning_rate": 2.1348301849279825e-08, + "loss": 0.5586, + "step": 3540 + }, + { + "epoch": 2.7993569131832796, + "grad_norm": 0.23850635298398823, + "learning_rate": 2.117778047000873e-08, + "loss": 0.546, + "step": 3541 + }, + { + "epoch": 2.8001484046500122, + "grad_norm": 0.2316721329812288, + "learning_rate": 2.1007935562629364e-08, + "loss": 0.5475, + "step": 3542 + }, + { + "epoch": 2.800939896116745, + "grad_norm": 0.23374530840302743, + "learning_rate": 2.083876724452316e-08, + "loss": 0.5538, + "step": 3543 + }, + { + "epoch": 2.8017313875834775, + "grad_norm": 0.23499009395978418, + "learning_rate": 2.0670275632603705e-08, + "loss": 0.5332, + "step": 3544 + }, + { + "epoch": 2.80252287905021, + "grad_norm": 0.2401152049814733, + "learning_rate": 2.0502460843316638e-08, + "loss": 0.5325, + "step": 3545 + }, + { + "epoch": 2.8033143705169428, + "grad_norm": 0.2283248348034765, + "learning_rate": 2.0335322992640515e-08, + "loss": 0.5422, + "step": 3546 + }, + { + "epoch": 2.8041058619836754, + "grad_norm": 0.23694216114420952, + "learning_rate": 2.0168862196085268e-08, + "loss": 0.5555, + "step": 3547 + }, + { + "epoch": 2.804897353450408, + "grad_norm": 0.2287060180999378, + "learning_rate": 2.0003078568693542e-08, + "loss": 0.5304, + "step": 3548 + }, + { + "epoch": 2.8056888449171407, + "grad_norm": 0.23742244512905672, + "learning_rate": 1.983797222503969e-08, + "loss": 0.5409, + "step": 3549 + }, + { + "epoch": 2.8064803363838733, + "grad_norm": 0.24173773252938405, + "learning_rate": 1.9673543279229766e-08, + "loss": 0.5438, + "step": 3550 + }, + { + "epoch": 2.807271827850606, + "grad_norm": 0.23258151407290656, + "learning_rate": 1.9509791844902425e-08, + "loss": 0.5503, + "step": 3551 + }, + { + "epoch": 2.8080633193173385, + "grad_norm": 0.23040099587075968, + "learning_rate": 1.9346718035227138e-08, + "loss": 0.5417, + "step": 3552 + }, + { + "epoch": 2.808854810784071, + "grad_norm": 0.23511468624766385, + "learning_rate": 1.918432196290576e-08, + "loss": 0.533, + "step": 3553 + }, + { + "epoch": 2.809646302250804, + "grad_norm": 0.23219468499841275, + "learning_rate": 1.9022603740171506e-08, + "loss": 0.5421, + "step": 3554 + }, + { + "epoch": 2.8104377937175364, + "grad_norm": 0.23301189976458736, + "learning_rate": 1.8861563478788977e-08, + "loss": 0.5305, + "step": 3555 + }, + { + "epoch": 2.811229285184269, + "grad_norm": 0.23340596839688357, + "learning_rate": 1.87012012900547e-08, + "loss": 0.5381, + "step": 3556 + }, + { + "epoch": 2.8120207766510017, + "grad_norm": 0.23783691954664868, + "learning_rate": 1.8541517284795916e-08, + "loss": 0.5458, + "step": 3557 + }, + { + "epoch": 2.8128122681177343, + "grad_norm": 0.2282605799176566, + "learning_rate": 1.8382511573371785e-08, + "loss": 0.556, + "step": 3558 + }, + { + "epoch": 2.813603759584467, + "grad_norm": 0.23753014772684272, + "learning_rate": 1.8224184265672295e-08, + "loss": 0.5404, + "step": 3559 + }, + { + "epoch": 2.8143952510511996, + "grad_norm": 0.23031513978317342, + "learning_rate": 1.8066535471118693e-08, + "loss": 0.5415, + "step": 3560 + }, + { + "epoch": 2.815186742517932, + "grad_norm": 0.23667671620117772, + "learning_rate": 1.79095652986635e-08, + "loss": 0.5353, + "step": 3561 + }, + { + "epoch": 2.815978233984665, + "grad_norm": 0.23722248155222286, + "learning_rate": 1.775327385678993e-08, + "loss": 0.5547, + "step": 3562 + }, + { + "epoch": 2.8167697254513975, + "grad_norm": 0.23049729316446924, + "learning_rate": 1.7597661253512363e-08, + "loss": 0.5403, + "step": 3563 + }, + { + "epoch": 2.81756121691813, + "grad_norm": 0.22592288027231294, + "learning_rate": 1.744272759637566e-08, + "loss": 0.5304, + "step": 3564 + }, + { + "epoch": 2.8183527083848627, + "grad_norm": 0.32681943286543735, + "learning_rate": 1.7288472992455948e-08, + "loss": 0.5392, + "step": 3565 + }, + { + "epoch": 2.8191441998515954, + "grad_norm": 0.23437810226232614, + "learning_rate": 1.713489754835984e-08, + "loss": 0.5307, + "step": 3566 + }, + { + "epoch": 2.819935691318328, + "grad_norm": 0.24144825637446976, + "learning_rate": 1.698200137022443e-08, + "loss": 0.5509, + "step": 3567 + }, + { + "epoch": 2.8207271827850606, + "grad_norm": 0.2313386721507604, + "learning_rate": 1.6829784563717532e-08, + "loss": 0.5278, + "step": 3568 + }, + { + "epoch": 2.8215186742517933, + "grad_norm": 0.23995629981802358, + "learning_rate": 1.6678247234037435e-08, + "loss": 0.5416, + "step": 3569 + }, + { + "epoch": 2.822310165718526, + "grad_norm": 0.23027681319850457, + "learning_rate": 1.6527389485912813e-08, + "loss": 0.5315, + "step": 3570 + }, + { + "epoch": 2.8231016571852585, + "grad_norm": 0.24034734751644593, + "learning_rate": 1.63772114236026e-08, + "loss": 0.5506, + "step": 3571 + }, + { + "epoch": 2.823893148651991, + "grad_norm": 0.23562823586904702, + "learning_rate": 1.6227713150896106e-08, + "loss": 0.5496, + "step": 3572 + }, + { + "epoch": 2.8246846401187238, + "grad_norm": 0.23697594867551047, + "learning_rate": 1.6078894771112796e-08, + "loss": 0.5433, + "step": 3573 + }, + { + "epoch": 2.8254761315854564, + "grad_norm": 0.23202930551790968, + "learning_rate": 1.5930756387102174e-08, + "loss": 0.5446, + "step": 3574 + }, + { + "epoch": 2.826267623052189, + "grad_norm": 0.23437474430100744, + "learning_rate": 1.57832981012439e-08, + "loss": 0.5516, + "step": 3575 + }, + { + "epoch": 2.8270591145189217, + "grad_norm": 0.22678494836197086, + "learning_rate": 1.5636520015447686e-08, + "loss": 0.5349, + "step": 3576 + }, + { + "epoch": 2.8278506059856543, + "grad_norm": 0.23084951474263085, + "learning_rate": 1.5490422231152933e-08, + "loss": 0.5527, + "step": 3577 + }, + { + "epoch": 2.828642097452387, + "grad_norm": 0.2415577245295724, + "learning_rate": 1.534500484932899e-08, + "loss": 0.5289, + "step": 3578 + }, + { + "epoch": 2.8294335889191196, + "grad_norm": 0.2322913620283281, + "learning_rate": 1.5200267970475022e-08, + "loss": 0.5262, + "step": 3579 + }, + { + "epoch": 2.830225080385852, + "grad_norm": 0.22879348301674715, + "learning_rate": 1.505621169461979e-08, + "loss": 0.5439, + "step": 3580 + }, + { + "epoch": 2.831016571852585, + "grad_norm": 0.23081602018863082, + "learning_rate": 1.4912836121321882e-08, + "loss": 0.5217, + "step": 3581 + }, + { + "epoch": 2.8318080633193174, + "grad_norm": 0.23953489128778746, + "learning_rate": 1.4770141349669141e-08, + "loss": 0.5644, + "step": 3582 + }, + { + "epoch": 2.83259955478605, + "grad_norm": 0.23594273303660432, + "learning_rate": 1.462812747827924e-08, + "loss": 0.5422, + "step": 3583 + }, + { + "epoch": 2.8333910462527827, + "grad_norm": 0.23107491008936532, + "learning_rate": 1.4486794605298891e-08, + "loss": 0.5421, + "step": 3584 + }, + { + "epoch": 2.8341825377195153, + "grad_norm": 0.23846821821463982, + "learning_rate": 1.4346142828404407e-08, + "loss": 0.5416, + "step": 3585 + }, + { + "epoch": 2.834974029186248, + "grad_norm": 0.22986602265930947, + "learning_rate": 1.4206172244801585e-08, + "loss": 0.5347, + "step": 3586 + }, + { + "epoch": 2.8357655206529806, + "grad_norm": 0.2296823559896836, + "learning_rate": 1.4066882951225046e-08, + "loss": 0.5466, + "step": 3587 + }, + { + "epoch": 2.8365570121197132, + "grad_norm": 0.2301570053150726, + "learning_rate": 1.3928275043938786e-08, + "loss": 0.5564, + "step": 3588 + }, + { + "epoch": 2.837348503586446, + "grad_norm": 0.2296514803800265, + "learning_rate": 1.3790348618735848e-08, + "loss": 0.5325, + "step": 3589 + }, + { + "epoch": 2.8381399950531785, + "grad_norm": 0.22901689070204145, + "learning_rate": 1.3653103770938091e-08, + "loss": 0.5418, + "step": 3590 + }, + { + "epoch": 2.838931486519911, + "grad_norm": 0.23699501479345256, + "learning_rate": 1.3516540595396864e-08, + "loss": 0.5492, + "step": 3591 + }, + { + "epoch": 2.8397229779866437, + "grad_norm": 0.23808576468403075, + "learning_rate": 1.3380659186491894e-08, + "loss": 0.5423, + "step": 3592 + }, + { + "epoch": 2.8405144694533764, + "grad_norm": 0.23491662688542603, + "learning_rate": 1.324545963813195e-08, + "loss": 0.5513, + "step": 3593 + }, + { + "epoch": 2.841305960920109, + "grad_norm": 0.22997445500928412, + "learning_rate": 1.3110942043754513e-08, + "loss": 0.5275, + "step": 3594 + }, + { + "epoch": 2.8420974523868416, + "grad_norm": 0.23387300453408702, + "learning_rate": 1.2977106496325662e-08, + "loss": 0.5463, + "step": 3595 + }, + { + "epoch": 2.8428889438535743, + "grad_norm": 0.2336068330709784, + "learning_rate": 1.2843953088340409e-08, + "loss": 0.5452, + "step": 3596 + }, + { + "epoch": 2.843680435320307, + "grad_norm": 0.23790914023782178, + "learning_rate": 1.271148191182203e-08, + "loss": 0.5532, + "step": 3597 + }, + { + "epoch": 2.8444719267870395, + "grad_norm": 0.2393653332050634, + "learning_rate": 1.2579693058322515e-08, + "loss": 0.5384, + "step": 3598 + }, + { + "epoch": 2.845263418253772, + "grad_norm": 0.2348288395855586, + "learning_rate": 1.2448586618922119e-08, + "loss": 0.5328, + "step": 3599 + }, + { + "epoch": 2.846054909720505, + "grad_norm": 0.27919785487916793, + "learning_rate": 1.2318162684229471e-08, + "loss": 0.5369, + "step": 3600 + }, + { + "epoch": 2.8468464011872374, + "grad_norm": 0.2278029862920757, + "learning_rate": 1.2188421344381805e-08, + "loss": 0.5624, + "step": 3601 + }, + { + "epoch": 2.84763789265397, + "grad_norm": 0.23543883959177841, + "learning_rate": 1.2059362689044172e-08, + "loss": 0.5435, + "step": 3602 + }, + { + "epoch": 2.8484293841207027, + "grad_norm": 0.2326574508053554, + "learning_rate": 1.1930986807410337e-08, + "loss": 0.5397, + "step": 3603 + }, + { + "epoch": 2.8492208755874353, + "grad_norm": 0.2345907080681373, + "learning_rate": 1.1803293788201662e-08, + "loss": 0.5377, + "step": 3604 + }, + { + "epoch": 2.850012367054168, + "grad_norm": 0.23047322101162068, + "learning_rate": 1.1676283719667778e-08, + "loss": 0.5431, + "step": 3605 + }, + { + "epoch": 2.8508038585209006, + "grad_norm": 0.2337950098843311, + "learning_rate": 1.1549956689586582e-08, + "loss": 0.531, + "step": 3606 + }, + { + "epoch": 2.851595349987633, + "grad_norm": 0.23696401034968725, + "learning_rate": 1.1424312785263568e-08, + "loss": 0.541, + "step": 3607 + }, + { + "epoch": 2.852386841454366, + "grad_norm": 0.23381401620065134, + "learning_rate": 1.1299352093532166e-08, + "loss": 0.5456, + "step": 3608 + }, + { + "epoch": 2.8531783329210985, + "grad_norm": 0.2328559920772804, + "learning_rate": 1.1175074700753962e-08, + "loss": 0.5426, + "step": 3609 + }, + { + "epoch": 2.853969824387831, + "grad_norm": 0.24369332847811104, + "learning_rate": 1.1051480692817804e-08, + "loss": 0.54, + "step": 3610 + }, + { + "epoch": 2.8547613158545637, + "grad_norm": 0.23404016134285346, + "learning_rate": 1.0928570155140593e-08, + "loss": 0.5428, + "step": 3611 + }, + { + "epoch": 2.855552807321296, + "grad_norm": 0.23350951551608884, + "learning_rate": 1.0806343172666931e-08, + "loss": 0.5437, + "step": 3612 + }, + { + "epoch": 2.8563442987880285, + "grad_norm": 0.23588925655037235, + "learning_rate": 1.0684799829868585e-08, + "loss": 0.543, + "step": 3613 + }, + { + "epoch": 2.857135790254761, + "grad_norm": 0.2349753036718382, + "learning_rate": 1.056394021074547e-08, + "loss": 0.542, + "step": 3614 + }, + { + "epoch": 2.857927281721494, + "grad_norm": 0.23506426952632153, + "learning_rate": 1.0443764398824328e-08, + "loss": 0.5463, + "step": 3615 + }, + { + "epoch": 2.8587187731882264, + "grad_norm": 0.2344535064572982, + "learning_rate": 1.0324272477159834e-08, + "loss": 0.5452, + "step": 3616 + }, + { + "epoch": 2.859510264654959, + "grad_norm": 0.22766830797611373, + "learning_rate": 1.0205464528333596e-08, + "loss": 0.5568, + "step": 3617 + }, + { + "epoch": 2.8603017561216917, + "grad_norm": 0.2327774942747983, + "learning_rate": 1.0087340634454934e-08, + "loss": 0.5516, + "step": 3618 + }, + { + "epoch": 2.8610932475884243, + "grad_norm": 0.23033824478521248, + "learning_rate": 9.96990087716032e-09, + "loss": 0.5462, + "step": 3619 + }, + { + "epoch": 2.861884739055157, + "grad_norm": 0.23527791127801123, + "learning_rate": 9.853145337613056e-09, + "loss": 0.5577, + "step": 3620 + }, + { + "epoch": 2.8626762305218896, + "grad_norm": 0.23043840008569028, + "learning_rate": 9.737074096503928e-09, + "loss": 0.5471, + "step": 3621 + }, + { + "epoch": 2.863467721988622, + "grad_norm": 0.23488737054786746, + "learning_rate": 9.621687234050658e-09, + "loss": 0.5405, + "step": 3622 + }, + { + "epoch": 2.864259213455355, + "grad_norm": 0.2324017817548189, + "learning_rate": 9.50698482999812e-09, + "loss": 0.552, + "step": 3623 + }, + { + "epoch": 2.8650507049220875, + "grad_norm": 0.23832437701855722, + "learning_rate": 9.392966963618131e-09, + "loss": 0.5381, + "step": 3624 + }, + { + "epoch": 2.86584219638882, + "grad_norm": 0.2356875120153845, + "learning_rate": 9.279633713709212e-09, + "loss": 0.5425, + "step": 3625 + }, + { + "epoch": 2.8666336878555527, + "grad_norm": 0.23554714723924744, + "learning_rate": 9.166985158596818e-09, + "loss": 0.5449, + "step": 3626 + }, + { + "epoch": 2.8674251793222854, + "grad_norm": 0.22964814443532963, + "learning_rate": 9.055021376133453e-09, + "loss": 0.5374, + "step": 3627 + }, + { + "epoch": 2.868216670789018, + "grad_norm": 0.2360445105076418, + "learning_rate": 8.943742443697999e-09, + "loss": 0.5478, + "step": 3628 + }, + { + "epoch": 2.8690081622557506, + "grad_norm": 0.23249939484838325, + "learning_rate": 8.833148438196492e-09, + "loss": 0.541, + "step": 3629 + }, + { + "epoch": 2.8697996537224832, + "grad_norm": 0.23685114299265833, + "learning_rate": 8.723239436061126e-09, + "loss": 0.5451, + "step": 3630 + }, + { + "epoch": 2.870591145189216, + "grad_norm": 0.23567758136093175, + "learning_rate": 8.614015513250916e-09, + "loss": 0.5385, + "step": 3631 + }, + { + "epoch": 2.8713826366559485, + "grad_norm": 0.2314050478292498, + "learning_rate": 8.505476745251483e-09, + "loss": 0.5362, + "step": 3632 + }, + { + "epoch": 2.872174128122681, + "grad_norm": 0.23804147753747956, + "learning_rate": 8.397623207074711e-09, + "loss": 0.5508, + "step": 3633 + }, + { + "epoch": 2.8729656195894138, + "grad_norm": 0.23344629198932168, + "learning_rate": 8.290454973259087e-09, + "loss": 0.5586, + "step": 3634 + }, + { + "epoch": 2.8737571110561464, + "grad_norm": 0.24088819059057526, + "learning_rate": 8.183972117869475e-09, + "loss": 0.5441, + "step": 3635 + }, + { + "epoch": 2.874548602522879, + "grad_norm": 0.23362904142228713, + "learning_rate": 8.078174714497011e-09, + "loss": 0.5552, + "step": 3636 + }, + { + "epoch": 2.8753400939896117, + "grad_norm": 0.23195150006361642, + "learning_rate": 7.973062836259204e-09, + "loss": 0.5503, + "step": 3637 + }, + { + "epoch": 2.8761315854563443, + "grad_norm": 0.23281250790763872, + "learning_rate": 7.868636555799613e-09, + "loss": 0.5333, + "step": 3638 + }, + { + "epoch": 2.876923076923077, + "grad_norm": 0.23484417361113952, + "learning_rate": 7.764895945288064e-09, + "loss": 0.5471, + "step": 3639 + }, + { + "epoch": 2.8777145683898095, + "grad_norm": 0.2403679884476522, + "learning_rate": 7.66184107642065e-09, + "loss": 0.5277, + "step": 3640 + }, + { + "epoch": 2.878506059856542, + "grad_norm": 0.23633848873242225, + "learning_rate": 7.559472020419177e-09, + "loss": 0.5258, + "step": 3641 + }, + { + "epoch": 2.879297551323275, + "grad_norm": 0.23270832961159474, + "learning_rate": 7.457788848032054e-09, + "loss": 0.5356, + "step": 3642 + }, + { + "epoch": 2.8800890427900074, + "grad_norm": 0.235574059142225, + "learning_rate": 7.356791629533177e-09, + "loss": 0.5427, + "step": 3643 + }, + { + "epoch": 2.88088053425674, + "grad_norm": 0.2330963450443202, + "learning_rate": 7.2564804347223785e-09, + "loss": 0.5468, + "step": 3644 + }, + { + "epoch": 2.8816720257234727, + "grad_norm": 0.22829126129192814, + "learning_rate": 7.156855332925871e-09, + "loss": 0.5267, + "step": 3645 + }, + { + "epoch": 2.8824635171902053, + "grad_norm": 0.23199124614006095, + "learning_rate": 7.057916392995023e-09, + "loss": 0.5509, + "step": 3646 + }, + { + "epoch": 2.883255008656938, + "grad_norm": 0.23393639588493617, + "learning_rate": 6.9596636833075815e-09, + "loss": 0.5386, + "step": 3647 + }, + { + "epoch": 2.8840465001236706, + "grad_norm": 0.22419508574534858, + "learning_rate": 6.862097271766565e-09, + "loss": 0.5226, + "step": 3648 + }, + { + "epoch": 2.884837991590403, + "grad_norm": 0.2345720986651518, + "learning_rate": 6.765217225801145e-09, + "loss": 0.5378, + "step": 3649 + }, + { + "epoch": 2.885629483057136, + "grad_norm": 0.23287378541532291, + "learning_rate": 6.669023612365654e-09, + "loss": 0.5386, + "step": 3650 + }, + { + "epoch": 2.8864209745238685, + "grad_norm": 0.22496024331992462, + "learning_rate": 6.573516497940468e-09, + "loss": 0.5294, + "step": 3651 + }, + { + "epoch": 2.887212465990601, + "grad_norm": 0.23452122179342824, + "learning_rate": 6.478695948531232e-09, + "loss": 0.5335, + "step": 3652 + }, + { + "epoch": 2.8880039574573337, + "grad_norm": 0.22952083605403764, + "learning_rate": 6.384562029669193e-09, + "loss": 0.5346, + "step": 3653 + }, + { + "epoch": 2.8887954489240664, + "grad_norm": 0.2417314657178486, + "learning_rate": 6.29111480641098e-09, + "loss": 0.5588, + "step": 3654 + }, + { + "epoch": 2.889586940390799, + "grad_norm": 0.22907102186316622, + "learning_rate": 6.1983543433389295e-09, + "loss": 0.5357, + "step": 3655 + }, + { + "epoch": 2.8903784318575316, + "grad_norm": 0.2383233633695022, + "learning_rate": 6.106280704560318e-09, + "loss": 0.5432, + "step": 3656 + }, + { + "epoch": 2.8911699233242643, + "grad_norm": 0.23278374702529836, + "learning_rate": 6.014893953708134e-09, + "loss": 0.5557, + "step": 3657 + }, + { + "epoch": 2.891961414790997, + "grad_norm": 0.2362970080721708, + "learning_rate": 5.924194153940409e-09, + "loss": 0.5347, + "step": 3658 + }, + { + "epoch": 2.8927529062577295, + "grad_norm": 0.24076146219857836, + "learning_rate": 5.834181367940449e-09, + "loss": 0.5596, + "step": 3659 + }, + { + "epoch": 2.893544397724462, + "grad_norm": 0.2357068807951035, + "learning_rate": 5.744855657916936e-09, + "loss": 0.5504, + "step": 3660 + }, + { + "epoch": 2.8943358891911948, + "grad_norm": 0.2318661379144214, + "learning_rate": 5.656217085603598e-09, + "loss": 0.526, + "step": 3661 + }, + { + "epoch": 2.8951273806579274, + "grad_norm": 0.23135737972044004, + "learning_rate": 5.568265712259212e-09, + "loss": 0.5474, + "step": 3662 + }, + { + "epoch": 2.89591887212466, + "grad_norm": 0.2312055681180558, + "learning_rate": 5.481001598667601e-09, + "loss": 0.5405, + "step": 3663 + }, + { + "epoch": 2.8967103635913927, + "grad_norm": 0.22954336397695554, + "learning_rate": 5.394424805137743e-09, + "loss": 0.5408, + "step": 3664 + }, + { + "epoch": 2.8975018550581253, + "grad_norm": 0.2350687889999863, + "learning_rate": 5.3085353915035545e-09, + "loss": 0.5464, + "step": 3665 + }, + { + "epoch": 2.898293346524858, + "grad_norm": 0.22990848568273484, + "learning_rate": 5.223333417123999e-09, + "loss": 0.5457, + "step": 3666 + }, + { + "epoch": 2.8990848379915906, + "grad_norm": 0.2343502009069639, + "learning_rate": 5.13881894088275e-09, + "loss": 0.5481, + "step": 3667 + }, + { + "epoch": 2.899876329458323, + "grad_norm": 0.23345598474950643, + "learning_rate": 5.054992021188531e-09, + "loss": 0.5409, + "step": 3668 + }, + { + "epoch": 2.900667820925056, + "grad_norm": 0.23391372354504558, + "learning_rate": 4.971852715974556e-09, + "loss": 0.5615, + "step": 3669 + }, + { + "epoch": 2.901459312391788, + "grad_norm": 0.2333856026236808, + "learning_rate": 4.889401082699307e-09, + "loss": 0.5362, + "step": 3670 + }, + { + "epoch": 2.9022508038585206, + "grad_norm": 0.23740898652369022, + "learning_rate": 4.807637178345758e-09, + "loss": 0.5394, + "step": 3671 + }, + { + "epoch": 2.9030422953252533, + "grad_norm": 0.2351692934039835, + "learning_rate": 4.726561059421485e-09, + "loss": 0.5341, + "step": 3672 + }, + { + "epoch": 2.903833786791986, + "grad_norm": 0.23123807687856532, + "learning_rate": 4.64617278195889e-09, + "loss": 0.5528, + "step": 3673 + }, + { + "epoch": 2.9046252782587185, + "grad_norm": 0.23253440822820867, + "learning_rate": 4.566472401514976e-09, + "loss": 0.5415, + "step": 3674 + }, + { + "epoch": 2.905416769725451, + "grad_norm": 0.22763153955250545, + "learning_rate": 4.487459973171237e-09, + "loss": 0.5333, + "step": 3675 + }, + { + "epoch": 2.906208261192184, + "grad_norm": 0.23320624013952176, + "learning_rate": 4.409135551533993e-09, + "loss": 0.5524, + "step": 3676 + }, + { + "epoch": 2.9069997526589164, + "grad_norm": 0.23190345421383513, + "learning_rate": 4.331499190733723e-09, + "loss": 0.5374, + "step": 3677 + }, + { + "epoch": 2.907791244125649, + "grad_norm": 0.2383379936333827, + "learning_rate": 4.254550944425506e-09, + "loss": 0.55, + "step": 3678 + }, + { + "epoch": 2.9085827355923817, + "grad_norm": 0.22785390609910963, + "learning_rate": 4.178290865788914e-09, + "loss": 0.5402, + "step": 3679 + }, + { + "epoch": 2.9093742270591143, + "grad_norm": 0.2352625918619613, + "learning_rate": 4.1027190075280105e-09, + "loss": 0.5354, + "step": 3680 + }, + { + "epoch": 2.910165718525847, + "grad_norm": 0.23786703181770358, + "learning_rate": 4.027835421871018e-09, + "loss": 0.5428, + "step": 3681 + }, + { + "epoch": 2.9109572099925796, + "grad_norm": 0.23571169879494544, + "learning_rate": 3.953640160570648e-09, + "loss": 0.5354, + "step": 3682 + }, + { + "epoch": 2.911748701459312, + "grad_norm": 0.23769944504185458, + "learning_rate": 3.880133274903774e-09, + "loss": 0.5445, + "step": 3683 + }, + { + "epoch": 2.912540192926045, + "grad_norm": 0.23227835874811428, + "learning_rate": 3.8073148156715364e-09, + "loss": 0.5461, + "step": 3684 + }, + { + "epoch": 2.9133316843927775, + "grad_norm": 0.2355513808693547, + "learning_rate": 3.7351848331995674e-09, + "loss": 0.547, + "step": 3685 + }, + { + "epoch": 2.91412317585951, + "grad_norm": 0.23317426937749436, + "learning_rate": 3.663743377337325e-09, + "loss": 0.5467, + "step": 3686 + }, + { + "epoch": 2.9149146673262427, + "grad_norm": 0.23798269387358634, + "learning_rate": 3.592990497458537e-09, + "loss": 0.5407, + "step": 3687 + }, + { + "epoch": 2.9157061587929753, + "grad_norm": 0.2382784295768859, + "learning_rate": 3.5229262424611996e-09, + "loss": 0.5576, + "step": 3688 + }, + { + "epoch": 2.916497650259708, + "grad_norm": 0.23215209734361164, + "learning_rate": 3.453550660767135e-09, + "loss": 0.5409, + "step": 3689 + }, + { + "epoch": 2.9172891417264406, + "grad_norm": 0.23637942810673981, + "learning_rate": 3.384863800322546e-09, + "loss": 0.5255, + "step": 3690 + }, + { + "epoch": 2.9180806331931732, + "grad_norm": 0.23261767139674736, + "learning_rate": 3.3168657085974603e-09, + "loss": 0.5293, + "step": 3691 + }, + { + "epoch": 2.918872124659906, + "grad_norm": 0.23205218319756035, + "learning_rate": 3.2495564325857316e-09, + "loss": 0.5178, + "step": 3692 + }, + { + "epoch": 2.9196636161266385, + "grad_norm": 0.23363088780947577, + "learning_rate": 3.182936018805482e-09, + "loss": 0.5384, + "step": 3693 + }, + { + "epoch": 2.920455107593371, + "grad_norm": 0.23404640760239762, + "learning_rate": 3.117004513298438e-09, + "loss": 0.5354, + "step": 3694 + }, + { + "epoch": 2.9212465990601038, + "grad_norm": 0.22895226717897585, + "learning_rate": 3.0517619616307054e-09, + "loss": 0.5225, + "step": 3695 + }, + { + "epoch": 2.9220380905268364, + "grad_norm": 0.2360703490332348, + "learning_rate": 2.9872084088916617e-09, + "loss": 0.5417, + "step": 3696 + }, + { + "epoch": 2.922829581993569, + "grad_norm": 0.2333998971287613, + "learning_rate": 2.923343899694841e-09, + "loss": 0.5461, + "step": 3697 + }, + { + "epoch": 2.9236210734603016, + "grad_norm": 0.2378209196119549, + "learning_rate": 2.8601684781777156e-09, + "loss": 0.5459, + "step": 3698 + }, + { + "epoch": 2.9244125649270343, + "grad_norm": 0.23861138065538962, + "learning_rate": 2.797682188001027e-09, + "loss": 0.5447, + "step": 3699 + }, + { + "epoch": 2.925204056393767, + "grad_norm": 0.23404748186940524, + "learning_rate": 2.735885072349675e-09, + "loss": 0.5349, + "step": 3700 + }, + { + "epoch": 2.9259955478604995, + "grad_norm": 0.2364313283670403, + "learning_rate": 2.6747771739322745e-09, + "loss": 0.5438, + "step": 3701 + }, + { + "epoch": 2.926787039327232, + "grad_norm": 0.25078310404459, + "learning_rate": 2.6143585349808204e-09, + "loss": 0.5492, + "step": 3702 + }, + { + "epoch": 2.927578530793965, + "grad_norm": 0.2335519225837004, + "learning_rate": 2.5546291972511345e-09, + "loss": 0.5228, + "step": 3703 + }, + { + "epoch": 2.9283700222606974, + "grad_norm": 0.22623403321301705, + "learning_rate": 2.495589202022752e-09, + "loss": 0.5385, + "step": 3704 + }, + { + "epoch": 2.92916151372743, + "grad_norm": 0.2377080853338766, + "learning_rate": 2.4372385900984782e-09, + "loss": 0.5338, + "step": 3705 + }, + { + "epoch": 2.9299530051941627, + "grad_norm": 0.23171608354222004, + "learning_rate": 2.3795774018051663e-09, + "loss": 0.5405, + "step": 3706 + }, + { + "epoch": 2.9307444966608953, + "grad_norm": 0.23272173466659182, + "learning_rate": 2.322605676992717e-09, + "loss": 0.5422, + "step": 3707 + }, + { + "epoch": 2.931535988127628, + "grad_norm": 0.2304871167327762, + "learning_rate": 2.2663234550347464e-09, + "loss": 0.5464, + "step": 3708 + }, + { + "epoch": 2.9323274795943606, + "grad_norm": 0.24568605879888988, + "learning_rate": 2.210730774828473e-09, + "loss": 0.5369, + "step": 3709 + }, + { + "epoch": 2.933118971061093, + "grad_norm": 0.2297037081702141, + "learning_rate": 2.155827674794386e-09, + "loss": 0.5379, + "step": 3710 + }, + { + "epoch": 2.933910462527826, + "grad_norm": 0.2402732800167233, + "learning_rate": 2.1016141928765772e-09, + "loss": 0.5373, + "step": 3711 + }, + { + "epoch": 2.9347019539945585, + "grad_norm": 0.22853359580876342, + "learning_rate": 2.048090366542077e-09, + "loss": 0.5483, + "step": 3712 + }, + { + "epoch": 2.935493445461291, + "grad_norm": 0.23225064386070493, + "learning_rate": 1.995256232781961e-09, + "loss": 0.5471, + "step": 3713 + }, + { + "epoch": 2.9362849369280237, + "grad_norm": 0.23148387471125012, + "learning_rate": 1.9431118281102443e-09, + "loss": 0.5357, + "step": 3714 + }, + { + "epoch": 2.9370764283947564, + "grad_norm": 0.23470038523800482, + "learning_rate": 1.891657188564322e-09, + "loss": 0.5525, + "step": 3715 + }, + { + "epoch": 2.937867919861489, + "grad_norm": 0.23264448419154526, + "learning_rate": 1.840892349704859e-09, + "loss": 0.5541, + "step": 3716 + }, + { + "epoch": 2.9386594113282216, + "grad_norm": 0.23209477039544965, + "learning_rate": 1.7908173466159026e-09, + "loss": 0.5307, + "step": 3717 + }, + { + "epoch": 2.9394509027949542, + "grad_norm": 0.22584066617976098, + "learning_rate": 1.741432213904659e-09, + "loss": 0.5395, + "step": 3718 + }, + { + "epoch": 2.940242394261687, + "grad_norm": 0.23676557988764327, + "learning_rate": 1.6927369857017149e-09, + "loss": 0.5453, + "step": 3719 + }, + { + "epoch": 2.9410338857284195, + "grad_norm": 0.23213111535032288, + "learning_rate": 1.644731695660484e-09, + "loss": 0.5364, + "step": 3720 + }, + { + "epoch": 2.941825377195152, + "grad_norm": 0.22968614395011067, + "learning_rate": 1.597416376957983e-09, + "loss": 0.5361, + "step": 3721 + }, + { + "epoch": 2.9426168686618848, + "grad_norm": 0.2382832272179857, + "learning_rate": 1.5507910622941655e-09, + "loss": 0.5554, + "step": 3722 + }, + { + "epoch": 2.9434083601286174, + "grad_norm": 0.23169949406280965, + "learning_rate": 1.5048557838921449e-09, + "loss": 0.533, + "step": 3723 + }, + { + "epoch": 2.94419985159535, + "grad_norm": 0.23665168804830689, + "learning_rate": 1.4596105734983043e-09, + "loss": 0.5272, + "step": 3724 + }, + { + "epoch": 2.9449913430620827, + "grad_norm": 0.23398965155126505, + "learning_rate": 1.4150554623817424e-09, + "loss": 0.5429, + "step": 3725 + }, + { + "epoch": 2.9457828345288153, + "grad_norm": 0.2399756658001373, + "learning_rate": 1.3711904813349384e-09, + "loss": 0.5411, + "step": 3726 + }, + { + "epoch": 2.946574325995548, + "grad_norm": 0.2409703242003032, + "learning_rate": 1.3280156606734206e-09, + "loss": 0.5551, + "step": 3727 + }, + { + "epoch": 2.9473658174622805, + "grad_norm": 0.23291663729754578, + "learning_rate": 1.2855310302355426e-09, + "loss": 0.5469, + "step": 3728 + }, + { + "epoch": 2.948157308929013, + "grad_norm": 0.23607926421418604, + "learning_rate": 1.2437366193828181e-09, + "loss": 0.545, + "step": 3729 + }, + { + "epoch": 2.948948800395746, + "grad_norm": 0.23271538265326955, + "learning_rate": 1.202632456999808e-09, + "loss": 0.5339, + "step": 3730 + }, + { + "epoch": 2.9497402918624784, + "grad_norm": 0.23455239752017618, + "learning_rate": 1.1622185714936783e-09, + "loss": 0.5434, + "step": 3731 + }, + { + "epoch": 2.950531783329211, + "grad_norm": 0.23122451040910008, + "learning_rate": 1.1224949907949754e-09, + "loss": 0.5476, + "step": 3732 + }, + { + "epoch": 2.9513232747959437, + "grad_norm": 0.2314625171421439, + "learning_rate": 1.0834617423569614e-09, + "loss": 0.5538, + "step": 3733 + }, + { + "epoch": 2.9521147662626763, + "grad_norm": 0.23516302668753056, + "learning_rate": 1.0451188531558353e-09, + "loss": 0.5371, + "step": 3734 + }, + { + "epoch": 2.952906257729409, + "grad_norm": 0.2344195709587897, + "learning_rate": 1.0074663496906222e-09, + "loss": 0.5282, + "step": 3735 + }, + { + "epoch": 2.9536977491961416, + "grad_norm": 0.22724554654759332, + "learning_rate": 9.705042579832845e-10, + "loss": 0.5472, + "step": 3736 + }, + { + "epoch": 2.954489240662874, + "grad_norm": 0.2299260794859835, + "learning_rate": 9.342326035787218e-10, + "loss": 0.5331, + "step": 3737 + }, + { + "epoch": 2.955280732129607, + "grad_norm": 0.23031166598159403, + "learning_rate": 8.986514115443266e-10, + "loss": 0.5343, + "step": 3738 + }, + { + "epoch": 2.9560722235963395, + "grad_norm": 0.23191019069221036, + "learning_rate": 8.63760706470873e-10, + "loss": 0.547, + "step": 3739 + }, + { + "epoch": 2.956863715063072, + "grad_norm": 0.2312162938804386, + "learning_rate": 8.295605124712946e-10, + "loss": 0.5384, + "step": 3740 + }, + { + "epoch": 2.9576552065298047, + "grad_norm": 0.22724776984982867, + "learning_rate": 7.960508531817955e-10, + "loss": 0.5631, + "step": 3741 + }, + { + "epoch": 2.9584466979965374, + "grad_norm": 0.2303256186878083, + "learning_rate": 7.632317517610731e-10, + "loss": 0.5357, + "step": 3742 + }, + { + "epoch": 2.95923818946327, + "grad_norm": 0.23279163047722276, + "learning_rate": 7.311032308908727e-10, + "loss": 0.5399, + "step": 3743 + }, + { + "epoch": 2.9600296809300026, + "grad_norm": 0.23305250004275926, + "learning_rate": 6.996653127752106e-10, + "loss": 0.5415, + "step": 3744 + }, + { + "epoch": 2.9608211723967353, + "grad_norm": 0.23375977755189584, + "learning_rate": 6.689180191412624e-10, + "loss": 0.5478, + "step": 3745 + }, + { + "epoch": 2.961612663863468, + "grad_norm": 0.24026216062993833, + "learning_rate": 6.388613712385859e-10, + "loss": 0.5385, + "step": 3746 + }, + { + "epoch": 2.9624041553302005, + "grad_norm": 0.23268894387393466, + "learning_rate": 6.09495389839787e-10, + "loss": 0.5351, + "step": 3747 + }, + { + "epoch": 2.963195646796933, + "grad_norm": 0.2289542926917948, + "learning_rate": 5.808200952396314e-10, + "loss": 0.5335, + "step": 3748 + }, + { + "epoch": 2.9639871382636658, + "grad_norm": 0.23523839503162927, + "learning_rate": 5.528355072561552e-10, + "loss": 0.5464, + "step": 3749 + }, + { + "epoch": 2.9647786297303984, + "grad_norm": 0.22840618491477813, + "learning_rate": 5.255416452295547e-10, + "loss": 0.5502, + "step": 3750 + }, + { + "epoch": 2.965570121197131, + "grad_norm": 0.23445275168731747, + "learning_rate": 4.98938528022852e-10, + "loss": 0.5404, + "step": 3751 + }, + { + "epoch": 2.9663616126638637, + "grad_norm": 0.23208368631000384, + "learning_rate": 4.730261740217845e-10, + "loss": 0.5462, + "step": 3752 + }, + { + "epoch": 2.9671531041305963, + "grad_norm": 0.2374890121648241, + "learning_rate": 4.4780460113436056e-10, + "loss": 0.5404, + "step": 3753 + }, + { + "epoch": 2.967944595597329, + "grad_norm": 0.2386164219573942, + "learning_rate": 4.232738267916369e-10, + "loss": 0.536, + "step": 3754 + }, + { + "epoch": 2.9687360870640616, + "grad_norm": 0.24677232879508854, + "learning_rate": 3.9943386794694114e-10, + "loss": 0.5504, + "step": 3755 + }, + { + "epoch": 2.969527578530794, + "grad_norm": 0.2323060441295835, + "learning_rate": 3.762847410762049e-10, + "loss": 0.5517, + "step": 3756 + }, + { + "epoch": 2.970319069997527, + "grad_norm": 0.23257010792329827, + "learning_rate": 3.5382646217807513e-10, + "loss": 0.5475, + "step": 3757 + }, + { + "epoch": 2.9711105614642594, + "grad_norm": 0.2397127420431173, + "learning_rate": 3.3205904677358067e-10, + "loss": 0.5544, + "step": 3758 + }, + { + "epoch": 2.971902052930992, + "grad_norm": 0.2300831983679334, + "learning_rate": 3.109825099064656e-10, + "loss": 0.5289, + "step": 3759 + }, + { + "epoch": 2.9726935443977247, + "grad_norm": 0.2366547519309783, + "learning_rate": 2.905968661427449e-10, + "loss": 0.5452, + "step": 3760 + }, + { + "epoch": 2.9734850358644573, + "grad_norm": 0.23545772111242555, + "learning_rate": 2.709021295711489e-10, + "loss": 0.543, + "step": 3761 + }, + { + "epoch": 2.97427652733119, + "grad_norm": 0.23449394354914055, + "learning_rate": 2.518983138029007e-10, + "loss": 0.5454, + "step": 3762 + }, + { + "epoch": 2.9750680187979226, + "grad_norm": 0.23740749952846354, + "learning_rate": 2.3358543197171675e-10, + "loss": 0.5446, + "step": 3763 + }, + { + "epoch": 2.9758595102646552, + "grad_norm": 0.22915338657332054, + "learning_rate": 2.159634967336954e-10, + "loss": 0.5366, + "step": 3764 + }, + { + "epoch": 2.976651001731388, + "grad_norm": 0.23119515539409277, + "learning_rate": 1.9903252026753913e-10, + "loss": 0.5416, + "step": 3765 + }, + { + "epoch": 2.97744249319812, + "grad_norm": 0.23040765132410965, + "learning_rate": 1.827925142743325e-10, + "loss": 0.5487, + "step": 3766 + }, + { + "epoch": 2.9782339846648527, + "grad_norm": 0.23374006345544013, + "learning_rate": 1.6724348997776416e-10, + "loss": 0.5476, + "step": 3767 + }, + { + "epoch": 2.9790254761315853, + "grad_norm": 0.23645004763469699, + "learning_rate": 1.5238545812390478e-10, + "loss": 0.5391, + "step": 3768 + }, + { + "epoch": 2.979816967598318, + "grad_norm": 0.23381810614972207, + "learning_rate": 1.3821842898109614e-10, + "loss": 0.5394, + "step": 3769 + }, + { + "epoch": 2.9806084590650506, + "grad_norm": 0.24314135577603835, + "learning_rate": 1.2474241234039506e-10, + "loss": 0.5394, + "step": 3770 + }, + { + "epoch": 2.981399950531783, + "grad_norm": 0.22682123727042564, + "learning_rate": 1.1195741751524046e-10, + "loss": 0.5147, + "step": 3771 + }, + { + "epoch": 2.982191441998516, + "grad_norm": 0.2333728467115532, + "learning_rate": 9.986345334134227e-11, + "loss": 0.5564, + "step": 3772 + }, + { + "epoch": 2.9829829334652485, + "grad_norm": 0.2369145629920544, + "learning_rate": 8.846052817701455e-11, + "loss": 0.5519, + "step": 3773 + }, + { + "epoch": 2.983774424931981, + "grad_norm": 0.23765521298710948, + "learning_rate": 7.774864990284235e-11, + "loss": 0.5294, + "step": 3774 + }, + { + "epoch": 2.9845659163987137, + "grad_norm": 0.24207765137101736, + "learning_rate": 6.772782592201487e-11, + "loss": 0.5433, + "step": 3775 + }, + { + "epoch": 2.9853574078654463, + "grad_norm": 0.23325791923099654, + "learning_rate": 5.839806315977025e-11, + "loss": 0.5381, + "step": 3776 + }, + { + "epoch": 2.986148899332179, + "grad_norm": 0.23140694777700227, + "learning_rate": 4.9759368064283826e-11, + "loss": 0.5334, + "step": 3777 + }, + { + "epoch": 2.9869403907989116, + "grad_norm": 0.2316247147131433, + "learning_rate": 4.181174660555786e-11, + "loss": 0.5349, + "step": 3778 + }, + { + "epoch": 2.9877318822656442, + "grad_norm": 0.23038555873196936, + "learning_rate": 3.4555204276309756e-11, + "loss": 0.5313, + "step": 3779 + }, + { + "epoch": 2.988523373732377, + "grad_norm": 0.2323028047014364, + "learning_rate": 2.7989746091749976e-11, + "loss": 0.5479, + "step": 3780 + }, + { + "epoch": 2.9893148651991095, + "grad_norm": 0.2351028460363793, + "learning_rate": 2.2115376589249e-11, + "loss": 0.536, + "step": 3781 + }, + { + "epoch": 2.990106356665842, + "grad_norm": 0.23625805916792003, + "learning_rate": 1.6932099828448344e-11, + "loss": 0.5518, + "step": 3782 + }, + { + "epoch": 2.9908978481325748, + "grad_norm": 0.27357256852199685, + "learning_rate": 1.2439919391815657e-11, + "loss": 0.5439, + "step": 3783 + }, + { + "epoch": 2.9916893395993074, + "grad_norm": 0.22866341813556185, + "learning_rate": 8.638838383867586e-12, + "loss": 0.547, + "step": 3784 + }, + { + "epoch": 2.99248083106604, + "grad_norm": 0.25991542219861574, + "learning_rate": 5.528859431391808e-12, + "loss": 0.5617, + "step": 3785 + }, + { + "epoch": 2.9932723225327726, + "grad_norm": 0.2268350128674517, + "learning_rate": 3.1099846840021427e-12, + "loss": 0.5443, + "step": 3786 + }, + { + "epoch": 2.9940638139995053, + "grad_norm": 0.23165237079234247, + "learning_rate": 1.3822158131393536e-12, + "loss": 0.5305, + "step": 3787 + }, + { + "epoch": 2.994855305466238, + "grad_norm": 0.2496839263118096, + "learning_rate": 3.4555401295932593e-13, + "loss": 0.5444, + "step": 3788 + }, + { + "epoch": 2.9956467969329705, + "grad_norm": 0.2291287178701386, + "learning_rate": 0.0, + "loss": 0.5383, + "step": 3789 + } + ], + "logging_steps": 1, + "max_steps": 3789, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1263, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6928365495713792e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}