PretrainedPHD-v2 / last-checkpoint /trainer_state.json
Nadav's picture
Training in progress, step 190000
1f2bdc9
raw
history blame
54.7 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2,
"global_step": 190000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 0.00010077985004622052,
"loss": 0.5572,
"step": 500
},
{
"epoch": 0.01,
"learning_rate": 0.00010077565027123787,
"loss": 0.5131,
"step": 1000
},
{
"epoch": 0.01,
"learning_rate": 0.00010076865093411392,
"loss": 0.505,
"step": 1500
},
{
"epoch": 0.01,
"learning_rate": 0.00010075885246660077,
"loss": 0.5001,
"step": 2000
},
{
"epoch": 0.01,
"learning_rate": 0.00010074625547311406,
"loss": 0.4965,
"step": 2500
},
{
"epoch": 0.01,
"learning_rate": 0.00010073086073069567,
"loss": 0.4937,
"step": 3000
},
{
"epoch": 0.02,
"learning_rate": 0.00010071266918896582,
"loss": 0.4921,
"step": 3500
},
{
"epoch": 0.02,
"learning_rate": 0.0001006916819700645,
"loss": 0.4885,
"step": 4000
},
{
"epoch": 0.02,
"learning_rate": 0.00010066790036858225,
"loss": 0.4872,
"step": 4500
},
{
"epoch": 0.03,
"learning_rate": 0.00010064132585148025,
"loss": 0.4861,
"step": 5000
},
{
"epoch": 0.03,
"eval_loss": 0.45849505066871643,
"eval_runtime": 280.4645,
"eval_samples_per_second": 153.317,
"eval_steps_per_second": 2.396,
"step": 5000
},
{
"epoch": 0.03,
"learning_rate": 0.0001006119600579999,
"loss": 0.4851,
"step": 5500
},
{
"epoch": 0.03,
"learning_rate": 0.00010057980479956167,
"loss": 0.4827,
"step": 6000
},
{
"epoch": 0.03,
"learning_rate": 0.00010054493472563566,
"loss": 0.4814,
"step": 6500
},
{
"epoch": 0.04,
"learning_rate": 0.00010050729045131372,
"loss": 0.4818,
"step": 7000
},
{
"epoch": 0.04,
"learning_rate": 0.0001004667905137074,
"loss": 0.4796,
"step": 7500
},
{
"epoch": 0.04,
"learning_rate": 0.00010042351006588448,
"loss": 0.4787,
"step": 8000
},
{
"epoch": 0.04,
"learning_rate": 0.00010037745177758686,
"loss": 0.4792,
"step": 8500
},
{
"epoch": 0.04,
"learning_rate": 0.00010032861848990674,
"loss": 0.4775,
"step": 9000
},
{
"epoch": 0.05,
"learning_rate": 0.0001002770132151113,
"loss": 0.4767,
"step": 9500
},
{
"epoch": 0.05,
"learning_rate": 0.00010022275064567964,
"loss": 0.4755,
"step": 10000
},
{
"epoch": 0.05,
"eval_loss": 0.44909462332725525,
"eval_runtime": 263.0328,
"eval_samples_per_second": 163.478,
"eval_steps_per_second": 2.555,
"step": 10000
},
{
"epoch": 0.05,
"learning_rate": 0.00010016561664465461,
"loss": 0.4746,
"step": 10500
},
{
"epoch": 0.06,
"learning_rate": 0.00010010572071123591,
"loss": 0.4733,
"step": 11000
},
{
"epoch": 0.06,
"learning_rate": 0.00010004306654008681,
"loss": 0.4721,
"step": 11500
},
{
"epoch": 0.06,
"learning_rate": 9.997765799601176e-05,
"loss": 0.473,
"step": 12000
},
{
"epoch": 0.06,
"learning_rate": 9.990949911371783e-05,
"loss": 0.4708,
"step": 12500
},
{
"epoch": 0.07,
"learning_rate": 9.983859409756594e-05,
"loss": 0.4723,
"step": 13000
},
{
"epoch": 0.07,
"learning_rate": 9.976494732131149e-05,
"loss": 0.4711,
"step": 13500
},
{
"epoch": 0.07,
"learning_rate": 9.968856332783455e-05,
"loss": 0.4713,
"step": 14000
},
{
"epoch": 0.07,
"learning_rate": 9.960944682885961e-05,
"loss": 0.4699,
"step": 14500
},
{
"epoch": 0.07,
"learning_rate": 9.952760270466502e-05,
"loss": 0.4705,
"step": 15000
},
{
"epoch": 0.07,
"eval_loss": 0.44262173771858215,
"eval_runtime": 272.1067,
"eval_samples_per_second": 158.026,
"eval_steps_per_second": 2.47,
"step": 15000
},
{
"epoch": 0.08,
"learning_rate": 9.94430360037819e-05,
"loss": 0.4686,
"step": 15500
},
{
"epoch": 0.08,
"learning_rate": 9.935575194268265e-05,
"loss": 0.469,
"step": 16000
},
{
"epoch": 0.08,
"learning_rate": 9.926612128456279e-05,
"loss": 0.467,
"step": 16500
},
{
"epoch": 0.09,
"learning_rate": 9.917342963701418e-05,
"loss": 0.4674,
"step": 17000
},
{
"epoch": 0.09,
"learning_rate": 9.907803725984013e-05,
"loss": 0.4662,
"step": 17500
},
{
"epoch": 0.09,
"learning_rate": 9.897995003729183e-05,
"loss": 0.4654,
"step": 18000
},
{
"epoch": 0.09,
"learning_rate": 9.887917401985114e-05,
"loss": 0.4647,
"step": 18500
},
{
"epoch": 0.1,
"learning_rate": 9.877592501404869e-05,
"loss": 0.4656,
"step": 19000
},
{
"epoch": 0.1,
"learning_rate": 9.866979556723038e-05,
"loss": 0.465,
"step": 19500
},
{
"epoch": 0.1,
"learning_rate": 9.856099645730841e-05,
"loss": 0.4649,
"step": 20000
},
{
"epoch": 0.1,
"eval_loss": 0.43856754899024963,
"eval_runtime": 269.8512,
"eval_samples_per_second": 159.347,
"eval_steps_per_second": 2.49,
"step": 20000
},
{
"epoch": 0.1,
"learning_rate": 9.844953439552432e-05,
"loss": 0.4627,
"step": 20500
},
{
"epoch": 0.1,
"learning_rate": 9.833564713977207e-05,
"loss": 0.4633,
"step": 21000
},
{
"epoch": 0.11,
"learning_rate": 9.821912141831972e-05,
"loss": 0.4627,
"step": 21500
},
{
"epoch": 0.11,
"learning_rate": 9.809972296167543e-05,
"loss": 0.4613,
"step": 22000
},
{
"epoch": 0.11,
"learning_rate": 9.797769000669104e-05,
"loss": 0.462,
"step": 22500
},
{
"epoch": 0.12,
"learning_rate": 9.785303008093405e-05,
"loss": 0.4605,
"step": 23000
},
{
"epoch": 0.12,
"learning_rate": 9.772575087401588e-05,
"loss": 0.4605,
"step": 23500
},
{
"epoch": 0.12,
"learning_rate": 9.759586023711756e-05,
"loss": 0.4587,
"step": 24000
},
{
"epoch": 0.12,
"learning_rate": 9.746336618250542e-05,
"loss": 0.4596,
"step": 24500
},
{
"epoch": 0.12,
"learning_rate": 9.732827688303682e-05,
"loss": 0.4579,
"step": 25000
},
{
"epoch": 0.12,
"eval_loss": 0.43463194370269775,
"eval_runtime": 278.2026,
"eval_samples_per_second": 154.564,
"eval_steps_per_second": 2.416,
"step": 25000
},
{
"epoch": 0.13,
"learning_rate": 9.71906006716561e-05,
"loss": 0.4588,
"step": 25500
},
{
"epoch": 0.13,
"learning_rate": 9.705034604088048e-05,
"loss": 0.4579,
"step": 26000
},
{
"epoch": 0.13,
"learning_rate": 9.690780984987233e-05,
"loss": 0.4586,
"step": 26500
},
{
"epoch": 0.14,
"learning_rate": 9.67624296065391e-05,
"loss": 0.4577,
"step": 27000
},
{
"epoch": 0.14,
"learning_rate": 9.661449735541914e-05,
"loss": 0.459,
"step": 27500
},
{
"epoch": 0.14,
"learning_rate": 9.646402222167052e-05,
"loss": 0.4576,
"step": 28000
},
{
"epoch": 0.14,
"learning_rate": 9.631101348730831e-05,
"loss": 0.4583,
"step": 28500
},
{
"epoch": 0.14,
"learning_rate": 9.615548059063195e-05,
"loss": 0.4571,
"step": 29000
},
{
"epoch": 0.15,
"learning_rate": 9.599775172365814e-05,
"loss": 0.4558,
"step": 29500
},
{
"epoch": 0.15,
"learning_rate": 9.58375280271162e-05,
"loss": 0.4555,
"step": 30000
},
{
"epoch": 0.15,
"eval_loss": 0.43021583557128906,
"eval_runtime": 269.3626,
"eval_samples_per_second": 159.636,
"eval_steps_per_second": 2.495,
"step": 30000
},
{
"epoch": 0.15,
"learning_rate": 9.567449078707802e-05,
"loss": 0.4555,
"step": 30500
},
{
"epoch": 0.15,
"learning_rate": 9.550896864844414e-05,
"loss": 0.4563,
"step": 31000
},
{
"epoch": 0.16,
"learning_rate": 9.534097182139975e-05,
"loss": 0.4554,
"step": 31500
},
{
"epoch": 0.16,
"learning_rate": 9.517051066878048e-05,
"loss": 0.4547,
"step": 32000
},
{
"epoch": 0.16,
"learning_rate": 9.499759570543318e-05,
"loss": 0.4537,
"step": 32500
},
{
"epoch": 0.17,
"learning_rate": 9.482223759756727e-05,
"loss": 0.4534,
"step": 33000
},
{
"epoch": 0.17,
"learning_rate": 9.464444716209686e-05,
"loss": 0.454,
"step": 33500
},
{
"epoch": 0.17,
"learning_rate": 9.446423536597349e-05,
"loss": 0.4528,
"step": 34000
},
{
"epoch": 0.17,
"learning_rate": 9.42816133255096e-05,
"loss": 0.4523,
"step": 34500
},
{
"epoch": 0.17,
"learning_rate": 9.409659230569288e-05,
"loss": 0.4543,
"step": 35000
},
{
"epoch": 0.17,
"eval_loss": 0.4261643886566162,
"eval_runtime": 271.7099,
"eval_samples_per_second": 158.257,
"eval_steps_per_second": 2.473,
"step": 35000
},
{
"epoch": 0.18,
"learning_rate": 9.390918371949136e-05,
"loss": 0.4533,
"step": 35500
},
{
"epoch": 0.18,
"learning_rate": 9.371978105983142e-05,
"loss": 0.4531,
"step": 36000
},
{
"epoch": 0.18,
"learning_rate": 9.352802352506516e-05,
"loss": 0.4508,
"step": 36500
},
{
"epoch": 0.18,
"learning_rate": 9.333353157269004e-05,
"loss": 0.4512,
"step": 37000
},
{
"epoch": 0.19,
"learning_rate": 9.313669912311046e-05,
"loss": 0.4522,
"step": 37500
},
{
"epoch": 0.19,
"learning_rate": 9.293753831787908e-05,
"loss": 0.4498,
"step": 38000
},
{
"epoch": 0.19,
"learning_rate": 9.27360614421725e-05,
"loss": 0.4491,
"step": 38500
},
{
"epoch": 0.2,
"learning_rate": 9.25326907757735e-05,
"loss": 0.4498,
"step": 39000
},
{
"epoch": 0.2,
"learning_rate": 9.232662375484976e-05,
"loss": 0.4503,
"step": 39500
},
{
"epoch": 0.2,
"learning_rate": 9.211827834753872e-05,
"loss": 0.4498,
"step": 40000
},
{
"epoch": 0.2,
"eval_loss": 0.42189013957977295,
"eval_runtime": 264.7676,
"eval_samples_per_second": 162.407,
"eval_steps_per_second": 2.538,
"step": 40000
},
{
"epoch": 0.0,
"learning_rate": 9.190766740556652e-05,
"loss": 0.4488,
"step": 40500
},
{
"epoch": 0.01,
"learning_rate": 9.169480392040811e-05,
"loss": 0.4482,
"step": 41000
},
{
"epoch": 0.01,
"learning_rate": 9.147970102248595e-05,
"loss": 0.4487,
"step": 41500
},
{
"epoch": 0.01,
"learning_rate": 9.126280885124848e-05,
"loss": 0.4484,
"step": 42000
},
{
"epoch": 0.01,
"learning_rate": 9.104327148280447e-05,
"loss": 0.4469,
"step": 42500
},
{
"epoch": 0.01,
"learning_rate": 9.082153489118645e-05,
"loss": 0.4483,
"step": 43000
},
{
"epoch": 0.02,
"learning_rate": 9.059761275415186e-05,
"loss": 0.4474,
"step": 43500
},
{
"epoch": 0.02,
"learning_rate": 9.037151888427288e-05,
"loss": 0.4483,
"step": 44000
},
{
"epoch": 0.02,
"learning_rate": 9.014372587553127e-05,
"loss": 0.4464,
"step": 44500
},
{
"epoch": 0.03,
"learning_rate": 8.991333478594443e-05,
"loss": 0.4466,
"step": 45000
},
{
"epoch": 0.03,
"eval_loss": 0.4197126626968384,
"eval_runtime": 296.3831,
"eval_samples_per_second": 145.083,
"eval_steps_per_second": 2.267,
"step": 45000
},
{
"epoch": 0.03,
"learning_rate": 8.968081417300147e-05,
"loss": 0.4457,
"step": 45500
},
{
"epoch": 0.03,
"learning_rate": 8.944664975260744e-05,
"loss": 0.4454,
"step": 46000
},
{
"epoch": 0.03,
"learning_rate": 8.920991743920979e-05,
"loss": 0.4466,
"step": 46500
},
{
"epoch": 0.04,
"learning_rate": 8.89710989925514e-05,
"loss": 0.445,
"step": 47000
},
{
"epoch": 0.04,
"learning_rate": 8.8730209144079e-05,
"loss": 0.4453,
"step": 47500
},
{
"epoch": 0.04,
"learning_rate": 8.848726275301312e-05,
"loss": 0.444,
"step": 48000
},
{
"epoch": 0.04,
"learning_rate": 8.824227480543154e-05,
"loss": 0.4442,
"step": 48500
},
{
"epoch": 0.04,
"learning_rate": 8.799526041334489e-05,
"loss": 0.4442,
"step": 49000
},
{
"epoch": 0.05,
"learning_rate": 8.77462348137644e-05,
"loss": 0.4433,
"step": 49500
},
{
"epoch": 0.05,
"learning_rate": 8.749521336776206e-05,
"loss": 0.443,
"step": 50000
},
{
"epoch": 0.05,
"eval_loss": 0.41723188757896423,
"eval_runtime": 268.864,
"eval_samples_per_second": 159.932,
"eval_steps_per_second": 2.499,
"step": 50000
},
{
"epoch": 0.05,
"learning_rate": 8.724271952918828e-05,
"loss": 0.4439,
"step": 50500
},
{
"epoch": 0.06,
"learning_rate": 8.69877568789119e-05,
"loss": 0.4432,
"step": 51000
},
{
"epoch": 0.06,
"learning_rate": 8.673084516870541e-05,
"loss": 0.4437,
"step": 51500
},
{
"epoch": 0.06,
"learning_rate": 8.647251985469168e-05,
"loss": 0.4429,
"step": 52000
},
{
"epoch": 0.06,
"learning_rate": 8.621176150492572e-05,
"loss": 0.4419,
"step": 52500
},
{
"epoch": 0.07,
"learning_rate": 8.594910196227962e-05,
"loss": 0.4422,
"step": 53000
},
{
"epoch": 0.07,
"learning_rate": 8.568455742883119e-05,
"loss": 0.4424,
"step": 53500
},
{
"epoch": 0.07,
"learning_rate": 8.54181442229334e-05,
"loss": 0.4421,
"step": 54000
},
{
"epoch": 0.07,
"learning_rate": 8.51498787782077e-05,
"loss": 0.4431,
"step": 54500
},
{
"epoch": 0.07,
"learning_rate": 8.487977764253034e-05,
"loss": 0.4408,
"step": 55000
},
{
"epoch": 0.07,
"eval_loss": 0.41320380568504333,
"eval_runtime": 276.5136,
"eval_samples_per_second": 155.508,
"eval_steps_per_second": 2.43,
"step": 55000
},
{
"epoch": 0.08,
"learning_rate": 8.460785747701169e-05,
"loss": 0.4411,
"step": 55500
},
{
"epoch": 0.08,
"learning_rate": 8.433468428726034e-05,
"loss": 0.4405,
"step": 56000
},
{
"epoch": 0.08,
"learning_rate": 8.405918004700074e-05,
"loss": 0.4399,
"step": 56500
},
{
"epoch": 0.09,
"learning_rate": 8.378190739522489e-05,
"loss": 0.4399,
"step": 57000
},
{
"epoch": 0.09,
"learning_rate": 8.350288343541602e-05,
"loss": 0.4396,
"step": 57500
},
{
"epoch": 0.09,
"learning_rate": 8.322212537908629e-05,
"loss": 0.4381,
"step": 58000
},
{
"epoch": 0.09,
"learning_rate": 8.29396505447151e-05,
"loss": 0.439,
"step": 58500
},
{
"epoch": 0.1,
"learning_rate": 8.265547635668083e-05,
"loss": 0.4398,
"step": 59000
},
{
"epoch": 0.1,
"learning_rate": 8.236962034418596e-05,
"loss": 0.4377,
"step": 59500
},
{
"epoch": 0.1,
"learning_rate": 8.208267682967567e-05,
"loss": 0.4393,
"step": 60000
},
{
"epoch": 0.1,
"eval_loss": 0.41322511434555054,
"eval_runtime": 270.2321,
"eval_samples_per_second": 159.122,
"eval_steps_per_second": 2.487,
"step": 60000
},
{
"epoch": 0.1,
"learning_rate": 8.179409340302407e-05,
"loss": 0.4383,
"step": 60500
},
{
"epoch": 0.1,
"learning_rate": 8.150330460315606e-05,
"loss": 0.4384,
"step": 61000
},
{
"epoch": 0.11,
"learning_rate": 8.121090505020843e-05,
"loss": 0.4387,
"step": 61500
},
{
"epoch": 0.11,
"learning_rate": 8.091691278076293e-05,
"loss": 0.4368,
"step": 62000
},
{
"epoch": 0.11,
"learning_rate": 8.062134592964756e-05,
"loss": 0.4377,
"step": 62500
},
{
"epoch": 0.12,
"learning_rate": 8.032481851629047e-05,
"loss": 0.4381,
"step": 63000
},
{
"epoch": 0.12,
"learning_rate": 8.00267591904771e-05,
"loss": 0.4371,
"step": 63500
},
{
"epoch": 0.12,
"learning_rate": 7.972658441054002e-05,
"loss": 0.4376,
"step": 64000
},
{
"epoch": 0.12,
"learning_rate": 7.94249084739939e-05,
"loss": 0.4359,
"step": 64500
},
{
"epoch": 0.12,
"learning_rate": 7.912174998963153e-05,
"loss": 0.4359,
"step": 65000
},
{
"epoch": 0.12,
"eval_loss": 0.41053175926208496,
"eval_runtime": 268.5256,
"eval_samples_per_second": 160.134,
"eval_steps_per_second": 2.503,
"step": 65000
},
{
"epoch": 0.13,
"learning_rate": 7.88171276576962e-05,
"loss": 0.4359,
"step": 65500
},
{
"epoch": 0.13,
"learning_rate": 7.851167383313914e-05,
"loss": 0.4372,
"step": 66000
},
{
"epoch": 0.13,
"learning_rate": 7.820418310027099e-05,
"loss": 0.436,
"step": 66500
},
{
"epoch": 0.14,
"learning_rate": 7.789528511967774e-05,
"loss": 0.4355,
"step": 67000
},
{
"epoch": 0.14,
"learning_rate": 7.758499894564191e-05,
"loss": 0.4333,
"step": 67500
},
{
"epoch": 0.14,
"learning_rate": 7.727334371807629e-05,
"loss": 0.4338,
"step": 68000
},
{
"epoch": 0.14,
"learning_rate": 7.696033866134332e-05,
"loss": 0.4355,
"step": 68500
},
{
"epoch": 0.14,
"learning_rate": 7.664600308306929e-05,
"loss": 0.4351,
"step": 69000
},
{
"epoch": 0.15,
"learning_rate": 7.633035637295333e-05,
"loss": 0.4349,
"step": 69500
},
{
"epoch": 0.15,
"learning_rate": 7.601341800157135e-05,
"loss": 0.4345,
"step": 70000
},
{
"epoch": 0.15,
"eval_loss": 0.4069764316082001,
"eval_runtime": 286.0485,
"eval_samples_per_second": 150.324,
"eval_steps_per_second": 2.349,
"step": 70000
},
{
"epoch": 0.15,
"learning_rate": 7.569584519667294e-05,
"loss": 0.4335,
"step": 70500
},
{
"epoch": 0.15,
"learning_rate": 7.537702487519748e-05,
"loss": 0.4336,
"step": 71000
},
{
"epoch": 0.16,
"learning_rate": 7.505633402594554e-05,
"loss": 0.4338,
"step": 71500
},
{
"epoch": 0.16,
"learning_rate": 7.473443010312711e-05,
"loss": 0.4342,
"step": 72000
},
{
"epoch": 0.16,
"learning_rate": 7.441133296329247e-05,
"loss": 0.4325,
"step": 72500
},
{
"epoch": 0.17,
"learning_rate": 7.408771223510569e-05,
"loss": 0.433,
"step": 73000
},
{
"epoch": 0.17,
"learning_rate": 7.376229081063164e-05,
"loss": 0.4322,
"step": 73500
},
{
"epoch": 0.17,
"learning_rate": 7.343573613527213e-05,
"loss": 0.4323,
"step": 74000
},
{
"epoch": 0.17,
"learning_rate": 7.310872478548158e-05,
"loss": 0.4329,
"step": 74500
},
{
"epoch": 0.17,
"learning_rate": 7.277996627287863e-05,
"loss": 0.4321,
"step": 75000
},
{
"epoch": 0.17,
"eval_loss": 0.40518081188201904,
"eval_runtime": 272.4309,
"eval_samples_per_second": 157.838,
"eval_steps_per_second": 2.467,
"step": 75000
},
{
"epoch": 0.18,
"learning_rate": 7.245013510379486e-05,
"loss": 0.4324,
"step": 75500
},
{
"epoch": 0.18,
"learning_rate": 7.211925162377042e-05,
"loss": 0.4305,
"step": 76000
},
{
"epoch": 0.18,
"learning_rate": 7.178733624325697e-05,
"loss": 0.4312,
"step": 76500
},
{
"epoch": 0.18,
"learning_rate": 7.145440943635861e-05,
"loss": 0.4318,
"step": 77000
},
{
"epoch": 0.19,
"learning_rate": 7.112049173956905e-05,
"loss": 0.43,
"step": 77500
},
{
"epoch": 0.19,
"learning_rate": 7.07856037505047e-05,
"loss": 0.431,
"step": 78000
},
{
"epoch": 0.19,
"learning_rate": 7.04497661266342e-05,
"loss": 0.4302,
"step": 78500
},
{
"epoch": 0.2,
"learning_rate": 7.01129995840041e-05,
"loss": 0.4288,
"step": 79000
},
{
"epoch": 0.2,
"learning_rate": 6.97753248959611e-05,
"loss": 0.4321,
"step": 79500
},
{
"epoch": 0.2,
"learning_rate": 6.943676289187054e-05,
"loss": 0.4288,
"step": 80000
},
{
"epoch": 0.2,
"eval_loss": 0.4031592011451721,
"eval_runtime": 278.2873,
"eval_samples_per_second": 154.517,
"eval_steps_per_second": 2.415,
"step": 80000
},
{
"epoch": 0.0,
"learning_rate": 6.9098014163495e-05,
"loss": 0.4297,
"step": 80500
},
{
"epoch": 0.01,
"learning_rate": 6.875774190311069e-05,
"loss": 0.429,
"step": 81000
},
{
"epoch": 0.01,
"learning_rate": 6.841732809852635e-05,
"loss": 0.4286,
"step": 81500
},
{
"epoch": 0.01,
"learning_rate": 6.80754293711102e-05,
"loss": 0.4287,
"step": 82000
},
{
"epoch": 0.01,
"learning_rate": 6.773274818520966e-05,
"loss": 0.4273,
"step": 82500
},
{
"epoch": 0.01,
"learning_rate": 6.738930567901446e-05,
"loss": 0.4277,
"step": 83000
},
{
"epoch": 0.02,
"learning_rate": 6.704512303767616e-05,
"loss": 0.4283,
"step": 83500
},
{
"epoch": 0.02,
"learning_rate": 6.67002214920013e-05,
"loss": 0.4286,
"step": 84000
},
{
"epoch": 0.02,
"learning_rate": 6.635462231714186e-05,
"loss": 0.4278,
"step": 84500
},
{
"epoch": 0.03,
"learning_rate": 6.600904004302253e-05,
"loss": 0.4269,
"step": 85000
},
{
"epoch": 0.03,
"eval_loss": 0.40107953548431396,
"eval_runtime": 324.3295,
"eval_samples_per_second": 132.581,
"eval_steps_per_second": 2.072,
"step": 85000
},
{
"epoch": 0.03,
"learning_rate": 6.566211089462497e-05,
"loss": 0.4278,
"step": 85500
},
{
"epoch": 0.03,
"learning_rate": 6.531454815259442e-05,
"loss": 0.4275,
"step": 86000
},
{
"epoch": 0.03,
"learning_rate": 6.496637325623796e-05,
"loss": 0.4251,
"step": 86500
},
{
"epoch": 0.04,
"learning_rate": 6.461760768262325e-05,
"loss": 0.4271,
"step": 87000
},
{
"epoch": 0.04,
"learning_rate": 6.426897216843693e-05,
"loss": 0.4261,
"step": 87500
},
{
"epoch": 0.04,
"learning_rate": 6.391909088962678e-05,
"loss": 0.4262,
"step": 88000
},
{
"epoch": 0.04,
"learning_rate": 6.356868353486788e-05,
"loss": 0.4275,
"step": 88500
},
{
"epoch": 0.04,
"learning_rate": 6.321777171893646e-05,
"loss": 0.4251,
"step": 89000
},
{
"epoch": 0.05,
"learning_rate": 6.286637708772627e-05,
"loss": 0.4256,
"step": 89500
},
{
"epoch": 0.05,
"learning_rate": 6.251452131691338e-05,
"loss": 0.4246,
"step": 90000
},
{
"epoch": 0.05,
"eval_loss": 0.399631530046463,
"eval_runtime": 304.7497,
"eval_samples_per_second": 141.099,
"eval_steps_per_second": 2.205,
"step": 90000
},
{
"epoch": 0.05,
"learning_rate": 6.216293112514862e-05,
"loss": 0.4251,
"step": 90500
},
{
"epoch": 0.06,
"learning_rate": 6.181021902830414e-05,
"loss": 0.4251,
"step": 91000
},
{
"epoch": 0.06,
"learning_rate": 6.145711094066101e-05,
"loss": 0.4256,
"step": 91500
},
{
"epoch": 0.06,
"learning_rate": 6.110362864358936e-05,
"loss": 0.4232,
"step": 92000
},
{
"epoch": 0.06,
"learning_rate": 6.07505019481429e-05,
"loss": 0.4244,
"step": 92500
},
{
"epoch": 0.07,
"learning_rate": 6.039633730667286e-05,
"loss": 0.4252,
"step": 93000
},
{
"epoch": 0.07,
"learning_rate": 6.004186388928874e-05,
"loss": 0.4234,
"step": 93500
},
{
"epoch": 0.07,
"learning_rate": 5.968710356158062e-05,
"loss": 0.4232,
"step": 94000
},
{
"epoch": 0.07,
"learning_rate": 5.933207820683662e-05,
"loss": 0.4229,
"step": 94500
},
{
"epoch": 0.07,
"learning_rate": 5.897752048973475e-05,
"loss": 0.4227,
"step": 95000
},
{
"epoch": 0.07,
"eval_loss": 0.3970061242580414,
"eval_runtime": 304.9641,
"eval_samples_per_second": 141.0,
"eval_steps_per_second": 2.204,
"step": 95000
},
{
"epoch": 0.08,
"learning_rate": 5.8622031215370216e-05,
"loss": 0.4228,
"step": 95500
},
{
"epoch": 0.08,
"learning_rate": 5.826634261264905e-05,
"loss": 0.4215,
"step": 96000
},
{
"epoch": 0.08,
"learning_rate": 5.7910476622119674e-05,
"loss": 0.4227,
"step": 96500
},
{
"epoch": 0.09,
"learning_rate": 5.7554455195272715e-05,
"loss": 0.4227,
"step": 97000
},
{
"epoch": 0.09,
"learning_rate": 5.7198300293186744e-05,
"loss": 0.4218,
"step": 97500
},
{
"epoch": 0.09,
"learning_rate": 5.684203388517376e-05,
"loss": 0.4214,
"step": 98000
},
{
"epoch": 0.09,
"learning_rate": 5.6485677947424015e-05,
"loss": 0.4217,
"step": 98500
},
{
"epoch": 0.1,
"learning_rate": 5.612996736142203e-05,
"loss": 0.4224,
"step": 99000
},
{
"epoch": 0.1,
"learning_rate": 5.577421135154171e-05,
"loss": 0.4213,
"step": 99500
},
{
"epoch": 0.1,
"learning_rate": 5.5417718780659065e-05,
"loss": 0.423,
"step": 100000
},
{
"epoch": 0.1,
"eval_loss": 0.395292192697525,
"eval_runtime": 296.1319,
"eval_samples_per_second": 145.206,
"eval_steps_per_second": 2.269,
"step": 100000
},
{
"epoch": 0.1,
"learning_rate": 5.50612245385044e-05,
"loss": 0.4212,
"step": 100500
},
{
"epoch": 0.1,
"learning_rate": 5.470475061532183e-05,
"loss": 0.4202,
"step": 101000
},
{
"epoch": 0.11,
"learning_rate": 5.4348319000102045e-05,
"loss": 0.4217,
"step": 101500
},
{
"epoch": 0.11,
"learning_rate": 5.399266433509025e-05,
"loss": 0.4193,
"step": 102000
},
{
"epoch": 0.11,
"learning_rate": 5.363638309648435e-05,
"loss": 0.4214,
"step": 102500
},
{
"epoch": 0.12,
"learning_rate": 5.328021006778228e-05,
"loss": 0.4197,
"step": 103000
},
{
"epoch": 0.12,
"learning_rate": 5.292416721941417e-05,
"loss": 0.4197,
"step": 103500
},
{
"epoch": 0.12,
"learning_rate": 5.2568988128759686e-05,
"loss": 0.4195,
"step": 104000
},
{
"epoch": 0.12,
"learning_rate": 5.2213271148780935e-05,
"loss": 0.4199,
"step": 104500
},
{
"epoch": 0.12,
"learning_rate": 5.1857750162954374e-05,
"loss": 0.4206,
"step": 105000
},
{
"epoch": 0.12,
"eval_loss": 0.394319623708725,
"eval_runtime": 306.4174,
"eval_samples_per_second": 140.331,
"eval_steps_per_second": 2.193,
"step": 105000
},
{
"epoch": 0.13,
"learning_rate": 5.1502447101489115e-05,
"loss": 0.4185,
"step": 105500
},
{
"epoch": 0.13,
"learning_rate": 5.114738388115157e-05,
"loss": 0.4184,
"step": 106000
},
{
"epoch": 0.13,
"learning_rate": 5.079329173109894e-05,
"loss": 0.4179,
"step": 106500
},
{
"epoch": 0.14,
"learning_rate": 5.043877329369859e-05,
"loss": 0.4199,
"step": 107000
},
{
"epoch": 0.14,
"learning_rate": 5.008456030983681e-05,
"loss": 0.419,
"step": 107500
},
{
"epoch": 0.14,
"learning_rate": 4.973067462903889e-05,
"loss": 0.4167,
"step": 108000
},
{
"epoch": 0.14,
"learning_rate": 4.937784479080462e-05,
"loss": 0.417,
"step": 108500
},
{
"epoch": 0.14,
"learning_rate": 4.902467841897598e-05,
"loss": 0.418,
"step": 109000
},
{
"epoch": 0.15,
"learning_rate": 4.867190472871874e-05,
"loss": 0.4177,
"step": 109500
},
{
"epoch": 0.15,
"learning_rate": 4.8319545480775854e-05,
"loss": 0.4172,
"step": 110000
},
{
"epoch": 0.15,
"eval_loss": 0.3924800455570221,
"eval_runtime": 289.0122,
"eval_samples_per_second": 148.783,
"eval_steps_per_second": 2.325,
"step": 110000
},
{
"epoch": 0.15,
"learning_rate": 4.796762241032546e-05,
"loss": 0.4173,
"step": 110500
},
{
"epoch": 0.15,
"learning_rate": 4.7616157225640255e-05,
"loss": 0.417,
"step": 111000
},
{
"epoch": 0.16,
"learning_rate": 4.7265873084984087e-05,
"loss": 0.4173,
"step": 111500
},
{
"epoch": 0.16,
"learning_rate": 4.6915387658312535e-05,
"loss": 0.416,
"step": 112000
},
{
"epoch": 0.16,
"learning_rate": 4.656542502420213e-05,
"loss": 0.4165,
"step": 112500
},
{
"epoch": 0.17,
"learning_rate": 4.621600676999665e-05,
"loss": 0.4163,
"step": 113000
},
{
"epoch": 0.17,
"learning_rate": 4.586715444945989e-05,
"loss": 0.4161,
"step": 113500
},
{
"epoch": 0.17,
"learning_rate": 4.551888958144626e-05,
"loss": 0.417,
"step": 114000
},
{
"epoch": 0.17,
"learning_rate": 4.517123364857326e-05,
"loss": 0.4158,
"step": 114500
},
{
"epoch": 0.17,
"learning_rate": 4.4824208095896454e-05,
"loss": 0.416,
"step": 115000
},
{
"epoch": 0.17,
"eval_loss": 0.39157894253730774,
"eval_runtime": 277.9435,
"eval_samples_per_second": 154.708,
"eval_steps_per_second": 2.418,
"step": 115000
},
{
"epoch": 0.18,
"learning_rate": 4.4477834329586547e-05,
"loss": 0.4148,
"step": 115500
},
{
"epoch": 0.18,
"learning_rate": 4.4132133715609044e-05,
"loss": 0.4157,
"step": 116000
},
{
"epoch": 0.18,
"learning_rate": 4.378712757840617e-05,
"loss": 0.4154,
"step": 116500
},
{
"epoch": 0.18,
"learning_rate": 4.344283719958163e-05,
"loss": 0.4156,
"step": 117000
},
{
"epoch": 0.19,
"learning_rate": 4.3099283816587726e-05,
"loss": 0.415,
"step": 117500
},
{
"epoch": 0.19,
"learning_rate": 4.2757173441071825e-05,
"loss": 0.4143,
"step": 118000
},
{
"epoch": 0.19,
"learning_rate": 4.241515599920613e-05,
"loss": 0.4152,
"step": 118500
},
{
"epoch": 0.2,
"learning_rate": 4.207462056671048e-05,
"loss": 0.4138,
"step": 119000
},
{
"epoch": 0.2,
"learning_rate": 4.173422328497157e-05,
"loss": 0.4143,
"step": 119500
},
{
"epoch": 0.2,
"learning_rate": 4.139466839441784e-05,
"loss": 0.4154,
"step": 120000
},
{
"epoch": 0.2,
"eval_loss": 0.3890155851840973,
"eval_runtime": 291.4194,
"eval_samples_per_second": 147.554,
"eval_steps_per_second": 2.306,
"step": 120000
},
{
"epoch": 0.0,
"learning_rate": 4.105597684039436e-05,
"loss": 0.4022,
"step": 120500
},
{
"epoch": 0.01,
"learning_rate": 4.071816951499159e-05,
"loss": 0.3996,
"step": 121000
},
{
"epoch": 0.01,
"learning_rate": 4.038194014319665e-05,
"loss": 0.3976,
"step": 121500
},
{
"epoch": 0.01,
"learning_rate": 4.00459618594495e-05,
"loss": 0.3971,
"step": 122000
},
{
"epoch": 0.01,
"learning_rate": 3.971093010680468e-05,
"loss": 0.3963,
"step": 122500
},
{
"epoch": 0.01,
"learning_rate": 3.937686555159882e-05,
"loss": 0.3957,
"step": 123000
},
{
"epoch": 0.02,
"learning_rate": 3.9044453954514625e-05,
"loss": 0.395,
"step": 123500
},
{
"epoch": 0.02,
"learning_rate": 3.871238351611711e-05,
"loss": 0.3956,
"step": 124000
},
{
"epoch": 0.02,
"learning_rate": 3.83820029131894e-05,
"loss": 0.3954,
"step": 124500
},
{
"epoch": 0.03,
"learning_rate": 3.805200836123052e-05,
"loss": 0.394,
"step": 125000
},
{
"epoch": 0.03,
"eval_loss": 0.3622290790081024,
"eval_runtime": 282.4866,
"eval_samples_per_second": 152.22,
"eval_steps_per_second": 2.379,
"step": 125000
},
{
"epoch": 0.03,
"learning_rate": 3.772308333681807e-05,
"loss": 0.3947,
"step": 125500
},
{
"epoch": 0.03,
"learning_rate": 3.739524812959698e-05,
"loss": 0.3934,
"step": 126000
},
{
"epoch": 0.03,
"learning_rate": 3.7068522961987034e-05,
"loss": 0.3918,
"step": 126500
},
{
"epoch": 0.04,
"learning_rate": 3.6742927987935615e-05,
"loss": 0.3942,
"step": 127000
},
{
"epoch": 0.04,
"learning_rate": 3.6418483291674376e-05,
"loss": 0.3934,
"step": 127500
},
{
"epoch": 0.04,
"learning_rate": 3.6095208886480486e-05,
"loss": 0.3927,
"step": 128000
},
{
"epoch": 0.04,
"learning_rate": 3.577312471344201e-05,
"loss": 0.3949,
"step": 128500
},
{
"epoch": 0.04,
"learning_rate": 3.545225064022787e-05,
"loss": 0.3927,
"step": 129000
},
{
"epoch": 0.05,
"learning_rate": 3.513324450767193e-05,
"loss": 0.3933,
"step": 129500
},
{
"epoch": 0.05,
"learning_rate": 3.481484741846708e-05,
"loss": 0.3911,
"step": 130000
},
{
"epoch": 0.05,
"eval_loss": 0.3602633476257324,
"eval_runtime": 265.9858,
"eval_samples_per_second": 161.663,
"eval_steps_per_second": 2.526,
"step": 130000
},
{
"epoch": 0.05,
"learning_rate": 3.449771954014401e-05,
"loss": 0.393,
"step": 130500
},
{
"epoch": 0.06,
"learning_rate": 3.418188043464409e-05,
"loss": 0.3919,
"step": 131000
},
{
"epoch": 0.06,
"learning_rate": 3.3867977327563496e-05,
"loss": 0.3922,
"step": 131500
},
{
"epoch": 0.06,
"learning_rate": 3.3554771459719055e-05,
"loss": 0.3908,
"step": 132000
},
{
"epoch": 0.06,
"learning_rate": 3.324291253017703e-05,
"loss": 0.39,
"step": 132500
},
{
"epoch": 0.07,
"learning_rate": 3.29330393851866e-05,
"loss": 0.391,
"step": 133000
},
{
"epoch": 0.07,
"learning_rate": 3.262454599427242e-05,
"loss": 0.39,
"step": 133500
},
{
"epoch": 0.07,
"learning_rate": 3.231683730748652e-05,
"loss": 0.3907,
"step": 134000
},
{
"epoch": 0.07,
"learning_rate": 3.201055192061272e-05,
"loss": 0.3888,
"step": 134500
},
{
"epoch": 0.07,
"learning_rate": 3.170570872677642e-05,
"loss": 0.3891,
"step": 135000
},
{
"epoch": 0.07,
"eval_loss": 0.35786134004592896,
"eval_runtime": 272.1536,
"eval_samples_per_second": 157.999,
"eval_steps_per_second": 2.469,
"step": 135000
},
{
"epoch": 0.08,
"learning_rate": 3.1402326530141794e-05,
"loss": 0.3904,
"step": 135500
},
{
"epoch": 0.08,
"learning_rate": 3.110042404475174e-05,
"loss": 0.3873,
"step": 136000
},
{
"epoch": 0.08,
"learning_rate": 3.080001989337368e-05,
"loss": 0.3885,
"step": 136500
},
{
"epoch": 0.09,
"learning_rate": 3.050113260635069e-05,
"loss": 0.3892,
"step": 137000
},
{
"epoch": 0.09,
"learning_rate": 3.0203780620458514e-05,
"loss": 0.3891,
"step": 137500
},
{
"epoch": 0.09,
"learning_rate": 2.990798227776831e-05,
"loss": 0.3884,
"step": 138000
},
{
"epoch": 0.09,
"learning_rate": 2.9613755824515164e-05,
"loss": 0.3893,
"step": 138500
},
{
"epoch": 0.1,
"learning_rate": 2.9321119409972645e-05,
"loss": 0.389,
"step": 139000
},
{
"epoch": 0.1,
"learning_rate": 2.903009108533329e-05,
"loss": 0.3884,
"step": 139500
},
{
"epoch": 0.1,
"learning_rate": 2.8740688802594957e-05,
"loss": 0.3892,
"step": 140000
},
{
"epoch": 0.1,
"eval_loss": 0.3559122383594513,
"eval_runtime": 296.1288,
"eval_samples_per_second": 145.207,
"eval_steps_per_second": 2.269,
"step": 140000
},
{
"epoch": 0.1,
"learning_rate": 2.8453504277811132e-05,
"loss": 0.3876,
"step": 140500
},
{
"epoch": 0.1,
"learning_rate": 2.816740419162523e-05,
"loss": 0.3869,
"step": 141000
},
{
"epoch": 0.11,
"learning_rate": 2.7882983361931612e-05,
"loss": 0.3877,
"step": 141500
},
{
"epoch": 0.11,
"learning_rate": 2.760025933314676e-05,
"loss": 0.3862,
"step": 142000
},
{
"epoch": 0.11,
"learning_rate": 2.7320370146396765e-05,
"loss": 0.3872,
"step": 142500
},
{
"epoch": 0.12,
"learning_rate": 2.7041084972249825e-05,
"loss": 0.3865,
"step": 143000
},
{
"epoch": 0.12,
"learning_rate": 2.6763548531270056e-05,
"loss": 0.3866,
"step": 143500
},
{
"epoch": 0.12,
"learning_rate": 2.648777794321244e-05,
"loss": 0.3857,
"step": 144000
},
{
"epoch": 0.12,
"learning_rate": 2.621379021890586e-05,
"loss": 0.3857,
"step": 144500
},
{
"epoch": 0.12,
"learning_rate": 2.5941602259203728e-05,
"loss": 0.3873,
"step": 145000
},
{
"epoch": 0.12,
"eval_loss": 0.3540988564491272,
"eval_runtime": 283.9076,
"eval_samples_per_second": 151.458,
"eval_steps_per_second": 2.367,
"step": 145000
},
{
"epoch": 0.13,
"learning_rate": 2.5671230853941542e-05,
"loss": 0.3862,
"step": 145500
},
{
"epoch": 0.13,
"learning_rate": 2.5402692680901127e-05,
"loss": 0.3861,
"step": 146000
},
{
"epoch": 0.13,
"learning_rate": 2.5136004304781915e-05,
"loss": 0.3857,
"step": 146500
},
{
"epoch": 0.14,
"learning_rate": 2.487170994704443e-05,
"loss": 0.3851,
"step": 147000
},
{
"epoch": 0.14,
"learning_rate": 2.4608766620042728e-05,
"loss": 0.3852,
"step": 147500
},
{
"epoch": 0.14,
"learning_rate": 2.434772206306137e-05,
"loss": 0.3838,
"step": 148000
},
{
"epoch": 0.14,
"learning_rate": 2.4088592378558306e-05,
"loss": 0.3852,
"step": 148500
},
{
"epoch": 0.14,
"learning_rate": 2.3831393550873072e-05,
"loss": 0.3847,
"step": 149000
},
{
"epoch": 0.15,
"learning_rate": 2.3576649996138134e-05,
"loss": 0.3836,
"step": 149500
},
{
"epoch": 0.15,
"learning_rate": 2.332335641714342e-05,
"loss": 0.3842,
"step": 150000
},
{
"epoch": 0.15,
"eval_loss": 0.35215088725090027,
"eval_runtime": 263.7454,
"eval_samples_per_second": 163.036,
"eval_steps_per_second": 2.548,
"step": 150000
},
{
"epoch": 0.0,
"learning_rate": 2.3072541544934485e-05,
"loss": 0.3809,
"step": 150500
},
{
"epoch": 0.01,
"learning_rate": 2.2823215586103048e-05,
"loss": 0.3803,
"step": 151000
},
{
"epoch": 0.01,
"learning_rate": 2.2575898538490455e-05,
"loss": 0.3791,
"step": 151500
},
{
"epoch": 0.01,
"learning_rate": 2.2330605657777174e-05,
"loss": 0.3789,
"step": 152000
},
{
"epoch": 0.01,
"learning_rate": 2.2087352074783502e-05,
"loss": 0.3785,
"step": 152500
},
{
"epoch": 0.01,
"learning_rate": 2.1846152794536308e-05,
"loss": 0.3783,
"step": 153000
},
{
"epoch": 0.02,
"learning_rate": 2.1607022695343304e-05,
"loss": 0.3779,
"step": 153500
},
{
"epoch": 0.02,
"learning_rate": 2.1369976527875413e-05,
"loss": 0.3786,
"step": 154000
},
{
"epoch": 0.02,
"learning_rate": 2.1135496705472888e-05,
"loss": 0.3786,
"step": 154500
},
{
"epoch": 0.03,
"learning_rate": 2.090265789790893e-05,
"loss": 0.3777,
"step": 155000
},
{
"epoch": 0.03,
"eval_loss": 0.3511093854904175,
"eval_runtime": 298.7311,
"eval_samples_per_second": 143.942,
"eval_steps_per_second": 2.25,
"step": 155000
},
{
"epoch": 0.03,
"learning_rate": 2.067194647060863e-05,
"loss": 0.3785,
"step": 155500
},
{
"epoch": 0.03,
"learning_rate": 2.044337665493961e-05,
"loss": 0.3775,
"step": 156000
},
{
"epoch": 0.03,
"learning_rate": 2.0216962550164754e-05,
"loss": 0.376,
"step": 156500
},
{
"epoch": 0.04,
"learning_rate": 1.9993164436873545e-05,
"loss": 0.3785,
"step": 157000
},
{
"epoch": 0.04,
"learning_rate": 1.977109913818189e-05,
"loss": 0.3776,
"step": 157500
},
{
"epoch": 0.04,
"learning_rate": 1.9551231019628344e-05,
"loss": 0.3772,
"step": 158000
},
{
"epoch": 0.04,
"learning_rate": 1.933357364371412e-05,
"loss": 0.3792,
"step": 158500
},
{
"epoch": 0.04,
"learning_rate": 1.9118569074409627e-05,
"loss": 0.3773,
"step": 159000
},
{
"epoch": 0.05,
"learning_rate": 1.8905368836887298e-05,
"loss": 0.3781,
"step": 159500
},
{
"epoch": 0.05,
"learning_rate": 1.8694419181825997e-05,
"loss": 0.376,
"step": 160000
},
{
"epoch": 0.05,
"eval_loss": 0.3505874574184418,
"eval_runtime": 298.148,
"eval_samples_per_second": 144.224,
"eval_steps_per_second": 2.254,
"step": 160000
},
{
"epoch": 0.05,
"learning_rate": 1.848573312159415e-05,
"loss": 0.378,
"step": 160500
},
{
"epoch": 0.06,
"learning_rate": 1.8279734067715378e-05,
"loss": 0.3769,
"step": 161000
},
{
"epoch": 0.06,
"learning_rate": 1.8075609083923823e-05,
"loss": 0.3772,
"step": 161500
},
{
"epoch": 0.06,
"learning_rate": 1.7873785866083376e-05,
"loss": 0.3761,
"step": 162000
},
{
"epoch": 0.06,
"learning_rate": 1.7674276863600826e-05,
"loss": 0.3754,
"step": 162500
},
{
"epoch": 0.07,
"learning_rate": 1.747709438313117e-05,
"loss": 0.3761,
"step": 163000
},
{
"epoch": 0.07,
"learning_rate": 1.728225058781864e-05,
"loss": 0.3757,
"step": 163500
},
{
"epoch": 0.07,
"learning_rate": 1.7089757496546325e-05,
"loss": 0.3764,
"step": 164000
},
{
"epoch": 0.07,
"learning_rate": 1.6899626983194823e-05,
"loss": 0.3746,
"step": 164500
},
{
"epoch": 0.07,
"learning_rate": 1.6712243911044467e-05,
"loss": 0.375,
"step": 165000
},
{
"epoch": 0.07,
"eval_loss": 0.34934455156326294,
"eval_runtime": 286.3366,
"eval_samples_per_second": 150.173,
"eval_steps_per_second": 2.347,
"step": 165000
},
{
"epoch": 0.08,
"learning_rate": 1.6526868808276845e-05,
"loss": 0.3763,
"step": 165500
},
{
"epoch": 0.08,
"learning_rate": 1.6344254560630497e-05,
"loss": 0.3733,
"step": 166000
},
{
"epoch": 0.08,
"learning_rate": 1.616368051555008e-05,
"loss": 0.3749,
"step": 166500
},
{
"epoch": 0.09,
"learning_rate": 1.5985526173180873e-05,
"loss": 0.3755,
"step": 167000
},
{
"epoch": 0.09,
"learning_rate": 1.580980252292188e-05,
"loss": 0.3754,
"step": 167500
},
{
"epoch": 0.09,
"learning_rate": 1.563652040423561e-05,
"loss": 0.3747,
"step": 168000
},
{
"epoch": 0.09,
"learning_rate": 1.546569050597935e-05,
"loss": 0.3758,
"step": 168500
},
{
"epoch": 0.1,
"learning_rate": 1.5297657635265495e-05,
"loss": 0.3756,
"step": 169000
},
{
"epoch": 0.1,
"learning_rate": 1.5131758682182151e-05,
"loss": 0.375,
"step": 169500
},
{
"epoch": 0.1,
"learning_rate": 1.49683430856095e-05,
"loss": 0.3759,
"step": 170000
},
{
"epoch": 0.1,
"eval_loss": 0.3487904369831085,
"eval_runtime": 298.7435,
"eval_samples_per_second": 143.936,
"eval_steps_per_second": 2.249,
"step": 170000
},
{
"epoch": 0.1,
"learning_rate": 1.4807420925791258e-05,
"loss": 0.3746,
"step": 170500
},
{
"epoch": 0.1,
"learning_rate": 1.4649002129164283e-05,
"loss": 0.3739,
"step": 171000
},
{
"epoch": 0.11,
"learning_rate": 1.449309646774616e-05,
"loss": 0.3747,
"step": 171500
},
{
"epoch": 0.11,
"learning_rate": 1.4340017800329543e-05,
"loss": 0.3735,
"step": 172000
},
{
"epoch": 0.11,
"learning_rate": 1.4189162030934715e-05,
"loss": 0.3744,
"step": 172500
},
{
"epoch": 0.12,
"learning_rate": 1.4040847761852026e-05,
"loss": 0.3739,
"step": 173000
},
{
"epoch": 0.12,
"learning_rate": 1.3895084141804244e-05,
"loss": 0.374,
"step": 173500
},
{
"epoch": 0.12,
"learning_rate": 1.3751880162178036e-05,
"loss": 0.3731,
"step": 174000
},
{
"epoch": 0.12,
"learning_rate": 1.3611523358351411e-05,
"loss": 0.3732,
"step": 174500
},
{
"epoch": 0.12,
"learning_rate": 1.3473459838770744e-05,
"loss": 0.3751,
"step": 175000
},
{
"epoch": 0.12,
"eval_loss": 0.3473358154296875,
"eval_runtime": 291.1365,
"eval_samples_per_second": 147.697,
"eval_steps_per_second": 2.308,
"step": 175000
},
{
"epoch": 0.13,
"learning_rate": 1.3337981967384716e-05,
"loss": 0.3742,
"step": 175500
},
{
"epoch": 0.13,
"learning_rate": 1.3205098101106558e-05,
"loss": 0.374,
"step": 176000
},
{
"epoch": 0.13,
"learning_rate": 1.3074816436839109e-05,
"loss": 0.3738,
"step": 176500
},
{
"epoch": 0.14,
"learning_rate": 1.2947145010969087e-05,
"loss": 0.3735,
"step": 177000
},
{
"epoch": 0.14,
"learning_rate": 1.2822091698871432e-05,
"loss": 0.3736,
"step": 177500
},
{
"epoch": 0.14,
"learning_rate": 1.2699906443769858e-05,
"loss": 0.3721,
"step": 178000
},
{
"epoch": 0.14,
"learning_rate": 1.2580107064687531e-05,
"loss": 0.3737,
"step": 178500
},
{
"epoch": 0.14,
"learning_rate": 1.2462948440006997e-05,
"loss": 0.3731,
"step": 179000
},
{
"epoch": 0.15,
"learning_rate": 1.2348437796624094e-05,
"loss": 0.3722,
"step": 179500
},
{
"epoch": 0.15,
"learning_rate": 1.2236582198094697e-05,
"loss": 0.3728,
"step": 180000
},
{
"epoch": 0.15,
"eval_loss": 0.34677574038505554,
"eval_runtime": 288.6592,
"eval_samples_per_second": 148.965,
"eval_steps_per_second": 2.328,
"step": 180000
},
{
"epoch": 0.15,
"learning_rate": 1.2127388544199013e-05,
"loss": 0.378,
"step": 180500
},
{
"epoch": 0.15,
"learning_rate": 1.2020863570515961e-05,
"loss": 0.3783,
"step": 181000
},
{
"epoch": 0.16,
"learning_rate": 1.1917218873266704e-05,
"loss": 0.3774,
"step": 181500
},
{
"epoch": 0.16,
"learning_rate": 1.1816245104688946e-05,
"loss": 0.3768,
"step": 182000
},
{
"epoch": 0.16,
"learning_rate": 1.1717754173131136e-05,
"loss": 0.378,
"step": 182500
},
{
"epoch": 0.17,
"learning_rate": 1.162195718996353e-05,
"loss": 0.3775,
"step": 183000
},
{
"epoch": 0.17,
"learning_rate": 1.1528860064395268e-05,
"loss": 0.3778,
"step": 183500
},
{
"epoch": 0.17,
"learning_rate": 1.14384685390956e-05,
"loss": 0.377,
"step": 184000
},
{
"epoch": 0.17,
"learning_rate": 1.1350788189839584e-05,
"loss": 0.3769,
"step": 184500
},
{
"epoch": 0.17,
"learning_rate": 1.126582442516417e-05,
"loss": 0.3779,
"step": 185000
},
{
"epoch": 0.17,
"eval_loss": 0.3469138443470001,
"eval_runtime": 287.4474,
"eval_samples_per_second": 149.593,
"eval_steps_per_second": 2.338,
"step": 185000
},
{
"epoch": 0.18,
"learning_rate": 1.1183582486034581e-05,
"loss": 0.3766,
"step": 185500
},
{
"epoch": 0.18,
"learning_rate": 1.1104067445521018e-05,
"loss": 0.3776,
"step": 186000
},
{
"epoch": 0.18,
"learning_rate": 1.102728420848572e-05,
"loss": 0.3772,
"step": 186500
},
{
"epoch": 0.18,
"learning_rate": 1.0953237511280449e-05,
"loss": 0.3769,
"step": 187000
},
{
"epoch": 0.19,
"learning_rate": 1.0881931921454253e-05,
"loss": 0.3776,
"step": 187500
},
{
"epoch": 0.19,
"learning_rate": 1.0813506214785774e-05,
"loss": 0.3769,
"step": 188000
},
{
"epoch": 0.19,
"learning_rate": 1.0747690362178142e-05,
"loss": 0.377,
"step": 188500
},
{
"epoch": 0.2,
"learning_rate": 1.0684628296065977e-05,
"loss": 0.3765,
"step": 189000
},
{
"epoch": 0.2,
"learning_rate": 1.0624323906414552e-05,
"loss": 0.376,
"step": 189500
},
{
"epoch": 0.2,
"learning_rate": 1.0566780913082688e-05,
"loss": 0.3777,
"step": 190000
},
{
"epoch": 0.2,
"eval_loss": 0.34515419602394104,
"eval_runtime": 275.3559,
"eval_samples_per_second": 156.162,
"eval_steps_per_second": 2.44,
"step": 190000
}
],
"max_steps": 200000,
"num_train_epochs": 9223372036854775807,
"total_flos": 4.4743682799304704e+21,
"trial_name": null,
"trial_params": null
}