{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 7815, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06397952655150352, "grad_norm": 1.344713568687439, "learning_rate": 0.00029616122840690973, "loss": 2.6677, "step": 100 }, { "epoch": 0.12795905310300704, "grad_norm": 1.3358043432235718, "learning_rate": 0.00029232245681381954, "loss": 2.2932, "step": 200 }, { "epoch": 0.19193857965451055, "grad_norm": 1.3450862169265747, "learning_rate": 0.0002885220729366602, "loss": 2.2158, "step": 300 }, { "epoch": 0.2559181062060141, "grad_norm": 1.1904783248901367, "learning_rate": 0.00028468330134357004, "loss": 2.1907, "step": 400 }, { "epoch": 0.3198976327575176, "grad_norm": 1.3194555044174194, "learning_rate": 0.00028084452975047985, "loss": 2.1725, "step": 500 }, { "epoch": 0.3838771593090211, "grad_norm": 1.3454678058624268, "learning_rate": 0.0002770057581573896, "loss": 2.1586, "step": 600 }, { "epoch": 0.44785668586052463, "grad_norm": 1.318642497062683, "learning_rate": 0.0002731669865642994, "loss": 2.1429, "step": 700 }, { "epoch": 0.5118362124120281, "grad_norm": 1.1936390399932861, "learning_rate": 0.0002693282149712092, "loss": 2.0988, "step": 800 }, { "epoch": 0.5758157389635317, "grad_norm": 1.1406426429748535, "learning_rate": 0.000265489443378119, "loss": 2.0834, "step": 900 }, { "epoch": 0.6397952655150352, "grad_norm": 1.2195415496826172, "learning_rate": 0.0002616506717850288, "loss": 2.1004, "step": 1000 }, { "epoch": 0.7037747920665387, "grad_norm": 1.2042839527130127, "learning_rate": 0.00025781190019193856, "loss": 2.0883, "step": 1100 }, { "epoch": 0.7677543186180422, "grad_norm": 1.159279465675354, "learning_rate": 0.0002539731285988483, "loss": 2.0852, "step": 1200 }, { "epoch": 0.8317338451695457, "grad_norm": 1.2210711240768433, "learning_rate": 0.00025013435700575813, "loss": 2.0449, "step": 1300 }, { "epoch": 0.8957133717210493, "grad_norm": 1.2941797971725464, "learning_rate": 0.00024629558541266794, "loss": 2.0455, "step": 1400 }, { "epoch": 0.9596928982725528, "grad_norm": 1.1550540924072266, "learning_rate": 0.00024245681381957772, "loss": 2.0293, "step": 1500 }, { "epoch": 1.0236724248240563, "grad_norm": 1.3520652055740356, "learning_rate": 0.0002386564299424184, "loss": 1.9799, "step": 1600 }, { "epoch": 1.0876519513755598, "grad_norm": 1.375148892402649, "learning_rate": 0.0002348176583493282, "loss": 1.9032, "step": 1700 }, { "epoch": 1.1516314779270633, "grad_norm": 1.4116652011871338, "learning_rate": 0.00023097888675623797, "loss": 1.9398, "step": 1800 }, { "epoch": 1.2156110044785668, "grad_norm": 1.2254273891448975, "learning_rate": 0.00022714011516314776, "loss": 1.9097, "step": 1900 }, { "epoch": 1.2795905310300704, "grad_norm": 1.3888587951660156, "learning_rate": 0.00022330134357005757, "loss": 1.9039, "step": 2000 }, { "epoch": 1.3435700575815739, "grad_norm": 1.4431171417236328, "learning_rate": 0.00021946257197696736, "loss": 1.9095, "step": 2100 }, { "epoch": 1.4075495841330774, "grad_norm": 1.2467221021652222, "learning_rate": 0.00021562380038387714, "loss": 1.9103, "step": 2200 }, { "epoch": 1.471529110684581, "grad_norm": 1.41363525390625, "learning_rate": 0.00021178502879078693, "loss": 1.9132, "step": 2300 }, { "epoch": 1.5355086372360844, "grad_norm": 1.4501458406448364, "learning_rate": 0.0002079462571976967, "loss": 1.9095, "step": 2400 }, { "epoch": 1.599488163787588, "grad_norm": 1.2889657020568848, "learning_rate": 0.00020410748560460652, "loss": 1.9308, "step": 2500 }, { "epoch": 1.6634676903390915, "grad_norm": 1.4488581418991089, "learning_rate": 0.0002002687140115163, "loss": 1.9161, "step": 2600 }, { "epoch": 1.727447216890595, "grad_norm": 1.3905428647994995, "learning_rate": 0.0001964299424184261, "loss": 1.8958, "step": 2700 }, { "epoch": 1.7914267434420985, "grad_norm": 1.3509632349014282, "learning_rate": 0.00019259117082533588, "loss": 1.8933, "step": 2800 }, { "epoch": 1.855406269993602, "grad_norm": 1.3293097019195557, "learning_rate": 0.00018875239923224566, "loss": 1.8902, "step": 2900 }, { "epoch": 1.9193857965451055, "grad_norm": 1.3575371503829956, "learning_rate": 0.00018491362763915547, "loss": 1.9107, "step": 3000 }, { "epoch": 1.983365323096609, "grad_norm": 1.4029614925384521, "learning_rate": 0.00018107485604606526, "loss": 1.9122, "step": 3100 }, { "epoch": 2.0473448496481126, "grad_norm": 1.6483345031738281, "learning_rate": 0.00017723608445297504, "loss": 1.7797, "step": 3200 }, { "epoch": 2.111324376199616, "grad_norm": 1.7276026010513306, "learning_rate": 0.00017339731285988483, "loss": 1.7411, "step": 3300 }, { "epoch": 2.1753039027511196, "grad_norm": 1.6449826955795288, "learning_rate": 0.00016955854126679461, "loss": 1.7355, "step": 3400 }, { "epoch": 2.239283429302623, "grad_norm": 1.606766939163208, "learning_rate": 0.00016571976967370443, "loss": 1.7479, "step": 3500 }, { "epoch": 2.3032629558541267, "grad_norm": 1.6743805408477783, "learning_rate": 0.0001618809980806142, "loss": 1.7376, "step": 3600 }, { "epoch": 2.36724248240563, "grad_norm": 1.58048415184021, "learning_rate": 0.00015804222648752397, "loss": 1.7499, "step": 3700 }, { "epoch": 2.4312220089571337, "grad_norm": 1.7509996891021729, "learning_rate": 0.00015420345489443375, "loss": 1.741, "step": 3800 }, { "epoch": 2.495201535508637, "grad_norm": 1.6279881000518799, "learning_rate": 0.00015036468330134354, "loss": 1.7584, "step": 3900 }, { "epoch": 2.5591810620601407, "grad_norm": 1.6708228588104248, "learning_rate": 0.00014652591170825335, "loss": 1.7505, "step": 4000 }, { "epoch": 2.6231605886116443, "grad_norm": 1.628318428993225, "learning_rate": 0.00014268714011516314, "loss": 1.7535, "step": 4100 }, { "epoch": 2.6871401151631478, "grad_norm": 1.66116464138031, "learning_rate": 0.00013884836852207292, "loss": 1.7534, "step": 4200 }, { "epoch": 2.7511196417146513, "grad_norm": 1.7303767204284668, "learning_rate": 0.0001350095969289827, "loss": 1.7605, "step": 4300 }, { "epoch": 2.815099168266155, "grad_norm": 1.6892797946929932, "learning_rate": 0.00013117082533589252, "loss": 1.7343, "step": 4400 }, { "epoch": 2.8790786948176583, "grad_norm": 1.700649380683899, "learning_rate": 0.0001273320537428023, "loss": 1.7545, "step": 4500 }, { "epoch": 2.943058221369162, "grad_norm": 1.7158896923065186, "learning_rate": 0.0001234932821497121, "loss": 1.7472, "step": 4600 }, { "epoch": 3.0070377479206654, "grad_norm": 1.5952404737472534, "learning_rate": 0.00011965451055662187, "loss": 1.709, "step": 4700 }, { "epoch": 3.071017274472169, "grad_norm": 1.8965271711349487, "learning_rate": 0.00011581573896353166, "loss": 1.5308, "step": 4800 }, { "epoch": 3.1349968010236724, "grad_norm": 1.9957573413848877, "learning_rate": 0.00011197696737044146, "loss": 1.564, "step": 4900 }, { "epoch": 3.198976327575176, "grad_norm": 2.0544333457946777, "learning_rate": 0.00010813819577735124, "loss": 1.5668, "step": 5000 }, { "epoch": 3.2629558541266794, "grad_norm": 2.041703462600708, "learning_rate": 0.00010429942418426103, "loss": 1.5766, "step": 5100 }, { "epoch": 3.326935380678183, "grad_norm": 2.300631284713745, "learning_rate": 0.00010046065259117082, "loss": 1.5701, "step": 5200 }, { "epoch": 3.3909149072296865, "grad_norm": 1.9454134702682495, "learning_rate": 9.662188099808061e-05, "loss": 1.5701, "step": 5300 }, { "epoch": 3.45489443378119, "grad_norm": 2.113377571105957, "learning_rate": 9.278310940499041e-05, "loss": 1.5882, "step": 5400 }, { "epoch": 3.5188739603326935, "grad_norm": 2.2492353916168213, "learning_rate": 8.894433781190018e-05, "loss": 1.5778, "step": 5500 }, { "epoch": 3.582853486884197, "grad_norm": 2.1024489402770996, "learning_rate": 8.510556621880996e-05, "loss": 1.5926, "step": 5600 }, { "epoch": 3.6468330134357005, "grad_norm": 2.1116743087768555, "learning_rate": 8.126679462571976e-05, "loss": 1.5937, "step": 5700 }, { "epoch": 3.710812539987204, "grad_norm": 2.013080596923828, "learning_rate": 7.742802303262955e-05, "loss": 1.5913, "step": 5800 }, { "epoch": 3.7747920665387076, "grad_norm": 2.1557400226593018, "learning_rate": 7.358925143953934e-05, "loss": 1.6041, "step": 5900 }, { "epoch": 3.838771593090211, "grad_norm": 2.10186767578125, "learning_rate": 6.975047984644913e-05, "loss": 1.5799, "step": 6000 }, { "epoch": 3.9027511196417146, "grad_norm": 2.129519462585449, "learning_rate": 6.591170825335893e-05, "loss": 1.5946, "step": 6100 }, { "epoch": 3.966730646193218, "grad_norm": 2.045646905899048, "learning_rate": 6.20729366602687e-05, "loss": 1.5882, "step": 6200 }, { "epoch": 4.030710172744722, "grad_norm": 2.2427146434783936, "learning_rate": 5.82341650671785e-05, "loss": 1.5005, "step": 6300 }, { "epoch": 4.094689699296225, "grad_norm": 2.2632296085357666, "learning_rate": 5.439539347408829e-05, "loss": 1.3888, "step": 6400 }, { "epoch": 4.158669225847729, "grad_norm": 2.541220188140869, "learning_rate": 5.0556621880998075e-05, "loss": 1.4124, "step": 6500 }, { "epoch": 4.222648752399232, "grad_norm": 2.566311836242676, "learning_rate": 4.6717850287907866e-05, "loss": 1.409, "step": 6600 }, { "epoch": 4.286628278950736, "grad_norm": 2.425945281982422, "learning_rate": 4.287907869481765e-05, "loss": 1.4134, "step": 6700 }, { "epoch": 4.350607805502239, "grad_norm": 2.4377615451812744, "learning_rate": 3.904030710172744e-05, "loss": 1.4257, "step": 6800 }, { "epoch": 4.414587332053743, "grad_norm": 2.6660194396972656, "learning_rate": 3.5201535508637234e-05, "loss": 1.4288, "step": 6900 }, { "epoch": 4.478566858605246, "grad_norm": 2.393036365509033, "learning_rate": 3.1362763915547026e-05, "loss": 1.4182, "step": 7000 }, { "epoch": 4.54254638515675, "grad_norm": 2.6361422538757324, "learning_rate": 2.752399232245681e-05, "loss": 1.4149, "step": 7100 }, { "epoch": 4.606525911708253, "grad_norm": 2.6104772090911865, "learning_rate": 2.3685220729366603e-05, "loss": 1.407, "step": 7200 }, { "epoch": 4.670505438259757, "grad_norm": 2.4266579151153564, "learning_rate": 1.9846449136276387e-05, "loss": 1.4113, "step": 7300 }, { "epoch": 4.73448496481126, "grad_norm": 2.5348973274230957, "learning_rate": 1.600767754318618e-05, "loss": 1.3936, "step": 7400 }, { "epoch": 4.798464491362764, "grad_norm": 2.3764045238494873, "learning_rate": 1.2168905950095967e-05, "loss": 1.4097, "step": 7500 }, { "epoch": 4.862444017914267, "grad_norm": 2.4267590045928955, "learning_rate": 8.330134357005757e-06, "loss": 1.4209, "step": 7600 }, { "epoch": 4.926423544465771, "grad_norm": 2.7150962352752686, "learning_rate": 4.4913627639155465e-06, "loss": 1.4148, "step": 7700 }, { "epoch": 4.990403071017274, "grad_norm": 2.550471067428589, "learning_rate": 6.525911708253358e-07, "loss": 1.4254, "step": 7800 } ], "logging_steps": 100, "max_steps": 7815, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.2180160877992346e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }