{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10000, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004, "grad_norm": 28.533812348552967, "learning_rate": 2e-09, "logits/chosen": 64.40786743164062, "logits/rejected": 45.84376525878906, "logps/chosen": -100.49942779541016, "logps/rejected": -65.1117172241211, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.004, "grad_norm": 30.859731079131965, "learning_rate": 2e-08, "logits/chosen": 57.91438674926758, "logits/rejected": 56.28672790527344, "logps/chosen": -55.96101760864258, "logps/rejected": -68.71676635742188, "loss": 0.6919, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": -0.007306650280952454, "rewards/margins": -0.014283686876296997, "rewards/rejected": 0.006977038457989693, "step": 10 }, { "epoch": 0.008, "grad_norm": 27.85742234963813, "learning_rate": 4e-08, "logits/chosen": 58.18536376953125, "logits/rejected": 55.3450927734375, "logps/chosen": -54.76381301879883, "logps/rejected": -60.58549880981445, "loss": 0.6953, "rewards/accuracies": 0.5, "rewards/chosen": 0.011082855053246021, "rewards/margins": 0.0007482476648874581, "rewards/rejected": 0.010334606282413006, "step": 20 }, { "epoch": 0.012, "grad_norm": 23.673728380473765, "learning_rate": 6e-08, "logits/chosen": 58.1522102355957, "logits/rejected": 57.0158576965332, "logps/chosen": -57.9251823425293, "logps/rejected": -69.07908630371094, "loss": 0.6949, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.008253919892013073, "rewards/margins": 0.016202187165617943, "rewards/rejected": -0.007948270067572594, "step": 30 }, { "epoch": 0.016, "grad_norm": 28.67344451856464, "learning_rate": 8e-08, "logits/chosen": 58.062538146972656, "logits/rejected": 57.1534309387207, "logps/chosen": -61.0693473815918, "logps/rejected": -73.38080596923828, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": 0.019384900107979774, "rewards/margins": 0.0025020535103976727, "rewards/rejected": 0.016882847994565964, "step": 40 }, { "epoch": 0.02, "grad_norm": 31.647118654717907, "learning_rate": 1e-07, "logits/chosen": 59.191986083984375, "logits/rejected": 53.941322326660156, "logps/chosen": -56.7202033996582, "logps/rejected": -66.81363677978516, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.009077167138457298, "rewards/margins": 0.010364162735641003, "rewards/rejected": -0.0012869939673691988, "step": 50 }, { "epoch": 0.024, "grad_norm": 24.669256574908005, "learning_rate": 1.2e-07, "logits/chosen": 56.26934051513672, "logits/rejected": 56.80645751953125, "logps/chosen": -58.25422286987305, "logps/rejected": -62.55634689331055, "loss": 0.6951, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00721455505117774, "rewards/margins": -0.0019339373102411628, "rewards/rejected": -0.005280619021505117, "step": 60 }, { "epoch": 0.028, "grad_norm": 26.74384143919655, "learning_rate": 1.4e-07, "logits/chosen": 58.523521423339844, "logits/rejected": 55.622764587402344, "logps/chosen": -54.381553649902344, "logps/rejected": -60.544677734375, "loss": 0.6959, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.024661114439368248, "rewards/margins": -0.0074427248910069466, "rewards/rejected": -0.017218390479683876, "step": 70 }, { "epoch": 0.032, "grad_norm": 31.840440623804586, "learning_rate": 1.6e-07, "logits/chosen": 54.36933135986328, "logits/rejected": 59.20940017700195, "logps/chosen": -58.11821365356445, "logps/rejected": -70.71514892578125, "loss": 0.7036, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.01636091247200966, "rewards/margins": -0.008592168800532818, "rewards/rejected": -0.007768744137138128, "step": 80 }, { "epoch": 0.036, "grad_norm": 34.438677762408695, "learning_rate": 1.8e-07, "logits/chosen": 57.025726318359375, "logits/rejected": 58.043739318847656, "logps/chosen": -54.304481506347656, "logps/rejected": -68.1335678100586, "loss": 0.6923, "rewards/accuracies": 0.5, "rewards/chosen": -0.009257683530449867, "rewards/margins": -0.007113204337656498, "rewards/rejected": -0.002144479425624013, "step": 90 }, { "epoch": 0.04, "grad_norm": 27.55087060204859, "learning_rate": 2e-07, "logits/chosen": 58.55786895751953, "logits/rejected": 55.395263671875, "logps/chosen": -63.9180793762207, "logps/rejected": -65.21315002441406, "loss": 0.6993, "rewards/accuracies": 0.5, "rewards/chosen": -0.010401496663689613, "rewards/margins": 0.014750251546502113, "rewards/rejected": -0.025151750072836876, "step": 100 }, { "epoch": 0.044, "grad_norm": 34.47770001073974, "learning_rate": 2.1999999999999998e-07, "logits/chosen": 56.813232421875, "logits/rejected": 56.03766632080078, "logps/chosen": -58.6301155090332, "logps/rejected": -74.81730651855469, "loss": 0.6944, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.02511642314493656, "rewards/margins": 0.00451306626200676, "rewards/rejected": -0.02962948940694332, "step": 110 }, { "epoch": 0.048, "grad_norm": 30.953399050687445, "learning_rate": 2.4e-07, "logits/chosen": 57.8872184753418, "logits/rejected": 57.22182083129883, "logps/chosen": -59.11281204223633, "logps/rejected": -70.58454895019531, "loss": 0.6964, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04789043590426445, "rewards/margins": 0.009716681204736233, "rewards/rejected": -0.057607125490903854, "step": 120 }, { "epoch": 0.052, "grad_norm": 28.466689861082365, "learning_rate": 2.6e-07, "logits/chosen": 57.37153244018555, "logits/rejected": 56.422645568847656, "logps/chosen": -62.57783889770508, "logps/rejected": -73.04331970214844, "loss": 0.6871, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.023358307778835297, "rewards/margins": 0.020040828734636307, "rewards/rejected": -0.043399132788181305, "step": 130 }, { "epoch": 0.056, "grad_norm": 28.47666983148757, "learning_rate": 2.8e-07, "logits/chosen": 56.3998908996582, "logits/rejected": 56.23912811279297, "logps/chosen": -54.915016174316406, "logps/rejected": -70.63322448730469, "loss": 0.6795, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04837559536099434, "rewards/margins": 0.03748052567243576, "rewards/rejected": -0.0858561173081398, "step": 140 }, { "epoch": 0.06, "grad_norm": 31.41163403514937, "learning_rate": 3e-07, "logits/chosen": 58.12103271484375, "logits/rejected": 56.27421188354492, "logps/chosen": -63.10313034057617, "logps/rejected": -66.35491943359375, "loss": 0.6908, "rewards/accuracies": 0.5, "rewards/chosen": -0.08965396881103516, "rewards/margins": -0.004495841450989246, "rewards/rejected": -0.08515812456607819, "step": 150 }, { "epoch": 0.064, "grad_norm": 26.840900506278157, "learning_rate": 3.2e-07, "logits/chosen": 55.839454650878906, "logits/rejected": 58.34368133544922, "logps/chosen": -50.59211349487305, "logps/rejected": -73.50721740722656, "loss": 0.6767, "rewards/accuracies": 0.625, "rewards/chosen": -0.06941857188940048, "rewards/margins": 0.04276493936777115, "rewards/rejected": -0.11218351125717163, "step": 160 }, { "epoch": 0.068, "grad_norm": 27.292091726963864, "learning_rate": 3.4000000000000003e-07, "logits/chosen": 57.70301055908203, "logits/rejected": 56.25288009643555, "logps/chosen": -63.054412841796875, "logps/rejected": -68.32513427734375, "loss": 0.6661, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07662694901227951, "rewards/margins": 0.05604536086320877, "rewards/rejected": -0.13267230987548828, "step": 170 }, { "epoch": 0.072, "grad_norm": 30.73299858141547, "learning_rate": 3.6e-07, "logits/chosen": 56.193214416503906, "logits/rejected": 58.65944290161133, "logps/chosen": -59.230201721191406, "logps/rejected": -72.97864532470703, "loss": 0.6541, "rewards/accuracies": 0.75, "rewards/chosen": -0.11543898284435272, "rewards/margins": 0.10617627948522568, "rewards/rejected": -0.221615269780159, "step": 180 }, { "epoch": 0.076, "grad_norm": 27.159221184778563, "learning_rate": 3.7999999999999996e-07, "logits/chosen": 57.91032791137695, "logits/rejected": 55.99603271484375, "logps/chosen": -59.06760787963867, "logps/rejected": -74.1017837524414, "loss": 0.642, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12510164082050323, "rewards/margins": 0.09499747306108475, "rewards/rejected": -0.22009912133216858, "step": 190 }, { "epoch": 0.08, "grad_norm": 27.221778714068638, "learning_rate": 4e-07, "logits/chosen": 58.486793518066406, "logits/rejected": 55.91951370239258, "logps/chosen": -58.47349166870117, "logps/rejected": -62.717193603515625, "loss": 0.6393, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12836897373199463, "rewards/margins": 0.1229625716805458, "rewards/rejected": -0.25133153796195984, "step": 200 }, { "epoch": 0.084, "grad_norm": 29.03653153648935, "learning_rate": 4.1999999999999995e-07, "logits/chosen": 57.81336212158203, "logits/rejected": 58.15123748779297, "logps/chosen": -61.33382034301758, "logps/rejected": -74.703125, "loss": 0.6376, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.23162539303302765, "rewards/margins": 0.11328538507223129, "rewards/rejected": -0.34491077065467834, "step": 210 }, { "epoch": 0.088, "grad_norm": 26.95404534183593, "learning_rate": 4.3999999999999997e-07, "logits/chosen": 57.90010452270508, "logits/rejected": 54.545555114746094, "logps/chosen": -63.3861198425293, "logps/rejected": -69.0276107788086, "loss": 0.6285, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.22368836402893066, "rewards/margins": 0.12568101286888123, "rewards/rejected": -0.3493694067001343, "step": 220 }, { "epoch": 0.092, "grad_norm": 26.883373858805115, "learning_rate": 4.6e-07, "logits/chosen": 56.64574432373047, "logits/rejected": 55.44426345825195, "logps/chosen": -54.02278518676758, "logps/rejected": -70.68146514892578, "loss": 0.608, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3271670937538147, "rewards/margins": 0.13781145215034485, "rewards/rejected": -0.46497854590415955, "step": 230 }, { "epoch": 0.096, "grad_norm": 26.103399727840255, "learning_rate": 4.8e-07, "logits/chosen": 56.06645965576172, "logits/rejected": 53.25556182861328, "logps/chosen": -59.9603271484375, "logps/rejected": -73.2823257446289, "loss": 0.6089, "rewards/accuracies": 0.75, "rewards/chosen": -0.3069990277290344, "rewards/margins": 0.23678632080554962, "rewards/rejected": -0.5437853336334229, "step": 240 }, { "epoch": 0.1, "grad_norm": 23.722260965256044, "learning_rate": 5e-07, "logits/chosen": 56.92375946044922, "logits/rejected": 55.26447677612305, "logps/chosen": -69.1079330444336, "logps/rejected": -73.78802490234375, "loss": 0.6297, "rewards/accuracies": 0.625, "rewards/chosen": -0.4947397708892822, "rewards/margins": 0.14389055967330933, "rewards/rejected": -0.6386303305625916, "step": 250 }, { "epoch": 0.104, "grad_norm": 25.422708939657227, "learning_rate": 4.99975631002326e-07, "logits/chosen": 54.239952087402344, "logits/rejected": 54.537017822265625, "logps/chosen": -58.73674392700195, "logps/rejected": -69.70175170898438, "loss": 0.5727, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4754973351955414, "rewards/margins": 0.28132364153862, "rewards/rejected": -0.7568209767341614, "step": 260 }, { "epoch": 0.108, "grad_norm": 26.17681951761786, "learning_rate": 4.999025287600885e-07, "logits/chosen": 56.4760627746582, "logits/rejected": 54.777679443359375, "logps/chosen": -68.2262954711914, "logps/rejected": -77.97975158691406, "loss": 0.558, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5877800583839417, "rewards/margins": 0.398207426071167, "rewards/rejected": -0.9859874844551086, "step": 270 }, { "epoch": 0.112, "grad_norm": 20.299221772759086, "learning_rate": 4.997807075247145e-07, "logits/chosen": 55.838172912597656, "logits/rejected": 55.217002868652344, "logps/chosen": -58.478309631347656, "logps/rejected": -74.65936279296875, "loss": 0.4985, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5783756971359253, "rewards/margins": 0.5172783136367798, "rewards/rejected": -1.095654010772705, "step": 280 }, { "epoch": 0.116, "grad_norm": 28.36770834985096, "learning_rate": 4.996101910454953e-07, "logits/chosen": 56.4325065612793, "logits/rejected": 54.16033172607422, "logps/chosen": -69.69315338134766, "logps/rejected": -79.90055847167969, "loss": 0.5267, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7261027097702026, "rewards/margins": 0.5981040596961975, "rewards/rejected": -1.324206829071045, "step": 290 }, { "epoch": 0.12, "grad_norm": 25.02377762418046, "learning_rate": 4.99391012564956e-07, "logits/chosen": 53.800933837890625, "logits/rejected": 55.656532287597656, "logps/chosen": -66.37216186523438, "logps/rejected": -82.07975769042969, "loss": 0.4772, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7666959762573242, "rewards/margins": 0.5859344005584717, "rewards/rejected": -1.352630376815796, "step": 300 }, { "epoch": 0.124, "grad_norm": 29.446782373035884, "learning_rate": 4.991232148123761e-07, "logits/chosen": 57.31342697143555, "logits/rejected": 54.00205612182617, "logps/chosen": -72.55195617675781, "logps/rejected": -85.0060806274414, "loss": 0.4842, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.089627742767334, "rewards/margins": 0.5522868633270264, "rewards/rejected": -1.64191472530365, "step": 310 }, { "epoch": 0.128, "grad_norm": 31.47681515659833, "learning_rate": 4.988068499954577e-07, "logits/chosen": 55.69438934326172, "logits/rejected": 54.8391227722168, "logps/chosen": -70.90913391113281, "logps/rejected": -82.49961853027344, "loss": 0.5177, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8803223371505737, "rewards/margins": 0.687156081199646, "rewards/rejected": -1.5674786567687988, "step": 320 }, { "epoch": 0.132, "grad_norm": 29.597337732185277, "learning_rate": 4.984419797901491e-07, "logits/chosen": 53.4282112121582, "logits/rejected": 54.79206466674805, "logps/chosen": -69.86516571044922, "logps/rejected": -90.27011108398438, "loss": 0.4469, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9048943519592285, "rewards/margins": 1.1756442785263062, "rewards/rejected": -2.080538511276245, "step": 330 }, { "epoch": 0.136, "grad_norm": 17.009359229526957, "learning_rate": 4.980286753286194e-07, "logits/chosen": 52.42332077026367, "logits/rejected": 57.04686737060547, "logps/chosen": -64.6177749633789, "logps/rejected": -89.59310150146484, "loss": 0.4411, "rewards/accuracies": 0.875, "rewards/chosen": -1.1452090740203857, "rewards/margins": 0.9245441555976868, "rewards/rejected": -2.0697531700134277, "step": 340 }, { "epoch": 0.14, "grad_norm": 25.909018906834667, "learning_rate": 4.975670171853925e-07, "logits/chosen": 56.42598342895508, "logits/rejected": 53.42926025390625, "logps/chosen": -69.07212829589844, "logps/rejected": -84.19108581542969, "loss": 0.4175, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2340478897094727, "rewards/margins": 1.0201036930084229, "rewards/rejected": -2.2541518211364746, "step": 350 }, { "epoch": 0.144, "grad_norm": 27.75848608177815, "learning_rate": 4.970570953616382e-07, "logits/chosen": 53.7008056640625, "logits/rejected": 55.94395065307617, "logps/chosen": -66.99665832519531, "logps/rejected": -87.76789855957031, "loss": 0.4435, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3015058040618896, "rewards/margins": 0.9406806826591492, "rewards/rejected": -2.2421865463256836, "step": 360 }, { "epoch": 0.148, "grad_norm": 24.273863705867576, "learning_rate": 4.964990092676262e-07, "logits/chosen": 49.97655487060547, "logits/rejected": 58.69891357421875, "logps/chosen": -68.9350814819336, "logps/rejected": -98.9703369140625, "loss": 0.3817, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4265937805175781, "rewards/margins": 1.1566370725631714, "rewards/rejected": -2.583230495452881, "step": 370 }, { "epoch": 0.152, "grad_norm": 37.00653796376565, "learning_rate": 4.958928677033465e-07, "logits/chosen": 55.870933532714844, "logits/rejected": 54.31379318237305, "logps/chosen": -77.0854263305664, "logps/rejected": -90.93013000488281, "loss": 0.4856, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7595773935317993, "rewards/margins": 0.8833327293395996, "rewards/rejected": -2.6429102420806885, "step": 380 }, { "epoch": 0.156, "grad_norm": 28.256248306216513, "learning_rate": 4.952387888372978e-07, "logits/chosen": 53.375587463378906, "logits/rejected": 52.61153030395508, "logps/chosen": -78.0414810180664, "logps/rejected": -98.44496154785156, "loss": 0.38, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7100744247436523, "rewards/margins": 1.1639991998672485, "rewards/rejected": -2.8740735054016113, "step": 390 }, { "epoch": 0.16, "grad_norm": 29.2072173920723, "learning_rate": 4.945369001834514e-07, "logits/chosen": 51.90997314453125, "logits/rejected": 53.769920349121094, "logps/chosen": -66.5736083984375, "logps/rejected": -94.8015365600586, "loss": 0.4308, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6438815593719482, "rewards/margins": 1.1793160438537598, "rewards/rejected": -2.823197841644287, "step": 400 }, { "epoch": 0.164, "grad_norm": 20.246428107882906, "learning_rate": 4.937873385763907e-07, "logits/chosen": 51.662147521972656, "logits/rejected": 52.028175354003906, "logps/chosen": -79.66545867919922, "logps/rejected": -107.46966552734375, "loss": 0.394, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8286956548690796, "rewards/margins": 1.288888931274414, "rewards/rejected": -3.117584705352783, "step": 410 }, { "epoch": 0.168, "grad_norm": 27.307865717439135, "learning_rate": 4.929902501446366e-07, "logits/chosen": 50.78424835205078, "logits/rejected": 50.39312744140625, "logps/chosen": -75.19126892089844, "logps/rejected": -95.62612915039062, "loss": 0.4315, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.0071020126342773, "rewards/margins": 1.3031747341156006, "rewards/rejected": -3.310276746749878, "step": 420 }, { "epoch": 0.172, "grad_norm": 20.03898118642927, "learning_rate": 4.921457902821578e-07, "logits/chosen": 53.468177795410156, "logits/rejected": 53.30914306640625, "logps/chosen": -82.42778015136719, "logps/rejected": -109.83222961425781, "loss": 0.3376, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6454699039459229, "rewards/margins": 1.615443468093872, "rewards/rejected": -3.260913133621216, "step": 430 }, { "epoch": 0.176, "grad_norm": 30.325957397734495, "learning_rate": 4.912541236180778e-07, "logits/chosen": 51.10878372192383, "logits/rejected": 53.10695266723633, "logps/chosen": -74.79537200927734, "logps/rejected": -100.06138610839844, "loss": 0.3929, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8177331686019897, "rewards/margins": 1.257211446762085, "rewards/rejected": -3.0749447345733643, "step": 440 }, { "epoch": 0.18, "grad_norm": 14.262220601317786, "learning_rate": 4.903154239845797e-07, "logits/chosen": 51.69508743286133, "logits/rejected": 52.68989181518555, "logps/chosen": -68.33168029785156, "logps/rejected": -107.64765930175781, "loss": 0.3691, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.747357726097107, "rewards/margins": 1.74077570438385, "rewards/rejected": -3.488133192062378, "step": 450 }, { "epoch": 0.184, "grad_norm": 22.979886521568336, "learning_rate": 4.893298743830167e-07, "logits/chosen": 52.4492301940918, "logits/rejected": 53.06267547607422, "logps/chosen": -81.17327117919922, "logps/rejected": -111.0452651977539, "loss": 0.3741, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.073838710784912, "rewards/margins": 1.7042049169540405, "rewards/rejected": -3.778043031692505, "step": 460 }, { "epoch": 0.188, "grad_norm": 14.722797570982818, "learning_rate": 4.882976669482367e-07, "logits/chosen": 50.58995819091797, "logits/rejected": 51.509681701660156, "logps/chosen": -79.2942123413086, "logps/rejected": -102.22583770751953, "loss": 0.3359, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9251492023468018, "rewards/margins": 1.7899284362792969, "rewards/rejected": -3.7150776386260986, "step": 470 }, { "epoch": 0.192, "grad_norm": 14.245338009793953, "learning_rate": 4.872190029111241e-07, "logits/chosen": 50.895538330078125, "logits/rejected": 54.43767166137695, "logps/chosen": -79.74618530273438, "logps/rejected": -120.28243255615234, "loss": 0.2751, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.2277004718780518, "rewards/margins": 1.9976272583007812, "rewards/rejected": -4.225327491760254, "step": 480 }, { "epoch": 0.196, "grad_norm": 21.050016087172274, "learning_rate": 4.860940925593702e-07, "logits/chosen": 48.723018646240234, "logits/rejected": 50.939937591552734, "logps/chosen": -77.64906311035156, "logps/rejected": -111.71051025390625, "loss": 0.2957, "rewards/accuracies": 0.875, "rewards/chosen": -2.465684652328491, "rewards/margins": 2.0183236598968506, "rewards/rejected": -4.484008312225342, "step": 490 }, { "epoch": 0.2, "grad_norm": 50.320855504854826, "learning_rate": 4.849231551964771e-07, "logits/chosen": 51.236595153808594, "logits/rejected": 50.26428985595703, "logps/chosen": -92.86045837402344, "logps/rejected": -112.11564636230469, "loss": 0.3985, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.6628010272979736, "rewards/margins": 1.7543842792510986, "rewards/rejected": -4.417185306549072, "step": 500 }, { "epoch": 0.204, "grad_norm": 14.73313528703832, "learning_rate": 4.837064190990036e-07, "logits/chosen": 55.05743408203125, "logits/rejected": 50.69791793823242, "logps/chosen": -98.44610595703125, "logps/rejected": -122.69828796386719, "loss": 0.3647, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5547454357147217, "rewards/margins": 2.1788697242736816, "rewards/rejected": -4.733614921569824, "step": 510 }, { "epoch": 0.208, "grad_norm": 27.57905277549734, "learning_rate": 4.824441214720628e-07, "logits/chosen": 50.13386535644531, "logits/rejected": 50.777809143066406, "logps/chosen": -86.73146057128906, "logps/rejected": -112.5364990234375, "loss": 0.3732, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.320972442626953, "rewards/margins": 2.0467121601104736, "rewards/rejected": -4.367684364318848, "step": 520 }, { "epoch": 0.212, "grad_norm": 25.097188193390338, "learning_rate": 4.811365084030783e-07, "logits/chosen": 49.33405685424805, "logits/rejected": 50.48732376098633, "logps/chosen": -75.11846160888672, "logps/rejected": -104.57139587402344, "loss": 0.3868, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.4332354068756104, "rewards/margins": 1.9999496936798096, "rewards/rejected": -4.433184623718262, "step": 530 }, { "epoch": 0.216, "grad_norm": 32.7903194698391, "learning_rate": 4.797838348138086e-07, "logits/chosen": 50.36452102661133, "logits/rejected": 50.02265167236328, "logps/chosen": -90.46897888183594, "logps/rejected": -112.5868911743164, "loss": 0.3413, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6965606212615967, "rewards/margins": 1.6849288940429688, "rewards/rejected": -4.381489276885986, "step": 540 }, { "epoch": 0.22, "grad_norm": 24.213068673102388, "learning_rate": 4.783863644106502e-07, "logits/chosen": 52.17128372192383, "logits/rejected": 50.315284729003906, "logps/chosen": -93.4994125366211, "logps/rejected": -119.5452880859375, "loss": 0.3351, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.50398325920105, "rewards/margins": 2.50455904006958, "rewards/rejected": -5.008542537689209, "step": 550 }, { "epoch": 0.224, "grad_norm": 29.49589281477644, "learning_rate": 4.769443696332272e-07, "logits/chosen": 47.702674865722656, "logits/rejected": 50.343238830566406, "logps/chosen": -83.9803695678711, "logps/rejected": -123.3960189819336, "loss": 0.3181, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.564993381500244, "rewards/margins": 2.2127933502197266, "rewards/rejected": -4.7777862548828125, "step": 560 }, { "epoch": 0.228, "grad_norm": 19.69678290974152, "learning_rate": 4.7545813160127845e-07, "logits/chosen": 50.214874267578125, "logits/rejected": 48.206787109375, "logps/chosen": -90.08515930175781, "logps/rejected": -118.11824798583984, "loss": 0.4009, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.8223087787628174, "rewards/margins": 2.011645555496216, "rewards/rejected": -4.833954811096191, "step": 570 }, { "epoch": 0.232, "grad_norm": 13.246157822107891, "learning_rate": 4.739279400598532e-07, "logits/chosen": 51.175697326660156, "logits/rejected": 48.75572967529297, "logps/chosen": -82.0949935913086, "logps/rejected": -111.55613708496094, "loss": 0.3156, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.699707269668579, "rewards/margins": 1.895646333694458, "rewards/rejected": -4.595353126525879, "step": 580 }, { "epoch": 0.236, "grad_norm": 22.333620027716158, "learning_rate": 4.7235409332282436e-07, "logits/chosen": 49.49880599975586, "logits/rejected": 51.62006759643555, "logps/chosen": -73.26454162597656, "logps/rejected": -111.3040771484375, "loss": 0.3451, "rewards/accuracies": 0.875, "rewards/chosen": -2.179260730743408, "rewards/margins": 2.1387388706207275, "rewards/rejected": -4.317999839782715, "step": 590 }, { "epoch": 0.24, "grad_norm": 26.7757211390623, "learning_rate": 4.707368982147317e-07, "logits/chosen": 47.9814567565918, "logits/rejected": 51.40210723876953, "logps/chosen": -91.81172180175781, "logps/rejected": -128.8383331298828, "loss": 0.2713, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.9874844551086426, "rewards/margins": 2.1201720237731934, "rewards/rejected": -5.107656955718994, "step": 600 }, { "epoch": 0.244, "grad_norm": 40.79432104607236, "learning_rate": 4.6907667001096585e-07, "logits/chosen": 48.35408401489258, "logits/rejected": 47.354530334472656, "logps/chosen": -90.21234893798828, "logps/rejected": -118.80155944824219, "loss": 0.2689, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.0216469764709473, "rewards/margins": 2.228628635406494, "rewards/rejected": -5.250275611877441, "step": 610 }, { "epoch": 0.248, "grad_norm": 31.8455689759253, "learning_rate": 4.6737373237630473e-07, "logits/chosen": 48.34640121459961, "logits/rejected": 51.8663444519043, "logps/chosen": -85.09153747558594, "logps/rejected": -127.3193130493164, "loss": 0.3664, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.7675275802612305, "rewards/margins": 2.4404561519622803, "rewards/rejected": -5.207983493804932, "step": 620 }, { "epoch": 0.252, "grad_norm": 19.397624289021156, "learning_rate": 4.6562841730181435e-07, "logits/chosen": 48.62126159667969, "logits/rejected": 48.2672119140625, "logps/chosen": -90.38846588134766, "logps/rejected": -122.53373718261719, "loss": 0.3227, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.1435444355010986, "rewards/margins": 2.2223925590515137, "rewards/rejected": -5.365937232971191, "step": 630 }, { "epoch": 0.256, "grad_norm": 34.367583617826874, "learning_rate": 4.6384106504012665e-07, "logits/chosen": 48.072669982910156, "logits/rejected": 49.07495880126953, "logps/chosen": -93.60935974121094, "logps/rejected": -128.94163513183594, "loss": 0.2825, "rewards/accuracies": 0.875, "rewards/chosen": -3.1240580081939697, "rewards/margins": 2.519449472427368, "rewards/rejected": -5.64350700378418, "step": 640 }, { "epoch": 0.26, "grad_norm": 27.938891622888274, "learning_rate": 4.6201202403910643e-07, "logits/chosen": 45.11116027832031, "logits/rejected": 51.54563522338867, "logps/chosen": -76.74453735351562, "logps/rejected": -129.6508331298828, "loss": 0.2858, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.9022960662841797, "rewards/margins": 2.7343924045562744, "rewards/rejected": -5.636688709259033, "step": 650 }, { "epoch": 0.264, "grad_norm": 33.20013982791981, "learning_rate": 4.6014165087392105e-07, "logits/chosen": 47.67237854003906, "logits/rejected": 46.87964630126953, "logps/chosen": -87.0808334350586, "logps/rejected": -122.21199035644531, "loss": 0.2727, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.771348476409912, "rewards/margins": 2.724174976348877, "rewards/rejected": -5.495523452758789, "step": 660 }, { "epoch": 0.268, "grad_norm": 29.256933473551793, "learning_rate": 4.582303101775248e-07, "logits/chosen": 49.69163513183594, "logits/rejected": 45.95726013183594, "logps/chosen": -93.39743041992188, "logps/rejected": -120.0958023071289, "loss": 0.2879, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.911816120147705, "rewards/margins": 2.617412805557251, "rewards/rejected": -5.529229164123535, "step": 670 }, { "epoch": 0.272, "grad_norm": 43.80379099330064, "learning_rate": 4.5627837456957374e-07, "logits/chosen": 48.20374298095703, "logits/rejected": 47.810935974121094, "logps/chosen": -94.90364074707031, "logps/rejected": -125.94508361816406, "loss": 0.3458, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.2113685607910156, "rewards/margins": 2.2549712657928467, "rewards/rejected": -5.466340065002441, "step": 680 }, { "epoch": 0.276, "grad_norm": 38.68916019817628, "learning_rate": 4.542862245837821e-07, "logits/chosen": 48.5870246887207, "logits/rejected": 47.81504440307617, "logps/chosen": -102.28499603271484, "logps/rejected": -129.62196350097656, "loss": 0.2492, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.5993125438690186, "rewards/margins": 2.266263484954834, "rewards/rejected": -5.865576267242432, "step": 690 }, { "epoch": 0.28, "grad_norm": 21.578470273867236, "learning_rate": 4.5225424859373684e-07, "logits/chosen": 49.09315490722656, "logits/rejected": 48.342491149902344, "logps/chosen": -98.66539001464844, "logps/rejected": -139.3084259033203, "loss": 0.215, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.2259106636047363, "rewards/margins": 3.0345213413238525, "rewards/rejected": -6.260432243347168, "step": 700 }, { "epoch": 0.284, "grad_norm": 16.808420798200792, "learning_rate": 4.501828427371833e-07, "logits/chosen": 46.62156295776367, "logits/rejected": 48.56031036376953, "logps/chosen": -92.45174407958984, "logps/rejected": -137.70553588867188, "loss": 0.3082, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.2984604835510254, "rewards/margins": 2.9785618782043457, "rewards/rejected": -6.277022361755371, "step": 710 }, { "epoch": 0.288, "grad_norm": 25.497403458788426, "learning_rate": 4.4807241083879764e-07, "logits/chosen": 44.841453552246094, "logits/rejected": 48.752197265625, "logps/chosen": -88.73442077636719, "logps/rejected": -139.46563720703125, "loss": 0.2662, "rewards/accuracies": 0.875, "rewards/chosen": -3.5202784538269043, "rewards/margins": 2.9266552925109863, "rewards/rejected": -6.446933746337891, "step": 720 }, { "epoch": 0.292, "grad_norm": 26.661142376929647, "learning_rate": 4.459233643314599e-07, "logits/chosen": 48.446815490722656, "logits/rejected": 48.538719177246094, "logps/chosen": -84.2086410522461, "logps/rejected": -123.36454772949219, "loss": 0.2538, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9666292667388916, "rewards/margins": 2.7437281608581543, "rewards/rejected": -5.710357189178467, "step": 730 }, { "epoch": 0.296, "grad_norm": 8.139706838501583, "learning_rate": 4.437361221760449e-07, "logits/chosen": 48.00127410888672, "logits/rejected": 48.78575897216797, "logps/chosen": -96.60485076904297, "logps/rejected": -124.87052917480469, "loss": 0.2151, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.223308563232422, "rewards/margins": 2.8362784385681152, "rewards/rejected": -6.059587478637695, "step": 740 }, { "epoch": 0.3, "grad_norm": 34.66768947372861, "learning_rate": 4.415111107797445e-07, "logits/chosen": 46.998199462890625, "logits/rejected": 46.860145568847656, "logps/chosen": -99.70954895019531, "logps/rejected": -132.5911407470703, "loss": 0.3284, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.904909133911133, "rewards/margins": 2.286958932876587, "rewards/rejected": -6.191867828369141, "step": 750 }, { "epoch": 0.304, "grad_norm": 27.280956863429665, "learning_rate": 4.392487639129391e-07, "logits/chosen": 44.16685485839844, "logits/rejected": 46.52399826049805, "logps/chosen": -89.0407485961914, "logps/rejected": -125.99409484863281, "loss": 0.2755, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.4610557556152344, "rewards/margins": 2.747231960296631, "rewards/rejected": -6.208288669586182, "step": 760 }, { "epoch": 0.308, "grad_norm": 45.42173052837384, "learning_rate": 4.36949522624633e-07, "logits/chosen": 46.8876838684082, "logits/rejected": 45.308528900146484, "logps/chosen": -98.10558319091797, "logps/rejected": -131.274658203125, "loss": 0.2579, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.8448243141174316, "rewards/margins": 2.463934898376465, "rewards/rejected": -6.3087592124938965, "step": 770 }, { "epoch": 0.312, "grad_norm": 44.27869354906801, "learning_rate": 4.34613835156471e-07, "logits/chosen": 46.87074279785156, "logits/rejected": 45.099308013916016, "logps/chosen": -111.92747497558594, "logps/rejected": -131.96014404296875, "loss": 0.3277, "rewards/accuracies": 0.75, "rewards/chosen": -4.581133842468262, "rewards/margins": 2.0176618099212646, "rewards/rejected": -6.5987958908081055, "step": 780 }, { "epoch": 0.316, "grad_norm": 22.111566033012554, "learning_rate": 4.3224215685535287e-07, "logits/chosen": 45.65962219238281, "logits/rejected": 45.8907585144043, "logps/chosen": -99.42485809326172, "logps/rejected": -141.40390014648438, "loss": 0.2526, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.320735454559326, "rewards/margins": 2.8679370880126953, "rewards/rejected": -7.188672065734863, "step": 790 }, { "epoch": 0.32, "grad_norm": 52.35572333396793, "learning_rate": 4.2983495008466273e-07, "logits/chosen": 44.55757141113281, "logits/rejected": 47.1688232421875, "logps/chosen": -100.08128356933594, "logps/rejected": -150.44851684570312, "loss": 0.256, "rewards/accuracies": 0.875, "rewards/chosen": -4.098430633544922, "rewards/margins": 3.171543598175049, "rewards/rejected": -7.269974708557129, "step": 800 }, { "epoch": 0.324, "grad_norm": 33.21188816300218, "learning_rate": 4.273926841341302e-07, "logits/chosen": 46.28192138671875, "logits/rejected": 43.64000701904297, "logps/chosen": -99.21965789794922, "logps/rejected": -129.105712890625, "loss": 0.2544, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.7107014656066895, "rewards/margins": 3.0634143352508545, "rewards/rejected": -6.774115085601807, "step": 810 }, { "epoch": 0.328, "grad_norm": 12.094382688192468, "learning_rate": 4.249158351283413e-07, "logits/chosen": 44.60408020019531, "logits/rejected": 46.49042510986328, "logps/chosen": -90.05680847167969, "logps/rejected": -142.6536102294922, "loss": 0.1868, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.59674072265625, "rewards/margins": 3.3266303539276123, "rewards/rejected": -6.923371315002441, "step": 820 }, { "epoch": 0.332, "grad_norm": 4.00271990880729, "learning_rate": 4.224048859339174e-07, "logits/chosen": 45.333580017089844, "logits/rejected": 45.82084274291992, "logps/chosen": -102.02961730957031, "logps/rejected": -137.76678466796875, "loss": 0.2172, "rewards/accuracies": 0.875, "rewards/chosen": -4.266281604766846, "rewards/margins": 2.6827759742736816, "rewards/rejected": -6.949057102203369, "step": 830 }, { "epoch": 0.336, "grad_norm": 20.780824485200213, "learning_rate": 4.1986032606537916e-07, "logits/chosen": 46.636512756347656, "logits/rejected": 44.504981994628906, "logps/chosen": -97.64836120605469, "logps/rejected": -135.35731506347656, "loss": 0.2624, "rewards/accuracies": 0.875, "rewards/chosen": -3.672236680984497, "rewards/margins": 3.0585567951202393, "rewards/rejected": -6.730792999267578, "step": 840 }, { "epoch": 0.34, "grad_norm": 42.59198624354184, "learning_rate": 4.172826515897145e-07, "logits/chosen": 43.75339126586914, "logits/rejected": 47.46000289916992, "logps/chosen": -104.68599700927734, "logps/rejected": -141.9625701904297, "loss": 0.2583, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.268365859985352, "rewards/margins": 2.5187840461730957, "rewards/rejected": -6.787149906158447, "step": 850 }, { "epoch": 0.344, "grad_norm": 20.008127683146643, "learning_rate": 4.146723650296701e-07, "logits/chosen": 43.556148529052734, "logits/rejected": 43.975608825683594, "logps/chosen": -95.18064880371094, "logps/rejected": -139.5054931640625, "loss": 0.3563, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.966346025466919, "rewards/margins": 2.8288350105285645, "rewards/rejected": -6.795180320739746, "step": 860 }, { "epoch": 0.348, "grad_norm": 20.334985858976758, "learning_rate": 4.120299752657827e-07, "logits/chosen": 43.38778305053711, "logits/rejected": 45.5791015625, "logps/chosen": -98.27373504638672, "logps/rejected": -140.9309539794922, "loss": 0.1811, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.264477252960205, "rewards/margins": 3.044534206390381, "rewards/rejected": -7.309010982513428, "step": 870 }, { "epoch": 0.352, "grad_norm": 28.757062776427272, "learning_rate": 4.0935599743717244e-07, "logits/chosen": 44.6265983581543, "logits/rejected": 44.702178955078125, "logps/chosen": -97.5745620727539, "logps/rejected": -126.3858413696289, "loss": 0.2602, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.221047401428223, "rewards/margins": 2.2703094482421875, "rewards/rejected": -6.49135684967041, "step": 880 }, { "epoch": 0.356, "grad_norm": 14.773459336240398, "learning_rate": 4.066509528411151e-07, "logits/chosen": 45.27838897705078, "logits/rejected": 43.685638427734375, "logps/chosen": -95.4419937133789, "logps/rejected": -131.39169311523438, "loss": 0.198, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.907078981399536, "rewards/margins": 3.0583863258361816, "rewards/rejected": -6.9654645919799805, "step": 890 }, { "epoch": 0.36, "grad_norm": 52.381278515402784, "learning_rate": 4.039153688314145e-07, "logits/chosen": 41.99245834350586, "logits/rejected": 46.48981857299805, "logps/chosen": -99.83820343017578, "logps/rejected": -151.70034790039062, "loss": 0.2602, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.080926418304443, "rewards/margins": 3.463571071624756, "rewards/rejected": -7.544497489929199, "step": 900 }, { "epoch": 0.364, "grad_norm": 18.192241188575945, "learning_rate": 4.0114977871559377e-07, "logits/chosen": 42.960872650146484, "logits/rejected": 43.536293029785156, "logps/chosen": -89.50424194335938, "logps/rejected": -141.3973388671875, "loss": 0.1604, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.7081432342529297, "rewards/margins": 3.6759707927703857, "rewards/rejected": -7.3841142654418945, "step": 910 }, { "epoch": 0.368, "grad_norm": 40.31098563989808, "learning_rate": 3.983547216509254e-07, "logits/chosen": 42.86689758300781, "logits/rejected": 45.748714447021484, "logps/chosen": -103.40921783447266, "logps/rejected": -158.37911987304688, "loss": 0.2078, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.184783458709717, "rewards/margins": 3.848146438598633, "rewards/rejected": -8.032930374145508, "step": 920 }, { "epoch": 0.372, "grad_norm": 10.07444543448383, "learning_rate": 3.9553074253932233e-07, "logits/chosen": 42.905662536621094, "logits/rejected": 41.272789001464844, "logps/chosen": -100.79463195800781, "logps/rejected": -128.82476806640625, "loss": 0.2664, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.691293239593506, "rewards/margins": 2.189171314239502, "rewards/rejected": -6.880465507507324, "step": 930 }, { "epoch": 0.376, "grad_norm": 20.93691989678441, "learning_rate": 3.9267839192110797e-07, "logits/chosen": 43.836544036865234, "logits/rejected": 42.157772064208984, "logps/chosen": -106.8996810913086, "logps/rejected": -141.3354034423828, "loss": 0.2145, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.653143405914307, "rewards/margins": 3.2738826274871826, "rewards/rejected": -7.927026271820068, "step": 940 }, { "epoch": 0.38, "grad_norm": 31.706598510789135, "learning_rate": 3.8979822586768666e-07, "logits/chosen": 41.932212829589844, "logits/rejected": 42.14612579345703, "logps/chosen": -99.3321304321289, "logps/rejected": -132.33554077148438, "loss": 0.2512, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.518923282623291, "rewards/margins": 2.864084243774414, "rewards/rejected": -7.383008003234863, "step": 950 }, { "epoch": 0.384, "grad_norm": 37.02742534141979, "learning_rate": 3.8689080587313755e-07, "logits/chosen": 40.756717681884766, "logits/rejected": 43.150856018066406, "logps/chosen": -94.36783599853516, "logps/rejected": -147.3682403564453, "loss": 0.2317, "rewards/accuracies": 0.875, "rewards/chosen": -4.52658748626709, "rewards/margins": 3.43182373046875, "rewards/rejected": -7.95841121673584, "step": 960 }, { "epoch": 0.388, "grad_norm": 46.22429207605269, "learning_rate": 3.839566987447491e-07, "logits/chosen": 40.86772918701172, "logits/rejected": 43.2618408203125, "logps/chosen": -96.85578918457031, "logps/rejected": -158.2089080810547, "loss": 0.2296, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.181971073150635, "rewards/margins": 3.6908957958221436, "rewards/rejected": -7.872866630554199, "step": 970 }, { "epoch": 0.392, "grad_norm": 53.76947098726966, "learning_rate": 3.809964764925198e-07, "logits/chosen": 43.136131286621094, "logits/rejected": 43.18577194213867, "logps/chosen": -109.56819915771484, "logps/rejected": -163.19302368164062, "loss": 0.197, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.513330459594727, "rewards/margins": 4.060441493988037, "rewards/rejected": -8.573771476745605, "step": 980 }, { "epoch": 0.396, "grad_norm": 32.036494164799386, "learning_rate": 3.780107162176429e-07, "logits/chosen": 42.74140167236328, "logits/rejected": 44.60755157470703, "logps/chosen": -110.55721282958984, "logps/rejected": -153.0295867919922, "loss": 0.2728, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.982661247253418, "rewards/margins": 2.932508945465088, "rewards/rejected": -7.915169715881348, "step": 990 }, { "epoch": 0.4, "grad_norm": 52.99187462304637, "learning_rate": 3.75e-07, "logits/chosen": 40.694297790527344, "logits/rejected": 43.3404426574707, "logps/chosen": -113.76619720458984, "logps/rejected": -155.80531311035156, "loss": 0.2724, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.276650428771973, "rewards/margins": 3.189429998397827, "rewards/rejected": -8.466080665588379, "step": 1000 }, { "epoch": 0.404, "grad_norm": 25.664982214116712, "learning_rate": 3.7196491478468316e-07, "logits/chosen": 41.09137725830078, "logits/rejected": 44.35279083251953, "logps/chosen": -101.16764068603516, "logps/rejected": -150.22509765625, "loss": 0.1726, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.6303510665893555, "rewards/margins": 3.4471046924591064, "rewards/rejected": -8.077455520629883, "step": 1010 }, { "epoch": 0.408, "grad_norm": 4.601273961866918, "learning_rate": 3.689060522675688e-07, "logits/chosen": 42.17076873779297, "logits/rejected": 43.80558776855469, "logps/chosen": -114.15092468261719, "logps/rejected": -155.05160522460938, "loss": 0.1537, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.794250965118408, "rewards/margins": 4.128665447235107, "rewards/rejected": -8.922918319702148, "step": 1020 }, { "epoch": 0.412, "grad_norm": 23.199840684684883, "learning_rate": 3.658240087799654e-07, "logits/chosen": 39.78450393676758, "logits/rejected": 42.42991638183594, "logps/chosen": -101.93121337890625, "logps/rejected": -154.7505340576172, "loss": 0.1553, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.455100059509277, "rewards/margins": 4.042738914489746, "rewards/rejected": -8.497838973999023, "step": 1030 }, { "epoch": 0.416, "grad_norm": 31.277129565852135, "learning_rate": 3.6271938517235765e-07, "logits/chosen": 40.167823791503906, "logits/rejected": 42.2595100402832, "logps/chosen": -107.13337707519531, "logps/rejected": -156.47689819335938, "loss": 0.2185, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.04506778717041, "rewards/margins": 3.721726655960083, "rewards/rejected": -8.76679515838623, "step": 1040 }, { "epoch": 0.42, "grad_norm": 27.680206880133927, "learning_rate": 3.595927866972693e-07, "logits/chosen": 38.985374450683594, "logits/rejected": 39.77815628051758, "logps/chosen": -106.81770324707031, "logps/rejected": -149.25894165039062, "loss": 0.2087, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.879289150238037, "rewards/margins": 3.535437822341919, "rewards/rejected": -8.414728164672852, "step": 1050 }, { "epoch": 0.424, "grad_norm": 1.249926854778577, "learning_rate": 3.5644482289126813e-07, "logits/chosen": 39.94206237792969, "logits/rejected": 43.77482604980469, "logps/chosen": -106.6061782836914, "logps/rejected": -151.3231658935547, "loss": 0.2439, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.947367191314697, "rewards/margins": 3.605046033859253, "rewards/rejected": -8.552412986755371, "step": 1060 }, { "epoch": 0.428, "grad_norm": 6.362016874170334, "learning_rate": 3.5327610745613546e-07, "logits/chosen": 40.91377258300781, "logits/rejected": 41.30303192138672, "logps/chosen": -115.51036071777344, "logps/rejected": -159.98422241210938, "loss": 0.1804, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.147997856140137, "rewards/margins": 3.556520462036133, "rewards/rejected": -8.70451831817627, "step": 1070 }, { "epoch": 0.432, "grad_norm": 17.200244197099025, "learning_rate": 3.500872581392238e-07, "logits/chosen": 39.87400436401367, "logits/rejected": 40.99711990356445, "logps/chosen": -107.35628509521484, "logps/rejected": -152.93698120117188, "loss": 0.1926, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.61740255355835, "rewards/margins": 3.3743271827697754, "rewards/rejected": -8.991729736328125, "step": 1080 }, { "epoch": 0.436, "grad_norm": 6.686230161730707, "learning_rate": 3.468788966130257e-07, "logits/chosen": 39.11838150024414, "logits/rejected": 40.328468322753906, "logps/chosen": -111.20975494384766, "logps/rejected": -164.8342742919922, "loss": 0.2234, "rewards/accuracies": 0.875, "rewards/chosen": -5.404995918273926, "rewards/margins": 4.121036529541016, "rewards/rejected": -9.526032447814941, "step": 1090 }, { "epoch": 0.44, "grad_norm": 43.68985759612088, "learning_rate": 3.43651648353978e-07, "logits/chosen": 38.73166275024414, "logits/rejected": 42.4930534362793, "logps/chosen": -101.05915069580078, "logps/rejected": -165.46095275878906, "loss": 0.1615, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.2063822746276855, "rewards/margins": 4.015776634216309, "rewards/rejected": -9.222158432006836, "step": 1100 }, { "epoch": 0.444, "grad_norm": 26.738862667956763, "learning_rate": 3.40406142520523e-07, "logits/chosen": 42.12168502807617, "logits/rejected": 41.788394927978516, "logps/chosen": -123.79948425292969, "logps/rejected": -153.36776733398438, "loss": 0.2421, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -5.871365547180176, "rewards/margins": 2.8958840370178223, "rewards/rejected": -8.76724910736084, "step": 1110 }, { "epoch": 0.448, "grad_norm": 17.03770002319556, "learning_rate": 3.371430118304538e-07, "logits/chosen": 36.466461181640625, "logits/rejected": 40.315452575683594, "logps/chosen": -112.66862487792969, "logps/rejected": -174.78466796875, "loss": 0.2123, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.390850067138672, "rewards/margins": 4.580835342407227, "rewards/rejected": -9.971685409545898, "step": 1120 }, { "epoch": 0.452, "grad_norm": 24.01278859618808, "learning_rate": 3.338628924375638e-07, "logits/chosen": 40.4681282043457, "logits/rejected": 41.152469635009766, "logps/chosen": -113.4747543334961, "logps/rejected": -158.70449829101562, "loss": 0.2367, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -5.428784370422363, "rewards/margins": 3.6359362602233887, "rewards/rejected": -9.06472110748291, "step": 1130 }, { "epoch": 0.456, "grad_norm": 29.87651257208437, "learning_rate": 3.305664238076278e-07, "logits/chosen": 39.47797775268555, "logits/rejected": 40.79694366455078, "logps/chosen": -100.62152099609375, "logps/rejected": -157.24269104003906, "loss": 0.1456, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.463057994842529, "rewards/margins": 4.45237922668457, "rewards/rejected": -8.915437698364258, "step": 1140 }, { "epoch": 0.46, "grad_norm": 20.100774899172244, "learning_rate": 3.272542485937368e-07, "logits/chosen": 38.019832611083984, "logits/rejected": 39.65892791748047, "logps/chosen": -99.71014404296875, "logps/rejected": -152.48388671875, "loss": 0.2683, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.522518157958984, "rewards/margins": 4.272935390472412, "rewards/rejected": -8.795454025268555, "step": 1150 }, { "epoch": 0.464, "grad_norm": 17.271502510574486, "learning_rate": 3.2392701251101167e-07, "logits/chosen": 38.93779754638672, "logits/rejected": 43.86870574951172, "logps/chosen": -106.25166320800781, "logps/rejected": -169.70816040039062, "loss": 0.1857, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.724178314208984, "rewards/margins": 4.432940483093262, "rewards/rejected": -9.157119750976562, "step": 1160 }, { "epoch": 0.468, "grad_norm": 14.85667949892512, "learning_rate": 3.2058536421071914e-07, "logits/chosen": 41.40235900878906, "logits/rejected": 40.91047286987305, "logps/chosen": -121.07177734375, "logps/rejected": -163.6028289794922, "loss": 0.2094, "rewards/accuracies": 0.875, "rewards/chosen": -5.4994378089904785, "rewards/margins": 3.685161590576172, "rewards/rejected": -9.184598922729492, "step": 1170 }, { "epoch": 0.472, "grad_norm": 27.172740058479278, "learning_rate": 3.172299551538164e-07, "logits/chosen": 38.644805908203125, "logits/rejected": 41.06559371948242, "logps/chosen": -116.37313079833984, "logps/rejected": -170.63470458984375, "loss": 0.2396, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.4368085861206055, "rewards/margins": 4.352160453796387, "rewards/rejected": -9.788969039916992, "step": 1180 }, { "epoch": 0.476, "grad_norm": 68.26652671242996, "learning_rate": 3.1386143948394763e-07, "logits/chosen": 39.66558074951172, "logits/rejected": 39.35725784301758, "logps/chosen": -110.7066421508789, "logps/rejected": -148.9991455078125, "loss": 0.2147, "rewards/accuracies": 0.875, "rewards/chosen": -5.205443382263184, "rewards/margins": 3.4201292991638184, "rewards/rejected": -8.62557315826416, "step": 1190 }, { "epoch": 0.48, "grad_norm": 29.844573464953335, "learning_rate": 3.104804738999169e-07, "logits/chosen": 38.45096969604492, "logits/rejected": 39.93912887573242, "logps/chosen": -118.87516021728516, "logps/rejected": -177.38160705566406, "loss": 0.1226, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.623927116394043, "rewards/margins": 4.835057735443115, "rewards/rejected": -10.458983421325684, "step": 1200 }, { "epoch": 0.484, "grad_norm": 33.168807520397415, "learning_rate": 3.0708771752766395e-07, "logits/chosen": 37.66096496582031, "logits/rejected": 41.729278564453125, "logps/chosen": -113.61344909667969, "logps/rejected": -181.90975952148438, "loss": 0.2327, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.367053985595703, "rewards/margins": 5.373785495758057, "rewards/rejected": -10.740839958190918, "step": 1210 }, { "epoch": 0.488, "grad_norm": 35.949428687976074, "learning_rate": 3.036838317917658e-07, "logits/chosen": 39.2284049987793, "logits/rejected": 39.56584167480469, "logps/chosen": -116.41062927246094, "logps/rejected": -161.64004516601562, "loss": 0.2307, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.617162704467773, "rewards/margins": 3.87888765335083, "rewards/rejected": -9.496048927307129, "step": 1220 }, { "epoch": 0.492, "grad_norm": 21.071566027005225, "learning_rate": 3.002694802864912e-07, "logits/chosen": 38.54627227783203, "logits/rejected": 41.567474365234375, "logps/chosen": -126.9264144897461, "logps/rejected": -171.91610717773438, "loss": 0.1884, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -6.11997127532959, "rewards/margins": 3.630235195159912, "rewards/rejected": -9.75020694732666, "step": 1230 }, { "epoch": 0.496, "grad_norm": 19.472597346579764, "learning_rate": 2.968453286464312e-07, "logits/chosen": 38.088741302490234, "logits/rejected": 40.73246383666992, "logps/chosen": -110.57450103759766, "logps/rejected": -164.86895751953125, "loss": 0.1301, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.830416679382324, "rewards/margins": 3.7488045692443848, "rewards/rejected": -9.579221725463867, "step": 1240 }, { "epoch": 0.5, "grad_norm": 9.297784140084044, "learning_rate": 2.934120444167326e-07, "logits/chosen": 39.20268249511719, "logits/rejected": 39.98442077636719, "logps/chosen": -110.739013671875, "logps/rejected": -155.21205139160156, "loss": 0.1205, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.348870277404785, "rewards/margins": 4.167140007019043, "rewards/rejected": -9.516010284423828, "step": 1250 }, { "epoch": 0.504, "grad_norm": 15.15594138373471, "learning_rate": 2.899702969229587e-07, "logits/chosen": 39.848670959472656, "logits/rejected": 38.895225524902344, "logps/chosen": -113.55269622802734, "logps/rejected": -164.986083984375, "loss": 0.1671, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.258578777313232, "rewards/margins": 4.728585243225098, "rewards/rejected": -9.987164497375488, "step": 1260 }, { "epoch": 0.508, "grad_norm": 58.68168492653791, "learning_rate": 2.865207571406029e-07, "logits/chosen": 39.46464920043945, "logits/rejected": 37.720924377441406, "logps/chosen": -124.67518615722656, "logps/rejected": -169.3942413330078, "loss": 0.2329, "rewards/accuracies": 0.875, "rewards/chosen": -5.639501571655273, "rewards/margins": 4.678294658660889, "rewards/rejected": -10.31779670715332, "step": 1270 }, { "epoch": 0.512, "grad_norm": 26.11850298779012, "learning_rate": 2.830640975642806e-07, "logits/chosen": 36.690589904785156, "logits/rejected": 37.93729782104492, "logps/chosen": -115.35652160644531, "logps/rejected": -169.6790008544922, "loss": 0.1717, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.6271867752075195, "rewards/margins": 4.886000633239746, "rewards/rejected": -10.513187408447266, "step": 1280 }, { "epoch": 0.516, "grad_norm": 18.8486793071701, "learning_rate": 2.796009920766253e-07, "logits/chosen": 34.7783203125, "logits/rejected": 40.29841232299805, "logps/chosen": -109.31401062011719, "logps/rejected": -170.9484100341797, "loss": 0.1634, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.725811958312988, "rewards/margins": 4.456699848175049, "rewards/rejected": -10.182512283325195, "step": 1290 }, { "epoch": 0.52, "grad_norm": 47.45375394157672, "learning_rate": 2.761321158169134e-07, "logits/chosen": 37.883079528808594, "logits/rejected": 40.114280700683594, "logps/chosen": -124.6867904663086, "logps/rejected": -172.8629150390625, "loss": 0.2061, "rewards/accuracies": 0.875, "rewards/chosen": -6.434460639953613, "rewards/margins": 3.7778351306915283, "rewards/rejected": -10.212295532226562, "step": 1300 }, { "epoch": 0.524, "grad_norm": 16.432297078281568, "learning_rate": 2.726581450494451e-07, "logits/chosen": 35.259639739990234, "logits/rejected": 40.99456024169922, "logps/chosen": -111.4013900756836, "logps/rejected": -174.25094604492188, "loss": 0.154, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.062236785888672, "rewards/margins": 4.306510925292969, "rewards/rejected": -10.368746757507324, "step": 1310 }, { "epoch": 0.528, "grad_norm": 30.21091242436278, "learning_rate": 2.6917975703170465e-07, "logits/chosen": 35.50202178955078, "logits/rejected": 39.13561248779297, "logps/chosen": -113.72837829589844, "logps/rejected": -176.2806396484375, "loss": 0.176, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.105679035186768, "rewards/margins": 4.945074081420898, "rewards/rejected": -10.050752639770508, "step": 1320 }, { "epoch": 0.532, "grad_norm": 31.809974002259562, "learning_rate": 2.6569762988232837e-07, "logits/chosen": 35.528358459472656, "logits/rejected": 39.22612380981445, "logps/chosen": -108.7910385131836, "logps/rejected": -159.57582092285156, "loss": 0.2619, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -6.001819133758545, "rewards/margins": 3.755993366241455, "rewards/rejected": -9.757813453674316, "step": 1330 }, { "epoch": 0.536, "grad_norm": 18.493232174978285, "learning_rate": 2.6221244244890336e-07, "logits/chosen": 34.30480194091797, "logits/rejected": 38.207496643066406, "logps/chosen": -104.4682388305664, "logps/rejected": -172.13088989257812, "loss": 0.1043, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1313066482543945, "rewards/margins": 5.272080421447754, "rewards/rejected": -10.403387069702148, "step": 1340 }, { "epoch": 0.54, "grad_norm": 24.170314835898346, "learning_rate": 2.5872487417562527e-07, "logits/chosen": 36.6849479675293, "logits/rejected": 37.138465881347656, "logps/chosen": -123.2945785522461, "logps/rejected": -178.46588134765625, "loss": 0.1769, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.805965423583984, "rewards/margins": 4.122119426727295, "rewards/rejected": -10.928085327148438, "step": 1350 }, { "epoch": 0.544, "grad_norm": 23.047663073832965, "learning_rate": 2.5523560497083924e-07, "logits/chosen": 35.815589904785156, "logits/rejected": 37.165496826171875, "logps/chosen": -118.9620361328125, "logps/rejected": -173.7703094482422, "loss": 0.1205, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.1362409591674805, "rewards/margins": 4.513824462890625, "rewards/rejected": -10.650065422058105, "step": 1360 }, { "epoch": 0.548, "grad_norm": 48.732887198712326, "learning_rate": 2.5174531507449037e-07, "logits/chosen": 36.993038177490234, "logits/rejected": 36.96677780151367, "logps/chosen": -132.6764678955078, "logps/rejected": -172.9881591796875, "loss": 0.2913, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.736947536468506, "rewards/margins": 4.004305362701416, "rewards/rejected": -10.741253852844238, "step": 1370 }, { "epoch": 0.552, "grad_norm": 18.181102544014543, "learning_rate": 2.482546849255096e-07, "logits/chosen": 34.073097229003906, "logits/rejected": 36.943172454833984, "logps/chosen": -117.58740234375, "logps/rejected": -197.7882537841797, "loss": 0.1062, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.832341194152832, "rewards/margins": 6.429389953613281, "rewards/rejected": -12.26173210144043, "step": 1380 }, { "epoch": 0.556, "grad_norm": 15.544623320366716, "learning_rate": 2.447643950291608e-07, "logits/chosen": 33.73606491088867, "logits/rejected": 36.898773193359375, "logps/chosen": -119.47406005859375, "logps/rejected": -167.00711059570312, "loss": 0.2418, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.214328289031982, "rewards/margins": 4.625095844268799, "rewards/rejected": -10.839425086975098, "step": 1390 }, { "epoch": 0.56, "grad_norm": 77.37565224093566, "learning_rate": 2.412751258243748e-07, "logits/chosen": 33.67988204956055, "logits/rejected": 34.09700393676758, "logps/chosen": -128.5489501953125, "logps/rejected": -176.24871826171875, "loss": 0.1963, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.089766502380371, "rewards/margins": 3.989410400390625, "rewards/rejected": -11.07917594909668, "step": 1400 }, { "epoch": 0.564, "grad_norm": 30.43460528340991, "learning_rate": 2.3778755755109667e-07, "logits/chosen": 34.34267044067383, "logits/rejected": 37.77847671508789, "logps/chosen": -105.62583923339844, "logps/rejected": -190.62722778320312, "loss": 0.1466, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.149474620819092, "rewards/margins": 6.530264854431152, "rewards/rejected": -11.679739952087402, "step": 1410 }, { "epoch": 0.568, "grad_norm": 7.007072586126817, "learning_rate": 2.3430237011767164e-07, "logits/chosen": 33.648799896240234, "logits/rejected": 36.46876525878906, "logps/chosen": -113.22758483886719, "logps/rejected": -178.4688262939453, "loss": 0.1731, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.234652042388916, "rewards/margins": 4.812324047088623, "rewards/rejected": -11.046976089477539, "step": 1420 }, { "epoch": 0.572, "grad_norm": 19.130152675478577, "learning_rate": 2.3082024296829532e-07, "logits/chosen": 35.6358757019043, "logits/rejected": 36.253257751464844, "logps/chosen": -113.02217864990234, "logps/rejected": -185.16107177734375, "loss": 0.1124, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.546517372131348, "rewards/margins": 5.918461799621582, "rewards/rejected": -11.464980125427246, "step": 1430 }, { "epoch": 0.576, "grad_norm": 18.125883555971487, "learning_rate": 2.2734185495055498e-07, "logits/chosen": 35.95376968383789, "logits/rejected": 37.0533332824707, "logps/chosen": -121.62260437011719, "logps/rejected": -175.02882385253906, "loss": 0.1505, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.334244251251221, "rewards/margins": 4.729000568389893, "rewards/rejected": -11.06324577331543, "step": 1440 }, { "epoch": 0.58, "grad_norm": 56.564917397952655, "learning_rate": 2.2386788418308665e-07, "logits/chosen": 33.461631774902344, "logits/rejected": 37.12461853027344, "logps/chosen": -112.4884262084961, "logps/rejected": -202.30970764160156, "loss": 0.0664, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.148140907287598, "rewards/margins": 7.338738441467285, "rewards/rejected": -12.486879348754883, "step": 1450 }, { "epoch": 0.584, "grad_norm": 43.54182209696087, "learning_rate": 2.2039900792337474e-07, "logits/chosen": 32.17646026611328, "logits/rejected": 35.69232940673828, "logps/chosen": -121.44427490234375, "logps/rejected": -192.20877075195312, "loss": 0.1323, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.396025657653809, "rewards/margins": 5.983513832092285, "rewards/rejected": -12.379539489746094, "step": 1460 }, { "epoch": 0.588, "grad_norm": 75.32127336294906, "learning_rate": 2.1693590243571935e-07, "logits/chosen": 34.178611755371094, "logits/rejected": 36.59508514404297, "logps/chosen": -123.4473648071289, "logps/rejected": -192.17855834960938, "loss": 0.2353, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.135283470153809, "rewards/margins": 5.6739821434021, "rewards/rejected": -11.80926513671875, "step": 1470 }, { "epoch": 0.592, "grad_norm": 45.58891579183569, "learning_rate": 2.1347924285939712e-07, "logits/chosen": 34.34710693359375, "logits/rejected": 34.652095794677734, "logps/chosen": -125.2497329711914, "logps/rejected": -172.30828857421875, "loss": 0.2612, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.1709747314453125, "rewards/margins": 4.1582560539245605, "rewards/rejected": -11.329230308532715, "step": 1480 }, { "epoch": 0.596, "grad_norm": 15.251049246391148, "learning_rate": 2.100297030770413e-07, "logits/chosen": 34.02775573730469, "logits/rejected": 35.586952209472656, "logps/chosen": -118.30632019042969, "logps/rejected": -179.9751434326172, "loss": 0.0947, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.386832237243652, "rewards/margins": 5.212705135345459, "rewards/rejected": -11.599536895751953, "step": 1490 }, { "epoch": 0.6, "grad_norm": 25.472331773437826, "learning_rate": 2.065879555832674e-07, "logits/chosen": 31.809621810913086, "logits/rejected": 35.93482208251953, "logps/chosen": -122.5445785522461, "logps/rejected": -203.33628845214844, "loss": 0.1295, "rewards/accuracies": 1.0, "rewards/chosen": -6.351222991943359, "rewards/margins": 6.3509392738342285, "rewards/rejected": -12.702162742614746, "step": 1500 }, { "epoch": 0.604, "grad_norm": 36.99215746879388, "learning_rate": 2.0315467135356878e-07, "logits/chosen": 34.279014587402344, "logits/rejected": 36.6539421081543, "logps/chosen": -118.03910827636719, "logps/rejected": -185.2469482421875, "loss": 0.1324, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.5454583168029785, "rewards/margins": 5.202276229858398, "rewards/rejected": -11.747734069824219, "step": 1510 }, { "epoch": 0.608, "grad_norm": 5.72817076275817, "learning_rate": 1.9973051971350888e-07, "logits/chosen": 32.36498260498047, "logits/rejected": 36.872802734375, "logps/chosen": -119.40788269042969, "logps/rejected": -182.93051147460938, "loss": 0.1548, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.524139404296875, "rewards/margins": 4.851975440979004, "rewards/rejected": -11.376115798950195, "step": 1520 }, { "epoch": 0.612, "grad_norm": 62.7485215727146, "learning_rate": 1.9631616820823418e-07, "logits/chosen": 33.554473876953125, "logits/rejected": 34.441917419433594, "logps/chosen": -128.63380432128906, "logps/rejected": -187.55038452148438, "loss": 0.2279, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.242335319519043, "rewards/margins": 5.341699600219727, "rewards/rejected": -11.58403491973877, "step": 1530 }, { "epoch": 0.616, "grad_norm": 50.55388655519452, "learning_rate": 1.9291228247233603e-07, "logits/chosen": 34.57413101196289, "logits/rejected": 38.980873107910156, "logps/chosen": -124.8611068725586, "logps/rejected": -188.2270050048828, "loss": 0.1368, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.550134181976318, "rewards/margins": 4.793581008911133, "rewards/rejected": -11.34371566772461, "step": 1540 }, { "epoch": 0.62, "grad_norm": 20.98136994366918, "learning_rate": 1.895195261000831e-07, "logits/chosen": 32.666053771972656, "logits/rejected": 36.28156661987305, "logps/chosen": -123.55598449707031, "logps/rejected": -190.2432098388672, "loss": 0.2085, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.651708126068115, "rewards/margins": 5.466277122497559, "rewards/rejected": -12.117984771728516, "step": 1550 }, { "epoch": 0.624, "grad_norm": 3.695214530883218, "learning_rate": 1.861385605160524e-07, "logits/chosen": 34.80469512939453, "logits/rejected": 37.0577507019043, "logps/chosen": -127.52436828613281, "logps/rejected": -200.47572326660156, "loss": 0.1962, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.4474639892578125, "rewards/margins": 6.185749530792236, "rewards/rejected": -12.633213996887207, "step": 1560 }, { "epoch": 0.628, "grad_norm": 6.204487863528272, "learning_rate": 1.8277004484618357e-07, "logits/chosen": 35.11884689331055, "logits/rejected": 34.14295959472656, "logps/chosen": -123.8480224609375, "logps/rejected": -175.71351623535156, "loss": 0.1239, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.91684103012085, "rewards/margins": 5.305278778076172, "rewards/rejected": -11.22212028503418, "step": 1570 }, { "epoch": 0.632, "grad_norm": 27.777656870634978, "learning_rate": 1.7941463578928083e-07, "logits/chosen": 31.193029403686523, "logits/rejected": 34.23871612548828, "logps/chosen": -110.49909973144531, "logps/rejected": -183.0133514404297, "loss": 0.1618, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.92338752746582, "rewards/margins": 5.685771942138672, "rewards/rejected": -11.609160423278809, "step": 1580 }, { "epoch": 0.636, "grad_norm": 25.59557039318974, "learning_rate": 1.760729874889884e-07, "logits/chosen": 34.273590087890625, "logits/rejected": 36.67070388793945, "logps/chosen": -123.4420394897461, "logps/rejected": -188.76451110839844, "loss": 0.1413, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.854828834533691, "rewards/margins": 5.536397457122803, "rewards/rejected": -11.391225814819336, "step": 1590 }, { "epoch": 0.64, "grad_norm": 54.20969538897634, "learning_rate": 1.7274575140626315e-07, "logits/chosen": 35.63384246826172, "logits/rejected": 36.39768600463867, "logps/chosen": -142.24166870117188, "logps/rejected": -204.695556640625, "loss": 0.2189, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.065789699554443, "rewards/margins": 5.59266471862793, "rewards/rejected": -12.658454895019531, "step": 1600 }, { "epoch": 0.644, "grad_norm": 23.952776600344606, "learning_rate": 1.6943357619237225e-07, "logits/chosen": 30.349929809570312, "logits/rejected": 35.475894927978516, "logps/chosen": -136.0992431640625, "logps/rejected": -200.9610595703125, "loss": 0.1836, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.412652492523193, "rewards/margins": 4.916618347167969, "rewards/rejected": -12.32927131652832, "step": 1610 }, { "epoch": 0.648, "grad_norm": 20.457675381739918, "learning_rate": 1.6613710756243627e-07, "logits/chosen": 34.76255416870117, "logits/rejected": 34.33202362060547, "logps/chosen": -123.5507583618164, "logps/rejected": -174.77584838867188, "loss": 0.1588, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.010063171386719, "rewards/margins": 4.99750280380249, "rewards/rejected": -12.007566452026367, "step": 1620 }, { "epoch": 0.652, "grad_norm": 8.740451188853967, "learning_rate": 1.6285698816954624e-07, "logits/chosen": 32.92230987548828, "logits/rejected": 37.1563720703125, "logps/chosen": -125.0332260131836, "logps/rejected": -198.4005126953125, "loss": 0.146, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.896471977233887, "rewards/margins": 5.607396602630615, "rewards/rejected": -12.503868103027344, "step": 1630 }, { "epoch": 0.656, "grad_norm": 12.82247945171976, "learning_rate": 1.5959385747947695e-07, "logits/chosen": 32.56360626220703, "logits/rejected": 36.79113006591797, "logps/chosen": -115.17347717285156, "logps/rejected": -194.03790283203125, "loss": 0.1129, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.800166130065918, "rewards/margins": 5.930178165435791, "rewards/rejected": -11.73034381866455, "step": 1640 }, { "epoch": 0.66, "grad_norm": 54.05169324828869, "learning_rate": 1.5634835164602196e-07, "logits/chosen": 30.58074378967285, "logits/rejected": 34.36854934692383, "logps/chosen": -113.1893539428711, "logps/rejected": -182.39627075195312, "loss": 0.1431, "rewards/accuracies": 0.875, "rewards/chosen": -6.311237812042236, "rewards/margins": 5.61757755279541, "rewards/rejected": -11.928815841674805, "step": 1650 }, { "epoch": 0.664, "grad_norm": 24.825767693321374, "learning_rate": 1.5312110338697427e-07, "logits/chosen": 35.91250228881836, "logits/rejected": 36.06043243408203, "logps/chosen": -124.50358581542969, "logps/rejected": -201.7527313232422, "loss": 0.1154, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.220883369445801, "rewards/margins": 6.432749271392822, "rewards/rejected": -12.653631210327148, "step": 1660 }, { "epoch": 0.668, "grad_norm": 21.917052429686564, "learning_rate": 1.4991274186077628e-07, "logits/chosen": 32.15296936035156, "logits/rejected": 34.598384857177734, "logps/chosen": -128.67645263671875, "logps/rejected": -194.93014526367188, "loss": 0.2175, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.7162299156188965, "rewards/margins": 5.552061557769775, "rewards/rejected": -12.268292427062988, "step": 1670 }, { "epoch": 0.672, "grad_norm": 73.93391750261412, "learning_rate": 1.4672389254386457e-07, "logits/chosen": 33.413238525390625, "logits/rejected": 34.18181610107422, "logps/chosen": -124.91861724853516, "logps/rejected": -205.4623260498047, "loss": 0.2138, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.610821723937988, "rewards/margins": 6.807159423828125, "rewards/rejected": -13.41797924041748, "step": 1680 }, { "epoch": 0.676, "grad_norm": 14.047455580384543, "learning_rate": 1.4355517710873182e-07, "logits/chosen": 34.12297821044922, "logits/rejected": 35.99665451049805, "logps/chosen": -121.33824157714844, "logps/rejected": -176.9901123046875, "loss": 0.1439, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.118622779846191, "rewards/margins": 5.124892234802246, "rewards/rejected": -11.243515014648438, "step": 1690 }, { "epoch": 0.68, "grad_norm": 0.7246303761815563, "learning_rate": 1.404072133027306e-07, "logits/chosen": 33.210655212402344, "logits/rejected": 36.71358108520508, "logps/chosen": -116.57881927490234, "logps/rejected": -182.92950439453125, "loss": 0.0992, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.269998550415039, "rewards/margins": 5.008700370788574, "rewards/rejected": -11.27869987487793, "step": 1700 }, { "epoch": 0.684, "grad_norm": 34.85736611285892, "learning_rate": 1.3728061482764235e-07, "logits/chosen": 32.43885040283203, "logits/rejected": 34.203834533691406, "logps/chosen": -125.60906982421875, "logps/rejected": -184.39035034179688, "loss": 0.1539, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.697512149810791, "rewards/margins": 5.102963924407959, "rewards/rejected": -11.800477027893066, "step": 1710 }, { "epoch": 0.688, "grad_norm": 9.586401996642, "learning_rate": 1.341759912200346e-07, "logits/chosen": 32.35176467895508, "logits/rejected": 34.566925048828125, "logps/chosen": -126.4211196899414, "logps/rejected": -186.43441772460938, "loss": 0.1744, "rewards/accuracies": 0.875, "rewards/chosen": -6.952803611755371, "rewards/margins": 4.971192836761475, "rewards/rejected": -11.923995971679688, "step": 1720 }, { "epoch": 0.692, "grad_norm": 51.225200538375525, "learning_rate": 1.3109394773243115e-07, "logits/chosen": 32.60570526123047, "logits/rejected": 34.30889892578125, "logps/chosen": -112.5150375366211, "logps/rejected": -185.24020385742188, "loss": 0.1232, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.684107780456543, "rewards/margins": 5.926398277282715, "rewards/rejected": -11.610506057739258, "step": 1730 }, { "epoch": 0.696, "grad_norm": 7.892076438359368, "learning_rate": 1.2803508521531677e-07, "logits/chosen": 32.87091827392578, "logits/rejected": 35.00414276123047, "logps/chosen": -115.19721984863281, "logps/rejected": -186.33670043945312, "loss": 0.1689, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.1829729080200195, "rewards/margins": 5.768080711364746, "rewards/rejected": -11.951054573059082, "step": 1740 }, { "epoch": 0.7, "grad_norm": 71.60841944470864, "learning_rate": 1.2500000000000005e-07, "logits/chosen": 31.9818115234375, "logits/rejected": 34.28449630737305, "logps/chosen": -131.71206665039062, "logps/rejected": -179.4838104248047, "loss": 0.2651, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.21808385848999, "rewards/margins": 4.244929313659668, "rewards/rejected": -11.463014602661133, "step": 1750 }, { "epoch": 0.704, "grad_norm": 11.403139100682084, "learning_rate": 1.2198928378235715e-07, "logits/chosen": 35.4173583984375, "logits/rejected": 32.19343185424805, "logps/chosen": -141.1168212890625, "logps/rejected": -190.25442504882812, "loss": 0.1845, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.428712368011475, "rewards/margins": 4.997444152832031, "rewards/rejected": -12.426156997680664, "step": 1760 }, { "epoch": 0.708, "grad_norm": 15.614574564320913, "learning_rate": 1.1900352350748024e-07, "logits/chosen": 32.91938018798828, "logits/rejected": 37.30883026123047, "logps/chosen": -128.07351684570312, "logps/rejected": -202.3236083984375, "loss": 0.1459, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.7811689376831055, "rewards/margins": 5.805022239685059, "rewards/rejected": -12.586191177368164, "step": 1770 }, { "epoch": 0.712, "grad_norm": 36.93773962705704, "learning_rate": 1.1604330125525078e-07, "logits/chosen": 33.32939910888672, "logits/rejected": 34.7056770324707, "logps/chosen": -126.90995025634766, "logps/rejected": -206.4558563232422, "loss": 0.109, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.557587623596191, "rewards/margins": 6.45965576171875, "rewards/rejected": -13.017242431640625, "step": 1780 }, { "epoch": 0.716, "grad_norm": 19.64090712772826, "learning_rate": 1.1310919412686245e-07, "logits/chosen": 31.58943748474121, "logits/rejected": 32.97970962524414, "logps/chosen": -120.78947448730469, "logps/rejected": -185.22781372070312, "loss": 0.1554, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.556436061859131, "rewards/margins": 5.089656829833984, "rewards/rejected": -11.646093368530273, "step": 1790 }, { "epoch": 0.72, "grad_norm": 1.8073260877539248, "learning_rate": 1.1020177413231332e-07, "logits/chosen": 32.85557556152344, "logits/rejected": 36.344078063964844, "logps/chosen": -119.52928161621094, "logps/rejected": -186.12684631347656, "loss": 0.1001, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.19125509262085, "rewards/margins": 5.620010852813721, "rewards/rejected": -11.81126594543457, "step": 1800 }, { "epoch": 0.724, "grad_norm": 1.233410177278683, "learning_rate": 1.073216080788921e-07, "logits/chosen": 31.43521499633789, "logits/rejected": 34.373809814453125, "logps/chosen": -117.0105972290039, "logps/rejected": -202.0782928466797, "loss": 0.0878, "rewards/accuracies": 1.0, "rewards/chosen": -6.203524112701416, "rewards/margins": 6.944014072418213, "rewards/rejected": -13.147537231445312, "step": 1810 }, { "epoch": 0.728, "grad_norm": 76.66139035605511, "learning_rate": 1.0446925746067766e-07, "logits/chosen": 34.948951721191406, "logits/rejected": 38.018917083740234, "logps/chosen": -128.2957305908203, "logps/rejected": -192.70611572265625, "loss": 0.1567, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.262060642242432, "rewards/margins": 5.220892906188965, "rewards/rejected": -12.482953071594238, "step": 1820 }, { "epoch": 0.732, "grad_norm": 3.2686353366997087, "learning_rate": 1.0164527834907466e-07, "logits/chosen": 33.62763977050781, "logits/rejected": 32.20967483520508, "logps/chosen": -115.2950668334961, "logps/rejected": -181.83990478515625, "loss": 0.1879, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.048405170440674, "rewards/margins": 6.285736083984375, "rewards/rejected": -12.334141731262207, "step": 1830 }, { "epoch": 0.736, "grad_norm": 19.72682359281921, "learning_rate": 9.885022128440629e-08, "logits/chosen": 30.57301902770996, "logits/rejected": 33.91904067993164, "logps/chosen": -123.07301330566406, "logps/rejected": -196.01779174804688, "loss": 0.1789, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.5850043296813965, "rewards/margins": 5.699263572692871, "rewards/rejected": -12.28426742553711, "step": 1840 }, { "epoch": 0.74, "grad_norm": 27.782784282950555, "learning_rate": 9.608463116858542e-08, "logits/chosen": 32.8736457824707, "logits/rejected": 33.712486267089844, "logps/chosen": -124.52708435058594, "logps/rejected": -185.42349243164062, "loss": 0.2559, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.897887229919434, "rewards/margins": 4.95641565322876, "rewards/rejected": -11.854303359985352, "step": 1850 }, { "epoch": 0.744, "grad_norm": 11.434937847700061, "learning_rate": 9.334904715888494e-08, "logits/chosen": 32.95331954956055, "logits/rejected": 34.37748336791992, "logps/chosen": -135.6911163330078, "logps/rejected": -199.61776733398438, "loss": 0.1256, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.312950134277344, "rewards/margins": 5.76673698425293, "rewards/rejected": -13.079686164855957, "step": 1860 }, { "epoch": 0.748, "grad_norm": 21.711629131923438, "learning_rate": 9.064400256282755e-08, "logits/chosen": 32.11374282836914, "logits/rejected": 32.81106185913086, "logps/chosen": -126.60643005371094, "logps/rejected": -190.9490966796875, "loss": 0.1124, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.639864444732666, "rewards/margins": 5.9442009925842285, "rewards/rejected": -12.584066390991211, "step": 1870 }, { "epoch": 0.752, "grad_norm": 4.882787255482076, "learning_rate": 8.797002473421727e-08, "logits/chosen": 32.480838775634766, "logits/rejected": 34.38404846191406, "logps/chosen": -120.6583023071289, "logps/rejected": -187.3675079345703, "loss": 0.0805, "rewards/accuracies": 1.0, "rewards/chosen": -6.4553680419921875, "rewards/margins": 5.736647605895996, "rewards/rejected": -12.192015647888184, "step": 1880 }, { "epoch": 0.756, "grad_norm": 73.20268420406155, "learning_rate": 8.532763497032986e-08, "logits/chosen": 30.1946964263916, "logits/rejected": 35.3504638671875, "logps/chosen": -130.51853942871094, "logps/rejected": -197.19638061523438, "loss": 0.1867, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.48050594329834, "rewards/margins": 5.310255527496338, "rewards/rejected": -12.79076099395752, "step": 1890 }, { "epoch": 0.76, "grad_norm": 62.43015390970909, "learning_rate": 8.271734841028552e-08, "logits/chosen": 30.940185546875, "logits/rejected": 34.006492614746094, "logps/chosen": -125.14324951171875, "logps/rejected": -189.832763671875, "loss": 0.2225, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.153977870941162, "rewards/margins": 5.524373531341553, "rewards/rejected": -12.678351402282715, "step": 1900 }, { "epoch": 0.764, "grad_norm": 58.76354716466513, "learning_rate": 8.013967393462093e-08, "logits/chosen": 32.89315414428711, "logits/rejected": 35.409217834472656, "logps/chosen": -114.31298828125, "logps/rejected": -197.35025024414062, "loss": 0.089, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.788350582122803, "rewards/margins": 6.997166633605957, "rewards/rejected": -12.785517692565918, "step": 1910 }, { "epoch": 0.768, "grad_norm": 40.55885429267448, "learning_rate": 7.759511406608255e-08, "logits/chosen": 30.79556655883789, "logits/rejected": 33.01952362060547, "logps/chosen": -136.01585388183594, "logps/rejected": -189.73330688476562, "loss": 0.134, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.579942226409912, "rewards/margins": 4.825619220733643, "rewards/rejected": -12.405561447143555, "step": 1920 }, { "epoch": 0.772, "grad_norm": 11.57424881967435, "learning_rate": 7.508416487165862e-08, "logits/chosen": 28.126171112060547, "logits/rejected": 33.28355026245117, "logps/chosen": -123.10121154785156, "logps/rejected": -203.12960815429688, "loss": 0.093, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.236498832702637, "rewards/margins": 5.934015274047852, "rewards/rejected": -13.170514106750488, "step": 1930 }, { "epoch": 0.776, "grad_norm": 21.840133243019153, "learning_rate": 7.260731586586982e-08, "logits/chosen": 31.944263458251953, "logits/rejected": 30.55642318725586, "logps/chosen": -123.80488586425781, "logps/rejected": -195.6708526611328, "loss": 0.162, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.678342342376709, "rewards/margins": 6.705534934997559, "rewards/rejected": -13.383877754211426, "step": 1940 }, { "epoch": 0.78, "grad_norm": 13.922142563059072, "learning_rate": 7.016504991533726e-08, "logits/chosen": 32.323158264160156, "logits/rejected": 34.14168167114258, "logps/chosen": -122.30216979980469, "logps/rejected": -184.92527770996094, "loss": 0.178, "rewards/accuracies": 0.875, "rewards/chosen": -6.948062896728516, "rewards/margins": 4.983668327331543, "rewards/rejected": -11.931732177734375, "step": 1950 }, { "epoch": 0.784, "grad_norm": 43.52548154571825, "learning_rate": 6.775784314464716e-08, "logits/chosen": 31.536640167236328, "logits/rejected": 33.37910461425781, "logps/chosen": -122.21321868896484, "logps/rejected": -192.3970184326172, "loss": 0.055, "rewards/accuracies": 1.0, "rewards/chosen": -6.536123752593994, "rewards/margins": 6.116715908050537, "rewards/rejected": -12.652839660644531, "step": 1960 }, { "epoch": 0.788, "grad_norm": 14.960056288014227, "learning_rate": 6.538616484352902e-08, "logits/chosen": 30.65342140197754, "logits/rejected": 31.002685546875, "logps/chosen": -119.79139709472656, "logps/rejected": -194.21560668945312, "loss": 0.1468, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.551787376403809, "rewards/margins": 6.742495536804199, "rewards/rejected": -13.294283866882324, "step": 1970 }, { "epoch": 0.792, "grad_norm": 63.96276766967657, "learning_rate": 6.305047737536707e-08, "logits/chosen": 32.422218322753906, "logits/rejected": 34.42628479003906, "logps/chosen": -141.8745574951172, "logps/rejected": -196.88424682617188, "loss": 0.2212, "rewards/accuracies": 0.875, "rewards/chosen": -8.096589088439941, "rewards/margins": 4.696638584136963, "rewards/rejected": -12.79322624206543, "step": 1980 }, { "epoch": 0.796, "grad_norm": 6.994169229009863, "learning_rate": 6.075123608706093e-08, "logits/chosen": 31.055633544921875, "logits/rejected": 33.48008346557617, "logps/chosen": -122.30110931396484, "logps/rejected": -194.860595703125, "loss": 0.1579, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.244012355804443, "rewards/margins": 6.198871612548828, "rewards/rejected": -12.44288444519043, "step": 1990 }, { "epoch": 0.8, "grad_norm": 21.842683348401124, "learning_rate": 5.848888922025552e-08, "logits/chosen": 31.886560440063477, "logits/rejected": 35.48186492919922, "logps/chosen": -135.24679565429688, "logps/rejected": -206.4519500732422, "loss": 0.0904, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.111126899719238, "rewards/margins": 5.913634300231934, "rewards/rejected": -13.024760246276855, "step": 2000 }, { "epoch": 0.804, "grad_norm": 10.8862310531102, "learning_rate": 5.6263877823955115e-08, "logits/chosen": 33.398433685302734, "logits/rejected": 33.78891372680664, "logps/chosen": -123.2912368774414, "logps/rejected": -192.1443634033203, "loss": 0.1133, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.0102667808532715, "rewards/margins": 5.909992218017578, "rewards/rejected": -12.920259475708008, "step": 2010 }, { "epoch": 0.808, "grad_norm": 42.03601830195942, "learning_rate": 5.4076635668540065e-08, "logits/chosen": 29.69512939453125, "logits/rejected": 33.41075134277344, "logps/chosen": -120.9056625366211, "logps/rejected": -192.8750457763672, "loss": 0.1104, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.7075395584106445, "rewards/margins": 5.716131687164307, "rewards/rejected": -12.423670768737793, "step": 2020 }, { "epoch": 0.812, "grad_norm": 32.933513594571174, "learning_rate": 5.192758916120235e-08, "logits/chosen": 32.2730712890625, "logits/rejected": 35.09003448486328, "logps/chosen": -126.17935943603516, "logps/rejected": -197.20889282226562, "loss": 0.1222, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.885321140289307, "rewards/margins": 5.830145835876465, "rewards/rejected": -12.71546745300293, "step": 2030 }, { "epoch": 0.816, "grad_norm": 42.07730167231684, "learning_rate": 4.981715726281666e-08, "logits/chosen": 32.14569091796875, "logits/rejected": 33.87199401855469, "logps/chosen": -137.12149047851562, "logps/rejected": -203.73658752441406, "loss": 0.2057, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.4929962158203125, "rewards/margins": 5.718442916870117, "rewards/rejected": -13.211441040039062, "step": 2040 }, { "epoch": 0.82, "grad_norm": 15.321840192563384, "learning_rate": 4.774575140626316e-08, "logits/chosen": 30.928787231445312, "logits/rejected": 33.557373046875, "logps/chosen": -122.8592529296875, "logps/rejected": -194.76742553710938, "loss": 0.1891, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.503848075866699, "rewards/margins": 6.216989040374756, "rewards/rejected": -12.720837593078613, "step": 2050 }, { "epoch": 0.824, "grad_norm": 71.00238324419556, "learning_rate": 4.5713775416217875e-08, "logits/chosen": 29.931808471679688, "logits/rejected": 32.696876525878906, "logps/chosen": -133.57138061523438, "logps/rejected": -205.98837280273438, "loss": 0.1357, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.4561920166015625, "rewards/margins": 5.7885637283325195, "rewards/rejected": -13.244755744934082, "step": 2060 }, { "epoch": 0.828, "grad_norm": 9.35367075528383, "learning_rate": 4.372162543042623e-08, "logits/chosen": 31.20676612854004, "logits/rejected": 31.955196380615234, "logps/chosen": -135.50155639648438, "logps/rejected": -201.00784301757812, "loss": 0.0994, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.0842413902282715, "rewards/margins": 6.034939289093018, "rewards/rejected": -13.119178771972656, "step": 2070 }, { "epoch": 0.832, "grad_norm": 24.304073923638086, "learning_rate": 4.176968982247514e-08, "logits/chosen": 29.5570068359375, "logits/rejected": 33.91077423095703, "logps/chosen": -126.53129577636719, "logps/rejected": -205.4754638671875, "loss": 0.1221, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.051833152770996, "rewards/margins": 6.18133020401001, "rewards/rejected": -13.233163833618164, "step": 2080 }, { "epoch": 0.836, "grad_norm": 21.40116937016571, "learning_rate": 3.9858349126078936e-08, "logits/chosen": 30.234338760375977, "logits/rejected": 35.29171371459961, "logps/chosen": -118.93162536621094, "logps/rejected": -211.0615692138672, "loss": 0.0803, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.521851539611816, "rewards/margins": 6.673744201660156, "rewards/rejected": -13.195594787597656, "step": 2090 }, { "epoch": 0.84, "grad_norm": 63.74043787739687, "learning_rate": 3.798797596089351e-08, "logits/chosen": 30.151935577392578, "logits/rejected": 33.42070770263672, "logps/chosen": -126.11729431152344, "logps/rejected": -205.4882354736328, "loss": 0.1006, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.028811454772949, "rewards/margins": 6.618474006652832, "rewards/rejected": -13.647287368774414, "step": 2100 }, { "epoch": 0.844, "grad_norm": 12.990695199065254, "learning_rate": 3.615893495987335e-08, "logits/chosen": 30.569061279296875, "logits/rejected": 33.5135383605957, "logps/chosen": -118.1374282836914, "logps/rejected": -183.0607147216797, "loss": 0.2439, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.712668418884277, "rewards/margins": 5.418641090393066, "rewards/rejected": -12.131309509277344, "step": 2110 }, { "epoch": 0.848, "grad_norm": 19.041922220697664, "learning_rate": 3.437158269818563e-08, "logits/chosen": 30.906530380249023, "logits/rejected": 33.76823425292969, "logps/chosen": -128.64764404296875, "logps/rejected": -212.09213256835938, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": -6.712831020355225, "rewards/margins": 7.552107810974121, "rewards/rejected": -14.264938354492188, "step": 2120 }, { "epoch": 0.852, "grad_norm": 73.85387160932243, "learning_rate": 3.262626762369525e-08, "logits/chosen": 30.180126190185547, "logits/rejected": 34.350563049316406, "logps/chosen": -122.08283996582031, "logps/rejected": -191.43997192382812, "loss": 0.1989, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.999587059020996, "rewards/margins": 5.256472587585449, "rewards/rejected": -12.256059646606445, "step": 2130 }, { "epoch": 0.856, "grad_norm": 23.568790751855996, "learning_rate": 3.092332998903416e-08, "logits/chosen": 29.4277286529541, "logits/rejected": 31.689163208007812, "logps/chosen": -136.5303192138672, "logps/rejected": -214.65731811523438, "loss": 0.108, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.013432502746582, "rewards/margins": 6.365548133850098, "rewards/rejected": -14.378979682922363, "step": 2140 }, { "epoch": 0.86, "grad_norm": 16.09088977053807, "learning_rate": 2.9263101785268252e-08, "logits/chosen": 29.914642333984375, "logits/rejected": 30.405963897705078, "logps/chosen": -134.8611297607422, "logps/rejected": -205.1629180908203, "loss": 0.0879, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.248504638671875, "rewards/margins": 6.40295934677124, "rewards/rejected": -13.651464462280273, "step": 2150 }, { "epoch": 0.864, "grad_norm": 97.3126034768412, "learning_rate": 2.764590667717562e-08, "logits/chosen": 31.892868041992188, "logits/rejected": 33.220489501953125, "logps/chosen": -138.26364135742188, "logps/rejected": -187.60067749023438, "loss": 0.2452, "rewards/accuracies": 0.875, "rewards/chosen": -7.446439266204834, "rewards/margins": 4.706901550292969, "rewards/rejected": -12.153340339660645, "step": 2160 }, { "epoch": 0.868, "grad_norm": 34.25127629439165, "learning_rate": 2.6072059940146772e-08, "logits/chosen": 30.961380004882812, "logits/rejected": 32.999900817871094, "logps/chosen": -123.99442291259766, "logps/rejected": -199.34922790527344, "loss": 0.1935, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.214547634124756, "rewards/margins": 6.843503475189209, "rewards/rejected": -13.058052062988281, "step": 2170 }, { "epoch": 0.872, "grad_norm": 24.468484869974596, "learning_rate": 2.4541868398721576e-08, "logits/chosen": 31.01216697692871, "logits/rejected": 32.20166778564453, "logps/chosen": -128.03175354003906, "logps/rejected": -194.68743896484375, "loss": 0.0858, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.191014766693115, "rewards/margins": 5.623617649078369, "rewards/rejected": -12.8146333694458, "step": 2180 }, { "epoch": 0.876, "grad_norm": 2.66007251335378, "learning_rate": 2.3055630366772856e-08, "logits/chosen": 29.776905059814453, "logits/rejected": 32.563453674316406, "logps/chosen": -130.4777069091797, "logps/rejected": -196.01138305664062, "loss": 0.1665, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.950491428375244, "rewards/margins": 6.007107734680176, "rewards/rejected": -12.957598686218262, "step": 2190 }, { "epoch": 0.88, "grad_norm": 28.407188913509714, "learning_rate": 2.1613635589349756e-08, "logits/chosen": 31.004446029663086, "logits/rejected": 31.923709869384766, "logps/chosen": -129.14511108398438, "logps/rejected": -189.40194702148438, "loss": 0.152, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.862164497375488, "rewards/margins": 5.448739051818848, "rewards/rejected": -12.310903549194336, "step": 2200 }, { "epoch": 0.884, "grad_norm": 27.05189743480488, "learning_rate": 2.0216165186191404e-08, "logits/chosen": 29.659343719482422, "logits/rejected": 32.05767059326172, "logps/chosen": -132.9698028564453, "logps/rejected": -204.43704223632812, "loss": 0.054, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.252483367919922, "rewards/margins": 5.930403709411621, "rewards/rejected": -13.182887077331543, "step": 2210 }, { "epoch": 0.888, "grad_norm": 55.43405815780065, "learning_rate": 1.8863491596921743e-08, "logits/chosen": 32.782066345214844, "logits/rejected": 32.522891998291016, "logps/chosen": -136.2987060546875, "logps/rejected": -198.53738403320312, "loss": 0.2253, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.071662902832031, "rewards/margins": 6.142470359802246, "rewards/rejected": -13.214132308959961, "step": 2220 }, { "epoch": 0.892, "grad_norm": 51.20992346555615, "learning_rate": 1.7555878527937163e-08, "logits/chosen": 31.446645736694336, "logits/rejected": 35.24169921875, "logps/chosen": -118.80924224853516, "logps/rejected": -208.82534790039062, "loss": 0.0807, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.682692527770996, "rewards/margins": 6.966187477111816, "rewards/rejected": -13.648880004882812, "step": 2230 }, { "epoch": 0.896, "grad_norm": 2.6516558135690262, "learning_rate": 1.629358090099639e-08, "logits/chosen": 29.057031631469727, "logits/rejected": 32.2264404296875, "logps/chosen": -120.19584655761719, "logps/rejected": -200.8718719482422, "loss": 0.0942, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.863171577453613, "rewards/margins": 6.121987342834473, "rewards/rejected": -12.98515796661377, "step": 2240 }, { "epoch": 0.9, "grad_norm": 19.604673890955695, "learning_rate": 1.507684480352292e-08, "logits/chosen": 31.421955108642578, "logits/rejected": 33.76438522338867, "logps/chosen": -122.68287658691406, "logps/rejected": -201.4803009033203, "loss": 0.1018, "rewards/accuracies": 1.0, "rewards/chosen": -6.61349630355835, "rewards/margins": 7.108196258544922, "rewards/rejected": -13.721692085266113, "step": 2250 }, { "epoch": 0.904, "grad_norm": 18.823513111687696, "learning_rate": 1.390590744062975e-08, "logits/chosen": 29.491558074951172, "logits/rejected": 31.476119995117188, "logps/chosen": -130.34339904785156, "logps/rejected": -202.07351684570312, "loss": 0.1108, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.389586448669434, "rewards/margins": 6.204103469848633, "rewards/rejected": -13.593690872192383, "step": 2260 }, { "epoch": 0.908, "grad_norm": 41.18098938197995, "learning_rate": 1.2780997088875866e-08, "logits/chosen": 32.09846878051758, "logits/rejected": 34.34507751464844, "logps/chosen": -122.9142074584961, "logps/rejected": -186.44857788085938, "loss": 0.1196, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.446104526519775, "rewards/margins": 5.684818267822266, "rewards/rejected": -12.130921363830566, "step": 2270 }, { "epoch": 0.912, "grad_norm": 26.449033993731902, "learning_rate": 1.1702333051763268e-08, "logits/chosen": 30.299280166625977, "logits/rejected": 33.13639831542969, "logps/chosen": -125.92582702636719, "logps/rejected": -201.4826202392578, "loss": 0.0914, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.8485307693481445, "rewards/margins": 6.146922588348389, "rewards/rejected": -12.995452880859375, "step": 2280 }, { "epoch": 0.916, "grad_norm": 3.868894147937223, "learning_rate": 1.0670125616983189e-08, "logits/chosen": 31.07822036743164, "logits/rejected": 33.824302673339844, "logps/chosen": -133.55230712890625, "logps/rejected": -204.15899658203125, "loss": 0.2123, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.182616233825684, "rewards/margins": 5.8420729637146, "rewards/rejected": -13.024690628051758, "step": 2290 }, { "epoch": 0.92, "grad_norm": 3.3807387885526454, "learning_rate": 9.684576015420275e-09, "logits/chosen": 29.672496795654297, "logits/rejected": 37.18809509277344, "logps/chosen": -126.36643981933594, "logps/rejected": -208.7083740234375, "loss": 0.158, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.423619270324707, "rewards/margins": 5.624678611755371, "rewards/rejected": -13.048298835754395, "step": 2300 }, { "epoch": 0.924, "grad_norm": 66.31275321980026, "learning_rate": 8.745876381922146e-09, "logits/chosen": 30.262380599975586, "logits/rejected": 33.276100158691406, "logps/chosen": -124.06925964355469, "logps/rejected": -195.91195678710938, "loss": 0.1534, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.720442295074463, "rewards/margins": 6.354052543640137, "rewards/rejected": -13.074495315551758, "step": 2310 }, { "epoch": 0.928, "grad_norm": 7.064746922185591, "learning_rate": 7.85420971784223e-09, "logits/chosen": 31.4277400970459, "logits/rejected": 30.434642791748047, "logps/chosen": -133.32081604003906, "logps/rejected": -188.7599639892578, "loss": 0.2425, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -7.45855712890625, "rewards/margins": 5.378756999969482, "rewards/rejected": -12.837315559387207, "step": 2320 }, { "epoch": 0.932, "grad_norm": 4.788662000330739, "learning_rate": 7.009749855363456e-09, "logits/chosen": 30.91952896118164, "logits/rejected": 32.42715072631836, "logps/chosen": -133.62342834472656, "logps/rejected": -188.62930297851562, "loss": 0.3079, "rewards/accuracies": 0.875, "rewards/chosen": -7.070608615875244, "rewards/margins": 5.031331539154053, "rewards/rejected": -12.101941108703613, "step": 2330 }, { "epoch": 0.936, "grad_norm": 10.233821689329831, "learning_rate": 6.2126614236091834e-09, "logits/chosen": 28.668411254882812, "logits/rejected": 32.377296447753906, "logps/chosen": -117.2125244140625, "logps/rejected": -195.70059204101562, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": -6.799081325531006, "rewards/margins": 6.002202987670898, "rewards/rejected": -12.801284790039062, "step": 2340 }, { "epoch": 0.94, "grad_norm": 12.346515996086536, "learning_rate": 5.463099816548577e-09, "logits/chosen": 31.012216567993164, "logits/rejected": 35.09211349487305, "logps/chosen": -144.66915893554688, "logps/rejected": -219.4803009033203, "loss": 0.0968, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.998269557952881, "rewards/margins": 5.874829292297363, "rewards/rejected": -13.873098373413086, "step": 2350 }, { "epoch": 0.944, "grad_norm": 33.1665678427252, "learning_rate": 4.761211162702117e-09, "logits/chosen": 29.088903427124023, "logits/rejected": 34.32123947143555, "logps/chosen": -127.30374908447266, "logps/rejected": -211.43215942382812, "loss": 0.1457, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.1538214683532715, "rewards/margins": 6.435636043548584, "rewards/rejected": -13.589457511901855, "step": 2360 }, { "epoch": 0.948, "grad_norm": 23.425984241535982, "learning_rate": 4.107132296653548e-09, "logits/chosen": 28.269973754882812, "logits/rejected": 31.716510772705078, "logps/chosen": -125.16642761230469, "logps/rejected": -197.10952758789062, "loss": 0.1364, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.0634284019470215, "rewards/margins": 5.970315933227539, "rewards/rejected": -13.033744812011719, "step": 2370 }, { "epoch": 0.952, "grad_norm": 10.233047916013119, "learning_rate": 3.5009907323737818e-09, "logits/chosen": 32.33917999267578, "logits/rejected": 31.942947387695312, "logps/chosen": -150.05201721191406, "logps/rejected": -198.63037109375, "loss": 0.1165, "rewards/accuracies": 1.0, "rewards/chosen": -7.604508876800537, "rewards/margins": 5.146544933319092, "rewards/rejected": -12.751053810119629, "step": 2380 }, { "epoch": 0.956, "grad_norm": 43.68344404125229, "learning_rate": 2.9429046383618038e-09, "logits/chosen": 32.400047302246094, "logits/rejected": 32.68498992919922, "logps/chosen": -131.60537719726562, "logps/rejected": -197.2787628173828, "loss": 0.1184, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.099400520324707, "rewards/margins": 6.27431058883667, "rewards/rejected": -13.373710632324219, "step": 2390 }, { "epoch": 0.96, "grad_norm": 59.46460968091823, "learning_rate": 2.4329828146074096e-09, "logits/chosen": 30.623287200927734, "logits/rejected": 32.035682678222656, "logps/chosen": -131.70294189453125, "logps/rejected": -198.9145050048828, "loss": 0.099, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.071308135986328, "rewards/margins": 6.146972179412842, "rewards/rejected": -13.218279838562012, "step": 2400 }, { "epoch": 0.964, "grad_norm": 56.79097277412103, "learning_rate": 1.9713246713805587e-09, "logits/chosen": 31.465078353881836, "logits/rejected": 34.087913513183594, "logps/chosen": -137.09597778320312, "logps/rejected": -203.9413604736328, "loss": 0.1021, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.499648094177246, "rewards/margins": 5.972261905670166, "rewards/rejected": -13.47191047668457, "step": 2410 }, { "epoch": 0.968, "grad_norm": 0.8585461059925868, "learning_rate": 1.5580202098509076e-09, "logits/chosen": 29.60024070739746, "logits/rejected": 32.122642517089844, "logps/chosen": -138.75808715820312, "logps/rejected": -205.38681030273438, "loss": 0.1414, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.920256614685059, "rewards/margins": 5.451854228973389, "rewards/rejected": -13.372111320495605, "step": 2420 }, { "epoch": 0.972, "grad_norm": 6.596142925147252, "learning_rate": 1.1931500045422038e-09, "logits/chosen": 27.959314346313477, "logits/rejected": 32.65019226074219, "logps/chosen": -126.5806655883789, "logps/rejected": -199.76040649414062, "loss": 0.2261, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.968267917633057, "rewards/margins": 5.849569797515869, "rewards/rejected": -12.817837715148926, "step": 2430 }, { "epoch": 0.976, "grad_norm": 35.62534557996734, "learning_rate": 8.767851876239074e-10, "logits/chosen": 32.697261810302734, "logits/rejected": 35.91864013671875, "logps/chosen": -123.58503723144531, "logps/rejected": -206.4244842529297, "loss": 0.1628, "rewards/accuracies": 1.0, "rewards/chosen": -6.5748610496521, "rewards/margins": 6.952831268310547, "rewards/rejected": -13.527691841125488, "step": 2440 }, { "epoch": 0.98, "grad_norm": 27.658485623502656, "learning_rate": 6.089874350439505e-10, "logits/chosen": 31.490856170654297, "logits/rejected": 32.93403625488281, "logps/chosen": -138.20703125, "logps/rejected": -193.5495147705078, "loss": 0.216, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -7.787785530090332, "rewards/margins": 4.913016319274902, "rewards/rejected": -12.700800895690918, "step": 2450 }, { "epoch": 0.984, "grad_norm": 0.08797343919690617, "learning_rate": 3.898089545047445e-10, "logits/chosen": 31.071331024169922, "logits/rejected": 32.911354064941406, "logps/chosen": -130.91525268554688, "logps/rejected": -205.3039093017578, "loss": 0.0549, "rewards/accuracies": 1.0, "rewards/chosen": -7.0166826248168945, "rewards/margins": 6.3437819480896, "rewards/rejected": -13.360466003417969, "step": 2460 }, { "epoch": 0.988, "grad_norm": 1.2212789051560087, "learning_rate": 2.1929247528540418e-10, "logits/chosen": 28.965045928955078, "logits/rejected": 32.625064849853516, "logps/chosen": -127.5809326171875, "logps/rejected": -214.167724609375, "loss": 0.0704, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.630097389221191, "rewards/margins": 7.368367671966553, "rewards/rejected": -13.998464584350586, "step": 2470 }, { "epoch": 0.992, "grad_norm": 91.06898861755354, "learning_rate": 9.747123991141193e-11, "logits/chosen": 30.026744842529297, "logits/rejected": 32.432098388671875, "logps/chosen": -131.2405548095703, "logps/rejected": -194.84970092773438, "loss": 0.2616, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -7.285294532775879, "rewards/margins": 5.499347686767578, "rewards/rejected": -12.78464126586914, "step": 2480 }, { "epoch": 0.996, "grad_norm": 89.82748805014613, "learning_rate": 2.4368997673940294e-11, "logits/chosen": 29.70314598083496, "logits/rejected": 32.16979217529297, "logps/chosen": -126.3776626586914, "logps/rejected": -199.11517333984375, "loss": 0.174, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.552847385406494, "rewards/margins": 6.022560119628906, "rewards/rejected": -13.575407028198242, "step": 2490 }, { "epoch": 1.0, "grad_norm": 18.11358569727584, "learning_rate": 0.0, "logits/chosen": 30.229150772094727, "logits/rejected": 34.69512176513672, "logps/chosen": -133.08348083496094, "logps/rejected": -207.73001098632812, "loss": 0.1378, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.222729682922363, "rewards/margins": 5.665229320526123, "rewards/rejected": -12.887959480285645, "step": 2500 }, { "epoch": 1.0, "step": 2500, "total_flos": 0.0, "train_loss": 0.26106402876377105, "train_runtime": 17697.4864, "train_samples_per_second": 1.13, "train_steps_per_second": 0.141 } ], "logging_steps": 10, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }