{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9962157048249763, "eval_steps": 500, "global_step": 162, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 44.790242854161896, "learning_rate": 2.941176470588235e-08, "logits/chosen": 0.4138435125350952, "logits/rejected": 0.3073309361934662, "logps/chosen": -238.74684143066406, "logps/rejected": -277.3367919921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "grad_norm": 32.66922851838542, "learning_rate": 2.941176470588235e-07, "logits/chosen": 0.19251321256160736, "logits/rejected": 0.15595921874046326, "logps/chosen": -266.190673828125, "logps/rejected": -288.5094299316406, "loss": 0.6912, "rewards/accuracies": 0.5384615659713745, "rewards/chosen": -0.04362406209111214, "rewards/margins": 0.010117193683981895, "rewards/rejected": -0.05374125763773918, "step": 10 }, { "epoch": 0.12, "grad_norm": 34.2665441677138, "learning_rate": 4.99472085783721e-07, "logits/chosen": 0.4772653877735138, "logits/rejected": 0.34988027811050415, "logps/chosen": -288.5440368652344, "logps/rejected": -317.422607421875, "loss": 0.6966, "rewards/accuracies": 0.48846152424812317, "rewards/chosen": -0.29237106442451477, "rewards/margins": 0.041345465928316116, "rewards/rejected": -0.3337165117263794, "step": 20 }, { "epoch": 0.18, "grad_norm": 51.989784907303765, "learning_rate": 4.901488388458247e-07, "logits/chosen": 0.08469453454017639, "logits/rejected": 0.02361711673438549, "logps/chosen": -257.55908203125, "logps/rejected": -290.6418151855469, "loss": 0.6904, "rewards/accuracies": 0.5153846144676208, "rewards/chosen": 0.0216812863945961, "rewards/margins": 0.06838896870613098, "rewards/rejected": -0.04670768231153488, "step": 30 }, { "epoch": 0.25, "grad_norm": 48.07967092856285, "learning_rate": 4.695964991097616e-07, "logits/chosen": 0.5012978315353394, "logits/rejected": 0.3107348382472992, "logps/chosen": -293.4065856933594, "logps/rejected": -318.2868347167969, "loss": 0.6923, "rewards/accuracies": 0.5153846144676208, "rewards/chosen": -0.3082655370235443, "rewards/margins": 0.04468757286667824, "rewards/rejected": -0.35295310616493225, "step": 40 }, { "epoch": 0.31, "grad_norm": 50.8642865419175, "learning_rate": 4.3877607113930516e-07, "logits/chosen": 0.5789575576782227, "logits/rejected": 0.747968852519989, "logps/chosen": -291.36279296875, "logps/rejected": -306.2597961425781, "loss": 0.696, "rewards/accuracies": 0.5269230604171753, "rewards/chosen": -0.29860785603523254, "rewards/margins": 0.016954706981778145, "rewards/rejected": -0.31556254625320435, "step": 50 }, { "epoch": 0.37, "grad_norm": 42.68053454562829, "learning_rate": 3.991286838919086e-07, "logits/chosen": 0.5176121592521667, "logits/rejected": 0.48673737049102783, "logps/chosen": -285.60284423828125, "logps/rejected": -301.834228515625, "loss": 0.6896, "rewards/accuracies": 0.4961538314819336, "rewards/chosen": -0.20814552903175354, "rewards/margins": 0.02328580990433693, "rewards/rejected": -0.23143133521080017, "step": 60 }, { "epoch": 0.43, "grad_norm": 37.59954622299739, "learning_rate": 3.52508205130354e-07, "logits/chosen": 0.47016510367393494, "logits/rejected": 0.6350060105323792, "logps/chosen": -298.3149719238281, "logps/rejected": -311.8184509277344, "loss": 0.6954, "rewards/accuracies": 0.5423076748847961, "rewards/chosen": -0.33209383487701416, "rewards/margins": 0.01967799849808216, "rewards/rejected": -0.35177183151245117, "step": 70 }, { "epoch": 0.49, "grad_norm": 40.19774755409481, "learning_rate": 3.010945566265912e-07, "logits/chosen": 0.8041943311691284, "logits/rejected": 0.9286781549453735, "logps/chosen": -320.7852783203125, "logps/rejected": -339.48193359375, "loss": 0.6856, "rewards/accuracies": 0.557692289352417, "rewards/chosen": -0.5020374655723572, "rewards/margins": 0.013489325530827045, "rewards/rejected": -0.5155267715454102, "step": 80 }, { "epoch": 0.55, "grad_norm": 40.3298911710128, "learning_rate": 2.4729178344249006e-07, "logits/chosen": 0.526244044303894, "logits/rejected": 0.5612362027168274, "logps/chosen": -289.9747009277344, "logps/rejected": -304.9601135253906, "loss": 0.6927, "rewards/accuracies": 0.5230769515037537, "rewards/chosen": -0.2381971925497055, "rewards/margins": 0.04978089779615402, "rewards/rejected": -0.2879781126976013, "step": 90 }, { "epoch": 0.61, "grad_norm": 35.40524465285595, "learning_rate": 1.9361564345465145e-07, "logits/chosen": 0.3361697196960449, "logits/rejected": 0.5479218363761902, "logps/chosen": -272.99285888671875, "logps/rejected": -300.7643737792969, "loss": 0.6878, "rewards/accuracies": 0.5153846144676208, "rewards/chosen": -0.1911364644765854, "rewards/margins": 0.03422596678137779, "rewards/rejected": -0.22536242008209229, "step": 100 }, { "epoch": 0.68, "grad_norm": 38.98659700871813, "learning_rate": 1.4257597331216208e-07, "logits/chosen": 0.6074225902557373, "logits/rejected": 0.7285165786743164, "logps/chosen": -311.04827880859375, "logps/rejected": -335.56396484375, "loss": 0.685, "rewards/accuracies": 0.5423076748847961, "rewards/chosen": -0.4663804769515991, "rewards/margins": 0.05671105906367302, "rewards/rejected": -0.5230914950370789, "step": 110 }, { "epoch": 0.74, "grad_norm": 50.7969521151244, "learning_rate": 9.655933126436563e-08, "logits/chosen": 0.5686596035957336, "logits/rejected": 0.6905936002731323, "logps/chosen": -276.1257629394531, "logps/rejected": -292.55120849609375, "loss": 0.7065, "rewards/accuracies": 0.5538461804389954, "rewards/chosen": -0.14772899448871613, "rewards/margins": 0.05048359930515289, "rewards/rejected": -0.19821257889270782, "step": 120 }, { "epoch": 0.8, "grad_norm": 44.99508540990744, "learning_rate": 5.771740434959277e-08, "logits/chosen": 0.7891207337379456, "logits/rejected": 0.6963477730751038, "logps/chosen": -289.924072265625, "logps/rejected": -315.6805419921875, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.2571752965450287, "rewards/margins": 0.07165656983852386, "rewards/rejected": -0.32883188128471375, "step": 130 }, { "epoch": 0.86, "grad_norm": 41.07192587696782, "learning_rate": 2.7866397900677185e-08, "logits/chosen": 0.728725016117096, "logits/rejected": 0.6798302531242371, "logps/chosen": -313.1536865234375, "logps/rejected": -325.9017639160156, "loss": 0.686, "rewards/accuracies": 0.5423076748847961, "rewards/chosen": -0.5450281500816345, "rewards/margins": 0.03127431869506836, "rewards/rejected": -0.5763024687767029, "step": 140 }, { "epoch": 0.92, "grad_norm": 51.88495068918687, "learning_rate": 8.402111802159412e-09, "logits/chosen": 0.7722111344337463, "logits/rejected": 0.8273798227310181, "logps/chosen": -296.1044921875, "logps/rejected": -326.81695556640625, "loss": 0.6812, "rewards/accuracies": 0.607692301273346, "rewards/chosen": -0.4110731780529022, "rewards/margins": 0.07831522077322006, "rewards/rejected": -0.4893884062767029, "step": 150 }, { "epoch": 0.98, "grad_norm": 43.53404843175348, "learning_rate": 2.3467443900582197e-10, "logits/chosen": 0.9644160866737366, "logits/rejected": 1.0424695014953613, "logps/chosen": -288.78082275390625, "logps/rejected": -310.8374328613281, "loss": 0.685, "rewards/accuracies": 0.5346153974533081, "rewards/chosen": -0.3611108064651489, "rewards/margins": 0.10043878108263016, "rewards/rejected": -0.4615496098995209, "step": 160 }, { "epoch": 1.0, "step": 162, "total_flos": 0.0, "train_loss": 0.6911558170377472, "train_runtime": 23474.6733, "train_samples_per_second": 0.9, "train_steps_per_second": 0.007 } ], "logging_steps": 10, "max_steps": 162, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }