{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998691442030882, "eval_steps": 50, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "dpo_lambda": 1.0, "epoch": 0.002093692750588851, "grad_norm": 4.402319572917802, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -0.8043479323387146, "logits/rejected": -0.8551070690155029, "logps/chosen": -318.6319885253906, "logps/rejected": -337.8906555175781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "dpo_lambda": 0.9998951554298401, "epoch": 0.004187385501177702, "grad_norm": 4.793462944453188, "learning_rate": 2.083333333333333e-08, "logits/chosen": -0.7836206555366516, "logits/rejected": -0.9540650844573975, "logps/chosen": -330.71966552734375, "logps/rejected": -286.4294128417969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "dpo_lambda": 0.9997903108596802, "epoch": 0.006281078251766554, "grad_norm": 5.083035763031515, "learning_rate": 3.125e-08, "logits/chosen": -0.8143987655639648, "logits/rejected": -0.8199301958084106, "logps/chosen": -276.8949890136719, "logps/rejected": -268.3603820800781, "loss": 0.6932, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0008645334746688604, "rewards/margins": -0.001431022654287517, "rewards/rejected": 0.0005664890632033348, "step": 3 }, { "dpo_lambda": 0.9996854066848755, "epoch": 0.008374771002355404, "grad_norm": 5.902572234969009, "learning_rate": 4.166666666666666e-08, "logits/chosen": -0.8467217087745667, "logits/rejected": -0.8571422100067139, "logps/chosen": -300.71966552734375, "logps/rejected": -329.18927001953125, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 6.150537228677422e-05, "rewards/margins": -0.00012183257786091417, "rewards/rejected": 0.00018333786283619702, "step": 4 }, { "dpo_lambda": 0.9995808005332947, "epoch": 0.010468463752944255, "grad_norm": 5.030790216066496, "learning_rate": 5.208333333333333e-08, "logits/chosen": -0.854633092880249, "logits/rejected": -0.8405370116233826, "logps/chosen": -263.6219482421875, "logps/rejected": -256.4416198730469, "loss": 0.693, "rewards/accuracies": 0.40625, "rewards/chosen": -0.00030443898867815733, "rewards/margins": -0.0004514011961873621, "rewards/rejected": 0.00014696212019771338, "step": 5 }, { "dpo_lambda": 0.99947589635849, "epoch": 0.012562156503533107, "grad_norm": 6.449855736894302, "learning_rate": 6.25e-08, "logits/chosen": -0.848020076751709, "logits/rejected": -0.879482626914978, "logps/chosen": -288.12579345703125, "logps/rejected": -244.55783081054688, "loss": 0.6935, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0011313408613204956, "rewards/margins": -0.0024054457899183035, "rewards/rejected": 0.0012741051614284515, "step": 6 }, { "dpo_lambda": 0.9993710517883301, "epoch": 0.014655849254121958, "grad_norm": 4.4471205365033715, "learning_rate": 7.291666666666667e-08, "logits/chosen": -0.8126897215843201, "logits/rejected": -0.9293793439865112, "logps/chosen": -359.7424621582031, "logps/rejected": -305.7148742675781, "loss": 0.6926, "rewards/accuracies": 0.515625, "rewards/chosen": 0.00028104332159273326, "rewards/margins": 0.0005955130327492952, "rewards/rejected": -0.00031446965294890106, "step": 7 }, { "dpo_lambda": 0.9992662072181702, "epoch": 0.016749542004710807, "grad_norm": 5.6547119853423276, "learning_rate": 8.333333333333333e-08, "logits/chosen": -0.8336946964263916, "logits/rejected": -0.8718341588973999, "logps/chosen": -305.8148193359375, "logps/rejected": -274.6888732910156, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": -0.00020592098007909954, "rewards/margins": -0.00026029106811620295, "rewards/rejected": 5.437021900434047e-05, "step": 8 }, { "dpo_lambda": 0.9991613626480103, "epoch": 0.01884323475529966, "grad_norm": 3.9638195143574837, "learning_rate": 9.375e-08, "logits/chosen": -0.7655616402626038, "logits/rejected": -0.908790111541748, "logps/chosen": -326.245361328125, "logps/rejected": -278.6490478515625, "loss": 0.6933, "rewards/accuracies": 0.546875, "rewards/chosen": 0.0012860479764640331, "rewards/margins": 0.000617672863882035, "rewards/rejected": 0.000668375170789659, "step": 9 }, { "dpo_lambda": 0.9990566968917847, "epoch": 0.02093692750588851, "grad_norm": 4.403617727179671, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -0.7897341251373291, "logits/rejected": -0.853586733341217, "logps/chosen": -327.54168701171875, "logps/rejected": -333.5374450683594, "loss": 0.6934, "rewards/accuracies": 0.46875, "rewards/chosen": 0.00038820982445031404, "rewards/margins": -0.0005907297600060701, "rewards/rejected": 0.0009789395844563842, "step": 10 }, { "dpo_lambda": 0.9989518523216248, "epoch": 0.023030620256477362, "grad_norm": 7.796052749512685, "learning_rate": 1.1458333333333332e-07, "logits/chosen": -0.8637887239456177, "logits/rejected": -0.9242954254150391, "logps/chosen": -261.32867431640625, "logps/rejected": -249.37728881835938, "loss": 0.6933, "rewards/accuracies": 0.546875, "rewards/chosen": -0.00039038294926285744, "rewards/margins": -0.00036161605385132134, "rewards/rejected": -2.8766971809091046e-05, "step": 11 }, { "dpo_lambda": 0.9988469481468201, "epoch": 0.025124313007066214, "grad_norm": 3.8821871111757082, "learning_rate": 1.25e-07, "logits/chosen": -0.9050667881965637, "logits/rejected": -0.8800326585769653, "logps/chosen": -280.99847412109375, "logps/rejected": -275.5593566894531, "loss": 0.6929, "rewards/accuracies": 0.46875, "rewards/chosen": -0.001003009034320712, "rewards/margins": -0.00038465563557110727, "rewards/rejected": -0.0006183534860610962, "step": 12 }, { "dpo_lambda": 0.9987421631813049, "epoch": 0.027218005757655064, "grad_norm": 6.440110466185814, "learning_rate": 1.3541666666666666e-07, "logits/chosen": -0.826982855796814, "logits/rejected": -0.8742626309394836, "logps/chosen": -280.1139831542969, "logps/rejected": -244.7361297607422, "loss": 0.6932, "rewards/accuracies": 0.40625, "rewards/chosen": 0.00022524214000441134, "rewards/margins": -0.0009332930203527212, "rewards/rejected": 0.001158535131253302, "step": 13 }, { "dpo_lambda": 0.9986372590065002, "epoch": 0.029311698508243916, "grad_norm": 4.229417019989064, "learning_rate": 1.4583333333333335e-07, "logits/chosen": -0.8535632491111755, "logits/rejected": -0.8801539540290833, "logps/chosen": -328.8179016113281, "logps/rejected": -320.2232971191406, "loss": 0.6937, "rewards/accuracies": 0.40625, "rewards/chosen": -0.000532336242031306, "rewards/margins": -0.0019397324649617076, "rewards/rejected": 0.0014073961647227407, "step": 14 }, { "dpo_lambda": 0.9985324144363403, "epoch": 0.031405391258832765, "grad_norm": 4.467816070741843, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -0.8298668265342712, "logits/rejected": -0.9168381690979004, "logps/chosen": -303.4217224121094, "logps/rejected": -259.37548828125, "loss": 0.6932, "rewards/accuracies": 0.5625, "rewards/chosen": 6.753446359653026e-05, "rewards/margins": -0.0004937391495332122, "rewards/rejected": 0.0005612736567854881, "step": 15 }, { "dpo_lambda": 0.9984277486801147, "epoch": 0.033499084009421615, "grad_norm": 3.888254222848592, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -0.7577842473983765, "logits/rejected": -0.8261817693710327, "logps/chosen": -297.6632385253906, "logps/rejected": -301.09295654296875, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 1.160617102868855e-05, "rewards/margins": 0.00021146482322365046, "rewards/rejected": -0.00019985856488347054, "step": 16 }, { "dpo_lambda": 0.9983229041099548, "epoch": 0.03559277676001047, "grad_norm": 4.452768078866186, "learning_rate": 1.7708333333333334e-07, "logits/chosen": -0.868607759475708, "logits/rejected": -0.8844305276870728, "logps/chosen": -289.6638488769531, "logps/rejected": -255.01210021972656, "loss": 0.6932, "rewards/accuracies": 0.375, "rewards/chosen": 0.00011004415864590555, "rewards/margins": -0.0005320608615875244, "rewards/rejected": 0.0006421051220968366, "step": 17 }, { "dpo_lambda": 0.9982179999351501, "epoch": 0.03768646951059932, "grad_norm": 4.8480605497744, "learning_rate": 1.875e-07, "logits/chosen": -0.7690554261207581, "logits/rejected": -0.9109585285186768, "logps/chosen": -349.46014404296875, "logps/rejected": -280.4686584472656, "loss": 0.6929, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0018899872666224837, "rewards/margins": 0.001062125200405717, "rewards/rejected": 0.0008278620080091059, "step": 18 }, { "dpo_lambda": 0.998113214969635, "epoch": 0.03978016226118817, "grad_norm": 4.425003512881252, "learning_rate": 1.9791666666666664e-07, "logits/chosen": -0.7577191591262817, "logits/rejected": -0.8322458863258362, "logps/chosen": -324.3868713378906, "logps/rejected": -302.09149169921875, "loss": 0.6927, "rewards/accuracies": 0.53125, "rewards/chosen": 0.002668360248208046, "rewards/margins": 0.0012416491517797112, "rewards/rejected": 0.0014267113292589784, "step": 19 }, { "dpo_lambda": 0.9980083107948303, "epoch": 0.04187385501177702, "grad_norm": 4.846085404616861, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -0.8820050954818726, "logits/rejected": -0.9251440763473511, "logps/chosen": -268.20587158203125, "logps/rejected": -281.5534973144531, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 0.002546853618696332, "rewards/margins": 0.0012899779248982668, "rewards/rejected": 0.001256875810213387, "step": 20 }, { "dpo_lambda": 0.9979037046432495, "epoch": 0.043967547762365874, "grad_norm": 4.117368237143938, "learning_rate": 2.1875e-07, "logits/chosen": -0.8503252267837524, "logits/rejected": -0.92568039894104, "logps/chosen": -226.6487274169922, "logps/rejected": -226.0305633544922, "loss": 0.6927, "rewards/accuracies": 0.640625, "rewards/chosen": 0.002446984639391303, "rewards/margins": 0.0014305507065728307, "rewards/rejected": 0.0010164338164031506, "step": 21 }, { "dpo_lambda": 0.9977988004684448, "epoch": 0.046061240512954724, "grad_norm": 5.26507499436042, "learning_rate": 2.2916666666666663e-07, "logits/chosen": -0.7547565698623657, "logits/rejected": -0.7918416261672974, "logps/chosen": -299.7794189453125, "logps/rejected": -281.09222412109375, "loss": 0.6919, "rewards/accuracies": 0.65625, "rewards/chosen": 0.00438675656914711, "rewards/margins": 0.003029727144166827, "rewards/rejected": 0.0013570297742262483, "step": 22 }, { "dpo_lambda": 0.9976939558982849, "epoch": 0.04815493326354357, "grad_norm": 4.528414091774314, "learning_rate": 2.3958333333333335e-07, "logits/chosen": -0.8128759860992432, "logits/rejected": -0.8892397284507751, "logps/chosen": -342.2197265625, "logps/rejected": -282.14398193359375, "loss": 0.6922, "rewards/accuracies": 0.578125, "rewards/chosen": 0.0038836237508803606, "rewards/margins": 0.001989413285627961, "rewards/rejected": 0.0018942105816677213, "step": 23 }, { "dpo_lambda": 0.997589111328125, "epoch": 0.05024862601413243, "grad_norm": 4.040981518845972, "learning_rate": 2.5e-07, "logits/chosen": -0.743541955947876, "logits/rejected": -0.9808051586151123, "logps/chosen": -350.8001403808594, "logps/rejected": -279.0107727050781, "loss": 0.6921, "rewards/accuracies": 0.578125, "rewards/chosen": 0.005917892791330814, "rewards/margins": 0.002584913745522499, "rewards/rejected": 0.003332979278638959, "step": 24 }, { "dpo_lambda": 0.9974842667579651, "epoch": 0.05234231876472128, "grad_norm": 4.923955440464468, "learning_rate": 2.604166666666667e-07, "logits/chosen": -0.7823559045791626, "logits/rejected": -0.9089896082878113, "logps/chosen": -280.17523193359375, "logps/rejected": -243.14208984375, "loss": 0.6921, "rewards/accuracies": 0.65625, "rewards/chosen": 0.004923699423670769, "rewards/margins": 0.0027135859709233046, "rewards/rejected": 0.002210113452747464, "step": 25 }, { "dpo_lambda": 0.9973794221878052, "epoch": 0.05443601151531013, "grad_norm": 4.351217149891559, "learning_rate": 2.708333333333333e-07, "logits/chosen": -0.7708956003189087, "logits/rejected": -0.9697441458702087, "logps/chosen": -338.752197265625, "logps/rejected": -289.9631652832031, "loss": 0.6915, "rewards/accuracies": 0.6875, "rewards/chosen": 0.009810867719352245, "rewards/margins": 0.004754879977554083, "rewards/rejected": 0.005055988673120737, "step": 26 }, { "dpo_lambda": 0.9972745180130005, "epoch": 0.056529704265898977, "grad_norm": 4.473346542639472, "learning_rate": 2.8125e-07, "logits/chosen": -0.8319710493087769, "logits/rejected": -0.9578927755355835, "logps/chosen": -320.4068908691406, "logps/rejected": -257.923583984375, "loss": 0.692, "rewards/accuracies": 0.59375, "rewards/chosen": 0.009162155911326408, "rewards/margins": 0.0028106458485126495, "rewards/rejected": 0.006351510062813759, "step": 27 }, { "dpo_lambda": 0.9971698522567749, "epoch": 0.05862339701648783, "grad_norm": 4.3426457588367136, "learning_rate": 2.916666666666667e-07, "logits/chosen": -0.7956329584121704, "logits/rejected": -0.8427987694740295, "logps/chosen": -320.89215087890625, "logps/rejected": -300.1125793457031, "loss": 0.6913, "rewards/accuracies": 0.546875, "rewards/chosen": 0.01058600191026926, "rewards/margins": 0.00340261054225266, "rewards/rejected": 0.007183392066508532, "step": 28 }, { "dpo_lambda": 0.997065007686615, "epoch": 0.06071708976707668, "grad_norm": 4.085431965259117, "learning_rate": 3.020833333333333e-07, "logits/chosen": -0.726355254650116, "logits/rejected": -0.9321560859680176, "logps/chosen": -381.36444091796875, "logps/rejected": -298.1344299316406, "loss": 0.6919, "rewards/accuracies": 0.703125, "rewards/chosen": 0.013005386106669903, "rewards/margins": 0.00456521799787879, "rewards/rejected": 0.0084401685744524, "step": 29 }, { "dpo_lambda": 0.9969601631164551, "epoch": 0.06281078251766553, "grad_norm": 3.783478029121267, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -0.790072500705719, "logits/rejected": -0.8063573837280273, "logps/chosen": -345.5115661621094, "logps/rejected": -364.02142333984375, "loss": 0.6911, "rewards/accuracies": 0.609375, "rewards/chosen": 0.014460022561252117, "rewards/margins": 0.002242105081677437, "rewards/rejected": 0.012217918410897255, "step": 30 }, { "dpo_lambda": 0.9968553185462952, "epoch": 0.06490447526825438, "grad_norm": 4.725066162809079, "learning_rate": 3.2291666666666666e-07, "logits/chosen": -0.7989029884338379, "logits/rejected": -0.8742246031761169, "logps/chosen": -305.10986328125, "logps/rejected": -267.1070251464844, "loss": 0.6915, "rewards/accuracies": 0.578125, "rewards/chosen": 0.015996551141142845, "rewards/margins": 0.004079596605151892, "rewards/rejected": 0.011916955932974815, "step": 31 }, { "dpo_lambda": 0.9967504739761353, "epoch": 0.06699816801884323, "grad_norm": 6.263003607437198, "learning_rate": 3.333333333333333e-07, "logits/chosen": -0.7864735126495361, "logits/rejected": -0.7694317102432251, "logps/chosen": -314.7187194824219, "logps/rejected": -318.4085693359375, "loss": 0.6904, "rewards/accuracies": 0.640625, "rewards/chosen": 0.018498174846172333, "rewards/margins": 0.00419366592541337, "rewards/rejected": 0.01430450938642025, "step": 32 }, { "dpo_lambda": 0.9966458082199097, "epoch": 0.06909186076943209, "grad_norm": 4.996565408773174, "learning_rate": 3.4375e-07, "logits/chosen": -0.8627775311470032, "logits/rejected": -0.9619947075843811, "logps/chosen": -329.125732421875, "logps/rejected": -290.11798095703125, "loss": 0.689, "rewards/accuracies": 0.65625, "rewards/chosen": 0.022791242226958275, "rewards/margins": 0.008790804073214531, "rewards/rejected": 0.01400043722242117, "step": 33 }, { "dpo_lambda": 0.9965409636497498, "epoch": 0.07118555352002094, "grad_norm": 4.077713925438457, "learning_rate": 3.541666666666667e-07, "logits/chosen": -0.8728955984115601, "logits/rejected": -0.9118562340736389, "logps/chosen": -257.28759765625, "logps/rejected": -223.10824584960938, "loss": 0.6892, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02388455532491207, "rewards/margins": 0.009245344437658787, "rewards/rejected": 0.01463920809328556, "step": 34 }, { "dpo_lambda": 0.9964360594749451, "epoch": 0.07327924627060979, "grad_norm": 4.163606742901362, "learning_rate": 3.645833333333333e-07, "logits/chosen": -0.8726727366447449, "logits/rejected": -0.9065307378768921, "logps/chosen": -219.03260803222656, "logps/rejected": -223.0487518310547, "loss": 0.6897, "rewards/accuracies": 0.5625, "rewards/chosen": 0.023589186370372772, "rewards/margins": 0.004913577809929848, "rewards/rejected": 0.018675606697797775, "step": 35 }, { "dpo_lambda": 0.9963312149047852, "epoch": 0.07537293902119864, "grad_norm": 4.814967779155049, "learning_rate": 3.75e-07, "logits/chosen": -0.7583918571472168, "logits/rejected": -0.8758710622787476, "logps/chosen": -299.6081848144531, "logps/rejected": -260.1003723144531, "loss": 0.689, "rewards/accuracies": 0.6875, "rewards/chosen": 0.030413059517741203, "rewards/margins": 0.009431293234229088, "rewards/rejected": 0.020981768146157265, "step": 36 }, { "dpo_lambda": 0.9962263703346252, "epoch": 0.07746663177178749, "grad_norm": 6.11369907380103, "learning_rate": 3.8541666666666665e-07, "logits/chosen": -0.8049490451812744, "logits/rejected": -0.8439275622367859, "logps/chosen": -332.357421875, "logps/rejected": -262.38671875, "loss": 0.6875, "rewards/accuracies": 0.71875, "rewards/chosen": 0.036942947655916214, "rewards/margins": 0.01394661981612444, "rewards/rejected": 0.0229963306337595, "step": 37 }, { "dpo_lambda": 0.9961215257644653, "epoch": 0.07956032452237634, "grad_norm": 4.137117122207877, "learning_rate": 3.958333333333333e-07, "logits/chosen": -0.8226761817932129, "logits/rejected": -0.8536137342453003, "logps/chosen": -258.0992126464844, "logps/rejected": -246.155029296875, "loss": 0.687, "rewards/accuracies": 0.765625, "rewards/chosen": 0.03529813885688782, "rewards/margins": 0.0135354558005929, "rewards/rejected": 0.021762683987617493, "step": 38 }, { "dpo_lambda": 0.9960166215896606, "epoch": 0.08165401727296519, "grad_norm": 4.21153020487944, "learning_rate": 4.0625e-07, "logits/chosen": -0.7359837293624878, "logits/rejected": -0.8013171553611755, "logps/chosen": -321.1292724609375, "logps/rejected": -266.85919189453125, "loss": 0.6849, "rewards/accuracies": 0.75, "rewards/chosen": 0.040478628128767014, "rewards/margins": 0.014545298181474209, "rewards/rejected": 0.02593333274126053, "step": 39 }, { "dpo_lambda": 0.9959120154380798, "epoch": 0.08374771002355404, "grad_norm": 4.6715375481143795, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.6943680047988892, "logits/rejected": -0.7993655204772949, "logps/chosen": -294.47052001953125, "logps/rejected": -283.4633483886719, "loss": 0.6883, "rewards/accuracies": 0.59375, "rewards/chosen": 0.04448702558875084, "rewards/margins": 0.008819807320833206, "rewards/rejected": 0.03566721826791763, "step": 40 }, { "dpo_lambda": 0.9958071112632751, "epoch": 0.0858414027741429, "grad_norm": 3.8258723543140745, "learning_rate": 4.270833333333333e-07, "logits/chosen": -0.7413933277130127, "logits/rejected": -0.8862082958221436, "logps/chosen": -313.7814025878906, "logps/rejected": -279.2095642089844, "loss": 0.6877, "rewards/accuracies": 0.78125, "rewards/chosen": 0.04902862012386322, "rewards/margins": 0.0171800684183836, "rewards/rejected": 0.03184855356812477, "step": 41 }, { "dpo_lambda": 0.99570232629776, "epoch": 0.08793509552473175, "grad_norm": 4.072117115897332, "learning_rate": 4.375e-07, "logits/chosen": -0.8718756437301636, "logits/rejected": -0.8905589580535889, "logps/chosen": -289.25311279296875, "logps/rejected": -283.2568664550781, "loss": 0.687, "rewards/accuracies": 0.625, "rewards/chosen": 0.04595724865794182, "rewards/margins": 0.012137094512581825, "rewards/rejected": 0.03382015600800514, "step": 42 }, { "dpo_lambda": 0.9955974221229553, "epoch": 0.0900287882753206, "grad_norm": 7.88653117810627, "learning_rate": 4.479166666666667e-07, "logits/chosen": -0.8274166584014893, "logits/rejected": -0.9210019707679749, "logps/chosen": -324.5426940917969, "logps/rejected": -277.363037109375, "loss": 0.6838, "rewards/accuracies": 0.703125, "rewards/chosen": 0.058937303721904755, "rewards/margins": 0.025202667340636253, "rewards/rejected": 0.03373463824391365, "step": 43 }, { "dpo_lambda": 0.9954925775527954, "epoch": 0.09212248102590945, "grad_norm": 5.275783711822973, "learning_rate": 4.5833333333333327e-07, "logits/chosen": -0.7992769479751587, "logits/rejected": -0.8615429997444153, "logps/chosen": -292.977294921875, "logps/rejected": -282.3927001953125, "loss": 0.6842, "rewards/accuracies": 0.71875, "rewards/chosen": 0.05305924639105797, "rewards/margins": 0.024420510977506638, "rewards/rejected": 0.028638731688261032, "step": 44 }, { "dpo_lambda": 0.9953879117965698, "epoch": 0.0942161737764983, "grad_norm": 3.3397512285849142, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -0.8438299894332886, "logits/rejected": -0.8847188949584961, "logps/chosen": -263.0268249511719, "logps/rejected": -230.08824157714844, "loss": 0.6867, "rewards/accuracies": 0.640625, "rewards/chosen": 0.051549993455410004, "rewards/margins": 0.01370037067681551, "rewards/rejected": 0.03784961998462677, "step": 45 }, { "dpo_lambda": 0.9952830672264099, "epoch": 0.09630986652708715, "grad_norm": 4.375598831962758, "learning_rate": 4.791666666666667e-07, "logits/chosen": -0.8429378867149353, "logits/rejected": -0.8720081448554993, "logps/chosen": -290.9970703125, "logps/rejected": -254.0476531982422, "loss": 0.6796, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05708230659365654, "rewards/margins": 0.023602774366736412, "rewards/rejected": 0.03347952663898468, "step": 46 }, { "dpo_lambda": 0.99517822265625, "epoch": 0.098403559277676, "grad_norm": 7.220768015853875, "learning_rate": 4.895833333333333e-07, "logits/chosen": -0.8145819902420044, "logits/rejected": -0.9115846753120422, "logps/chosen": -321.8493347167969, "logps/rejected": -276.99609375, "loss": 0.6808, "rewards/accuracies": 0.625, "rewards/chosen": 0.07153000682592392, "rewards/margins": 0.023595348000526428, "rewards/rejected": 0.04793466255068779, "step": 47 }, { "dpo_lambda": 0.9950733780860901, "epoch": 0.10049725202826486, "grad_norm": 4.4604551692806655, "learning_rate": 5e-07, "logits/chosen": -0.7160397171974182, "logits/rejected": -0.8093741536140442, "logps/chosen": -279.6042175292969, "logps/rejected": -284.2084655761719, "loss": 0.678, "rewards/accuracies": 0.765625, "rewards/chosen": 0.058196570724248886, "rewards/margins": 0.029132841154932976, "rewards/rejected": 0.029063725844025612, "step": 48 }, { "dpo_lambda": 0.9949684739112854, "epoch": 0.10259094477885371, "grad_norm": 4.361132421326402, "learning_rate": 4.999932966293553e-07, "logits/chosen": -0.7594467401504517, "logits/rejected": -0.8968175649642944, "logps/chosen": -292.1831970214844, "logps/rejected": -258.41021728515625, "loss": 0.6795, "rewards/accuracies": 0.734375, "rewards/chosen": 0.07236772030591965, "rewards/margins": 0.03399732708930969, "rewards/rejected": 0.03837038576602936, "step": 49 }, { "dpo_lambda": 0.9948636293411255, "epoch": 0.10468463752944256, "grad_norm": 3.9072947893099683, "learning_rate": 4.999731868769026e-07, "logits/chosen": -0.9222686886787415, "logits/rejected": -0.9353721737861633, "logps/chosen": -304.7239990234375, "logps/rejected": -269.6137390136719, "loss": 0.6826, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0706743523478508, "rewards/margins": 0.01939474046230316, "rewards/rejected": 0.05127961188554764, "step": 50 }, { "epoch": 0.10468463752944256, "eval_dpo_lambda": 0.9947588443756104, "eval_logits/chosen": -0.8412269949913025, "eval_logits/rejected": -0.9094433188438416, "eval_logps/chosen": -293.9556884765625, "eval_logps/rejected": -267.0430908203125, "eval_loss": 0.6803367733955383, "eval_rewards/accuracies": 0.6690000295639038, "eval_rewards/chosen": 0.06691199541091919, "eval_rewards/margins": 0.027035199105739594, "eval_rewards/rejected": 0.0398767925798893, "eval_runtime": 566.7091, "eval_samples_per_second": 3.529, "eval_steps_per_second": 0.882, "step": 50 }, { "dpo_lambda": 0.9947589635848999, "epoch": 0.1067783302800314, "grad_norm": 3.4690982750836916, "learning_rate": 4.99939671821067e-07, "logits/chosen": -0.8632264137268066, "logits/rejected": -0.9709175229072571, "logps/chosen": -291.12359619140625, "logps/rejected": -259.31390380859375, "loss": 0.6863, "rewards/accuracies": 0.640625, "rewards/chosen": 0.06085498631000519, "rewards/margins": 0.01772325485944748, "rewards/rejected": 0.043131738901138306, "step": 51 }, { "dpo_lambda": 0.99465411901474, "epoch": 0.10887202303062025, "grad_norm": 3.933263008508187, "learning_rate": 4.998927532591591e-07, "logits/chosen": -0.7605027556419373, "logits/rejected": -0.8193937540054321, "logps/chosen": -303.9796142578125, "logps/rejected": -279.7144470214844, "loss": 0.6833, "rewards/accuracies": 0.609375, "rewards/chosen": 0.06420192122459412, "rewards/margins": 0.015458992682397366, "rewards/rejected": 0.048742931336164474, "step": 52 }, { "dpo_lambda": 0.9945492744445801, "epoch": 0.1109657157812091, "grad_norm": 4.1504325386412395, "learning_rate": 4.998324337072792e-07, "logits/chosen": -0.8832409381866455, "logits/rejected": -0.9020228385925293, "logps/chosen": -230.05926513671875, "logps/rejected": -230.46888732910156, "loss": 0.6786, "rewards/accuracies": 0.75, "rewards/chosen": 0.06142224371433258, "rewards/margins": 0.030234448611736298, "rewards/rejected": 0.03118780255317688, "step": 53 }, { "dpo_lambda": 0.9944444298744202, "epoch": 0.11305940853179795, "grad_norm": 4.022336476891531, "learning_rate": 4.997587164001815e-07, "logits/chosen": -0.9572932720184326, "logits/rejected": -0.8822497129440308, "logps/chosen": -312.7522277832031, "logps/rejected": -292.34979248046875, "loss": 0.6729, "rewards/accuracies": 0.75, "rewards/chosen": 0.0748954638838768, "rewards/margins": 0.05051225423812866, "rewards/rejected": 0.02438320592045784, "step": 54 }, { "dpo_lambda": 0.9943395853042603, "epoch": 0.11515310128238682, "grad_norm": 4.085909737303222, "learning_rate": 4.996716052911017e-07, "logits/chosen": -0.798669159412384, "logits/rejected": -0.8256829380989075, "logps/chosen": -270.40679931640625, "logps/rejected": -253.59112548828125, "loss": 0.677, "rewards/accuracies": 0.796875, "rewards/chosen": 0.07010850310325623, "rewards/margins": 0.03986787796020508, "rewards/rejected": 0.030240625143051147, "step": 55 }, { "dpo_lambda": 0.9942349195480347, "epoch": 0.11724679403297567, "grad_norm": 4.021809912258076, "learning_rate": 4.99571105051544e-07, "logits/chosen": -0.8572608232498169, "logits/rejected": -0.9068230390548706, "logps/chosen": -269.0156555175781, "logps/rejected": -263.5270690917969, "loss": 0.6726, "rewards/accuracies": 0.640625, "rewards/chosen": 0.0742337703704834, "rewards/margins": 0.03828868642449379, "rewards/rejected": 0.03594507277011871, "step": 56 }, { "dpo_lambda": 0.99413001537323, "epoch": 0.11934048678356451, "grad_norm": 4.4423528608842995, "learning_rate": 4.994572210710314e-07, "logits/chosen": -0.8098608255386353, "logits/rejected": -0.9324737787246704, "logps/chosen": -323.1205749511719, "logps/rejected": -322.8743896484375, "loss": 0.6707, "rewards/accuracies": 0.796875, "rewards/chosen": 0.07381981611251831, "rewards/margins": 0.053245287388563156, "rewards/rejected": 0.020574528723955154, "step": 57 }, { "dpo_lambda": 0.9940251708030701, "epoch": 0.12143417953415336, "grad_norm": 6.2104322840923984, "learning_rate": 4.993299594568162e-07, "logits/chosen": -0.7725510001182556, "logits/rejected": -0.882312536239624, "logps/chosen": -274.6170654296875, "logps/rejected": -282.4775390625, "loss": 0.669, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0651557445526123, "rewards/margins": 0.04476066306233406, "rewards/rejected": 0.020395075902342796, "step": 58 }, { "dpo_lambda": 0.9939203262329102, "epoch": 0.12352787228474221, "grad_norm": 4.039407324119509, "learning_rate": 4.991893270335525e-07, "logits/chosen": -0.8754050731658936, "logits/rejected": -0.9039373397827148, "logps/chosen": -300.01531982421875, "logps/rejected": -246.3870086669922, "loss": 0.6774, "rewards/accuracies": 0.703125, "rewards/chosen": 0.0613943375647068, "rewards/margins": 0.04010332375764847, "rewards/rejected": 0.021291015669703484, "step": 59 }, { "dpo_lambda": 0.9938154816627502, "epoch": 0.12562156503533106, "grad_norm": 4.220351838843851, "learning_rate": 4.990353313429303e-07, "logits/chosen": -0.8983861804008484, "logits/rejected": -0.9741111397743225, "logps/chosen": -293.3563537597656, "logps/rejected": -272.404052734375, "loss": 0.6698, "rewards/accuracies": 0.625, "rewards/chosen": 0.050274576991796494, "rewards/margins": 0.04937704652547836, "rewards/rejected": 0.0008975326200015843, "step": 60 }, { "dpo_lambda": 0.9937106370925903, "epoch": 0.1277152577859199, "grad_norm": 4.830185238287808, "learning_rate": 4.988679806432711e-07, "logits/chosen": -0.8061741590499878, "logits/rejected": -0.8219490051269531, "logps/chosen": -333.5423278808594, "logps/rejected": -304.4486083984375, "loss": 0.6691, "rewards/accuracies": 0.765625, "rewards/chosen": 0.04406392201781273, "rewards/margins": 0.0422632060945034, "rewards/rejected": 0.0018007168546319008, "step": 61 }, { "dpo_lambda": 0.9936057329177856, "epoch": 0.12980895053650876, "grad_norm": 13.0411543579937, "learning_rate": 4.986872839090852e-07, "logits/chosen": -0.9093612432479858, "logits/rejected": -0.987296998500824, "logps/chosen": -323.12005615234375, "logps/rejected": -277.0774841308594, "loss": 0.6655, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05420689284801483, "rewards/margins": 0.06399476528167725, "rewards/rejected": -0.009787879884243011, "step": 62 }, { "dpo_lambda": 0.9935011267662048, "epoch": 0.1319026432870976, "grad_norm": 7.307100303338513, "learning_rate": 4.9849325083059e-07, "logits/chosen": -0.9184948205947876, "logits/rejected": -0.9709855914115906, "logps/chosen": -288.5375061035156, "logps/rejected": -257.5970458984375, "loss": 0.6596, "rewards/accuracies": 0.75, "rewards/chosen": 0.03467197343707085, "rewards/margins": 0.061485495418310165, "rewards/rejected": -0.02681351825594902, "step": 63 }, { "dpo_lambda": 0.9933962225914001, "epoch": 0.13399633603768646, "grad_norm": 3.6883053376031145, "learning_rate": 4.982858918131906e-07, "logits/chosen": -0.8752248287200928, "logits/rejected": -0.918498158454895, "logps/chosen": -278.55010986328125, "logps/rejected": -274.21343994140625, "loss": 0.675, "rewards/accuracies": 0.609375, "rewards/chosen": 0.004433615133166313, "rewards/margins": 0.035762809216976166, "rewards/rejected": -0.031329195946455, "step": 64 }, { "dpo_lambda": 0.9932913780212402, "epoch": 0.1360900287882753, "grad_norm": 8.921064153450724, "learning_rate": 4.980652179769217e-07, "logits/chosen": -0.8184975385665894, "logits/rejected": -0.8712335824966431, "logps/chosen": -291.94488525390625, "logps/rejected": -271.47967529296875, "loss": 0.6645, "rewards/accuracies": 0.65625, "rewards/chosen": 0.01280883327126503, "rewards/margins": 0.057765569537878036, "rewards/rejected": -0.04495673626661301, "step": 65 }, { "dpo_lambda": 0.9931865334510803, "epoch": 0.13818372153886418, "grad_norm": 6.714346880863985, "learning_rate": 4.978312411558517e-07, "logits/chosen": -0.8184133172035217, "logits/rejected": -0.8564844727516174, "logps/chosen": -355.27081298828125, "logps/rejected": -319.2922058105469, "loss": 0.6674, "rewards/accuracies": 0.734375, "rewards/chosen": 0.02395615540444851, "rewards/margins": 0.07637568563222885, "rewards/rejected": -0.05241953209042549, "step": 66 }, { "dpo_lambda": 0.9930816888809204, "epoch": 0.14027741428945303, "grad_norm": 10.679575076032895, "learning_rate": 4.975839738974473e-07, "logits/chosen": -0.7999946475028992, "logits/rejected": -0.8521921634674072, "logps/chosen": -330.9016418457031, "logps/rejected": -294.4024963378906, "loss": 0.6446, "rewards/accuracies": 0.65625, "rewards/chosen": 0.010321049951016903, "rewards/margins": 0.07487591356039047, "rewards/rejected": -0.0645548552274704, "step": 67 }, { "dpo_lambda": 0.9929770231246948, "epoch": 0.14237110704004188, "grad_norm": 7.875861769063148, "learning_rate": 4.97323429461901e-07, "logits/chosen": -0.8829125165939331, "logits/rejected": -0.9044405817985535, "logps/chosen": -282.6078796386719, "logps/rejected": -283.01708984375, "loss": 0.6603, "rewards/accuracies": 0.703125, "rewards/chosen": -0.018198691308498383, "rewards/margins": 0.04956141486763954, "rewards/rejected": -0.06776010245084763, "step": 68 }, { "dpo_lambda": 0.9928721785545349, "epoch": 0.14446479979063073, "grad_norm": 4.618590469666639, "learning_rate": 4.970496218214204e-07, "logits/chosen": -0.8894454836845398, "logits/rejected": -0.8717415928840637, "logps/chosen": -294.20208740234375, "logps/rejected": -314.8604431152344, "loss": 0.6688, "rewards/accuracies": 0.6875, "rewards/chosen": -0.017065059393644333, "rewards/margins": 0.0473010316491127, "rewards/rejected": -0.06436608731746674, "step": 69 }, { "dpo_lambda": 0.9927672743797302, "epoch": 0.14655849254121958, "grad_norm": 6.326104311922183, "learning_rate": 4.967625656594781e-07, "logits/chosen": -0.8711340427398682, "logits/rejected": -0.9299571514129639, "logps/chosen": -270.4837646484375, "logps/rejected": -288.94024658203125, "loss": 0.666, "rewards/accuracies": 0.71875, "rewards/chosen": -0.007623748853802681, "rewards/margins": 0.05371168255805969, "rewards/rejected": -0.061335429549217224, "step": 70 }, { "dpo_lambda": 0.9926624894142151, "epoch": 0.14865218529180843, "grad_norm": 5.426572331511348, "learning_rate": 4.964622763700252e-07, "logits/chosen": -0.9065719842910767, "logits/rejected": -0.937716007232666, "logps/chosen": -359.8345947265625, "logps/rejected": -333.8294372558594, "loss": 0.6646, "rewards/accuracies": 0.671875, "rewards/chosen": -0.006900262087583542, "rewards/margins": 0.06472197920084, "rewards/rejected": -0.07162223756313324, "step": 71 }, { "dpo_lambda": 0.9925575852394104, "epoch": 0.15074587804239728, "grad_norm": 5.887744964119917, "learning_rate": 4.961487700566646e-07, "logits/chosen": -0.8798828721046448, "logits/rejected": -0.9241186380386353, "logps/chosen": -349.4935607910156, "logps/rejected": -300.86651611328125, "loss": 0.6537, "rewards/accuracies": 0.671875, "rewards/chosen": 0.019017428159713745, "rewards/margins": 0.08565981686115265, "rewards/rejected": -0.0666423887014389, "step": 72 }, { "dpo_lambda": 0.9924527406692505, "epoch": 0.15283957079298613, "grad_norm": 5.112043217977206, "learning_rate": 4.958220635317885e-07, "logits/chosen": -0.874078094959259, "logits/rejected": -0.9611667990684509, "logps/chosen": -358.426513671875, "logps/rejected": -303.3841552734375, "loss": 0.6627, "rewards/accuracies": 0.640625, "rewards/chosen": -0.017709314823150635, "rewards/margins": 0.08091504871845245, "rewards/rejected": -0.09862436354160309, "step": 73 }, { "dpo_lambda": 0.9923480749130249, "epoch": 0.15493326354357498, "grad_norm": 4.166549228246831, "learning_rate": 4.954821743156767e-07, "logits/chosen": -0.9340280890464783, "logits/rejected": -0.8813725113868713, "logps/chosen": -250.7559814453125, "logps/rejected": -250.04638671875, "loss": 0.6608, "rewards/accuracies": 0.671875, "rewards/chosen": -0.008466021157801151, "rewards/margins": 0.06686487793922424, "rewards/rejected": -0.07533089816570282, "step": 74 }, { "dpo_lambda": 0.992243230342865, "epoch": 0.15702695629416383, "grad_norm": 4.579667286901192, "learning_rate": 4.951291206355559e-07, "logits/chosen": -0.8417796492576599, "logits/rejected": -0.8593603372573853, "logps/chosen": -278.1746826171875, "logps/rejected": -288.68060302734375, "loss": 0.6551, "rewards/accuracies": 0.78125, "rewards/chosen": -0.008493858389556408, "rewards/margins": 0.0969361662864685, "rewards/rejected": -0.10543002933263779, "step": 75 }, { "dpo_lambda": 0.9921383857727051, "epoch": 0.15912064904475268, "grad_norm": 4.441173202852156, "learning_rate": 4.947629214246236e-07, "logits/chosen": -0.824394166469574, "logits/rejected": -0.9439455270767212, "logps/chosen": -264.31573486328125, "logps/rejected": -238.005615234375, "loss": 0.6518, "rewards/accuracies": 0.75, "rewards/chosen": 0.01480570062994957, "rewards/margins": 0.118388332426548, "rewards/rejected": -0.10358262807130814, "step": 76 }, { "dpo_lambda": 0.9920335412025452, "epoch": 0.16121434179534153, "grad_norm": 4.800895103736595, "learning_rate": 4.943835963210323e-07, "logits/chosen": -0.9381400942802429, "logits/rejected": -0.9885379672050476, "logps/chosen": -291.7290344238281, "logps/rejected": -253.0125274658203, "loss": 0.6549, "rewards/accuracies": 0.75, "rewards/chosen": -0.015666170045733452, "rewards/margins": 0.09368825703859329, "rewards/rejected": -0.1093544289469719, "step": 77 }, { "dpo_lambda": 0.9919286370277405, "epoch": 0.16330803454593038, "grad_norm": 6.9168703022954725, "learning_rate": 4.939911656668361e-07, "logits/chosen": -0.8449192047119141, "logits/rejected": -0.9392167329788208, "logps/chosen": -314.6385803222656, "logps/rejected": -261.46875, "loss": 0.6502, "rewards/accuracies": 0.71875, "rewards/chosen": 0.029134994372725487, "rewards/margins": 0.11234299838542938, "rewards/rejected": -0.08320800215005875, "step": 78 }, { "dpo_lambda": 0.9918240308761597, "epoch": 0.16540172729651922, "grad_norm": 7.545177066125938, "learning_rate": 4.935856505068998e-07, "logits/chosen": -0.9078419804573059, "logits/rejected": -1.0092755556106567, "logps/chosen": -275.2136535644531, "logps/rejected": -308.03533935546875, "loss": 0.6369, "rewards/accuracies": 0.859375, "rewards/chosen": 0.024624880403280258, "rewards/margins": 0.13832628726959229, "rewards/rejected": -0.11370141804218292, "step": 79 }, { "dpo_lambda": 0.991719126701355, "epoch": 0.16749542004710807, "grad_norm": 4.95330943717671, "learning_rate": 4.93167072587771e-07, "logits/chosen": -0.8874943256378174, "logits/rejected": -0.9544984102249146, "logps/chosen": -335.6016540527344, "logps/rejected": -310.4072570800781, "loss": 0.6488, "rewards/accuracies": 0.703125, "rewards/chosen": -0.06745388358831406, "rewards/margins": 0.10092146694660187, "rewards/rejected": -0.16837534308433533, "step": 80 }, { "dpo_lambda": 0.9916142821311951, "epoch": 0.16958911279769695, "grad_norm": 5.518201533595615, "learning_rate": 4.92735454356513e-07, "logits/chosen": -0.9439139366149902, "logits/rejected": -0.9663654565811157, "logps/chosen": -269.6962585449219, "logps/rejected": -272.7530212402344, "loss": 0.6392, "rewards/accuracies": 0.75, "rewards/chosen": -0.0002646269276738167, "rewards/margins": 0.1335701048374176, "rewards/rejected": -0.13383471965789795, "step": 81 }, { "dpo_lambda": 0.9915094375610352, "epoch": 0.1716828055482858, "grad_norm": 6.030336679079051, "learning_rate": 4.922908189595017e-07, "logits/chosen": -0.9288738369941711, "logits/rejected": -0.9926230311393738, "logps/chosen": -279.9482421875, "logps/rejected": -245.69894409179688, "loss": 0.6471, "rewards/accuracies": 0.75, "rewards/chosen": 0.009446481242775917, "rewards/margins": 0.1037771925330162, "rewards/rejected": -0.09433071315288544, "step": 82 }, { "dpo_lambda": 0.9914045929908752, "epoch": 0.17377649829887465, "grad_norm": 6.812109507351604, "learning_rate": 4.918331902411841e-07, "logits/chosen": -0.8729565143585205, "logits/rejected": -0.9138444662094116, "logps/chosen": -292.5542907714844, "logps/rejected": -281.9892272949219, "loss": 0.6523, "rewards/accuracies": 0.640625, "rewards/chosen": -0.008294099941849709, "rewards/margins": 0.10660440474748611, "rewards/rejected": -0.11489850282669067, "step": 83 }, { "dpo_lambda": 0.9912997484207153, "epoch": 0.1758701910494635, "grad_norm": 7.516475417347365, "learning_rate": 4.913625927427995e-07, "logits/chosen": -0.9265223741531372, "logits/rejected": -0.9852555394172668, "logps/chosen": -332.3023681640625, "logps/rejected": -303.4096984863281, "loss": 0.625, "rewards/accuracies": 0.75, "rewards/chosen": -0.007811751216650009, "rewards/margins": 0.1576412171125412, "rewards/rejected": -0.1654529720544815, "step": 84 }, { "dpo_lambda": 0.9911948442459106, "epoch": 0.17796388380005235, "grad_norm": 5.373997547948311, "learning_rate": 4.908790517010636e-07, "logits/chosen": -0.9465526342391968, "logits/rejected": -0.9650004506111145, "logps/chosen": -305.8551330566406, "logps/rejected": -287.52679443359375, "loss": 0.6332, "rewards/accuracies": 0.734375, "rewards/chosen": -0.002578308340162039, "rewards/margins": 0.12680523097515106, "rewards/rejected": -0.12938354909420013, "step": 85 }, { "dpo_lambda": 0.9910901784896851, "epoch": 0.1800575765506412, "grad_norm": 5.931519307525962, "learning_rate": 4.903825930468148e-07, "logits/chosen": -0.8247543573379517, "logits/rejected": -0.9537917375564575, "logps/chosen": -288.71148681640625, "logps/rejected": -268.2547302246094, "loss": 0.623, "rewards/accuracies": 0.84375, "rewards/chosen": -0.024831930175423622, "rewards/margins": 0.19303402304649353, "rewards/rejected": -0.2178659439086914, "step": 86 }, { "dpo_lambda": 0.9909853339195251, "epoch": 0.18215126930123005, "grad_norm": 6.129440307717882, "learning_rate": 4.898732434036243e-07, "logits/chosen": -0.8919886350631714, "logits/rejected": -0.9316573143005371, "logps/chosen": -299.3219909667969, "logps/rejected": -325.168212890625, "loss": 0.6235, "rewards/accuracies": 0.734375, "rewards/chosen": -0.023229926824569702, "rewards/margins": 0.1745581179857254, "rewards/rejected": -0.1977880299091339, "step": 87 }, { "dpo_lambda": 0.9908804893493652, "epoch": 0.1842449620518189, "grad_norm": 21.191472422356895, "learning_rate": 4.893510300863676e-07, "logits/chosen": -0.8724699020385742, "logits/rejected": -0.894172191619873, "logps/chosen": -249.15235900878906, "logps/rejected": -276.5699768066406, "loss": 0.6509, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10194307565689087, "rewards/margins": 0.10438381880521774, "rewards/rejected": -0.2063269019126892, "step": 88 }, { "dpo_lambda": 0.9907756447792053, "epoch": 0.18633865480240774, "grad_norm": 9.630094496567592, "learning_rate": 4.8881598109976e-07, "logits/chosen": -0.8519224524497986, "logits/rejected": -0.9527184367179871, "logps/chosen": -374.3948974609375, "logps/rejected": -337.03668212890625, "loss": 0.6426, "rewards/accuracies": 0.703125, "rewards/chosen": -0.08539784699678421, "rewards/margins": 0.12617787718772888, "rewards/rejected": -0.2115756869316101, "step": 89 }, { "dpo_lambda": 0.9906708002090454, "epoch": 0.1884323475529966, "grad_norm": 6.3578631755387525, "learning_rate": 4.882681251368548e-07, "logits/chosen": -0.8806661367416382, "logits/rejected": -0.8995652794837952, "logps/chosen": -294.17474365234375, "logps/rejected": -294.7281494140625, "loss": 0.6287, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12962456047534943, "rewards/margins": 0.16316621005535126, "rewards/rejected": -0.29279080033302307, "step": 90 }, { "dpo_lambda": 0.9905661344528198, "epoch": 0.19052604030358544, "grad_norm": 6.600987454158478, "learning_rate": 4.877074915775048e-07, "logits/chosen": -0.8745786547660828, "logits/rejected": -0.8542971611022949, "logps/chosen": -327.0505065917969, "logps/rejected": -289.8307800292969, "loss": 0.6266, "rewards/accuracies": 0.71875, "rewards/chosen": -0.08782470226287842, "rewards/margins": 0.18679843842983246, "rewards/rejected": -0.2746231257915497, "step": 91 }, { "dpo_lambda": 0.9904612898826599, "epoch": 0.1926197330541743, "grad_norm": 8.081977238915549, "learning_rate": 4.871341104867864e-07, "logits/chosen": -0.8323448896408081, "logits/rejected": -0.9532727599143982, "logps/chosen": -301.7206726074219, "logps/rejected": -277.83123779296875, "loss": 0.6013, "rewards/accuracies": 0.875, "rewards/chosen": -0.041278161108493805, "rewards/margins": 0.25224485993385315, "rewards/rejected": -0.29352301359176636, "step": 92 }, { "dpo_lambda": 0.9903563857078552, "epoch": 0.19471342580476314, "grad_norm": 11.457731875661583, "learning_rate": 4.865480126133871e-07, "logits/chosen": -0.8399688005447388, "logits/rejected": -0.9014586210250854, "logps/chosen": -277.01605224609375, "logps/rejected": -324.4666442871094, "loss": 0.618, "rewards/accuracies": 0.765625, "rewards/chosen": -0.08516222983598709, "rewards/margins": 0.19609113037586212, "rewards/rejected": -0.2812533378601074, "step": 93 }, { "dpo_lambda": 0.9902515411376953, "epoch": 0.196807118555352, "grad_norm": 8.795130599904098, "learning_rate": 4.859492293879573e-07, "logits/chosen": -0.8535427451133728, "logits/rejected": -0.8940081000328064, "logps/chosen": -243.99441528320312, "logps/rejected": -254.5758819580078, "loss": 0.6554, "rewards/accuracies": 0.75, "rewards/chosen": -0.05271986871957779, "rewards/margins": 0.1381668746471405, "rewards/rejected": -0.1908867508172989, "step": 94 }, { "dpo_lambda": 0.9901466965675354, "epoch": 0.19890081130594087, "grad_norm": 22.46452853252336, "learning_rate": 4.853377929214243e-07, "logits/chosen": -0.8657770156860352, "logits/rejected": -0.8967228531837463, "logps/chosen": -309.3226623535156, "logps/rejected": -328.7403564453125, "loss": 0.6138, "rewards/accuracies": 0.75, "rewards/chosen": -0.12215033918619156, "rewards/margins": 0.18180522322654724, "rewards/rejected": -0.3039555251598358, "step": 95 }, { "dpo_lambda": 0.9900418519973755, "epoch": 0.20099450405652972, "grad_norm": 6.7949580684730515, "learning_rate": 4.847137360032699e-07, "logits/chosen": -0.9191737771034241, "logits/rejected": -0.9387893676757812, "logps/chosen": -324.2237854003906, "logps/rejected": -314.5294189453125, "loss": 0.6324, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09853880852460861, "rewards/margins": 0.2057497352361679, "rewards/rejected": -0.3042885363101959, "step": 96 }, { "dpo_lambda": 0.9899371862411499, "epoch": 0.20308819680711857, "grad_norm": 6.158606287373961, "learning_rate": 4.84077092099773e-07, "logits/chosen": -0.8146138787269592, "logits/rejected": -0.8111870884895325, "logps/chosen": -346.63946533203125, "logps/rejected": -317.3700866699219, "loss": 0.6245, "rewards/accuracies": 0.625, "rewards/chosen": -0.11687865853309631, "rewards/margins": 0.15650136768817902, "rewards/rejected": -0.2733800411224365, "step": 97 }, { "dpo_lambda": 0.98983234167099, "epoch": 0.20518188955770741, "grad_norm": 5.273144558536801, "learning_rate": 4.834278953522137e-07, "logits/chosen": -0.8933238983154297, "logits/rejected": -0.9000183343887329, "logps/chosen": -341.7193298339844, "logps/rejected": -319.6878662109375, "loss": 0.6252, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05204794183373451, "rewards/margins": 0.19676589965820312, "rewards/rejected": -0.24881383776664734, "step": 98 }, { "dpo_lambda": 0.9897274374961853, "epoch": 0.20727558230829626, "grad_norm": 13.514515103851846, "learning_rate": 4.827661805750437e-07, "logits/chosen": -0.8712120652198792, "logits/rejected": -0.8879258632659912, "logps/chosen": -251.89381408691406, "logps/rejected": -281.3234558105469, "loss": 0.6548, "rewards/accuracies": 0.671875, "rewards/chosen": -0.11316128820180893, "rewards/margins": 0.16183218359947205, "rewards/rejected": -0.2749934792518616, "step": 99 }, { "dpo_lambda": 0.9896226525306702, "epoch": 0.2093692750588851, "grad_norm": 10.260926722198429, "learning_rate": 4.820919832540181e-07, "logits/chosen": -0.8627618551254272, "logits/rejected": -0.9980306029319763, "logps/chosen": -315.52777099609375, "logps/rejected": -257.67620849609375, "loss": 0.5951, "rewards/accuracies": 0.65625, "rewards/chosen": -0.033613838255405426, "rewards/margins": 0.22266733646392822, "rewards/rejected": -0.25628116726875305, "step": 100 }, { "epoch": 0.2093692750588851, "eval_dpo_lambda": 0.9895178079605103, "eval_logits/chosen": -0.8666641712188721, "eval_logits/rejected": -0.919507622718811, "eval_logps/chosen": -309.2590637207031, "eval_logps/rejected": -298.4849853515625, "eval_loss": 0.6223126649856567, "eval_rewards/accuracies": 0.7129999995231628, "eval_rewards/chosen": -0.08612177520990372, "eval_rewards/margins": 0.18842005729675293, "eval_rewards/rejected": -0.27454182505607605, "eval_runtime": 560.924, "eval_samples_per_second": 3.566, "eval_steps_per_second": 0.891, "step": 100 }, { "dpo_lambda": 0.9895177483558655, "epoch": 0.21146296780947396, "grad_norm": 11.934315941165567, "learning_rate": 4.814053395442932e-07, "logits/chosen": -0.9160792231559753, "logits/rejected": -0.958967924118042, "logps/chosen": -295.62933349609375, "logps/rejected": -283.5747985839844, "loss": 0.6385, "rewards/accuracies": 0.703125, "rewards/chosen": -0.10350628197193146, "rewards/margins": 0.18040822446346283, "rewards/rejected": -0.2839145362377167, "step": 101 }, { "dpo_lambda": 0.9894131422042847, "epoch": 0.2135566605600628, "grad_norm": 13.96418452441393, "learning_rate": 4.807062862684873e-07, "logits/chosen": -0.8515546917915344, "logits/rejected": -0.8713206648826599, "logps/chosen": -274.0570068359375, "logps/rejected": -318.78070068359375, "loss": 0.6274, "rewards/accuracies": 0.65625, "rewards/chosen": -0.14027418196201324, "rewards/margins": 0.13299782574176788, "rewards/rejected": -0.27327200770378113, "step": 102 }, { "dpo_lambda": 0.98930823802948, "epoch": 0.21565035331065166, "grad_norm": 8.696396287402754, "learning_rate": 4.799948609147061e-07, "logits/chosen": -0.8648073673248291, "logits/rejected": -0.9215875267982483, "logps/chosen": -344.8481140136719, "logps/rejected": -380.5777893066406, "loss": 0.6581, "rewards/accuracies": 0.640625, "rewards/chosen": -0.17415529489517212, "rewards/margins": 0.04592770338058472, "rewards/rejected": -0.22008298337459564, "step": 103 }, { "dpo_lambda": 0.9892033934593201, "epoch": 0.2177440460612405, "grad_norm": 13.71055022495119, "learning_rate": 4.792711016345321e-07, "logits/chosen": -0.8732472658157349, "logits/rejected": -0.9386597275733948, "logps/chosen": -254.61373901367188, "logps/rejected": -265.4243469238281, "loss": 0.6175, "rewards/accuracies": 0.609375, "rewards/chosen": -0.06950341910123825, "rewards/margins": 0.21822991967201233, "rewards/rejected": -0.2877333462238312, "step": 104 }, { "dpo_lambda": 0.9890985488891602, "epoch": 0.21983773881182936, "grad_norm": 13.927329236949847, "learning_rate": 4.785350472409791e-07, "logits/chosen": -0.8163785338401794, "logits/rejected": -0.862995982170105, "logps/chosen": -336.977783203125, "logps/rejected": -323.1407165527344, "loss": 0.6072, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0720912292599678, "rewards/margins": 0.2766712009906769, "rewards/rejected": -0.3487624526023865, "step": 105 }, { "dpo_lambda": 0.9889937043190002, "epoch": 0.2219314315624182, "grad_norm": 17.021289496777705, "learning_rate": 4.777867372064105e-07, "logits/chosen": -0.8436889052391052, "logits/rejected": -0.8120547533035278, "logps/chosen": -302.01715087890625, "logps/rejected": -307.5523681640625, "loss": 0.6113, "rewards/accuracies": 0.734375, "rewards/chosen": -0.051165711134672165, "rewards/margins": 0.24847164750099182, "rewards/rejected": -0.2996373772621155, "step": 106 }, { "dpo_lambda": 0.9888888001441956, "epoch": 0.22402512431300706, "grad_norm": 16.04000803772397, "learning_rate": 4.770262116604223e-07, "logits/chosen": -0.7302054166793823, "logits/rejected": -0.7980797290802002, "logps/chosen": -311.28961181640625, "logps/rejected": -275.90948486328125, "loss": 0.6348, "rewards/accuracies": 0.671875, "rewards/chosen": -0.109623983502388, "rewards/margins": 0.18497368693351746, "rewards/rejected": -0.29459768533706665, "step": 107 }, { "dpo_lambda": 0.9887839555740356, "epoch": 0.2261188170635959, "grad_norm": 15.39744839277282, "learning_rate": 4.7625351138769166e-07, "logits/chosen": -0.8831318020820618, "logits/rejected": -0.9750317335128784, "logps/chosen": -317.7901306152344, "logps/rejected": -317.9429016113281, "loss": 0.6099, "rewards/accuracies": 0.75, "rewards/chosen": -0.16366863250732422, "rewards/margins": 0.2613745927810669, "rewards/rejected": -0.4250431954860687, "step": 108 }, { "dpo_lambda": 0.9886792898178101, "epoch": 0.22821250981418476, "grad_norm": 22.120796176992506, "learning_rate": 4.75468677825789e-07, "logits/chosen": -0.796947717666626, "logits/rejected": -0.8122372031211853, "logps/chosen": -347.1046447753906, "logps/rejected": -348.04705810546875, "loss": 0.6219, "rewards/accuracies": 0.671875, "rewards/chosen": -0.19824260473251343, "rewards/margins": 0.1869257688522339, "rewards/rejected": -0.3851684033870697, "step": 109 }, { "dpo_lambda": 0.9885744452476501, "epoch": 0.23030620256477363, "grad_norm": 22.21838839500956, "learning_rate": 4.7467175306295647e-07, "logits/chosen": -0.8764458894729614, "logits/rejected": -0.8621913194656372, "logps/chosen": -290.6605224609375, "logps/rejected": -304.8687438964844, "loss": 0.6084, "rewards/accuracies": 0.703125, "rewards/chosen": -0.20384612679481506, "rewards/margins": 0.18661168217658997, "rewards/rejected": -0.39045780897140503, "step": 110 }, { "dpo_lambda": 0.9884696006774902, "epoch": 0.23239989531536248, "grad_norm": 15.44321482685626, "learning_rate": 4.7386277983585053e-07, "logits/chosen": -0.8445209264755249, "logits/rejected": -0.8973492980003357, "logps/chosen": -310.8070983886719, "logps/rejected": -336.9670104980469, "loss": 0.6198, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22197970747947693, "rewards/margins": 0.25436967611312866, "rewards/rejected": -0.4763493537902832, "step": 111 }, { "dpo_lambda": 0.9883647561073303, "epoch": 0.23449358806595133, "grad_norm": 21.734524282209797, "learning_rate": 4.7304180152725024e-07, "logits/chosen": -0.7842965126037598, "logits/rejected": -0.7857354283332825, "logps/chosen": -340.5823974609375, "logps/rejected": -327.64093017578125, "loss": 0.5882, "rewards/accuracies": 0.796875, "rewards/chosen": -0.17074204981327057, "rewards/margins": 0.3331073522567749, "rewards/rejected": -0.5038493871688843, "step": 112 }, { "dpo_lambda": 0.9882599115371704, "epoch": 0.23658728081654018, "grad_norm": 17.283697072731332, "learning_rate": 4.7220886216373085e-07, "logits/chosen": -0.834023654460907, "logits/rejected": -0.8667508363723755, "logps/chosen": -341.08538818359375, "logps/rejected": -288.1911315917969, "loss": 0.6443, "rewards/accuracies": 0.625, "rewards/chosen": -0.27939462661743164, "rewards/margins": 0.20295719802379608, "rewards/rejected": -0.48235180974006653, "step": 113 }, { "dpo_lambda": 0.9881552457809448, "epoch": 0.23868097356712903, "grad_norm": 23.58869951254614, "learning_rate": 4.7136400641330245e-07, "logits/chosen": -0.8713721036911011, "logits/rejected": -0.8854274153709412, "logps/chosen": -334.2297058105469, "logps/rejected": -340.6756591796875, "loss": 0.5903, "rewards/accuracies": 0.734375, "rewards/chosen": -0.16744357347488403, "rewards/margins": 0.3070833086967468, "rewards/rejected": -0.47452688217163086, "step": 114 }, { "dpo_lambda": 0.9880504012107849, "epoch": 0.24077466631771788, "grad_norm": 15.006316309545257, "learning_rate": 4.70507279583015e-07, "logits/chosen": -0.7990316152572632, "logits/rejected": -0.9022696018218994, "logps/chosen": -310.5735168457031, "logps/rejected": -295.2878112792969, "loss": 0.6205, "rewards/accuracies": 0.671875, "rewards/chosen": -0.14761359989643097, "rewards/margins": 0.22134283185005188, "rewards/rejected": -0.36895644664764404, "step": 115 }, { "dpo_lambda": 0.9879454970359802, "epoch": 0.24286835906830673, "grad_norm": 11.83329907541787, "learning_rate": 4.6963872761652834e-07, "logits/chosen": -0.7804672718048096, "logits/rejected": -0.8577914834022522, "logps/chosen": -336.4219665527344, "logps/rejected": -331.4212341308594, "loss": 0.6112, "rewards/accuracies": 0.609375, "rewards/chosen": -0.18096496164798737, "rewards/margins": 0.20200692117214203, "rewards/rejected": -0.3829718828201294, "step": 116 }, { "dpo_lambda": 0.9878406524658203, "epoch": 0.24496205181889558, "grad_norm": 23.71777196224923, "learning_rate": 4.687583970916486e-07, "logits/chosen": -0.8969647884368896, "logits/rejected": -0.885977029800415, "logps/chosen": -370.19537353515625, "logps/rejected": -361.7889709472656, "loss": 0.5848, "rewards/accuracies": 0.640625, "rewards/chosen": -0.09151718765497208, "rewards/margins": 0.2100907266139984, "rewards/rejected": -0.3016079068183899, "step": 117 }, { "dpo_lambda": 0.9877358078956604, "epoch": 0.24705574456948443, "grad_norm": 10.243313554179645, "learning_rate": 4.6786633521783005e-07, "logits/chosen": -0.8844261765480042, "logits/rejected": -0.8958966732025146, "logps/chosen": -304.04376220703125, "logps/rejected": -314.54034423828125, "loss": 0.6086, "rewards/accuracies": 0.625, "rewards/chosen": -0.05434232950210571, "rewards/margins": 0.1959638148546219, "rewards/rejected": -0.2503061294555664, "step": 118 }, { "dpo_lambda": 0.9876309633255005, "epoch": 0.24914943732007327, "grad_norm": 14.635815803976675, "learning_rate": 4.669625898336438e-07, "logits/chosen": -0.8229978084564209, "logits/rejected": -0.9754506945610046, "logps/chosen": -313.97821044921875, "logps/rejected": -286.31610107421875, "loss": 0.6014, "rewards/accuracies": 0.796875, "rewards/chosen": -0.030857756733894348, "rewards/margins": 0.2675139904022217, "rewards/rejected": -0.2983717620372772, "step": 119 }, { "dpo_lambda": 0.9875260591506958, "epoch": 0.2512431300706621, "grad_norm": 14.480284643195642, "learning_rate": 4.6604720940421207e-07, "logits/chosen": -0.9286684989929199, "logits/rejected": -0.9161196947097778, "logps/chosen": -287.4082336425781, "logps/rejected": -302.98394775390625, "loss": 0.6246, "rewards/accuracies": 0.640625, "rewards/chosen": -0.07125397026538849, "rewards/margins": 0.1770327091217041, "rewards/rejected": -0.2482866793870926, "step": 120 }, { "dpo_lambda": 0.987421452999115, "epoch": 0.253336822821251, "grad_norm": 23.772174182910327, "learning_rate": 4.651202430186092e-07, "logits/chosen": -0.8641669750213623, "logits/rejected": -0.9186245203018188, "logps/chosen": -300.3369445800781, "logps/rejected": -298.85186767578125, "loss": 0.5863, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03199971839785576, "rewards/margins": 0.27820879220962524, "rewards/rejected": -0.2462090402841568, "step": 121 }, { "dpo_lambda": 0.9873165488243103, "epoch": 0.2554305155718398, "grad_norm": 9.02656815319192, "learning_rate": 4.6418174038722924e-07, "logits/chosen": -0.7821521162986755, "logits/rejected": -0.8236280679702759, "logps/chosen": -308.3457946777344, "logps/rejected": -312.6419372558594, "loss": 0.6026, "rewards/accuracies": 0.828125, "rewards/chosen": 0.005551555659621954, "rewards/margins": 0.258663147687912, "rewards/rejected": -0.2531115412712097, "step": 122 }, { "dpo_lambda": 0.9872117638587952, "epoch": 0.25752420832242867, "grad_norm": 12.797039121653263, "learning_rate": 4.6323175183912023e-07, "logits/chosen": -0.8209193348884583, "logits/rejected": -0.8819643259048462, "logps/chosen": -309.806640625, "logps/rejected": -299.98272705078125, "loss": 0.5909, "rewards/accuracies": 0.796875, "rewards/chosen": 0.021201107650995255, "rewards/margins": 0.2730109691619873, "rewards/rejected": -0.25180989503860474, "step": 123 }, { "dpo_lambda": 0.9871068596839905, "epoch": 0.2596179010730175, "grad_norm": 7.647022983322272, "learning_rate": 4.6227032831928483e-07, "logits/chosen": -0.8404148817062378, "logits/rejected": -0.9573003649711609, "logps/chosen": -296.4405822753906, "logps/rejected": -270.74261474609375, "loss": 0.586, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0421200767159462, "rewards/margins": 0.28268861770629883, "rewards/rejected": -0.24056854844093323, "step": 124 }, { "dpo_lambda": 0.9870020151138306, "epoch": 0.26171159382360637, "grad_norm": 11.869445654498206, "learning_rate": 4.612975213859487e-07, "logits/chosen": -0.7934362888336182, "logits/rejected": -0.8030596971511841, "logps/chosen": -304.19854736328125, "logps/rejected": -326.1468811035156, "loss": 0.6335, "rewards/accuracies": 0.6875, "rewards/chosen": -0.024688338860869408, "rewards/margins": 0.17326180636882782, "rewards/rejected": -0.19795015454292297, "step": 125 }, { "dpo_lambda": 0.986897349357605, "epoch": 0.2638052865741952, "grad_norm": 12.097213815976195, "learning_rate": 4.603133832077953e-07, "logits/chosen": -0.8276994228363037, "logits/rejected": -0.8561450242996216, "logps/chosen": -279.8558044433594, "logps/rejected": -314.57623291015625, "loss": 0.6161, "rewards/accuracies": 0.765625, "rewards/chosen": -0.027820399031043053, "rewards/margins": 0.24257630109786987, "rewards/rejected": -0.27039670944213867, "step": 126 }, { "dpo_lambda": 0.9867925047874451, "epoch": 0.26589897932478407, "grad_norm": 12.805906049426016, "learning_rate": 4.5931796656116837e-07, "logits/chosen": -0.8533681631088257, "logits/rejected": -0.9015665054321289, "logps/chosen": -276.1855773925781, "logps/rejected": -236.68948364257812, "loss": 0.6133, "rewards/accuracies": 0.734375, "rewards/chosen": -0.03461135923862457, "rewards/margins": 0.2219991385936737, "rewards/rejected": -0.2566105127334595, "step": 127 }, { "dpo_lambda": 0.9866876006126404, "epoch": 0.2679926720753729, "grad_norm": 13.678592922691685, "learning_rate": 4.5831132482724193e-07, "logits/chosen": -0.7873022556304932, "logits/rejected": -0.8093962669372559, "logps/chosen": -333.6779479980469, "logps/rejected": -326.9761657714844, "loss": 0.6197, "rewards/accuracies": 0.734375, "rewards/chosen": 0.007106524892151356, "rewards/margins": 0.2661459445953369, "rewards/rejected": -0.25903940200805664, "step": 128 }, { "dpo_lambda": 0.9865828156471252, "epoch": 0.27008636482596177, "grad_norm": 17.49566260823807, "learning_rate": 4.5729351198915705e-07, "logits/chosen": -0.7583063244819641, "logits/rejected": -0.7912325263023376, "logps/chosen": -297.1742248535156, "logps/rejected": -275.2450256347656, "loss": 0.6159, "rewards/accuracies": 0.703125, "rewards/chosen": -0.051353394985198975, "rewards/margins": 0.2515460252761841, "rewards/rejected": -0.30289942026138306, "step": 129 }, { "dpo_lambda": 0.9864779114723206, "epoch": 0.2721800575765506, "grad_norm": 25.445809464675612, "learning_rate": 4.5626458262912735e-07, "logits/chosen": -0.8182171583175659, "logits/rejected": -0.9255108833312988, "logps/chosen": -311.99383544921875, "logps/rejected": -313.9915771484375, "loss": 0.5597, "rewards/accuracies": 0.78125, "rewards/chosen": -0.030926331877708435, "rewards/margins": 0.39924901723861694, "rewards/rejected": -0.4301753342151642, "step": 130 }, { "dpo_lambda": 0.9863730669021606, "epoch": 0.27427375032713947, "grad_norm": 26.072654774993698, "learning_rate": 4.5522459192551166e-07, "logits/chosen": -0.9452874660491943, "logits/rejected": -0.9563214778900146, "logps/chosen": -303.7942199707031, "logps/rejected": -345.29248046875, "loss": 0.5636, "rewards/accuracies": 0.765625, "rewards/chosen": -0.014308175072073936, "rewards/margins": 0.3875415325164795, "rewards/rejected": -0.40184974670410156, "step": 131 }, { "dpo_lambda": 0.9862684011459351, "epoch": 0.27636744307772837, "grad_norm": 10.966972949000139, "learning_rate": 4.541735956498554e-07, "logits/chosen": -0.8571950793266296, "logits/rejected": -0.9126186966896057, "logps/chosen": -292.915283203125, "logps/rejected": -323.6187744140625, "loss": 0.6018, "rewards/accuracies": 0.671875, "rewards/chosen": -0.16021493077278137, "rewards/margins": 0.1987317055463791, "rewards/rejected": -0.35894662141799927, "step": 132 }, { "dpo_lambda": 0.9861635565757751, "epoch": 0.2784611358283172, "grad_norm": 34.670416720237725, "learning_rate": 4.5311165016389914e-07, "logits/chosen": -0.8171315789222717, "logits/rejected": -0.9207563400268555, "logps/chosen": -342.6995849609375, "logps/rejected": -296.8974609375, "loss": 0.597, "rewards/accuracies": 0.671875, "rewards/chosen": -0.10333568602800369, "rewards/margins": 0.25441792607307434, "rewards/rejected": -0.35775357484817505, "step": 133 }, { "dpo_lambda": 0.9860587120056152, "epoch": 0.28055482857890607, "grad_norm": 23.394857486137187, "learning_rate": 4.520388124165564e-07, "logits/chosen": -0.8245919942855835, "logits/rejected": -0.8185215592384338, "logps/chosen": -300.1319580078125, "logps/rejected": -297.31475830078125, "loss": 0.5945, "rewards/accuracies": 0.640625, "rewards/chosen": -0.19418853521347046, "rewards/margins": 0.20099210739135742, "rewards/rejected": -0.3951806426048279, "step": 134 }, { "dpo_lambda": 0.9859538674354553, "epoch": 0.2826485213294949, "grad_norm": 7.718880144493543, "learning_rate": 4.5095513994085974e-07, "logits/chosen": -0.8987231850624084, "logits/rejected": -0.9302247166633606, "logps/chosen": -382.30987548828125, "logps/rejected": -354.4898681640625, "loss": 0.5767, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1764790415763855, "rewards/margins": 0.21645787358283997, "rewards/rejected": -0.39293694496154785, "step": 135 }, { "dpo_lambda": 0.9858490228652954, "epoch": 0.28474221408008377, "grad_norm": 17.245232915887495, "learning_rate": 4.498606908508753e-07, "logits/chosen": -0.8018785119056702, "logits/rejected": -0.8587301969528198, "logps/chosen": -328.37103271484375, "logps/rejected": -353.09375, "loss": 0.6028, "rewards/accuracies": 0.703125, "rewards/chosen": -0.25376439094543457, "rewards/margins": 0.2805079221725464, "rewards/rejected": -0.534272313117981, "step": 136 }, { "dpo_lambda": 0.9857443571090698, "epoch": 0.2868359068306726, "grad_norm": 15.123900053540126, "learning_rate": 4.487555238385862e-07, "logits/chosen": -0.7876870632171631, "logits/rejected": -0.8026723861694336, "logps/chosen": -352.7374267578125, "logps/rejected": -343.4482421875, "loss": 0.6128, "rewards/accuracies": 0.65625, "rewards/chosen": -0.346821665763855, "rewards/margins": 0.20842283964157104, "rewards/rejected": -0.555244505405426, "step": 137 }, { "dpo_lambda": 0.9856394529342651, "epoch": 0.28892959958126146, "grad_norm": 26.727256401161796, "learning_rate": 4.476396981707453e-07, "logits/chosen": -0.8450735807418823, "logits/rejected": -0.8367375135421753, "logps/chosen": -352.42425537109375, "logps/rejected": -333.8371887207031, "loss": 0.6371, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3087271749973297, "rewards/margins": 0.2823084592819214, "rewards/rejected": -0.5910355448722839, "step": 138 }, { "dpo_lambda": 0.9855346083641052, "epoch": 0.2910232923318503, "grad_norm": 14.563523819252962, "learning_rate": 4.4651327368569684e-07, "logits/chosen": -0.845096230506897, "logits/rejected": -0.8808258771896362, "logps/chosen": -354.4432067871094, "logps/rejected": -384.110595703125, "loss": 0.5936, "rewards/accuracies": 0.734375, "rewards/chosen": -0.2789064645767212, "rewards/margins": 0.35447943210601807, "rewards/rejected": -0.6333858370780945, "step": 139 }, { "dpo_lambda": 0.9854297637939453, "epoch": 0.29311698508243916, "grad_norm": 31.649857730723095, "learning_rate": 4.453763107901675e-07, "logits/chosen": -0.8781813383102417, "logits/rejected": -0.9100785851478577, "logps/chosen": -329.0050048828125, "logps/rejected": -334.69085693359375, "loss": 0.6038, "rewards/accuracies": 0.671875, "rewards/chosen": -0.3453499972820282, "rewards/margins": 0.23073923587799072, "rewards/rejected": -0.5760892629623413, "step": 140 }, { "dpo_lambda": 0.9853249192237854, "epoch": 0.295210677833028, "grad_norm": 12.957393357512265, "learning_rate": 4.4422887045602674e-07, "logits/chosen": -0.862395167350769, "logits/rejected": -0.9039013981819153, "logps/chosen": -357.94085693359375, "logps/rejected": -343.2564697265625, "loss": 0.6154, "rewards/accuracies": 0.625, "rewards/chosen": -0.4187580347061157, "rewards/margins": 0.26384854316711426, "rewards/rejected": -0.6826066374778748, "step": 141 }, { "dpo_lambda": 0.9852200746536255, "epoch": 0.29730437058361686, "grad_norm": 23.14332525158461, "learning_rate": 4.4307101421701755e-07, "logits/chosen": -0.8050290942192078, "logits/rejected": -0.7742469310760498, "logps/chosen": -327.2164001464844, "logps/rejected": -315.2928466796875, "loss": 0.5857, "rewards/accuracies": 0.734375, "rewards/chosen": -0.27529165148735046, "rewards/margins": 0.37010371685028076, "rewards/rejected": -0.6453953385353088, "step": 142 }, { "dpo_lambda": 0.9851151704788208, "epoch": 0.2993980633342057, "grad_norm": 30.166424003092107, "learning_rate": 4.419028041654559e-07, "logits/chosen": -0.841701865196228, "logits/rejected": -0.9091044664382935, "logps/chosen": -418.12188720703125, "logps/rejected": -399.4429931640625, "loss": 0.5794, "rewards/accuracies": 0.765625, "rewards/chosen": -0.28405290842056274, "rewards/margins": 0.3614083528518677, "rewards/rejected": -0.6454612612724304, "step": 143 }, { "dpo_lambda": 0.98501056432724, "epoch": 0.30149175608479456, "grad_norm": 26.814166923520364, "learning_rate": 4.4072430294890166e-07, "logits/chosen": -0.858585000038147, "logits/rejected": -0.9625511765480042, "logps/chosen": -327.3453674316406, "logps/rejected": -345.5610656738281, "loss": 0.5726, "rewards/accuracies": 0.78125, "rewards/chosen": -0.25485959649086, "rewards/margins": 0.4206133484840393, "rewards/rejected": -0.6754729747772217, "step": 144 }, { "dpo_lambda": 0.9849056601524353, "epoch": 0.3035854488353834, "grad_norm": 33.673895861881924, "learning_rate": 4.395355737667985e-07, "logits/chosen": -0.7440855503082275, "logits/rejected": -0.7877547740936279, "logps/chosen": -297.7661437988281, "logps/rejected": -269.4111633300781, "loss": 0.5726, "rewards/accuracies": 0.796875, "rewards/chosen": -0.2709321677684784, "rewards/margins": 0.4294186532497406, "rewards/rejected": -0.7003507614135742, "step": 145 }, { "dpo_lambda": 0.9848008155822754, "epoch": 0.30567914158597226, "grad_norm": 14.495187775717087, "learning_rate": 4.3833668036708483e-07, "logits/chosen": -0.8378518223762512, "logits/rejected": -0.9328808784484863, "logps/chosen": -362.03997802734375, "logps/rejected": -315.01885986328125, "loss": 0.6036, "rewards/accuracies": 0.734375, "rewards/chosen": -0.33166226744651794, "rewards/margins": 0.36146122217178345, "rewards/rejected": -0.6931235194206238, "step": 146 }, { "dpo_lambda": 0.9846959710121155, "epoch": 0.3077728343365611, "grad_norm": 24.93817427447256, "learning_rate": 4.3712768704277524e-07, "logits/chosen": -0.8426337242126465, "logits/rejected": -0.8408235907554626, "logps/chosen": -309.55853271484375, "logps/rejected": -324.4660949707031, "loss": 0.6112, "rewards/accuracies": 0.703125, "rewards/chosen": -0.31168290972709656, "rewards/margins": 0.2369699627161026, "rewards/rejected": -0.5486528873443604, "step": 147 }, { "dpo_lambda": 0.9845911264419556, "epoch": 0.30986652708714996, "grad_norm": 30.124781935289477, "learning_rate": 4.3590865862851263e-07, "logits/chosen": -0.7106437087059021, "logits/rejected": -0.8285320997238159, "logps/chosen": -318.2632141113281, "logps/rejected": -340.59735107421875, "loss": 0.5777, "rewards/accuracies": 0.71875, "rewards/chosen": -0.31502288579940796, "rewards/margins": 0.3463708162307739, "rewards/rejected": -0.6613936424255371, "step": 148 }, { "dpo_lambda": 0.98448646068573, "epoch": 0.3119602198377388, "grad_norm": 14.108616126201548, "learning_rate": 4.346796604970912e-07, "logits/chosen": -0.8086866736412048, "logits/rejected": -0.8452584743499756, "logps/chosen": -288.48724365234375, "logps/rejected": -311.91583251953125, "loss": 0.6104, "rewards/accuracies": 0.75, "rewards/chosen": -0.21689504384994507, "rewards/margins": 0.3877273499965668, "rewards/rejected": -0.6046223640441895, "step": 149 }, { "dpo_lambda": 0.9843816161155701, "epoch": 0.31405391258832765, "grad_norm": 13.499936499224043, "learning_rate": 4.3344075855595097e-07, "logits/chosen": -0.9197017550468445, "logits/rejected": -0.8716071844100952, "logps/chosen": -272.93463134765625, "logps/rejected": -351.92022705078125, "loss": 0.6296, "rewards/accuracies": 0.703125, "rewards/chosen": -0.2734215557575226, "rewards/margins": 0.2905915081501007, "rewards/rejected": -0.5640130639076233, "step": 150 }, { "epoch": 0.31405391258832765, "eval_dpo_lambda": 0.9842765927314758, "eval_logits/chosen": -0.8554379940032959, "eval_logits/rejected": -0.9008473753929138, "eval_logps/chosen": -323.7625427246094, "eval_logps/rejected": -323.91766357421875, "eval_loss": 0.5972165465354919, "eval_rewards/accuracies": 0.7099999785423279, "eval_rewards/chosen": -0.23115603625774384, "eval_rewards/margins": 0.2977127432823181, "eval_rewards/rejected": -0.5288687944412231, "eval_runtime": 561.0863, "eval_samples_per_second": 3.565, "eval_steps_per_second": 0.891, "step": 150 }, { "dpo_lambda": 0.9842767119407654, "epoch": 0.3161476053389165, "grad_norm": 12.904660992134803, "learning_rate": 4.3219201924364323e-07, "logits/chosen": -0.8644286394119263, "logits/rejected": -0.9774547815322876, "logps/chosen": -282.9930114746094, "logps/rejected": -282.949462890625, "loss": 0.6005, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2265167385339737, "rewards/margins": 0.2916352152824402, "rewards/rejected": -0.5181519389152527, "step": 151 }, { "dpo_lambda": 0.9841719269752502, "epoch": 0.31824129808950535, "grad_norm": 12.854949528250394, "learning_rate": 4.309335095262675e-07, "logits/chosen": -0.8625615239143372, "logits/rejected": -0.8919581174850464, "logps/chosen": -322.06402587890625, "logps/rejected": -339.1501159667969, "loss": 0.5788, "rewards/accuracies": 0.71875, "rewards/chosen": -0.23511157929897308, "rewards/margins": 0.34126943349838257, "rewards/rejected": -0.5763810276985168, "step": 152 }, { "dpo_lambda": 0.9840670228004456, "epoch": 0.3203349908400942, "grad_norm": 17.39339060709683, "learning_rate": 4.2966529689388064e-07, "logits/chosen": -0.7854524850845337, "logits/rejected": -0.8845511674880981, "logps/chosen": -360.5038146972656, "logps/rejected": -352.4007263183594, "loss": 0.5801, "rewards/accuracies": 0.765625, "rewards/chosen": -0.19930708408355713, "rewards/margins": 0.3281744718551636, "rewards/rejected": -0.5274815559387207, "step": 153 }, { "dpo_lambda": 0.9839621782302856, "epoch": 0.32242868359068305, "grad_norm": 12.230075690240232, "learning_rate": 4.2838744935687716e-07, "logits/chosen": -0.9072690606117249, "logits/rejected": -0.9368817210197449, "logps/chosen": -416.1206359863281, "logps/rejected": -363.433837890625, "loss": 0.5941, "rewards/accuracies": 0.65625, "rewards/chosen": -0.21174053847789764, "rewards/margins": 0.27678394317626953, "rewards/rejected": -0.48852449655532837, "step": 154 }, { "dpo_lambda": 0.9838575124740601, "epoch": 0.3245223763412719, "grad_norm": 19.713309712130915, "learning_rate": 4.271000354423425e-07, "logits/chosen": -0.8548458814620972, "logits/rejected": -0.8624852299690247, "logps/chosen": -318.42828369140625, "logps/rejected": -347.64312744140625, "loss": 0.5767, "rewards/accuracies": 0.75, "rewards/chosen": -0.24433034658432007, "rewards/margins": 0.39910271763801575, "rewards/rejected": -0.6434330344200134, "step": 155 }, { "dpo_lambda": 0.9837526679039001, "epoch": 0.32661606909186075, "grad_norm": 12.298734283978302, "learning_rate": 4.258031241903777e-07, "logits/chosen": -0.864250898361206, "logits/rejected": -0.9269083738327026, "logps/chosen": -321.51031494140625, "logps/rejected": -344.8136291503906, "loss": 0.6089, "rewards/accuracies": 0.671875, "rewards/chosen": -0.1979214996099472, "rewards/margins": 0.3120463788509369, "rewards/rejected": -0.5099678635597229, "step": 156 }, { "dpo_lambda": 0.9836478233337402, "epoch": 0.3287097618424496, "grad_norm": 14.320512468110298, "learning_rate": 4.2449678515039743e-07, "logits/chosen": -0.7892401218414307, "logits/rejected": -0.8079796433448792, "logps/chosen": -285.41900634765625, "logps/rejected": -318.0777893066406, "loss": 0.5903, "rewards/accuracies": 0.75, "rewards/chosen": -0.18783962726593018, "rewards/margins": 0.3478223383426666, "rewards/rejected": -0.5356619954109192, "step": 157 }, { "dpo_lambda": 0.9835429787635803, "epoch": 0.33080345459303845, "grad_norm": 18.475777829614685, "learning_rate": 4.2318108837739986e-07, "logits/chosen": -0.8126506805419922, "logits/rejected": -0.8585901260375977, "logps/chosen": -347.2370910644531, "logps/rejected": -341.1890563964844, "loss": 0.5828, "rewards/accuracies": 0.765625, "rewards/chosen": -0.23844991624355316, "rewards/margins": 0.38391149044036865, "rewards/rejected": -0.6223613619804382, "step": 158 }, { "dpo_lambda": 0.9834380745887756, "epoch": 0.3328971473436273, "grad_norm": 24.273769149931997, "learning_rate": 4.218561044282098e-07, "logits/chosen": -0.8415407538414001, "logits/rejected": -0.9418197274208069, "logps/chosen": -289.1123046875, "logps/rejected": -292.8475036621094, "loss": 0.603, "rewards/accuracies": 0.640625, "rewards/chosen": -0.2409561723470688, "rewards/margins": 0.232768252491951, "rewards/rejected": -0.4737243950366974, "step": 159 }, { "dpo_lambda": 0.9833334684371948, "epoch": 0.33499084009421615, "grad_norm": 19.16029439073329, "learning_rate": 4.2052190435769554e-07, "logits/chosen": -0.8843802809715271, "logits/rejected": -0.8616979718208313, "logps/chosen": -278.7174072265625, "logps/rejected": -297.68646240234375, "loss": 0.5729, "rewards/accuracies": 0.765625, "rewards/chosen": -0.22543789446353912, "rewards/margins": 0.3052656948566437, "rewards/rejected": -0.530703604221344, "step": 160 }, { "dpo_lambda": 0.9832285642623901, "epoch": 0.33708453284480505, "grad_norm": 27.70767136536445, "learning_rate": 4.1917855971495763e-07, "logits/chosen": -0.8649369478225708, "logits/rejected": -0.8855935335159302, "logps/chosen": -345.121337890625, "logps/rejected": -321.77685546875, "loss": 0.5818, "rewards/accuracies": 0.703125, "rewards/chosen": -0.23856690526008606, "rewards/margins": 0.43436694145202637, "rewards/rejected": -0.6729338765144348, "step": 161 }, { "dpo_lambda": 0.9831237196922302, "epoch": 0.3391782255953939, "grad_norm": 11.16607639247331, "learning_rate": 4.1782614253949255e-07, "logits/chosen": -0.8402847647666931, "logits/rejected": -0.8687560558319092, "logps/chosen": -296.60333251953125, "logps/rejected": -322.4178466796875, "loss": 0.5853, "rewards/accuracies": 0.65625, "rewards/chosen": -0.23870237171649933, "rewards/margins": 0.29797500371932983, "rewards/rejected": -0.5366774201393127, "step": 162 }, { "dpo_lambda": 0.9830188751220703, "epoch": 0.34127191834598275, "grad_norm": 12.827576377715113, "learning_rate": 4.164647253573289e-07, "logits/chosen": -0.8716577887535095, "logits/rejected": -0.8448370695114136, "logps/chosen": -319.44024658203125, "logps/rejected": -368.4493103027344, "loss": 0.5854, "rewards/accuracies": 0.765625, "rewards/chosen": -0.18674519658088684, "rewards/margins": 0.33129802346229553, "rewards/rejected": -0.5180432796478271, "step": 163 }, { "dpo_lambda": 0.9829140305519104, "epoch": 0.3433656110965716, "grad_norm": 22.369252142701303, "learning_rate": 4.1509438117713863e-07, "logits/chosen": -0.8533887267112732, "logits/rejected": -0.8673970103263855, "logps/chosen": -253.2539520263672, "logps/rejected": -341.646240234375, "loss": 0.5423, "rewards/accuracies": 0.828125, "rewards/chosen": -0.22686009109020233, "rewards/margins": 0.4404720067977905, "rewards/rejected": -0.667332112789154, "step": 164 }, { "dpo_lambda": 0.9828091859817505, "epoch": 0.34545930384716045, "grad_norm": 28.98060239941749, "learning_rate": 4.137151834863213e-07, "logits/chosen": -0.9048106670379639, "logits/rejected": -0.8988927602767944, "logps/chosen": -297.9773254394531, "logps/rejected": -348.6954040527344, "loss": 0.6144, "rewards/accuracies": 0.703125, "rewards/chosen": -0.26717424392700195, "rewards/margins": 0.3538207411766052, "rewards/rejected": -0.6209949851036072, "step": 165 }, { "dpo_lambda": 0.9827042818069458, "epoch": 0.3475529965977493, "grad_norm": 31.29118255209507, "learning_rate": 4.123272062470633e-07, "logits/chosen": -0.8654748201370239, "logits/rejected": -0.9055193662643433, "logps/chosen": -285.4839172363281, "logps/rejected": -312.4461364746094, "loss": 0.6107, "rewards/accuracies": 0.765625, "rewards/chosen": -0.27893736958503723, "rewards/margins": 0.3586209714412689, "rewards/rejected": -0.6375582814216614, "step": 166 }, { "dpo_lambda": 0.9825996160507202, "epoch": 0.34964668934833815, "grad_norm": 24.71676505385136, "learning_rate": 4.1093052389237174e-07, "logits/chosen": -0.8707860708236694, "logits/rejected": -0.915932834148407, "logps/chosen": -295.7852478027344, "logps/rejected": -305.9165954589844, "loss": 0.6194, "rewards/accuracies": 0.671875, "rewards/chosen": -0.323090136051178, "rewards/margins": 0.2546401023864746, "rewards/rejected": -0.5777302980422974, "step": 167 }, { "dpo_lambda": 0.9824947714805603, "epoch": 0.351740382098927, "grad_norm": 34.53755245562597, "learning_rate": 4.0952521132208267e-07, "logits/chosen": -0.8362221121788025, "logits/rejected": -0.9009256362915039, "logps/chosen": -357.5538024902344, "logps/rejected": -332.5069274902344, "loss": 0.6059, "rewards/accuracies": 0.671875, "rewards/chosen": -0.3512650430202484, "rewards/margins": 0.34191185235977173, "rewards/rejected": -0.6931769251823425, "step": 168 }, { "dpo_lambda": 0.9823899269104004, "epoch": 0.35383407484951584, "grad_norm": 19.31427143784745, "learning_rate": 4.081113438988443e-07, "logits/chosen": -0.7671630382537842, "logits/rejected": -0.797345757484436, "logps/chosen": -353.42620849609375, "logps/rejected": -344.4584045410156, "loss": 0.6152, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3698110580444336, "rewards/margins": 0.3182661235332489, "rewards/rejected": -0.6880770921707153, "step": 169 }, { "dpo_lambda": 0.9822850823402405, "epoch": 0.3559277676001047, "grad_norm": 47.091883796618596, "learning_rate": 4.0668899744407567e-07, "logits/chosen": -0.7545144557952881, "logits/rejected": -0.844253659248352, "logps/chosen": -298.80206298828125, "logps/rejected": -371.48785400390625, "loss": 0.5473, "rewards/accuracies": 0.75, "rewards/chosen": -0.32974353432655334, "rewards/margins": 0.4235215485095978, "rewards/rejected": -0.7532650828361511, "step": 170 }, { "dpo_lambda": 0.9821802377700806, "epoch": 0.35802146035069354, "grad_norm": 58.75258029657535, "learning_rate": 4.0525824823390043e-07, "logits/chosen": -0.856792151927948, "logits/rejected": -0.8787572383880615, "logps/chosen": -319.53118896484375, "logps/rejected": -321.0688781738281, "loss": 0.5704, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3029014468193054, "rewards/margins": 0.43716755509376526, "rewards/rejected": -0.7400689721107483, "step": 171 }, { "dpo_lambda": 0.982075572013855, "epoch": 0.3601151531012824, "grad_norm": 13.60799588728662, "learning_rate": 4.0381917299505686e-07, "logits/chosen": -0.8702411651611328, "logits/rejected": -0.9157892465591431, "logps/chosen": -356.87030029296875, "logps/rejected": -373.1155700683594, "loss": 0.5837, "rewards/accuracies": 0.734375, "rewards/chosen": -0.3470417857170105, "rewards/margins": 0.35888221859931946, "rewards/rejected": -0.7059239745140076, "step": 172 }, { "dpo_lambda": 0.9819707274436951, "epoch": 0.36220884585187124, "grad_norm": 33.81883702967302, "learning_rate": 4.0237184890078243e-07, "logits/chosen": -0.9117413759231567, "logits/rejected": -0.9689619541168213, "logps/chosen": -386.8931579589844, "logps/rejected": -378.5868835449219, "loss": 0.5509, "rewards/accuracies": 0.609375, "rewards/chosen": -0.4211375415325165, "rewards/margins": 0.35862040519714355, "rewards/rejected": -0.7797579765319824, "step": 173 }, { "dpo_lambda": 0.9818658232688904, "epoch": 0.3643025386024601, "grad_norm": 25.48673031671082, "learning_rate": 4.00916353566676e-07, "logits/chosen": -0.8898088335990906, "logits/rejected": -0.9703992605209351, "logps/chosen": -353.3209228515625, "logps/rejected": -407.6015930175781, "loss": 0.5869, "rewards/accuracies": 0.75, "rewards/chosen": -0.4089928865432739, "rewards/margins": 0.37967556715011597, "rewards/rejected": -0.7886685132980347, "step": 174 }, { "dpo_lambda": 0.9817609786987305, "epoch": 0.36639623135304894, "grad_norm": 60.06891288607582, "learning_rate": 3.994527650465352e-07, "logits/chosen": -0.8689991235733032, "logits/rejected": -0.9080208539962769, "logps/chosen": -352.3197021484375, "logps/rejected": -376.54254150390625, "loss": 0.5918, "rewards/accuracies": 0.6875, "rewards/chosen": -0.34678542613983154, "rewards/margins": 0.4048980474472046, "rewards/rejected": -0.7516834735870361, "step": 175 }, { "dpo_lambda": 0.9816561341285706, "epoch": 0.3684899241036378, "grad_norm": 44.658305155054634, "learning_rate": 3.979811618281705e-07, "logits/chosen": -0.846853494644165, "logits/rejected": -0.8561269044876099, "logps/chosen": -276.9634704589844, "logps/rejected": -319.3321838378906, "loss": 0.541, "rewards/accuracies": 0.75, "rewards/chosen": -0.2748173773288727, "rewards/margins": 0.4001079499721527, "rewards/rejected": -0.6749253869056702, "step": 176 }, { "dpo_lambda": 0.9815512895584106, "epoch": 0.37058361685422664, "grad_norm": 76.56965412827621, "learning_rate": 3.9650162282919654e-07, "logits/chosen": -0.7609463930130005, "logits/rejected": -0.8508155941963196, "logps/chosen": -374.0244445800781, "logps/rejected": -343.30615234375, "loss": 0.5848, "rewards/accuracies": 0.78125, "rewards/chosen": -0.25858065485954285, "rewards/margins": 0.4455993175506592, "rewards/rejected": -0.7041800022125244, "step": 177 }, { "dpo_lambda": 0.9814466238021851, "epoch": 0.3726773096048155, "grad_norm": 23.278575599376282, "learning_rate": 3.9501422739279953e-07, "logits/chosen": -0.879623293876648, "logits/rejected": -0.8756167888641357, "logps/chosen": -329.4707946777344, "logps/rejected": -348.7636413574219, "loss": 0.5543, "rewards/accuracies": 0.765625, "rewards/chosen": -0.2765873074531555, "rewards/margins": 0.47019055485725403, "rewards/rejected": -0.7467778921127319, "step": 178 }, { "dpo_lambda": 0.9813417792320251, "epoch": 0.37477100235540434, "grad_norm": 21.754414666752645, "learning_rate": 3.935190552834828e-07, "logits/chosen": -0.9142841100692749, "logits/rejected": -0.9394044876098633, "logps/chosen": -328.6793518066406, "logps/rejected": -372.35064697265625, "loss": 0.582, "rewards/accuracies": 0.734375, "rewards/chosen": -0.2454470992088318, "rewards/margins": 0.38764071464538574, "rewards/rejected": -0.6330878138542175, "step": 179 }, { "dpo_lambda": 0.9812368750572205, "epoch": 0.3768646951059932, "grad_norm": 18.271146036301225, "learning_rate": 3.920161866827889e-07, "logits/chosen": -0.9230740666389465, "logits/rejected": -0.9215205907821655, "logps/chosen": -291.40277099609375, "logps/rejected": -328.21923828125, "loss": 0.5845, "rewards/accuracies": 0.796875, "rewards/chosen": -0.2792706787586212, "rewards/margins": 0.36451247334480286, "rewards/rejected": -0.6437831521034241, "step": 180 }, { "dpo_lambda": 0.9811320900917053, "epoch": 0.37895838785658204, "grad_norm": 51.1159936417294, "learning_rate": 3.90505702185e-07, "logits/chosen": -0.9331147074699402, "logits/rejected": -0.9082576632499695, "logps/chosen": -305.66302490234375, "logps/rejected": -335.3623046875, "loss": 0.6069, "rewards/accuracies": 0.6875, "rewards/chosen": -0.41276705265045166, "rewards/margins": 0.3198656439781189, "rewards/rejected": -0.7326326966285706, "step": 181 }, { "dpo_lambda": 0.9810271859169006, "epoch": 0.3810520806071709, "grad_norm": 32.33011835733113, "learning_rate": 3.889876827928156e-07, "logits/chosen": -0.9150969982147217, "logits/rejected": -0.8999711275100708, "logps/chosen": -335.1192932128906, "logps/rejected": -358.4149475097656, "loss": 0.5721, "rewards/accuracies": 0.78125, "rewards/chosen": -0.20234210789203644, "rewards/margins": 0.4882466793060303, "rewards/rejected": -0.6905888319015503, "step": 182 }, { "dpo_lambda": 0.9809223413467407, "epoch": 0.38314577335775973, "grad_norm": 22.410780663539438, "learning_rate": 3.874622099130087e-07, "logits/chosen": -0.8482258319854736, "logits/rejected": -0.8427585363388062, "logps/chosen": -368.4677734375, "logps/rejected": -365.698486328125, "loss": 0.5342, "rewards/accuracies": 0.8125, "rewards/chosen": -0.31965532898902893, "rewards/margins": 0.5097999572753906, "rewards/rejected": -0.8294552564620972, "step": 183 }, { "dpo_lambda": 0.9808176755905151, "epoch": 0.3852394661083486, "grad_norm": 40.02939969290002, "learning_rate": 3.859293653520604e-07, "logits/chosen": -0.8177944421768188, "logits/rejected": -0.8422019481658936, "logps/chosen": -315.3299865722656, "logps/rejected": -358.39556884765625, "loss": 0.5567, "rewards/accuracies": 0.71875, "rewards/chosen": -0.24775481224060059, "rewards/margins": 0.4265044331550598, "rewards/rejected": -0.6742592453956604, "step": 184 }, { "dpo_lambda": 0.9807128310203552, "epoch": 0.38733315885893743, "grad_norm": 28.798964377153162, "learning_rate": 3.8438923131177237e-07, "logits/chosen": -0.8667696714401245, "logits/rejected": -0.9194180369377136, "logps/chosen": -323.02752685546875, "logps/rejected": -272.7005615234375, "loss": 0.5917, "rewards/accuracies": 0.734375, "rewards/chosen": -0.28678402304649353, "rewards/margins": 0.456429660320282, "rewards/rejected": -0.7432136535644531, "step": 185 }, { "dpo_lambda": 0.9806079864501953, "epoch": 0.3894268516095263, "grad_norm": 45.016534861006626, "learning_rate": 3.828418903848593e-07, "logits/chosen": -0.9506804943084717, "logits/rejected": -0.9454731345176697, "logps/chosen": -313.6388244628906, "logps/rejected": -321.62261962890625, "loss": 0.5862, "rewards/accuracies": 0.578125, "rewards/chosen": -0.4552803635597229, "rewards/margins": 0.20712202787399292, "rewards/rejected": -0.6624024510383606, "step": 186 }, { "dpo_lambda": 0.9805031418800354, "epoch": 0.39152054436011513, "grad_norm": 57.520777558231316, "learning_rate": 3.812874255505191e-07, "logits/chosen": -0.8905811905860901, "logits/rejected": -0.9345070123672485, "logps/chosen": -321.8720703125, "logps/rejected": -344.01934814453125, "loss": 0.5348, "rewards/accuracies": 0.75, "rewards/chosen": -0.35681191086769104, "rewards/margins": 0.4626830220222473, "rewards/rejected": -0.819494903087616, "step": 187 }, { "dpo_lambda": 0.9803982377052307, "epoch": 0.393614237110704, "grad_norm": 24.238007823715385, "learning_rate": 3.797259201699833e-07, "logits/chosen": -0.9128227233886719, "logits/rejected": -0.8823633193969727, "logps/chosen": -353.4268493652344, "logps/rejected": -359.056396484375, "loss": 0.6241, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5055346488952637, "rewards/margins": 0.16114425659179688, "rewards/rejected": -0.6666789054870605, "step": 188 }, { "dpo_lambda": 0.9802933931350708, "epoch": 0.39570792986129283, "grad_norm": 48.17875296602705, "learning_rate": 3.781574579820464e-07, "logits/chosen": -0.8731621503829956, "logits/rejected": -0.8945026397705078, "logps/chosen": -283.6103820800781, "logps/rejected": -288.29144287109375, "loss": 0.6123, "rewards/accuracies": 0.5625, "rewards/chosen": -0.523615837097168, "rewards/margins": 0.24947702884674072, "rewards/rejected": -0.7730928659439087, "step": 189 }, { "dpo_lambda": 0.9801887273788452, "epoch": 0.39780162261188173, "grad_norm": 18.0718217557433, "learning_rate": 3.765821230985757e-07, "logits/chosen": -0.8426337242126465, "logits/rejected": -0.8446739912033081, "logps/chosen": -367.47344970703125, "logps/rejected": -384.815185546875, "loss": 0.5439, "rewards/accuracies": 0.78125, "rewards/chosen": -0.39618077874183655, "rewards/margins": 0.5162340998649597, "rewards/rejected": -0.9124149084091187, "step": 190 }, { "dpo_lambda": 0.9800838828086853, "epoch": 0.3998953153624706, "grad_norm": 21.830819632208076, "learning_rate": 3.75e-07, "logits/chosen": -0.9197947978973389, "logits/rejected": -0.9434197545051575, "logps/chosen": -327.9869384765625, "logps/rejected": -383.2161560058594, "loss": 0.6243, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4305557608604431, "rewards/margins": 0.25802311301231384, "rewards/rejected": -0.6885788440704346, "step": 191 }, { "dpo_lambda": 0.9799790382385254, "epoch": 0.40198900811305943, "grad_norm": 78.10755969507458, "learning_rate": 3.734111735307796e-07, "logits/chosen": -0.8385262489318848, "logits/rejected": -0.8365335464477539, "logps/chosen": -269.979736328125, "logps/rejected": -328.7428283691406, "loss": 0.5734, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3273383378982544, "rewards/margins": 0.37994542717933655, "rewards/rejected": -0.7072837352752686, "step": 192 }, { "dpo_lambda": 0.9798741936683655, "epoch": 0.4040827008636483, "grad_norm": 13.276534069096611, "learning_rate": 3.7181572889485623e-07, "logits/chosen": -0.8739302158355713, "logits/rejected": -0.8805893659591675, "logps/chosen": -382.5960998535156, "logps/rejected": -386.1152648925781, "loss": 0.5846, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5214099287986755, "rewards/margins": 0.38677844405174255, "rewards/rejected": -0.9081884622573853, "step": 193 }, { "dpo_lambda": 0.9797693490982056, "epoch": 0.40617639361423713, "grad_norm": 31.90170636280833, "learning_rate": 3.7021375165108377e-07, "logits/chosen": -0.8950821757316589, "logits/rejected": -0.8792182803153992, "logps/chosen": -327.24017333984375, "logps/rejected": -352.606689453125, "loss": 0.5566, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4424905478954315, "rewards/margins": 0.2954648733139038, "rewards/rejected": -0.7379554510116577, "step": 194 }, { "dpo_lambda": 0.97966468334198, "epoch": 0.408270086364826, "grad_norm": 48.50857855939344, "learning_rate": 3.6860532770864005e-07, "logits/chosen": -0.7950228452682495, "logits/rejected": -0.9293493628501892, "logps/chosen": -341.1165466308594, "logps/rejected": -340.11322021484375, "loss": 0.5936, "rewards/accuracies": 0.703125, "rewards/chosen": -0.47899454832077026, "rewards/margins": 0.3304406404495239, "rewards/rejected": -0.8094351291656494, "step": 195 }, { "dpo_lambda": 0.9795597791671753, "epoch": 0.41036377911541483, "grad_norm": 14.344018616235443, "learning_rate": 3.6699054332241985e-07, "logits/chosen": -0.9165274500846863, "logits/rejected": -0.948397696018219, "logps/chosen": -339.11407470703125, "logps/rejected": -363.7457275390625, "loss": 0.5414, "rewards/accuracies": 0.75, "rewards/chosen": -0.2974359691143036, "rewards/margins": 0.4967215061187744, "rewards/rejected": -0.7941575050354004, "step": 196 }, { "dpo_lambda": 0.9794549345970154, "epoch": 0.4124574718660037, "grad_norm": 16.146850927636255, "learning_rate": 3.653694850884091e-07, "logits/chosen": -0.9266453981399536, "logits/rejected": -0.9306536912918091, "logps/chosen": -336.86529541015625, "logps/rejected": -395.9813537597656, "loss": 0.6083, "rewards/accuracies": 0.65625, "rewards/chosen": -0.34124135971069336, "rewards/margins": 0.3672024607658386, "rewards/rejected": -0.708443820476532, "step": 197 }, { "dpo_lambda": 0.9793500900268555, "epoch": 0.4145511646165925, "grad_norm": 16.33656205714644, "learning_rate": 3.6374223993904124e-07, "logits/chosen": -0.8765822052955627, "logits/rejected": -0.9160308837890625, "logps/chosen": -349.8119201660156, "logps/rejected": -395.2640380859375, "loss": 0.5783, "rewards/accuracies": 0.671875, "rewards/chosen": -0.4447760283946991, "rewards/margins": 0.34209224581718445, "rewards/rejected": -0.7868682146072388, "step": 198 }, { "dpo_lambda": 0.9792452454566956, "epoch": 0.4166448573671814, "grad_norm": 23.060842987392316, "learning_rate": 3.621088951385353e-07, "logits/chosen": -0.9447203278541565, "logits/rejected": -0.9942541718482971, "logps/chosen": -333.0228271484375, "logps/rejected": -373.6483459472656, "loss": 0.5973, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4236346483230591, "rewards/margins": 0.3587067723274231, "rewards/rejected": -0.7823415398597717, "step": 199 }, { "dpo_lambda": 0.9791404008865356, "epoch": 0.4187385501177702, "grad_norm": 35.20357050968758, "learning_rate": 3.604695382782159e-07, "logits/chosen": -0.8524163365364075, "logits/rejected": -0.9107453227043152, "logps/chosen": -377.02886962890625, "logps/rejected": -383.6340026855469, "loss": 0.6219, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5846968293190002, "rewards/margins": 0.27121788263320923, "rewards/rejected": -0.8559147119522095, "step": 200 }, { "epoch": 0.4187385501177702, "eval_dpo_lambda": 0.9790353775024414, "eval_logits/chosen": -0.8926590085029602, "eval_logits/rejected": -0.9312657117843628, "eval_logps/chosen": -341.60223388671875, "eval_logps/rejected": -351.5380554199219, "eval_loss": 0.578376829624176, "eval_rewards/accuracies": 0.7310000061988831, "eval_rewards/chosen": -0.40955302119255066, "eval_rewards/margins": 0.3955199420452118, "eval_rewards/rejected": -0.8050729632377625, "eval_runtime": 561.0371, "eval_samples_per_second": 3.565, "eval_steps_per_second": 0.891, "step": 200 }, { "dpo_lambda": 0.979035496711731, "epoch": 0.4208322428683591, "grad_norm": 12.619078496504596, "learning_rate": 3.588242572718162e-07, "logits/chosen": -0.81094890832901, "logits/rejected": -0.8716937303543091, "logps/chosen": -346.4378967285156, "logps/rejected": -399.423095703125, "loss": 0.5737, "rewards/accuracies": 0.671875, "rewards/chosen": -0.3493123948574066, "rewards/margins": 0.4185783267021179, "rewards/rejected": -0.7678907513618469, "step": 201 }, { "dpo_lambda": 0.9789308905601501, "epoch": 0.4229259356189479, "grad_norm": 55.03692628232677, "learning_rate": 3.571731403507635e-07, "logits/chosen": -0.877128005027771, "logits/rejected": -0.8727900385856628, "logps/chosen": -308.8694152832031, "logps/rejected": -371.779296875, "loss": 0.5829, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4663529098033905, "rewards/margins": 0.43134671449661255, "rewards/rejected": -0.8976995944976807, "step": 202 }, { "dpo_lambda": 0.9788259863853455, "epoch": 0.4250196283695368, "grad_norm": 23.62457430224331, "learning_rate": 3.5551627605944746e-07, "logits/chosen": -0.9041829109191895, "logits/rejected": -0.9442777633666992, "logps/chosen": -306.6086120605469, "logps/rejected": -325.716796875, "loss": 0.6029, "rewards/accuracies": 0.625, "rewards/chosen": -0.4549506902694702, "rewards/margins": 0.29961732029914856, "rewards/rejected": -0.7545679807662964, "step": 203 }, { "dpo_lambda": 0.9787211418151855, "epoch": 0.4271133211201256, "grad_norm": 22.05233559020067, "learning_rate": 3.5385375325047163e-07, "logits/chosen": -0.9502891302108765, "logits/rejected": -0.9089999198913574, "logps/chosen": -298.177978515625, "logps/rejected": -349.14715576171875, "loss": 0.6196, "rewards/accuracies": 0.734375, "rewards/chosen": -0.45129668712615967, "rewards/margins": 0.33733218908309937, "rewards/rejected": -0.7886289358139038, "step": 204 }, { "dpo_lambda": 0.9786162972450256, "epoch": 0.42920701387071447, "grad_norm": 24.065121080878257, "learning_rate": 3.5218566107988867e-07, "logits/chosen": -0.8696624636650085, "logits/rejected": -0.9973980784416199, "logps/chosen": -336.4361877441406, "logps/rejected": -342.7047119140625, "loss": 0.5722, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4459235370159149, "rewards/margins": 0.40657177567481995, "rewards/rejected": -0.8524953126907349, "step": 205 }, { "dpo_lambda": 0.9785114526748657, "epoch": 0.4313007066213033, "grad_norm": 22.47093848456195, "learning_rate": 3.505120890024195e-07, "logits/chosen": -0.7719739675521851, "logits/rejected": -0.8493779897689819, "logps/chosen": -291.58270263671875, "logps/rejected": -316.1316223144531, "loss": 0.5942, "rewards/accuracies": 0.671875, "rewards/chosen": -0.36826789379119873, "rewards/margins": 0.37382954359054565, "rewards/rejected": -0.7420974969863892, "step": 206 }, { "dpo_lambda": 0.9784067869186401, "epoch": 0.43339439937189217, "grad_norm": 44.05441694450149, "learning_rate": 3.4883312676665534e-07, "logits/chosen": -0.9384758472442627, "logits/rejected": -0.9461789131164551, "logps/chosen": -323.506591796875, "logps/rejected": -331.2817687988281, "loss": 0.6098, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3830246329307556, "rewards/margins": 0.41553816199302673, "rewards/rejected": -0.79856276512146, "step": 207 }, { "dpo_lambda": 0.9783019423484802, "epoch": 0.435488092122481, "grad_norm": 26.81934954378791, "learning_rate": 3.4714886441024573e-07, "logits/chosen": -0.9290968179702759, "logits/rejected": -0.9625118374824524, "logps/chosen": -313.3687744140625, "logps/rejected": -333.42425537109375, "loss": 0.5725, "rewards/accuracies": 0.625, "rewards/chosen": -0.4035572409629822, "rewards/margins": 0.3620757460594177, "rewards/rejected": -0.7656329870223999, "step": 208 }, { "dpo_lambda": 0.9781970381736755, "epoch": 0.43758178487306987, "grad_norm": 29.672471588932904, "learning_rate": 3.454593922550693e-07, "logits/chosen": -0.9330939054489136, "logits/rejected": -0.9976711869239807, "logps/chosen": -343.06463623046875, "logps/rejected": -349.95062255859375, "loss": 0.6026, "rewards/accuracies": 0.640625, "rewards/chosen": -0.4826650619506836, "rewards/margins": 0.24588173627853394, "rewards/rejected": -0.7285467982292175, "step": 209 }, { "dpo_lambda": 0.9780922532081604, "epoch": 0.4396754776236587, "grad_norm": 31.246108954505512, "learning_rate": 3.4376480090239047e-07, "logits/chosen": -0.8667219281196594, "logits/rejected": -0.9327815771102905, "logps/chosen": -344.1031799316406, "logps/rejected": -346.087158203125, "loss": 0.5434, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3688032627105713, "rewards/margins": 0.4194371700286865, "rewards/rejected": -0.7882405519485474, "step": 210 }, { "dpo_lambda": 0.9779873490333557, "epoch": 0.44176917037424757, "grad_norm": 8.990953515181312, "learning_rate": 3.4206518122800055e-07, "logits/chosen": -0.9713053703308105, "logits/rejected": -1.0429770946502686, "logps/chosen": -408.6382751464844, "logps/rejected": -343.381591796875, "loss": 0.5813, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3207278251647949, "rewards/margins": 0.327059268951416, "rewards/rejected": -0.6477870941162109, "step": 211 }, { "dpo_lambda": 0.9778825044631958, "epoch": 0.4438628631248364, "grad_norm": 23.5993671623435, "learning_rate": 3.403606243773448e-07, "logits/chosen": -0.8349948525428772, "logits/rejected": -0.8876362442970276, "logps/chosen": -308.45355224609375, "logps/rejected": -344.296875, "loss": 0.5984, "rewards/accuracies": 0.640625, "rewards/chosen": -0.43563535809516907, "rewards/margins": 0.35835322737693787, "rewards/rejected": -0.7939885854721069, "step": 212 }, { "dpo_lambda": 0.9777778387069702, "epoch": 0.44595655587542526, "grad_norm": 31.422298138505585, "learning_rate": 3.3865122176063385e-07, "logits/chosen": -0.8766756057739258, "logits/rejected": -0.9191992282867432, "logps/chosen": -303.934814453125, "logps/rejected": -390.5749816894531, "loss": 0.5448, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3933745324611664, "rewards/margins": 0.47594064474105835, "rewards/rejected": -0.8693151473999023, "step": 213 }, { "dpo_lambda": 0.9776729941368103, "epoch": 0.4480502486260141, "grad_norm": 17.891900661691857, "learning_rate": 3.3693706504794243e-07, "logits/chosen": -0.8904320001602173, "logits/rejected": -0.8882500529289246, "logps/chosen": -349.6669921875, "logps/rejected": -394.34912109375, "loss": 0.5758, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4451632499694824, "rewards/margins": 0.2383812963962555, "rewards/rejected": -0.6835446357727051, "step": 214 }, { "dpo_lambda": 0.9775681495666504, "epoch": 0.45014394137660296, "grad_norm": 26.962413321674106, "learning_rate": 3.3521824616429284e-07, "logits/chosen": -0.9297071695327759, "logits/rejected": -1.0011214017868042, "logps/chosen": -299.71392822265625, "logps/rejected": -358.0120544433594, "loss": 0.5707, "rewards/accuracies": 0.734375, "rewards/chosen": -0.36456355452537537, "rewards/margins": 0.4526520371437073, "rewards/rejected": -0.8172155618667603, "step": 215 }, { "dpo_lambda": 0.9774633049964905, "epoch": 0.4522376341271918, "grad_norm": 19.029499853418418, "learning_rate": 3.334948572847253e-07, "logits/chosen": -0.9595445990562439, "logits/rejected": -0.9602202773094177, "logps/chosen": -287.4220886230469, "logps/rejected": -356.87335205078125, "loss": 0.5705, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3573736250400543, "rewards/margins": 0.4587840139865875, "rewards/rejected": -0.8161575198173523, "step": 216 }, { "dpo_lambda": 0.9773584008216858, "epoch": 0.45433132687778066, "grad_norm": 42.01861807639346, "learning_rate": 3.317669908293554e-07, "logits/chosen": -0.8164669871330261, "logits/rejected": -0.8533629179000854, "logps/chosen": -361.13067626953125, "logps/rejected": -366.3695068359375, "loss": 0.6027, "rewards/accuracies": 0.625, "rewards/chosen": -0.4534375071525574, "rewards/margins": 0.28975534439086914, "rewards/rejected": -0.7431929111480713, "step": 217 }, { "dpo_lambda": 0.977253794670105, "epoch": 0.4564250196283695, "grad_norm": 36.179024344052024, "learning_rate": 3.300347394584172e-07, "logits/chosen": -0.9450358152389526, "logits/rejected": -0.964739203453064, "logps/chosen": -338.791015625, "logps/rejected": -352.18853759765625, "loss": 0.5833, "rewards/accuracies": 0.65625, "rewards/chosen": -0.509911835193634, "rewards/margins": 0.29347842931747437, "rewards/rejected": -0.8033902049064636, "step": 218 }, { "dpo_lambda": 0.9771488904953003, "epoch": 0.4585187123789584, "grad_norm": 13.931536630842533, "learning_rate": 3.2829819606729477e-07, "logits/chosen": -0.9925215840339661, "logits/rejected": -0.9919613599777222, "logps/chosen": -351.99560546875, "logps/rejected": -336.3402099609375, "loss": 0.5738, "rewards/accuracies": 0.671875, "rewards/chosen": -0.33854570984840393, "rewards/margins": 0.41257885098457336, "rewards/rejected": -0.7511245012283325, "step": 219 }, { "dpo_lambda": 0.9770440459251404, "epoch": 0.46061240512954726, "grad_norm": 20.796560138343686, "learning_rate": 3.265574537815398e-07, "logits/chosen": -0.9090243577957153, "logits/rejected": -0.87982577085495, "logps/chosen": -291.6847229003906, "logps/rejected": -295.15838623046875, "loss": 0.5522, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4356476664543152, "rewards/margins": 0.30086103081703186, "rewards/rejected": -0.7365086078643799, "step": 220 }, { "dpo_lambda": 0.9769392013549805, "epoch": 0.4627060978801361, "grad_norm": 17.55086377387192, "learning_rate": 3.248126059518784e-07, "logits/chosen": -0.8062466382980347, "logits/rejected": -0.8155995607376099, "logps/chosen": -350.0935974121094, "logps/rejected": -417.79290771484375, "loss": 0.5654, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5696654319763184, "rewards/margins": 0.3574390411376953, "rewards/rejected": -0.9271044731140137, "step": 221 }, { "dpo_lambda": 0.9768343567848206, "epoch": 0.46479979063072496, "grad_norm": 21.511465866436165, "learning_rate": 3.230637461492043e-07, "logits/chosen": -0.9671846628189087, "logits/rejected": -0.93825763463974, "logps/chosen": -309.1865234375, "logps/rejected": -365.5205383300781, "loss": 0.5752, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4525972604751587, "rewards/margins": 0.44703030586242676, "rewards/rejected": -0.8996275663375854, "step": 222 }, { "dpo_lambda": 0.9767295122146606, "epoch": 0.4668934833813138, "grad_norm": 21.781379476586974, "learning_rate": 3.213109681595612e-07, "logits/chosen": -0.885411262512207, "logits/rejected": -0.9873719811439514, "logps/chosen": -344.0679626464844, "logps/rejected": -348.89544677734375, "loss": 0.5736, "rewards/accuracies": 0.765625, "rewards/chosen": -0.47906970977783203, "rewards/margins": 0.42836642265319824, "rewards/rejected": -0.9074360132217407, "step": 223 }, { "dpo_lambda": 0.976624608039856, "epoch": 0.46898717613190266, "grad_norm": 34.15253461253864, "learning_rate": 3.1955436597911315e-07, "logits/chosen": -0.8723405003547668, "logits/rejected": -0.9412495493888855, "logps/chosen": -360.9906005859375, "logps/rejected": -400.2396240234375, "loss": 0.597, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5952271223068237, "rewards/margins": 0.4624156057834625, "rewards/rejected": -1.0576426982879639, "step": 224 }, { "dpo_lambda": 0.9765199422836304, "epoch": 0.4710808688824915, "grad_norm": 17.76135160440879, "learning_rate": 3.1779403380910425e-07, "logits/chosen": -0.8463708162307739, "logits/rejected": -0.8820710182189941, "logps/chosen": -297.9212646484375, "logps/rejected": -382.1929931640625, "loss": 0.5682, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5124987363815308, "rewards/margins": 0.5348488688468933, "rewards/rejected": -1.0473475456237793, "step": 225 }, { "dpo_lambda": 0.9764150977134705, "epoch": 0.47317456163308036, "grad_norm": 27.616034119162922, "learning_rate": 3.160300660508064e-07, "logits/chosen": -0.9552580714225769, "logits/rejected": -0.9441887140274048, "logps/chosen": -325.70379638671875, "logps/rejected": -377.0942687988281, "loss": 0.5982, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5950958728790283, "rewards/margins": 0.43342557549476624, "rewards/rejected": -1.0285214185714722, "step": 226 }, { "dpo_lambda": 0.9763102531433105, "epoch": 0.4752682543836692, "grad_norm": 35.0109328334649, "learning_rate": 3.1426255730045695e-07, "logits/chosen": -0.9298887252807617, "logits/rejected": -0.8998876214027405, "logps/chosen": -342.6550598144531, "logps/rejected": -410.9444885253906, "loss": 0.5938, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7437422275543213, "rewards/margins": 0.36158329248428345, "rewards/rejected": -1.10532546043396, "step": 227 }, { "dpo_lambda": 0.9762054085731506, "epoch": 0.47736194713425806, "grad_norm": 21.706855446762994, "learning_rate": 3.1249160234418644e-07, "logits/chosen": -0.9593175649642944, "logits/rejected": -0.9820206165313721, "logps/chosen": -393.0712890625, "logps/rejected": -404.17303466796875, "loss": 0.6354, "rewards/accuracies": 0.671875, "rewards/chosen": -0.6769300699234009, "rewards/margins": 0.3926386535167694, "rewards/rejected": -1.0695687532424927, "step": 228 }, { "dpo_lambda": 0.9761005640029907, "epoch": 0.4794556398848469, "grad_norm": 50.00541063078205, "learning_rate": 3.1071729615293424e-07, "logits/chosen": -0.9064358472824097, "logits/rejected": -0.9524105787277222, "logps/chosen": -417.2607727050781, "logps/rejected": -373.5041198730469, "loss": 0.6075, "rewards/accuracies": 0.625, "rewards/chosen": -0.8266244530677795, "rewards/margins": 0.33187851309776306, "rewards/rejected": -1.1585030555725098, "step": 229 }, { "dpo_lambda": 0.9759958982467651, "epoch": 0.48154933263543576, "grad_norm": 14.21994582685636, "learning_rate": 3.0893973387735683e-07, "logits/chosen": -0.9525047540664673, "logits/rejected": -0.9384512901306152, "logps/chosen": -357.6230163574219, "logps/rejected": -417.1165771484375, "loss": 0.6028, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6341197490692139, "rewards/margins": 0.4427669644355774, "rewards/rejected": -1.0768866539001465, "step": 230 }, { "dpo_lambda": 0.9758910536766052, "epoch": 0.4836430253860246, "grad_norm": 31.64740174231057, "learning_rate": 3.071590108427243e-07, "logits/chosen": -0.8360011577606201, "logits/rejected": -0.9409224987030029, "logps/chosen": -367.2627258300781, "logps/rejected": -375.06622314453125, "loss": 0.5399, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6678786277770996, "rewards/margins": 0.44707852602005005, "rewards/rejected": -1.1149570941925049, "step": 231 }, { "dpo_lambda": 0.9757861495018005, "epoch": 0.48573671813661345, "grad_norm": 12.32973304776845, "learning_rate": 3.05375222543809e-07, "logits/chosen": -0.9434400796890259, "logits/rejected": -0.9206544756889343, "logps/chosen": -377.33905029296875, "logps/rejected": -420.7717590332031, "loss": 0.571, "rewards/accuracies": 0.75, "rewards/chosen": -0.5599082708358765, "rewards/margins": 0.3856970965862274, "rewards/rejected": -0.9456053972244263, "step": 232 }, { "dpo_lambda": 0.9756813645362854, "epoch": 0.4878304108872023, "grad_norm": 29.85350415813659, "learning_rate": 3.035884646397637e-07, "logits/chosen": -0.8815343976020813, "logits/rejected": -0.9104249477386475, "logps/chosen": -394.9241943359375, "logps/rejected": -365.0865783691406, "loss": 0.5728, "rewards/accuracies": 0.75, "rewards/chosen": -0.5130780935287476, "rewards/margins": 0.42393574118614197, "rewards/rejected": -0.9370138049125671, "step": 233 }, { "dpo_lambda": 0.9755764603614807, "epoch": 0.48992410363779115, "grad_norm": 28.006756307210736, "learning_rate": 3.017988329489923e-07, "logits/chosen": -0.8777972459793091, "logits/rejected": -0.837729811668396, "logps/chosen": -272.7120666503906, "logps/rejected": -368.10809326171875, "loss": 0.4952, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3836371600627899, "rewards/margins": 0.5006729960441589, "rewards/rejected": -0.8843101263046265, "step": 234 }, { "dpo_lambda": 0.9754716157913208, "epoch": 0.49201779638838, "grad_norm": 22.112467921438846, "learning_rate": 3.000064234440111e-07, "logits/chosen": -0.9350335597991943, "logits/rejected": -1.0734158754348755, "logps/chosen": -397.2668762207031, "logps/rejected": -365.10101318359375, "loss": 0.5751, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4139818251132965, "rewards/margins": 0.4701218605041504, "rewards/rejected": -0.8841036558151245, "step": 235 }, { "dpo_lambda": 0.9753669500350952, "epoch": 0.49411148913896885, "grad_norm": 38.42963861882906, "learning_rate": 2.9821133224630223e-07, "logits/chosen": -0.9367507696151733, "logits/rejected": -0.9378839731216431, "logps/chosen": -346.4058532714844, "logps/rejected": -385.244140625, "loss": 0.5592, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5048272013664246, "rewards/margins": 0.40106263756752014, "rewards/rejected": -0.9058898687362671, "step": 236 }, { "dpo_lambda": 0.9752621054649353, "epoch": 0.4962051818895577, "grad_norm": 20.272542585780734, "learning_rate": 2.964136556211588e-07, "logits/chosen": -1.0191184282302856, "logits/rejected": -1.0410274267196655, "logps/chosen": -351.78411865234375, "logps/rejected": -365.6335144042969, "loss": 0.5576, "rewards/accuracies": 0.671875, "rewards/chosen": -0.29722094535827637, "rewards/margins": 0.4697399139404297, "rewards/rejected": -0.766960859298706, "step": 237 }, { "dpo_lambda": 0.9751572012901306, "epoch": 0.49829887464014655, "grad_norm": 12.14766408231827, "learning_rate": 2.946134899725226e-07, "logits/chosen": -0.935865044593811, "logits/rejected": -0.974367618560791, "logps/chosen": -348.61761474609375, "logps/rejected": -370.552978515625, "loss": 0.5527, "rewards/accuracies": 0.71875, "rewards/chosen": -0.35987845063209534, "rewards/margins": 0.5407595634460449, "rewards/rejected": -0.9006379842758179, "step": 238 }, { "dpo_lambda": 0.9750524163246155, "epoch": 0.5003925673907355, "grad_norm": 10.99279566098769, "learning_rate": 2.9281093183781403e-07, "logits/chosen": -0.8710501194000244, "logits/rejected": -0.8785250186920166, "logps/chosen": -268.412109375, "logps/rejected": -303.83709716796875, "loss": 0.5426, "rewards/accuracies": 0.734375, "rewards/chosen": -0.2745110094547272, "rewards/margins": 0.5189369916915894, "rewards/rejected": -0.7934479713439941, "step": 239 }, { "dpo_lambda": 0.9749475121498108, "epoch": 0.5024862601413242, "grad_norm": 19.68518029818432, "learning_rate": 2.910060778827554e-07, "logits/chosen": -0.9147160053253174, "logits/rejected": -0.9373922944068909, "logps/chosen": -341.3359069824219, "logps/rejected": -412.0687255859375, "loss": 0.5678, "rewards/accuracies": 0.765625, "rewards/chosen": -0.2783906161785126, "rewards/margins": 0.5757811069488525, "rewards/rejected": -0.8541717529296875, "step": 240 }, { "dpo_lambda": 0.97484290599823, "epoch": 0.5045799528919132, "grad_norm": 22.785011379156558, "learning_rate": 2.891990248961871e-07, "logits/chosen": -0.941616952419281, "logits/rejected": -0.9725069999694824, "logps/chosen": -358.02911376953125, "logps/rejected": -356.3277587890625, "loss": 0.5623, "rewards/accuracies": 0.734375, "rewards/chosen": -0.3600071966648102, "rewards/margins": 0.49038365483283997, "rewards/rejected": -0.8503908514976501, "step": 241 }, { "dpo_lambda": 0.9747380018234253, "epoch": 0.506673645642502, "grad_norm": 28.10712370816071, "learning_rate": 2.873898697848762e-07, "logits/chosen": -0.9158104658126831, "logits/rejected": -1.0190154314041138, "logps/chosen": -337.3912353515625, "logps/rejected": -319.40380859375, "loss": 0.5311, "rewards/accuracies": 0.734375, "rewards/chosen": -0.30729302763938904, "rewards/margins": 0.49401095509529114, "rewards/rejected": -0.8013039827346802, "step": 242 }, { "dpo_lambda": 0.9746331572532654, "epoch": 0.5087673383930909, "grad_norm": 30.036231340589893, "learning_rate": 2.8557870956832133e-07, "logits/chosen": -0.9809325337409973, "logits/rejected": -1.0072345733642578, "logps/chosen": -363.1551208496094, "logps/rejected": -405.18438720703125, "loss": 0.6094, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3382487893104553, "rewards/margins": 0.4237140417098999, "rewards/rejected": -0.7619627714157104, "step": 243 }, { "dpo_lambda": 0.9745283126831055, "epoch": 0.5108610311436796, "grad_norm": 38.498725672458825, "learning_rate": 2.837656413735479e-07, "logits/chosen": -0.9273790717124939, "logits/rejected": -0.9404940009117126, "logps/chosen": -370.77655029296875, "logps/rejected": -379.8350830078125, "loss": 0.552, "rewards/accuracies": 0.75, "rewards/chosen": -0.34177640080451965, "rewards/margins": 0.5170974731445312, "rewards/rejected": -0.8588739037513733, "step": 244 }, { "dpo_lambda": 0.9744234681129456, "epoch": 0.5129547238942685, "grad_norm": 26.230890552567814, "learning_rate": 2.8195076242990116e-07, "logits/chosen": -0.8353450298309326, "logits/rejected": -0.8841926455497742, "logps/chosen": -295.8029479980469, "logps/rejected": -348.716552734375, "loss": 0.5904, "rewards/accuracies": 0.671875, "rewards/chosen": -0.31430384516716003, "rewards/margins": 0.49294087290763855, "rewards/rejected": -0.8072446584701538, "step": 245 }, { "dpo_lambda": 0.9743185639381409, "epoch": 0.5150484166448573, "grad_norm": 20.742157064446065, "learning_rate": 2.801341700638307e-07, "logits/chosen": -1.0004804134368896, "logits/rejected": -1.0661176443099976, "logps/chosen": -306.6329040527344, "logps/rejected": -319.06097412109375, "loss": 0.5881, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3133764863014221, "rewards/margins": 0.5279111266136169, "rewards/rejected": -0.8412876725196838, "step": 246 }, { "dpo_lambda": 0.974213719367981, "epoch": 0.5171421093954462, "grad_norm": 41.92705112635296, "learning_rate": 2.7831596169367227e-07, "logits/chosen": -1.0456688404083252, "logits/rejected": -0.9775162935256958, "logps/chosen": -349.86138916015625, "logps/rejected": -417.2496337890625, "loss": 0.5622, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2810763120651245, "rewards/margins": 0.5058405995368958, "rewards/rejected": -0.7869168519973755, "step": 247 }, { "dpo_lambda": 0.9741090536117554, "epoch": 0.519235802146035, "grad_norm": 35.15777219108126, "learning_rate": 2.7649623482442274e-07, "logits/chosen": -0.9570460319519043, "logits/rejected": -0.9240747094154358, "logps/chosen": -279.88751220703125, "logps/rejected": -371.7086486816406, "loss": 0.5803, "rewards/accuracies": 0.765625, "rewards/chosen": -0.38907331228256226, "rewards/margins": 0.3909260630607605, "rewards/rejected": -0.7799993753433228, "step": 248 }, { "dpo_lambda": 0.9740042090415955, "epoch": 0.521329494896624, "grad_norm": 42.73962957924612, "learning_rate": 2.7467508704251135e-07, "logits/chosen": -0.9482097625732422, "logits/rejected": -0.9136594533920288, "logps/chosen": -295.8553771972656, "logps/rejected": -348.8441162109375, "loss": 0.5374, "rewards/accuracies": 0.75, "rewards/chosen": -0.3581140637397766, "rewards/margins": 0.5417485237121582, "rewards/rejected": -0.8998625874519348, "step": 249 }, { "dpo_lambda": 0.9738993644714355, "epoch": 0.5234231876472127, "grad_norm": 42.5375636334624, "learning_rate": 2.7285261601056697e-07, "logits/chosen": -0.9109609127044678, "logits/rejected": -0.9083539247512817, "logps/chosen": -359.6793212890625, "logps/rejected": -369.906494140625, "loss": 0.5738, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4524061381816864, "rewards/margins": 0.37430381774902344, "rewards/rejected": -0.8267098665237427, "step": 250 }, { "epoch": 0.5234231876472127, "eval_dpo_lambda": 0.97379469871521, "eval_logits/chosen": -0.933293879032135, "eval_logits/rejected": -0.9690828919410706, "eval_logps/chosen": -344.027587890625, "eval_logps/rejected": -359.6706848144531, "eval_loss": 0.5684590339660645, "eval_rewards/accuracies": 0.7260000109672546, "eval_rewards/chosen": -0.43380701541900635, "eval_rewards/margins": 0.4525919556617737, "eval_rewards/rejected": -0.88639897108078, "eval_runtime": 561.0273, "eval_samples_per_second": 3.565, "eval_steps_per_second": 0.891, "step": 250 }, { "dpo_lambda": 0.9737945199012756, "epoch": 0.5255168803978016, "grad_norm": 21.60785466834392, "learning_rate": 2.7102891946217994e-07, "logits/chosen": -0.9251986742019653, "logits/rejected": -0.9310693740844727, "logps/chosen": -336.9014892578125, "logps/rejected": -380.8359069824219, "loss": 0.5422, "rewards/accuracies": 0.671875, "rewards/chosen": -0.44501379132270813, "rewards/margins": 0.45862460136413574, "rewards/rejected": -0.903638482093811, "step": 251 }, { "dpo_lambda": 0.9736896753311157, "epoch": 0.5276105731483904, "grad_norm": 19.01047964693019, "learning_rate": 2.692040951966617e-07, "logits/chosen": -1.0268956422805786, "logits/rejected": -0.991072416305542, "logps/chosen": -305.11199951171875, "logps/rejected": -360.6181945800781, "loss": 0.575, "rewards/accuracies": 0.703125, "rewards/chosen": -0.34655502438545227, "rewards/margins": 0.4588647484779358, "rewards/rejected": -0.8054197430610657, "step": 252 }, { "dpo_lambda": 0.9735850095748901, "epoch": 0.5297042658989793, "grad_norm": 22.46505736505972, "learning_rate": 2.6737824107379947e-07, "logits/chosen": -0.9305629730224609, "logits/rejected": -1.0360764265060425, "logps/chosen": -383.3492736816406, "logps/rejected": -389.62158203125, "loss": 0.5413, "rewards/accuracies": 0.734375, "rewards/chosen": -0.37200137972831726, "rewards/margins": 0.5039467215538025, "rewards/rejected": -0.8759480714797974, "step": 253 }, { "dpo_lambda": 0.9734801650047302, "epoch": 0.5317979586495681, "grad_norm": 23.321068867881976, "learning_rate": 2.655514550086086e-07, "logits/chosen": -1.0165340900421143, "logits/rejected": -1.0736379623413086, "logps/chosen": -300.9527587890625, "logps/rejected": -284.6766052246094, "loss": 0.575, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3121712803840637, "rewards/margins": 0.4902705252170563, "rewards/rejected": -0.8024417757987976, "step": 254 }, { "dpo_lambda": 0.9733752608299255, "epoch": 0.533891651400157, "grad_norm": 23.03157221916909, "learning_rate": 2.6372383496608186e-07, "logits/chosen": -0.9229246973991394, "logits/rejected": -0.9603086113929749, "logps/chosen": -347.5473937988281, "logps/rejected": -392.124267578125, "loss": 0.5832, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4431746006011963, "rewards/margins": 0.49735429883003235, "rewards/rejected": -0.9405289888381958, "step": 255 }, { "dpo_lambda": 0.9732704162597656, "epoch": 0.5359853441507458, "grad_norm": 34.91779344587045, "learning_rate": 2.618954789559356e-07, "logits/chosen": -0.9654712677001953, "logits/rejected": -0.950838565826416, "logps/chosen": -334.4616394042969, "logps/rejected": -320.8468017578125, "loss": 0.5851, "rewards/accuracies": 0.75, "rewards/chosen": -0.587689995765686, "rewards/margins": 0.39830049872398376, "rewards/rejected": -0.9859905242919922, "step": 256 }, { "dpo_lambda": 0.9731655716896057, "epoch": 0.5380790369013347, "grad_norm": 14.085882529594617, "learning_rate": 2.600664850273538e-07, "logits/chosen": -0.9465386867523193, "logits/rejected": -0.9746992588043213, "logps/chosen": -362.1474609375, "logps/rejected": -368.56982421875, "loss": 0.5371, "rewards/accuracies": 0.734375, "rewards/chosen": -0.506095290184021, "rewards/margins": 0.5111798644065857, "rewards/rejected": -1.017275094985962, "step": 257 }, { "dpo_lambda": 0.9730607271194458, "epoch": 0.5401727296519235, "grad_norm": 15.014510984375404, "learning_rate": 2.582369512637302e-07, "logits/chosen": -0.9997269511222839, "logits/rejected": -1.0087834596633911, "logps/chosen": -357.7166442871094, "logps/rejected": -397.3079833984375, "loss": 0.5591, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6475968956947327, "rewards/margins": 0.450656533241272, "rewards/rejected": -1.0982534885406494, "step": 258 }, { "dpo_lambda": 0.9729558229446411, "epoch": 0.5422664224025124, "grad_norm": 48.7863772083964, "learning_rate": 2.5640697577740815e-07, "logits/chosen": -1.0039561986923218, "logits/rejected": -0.9903517365455627, "logps/chosen": -336.15496826171875, "logps/rejected": -392.0978698730469, "loss": 0.5776, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5708963871002197, "rewards/margins": 0.4194872975349426, "rewards/rejected": -0.9903836846351624, "step": 259 }, { "dpo_lambda": 0.9728512167930603, "epoch": 0.5443601151531012, "grad_norm": 43.91502909416745, "learning_rate": 2.5457665670441937e-07, "logits/chosen": -0.9723465442657471, "logits/rejected": -0.9578585028648376, "logps/chosen": -315.97186279296875, "logps/rejected": -356.4257507324219, "loss": 0.5949, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4254854619503021, "rewards/margins": 0.569304347038269, "rewards/rejected": -0.994789719581604, "step": 260 }, { "dpo_lambda": 0.9727463126182556, "epoch": 0.5464538079036901, "grad_norm": 34.6980965461914, "learning_rate": 2.527460921992209e-07, "logits/chosen": -0.9857003688812256, "logits/rejected": -0.9960317611694336, "logps/chosen": -363.2552490234375, "logps/rejected": -397.5711975097656, "loss": 0.5132, "rewards/accuracies": 0.75, "rewards/chosen": -0.43664252758026123, "rewards/margins": 0.5617834329605103, "rewards/rejected": -0.9984259605407715, "step": 261 }, { "dpo_lambda": 0.9726415276527405, "epoch": 0.5485475006542789, "grad_norm": 15.205720751303948, "learning_rate": 2.509153804294318e-07, "logits/chosen": -0.9044791460037231, "logits/rejected": -0.9335215091705322, "logps/chosen": -326.0076904296875, "logps/rejected": -371.7582092285156, "loss": 0.5265, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4435957968235016, "rewards/margins": 0.5709205865859985, "rewards/rejected": -1.0145163536071777, "step": 262 }, { "dpo_lambda": 0.9725366234779358, "epoch": 0.5506411934048678, "grad_norm": 43.35266805487184, "learning_rate": 2.4908461957056825e-07, "logits/chosen": -0.9475391507148743, "logits/rejected": -0.9761734008789062, "logps/chosen": -405.1850891113281, "logps/rejected": -402.66412353515625, "loss": 0.5699, "rewards/accuracies": 0.71875, "rewards/chosen": -0.44304272532463074, "rewards/margins": 0.4835435450077057, "rewards/rejected": -0.9265862107276917, "step": 263 }, { "dpo_lambda": 0.9724317789077759, "epoch": 0.5527348861554567, "grad_norm": 32.41316077566274, "learning_rate": 2.4725390780077905e-07, "logits/chosen": -0.9430174827575684, "logits/rejected": -0.9314047694206238, "logps/chosen": -363.8974914550781, "logps/rejected": -381.5852966308594, "loss": 0.5414, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5308617353439331, "rewards/margins": 0.37887492775917053, "rewards/rejected": -0.9097366333007812, "step": 264 }, { "dpo_lambda": 0.9723271131515503, "epoch": 0.5548285789060455, "grad_norm": 40.621721822921614, "learning_rate": 2.454233432955807e-07, "logits/chosen": -0.8697124719619751, "logits/rejected": -0.9561302065849304, "logps/chosen": -302.15875244140625, "logps/rejected": -327.24993896484375, "loss": 0.5883, "rewards/accuracies": 0.671875, "rewards/chosen": -0.45329856872558594, "rewards/margins": 0.38187170028686523, "rewards/rejected": -0.835170328617096, "step": 265 }, { "dpo_lambda": 0.9722222685813904, "epoch": 0.5569222716566344, "grad_norm": 368.844160392225, "learning_rate": 2.435930242225919e-07, "logits/chosen": -1.0060429573059082, "logits/rejected": -0.9610638618469238, "logps/chosen": -361.58856201171875, "logps/rejected": -349.51171875, "loss": 0.5887, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4895636737346649, "rewards/margins": 0.3743961751461029, "rewards/rejected": -0.863959789276123, "step": 266 }, { "dpo_lambda": 0.9721174240112305, "epoch": 0.5590159644072232, "grad_norm": 28.521813474186093, "learning_rate": 2.4176304873626984e-07, "logits/chosen": -0.9990183115005493, "logits/rejected": -1.001824140548706, "logps/chosen": -289.71588134765625, "logps/rejected": -331.92138671875, "loss": 0.5715, "rewards/accuracies": 0.75, "rewards/chosen": -0.42115557193756104, "rewards/margins": 0.5132542848587036, "rewards/rejected": -0.9344099164009094, "step": 267 }, { "dpo_lambda": 0.9720125794410706, "epoch": 0.5611096571578121, "grad_norm": 22.847288152780855, "learning_rate": 2.399335149726463e-07, "logits/chosen": -0.9127358198165894, "logits/rejected": -0.9713425040245056, "logps/chosen": -324.10101318359375, "logps/rejected": -332.75848388671875, "loss": 0.5619, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3632182478904724, "rewards/margins": 0.5284265279769897, "rewards/rejected": -0.8916447758674622, "step": 268 }, { "dpo_lambda": 0.9719076752662659, "epoch": 0.5632033499084009, "grad_norm": 19.55742024939717, "learning_rate": 2.381045210440644e-07, "logits/chosen": -0.8354206681251526, "logits/rejected": -0.8707402348518372, "logps/chosen": -338.0024719238281, "logps/rejected": -319.0690002441406, "loss": 0.6275, "rewards/accuracies": 0.609375, "rewards/chosen": -0.5561378002166748, "rewards/margins": 0.29612746834754944, "rewards/rejected": -0.8522651791572571, "step": 269 }, { "dpo_lambda": 0.971802830696106, "epoch": 0.5652970426589898, "grad_norm": 18.91699155034226, "learning_rate": 2.3627616503391812e-07, "logits/chosen": -0.9677299857139587, "logits/rejected": -1.0638877153396606, "logps/chosen": -398.1684875488281, "logps/rejected": -357.58416748046875, "loss": 0.5519, "rewards/accuracies": 0.671875, "rewards/chosen": -0.32594913244247437, "rewards/margins": 0.5193861126899719, "rewards/rejected": -0.8453351855278015, "step": 270 }, { "dpo_lambda": 0.9716981649398804, "epoch": 0.5673907354095786, "grad_norm": 102.59972259193498, "learning_rate": 2.344485449913914e-07, "logits/chosen": -0.911102831363678, "logits/rejected": -1.0000059604644775, "logps/chosen": -372.7582702636719, "logps/rejected": -373.64996337890625, "loss": 0.5403, "rewards/accuracies": 0.796875, "rewards/chosen": -0.42270204424858093, "rewards/margins": 0.5330160856246948, "rewards/rejected": -0.9557181596755981, "step": 271 }, { "dpo_lambda": 0.9715933203697205, "epoch": 0.5694844281601675, "grad_norm": 22.287403967915196, "learning_rate": 2.3262175892620062e-07, "logits/chosen": -0.9188674092292786, "logits/rejected": -0.9886456727981567, "logps/chosen": -334.9513244628906, "logps/rejected": -324.5064697265625, "loss": 0.5841, "rewards/accuracies": 0.609375, "rewards/chosen": -0.5083807110786438, "rewards/margins": 0.40126848220825195, "rewards/rejected": -0.9096491932868958, "step": 272 }, { "dpo_lambda": 0.9714884757995605, "epoch": 0.5715781209107563, "grad_norm": 17.09621759169118, "learning_rate": 2.3079590480333827e-07, "logits/chosen": -0.9051034450531006, "logits/rejected": -0.9472033977508545, "logps/chosen": -316.8356628417969, "logps/rejected": -348.45428466796875, "loss": 0.5844, "rewards/accuracies": 0.765625, "rewards/chosen": -0.46500861644744873, "rewards/margins": 0.4641115069389343, "rewards/rejected": -0.9291200637817383, "step": 273 }, { "dpo_lambda": 0.9713836312294006, "epoch": 0.5736718136613452, "grad_norm": 26.00890481729291, "learning_rate": 2.2897108053782e-07, "logits/chosen": -1.0062130689620972, "logits/rejected": -1.063223958015442, "logps/chosen": -324.51312255859375, "logps/rejected": -338.21392822265625, "loss": 0.6106, "rewards/accuracies": 0.640625, "rewards/chosen": -0.3091861605644226, "rewards/margins": 0.3285538852214813, "rewards/rejected": -0.6377400159835815, "step": 274 }, { "dpo_lambda": 0.9712787866592407, "epoch": 0.575765506411934, "grad_norm": 21.297733625729723, "learning_rate": 2.2714738398943308e-07, "logits/chosen": -0.8751081228256226, "logits/rejected": -0.9124285578727722, "logps/chosen": -327.5624084472656, "logps/rejected": -395.2243347167969, "loss": 0.5115, "rewards/accuracies": 0.8125, "rewards/chosen": -0.24274717271327972, "rewards/margins": 0.703201413154602, "rewards/rejected": -0.9459485411643982, "step": 275 }, { "dpo_lambda": 0.9711741209030151, "epoch": 0.5778591991625229, "grad_norm": 42.009923318640546, "learning_rate": 2.2532491295748865e-07, "logits/chosen": -1.0958585739135742, "logits/rejected": -1.0163934230804443, "logps/chosen": -331.8833923339844, "logps/rejected": -386.0252380371094, "loss": 0.5985, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3500264585018158, "rewards/margins": 0.35328248143196106, "rewards/rejected": -0.7033089399337769, "step": 276 }, { "dpo_lambda": 0.9710692167282104, "epoch": 0.5799528919131117, "grad_norm": 24.909203298179698, "learning_rate": 2.2350376517557726e-07, "logits/chosen": -0.9364175200462341, "logits/rejected": -0.9506851434707642, "logps/chosen": -374.86480712890625, "logps/rejected": -346.6588134765625, "loss": 0.6034, "rewards/accuracies": 0.65625, "rewards/chosen": -0.38636910915374756, "rewards/margins": 0.2424267828464508, "rewards/rejected": -0.628795862197876, "step": 277 }, { "dpo_lambda": 0.9709643721580505, "epoch": 0.5820465846637006, "grad_norm": 17.112568138221835, "learning_rate": 2.2168403830632769e-07, "logits/chosen": -0.9554729461669922, "logits/rejected": -0.9818132519721985, "logps/chosen": -345.8843078613281, "logps/rejected": -395.644775390625, "loss": 0.5531, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4023358225822449, "rewards/margins": 0.5377238392829895, "rewards/rejected": -0.9400597214698792, "step": 278 }, { "dpo_lambda": 0.9708595275878906, "epoch": 0.5841402774142894, "grad_norm": 27.27451330453449, "learning_rate": 2.1986582993616925e-07, "logits/chosen": -0.8535746335983276, "logits/rejected": -0.9038280248641968, "logps/chosen": -400.73455810546875, "logps/rejected": -396.37921142578125, "loss": 0.5588, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2820463180541992, "rewards/margins": 0.48483264446258545, "rewards/rejected": -0.7668789029121399, "step": 279 }, { "dpo_lambda": 0.9707546830177307, "epoch": 0.5862339701648783, "grad_norm": 35.46381152183321, "learning_rate": 2.1804923757009882e-07, "logits/chosen": -1.0355491638183594, "logits/rejected": -1.008597493171692, "logps/chosen": -331.9897766113281, "logps/rejected": -345.2893981933594, "loss": 0.6156, "rewards/accuracies": 0.671875, "rewards/chosen": -0.3384499251842499, "rewards/margins": 0.22750774025917053, "rewards/rejected": -0.5659576654434204, "step": 280 }, { "dpo_lambda": 0.9706498384475708, "epoch": 0.5883276629154671, "grad_norm": 18.095530899983487, "learning_rate": 2.1623435862645205e-07, "logits/chosen": -0.896825909614563, "logits/rejected": -0.8690459728240967, "logps/chosen": -325.27313232421875, "logps/rejected": -389.0777587890625, "loss": 0.5353, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17332224547863007, "rewards/margins": 0.4838894009590149, "rewards/rejected": -0.6572116613388062, "step": 281 }, { "dpo_lambda": 0.9705449342727661, "epoch": 0.590421355666056, "grad_norm": 26.60962349078345, "learning_rate": 2.1442129043167873e-07, "logits/chosen": -0.9322452545166016, "logits/rejected": -0.9412853121757507, "logps/chosen": -319.7808532714844, "logps/rejected": -352.3249816894531, "loss": 0.5779, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3038173317909241, "rewards/margins": 0.4064999520778656, "rewards/rejected": -0.7103172540664673, "step": 282 }, { "dpo_lambda": 0.9704403281211853, "epoch": 0.5925150484166448, "grad_norm": 13.698113538753912, "learning_rate": 2.1261013021512378e-07, "logits/chosen": -1.0408434867858887, "logits/rejected": -1.0634641647338867, "logps/chosen": -324.38421630859375, "logps/rejected": -296.24310302734375, "loss": 0.5603, "rewards/accuracies": 0.703125, "rewards/chosen": -0.2441764771938324, "rewards/margins": 0.45150530338287354, "rewards/rejected": -0.6956817507743835, "step": 283 }, { "dpo_lambda": 0.9703354239463806, "epoch": 0.5946087411672337, "grad_norm": 35.606967375148706, "learning_rate": 2.1080097510381294e-07, "logits/chosen": -0.9592381119728088, "logits/rejected": -0.9521263241767883, "logps/chosen": -367.49041748046875, "logps/rejected": -358.9259948730469, "loss": 0.572, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4385915994644165, "rewards/margins": 0.35387519001960754, "rewards/rejected": -0.7924667596817017, "step": 284 }, { "dpo_lambda": 0.9702305793762207, "epoch": 0.5967024339178225, "grad_norm": 35.5832827055, "learning_rate": 2.089939221172446e-07, "logits/chosen": -0.866810142993927, "logits/rejected": -0.9270384311676025, "logps/chosen": -296.6413269042969, "logps/rejected": -340.2668762207031, "loss": 0.534, "rewards/accuracies": 0.828125, "rewards/chosen": -0.17586766183376312, "rewards/margins": 0.633611798286438, "rewards/rejected": -0.8094794154167175, "step": 285 }, { "dpo_lambda": 0.9701257348060608, "epoch": 0.5987961266684114, "grad_norm": 35.734015963321454, "learning_rate": 2.0718906816218595e-07, "logits/chosen": -0.9615485668182373, "logits/rejected": -0.9750257134437561, "logps/chosen": -349.071533203125, "logps/rejected": -384.72247314453125, "loss": 0.5693, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4143269658088684, "rewards/margins": 0.5034248232841492, "rewards/rejected": -0.9177517890930176, "step": 286 }, { "dpo_lambda": 0.9700208902359009, "epoch": 0.6008898194190002, "grad_norm": 30.680257622648437, "learning_rate": 2.053865100274774e-07, "logits/chosen": -1.0024163722991943, "logits/rejected": -1.057032823562622, "logps/chosen": -356.08160400390625, "logps/rejected": -342.71380615234375, "loss": 0.5694, "rewards/accuracies": 0.6875, "rewards/chosen": -0.33542853593826294, "rewards/margins": 0.4585578143596649, "rewards/rejected": -0.7939863801002502, "step": 287 }, { "dpo_lambda": 0.9699162244796753, "epoch": 0.6029835121695891, "grad_norm": 23.556702970914774, "learning_rate": 2.035863443788411e-07, "logits/chosen": -0.9257787466049194, "logits/rejected": -0.9421148300170898, "logps/chosen": -327.1122131347656, "logps/rejected": -364.0360412597656, "loss": 0.5885, "rewards/accuracies": 0.765625, "rewards/chosen": -0.31706058979034424, "rewards/margins": 0.6234011054039001, "rewards/rejected": -0.9404616355895996, "step": 288 }, { "dpo_lambda": 0.9698113799095154, "epoch": 0.6050772049201779, "grad_norm": 28.145132727704436, "learning_rate": 2.0178866775369774e-07, "logits/chosen": -0.9009627103805542, "logits/rejected": -0.9209296107292175, "logps/chosen": -294.1601867675781, "logps/rejected": -330.5748291015625, "loss": 0.5464, "rewards/accuracies": 0.765625, "rewards/chosen": -0.27767202258110046, "rewards/margins": 0.6135878562927246, "rewards/rejected": -0.8912597894668579, "step": 289 }, { "dpo_lambda": 0.9697064757347107, "epoch": 0.6071708976707668, "grad_norm": 19.389825728653452, "learning_rate": 1.9999357655598891e-07, "logits/chosen": -0.8907105326652527, "logits/rejected": -0.9732505083084106, "logps/chosen": -292.0400085449219, "logps/rejected": -282.02520751953125, "loss": 0.5727, "rewards/accuracies": 0.796875, "rewards/chosen": -0.28145384788513184, "rewards/margins": 0.5408211350440979, "rewards/rejected": -0.822274923324585, "step": 290 }, { "dpo_lambda": 0.9696016907691956, "epoch": 0.6092645904213556, "grad_norm": 22.906647894771478, "learning_rate": 1.9820116705100775e-07, "logits/chosen": -0.974831759929657, "logits/rejected": -0.9915525317192078, "logps/chosen": -350.82232666015625, "logps/rejected": -394.2654724121094, "loss": 0.6151, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4027952551841736, "rewards/margins": 0.3092948794364929, "rewards/rejected": -0.7120901346206665, "step": 291 }, { "dpo_lambda": 0.9694967865943909, "epoch": 0.6113582831719445, "grad_norm": 30.636875549929094, "learning_rate": 1.9641153536023642e-07, "logits/chosen": -0.7265560030937195, "logits/rejected": -0.8953325152397156, "logps/chosen": -318.35992431640625, "logps/rejected": -342.0888671875, "loss": 0.5318, "rewards/accuracies": 0.65625, "rewards/chosen": -0.43135565519332886, "rewards/margins": 0.5195130109786987, "rewards/rejected": -0.9508687257766724, "step": 292 }, { "dpo_lambda": 0.969391942024231, "epoch": 0.6134519759225334, "grad_norm": 22.769319565191545, "learning_rate": 1.9462477745619106e-07, "logits/chosen": -0.9743357300758362, "logits/rejected": -0.9867159128189087, "logps/chosen": -365.81536865234375, "logps/rejected": -400.97064208984375, "loss": 0.6079, "rewards/accuracies": 0.71875, "rewards/chosen": -0.49305474758148193, "rewards/margins": 0.5272669196128845, "rewards/rejected": -1.0203216075897217, "step": 293 }, { "dpo_lambda": 0.9692872762680054, "epoch": 0.6155456686731222, "grad_norm": 28.096584516595076, "learning_rate": 1.928409891572757e-07, "logits/chosen": -0.9767225980758667, "logits/rejected": -1.018739104270935, "logps/chosen": -290.4533996582031, "logps/rejected": -369.98883056640625, "loss": 0.5211, "rewards/accuracies": 0.828125, "rewards/chosen": -0.24696438014507294, "rewards/margins": 0.6717772483825684, "rewards/rejected": -0.9187415838241577, "step": 294 }, { "dpo_lambda": 0.9691824316978455, "epoch": 0.6176393614237111, "grad_norm": 11.67452650850463, "learning_rate": 1.9106026612264315e-07, "logits/chosen": -0.9304852485656738, "logits/rejected": -0.9730537533760071, "logps/chosen": -334.56304931640625, "logps/rejected": -366.45904541015625, "loss": 0.4976, "rewards/accuracies": 0.75, "rewards/chosen": -0.28208717703819275, "rewards/margins": 0.617598295211792, "rewards/rejected": -0.8996855020523071, "step": 295 }, { "dpo_lambda": 0.9690775871276855, "epoch": 0.6197330541742999, "grad_norm": 29.939339000249788, "learning_rate": 1.8928270384706582e-07, "logits/chosen": -0.9597479104995728, "logits/rejected": -1.0559049844741821, "logps/chosen": -361.4211120605469, "logps/rejected": -343.28009033203125, "loss": 0.5373, "rewards/accuracies": 0.890625, "rewards/chosen": -0.31935498118400574, "rewards/margins": 0.6444027423858643, "rewards/rejected": -0.9637576937675476, "step": 296 }, { "dpo_lambda": 0.9689727425575256, "epoch": 0.6218267469248888, "grad_norm": 19.420346723363878, "learning_rate": 1.875083976558136e-07, "logits/chosen": -0.8957151174545288, "logits/rejected": -0.9098566770553589, "logps/chosen": -330.9611511230469, "logps/rejected": -398.48834228515625, "loss": 0.5834, "rewards/accuracies": 0.75, "rewards/chosen": -0.3520338535308838, "rewards/margins": 0.5692517757415771, "rewards/rejected": -0.9212855696678162, "step": 297 }, { "dpo_lambda": 0.968867838382721, "epoch": 0.6239204396754776, "grad_norm": 17.95489479788074, "learning_rate": 1.8573744269954297e-07, "logits/chosen": -0.9044655561447144, "logits/rejected": -0.986747145652771, "logps/chosen": -312.3471984863281, "logps/rejected": -321.05810546875, "loss": 0.5608, "rewards/accuracies": 0.75, "rewards/chosen": -0.31143227219581604, "rewards/margins": 0.5159186124801636, "rewards/rejected": -0.8273508548736572, "step": 298 }, { "dpo_lambda": 0.9687632322311401, "epoch": 0.6260141324260665, "grad_norm": 15.662522135428798, "learning_rate": 1.839699339491937e-07, "logits/chosen": -0.9157642126083374, "logits/rejected": -0.9654196500778198, "logps/chosen": -341.7589416503906, "logps/rejected": -348.1306457519531, "loss": 0.5487, "rewards/accuracies": 0.71875, "rewards/chosen": -0.28629571199417114, "rewards/margins": 0.6190310716629028, "rewards/rejected": -0.905326783657074, "step": 299 }, { "dpo_lambda": 0.9686583280563354, "epoch": 0.6281078251766553, "grad_norm": 23.191123388712988, "learning_rate": 1.8220596619089573e-07, "logits/chosen": -0.9885008931159973, "logits/rejected": -0.9450974464416504, "logps/chosen": -309.62091064453125, "logps/rejected": -363.9620361328125, "loss": 0.5598, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4183036684989929, "rewards/margins": 0.5146893262863159, "rewards/rejected": -0.9329929947853088, "step": 300 }, { "epoch": 0.6281078251766553, "eval_dpo_lambda": 0.9685534834861755, "eval_logits/chosen": -0.9607605934143066, "eval_logits/rejected": -1.0002164840698242, "eval_logps/chosen": -343.1056823730469, "eval_logps/rejected": -361.8922424316406, "eval_loss": 0.5694788694381714, "eval_rewards/accuracies": 0.722000002861023, "eval_rewards/chosen": -0.4245879352092743, "eval_rewards/margins": 0.4840264618396759, "eval_rewards/rejected": -0.9086143374443054, "eval_runtime": 561.1076, "eval_samples_per_second": 3.564, "eval_steps_per_second": 0.891, "step": 300 }, { "dpo_lambda": 0.9685534834861755, "epoch": 0.6302015179272442, "grad_norm": 71.58397918802766, "learning_rate": 1.8044563402088682e-07, "logits/chosen": -0.9689053893089294, "logits/rejected": -1.0844143629074097, "logps/chosen": -326.18499755859375, "logps/rejected": -329.495849609375, "loss": 0.5791, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5225777626037598, "rewards/margins": 0.3462575674057007, "rewards/rejected": -0.86883544921875, "step": 301 }, { "dpo_lambda": 0.9684486389160156, "epoch": 0.632295210677833, "grad_norm": 26.151243700328262, "learning_rate": 1.7868903184043885e-07, "logits/chosen": -0.8586608171463013, "logits/rejected": -0.9734141230583191, "logps/chosen": -307.2423400878906, "logps/rejected": -328.2541809082031, "loss": 0.5424, "rewards/accuracies": 0.75, "rewards/chosen": -0.3227950930595398, "rewards/margins": 0.5719585418701172, "rewards/rejected": -0.8947535753250122, "step": 302 }, { "dpo_lambda": 0.9683437943458557, "epoch": 0.6343889034284219, "grad_norm": 18.993097696585455, "learning_rate": 1.7693625385079574e-07, "logits/chosen": -0.9815896153450012, "logits/rejected": -1.009246826171875, "logps/chosen": -317.59832763671875, "logps/rejected": -327.9718322753906, "loss": 0.5344, "rewards/accuracies": 0.71875, "rewards/chosen": -0.23856814205646515, "rewards/margins": 0.5028955936431885, "rewards/rejected": -0.7414637207984924, "step": 303 }, { "dpo_lambda": 0.9682389497756958, "epoch": 0.6364825961790107, "grad_norm": 53.94254872286131, "learning_rate": 1.7518739404812155e-07, "logits/chosen": -0.9886007308959961, "logits/rejected": -1.0203882455825806, "logps/chosen": -387.0471496582031, "logps/rejected": -371.6809997558594, "loss": 0.5769, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5815944671630859, "rewards/margins": 0.38539034128189087, "rewards/rejected": -0.966984748840332, "step": 304 }, { "dpo_lambda": 0.9681340456008911, "epoch": 0.6385762889295996, "grad_norm": 26.17411762649003, "learning_rate": 1.7344254621846017e-07, "logits/chosen": -1.0002169609069824, "logits/rejected": -1.000640869140625, "logps/chosen": -312.68365478515625, "logps/rejected": -287.2958068847656, "loss": 0.5631, "rewards/accuracies": 0.75, "rewards/chosen": -0.35965269804000854, "rewards/margins": 0.4842557907104492, "rewards/rejected": -0.8439084887504578, "step": 305 }, { "dpo_lambda": 0.9680293798446655, "epoch": 0.6406699816801884, "grad_norm": 25.76044753092561, "learning_rate": 1.717018039327053e-07, "logits/chosen": -0.8985933065414429, "logits/rejected": -0.9996975064277649, "logps/chosen": -389.3902282714844, "logps/rejected": -382.3996887207031, "loss": 0.5286, "rewards/accuracies": 0.71875, "rewards/chosen": -0.32364633679389954, "rewards/margins": 0.6448994278907776, "rewards/rejected": -0.9685457944869995, "step": 306 }, { "dpo_lambda": 0.9679245352745056, "epoch": 0.6427636744307773, "grad_norm": 21.07447363401618, "learning_rate": 1.699652605415828e-07, "logits/chosen": -0.9501347541809082, "logits/rejected": -1.0148643255233765, "logps/chosen": -408.3487243652344, "logps/rejected": -361.47979736328125, "loss": 0.5062, "rewards/accuracies": 0.71875, "rewards/chosen": -0.41872522234916687, "rewards/margins": 0.4738251268863678, "rewards/rejected": -0.8925502896308899, "step": 307 }, { "dpo_lambda": 0.9678196907043457, "epoch": 0.6448573671813661, "grad_norm": 37.352068506484386, "learning_rate": 1.6823300917064458e-07, "logits/chosen": -0.9194483160972595, "logits/rejected": -1.0151333808898926, "logps/chosen": -348.45068359375, "logps/rejected": -390.6669921875, "loss": 0.5342, "rewards/accuracies": 0.71875, "rewards/chosen": -0.37650251388549805, "rewards/margins": 0.5100971460342407, "rewards/rejected": -0.886599600315094, "step": 308 }, { "dpo_lambda": 0.9677148461341858, "epoch": 0.646951059931955, "grad_norm": 65.9272239651892, "learning_rate": 1.6650514271527465e-07, "logits/chosen": -0.8971225619316101, "logits/rejected": -0.9499090909957886, "logps/chosen": -341.97503662109375, "logps/rejected": -398.912353515625, "loss": 0.558, "rewards/accuracies": 0.765625, "rewards/chosen": -0.261827677488327, "rewards/margins": 0.5827478766441345, "rewards/rejected": -0.8445755243301392, "step": 309 }, { "dpo_lambda": 0.9676100015640259, "epoch": 0.6490447526825438, "grad_norm": 20.04658084797354, "learning_rate": 1.647817538357072e-07, "logits/chosen": -0.7898070812225342, "logits/rejected": -0.8542658090591431, "logps/chosen": -318.4727783203125, "logps/rejected": -329.48333740234375, "loss": 0.5176, "rewards/accuracies": 0.78125, "rewards/chosen": -0.29877397418022156, "rewards/margins": 0.5912772417068481, "rewards/rejected": -0.8900513052940369, "step": 310 }, { "dpo_lambda": 0.9675053358078003, "epoch": 0.6511384454331327, "grad_norm": 26.373458404599162, "learning_rate": 1.6306293495205755e-07, "logits/chosen": -0.9485715627670288, "logits/rejected": -0.9856311082839966, "logps/chosen": -303.1679382324219, "logps/rejected": -325.40655517578125, "loss": 0.538, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3055274486541748, "rewards/margins": 0.5957139134407043, "rewards/rejected": -0.9012414216995239, "step": 311 }, { "dpo_lambda": 0.9674004912376404, "epoch": 0.6532321381837215, "grad_norm": 15.730910702810599, "learning_rate": 1.6134877823936607e-07, "logits/chosen": -1.0132697820663452, "logits/rejected": -1.0205020904541016, "logps/chosen": -329.7890319824219, "logps/rejected": -403.00732421875, "loss": 0.5396, "rewards/accuracies": 0.734375, "rewards/chosen": -0.3859696388244629, "rewards/margins": 0.5894760489463806, "rewards/rejected": -0.9754457473754883, "step": 312 }, { "dpo_lambda": 0.9672955870628357, "epoch": 0.6553258309343104, "grad_norm": 28.053518321597984, "learning_rate": 1.5963937562265522e-07, "logits/chosen": -0.9850528240203857, "logits/rejected": -1.0680859088897705, "logps/chosen": -329.48236083984375, "logps/rejected": -358.2930908203125, "loss": 0.5493, "rewards/accuracies": 0.765625, "rewards/chosen": -0.25245147943496704, "rewards/margins": 0.5047082901000977, "rewards/rejected": -0.7571598291397095, "step": 313 }, { "dpo_lambda": 0.9671907424926758, "epoch": 0.6574195236848992, "grad_norm": 44.39330956194096, "learning_rate": 1.5793481877199943e-07, "logits/chosen": -1.0563923120498657, "logits/rejected": -1.0740464925765991, "logps/chosen": -327.7634582519531, "logps/rejected": -326.1741943359375, "loss": 0.5415, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1798219382762909, "rewards/margins": 0.5696390867233276, "rewards/rejected": -0.7494610548019409, "step": 314 }, { "dpo_lambda": 0.9670858979225159, "epoch": 0.6595132164354881, "grad_norm": 52.35196491982725, "learning_rate": 1.562351990976095e-07, "logits/chosen": -0.8370614051818848, "logits/rejected": -0.9014161825180054, "logps/chosen": -346.0189208984375, "logps/rejected": -366.05963134765625, "loss": 0.5071, "rewards/accuracies": 0.796875, "rewards/chosen": -0.19401825964450836, "rewards/margins": 0.7404112219810486, "rewards/rejected": -0.934429407119751, "step": 315 }, { "dpo_lambda": 0.966981053352356, "epoch": 0.6616069091860769, "grad_norm": 20.25931696368988, "learning_rate": 1.5454060774493065e-07, "logits/chosen": -0.9522421360015869, "logits/rejected": -0.9345359802246094, "logps/chosen": -300.9903564453125, "logps/rejected": -332.50396728515625, "loss": 0.5295, "rewards/accuracies": 0.796875, "rewards/chosen": -0.33971095085144043, "rewards/margins": 0.5237520337104797, "rewards/rejected": -0.8634629249572754, "step": 316 }, { "dpo_lambda": 0.9668763875961304, "epoch": 0.6637006019366658, "grad_norm": 28.05666808675557, "learning_rate": 1.5285113558975427e-07, "logits/chosen": -0.9864431619644165, "logits/rejected": -0.94712233543396, "logps/chosen": -314.5718688964844, "logps/rejected": -369.55609130859375, "loss": 0.5285, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3466428816318512, "rewards/margins": 0.5084124207496643, "rewards/rejected": -0.8550552725791931, "step": 317 }, { "dpo_lambda": 0.9667715430259705, "epoch": 0.6657942946872546, "grad_norm": 28.907111649527103, "learning_rate": 1.5116687323334464e-07, "logits/chosen": -1.031725525856018, "logits/rejected": -1.0288310050964355, "logps/chosen": -351.31671142578125, "logps/rejected": -391.8863220214844, "loss": 0.5332, "rewards/accuracies": 0.796875, "rewards/chosen": -0.29358726739883423, "rewards/margins": 0.6588489413261414, "rewards/rejected": -0.9524362683296204, "step": 318 }, { "dpo_lambda": 0.9666666388511658, "epoch": 0.6678879874378435, "grad_norm": 13.586327459344083, "learning_rate": 1.4948791099758052e-07, "logits/chosen": -0.940189003944397, "logits/rejected": -0.968927264213562, "logps/chosen": -283.3269958496094, "logps/rejected": -314.5475769042969, "loss": 0.5594, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2659313678741455, "rewards/margins": 0.5632839202880859, "rewards/rejected": -0.829215407371521, "step": 319 }, { "dpo_lambda": 0.9665618538856506, "epoch": 0.6699816801884323, "grad_norm": 17.287945972540122, "learning_rate": 1.478143389201113e-07, "logits/chosen": -0.9991787075996399, "logits/rejected": -0.9350276589393616, "logps/chosen": -420.7625427246094, "logps/rejected": -389.25408935546875, "loss": 0.5606, "rewards/accuracies": 0.6875, "rewards/chosen": -0.39572352170944214, "rewards/margins": 0.4197995960712433, "rewards/rejected": -0.8155230283737183, "step": 320 }, { "dpo_lambda": 0.966456949710846, "epoch": 0.6720753729390212, "grad_norm": 22.059789067729692, "learning_rate": 1.461462467495284e-07, "logits/chosen": -0.8892531991004944, "logits/rejected": -0.9207965135574341, "logps/chosen": -371.59234619140625, "logps/rejected": -356.5474548339844, "loss": 0.6141, "rewards/accuracies": 0.640625, "rewards/chosen": -0.3681826591491699, "rewards/margins": 0.5087204575538635, "rewards/rejected": -0.8769031763076782, "step": 321 }, { "dpo_lambda": 0.9663523435592651, "epoch": 0.6741690656896101, "grad_norm": 17.733315729809913, "learning_rate": 1.4448372394055246e-07, "logits/chosen": -0.9744499325752258, "logits/rejected": -0.9986258745193481, "logps/chosen": -321.5147399902344, "logps/rejected": -330.0378112792969, "loss": 0.517, "rewards/accuracies": 0.796875, "rewards/chosen": -0.30931758880615234, "rewards/margins": 0.5616300106048584, "rewards/rejected": -0.870947539806366, "step": 322 }, { "dpo_lambda": 0.9662474393844604, "epoch": 0.6762627584401989, "grad_norm": 23.90951181882235, "learning_rate": 1.428268596492364e-07, "logits/chosen": -0.8879707455635071, "logits/rejected": -0.8915808796882629, "logps/chosen": -326.7454528808594, "logps/rejected": -378.8501892089844, "loss": 0.5528, "rewards/accuracies": 0.71875, "rewards/chosen": -0.325499564409256, "rewards/margins": 0.5490182042121887, "rewards/rejected": -0.8745177984237671, "step": 323 }, { "dpo_lambda": 0.9661425948143005, "epoch": 0.6783564511907878, "grad_norm": 16.39553700201974, "learning_rate": 1.4117574272818386e-07, "logits/chosen": -0.9681739211082458, "logits/rejected": -1.015541911125183, "logps/chosen": -317.4237976074219, "logps/rejected": -372.75439453125, "loss": 0.5674, "rewards/accuracies": 0.765625, "rewards/chosen": -0.33222252130508423, "rewards/margins": 0.5510666966438293, "rewards/rejected": -0.8832892179489136, "step": 324 }, { "dpo_lambda": 0.9660377502441406, "epoch": 0.6804501439413766, "grad_norm": 18.791465531764167, "learning_rate": 1.3953046172178413e-07, "logits/chosen": -0.9151560664176941, "logits/rejected": -0.9827873706817627, "logps/chosen": -323.977783203125, "logps/rejected": -314.0232238769531, "loss": 0.5845, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4418525993824005, "rewards/margins": 0.37086665630340576, "rewards/rejected": -0.8127192258834839, "step": 325 }, { "dpo_lambda": 0.9659329056739807, "epoch": 0.6825438366919655, "grad_norm": 27.99201137301602, "learning_rate": 1.3789110486146468e-07, "logits/chosen": -1.0051405429840088, "logits/rejected": -1.0525906085968018, "logps/chosen": -324.9797668457031, "logps/rejected": -370.70794677734375, "loss": 0.5747, "rewards/accuracies": 0.703125, "rewards/chosen": -0.27201488614082336, "rewards/margins": 0.48317158222198486, "rewards/rejected": -0.7551864385604858, "step": 326 }, { "dpo_lambda": 0.965828001499176, "epoch": 0.6846375294425543, "grad_norm": 20.528135742260847, "learning_rate": 1.362577600609588e-07, "logits/chosen": -1.0105348825454712, "logits/rejected": -0.9638998508453369, "logps/chosen": -318.3458251953125, "logps/rejected": -370.9636535644531, "loss": 0.555, "rewards/accuracies": 0.75, "rewards/chosen": -0.293737530708313, "rewards/margins": 0.45407989621162415, "rewards/rejected": -0.74781733751297, "step": 327 }, { "dpo_lambda": 0.9657231569290161, "epoch": 0.6867312221931432, "grad_norm": 40.497052517569266, "learning_rate": 1.3463051491159093e-07, "logits/chosen": -0.9311863780021667, "logits/rejected": -0.9145044684410095, "logps/chosen": -306.9552001953125, "logps/rejected": -379.8552551269531, "loss": 0.5613, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3483826816082001, "rewards/margins": 0.598532497882843, "rewards/rejected": -0.9469150900840759, "step": 328 }, { "dpo_lambda": 0.9656184911727905, "epoch": 0.688824914943732, "grad_norm": 16.832445288233277, "learning_rate": 1.3300945667758012e-07, "logits/chosen": -0.8724278807640076, "logits/rejected": -0.9922559261322021, "logps/chosen": -349.7203369140625, "logps/rejected": -389.9562683105469, "loss": 0.5049, "rewards/accuracies": 0.796875, "rewards/chosen": -0.22436396777629852, "rewards/margins": 0.7489404678344727, "rewards/rejected": -0.9733043909072876, "step": 329 }, { "dpo_lambda": 0.9655136466026306, "epoch": 0.6909186076943209, "grad_norm": 26.95436484029208, "learning_rate": 1.3139467229135998e-07, "logits/chosen": -0.9775909185409546, "logits/rejected": -0.9572923183441162, "logps/chosen": -296.727783203125, "logps/rejected": -318.11578369140625, "loss": 0.5528, "rewards/accuracies": 0.6875, "rewards/chosen": -0.32260772585868835, "rewards/margins": 0.43575817346572876, "rewards/rejected": -0.7583658695220947, "step": 330 }, { "dpo_lambda": 0.9654088020324707, "epoch": 0.6930123004449097, "grad_norm": 21.94704622393633, "learning_rate": 1.2978624834891626e-07, "logits/chosen": -0.856591522693634, "logits/rejected": -0.925330638885498, "logps/chosen": -363.8154602050781, "logps/rejected": -351.26409912109375, "loss": 0.4924, "rewards/accuracies": 0.734375, "rewards/chosen": -0.2571355402469635, "rewards/margins": 0.6332350373268127, "rewards/rejected": -0.8903706669807434, "step": 331 }, { "dpo_lambda": 0.9653039574623108, "epoch": 0.6951059931954986, "grad_norm": 178.8370298654354, "learning_rate": 1.281842711051438e-07, "logits/chosen": -1.0046966075897217, "logits/rejected": -0.9471710920333862, "logps/chosen": -407.97955322265625, "logps/rejected": -443.640869140625, "loss": 0.5206, "rewards/accuracies": 0.828125, "rewards/chosen": -0.3523060083389282, "rewards/margins": 0.6791791915893555, "rewards/rejected": -1.0314850807189941, "step": 332 }, { "dpo_lambda": 0.9651991128921509, "epoch": 0.6971996859460874, "grad_norm": 34.231688437828396, "learning_rate": 1.2658882646922033e-07, "logits/chosen": -0.8789958357810974, "logits/rejected": -0.9038445949554443, "logps/chosen": -354.1581726074219, "logps/rejected": -366.3446044921875, "loss": 0.5678, "rewards/accuracies": 0.796875, "rewards/chosen": -0.36087289452552795, "rewards/margins": 0.5995725393295288, "rewards/rejected": -0.9604454040527344, "step": 333 }, { "dpo_lambda": 0.9650944471359253, "epoch": 0.6992933786966763, "grad_norm": 18.04385028677772, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.9685949683189392, "logits/rejected": -0.9957066774368286, "logps/chosen": -335.9698181152344, "logps/rejected": -395.140869140625, "loss": 0.5561, "rewards/accuracies": 0.765625, "rewards/chosen": -0.30749112367630005, "rewards/margins": 0.4485364258289337, "rewards/rejected": -0.7560275793075562, "step": 334 }, { "dpo_lambda": 0.9649895429611206, "epoch": 0.7013870714472651, "grad_norm": 21.053487705520137, "learning_rate": 1.2341787690142435e-07, "logits/chosen": -0.9309602379798889, "logits/rejected": -0.963007926940918, "logps/chosen": -331.0724792480469, "logps/rejected": -347.17486572265625, "loss": 0.5789, "rewards/accuracies": 0.71875, "rewards/chosen": -0.40828946232795715, "rewards/margins": 0.4629323184490204, "rewards/rejected": -0.8712217807769775, "step": 335 }, { "dpo_lambda": 0.9648846983909607, "epoch": 0.703480764197854, "grad_norm": 20.90654959482647, "learning_rate": 1.2184254201795363e-07, "logits/chosen": -0.935300350189209, "logits/rejected": -1.0370335578918457, "logps/chosen": -361.41436767578125, "logps/rejected": -372.24920654296875, "loss": 0.5524, "rewards/accuracies": 0.625, "rewards/chosen": -0.3370175063610077, "rewards/margins": 0.41511785984039307, "rewards/rejected": -0.7521353960037231, "step": 336 }, { "dpo_lambda": 0.9647798538208008, "epoch": 0.7055744569484428, "grad_norm": 25.47986436291456, "learning_rate": 1.202740798300168e-07, "logits/chosen": -1.0110206604003906, "logits/rejected": -0.9602785110473633, "logps/chosen": -369.0711669921875, "logps/rejected": -426.0411376953125, "loss": 0.5572, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5039616823196411, "rewards/margins": 0.48775193095207214, "rewards/rejected": -0.9917135834693909, "step": 337 }, { "dpo_lambda": 0.9646750092506409, "epoch": 0.7076681496990317, "grad_norm": 12.80550446526403, "learning_rate": 1.1871257444948096e-07, "logits/chosen": -0.959979772567749, "logits/rejected": -1.0622349977493286, "logps/chosen": -339.4168395996094, "logps/rejected": -341.2720947265625, "loss": 0.5448, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4172990918159485, "rewards/margins": 0.5843453407287598, "rewards/rejected": -1.0016443729400635, "step": 338 }, { "dpo_lambda": 0.964570164680481, "epoch": 0.7097618424496205, "grad_norm": 24.972770330893827, "learning_rate": 1.1715810961514072e-07, "logits/chosen": -0.8925620317459106, "logits/rejected": -0.9319013357162476, "logps/chosen": -293.1116638183594, "logps/rejected": -370.85284423828125, "loss": 0.5661, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2533493638038635, "rewards/margins": 0.6773079037666321, "rewards/rejected": -0.9306572079658508, "step": 339 }, { "dpo_lambda": 0.9644652605056763, "epoch": 0.7118555352002094, "grad_norm": 22.919753073070936, "learning_rate": 1.1561076868822755e-07, "logits/chosen": -0.9318410158157349, "logits/rejected": -0.9421119689941406, "logps/chosen": -321.55072021484375, "logps/rejected": -356.93603515625, "loss": 0.595, "rewards/accuracies": 0.765625, "rewards/chosen": -0.32962632179260254, "rewards/margins": 0.37742599844932556, "rewards/rejected": -0.7070522308349609, "step": 340 }, { "dpo_lambda": 0.9643606543540955, "epoch": 0.7139492279507982, "grad_norm": 29.275505863100147, "learning_rate": 1.1407063464793965e-07, "logits/chosen": -1.0172332525253296, "logits/rejected": -1.0134183168411255, "logps/chosen": -278.432861328125, "logps/rejected": -322.0915222167969, "loss": 0.5227, "rewards/accuracies": 0.734375, "rewards/chosen": -0.3785405158996582, "rewards/margins": 0.5554640293121338, "rewards/rejected": -0.9340046048164368, "step": 341 }, { "dpo_lambda": 0.9642557501792908, "epoch": 0.7160429207013871, "grad_norm": 35.203314569620424, "learning_rate": 1.125377900869913e-07, "logits/chosen": -0.9718924164772034, "logits/rejected": -0.9619014859199524, "logps/chosen": -321.0333251953125, "logps/rejected": -392.4592590332031, "loss": 0.5256, "rewards/accuracies": 0.75, "rewards/chosen": -0.2639404237270355, "rewards/margins": 0.6683336496353149, "rewards/rejected": -0.9322740435600281, "step": 342 }, { "dpo_lambda": 0.9641509056091309, "epoch": 0.7181366134519759, "grad_norm": 55.55269901625133, "learning_rate": 1.110123172071844e-07, "logits/chosen": -0.914270281791687, "logits/rejected": -0.860942006111145, "logps/chosen": -304.1083068847656, "logps/rejected": -381.48651123046875, "loss": 0.5444, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2739492952823639, "rewards/margins": 0.5674920678138733, "rewards/rejected": -0.8414413332939148, "step": 343 }, { "dpo_lambda": 0.964046061038971, "epoch": 0.7202303062025648, "grad_norm": 17.912467497986132, "learning_rate": 1.09494297815e-07, "logits/chosen": -0.9377856850624084, "logits/rejected": -1.0394203662872314, "logps/chosen": -314.60284423828125, "logps/rejected": -331.8394775390625, "loss": 0.5344, "rewards/accuracies": 0.703125, "rewards/chosen": -0.25695669651031494, "rewards/margins": 0.441311776638031, "rewards/rejected": -0.6982684135437012, "step": 344 }, { "dpo_lambda": 0.963941216468811, "epoch": 0.7223239989531536, "grad_norm": 14.243821913967022, "learning_rate": 1.0798381331721107e-07, "logits/chosen": -0.9512354731559753, "logits/rejected": -0.9822427034378052, "logps/chosen": -287.7138671875, "logps/rejected": -316.0299072265625, "loss": 0.5271, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3327217400074005, "rewards/margins": 0.6620177626609802, "rewards/rejected": -0.9947394728660583, "step": 345 }, { "dpo_lambda": 0.9638365507125854, "epoch": 0.7244176917037425, "grad_norm": 15.398034329860014, "learning_rate": 1.0648094471651722e-07, "logits/chosen": -0.9969007968902588, "logits/rejected": -0.9967204928398132, "logps/chosen": -319.8764953613281, "logps/rejected": -340.1079406738281, "loss": 0.5834, "rewards/accuracies": 0.6875, "rewards/chosen": -0.35108503699302673, "rewards/margins": 0.5176693797111511, "rewards/rejected": -0.8687544465065002, "step": 346 }, { "dpo_lambda": 0.9637317061424255, "epoch": 0.7265113844543313, "grad_norm": 24.431486213535006, "learning_rate": 1.0498577260720048e-07, "logits/chosen": -0.8135185241699219, "logits/rejected": -0.9067487120628357, "logps/chosen": -316.9966125488281, "logps/rejected": -341.70318603515625, "loss": 0.5464, "rewards/accuracies": 0.71875, "rewards/chosen": -0.41448765993118286, "rewards/margins": 0.4467889368534088, "rewards/rejected": -0.8612766265869141, "step": 347 }, { "dpo_lambda": 0.9636268019676208, "epoch": 0.7286050772049202, "grad_norm": 28.722137917702902, "learning_rate": 1.0349837717080347e-07, "logits/chosen": -0.9912142753601074, "logits/rejected": -1.0412789583206177, "logps/chosen": -357.5257873535156, "logps/rejected": -371.5506591796875, "loss": 0.5016, "rewards/accuracies": 0.703125, "rewards/chosen": -0.33226823806762695, "rewards/margins": 0.5479937791824341, "rewards/rejected": -0.880262017250061, "step": 348 }, { "dpo_lambda": 0.9635220170021057, "epoch": 0.730698769955509, "grad_norm": 33.358713923011784, "learning_rate": 1.0201883817182949e-07, "logits/chosen": -1.0769925117492676, "logits/rejected": -1.0619198083877563, "logps/chosen": -268.9877014160156, "logps/rejected": -304.9781799316406, "loss": 0.5214, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3280823230743408, "rewards/margins": 0.5363447666168213, "rewards/rejected": -0.8644271492958069, "step": 349 }, { "dpo_lambda": 0.963417112827301, "epoch": 0.7327924627060979, "grad_norm": 16.957391268850262, "learning_rate": 1.0054723495346482e-07, "logits/chosen": -0.9406857490539551, "logits/rejected": -0.9522515535354614, "logps/chosen": -370.5696105957031, "logps/rejected": -418.59381103515625, "loss": 0.566, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3182927072048187, "rewards/margins": 0.6044560670852661, "rewards/rejected": -0.9227486848831177, "step": 350 }, { "epoch": 0.7327924627060979, "eval_dpo_lambda": 0.9633123278617859, "eval_logits/chosen": -0.9592387676239014, "eval_logits/rejected": -0.9957619905471802, "eval_logps/chosen": -335.34930419921875, "eval_logps/rejected": -355.07366943359375, "eval_loss": 0.5613225698471069, "eval_rewards/accuracies": 0.7260000109672546, "eval_rewards/chosen": -0.34702369570732117, "eval_rewards/margins": 0.49340489506721497, "eval_rewards/rejected": -0.8404285311698914, "eval_runtime": 561.9253, "eval_samples_per_second": 3.559, "eval_steps_per_second": 0.89, "step": 350 }, { "dpo_lambda": 0.9633122682571411, "epoch": 0.7348861554566868, "grad_norm": 24.71599545821725, "learning_rate": 9.908364643332398e-08, "logits/chosen": -0.9522049427032471, "logits/rejected": -0.9981221556663513, "logps/chosen": -317.208251953125, "logps/rejected": -374.0013427734375, "loss": 0.5682, "rewards/accuracies": 0.625, "rewards/chosen": -0.34796202182769775, "rewards/margins": 0.4574374258518219, "rewards/rejected": -0.8053994178771973, "step": 351 }, { "dpo_lambda": 0.9632076025009155, "epoch": 0.7369798482072756, "grad_norm": 15.696672979497066, "learning_rate": 9.76281510992176e-08, "logits/chosen": -0.9699813723564148, "logits/rejected": -1.0038105249404907, "logps/chosen": -323.82733154296875, "logps/rejected": -332.93475341796875, "loss": 0.563, "rewards/accuracies": 0.734375, "rewards/chosen": -0.3269641697406769, "rewards/margins": 0.5215096473693848, "rewards/rejected": -0.8484737873077393, "step": 352 }, { "dpo_lambda": 0.9631027579307556, "epoch": 0.7390735409578645, "grad_norm": 38.2658159785458, "learning_rate": 9.618082700494318e-08, "logits/chosen": -0.8864728212356567, "logits/rejected": -1.0044827461242676, "logps/chosen": -291.521728515625, "logps/rejected": -291.72802734375, "loss": 0.62, "rewards/accuracies": 0.6875, "rewards/chosen": -0.41974952816963196, "rewards/margins": 0.38726168870925903, "rewards/rejected": -0.8070113062858582, "step": 353 }, { "dpo_lambda": 0.9629979133605957, "epoch": 0.7411672337084533, "grad_norm": 16.091443237501974, "learning_rate": 9.474175176609956e-08, "logits/chosen": -0.972978413105011, "logits/rejected": -0.9209941625595093, "logps/chosen": -299.625244140625, "logps/rejected": -310.5818786621094, "loss": 0.5659, "rewards/accuracies": 0.671875, "rewards/chosen": -0.3460564911365509, "rewards/margins": 0.40129145979881287, "rewards/rejected": -0.7473480105400085, "step": 354 }, { "dpo_lambda": 0.9628930687904358, "epoch": 0.7432609264590422, "grad_norm": 28.036122454792796, "learning_rate": 9.331100255592436e-08, "logits/chosen": -0.9111910462379456, "logits/rejected": -0.9185968041419983, "logps/chosen": -379.6253356933594, "logps/rejected": -339.806884765625, "loss": 0.6143, "rewards/accuracies": 0.734375, "rewards/chosen": -0.45352065563201904, "rewards/margins": 0.28414058685302734, "rewards/rejected": -0.7376612424850464, "step": 355 }, { "dpo_lambda": 0.9627881646156311, "epoch": 0.745354619209631, "grad_norm": 15.812271707140557, "learning_rate": 9.18886561011557e-08, "logits/chosen": -0.8909232020378113, "logits/rejected": -0.9465504884719849, "logps/chosen": -372.94317626953125, "logps/rejected": -410.23406982421875, "loss": 0.518, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2967568039894104, "rewards/margins": 0.6505753397941589, "rewards/rejected": -0.9473320841789246, "step": 356 }, { "dpo_lambda": 0.9626835584640503, "epoch": 0.7474483119602199, "grad_norm": 17.91342430337661, "learning_rate": 9.047478867791731e-08, "logits/chosen": -0.9878717064857483, "logits/rejected": -0.9517794847488403, "logps/chosen": -346.8152770996094, "logps/rejected": -356.7705993652344, "loss": 0.5188, "rewards/accuracies": 0.65625, "rewards/chosen": -0.24196535348892212, "rewards/margins": 0.5725304484367371, "rewards/rejected": -0.814495861530304, "step": 357 }, { "dpo_lambda": 0.9625786542892456, "epoch": 0.7495420047108087, "grad_norm": 22.723355184888202, "learning_rate": 8.906947610762825e-08, "logits/chosen": -0.9669464826583862, "logits/rejected": -0.9961766004562378, "logps/chosen": -387.2091064453125, "logps/rejected": -412.39404296875, "loss": 0.5696, "rewards/accuracies": 0.765625, "rewards/chosen": -0.2763371467590332, "rewards/margins": 0.541536271572113, "rewards/rejected": -0.817873477935791, "step": 358 }, { "dpo_lambda": 0.9624738097190857, "epoch": 0.7516356974613976, "grad_norm": 17.29151886084971, "learning_rate": 8.76727937529367e-08, "logits/chosen": -1.1109174489974976, "logits/rejected": -1.0934998989105225, "logps/chosen": -317.03350830078125, "logps/rejected": -388.8250732421875, "loss": 0.6099, "rewards/accuracies": 0.703125, "rewards/chosen": -0.30949756503105164, "rewards/margins": 0.31883934140205383, "rewards/rejected": -0.6283369064331055, "step": 359 }, { "dpo_lambda": 0.9623689651489258, "epoch": 0.7537293902119864, "grad_norm": 14.070230696764554, "learning_rate": 8.628481651367875e-08, "logits/chosen": -1.0067811012268066, "logits/rejected": -1.0087178945541382, "logps/chosen": -297.63873291015625, "logps/rejected": -330.9602966308594, "loss": 0.5515, "rewards/accuracies": 0.796875, "rewards/chosen": -0.2629808187484741, "rewards/margins": 0.5231103897094727, "rewards/rejected": -0.7860912084579468, "step": 360 }, { "dpo_lambda": 0.9622641205787659, "epoch": 0.7558230829625753, "grad_norm": 19.06758201325056, "learning_rate": 8.490561882286135e-08, "logits/chosen": -1.0516997575759888, "logits/rejected": -1.0711076259613037, "logps/chosen": -349.70343017578125, "logps/rejected": -354.0404052734375, "loss": 0.536, "rewards/accuracies": 0.71875, "rewards/chosen": -0.26353931427001953, "rewards/margins": 0.5523825883865356, "rewards/rejected": -0.8159219026565552, "step": 361 }, { "dpo_lambda": 0.962159276008606, "epoch": 0.7579167757131641, "grad_norm": 26.94997496124403, "learning_rate": 8.353527464267104e-08, "logits/chosen": -0.9543567299842834, "logits/rejected": -1.0147325992584229, "logps/chosen": -305.9583740234375, "logps/rejected": -365.9261779785156, "loss": 0.5791, "rewards/accuracies": 0.703125, "rewards/chosen": -0.21509583294391632, "rewards/margins": 0.5419927835464478, "rewards/rejected": -0.7570887207984924, "step": 362 }, { "dpo_lambda": 0.9620543718338013, "epoch": 0.760010468463753, "grad_norm": 74.72502474786462, "learning_rate": 8.217385746050742e-08, "logits/chosen": -0.9676810503005981, "logits/rejected": -1.0195143222808838, "logps/chosen": -336.9450378417969, "logps/rejected": -331.18798828125, "loss": 0.5299, "rewards/accuracies": 0.703125, "rewards/chosen": -0.381142795085907, "rewards/margins": 0.49130770564079285, "rewards/rejected": -0.8724505305290222, "step": 363 }, { "dpo_lambda": 0.9619497656822205, "epoch": 0.7621041612143418, "grad_norm": 19.404892216299483, "learning_rate": 8.082144028504231e-08, "logits/chosen": -0.8616032600402832, "logits/rejected": -0.9383154511451721, "logps/chosen": -275.91143798828125, "logps/rejected": -338.1697692871094, "loss": 0.5035, "rewards/accuracies": 0.71875, "rewards/chosen": -0.23062482476234436, "rewards/margins": 0.6644704341888428, "rewards/rejected": -0.8950952887535095, "step": 364 }, { "dpo_lambda": 0.9618448615074158, "epoch": 0.7641978539649307, "grad_norm": 29.53717574810475, "learning_rate": 7.947809564230445e-08, "logits/chosen": -0.8835728764533997, "logits/rejected": -0.9612337946891785, "logps/chosen": -369.8661193847656, "logps/rejected": -361.1076354980469, "loss": 0.6037, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3223067820072174, "rewards/margins": 0.3532503843307495, "rewards/rejected": -0.6755571961402893, "step": 365 }, { "dpo_lambda": 0.9617400169372559, "epoch": 0.7662915467155195, "grad_norm": 10.307277460270527, "learning_rate": 7.814389557179016e-08, "logits/chosen": -1.0233443975448608, "logits/rejected": -1.0403273105621338, "logps/chosen": -318.3058776855469, "logps/rejected": -338.8187255859375, "loss": 0.5634, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3016853332519531, "rewards/margins": 0.32519543170928955, "rewards/rejected": -0.6268807649612427, "step": 366 }, { "dpo_lambda": 0.961635172367096, "epoch": 0.7683852394661084, "grad_norm": 45.00455227104742, "learning_rate": 7.681891162260015e-08, "logits/chosen": -0.9098859429359436, "logits/rejected": -0.9500433206558228, "logps/chosen": -321.8980712890625, "logps/rejected": -321.658447265625, "loss": 0.5075, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3374080955982208, "rewards/margins": 0.6366255283355713, "rewards/rejected": -0.974033534526825, "step": 367 }, { "dpo_lambda": 0.961530327796936, "epoch": 0.7704789322166972, "grad_norm": 16.44474720560638, "learning_rate": 7.550321484960251e-08, "logits/chosen": -0.9235467314720154, "logits/rejected": -0.9864405393600464, "logps/chosen": -318.61566162109375, "logps/rejected": -323.4438171386719, "loss": 0.5266, "rewards/accuracies": 0.765625, "rewards/chosen": -0.1688789427280426, "rewards/margins": 0.5078449845314026, "rewards/rejected": -0.6767238974571228, "step": 368 }, { "dpo_lambda": 0.9614256620407104, "epoch": 0.7725726249672861, "grad_norm": 31.33862125443488, "learning_rate": 7.419687580962222e-08, "logits/chosen": -0.8263933658599854, "logits/rejected": -0.8173232674598694, "logps/chosen": -324.09283447265625, "logps/rejected": -351.4126281738281, "loss": 0.5795, "rewards/accuracies": 0.671875, "rewards/chosen": -0.42795392870903015, "rewards/margins": 0.4751317799091339, "rewards/rejected": -0.9030857086181641, "step": 369 }, { "dpo_lambda": 0.9613208174705505, "epoch": 0.7746663177178749, "grad_norm": 42.98613541209831, "learning_rate": 7.289996455765748e-08, "logits/chosen": -0.9016161561012268, "logits/rejected": -0.9416869282722473, "logps/chosen": -355.3125305175781, "logps/rejected": -369.4179992675781, "loss": 0.5451, "rewards/accuracies": 0.71875, "rewards/chosen": -0.31742802262306213, "rewards/margins": 0.5414060354232788, "rewards/rejected": -0.8588340878486633, "step": 370 }, { "dpo_lambda": 0.9612159132957458, "epoch": 0.7767600104684638, "grad_norm": 28.36509965644124, "learning_rate": 7.161255064312283e-08, "logits/chosen": -0.9476215243339539, "logits/rejected": -0.9706940650939941, "logps/chosen": -316.4869384765625, "logps/rejected": -396.5313415527344, "loss": 0.5328, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3681057095527649, "rewards/margins": 0.6468718647956848, "rewards/rejected": -1.0149775743484497, "step": 371 }, { "dpo_lambda": 0.9611111283302307, "epoch": 0.7788537032190526, "grad_norm": 23.803503779777298, "learning_rate": 7.033470310611945e-08, "logits/chosen": -0.9440561532974243, "logits/rejected": -1.0243254899978638, "logps/chosen": -438.1816711425781, "logps/rejected": -381.82623291015625, "loss": 0.5516, "rewards/accuracies": 0.703125, "rewards/chosen": -0.37924379110336304, "rewards/margins": 0.5478464365005493, "rewards/rejected": -0.9270902276039124, "step": 372 }, { "dpo_lambda": 0.961006224155426, "epoch": 0.7809473959696415, "grad_norm": 27.72182838034728, "learning_rate": 6.906649047373245e-08, "logits/chosen": -0.9706982970237732, "logits/rejected": -0.9654771089553833, "logps/chosen": -333.77789306640625, "logps/rejected": -391.4490966796875, "loss": 0.5528, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2363077700138092, "rewards/margins": 0.6813668608665466, "rewards/rejected": -0.9176746606826782, "step": 373 }, { "dpo_lambda": 0.9609013795852661, "epoch": 0.7830410887202303, "grad_norm": 20.681732016502796, "learning_rate": 6.780798075635675e-08, "logits/chosen": -0.9225718379020691, "logits/rejected": -0.9658865928649902, "logps/chosen": -353.7083435058594, "logps/rejected": -369.9117431640625, "loss": 0.5534, "rewards/accuracies": 0.734375, "rewards/chosen": -0.30636295676231384, "rewards/margins": 0.6070533990859985, "rewards/rejected": -0.9134163856506348, "step": 374 }, { "dpo_lambda": 0.9607967138290405, "epoch": 0.7851347814708192, "grad_norm": 31.186102107762363, "learning_rate": 6.655924144404906e-08, "logits/chosen": -0.8658726215362549, "logits/rejected": -0.9505506157875061, "logps/chosen": -310.1218566894531, "logps/rejected": -356.1629943847656, "loss": 0.5331, "rewards/accuracies": 0.703125, "rewards/chosen": -0.30431026220321655, "rewards/margins": 0.6359937191009521, "rewards/rejected": -0.9403039216995239, "step": 375 }, { "dpo_lambda": 0.9606918692588806, "epoch": 0.787228474221408, "grad_norm": 24.269420848253173, "learning_rate": 6.532033950290885e-08, "logits/chosen": -0.9355162382125854, "logits/rejected": -0.9577205181121826, "logps/chosen": -338.4344787597656, "logps/rejected": -387.79913330078125, "loss": 0.5694, "rewards/accuracies": 0.6875, "rewards/chosen": -0.39728814363479614, "rewards/margins": 0.5328594446182251, "rewards/rejected": -0.9301475882530212, "step": 376 }, { "dpo_lambda": 0.9605870246887207, "epoch": 0.7893221669719969, "grad_norm": 19.188064859196658, "learning_rate": 6.409134137148736e-08, "logits/chosen": -1.0201990604400635, "logits/rejected": -1.047384262084961, "logps/chosen": -343.03875732421875, "logps/rejected": -315.65362548828125, "loss": 0.5314, "rewards/accuracies": 0.734375, "rewards/chosen": -0.3237241506576538, "rewards/margins": 0.49253687262535095, "rewards/rejected": -0.8162609934806824, "step": 377 }, { "dpo_lambda": 0.9604821801185608, "epoch": 0.7914158597225857, "grad_norm": 24.57431465319421, "learning_rate": 6.28723129572247e-08, "logits/chosen": -0.8773887157440186, "logits/rejected": -0.9270977973937988, "logps/chosen": -334.6279296875, "logps/rejected": -367.80572509765625, "loss": 0.594, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3744312524795532, "rewards/margins": 0.42570799589157104, "rewards/rejected": -0.800139307975769, "step": 378 }, { "dpo_lambda": 0.9603772759437561, "epoch": 0.7935095524731746, "grad_norm": 28.46042702704832, "learning_rate": 6.166331963291519e-08, "logits/chosen": -1.055816411972046, "logits/rejected": -1.029515027999878, "logps/chosen": -323.7863464355469, "logps/rejected": -333.7519226074219, "loss": 0.5726, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4126008450984955, "rewards/margins": 0.49342429637908936, "rewards/rejected": -0.9060250520706177, "step": 379 }, { "dpo_lambda": 0.9602726697921753, "epoch": 0.7956032452237635, "grad_norm": 18.658115359359524, "learning_rate": 6.046442623320145e-08, "logits/chosen": -1.018285870552063, "logits/rejected": -1.0573500394821167, "logps/chosen": -330.10101318359375, "logps/rejected": -346.8293151855469, "loss": 0.5255, "rewards/accuracies": 0.796875, "rewards/chosen": -0.22925367951393127, "rewards/margins": 0.6084834337234497, "rewards/rejected": -0.8377372026443481, "step": 380 }, { "dpo_lambda": 0.9601677656173706, "epoch": 0.7976969379743523, "grad_norm": 16.408753713537546, "learning_rate": 5.9275697051098275e-08, "logits/chosen": -0.8530938625335693, "logits/rejected": -0.9515883922576904, "logps/chosen": -239.74586486816406, "logps/rejected": -339.2502746582031, "loss": 0.502, "rewards/accuracies": 0.796875, "rewards/chosen": -0.22224396467208862, "rewards/margins": 0.6877234578132629, "rewards/rejected": -0.9099674224853516, "step": 381 }, { "dpo_lambda": 0.9600629210472107, "epoch": 0.7997906307249412, "grad_norm": 21.903923323456592, "learning_rate": 5.809719583454414e-08, "logits/chosen": -0.9614814519882202, "logits/rejected": -0.962985634803772, "logps/chosen": -300.4097595214844, "logps/rejected": -382.86370849609375, "loss": 0.5261, "rewards/accuracies": 0.75, "rewards/chosen": -0.26442432403564453, "rewards/margins": 0.5285326242446899, "rewards/rejected": -0.7929569482803345, "step": 382 }, { "dpo_lambda": 0.9599580764770508, "epoch": 0.80188432347553, "grad_norm": 27.69460396939364, "learning_rate": 5.6928985782982524e-08, "logits/chosen": -1.00211763381958, "logits/rejected": -0.9333101511001587, "logps/chosen": -371.8981628417969, "logps/rejected": -389.7657165527344, "loss": 0.5818, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3752930164337158, "rewards/margins": 0.4458085000514984, "rewards/rejected": -0.8211015462875366, "step": 383 }, { "dpo_lambda": 0.9598532319068909, "epoch": 0.8039780162261189, "grad_norm": 15.880624776905542, "learning_rate": 5.57711295439732e-08, "logits/chosen": -1.0322285890579224, "logits/rejected": -1.0425132513046265, "logps/chosen": -356.7442626953125, "logps/rejected": -409.0237121582031, "loss": 0.5218, "rewards/accuracies": 0.765625, "rewards/chosen": -0.2565978765487671, "rewards/margins": 0.6591013669967651, "rewards/rejected": -0.9156992435455322, "step": 384 }, { "dpo_lambda": 0.959748387336731, "epoch": 0.8060717089767077, "grad_norm": 15.997055013777837, "learning_rate": 5.4623689209832484e-08, "logits/chosen": -0.9438518285751343, "logits/rejected": -1.0183533430099487, "logps/chosen": -339.08306884765625, "logps/rejected": -395.9681396484375, "loss": 0.5102, "rewards/accuracies": 0.6875, "rewards/chosen": -0.36264917254447937, "rewards/margins": 0.6978291273117065, "rewards/rejected": -1.0604783296585083, "step": 385 }, { "dpo_lambda": 0.9596434831619263, "epoch": 0.8081654017272966, "grad_norm": 21.110905175575006, "learning_rate": 5.3486726314303175e-08, "logits/chosen": -0.9347030520439148, "logits/rejected": -0.9683570265769958, "logps/chosen": -353.3894958496094, "logps/rejected": -328.5455322265625, "loss": 0.5475, "rewards/accuracies": 0.734375, "rewards/chosen": -0.2658626139163971, "rewards/margins": 0.49515584111213684, "rewards/rejected": -0.7610185146331787, "step": 386 }, { "dpo_lambda": 0.9595388174057007, "epoch": 0.8102590944778854, "grad_norm": 14.055777953199689, "learning_rate": 5.2360301829254745e-08, "logits/chosen": -0.9676686525344849, "logits/rejected": -1.0681111812591553, "logps/chosen": -349.0576171875, "logps/rejected": -325.227294921875, "loss": 0.5377, "rewards/accuracies": 0.796875, "rewards/chosen": -0.14794021844863892, "rewards/margins": 0.6320769786834717, "rewards/rejected": -0.7800171971321106, "step": 387 }, { "dpo_lambda": 0.9594339728355408, "epoch": 0.8123527872284743, "grad_norm": 37.7395918970165, "learning_rate": 5.1244476161413806e-08, "logits/chosen": -0.9613102078437805, "logits/rejected": -0.9784658551216125, "logps/chosen": -340.90447998046875, "logps/rejected": -406.65533447265625, "loss": 0.55, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2638137936592102, "rewards/margins": 0.6216176748275757, "rewards/rejected": -0.8854314684867859, "step": 388 }, { "dpo_lambda": 0.9593291282653809, "epoch": 0.814446479979063, "grad_norm": 20.798435257784853, "learning_rate": 5.013930914912476e-08, "logits/chosen": -0.8479200601577759, "logits/rejected": -0.9247003197669983, "logps/chosen": -310.15374755859375, "logps/rejected": -348.4466247558594, "loss": 0.5478, "rewards/accuracies": 0.75, "rewards/chosen": -0.3163961172103882, "rewards/margins": 0.522079348564148, "rewards/rejected": -0.8384754061698914, "step": 389 }, { "dpo_lambda": 0.959224283695221, "epoch": 0.816540172729652, "grad_norm": 54.64575176217907, "learning_rate": 4.904486005914027e-08, "logits/chosen": -0.9330431222915649, "logits/rejected": -0.9549581408500671, "logps/chosen": -365.50762939453125, "logps/rejected": -346.893310546875, "loss": 0.5593, "rewards/accuracies": 0.71875, "rewards/chosen": -0.45057371258735657, "rewards/margins": 0.45801979303359985, "rewards/rejected": -0.908593475818634, "step": 390 }, { "dpo_lambda": 0.959119439125061, "epoch": 0.8186338654802408, "grad_norm": 28.884583790867374, "learning_rate": 4.796118758344353e-08, "logits/chosen": -0.9886559247970581, "logits/rejected": -1.0360742807388306, "logps/chosen": -311.6133728027344, "logps/rejected": -329.0717468261719, "loss": 0.5356, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3692110478878021, "rewards/margins": 0.5127213001251221, "rewards/rejected": -0.8819323182106018, "step": 391 }, { "dpo_lambda": 0.9590147733688354, "epoch": 0.8207275582308297, "grad_norm": 47.21987697553988, "learning_rate": 4.688834983610082e-08, "logits/chosen": -1.0307235717773438, "logits/rejected": -0.9974226355552673, "logps/chosen": -307.7640380859375, "logps/rejected": -384.6656799316406, "loss": 0.541, "rewards/accuracies": 0.734375, "rewards/chosen": -0.34472477436065674, "rewards/margins": 0.5033468008041382, "rewards/rejected": -0.8480715155601501, "step": 392 }, { "dpo_lambda": 0.9589099287986755, "epoch": 0.8228212509814185, "grad_norm": 36.144797883020956, "learning_rate": 4.582640435014459e-08, "logits/chosen": -1.0393571853637695, "logits/rejected": -1.0646288394927979, "logps/chosen": -303.0776672363281, "logps/rejected": -308.91864013671875, "loss": 0.5749, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3313845694065094, "rewards/margins": 0.454150915145874, "rewards/rejected": -0.785535454750061, "step": 393 }, { "dpo_lambda": 0.9588050246238708, "epoch": 0.8249149437320074, "grad_norm": 28.405590539672186, "learning_rate": 4.477540807448832e-08, "logits/chosen": -0.8921625018119812, "logits/rejected": -0.9136333465576172, "logps/chosen": -355.0310363769531, "logps/rejected": -426.3016662597656, "loss": 0.5566, "rewards/accuracies": 0.75, "rewards/chosen": -0.3053607940673828, "rewards/margins": 0.6183335781097412, "rewards/rejected": -0.9236944913864136, "step": 394 }, { "dpo_lambda": 0.9587001800537109, "epoch": 0.8270086364825961, "grad_norm": 15.539276399808447, "learning_rate": 4.373541737087263e-08, "logits/chosen": -1.0075271129608154, "logits/rejected": -1.028421401977539, "logps/chosen": -312.7923583984375, "logps/rejected": -374.7745361328125, "loss": 0.5639, "rewards/accuracies": 0.734375, "rewards/chosen": -0.3122669458389282, "rewards/margins": 0.5935348868370056, "rewards/rejected": -0.9058018922805786, "step": 395 }, { "dpo_lambda": 0.958595335483551, "epoch": 0.829102329233185, "grad_norm": 26.03734566553524, "learning_rate": 4.270648801084295e-08, "logits/chosen": -0.9819083213806152, "logits/rejected": -0.9753702282905579, "logps/chosen": -316.38751220703125, "logps/rejected": -319.94915771484375, "loss": 0.5703, "rewards/accuracies": 0.703125, "rewards/chosen": -0.39307281374931335, "rewards/margins": 0.3845985233783722, "rewards/rejected": -0.7776713967323303, "step": 396 }, { "dpo_lambda": 0.9584904909133911, "epoch": 0.8311960219837738, "grad_norm": 25.113826259403137, "learning_rate": 4.168867517275806e-08, "logits/chosen": -0.9145287871360779, "logits/rejected": -0.9403222799301147, "logps/chosen": -245.5500030517578, "logps/rejected": -329.1048583984375, "loss": 0.5393, "rewards/accuracies": 0.71875, "rewards/chosen": -0.36393922567367554, "rewards/margins": 0.5811597108840942, "rewards/rejected": -0.945098876953125, "step": 397 }, { "dpo_lambda": 0.9583855867385864, "epoch": 0.8332897147343628, "grad_norm": 44.51116164273478, "learning_rate": 4.0682033438831584e-08, "logits/chosen": -0.9946930408477783, "logits/rejected": -0.9629707932472229, "logps/chosen": -267.51715087890625, "logps/rejected": -350.3848876953125, "loss": 0.5696, "rewards/accuracies": 0.671875, "rewards/chosen": -0.40482285618782043, "rewards/margins": 0.33537381887435913, "rewards/rejected": -0.7401966452598572, "step": 398 }, { "dpo_lambda": 0.9582809805870056, "epoch": 0.8353834074849515, "grad_norm": 30.344521852168064, "learning_rate": 3.968661679220467e-08, "logits/chosen": -0.9283524751663208, "logits/rejected": -0.9831556081771851, "logps/chosen": -384.8466491699219, "logps/rejected": -402.1518249511719, "loss": 0.5402, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4025726616382599, "rewards/margins": 0.5733284950256348, "rewards/rejected": -0.9759011268615723, "step": 399 }, { "dpo_lambda": 0.9581760764122009, "epoch": 0.8374771002355405, "grad_norm": 44.88339103217458, "learning_rate": 3.8702478614051345e-08, "logits/chosen": -0.9853564500808716, "logits/rejected": -0.9621061086654663, "logps/chosen": -376.42376708984375, "logps/rejected": -432.05963134765625, "loss": 0.5423, "rewards/accuracies": 0.78125, "rewards/chosen": -0.31213808059692383, "rewards/margins": 0.6543071866035461, "rewards/rejected": -0.96644526720047, "step": 400 }, { "epoch": 0.8374771002355405, "eval_dpo_lambda": 0.9580713510513306, "eval_logits/chosen": -0.9665474891662598, "eval_logits/rejected": -1.0032734870910645, "eval_logps/chosen": -339.02130126953125, "eval_logps/rejected": -360.9908142089844, "eval_loss": 0.561345636844635, "eval_rewards/accuracies": 0.7289999723434448, "eval_rewards/chosen": -0.38374418020248413, "eval_rewards/margins": 0.5158559679985046, "eval_rewards/rejected": -0.899600088596344, "eval_runtime": 560.9381, "eval_samples_per_second": 3.565, "eval_steps_per_second": 0.891, "step": 400 }, { "dpo_lambda": 0.9580712914466858, "epoch": 0.8395707929861292, "grad_norm": 82.15355464781004, "learning_rate": 3.772967168071517e-08, "logits/chosen": -1.0035700798034668, "logits/rejected": -1.0144524574279785, "logps/chosen": -322.69830322265625, "logps/rejected": -336.5651550292969, "loss": 0.5728, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3935065269470215, "rewards/margins": 0.46786949038505554, "rewards/rejected": -0.8613760471343994, "step": 401 }, { "dpo_lambda": 0.9579663872718811, "epoch": 0.8416644857367181, "grad_norm": 20.35226428221194, "learning_rate": 3.676824816087978e-08, "logits/chosen": -0.9380530714988708, "logits/rejected": -0.9650043845176697, "logps/chosen": -355.93719482421875, "logps/rejected": -356.6396789550781, "loss": 0.5223, "rewards/accuracies": 0.78125, "rewards/chosen": -0.19415956735610962, "rewards/margins": 0.6138423681259155, "rewards/rejected": -0.8080019950866699, "step": 402 }, { "dpo_lambda": 0.9578615427017212, "epoch": 0.8437581784873069, "grad_norm": 42.29350910621693, "learning_rate": 3.581825961277074e-08, "logits/chosen": -0.9577827453613281, "logits/rejected": -1.0395058393478394, "logps/chosen": -357.50103759765625, "logps/rejected": -378.7171630859375, "loss": 0.5769, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3822963237762451, "rewards/margins": 0.4797073006629944, "rewards/rejected": -0.8620035648345947, "step": 403 }, { "dpo_lambda": 0.9577568769454956, "epoch": 0.8458518712378958, "grad_norm": 21.139864207028474, "learning_rate": 3.487975698139084e-08, "logits/chosen": -1.053609848022461, "logits/rejected": -1.046288013458252, "logps/chosen": -348.7900085449219, "logps/rejected": -376.8426208496094, "loss": 0.5609, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4225596487522125, "rewards/margins": 0.3801911175251007, "rewards/rejected": -0.802750825881958, "step": 404 }, { "dpo_lambda": 0.9576520323753357, "epoch": 0.8479455639884846, "grad_norm": 48.23981949574903, "learning_rate": 3.3952790595787986e-08, "logits/chosen": -1.039850115776062, "logits/rejected": -1.068182349205017, "logps/chosen": -318.3228759765625, "logps/rejected": -346.5895080566406, "loss": 0.522, "rewards/accuracies": 0.75, "rewards/chosen": -0.4849662184715271, "rewards/margins": 0.4318442940711975, "rewards/rejected": -0.9168104529380798, "step": 405 }, { "dpo_lambda": 0.9575471878051758, "epoch": 0.8500392567390735, "grad_norm": 27.516331980321333, "learning_rate": 3.303741016635614e-08, "logits/chosen": -1.0031503438949585, "logits/rejected": -1.0047969818115234, "logps/chosen": -370.5401306152344, "logps/rejected": -397.98040771484375, "loss": 0.557, "rewards/accuracies": 0.703125, "rewards/chosen": -0.27663034200668335, "rewards/margins": 0.6528828144073486, "rewards/rejected": -0.9295130372047424, "step": 406 }, { "dpo_lambda": 0.9574423432350159, "epoch": 0.8521329494896623, "grad_norm": 20.04658083953901, "learning_rate": 3.2133664782169944e-08, "logits/chosen": -0.975938081741333, "logits/rejected": -0.956437349319458, "logps/chosen": -325.4172058105469, "logps/rejected": -350.14862060546875, "loss": 0.523, "rewards/accuracies": 0.734375, "rewards/chosen": -0.40274572372436523, "rewards/margins": 0.6021479964256287, "rewards/rejected": -1.0048936605453491, "step": 407 }, { "dpo_lambda": 0.9573374390602112, "epoch": 0.8542266422402512, "grad_norm": 28.927907054098547, "learning_rate": 3.12416029083514e-08, "logits/chosen": -0.9998895525932312, "logits/rejected": -1.0152561664581299, "logps/chosen": -333.8466796875, "logps/rejected": -372.66162109375, "loss": 0.5512, "rewards/accuracies": 0.703125, "rewards/chosen": -0.42361342906951904, "rewards/margins": 0.4960295557975769, "rewards/rejected": -0.919642984867096, "step": 408 }, { "dpo_lambda": 0.9572325944900513, "epoch": 0.8563203349908401, "grad_norm": 28.95623360194244, "learning_rate": 3.036127238347164e-08, "logits/chosen": -1.0345758199691772, "logits/rejected": -1.0678658485412598, "logps/chosen": -393.51934814453125, "logps/rejected": -419.20819091796875, "loss": 0.5428, "rewards/accuracies": 0.75, "rewards/chosen": -0.3088226020336151, "rewards/margins": 0.5775026082992554, "rewards/rejected": -0.8863251805305481, "step": 409 }, { "dpo_lambda": 0.9571279287338257, "epoch": 0.8584140277414289, "grad_norm": 38.71222399422127, "learning_rate": 2.9492720416985e-08, "logits/chosen": -0.9824857115745544, "logits/rejected": -0.9625248908996582, "logps/chosen": -327.7845764160156, "logps/rejected": -384.799560546875, "loss": 0.5879, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3323441743850708, "rewards/margins": 0.6053525805473328, "rewards/rejected": -0.937696635723114, "step": 410 }, { "dpo_lambda": 0.9570230841636658, "epoch": 0.8605077204920178, "grad_norm": 63.673739836807414, "learning_rate": 2.863599358669755e-08, "logits/chosen": -1.0002498626708984, "logits/rejected": -0.9433389902114868, "logps/chosen": -332.62017822265625, "logps/rejected": -383.5394592285156, "loss": 0.5866, "rewards/accuracies": 0.671875, "rewards/chosen": -0.4152870178222656, "rewards/margins": 0.41510239243507385, "rewards/rejected": -0.8303893804550171, "step": 411 }, { "dpo_lambda": 0.9569182395935059, "epoch": 0.8626014132426066, "grad_norm": 47.031673049240815, "learning_rate": 2.7791137836269158e-08, "logits/chosen": -0.9564170241355896, "logits/rejected": -0.9423116445541382, "logps/chosen": -329.16876220703125, "logps/rejected": -396.54949951171875, "loss": 0.5372, "rewards/accuracies": 0.703125, "rewards/chosen": -0.37793248891830444, "rewards/margins": 0.6080735325813293, "rewards/rejected": -0.986005961894989, "step": 412 }, { "dpo_lambda": 0.956813395023346, "epoch": 0.8646951059931955, "grad_norm": 20.115652999297403, "learning_rate": 2.6958198472749717e-08, "logits/chosen": -0.993791937828064, "logits/rejected": -1.0071156024932861, "logps/chosen": -288.9036560058594, "logps/rejected": -332.8777770996094, "loss": 0.5507, "rewards/accuracies": 0.6875, "rewards/chosen": -0.34835270047187805, "rewards/margins": 0.5191000699996948, "rewards/rejected": -0.8674527406692505, "step": 413 }, { "dpo_lambda": 0.956708550453186, "epoch": 0.8667887987437843, "grad_norm": 28.643664444242688, "learning_rate": 2.613722016414943e-08, "logits/chosen": -0.9087300896644592, "logits/rejected": -0.9807073473930359, "logps/chosen": -331.19329833984375, "logps/rejected": -377.75054931640625, "loss": 0.5429, "rewards/accuracies": 0.71875, "rewards/chosen": -0.48658519983291626, "rewards/margins": 0.5433869361877441, "rewards/rejected": -1.0299721956253052, "step": 414 }, { "dpo_lambda": 0.9566038846969604, "epoch": 0.8688824914943732, "grad_norm": 25.510442310385585, "learning_rate": 2.5328246937043525e-08, "logits/chosen": -0.9275650382041931, "logits/rejected": -0.966849684715271, "logps/chosen": -344.0802001953125, "logps/rejected": -348.220947265625, "loss": 0.5552, "rewards/accuracies": 0.78125, "rewards/chosen": -0.34558096528053284, "rewards/margins": 0.6371087431907654, "rewards/rejected": -0.982689619064331, "step": 415 }, { "dpo_lambda": 0.9564989805221558, "epoch": 0.870976184244962, "grad_norm": 19.797185961582827, "learning_rate": 2.4531322174210973e-08, "logits/chosen": -0.9148194789886475, "logits/rejected": -1.0240569114685059, "logps/chosen": -363.20098876953125, "logps/rejected": -388.9029235839844, "loss": 0.5692, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2716209888458252, "rewards/margins": 0.8080213665962219, "rewards/rejected": -1.079642415046692, "step": 416 }, { "dpo_lambda": 0.9563941359519958, "epoch": 0.8730698769955509, "grad_norm": 37.76417428426363, "learning_rate": 2.3746488612308295e-08, "logits/chosen": -0.9347919225692749, "logits/rejected": -0.9365442395210266, "logps/chosen": -319.30450439453125, "logps/rejected": -377.2983703613281, "loss": 0.5504, "rewards/accuracies": 0.625, "rewards/chosen": -0.5698833465576172, "rewards/margins": 0.35962584614753723, "rewards/rejected": -0.9295092225074768, "step": 417 }, { "dpo_lambda": 0.9562892913818359, "epoch": 0.8751635697461397, "grad_norm": 30.23045274143435, "learning_rate": 2.297378833957761e-08, "logits/chosen": -1.0036827325820923, "logits/rejected": -0.954819917678833, "logps/chosen": -329.52276611328125, "logps/rejected": -358.5249938964844, "loss": 0.5439, "rewards/accuracies": 0.765625, "rewards/chosen": -0.37042689323425293, "rewards/margins": 0.5442904829978943, "rewards/rejected": -0.914717435836792, "step": 418 }, { "dpo_lambda": 0.956184446811676, "epoch": 0.8772572624967286, "grad_norm": 17.865199078300027, "learning_rate": 2.2213262793589482e-08, "logits/chosen": -0.9423766136169434, "logits/rejected": -1.0231984853744507, "logps/chosen": -332.6385192871094, "logps/rejected": -357.6366882324219, "loss": 0.5404, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3678462505340576, "rewards/margins": 0.5428791046142578, "rewards/rejected": -0.9107253551483154, "step": 419 }, { "dpo_lambda": 0.9560796022415161, "epoch": 0.8793509552473174, "grad_norm": 12.8919686565624, "learning_rate": 2.1464952759020856e-08, "logits/chosen": -0.8776789903640747, "logits/rejected": -0.9112561941146851, "logps/chosen": -362.9369812011719, "logps/rejected": -445.8304748535156, "loss": 0.5028, "rewards/accuracies": 0.78125, "rewards/chosen": -0.24908962845802307, "rewards/margins": 0.7293524146080017, "rewards/rejected": -0.9784420728683472, "step": 420 }, { "dpo_lambda": 0.9559746980667114, "epoch": 0.8814446479979063, "grad_norm": 17.459827778455995, "learning_rate": 2.07288983654679e-08, "logits/chosen": -0.9969652891159058, "logits/rejected": -1.0456459522247314, "logps/chosen": -284.34271240234375, "logps/rejected": -328.07318115234375, "loss": 0.4952, "rewards/accuracies": 0.8125, "rewards/chosen": -0.37537533044815063, "rewards/margins": 0.6198078989982605, "rewards/rejected": -0.9951832294464111, "step": 421 }, { "dpo_lambda": 0.9558700919151306, "epoch": 0.8835383407484951, "grad_norm": 43.84604357896676, "learning_rate": 2.0005139085293942e-08, "logits/chosen": -0.9257749319076538, "logits/rejected": -0.8814691305160522, "logps/chosen": -320.6876220703125, "logps/rejected": -344.1969299316406, "loss": 0.5525, "rewards/accuracies": 0.703125, "rewards/chosen": -0.41354262828826904, "rewards/margins": 0.4604254961013794, "rewards/rejected": -0.8739681839942932, "step": 422 }, { "dpo_lambda": 0.9557651877403259, "epoch": 0.885632033499084, "grad_norm": 46.36867747989169, "learning_rate": 1.9293713731512673e-08, "logits/chosen": -0.9757592082023621, "logits/rejected": -0.9672430157661438, "logps/chosen": -335.0474853515625, "logps/rejected": -392.3810119628906, "loss": 0.5251, "rewards/accuracies": 0.703125, "rewards/chosen": -0.2871450185775757, "rewards/margins": 0.6362358331680298, "rewards/rejected": -0.9233807921409607, "step": 423 }, { "dpo_lambda": 0.955660343170166, "epoch": 0.8877257262496728, "grad_norm": 34.23218298531432, "learning_rate": 1.8594660455706763e-08, "logits/chosen": -0.8614395260810852, "logits/rejected": -0.9894289970397949, "logps/chosen": -327.67840576171875, "logps/rejected": -366.784423828125, "loss": 0.5035, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3272741734981537, "rewards/margins": 0.7001060247421265, "rewards/rejected": -1.0273802280426025, "step": 424 }, { "dpo_lambda": 0.9555554986000061, "epoch": 0.8898194190002617, "grad_norm": 16.45766662860836, "learning_rate": 1.7908016745981856e-08, "logits/chosen": -0.9176836013793945, "logits/rejected": -0.9757386445999146, "logps/chosen": -409.4639587402344, "logps/rejected": -359.6163330078125, "loss": 0.5317, "rewards/accuracies": 0.78125, "rewards/chosen": -0.19637420773506165, "rewards/margins": 0.7027495503425598, "rewards/rejected": -0.8991237878799438, "step": 425 }, { "dpo_lambda": 0.9554506540298462, "epoch": 0.8919131117508505, "grad_norm": 38.79029297459658, "learning_rate": 1.7233819424956247e-08, "logits/chosen": -0.9371305108070374, "logits/rejected": -0.9625764489173889, "logps/chosen": -356.5591735839844, "logps/rejected": -382.1607666015625, "loss": 0.5267, "rewards/accuracies": 0.671875, "rewards/chosen": -0.34867197275161743, "rewards/margins": 0.5435488224029541, "rewards/rejected": -0.8922207951545715, "step": 426 }, { "dpo_lambda": 0.9553459882736206, "epoch": 0.8940068045014394, "grad_norm": 39.594727805679064, "learning_rate": 1.6572104647786245e-08, "logits/chosen": -1.0280911922454834, "logits/rejected": -1.0318591594696045, "logps/chosen": -322.733154296875, "logps/rejected": -334.4246826171875, "loss": 0.549, "rewards/accuracies": 0.6875, "rewards/chosen": -0.39758163690567017, "rewards/margins": 0.5534409284591675, "rewards/rejected": -0.9510226249694824, "step": 427 }, { "dpo_lambda": 0.9552411437034607, "epoch": 0.8961004972520282, "grad_norm": 12.049727091764039, "learning_rate": 1.5922907900227017e-08, "logits/chosen": -0.9746778011322021, "logits/rejected": -1.0556857585906982, "logps/chosen": -374.712646484375, "logps/rejected": -401.69976806640625, "loss": 0.5366, "rewards/accuracies": 0.78125, "rewards/chosen": -0.30862295627593994, "rewards/margins": 0.7240450978279114, "rewards/rejected": -1.0326679944992065, "step": 428 }, { "dpo_lambda": 0.955136239528656, "epoch": 0.8981941900026171, "grad_norm": 13.2303060377412, "learning_rate": 1.5286263996730026e-08, "logits/chosen": -0.9656727313995361, "logits/rejected": -0.9746390581130981, "logps/chosen": -362.42327880859375, "logps/rejected": -377.53375244140625, "loss": 0.5027, "rewards/accuracies": 0.859375, "rewards/chosen": -0.19976939260959625, "rewards/margins": 0.6579819917678833, "rewards/rejected": -0.8577514290809631, "step": 429 }, { "dpo_lambda": 0.9550314545631409, "epoch": 0.9002878827532059, "grad_norm": 18.80666324691916, "learning_rate": 1.4662207078575684e-08, "logits/chosen": -0.9734973907470703, "logits/rejected": -1.0407707691192627, "logps/chosen": -289.5849609375, "logps/rejected": -310.9867858886719, "loss": 0.5709, "rewards/accuracies": 0.75, "rewards/chosen": -0.3958791494369507, "rewards/margins": 0.4667404592037201, "rewards/rejected": -0.8626196980476379, "step": 430 }, { "dpo_lambda": 0.9549265503883362, "epoch": 0.9023815755037948, "grad_norm": 21.860909121869923, "learning_rate": 1.40507706120426e-08, "logits/chosen": -1.0206691026687622, "logits/rejected": -0.9979550242424011, "logps/chosen": -302.2188720703125, "logps/rejected": -331.165283203125, "loss": 0.5281, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3756422698497772, "rewards/margins": 0.45793285965919495, "rewards/rejected": -0.8335750699043274, "step": 431 }, { "dpo_lambda": 0.9548217058181763, "epoch": 0.9044752682543836, "grad_norm": 20.022550700009774, "learning_rate": 1.345198738661285e-08, "logits/chosen": -0.9343916177749634, "logits/rejected": -0.9401783347129822, "logps/chosen": -302.5766906738281, "logps/rejected": -372.9184265136719, "loss": 0.5767, "rewards/accuracies": 0.703125, "rewards/chosen": -0.39758020639419556, "rewards/margins": 0.5371288061141968, "rewards/rejected": -0.9347091317176819, "step": 432 }, { "dpo_lambda": 0.9547170400619507, "epoch": 0.9065689610049725, "grad_norm": 22.247403237563105, "learning_rate": 1.2865889513213628e-08, "logits/chosen": -0.8874664306640625, "logits/rejected": -0.8951385617256165, "logps/chosen": -334.44207763671875, "logps/rejected": -412.373291015625, "loss": 0.5207, "rewards/accuracies": 0.828125, "rewards/chosen": -0.30106163024902344, "rewards/margins": 0.7014852166175842, "rewards/rejected": -1.0025469064712524, "step": 433 }, { "dpo_lambda": 0.9546121954917908, "epoch": 0.9086626537555613, "grad_norm": 20.122232158028673, "learning_rate": 1.2292508422495157e-08, "logits/chosen": -0.905904233455658, "logits/rejected": -0.977434515953064, "logps/chosen": -324.328125, "logps/rejected": -333.3005676269531, "loss": 0.551, "rewards/accuracies": 0.671875, "rewards/chosen": -0.26499003171920776, "rewards/margins": 0.5595492124557495, "rewards/rejected": -0.8245391845703125, "step": 434 }, { "dpo_lambda": 0.9545073509216309, "epoch": 0.9107563465061502, "grad_norm": 173.3897449087187, "learning_rate": 1.1731874863145142e-08, "logits/chosen": -1.058231234550476, "logits/rejected": -1.04081392288208, "logps/chosen": -298.2507629394531, "logps/rejected": -354.2373046875, "loss": 0.572, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3913850784301758, "rewards/margins": 0.42320144176483154, "rewards/rejected": -0.8145864605903625, "step": 435 }, { "dpo_lambda": 0.954402506351471, "epoch": 0.912850039256739, "grad_norm": 147.39545680366115, "learning_rate": 1.118401890024001e-08, "logits/chosen": -1.000075101852417, "logits/rejected": -1.0360493659973145, "logps/chosen": -382.035888671875, "logps/rejected": -419.6065673828125, "loss": 0.5386, "rewards/accuracies": 0.671875, "rewards/chosen": -0.40097954869270325, "rewards/margins": 0.5406019687652588, "rewards/rejected": -0.9415814876556396, "step": 436 }, { "dpo_lambda": 0.9542976021766663, "epoch": 0.9149437320073279, "grad_norm": 21.217270364197876, "learning_rate": 1.06489699136324e-08, "logits/chosen": -0.9560334086418152, "logits/rejected": -1.035660743713379, "logps/chosen": -325.727783203125, "logps/rejected": -312.07403564453125, "loss": 0.5881, "rewards/accuracies": 0.765625, "rewards/chosen": -0.39235901832580566, "rewards/margins": 0.48345211148262024, "rewards/rejected": -0.8758111000061035, "step": 437 }, { "dpo_lambda": 0.9541929960250854, "epoch": 0.9170374247579168, "grad_norm": 20.559866554121044, "learning_rate": 1.0126756596375685e-08, "logits/chosen": -0.9488348960876465, "logits/rejected": -0.9916976690292358, "logps/chosen": -308.44549560546875, "logps/rejected": -326.90802001953125, "loss": 0.5357, "rewards/accuracies": 0.8125, "rewards/chosen": -0.28534749150276184, "rewards/margins": 0.6132487654685974, "rewards/rejected": -0.8985961675643921, "step": 438 }, { "dpo_lambda": 0.9540880918502808, "epoch": 0.9191311175085056, "grad_norm": 20.55816599168075, "learning_rate": 9.617406953185136e-09, "logits/chosen": -0.9830228686332703, "logits/rejected": -1.0618979930877686, "logps/chosen": -367.99737548828125, "logps/rejected": -469.106689453125, "loss": 0.5231, "rewards/accuracies": 0.828125, "rewards/chosen": -0.24455446004867554, "rewards/margins": 0.8762832283973694, "rewards/rejected": -1.120837688446045, "step": 439 }, { "dpo_lambda": 0.9539832472801208, "epoch": 0.9212248102590945, "grad_norm": 28.839706619333853, "learning_rate": 9.12094829893642e-09, "logits/chosen": -1.036233901977539, "logits/rejected": -0.9664689302444458, "logps/chosen": -367.5104064941406, "logps/rejected": -434.5174560546875, "loss": 0.5366, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2284281998872757, "rewards/margins": 0.7398945093154907, "rewards/rejected": -0.96832275390625, "step": 440 }, { "dpo_lambda": 0.9538784027099609, "epoch": 0.9233185030096833, "grad_norm": 31.28605342903596, "learning_rate": 8.637407257200496e-09, "logits/chosen": -0.89495849609375, "logits/rejected": -0.8919556736946106, "logps/chosen": -318.3620300292969, "logps/rejected": -347.7049255371094, "loss": 0.5517, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36943739652633667, "rewards/margins": 0.465165376663208, "rewards/rejected": -0.8346028327941895, "step": 441 }, { "dpo_lambda": 0.953773558139801, "epoch": 0.9254121957602722, "grad_norm": 33.18324707086255, "learning_rate": 8.166809758815895e-09, "logits/chosen": -1.043076992034912, "logits/rejected": -1.0556950569152832, "logps/chosen": -394.7267150878906, "logps/rejected": -431.6220703125, "loss": 0.5411, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4038960337638855, "rewards/margins": 0.5986083149909973, "rewards/rejected": -1.0025043487548828, "step": 442 }, { "dpo_lambda": 0.9536687135696411, "epoch": 0.927505888510861, "grad_norm": 51.68212005214359, "learning_rate": 7.709181040498253e-09, "logits/chosen": -0.960411787033081, "logits/rejected": -0.9813458323478699, "logps/chosen": -330.04736328125, "logps/rejected": -385.2778015136719, "loss": 0.5518, "rewards/accuracies": 0.734375, "rewards/chosen": -0.2757166922092438, "rewards/margins": 0.5993854999542236, "rewards/rejected": -0.8751022219657898, "step": 443 }, { "dpo_lambda": 0.9535638093948364, "epoch": 0.9295995812614499, "grad_norm": 19.114821707183086, "learning_rate": 7.2645456434869965e-09, "logits/chosen": -0.9486753344535828, "logits/rejected": -1.034155011177063, "logps/chosen": -294.3712158203125, "logps/rejected": -359.9336242675781, "loss": 0.57, "rewards/accuracies": 0.640625, "rewards/chosen": -0.41555386781692505, "rewards/margins": 0.40340498089790344, "rewards/rejected": -0.8189587593078613, "step": 444 }, { "dpo_lambda": 0.9534591436386108, "epoch": 0.9316932740120387, "grad_norm": 29.115280986236147, "learning_rate": 6.832927412229017e-09, "logits/chosen": -1.0227396488189697, "logits/rejected": -1.0331047773361206, "logps/chosen": -310.11004638671875, "logps/rejected": -339.3343811035156, "loss": 0.5575, "rewards/accuracies": 0.671875, "rewards/chosen": -0.2876787781715393, "rewards/margins": 0.5395625829696655, "rewards/rejected": -0.8272414207458496, "step": 445 }, { "dpo_lambda": 0.9533542990684509, "epoch": 0.9337869667626276, "grad_norm": 15.00058131136615, "learning_rate": 6.414349493100129e-09, "logits/chosen": -0.9562291502952576, "logits/rejected": -0.9506933093070984, "logps/chosen": -337.495849609375, "logps/rejected": -396.34637451171875, "loss": 0.5277, "rewards/accuracies": 0.75, "rewards/chosen": -0.23384301364421844, "rewards/margins": 0.6402769684791565, "rewards/rejected": -0.8741199970245361, "step": 446 }, { "dpo_lambda": 0.953249454498291, "epoch": 0.9358806595132164, "grad_norm": 16.07486927161977, "learning_rate": 6.0088343331638756e-09, "logits/chosen": -0.9611780047416687, "logits/rejected": -0.934112548828125, "logps/chosen": -322.6107482910156, "logps/rejected": -392.5720520019531, "loss": 0.5271, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3303907811641693, "rewards/margins": 0.6360582709312439, "rewards/rejected": -0.9664490818977356, "step": 447 }, { "dpo_lambda": 0.9531446099281311, "epoch": 0.9379743522638053, "grad_norm": 29.673921637972377, "learning_rate": 5.616403678967624e-09, "logits/chosen": -1.0144095420837402, "logits/rejected": -1.026979684829712, "logps/chosen": -349.0781555175781, "logps/rejected": -386.89556884765625, "loss": 0.539, "rewards/accuracies": 0.75, "rewards/chosen": -0.4534134566783905, "rewards/margins": 0.6086537837982178, "rewards/rejected": -1.0620671510696411, "step": 448 }, { "dpo_lambda": 0.9530397653579712, "epoch": 0.9400680450143941, "grad_norm": 14.254234329687867, "learning_rate": 5.2370785753763356e-09, "logits/chosen": -0.8938382267951965, "logits/rejected": -0.9977080821990967, "logps/chosen": -288.8353271484375, "logps/rejected": -345.09039306640625, "loss": 0.5622, "rewards/accuracies": 0.6875, "rewards/chosen": -0.34774893522262573, "rewards/margins": 0.44677218794822693, "rewards/rejected": -0.7945210933685303, "step": 449 }, { "dpo_lambda": 0.9529350996017456, "epoch": 0.942161737764983, "grad_norm": 44.196200512016574, "learning_rate": 4.8708793644441086e-09, "logits/chosen": -0.8911122679710388, "logits/rejected": -0.9161196351051331, "logps/chosen": -303.7921142578125, "logps/rejected": -349.2629699707031, "loss": 0.5357, "rewards/accuracies": 0.796875, "rewards/chosen": -0.26537635922431946, "rewards/margins": 0.6387584209442139, "rewards/rejected": -0.9041347503662109, "step": 450 }, { "epoch": 0.942161737764983, "eval_dpo_lambda": 0.9528301954269409, "eval_logits/chosen": -0.9672125577926636, "eval_logits/rejected": -1.003003478050232, "eval_logps/chosen": -338.48345947265625, "eval_logps/rejected": -360.6006164550781, "eval_loss": 0.561907947063446, "eval_rewards/accuracies": 0.7310000061988831, "eval_rewards/chosen": -0.3783654570579529, "eval_rewards/margins": 0.517332911491394, "eval_rewards/rejected": -0.8956983685493469, "eval_runtime": 561.384, "eval_samples_per_second": 3.563, "eval_steps_per_second": 0.891, "step": 450 }, { "dpo_lambda": 0.9528302550315857, "epoch": 0.9442554305155718, "grad_norm": 15.699215737825678, "learning_rate": 4.517825684323323e-09, "logits/chosen": -0.9667340517044067, "logits/rejected": -0.9940100312232971, "logps/chosen": -347.5166015625, "logps/rejected": -398.2073669433594, "loss": 0.5401, "rewards/accuracies": 0.765625, "rewards/chosen": -0.36576253175735474, "rewards/margins": 0.6338947415351868, "rewards/rejected": -0.9996572732925415, "step": 451 }, { "dpo_lambda": 0.952725350856781, "epoch": 0.9463491232661607, "grad_norm": 73.7455655355022, "learning_rate": 4.1779364682113794e-09, "logits/chosen": -0.916843593120575, "logits/rejected": -0.9902130365371704, "logps/chosen": -378.66424560546875, "logps/rejected": -383.3100280761719, "loss": 0.6002, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4185040593147278, "rewards/margins": 0.43680867552757263, "rewards/rejected": -0.8553128242492676, "step": 452 }, { "dpo_lambda": 0.9526205062866211, "epoch": 0.9484428160167495, "grad_norm": 19.729553777174523, "learning_rate": 3.851229943335393e-09, "logits/chosen": -0.9096695184707642, "logits/rejected": -0.9353246688842773, "logps/chosen": -337.8310852050781, "logps/rejected": -331.5688781738281, "loss": 0.5197, "rewards/accuracies": 0.796875, "rewards/chosen": -0.29002678394317627, "rewards/margins": 0.6455761194229126, "rewards/rejected": -0.9356027841567993, "step": 453 }, { "dpo_lambda": 0.9525156617164612, "epoch": 0.9505365087673384, "grad_norm": 31.56183094417888, "learning_rate": 3.5377236299748147e-09, "logits/chosen": -1.017897605895996, "logits/rejected": -1.077573299407959, "logps/chosen": -306.55389404296875, "logps/rejected": -319.935791015625, "loss": 0.5622, "rewards/accuracies": 0.640625, "rewards/chosen": -0.4877539575099945, "rewards/margins": 0.3666626214981079, "rewards/rejected": -0.8544166088104248, "step": 454 }, { "dpo_lambda": 0.9524108171463013, "epoch": 0.9526302015179272, "grad_norm": 37.50830191860258, "learning_rate": 3.2374343405217884e-09, "logits/chosen": -0.9689821600914001, "logits/rejected": -1.0138612985610962, "logps/chosen": -333.1699523925781, "logps/rejected": -356.8009338378906, "loss": 0.5582, "rewards/accuracies": 0.75, "rewards/chosen": -0.24983340501785278, "rewards/margins": 0.7474940419197083, "rewards/rejected": -0.997327446937561, "step": 455 }, { "dpo_lambda": 0.9523061513900757, "epoch": 0.9547238942685161, "grad_norm": 35.54479092391772, "learning_rate": 2.9503781785795713e-09, "logits/chosen": -0.9847046732902527, "logits/rejected": -1.0381470918655396, "logps/chosen": -319.5542297363281, "logps/rejected": -387.990234375, "loss": 0.5381, "rewards/accuracies": 0.6875, "rewards/chosen": -0.40106528997421265, "rewards/margins": 0.5943878293037415, "rewards/rejected": -0.9954531192779541, "step": 456 }, { "dpo_lambda": 0.9522013068199158, "epoch": 0.9568175870191049, "grad_norm": 27.64991824437417, "learning_rate": 2.6765705380989432e-09, "logits/chosen": -0.9121294617652893, "logits/rejected": -0.8870172500610352, "logps/chosen": -301.5426940917969, "logps/rejected": -367.2798767089844, "loss": 0.5532, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3733138144016266, "rewards/margins": 0.5066189765930176, "rewards/rejected": -0.8799328804016113, "step": 457 }, { "dpo_lambda": 0.9520964026451111, "epoch": 0.9589112797696938, "grad_norm": 36.24041769583764, "learning_rate": 2.416026102552732e-09, "logits/chosen": -0.9376819133758545, "logits/rejected": -0.9311866760253906, "logps/chosen": -327.5171203613281, "logps/rejected": -355.0273742675781, "loss": 0.6179, "rewards/accuracies": 0.640625, "rewards/chosen": -0.3835518956184387, "rewards/margins": 0.282795786857605, "rewards/rejected": -0.6663477420806885, "step": 458 }, { "dpo_lambda": 0.951991617679596, "epoch": 0.9610049725202826, "grad_norm": 27.119630162374623, "learning_rate": 2.168758844148272e-09, "logits/chosen": -1.0208796262741089, "logits/rejected": -1.0136921405792236, "logps/chosen": -353.8718566894531, "logps/rejected": -364.9420471191406, "loss": 0.5599, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4231835603713989, "rewards/margins": 0.5398699045181274, "rewards/rejected": -0.9630534648895264, "step": 459 }, { "dpo_lambda": 0.9518867135047913, "epoch": 0.9630986652708715, "grad_norm": 38.794159915138245, "learning_rate": 1.9347820230782295e-09, "logits/chosen": -0.9907146692276001, "logits/rejected": -1.038648009300232, "logps/chosen": -281.57666015625, "logps/rejected": -395.8458557128906, "loss": 0.4965, "rewards/accuracies": 0.890625, "rewards/chosen": -0.20416496694087982, "rewards/margins": 0.9015572667121887, "rewards/rejected": -1.105722188949585, "step": 460 }, { "dpo_lambda": 0.9517821073532104, "epoch": 0.9651923580214603, "grad_norm": 18.734944359406175, "learning_rate": 1.7141081868094209e-09, "logits/chosen": -0.92900550365448, "logits/rejected": -0.9331774115562439, "logps/chosen": -385.0363464355469, "logps/rejected": -431.5898742675781, "loss": 0.4995, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3005678057670593, "rewards/margins": 0.7972238063812256, "rewards/rejected": -1.0977916717529297, "step": 461 }, { "dpo_lambda": 0.9516772031784058, "epoch": 0.9672860507720492, "grad_norm": 19.901695883142253, "learning_rate": 1.5067491694100153e-09, "logits/chosen": -0.9556981325149536, "logits/rejected": -0.9943455457687378, "logps/chosen": -376.994873046875, "logps/rejected": -431.7843322753906, "loss": 0.5344, "rewards/accuracies": 0.765625, "rewards/chosen": -0.35393840074539185, "rewards/margins": 0.6615272164344788, "rewards/rejected": -1.0154657363891602, "step": 462 }, { "dpo_lambda": 0.9515723586082458, "epoch": 0.969379743522638, "grad_norm": 29.033019965362797, "learning_rate": 1.3127160909147672e-09, "logits/chosen": -0.9761213064193726, "logits/rejected": -0.9421603679656982, "logps/chosen": -364.5227966308594, "logps/rejected": -408.348388671875, "loss": 0.5222, "rewards/accuracies": 0.6875, "rewards/chosen": -0.35811877250671387, "rewards/margins": 0.5714206099510193, "rewards/rejected": -0.9295394420623779, "step": 463 }, { "dpo_lambda": 0.9514675140380859, "epoch": 0.9714734362732269, "grad_norm": 26.959809231046826, "learning_rate": 1.1320193567288527e-09, "logits/chosen": -0.9518054723739624, "logits/rejected": -0.9945077896118164, "logps/chosen": -261.4985046386719, "logps/rejected": -289.08148193359375, "loss": 0.5476, "rewards/accuracies": 0.71875, "rewards/chosen": -0.28891125321388245, "rewards/margins": 0.48678356409072876, "rewards/rejected": -0.7756948471069336, "step": 464 }, { "dpo_lambda": 0.951362669467926, "epoch": 0.9735671290238157, "grad_norm": 22.208879671906168, "learning_rate": 9.64668657069706e-10, "logits/chosen": -1.0004689693450928, "logits/rejected": -0.997818648815155, "logps/chosen": -292.04632568359375, "logps/rejected": -323.6458435058594, "loss": 0.5204, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4061770439147949, "rewards/margins": 0.5217178463935852, "rewards/rejected": -0.9278948903083801, "step": 465 }, { "dpo_lambda": 0.9512577652931213, "epoch": 0.9756608217744046, "grad_norm": 18.508622786366722, "learning_rate": 8.106729664475176e-10, "logits/chosen": -0.9931919574737549, "logits/rejected": -1.0254158973693848, "logps/chosen": -320.24200439453125, "logps/rejected": -325.3582458496094, "loss": 0.5428, "rewards/accuracies": 0.71875, "rewards/chosen": -0.28171780705451965, "rewards/margins": 0.5396276712417603, "rewards/rejected": -0.8213454484939575, "step": 466 }, { "dpo_lambda": 0.9511529207229614, "epoch": 0.9777545145249935, "grad_norm": 55.96110779105031, "learning_rate": 6.700405431837585e-10, "logits/chosen": -0.9660060405731201, "logits/rejected": -0.9795838594436646, "logps/chosen": -325.99249267578125, "logps/rejected": -389.2981872558594, "loss": 0.49, "rewards/accuracies": 0.8125, "rewards/chosen": -0.21306493878364563, "rewards/margins": 0.7328426241874695, "rewards/rejected": -0.9459075927734375, "step": 467 }, { "dpo_lambda": 0.9510482549667358, "epoch": 0.9798482072755823, "grad_norm": 52.446456162690176, "learning_rate": 5.427789289685347e-10, "logits/chosen": -0.9498654007911682, "logits/rejected": -1.0212403535842896, "logps/chosen": -371.9898681640625, "logps/rejected": -436.3253479003906, "loss": 0.5287, "rewards/accuracies": 0.71875, "rewards/chosen": -0.32743874192237854, "rewards/margins": 0.6464335322380066, "rewards/rejected": -0.9738723039627075, "step": 468 }, { "dpo_lambda": 0.9509434103965759, "epoch": 0.9819419000261712, "grad_norm": 45.04906672619121, "learning_rate": 4.288949484559934e-10, "logits/chosen": -0.9477793574333191, "logits/rejected": -0.9834456443786621, "logps/chosen": -323.77288818359375, "logps/rejected": -345.29022216796875, "loss": 0.5288, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3528931140899658, "rewards/margins": 0.5689555406570435, "rewards/rejected": -0.921848714351654, "step": 469 }, { "dpo_lambda": 0.950838565826416, "epoch": 0.98403559277676, "grad_norm": 25.992732802981653, "learning_rate": 3.2839470889836627e-10, "logits/chosen": -0.9501466155052185, "logits/rejected": -0.9828510880470276, "logps/chosen": -316.51806640625, "logps/rejected": -342.9505615234375, "loss": 0.5701, "rewards/accuracies": 0.671875, "rewards/chosen": -0.3308340907096863, "rewards/margins": 0.5020270347595215, "rewards/rejected": -0.8328611850738525, "step": 470 }, { "dpo_lambda": 0.9507337212562561, "epoch": 0.9861292855273489, "grad_norm": 25.58925647010266, "learning_rate": 2.412835998185092e-10, "logits/chosen": -0.8885989785194397, "logits/rejected": -0.9959658980369568, "logps/chosen": -363.73175048828125, "logps/rejected": -365.086181640625, "loss": 0.5018, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3134402632713318, "rewards/margins": 0.6511830687522888, "rewards/rejected": -0.9646233320236206, "step": 471 }, { "dpo_lambda": 0.9506288766860962, "epoch": 0.9882229782779377, "grad_norm": 23.16302027932693, "learning_rate": 1.6756629272085544e-10, "logits/chosen": -0.9789613485336304, "logits/rejected": -0.9847142696380615, "logps/chosen": -333.34613037109375, "logps/rejected": -401.7735595703125, "loss": 0.6441, "rewards/accuracies": 0.640625, "rewards/chosen": -0.46712347865104675, "rewards/margins": 0.3497805893421173, "rewards/rejected": -0.8169040083885193, "step": 472 }, { "dpo_lambda": 0.9505242109298706, "epoch": 0.9903166710285266, "grad_norm": 28.986250925054538, "learning_rate": 1.072467408408384e-10, "logits/chosen": -1.011475682258606, "logits/rejected": -0.9854814410209656, "logps/chosen": -323.3624267578125, "logps/rejected": -392.7468566894531, "loss": 0.5504, "rewards/accuracies": 0.71875, "rewards/chosen": -0.43965718150138855, "rewards/margins": 0.4960758090019226, "rewards/rejected": -0.9357329607009888, "step": 473 }, { "dpo_lambda": 0.9504193663597107, "epoch": 0.9924103637791154, "grad_norm": 29.69154282719129, "learning_rate": 6.032817893297793e-11, "logits/chosen": -0.9288985133171082, "logits/rejected": -0.9573264122009277, "logps/chosen": -349.6865234375, "logps/rejected": -371.0149841308594, "loss": 0.5234, "rewards/accuracies": 0.734375, "rewards/chosen": -0.2881234288215637, "rewards/margins": 0.5843805074691772, "rewards/rejected": -0.872503936290741, "step": 474 }, { "dpo_lambda": 0.950314462184906, "epoch": 0.9945040565297043, "grad_norm": 17.52475572003096, "learning_rate": 2.6813123097352287e-11, "logits/chosen": -1.0027269124984741, "logits/rejected": -0.9944903254508972, "logps/chosen": -321.1978454589844, "logps/rejected": -340.96600341796875, "loss": 0.5784, "rewards/accuracies": 0.671875, "rewards/chosen": -0.35715794563293457, "rewards/margins": 0.4749200642108917, "rewards/rejected": -0.8320780396461487, "step": 475 }, { "dpo_lambda": 0.9502096176147461, "epoch": 0.9965977492802931, "grad_norm": 18.527439231672, "learning_rate": 6.7033706447061635e-12, "logits/chosen": -0.9148775935173035, "logits/rejected": -0.8974156975746155, "logps/chosen": -284.0632019042969, "logps/rejected": -353.8758544921875, "loss": 0.5733, "rewards/accuracies": 0.765625, "rewards/chosen": -0.34222733974456787, "rewards/margins": 0.6102782487869263, "rewards/rejected": -0.9525056481361389, "step": 476 }, { "dpo_lambda": 0.9501047730445862, "epoch": 0.998691442030882, "grad_norm": 17.038344537628184, "learning_rate": 0.0, "logits/chosen": -0.9984913468360901, "logits/rejected": -0.9600070714950562, "logps/chosen": -301.17230224609375, "logps/rejected": -387.6405029296875, "loss": 0.5673, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3698651194572449, "rewards/margins": 0.6756829619407654, "rewards/rejected": -1.0455480813980103, "step": 477 }, { "epoch": 0.998691442030882, "step": 477, "total_flos": 0.0, "train_loss": 0.5879578035582537, "train_runtime": 40532.5341, "train_samples_per_second": 1.508, "train_steps_per_second": 0.012 } ], "logging_steps": 1, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }